* Help Resource Counters Scale better (v4)
@ 2009-08-11 14:44 Balbir Singh
2009-08-11 14:54 ` Prarit Bhargava
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Balbir Singh @ 2009-08-11 14:44 UTC (permalink / raw)
To: Andrew Morton, KAMEZAWA Hiroyuki, nishimura, KOSAKI Motohiro,
menage, Prarit Bhargava, andi.kleen, Pavel Emelianov, lizf
Cc: linux-kernel, linux-mm
Enhancement: Remove the overhead of root based resource counter accounting
From: Balbir Singh <balbir@linux.vnet.ibm.com>
This patch reduces the resource counter overhead (mostly spinlock)
associated with the root cgroup. This is a part of the several
patches to reduce mem cgroup overhead. I had posted other
approaches earlier (including using percpu counters). Those
patches will be a natural addition and will be added iteratively
on top of these.
The patch stops resource counter accounting for the root cgroup.
The data for display is derived from the statistics we maintain
via mem_cgroup_charge_statistics (which is more scalable).
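
For the root group, the reported usage is then derived roughly as
follows (a simplified sketch of the read side only; the helper name is
made up here, the real change is the mem_cgroup_read() hunk in the
patch below):

static u64 root_mem_usage_in_bytes(struct mem_cgroup *mem)
{
	u64 val;

	/* sum the existing per-cpu CACHE and RSS counters (in pages) */
	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
	val += mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);

	return val << PAGE_SHIFT;	/* convert pages to bytes */
}
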
The test results I see on a 24 way show that
1. The lock contention disappears from /proc/lock_stats
2. The results of the test are comparable to running with
cgroup_disable=memory.
Please test/review.
---
mm/memcontrol.c | 89 +++++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 70 insertions(+), 19 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 48a38e1..6d82f8d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -70,6 +70,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
+ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_STAT_NSTATS,
};
@@ -478,6 +479,19 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
return mz;
}
+static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
+ bool charge)
+{
+ int val = (charge)? 1 : -1;
+ struct mem_cgroup_stat *stat = &mem->stat;
+ struct mem_cgroup_stat_cpu *cpustat;
+ int cpu = get_cpu();
+
+ cpustat = &stat->cpustat[cpu];
+ __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
+ put_cpu();
+}
+
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
struct page_cgroup *pc,
bool charge)
@@ -1281,9 +1295,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
VM_BUG_ON(css_is_removed(&mem->css));
while (1) {
- int ret;
+ int ret = 0;
unsigned long flags = 0;
+ if (mem_cgroup_is_root(mem))
+ goto done;
ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
&soft_fail_res);
if (likely(!ret)) {
@@ -1343,6 +1359,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
mem_cgroup_update_tree(mem_over_soft_limit, page);
}
+done:
return 0;
nomem:
css_put(&mem->css);
@@ -1415,9 +1432,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(mem)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ if (do_swap_account)
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE,
+ NULL);
+ }
css_put(&mem->css);
return;
}
@@ -1494,7 +1514,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
if (pc->mem_cgroup != from)
goto out;
- res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(from))
+ res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
mem_cgroup_charge_statistics(from, pc, false);
page = pc->page;
@@ -1513,7 +1534,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1);
}
- if (do_swap_account)
+ if (do_swap_account && !mem_cgroup_is_root(from))
res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
css_put(&from->css);
@@ -1584,9 +1605,11 @@ uncharge:
/* drop extra refcnt by try_charge() */
css_put(&parent->css);
/* uncharge if move fails */
- res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(parent)) {
+ res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
+ if (do_swap_account)
+ res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+ }
return ret;
}
@@ -1775,7 +1798,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
* This recorded memcg can be obsolete one. So, avoid
* calling css_tryget
*/
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(memcg))
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
+ NULL);
+ mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
rcu_read_unlock();
@@ -1800,9 +1826,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
return;
if (!mem)
return;
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(mem)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ if (do_swap_account)
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ }
css_put(&mem->css);
}
@@ -1855,9 +1883,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
- if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(mem)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
+ if (do_swap_account &&
+ (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ }
+ if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT && mem_cgroup_is_root(mem))
+ mem_cgroup_swap_statistics(mem, true);
mem_cgroup_charge_statistics(mem, pc, false);
ClearPageCgroupUsed(pc);
@@ -1948,7 +1981,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
* We uncharge this because swap is freed.
* This memcg can be obsolete one. We avoid calling css_tryget
*/
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(memcg))
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+ mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
rcu_read_unlock();
@@ -2461,10 +2496,26 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
name = MEMFILE_ATTR(cft->private);
switch (type) {
case _MEM:
- val = res_counter_read_u64(&mem->res, name);
+ if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+ val = mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_RSS);
+ val <<= PAGE_SHIFT;
+ } else
+ val = res_counter_read_u64(&mem->res, name);
break;
case _MEMSWAP:
- val = res_counter_read_u64(&mem->memsw, name);
+ if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+ val = mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_RSS);
+ val += mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_SWAPOUT);
+ val <<= PAGE_SHIFT;
+ } else
+ val = res_counter_read_u64(&mem->memsw, name);
break;
default:
BUG();
--
Balbir
* Re: Help Resource Counters Scale better (v4)
  2009-08-11 14:44 Help Resource Counters Scale better (v4) Balbir Singh
@ 2009-08-11 14:54 ` Prarit Bhargava
  2009-08-11 15:00 ` Balbir Singh
  ` (2 more replies)
  2009-08-11 16:52 ` Balbir Singh
  2009-08-11 23:31 ` Andrew Morton
  2 siblings, 3 replies; 20+ messages in thread
From: Prarit Bhargava @ 2009-08-11 14:54 UTC (permalink / raw)
To: balbir
Cc: Andrew Morton, KAMEZAWA Hiroyuki, nishimura, KOSAKI Motohiro,
    menage, andi.kleen, Pavel Emelianov, lizf, linux-kernel, linux-mm

Balbir Singh wrote:
> Enhancement: Remove the overhead of root based resource counter accounting
>
<snip>
> Please test/review.
>
FWIW ...

On a 64p/32G system running 2.6.31-git2-rc5, with RESOURCE_COUNTERS off,
"time make -j64" results in

real 4m54.972s
user 90m13.456s
sys 50m19.711s

On the same system, running 2.6.31-git2-rc5, with RESOURCE_COUNTERS on,
plus Balbir's "Help Resource Counters Scale Better (v3)" patch, and this
patch, results in

real 4m18.607s
user 84m58.943s
sys 50m52.682s

P.
* Re: Help Resource Counters Scale better (v4)
  2009-08-11 14:54 ` Prarit Bhargava
@ 2009-08-11 15:00 ` Balbir Singh
  2009-08-11 15:11 ` Prarit Bhargava
  2009-08-11 15:14 ` Prarit Bhargava
  2009-08-11 15:20 ` Andi Kleen
  2 siblings, 1 reply; 20+ messages in thread
From: Balbir Singh @ 2009-08-11 15:00 UTC (permalink / raw)
To: Prarit Bhargava
Cc: Andrew Morton, KAMEZAWA Hiroyuki, nishimura, KOSAKI Motohiro,
    menage, andi.kleen, Pavel Emelianov, lizf, linux-kernel, linux-mm

* Prarit Bhargava <prarit@redhat.com> [2009-08-11 10:54:50]:

> Balbir Singh wrote:
>> Enhancement: Remove the overhead of root based resource counter accounting
>>
> <snip>
>> Please test/review.
>>
> FWIW ...

Thanks for the testing!

> On a 64p/32G system running 2.6.31-git2-rc5, with RESOURCE_COUNTERS off,
> "time make -j64" results in
>
> real 4m54.972s
> user 90m13.456s
> sys 50m19.711s
>
> On the same system, running 2.6.31-git2-rc5, with RESOURCE_COUNTERS on,
> plus Balbir's "Help Resource Counters Scale Better (v3)" patch, and this
      ^^^ you mean (v4) right?
> patch, results in
>
> real 4m18.607s
> user 84m58.943s
> sys 50m52.682s

Without the patch and RESOURCE_COUNTERS do you see a big overhead. I'd
assume so, I am seeing it on my 24 way box that I have access to.

--
Balbir
* Re: Help Resource Counters Scale better (v4)
  2009-08-11 15:00 ` Balbir Singh
@ 2009-08-11 15:11 ` Prarit Bhargava
  0 siblings, 0 replies; 20+ messages in thread
From: Prarit Bhargava @ 2009-08-11 15:11 UTC (permalink / raw)
To: balbir
Cc: Andrew Morton, KAMEZAWA Hiroyuki, nishimura, KOSAKI Motohiro,
    menage, andi.kleen, Pavel Emelianov, lizf, linux-kernel, linux-mm

>
> Without the patch and RESOURCE_COUNTERS do you see a big overhead. I'd
> assume so, I am seeing it on my 24 way box that I have access to.
>
I see a *huge* overhead:

real 27m8.988s
user 87m24.916s
sys 382m6.037s

P.
* Re: Help Resource Counters Scale better (v4)
  2009-08-11 14:54 ` Prarit Bhargava
  2009-08-11 15:00 ` Balbir Singh
@ 2009-08-11 15:14 ` Prarit Bhargava
  2009-08-11 15:20 ` Andi Kleen
  2 siblings, 0 replies; 20+ messages in thread
From: Prarit Bhargava @ 2009-08-11 15:14 UTC (permalink / raw)
To: balbir
Cc: Andrew Morton, KAMEZAWA Hiroyuki, nishimura, KOSAKI Motohiro,
    menage, andi.kleen, Pavel Emelianov, lizf, linux-kernel, linux-mm

Prarit Bhargava wrote:
>
> Balbir Singh wrote:
>> Enhancement: Remove the overhead of root based resource counter
>> accounting
>>
> <snip>
>> Please test/review.
>>
> FWIW ...
>
> On a 64p/32G system running 2.6.31-git2-rc5, with RESOURCE_COUNTERS
> off, "time make -j64" results in
>
> real 4m54.972s
> user 90m13.456s
> sys 50m19.711s
>
> On the same system, running 2.6.31-git2-rc5, with RESOURCE_COUNTERS on,
> plus Balbir's "Help Resource Counters Scale Better (v3)" patch, and
> this patch, results in

Oops, sorry Balbir. I meant to write:

On the same system, running 2.6.31-git2-rc5, with RESOURCE_COUNTERS on,
plus Balbir's "Help Resource Counters Scale Better (v4)" patch results
in ...

Sorry for the typo,

P.
* Re: Help Resource Counters Scale better (v4)
  2009-08-11 14:54 ` Prarit Bhargava
  2009-08-11 15:00 ` Balbir Singh
  2009-08-11 15:14 ` Prarit Bhargava
@ 2009-08-11 15:20 ` Andi Kleen
  2009-08-11 15:21 ` Prarit Bhargava
  2 siblings, 1 reply; 20+ messages in thread
From: Andi Kleen @ 2009-08-11 15:20 UTC (permalink / raw)
To: Prarit Bhargava
Cc: balbir, Andrew Morton, KAMEZAWA Hiroyuki, nishimura, KOSAKI Motohiro,
    menage, andi.kleen, Pavel Emelianov, lizf, linux-kernel, linux-mm

Prarit Bhargava <prarit@redhat.com> writes:
>
> On a 64p/32G system running 2.6.31-git2-rc5, with RESOURCE_COUNTERS

This is CONFIG_RESOURCE_COUNTERS off at compile time right?

> off, "time make -j64" results in
>
> real 4m54.972s
> user 90m13.456s
> sys 50m19.711s
>
> On the same system, running 2.6.31-git2-rc5, with RESOURCE_COUNTERS on,
> plus Balbir's "Help Resource Counters Scale Better (v3)" patch, and
> this patch, results in
>
> real 4m18.607s
> user 84m58.943s
> sys 50m52.682s

Hmm, so resource counters on with the patch is faster than
CONFIG_RESOURCE_COUNTERS compiled out in real time? That seems
counterintuitive. At best I would expect the patch to break even, but
not be actually faster. Is the compilation stable over multiple runs?

Still it looks like the patch is clearly needed.

-Andi

--
ak@linux.intel.com -- Speaking for myself only.
* Re: Help Resource Counters Scale better (v4)
  2009-08-11 15:20 ` Andi Kleen
@ 2009-08-11 15:21 ` Prarit Bhargava
  0 siblings, 0 replies; 20+ messages in thread
From: Prarit Bhargava @ 2009-08-11 15:21 UTC (permalink / raw)
To: Andi Kleen
Cc: balbir, Andrew Morton, KAMEZAWA Hiroyuki, nishimura, KOSAKI Motohiro,
    menage, andi.kleen, Pavel Emelianov, lizf, linux-kernel, linux-mm

Andi Kleen wrote:
> Prarit Bhargava <prarit@redhat.com> writes:
>
>> On a 64p/32G system running 2.6.31-git2-rc5, with RESOURCE_COUNTERS
>>
> This is CONFIG_RESOURCE_COUNTERS off at compile time right?
>
>> off, "time make -j64" results in
>>
^^^ yes. It is off.
>> real 4m54.972s
>> user 90m13.456s
>> sys 50m19.711s
>>
>> On the same system, running 2.6.31-git2-rc5, with RESOURCE_COUNTERS on,
>> plus Balbir's "Help Resource Counters Scale Better (v3)" patch, and
>> this patch, results in
>>
>> real 4m18.607s
>> user 84m58.943s
>> sys 50m52.682s
>
> Hmm, so resource counters on with the patch is faster than
> CONFIG_RESOURCE_COUNTERS compiled out in real time? That seems
> counterintuitive. At best I would expect the patch to break even, but
> not be actually faster.
>
That is only after one run. I'm doing 10 reboot and build runs right
now and will average them out. I'll ping back here with results.

> Still it looks like the patch is clearly needed.
>
P.
* Re: Help Resource Counters Scale better (v4)
  2009-08-11 14:44 Help Resource Counters Scale better (v4) Balbir Singh
  2009-08-11 14:54 ` Prarit Bhargava
@ 2009-08-11 16:52 ` Balbir Singh
  2009-08-11 23:31 ` Andrew Morton
  2 siblings, 0 replies; 20+ messages in thread
From: Balbir Singh @ 2009-08-11 16:52 UTC (permalink / raw)
To: Andrew Morton, KAMEZAWA Hiroyuki, nishimura, KOSAKI Motohiro,
    menage, Prarit Bhargava, andi.kleen, Pavel Emelianov, lizf
Cc: linux-kernel, linux-mm

* Balbir Singh <balbir@linux.vnet.ibm.com> [2009-08-11 20:14:05]:

Here is a version of the patch, with the checkpatch.pl warnings fixed.
I forgot to refresh my changes :(

From: Balbir Singh <balbir@linux.vnet.ibm.com>

Enhancement: Remove the overhead of root based resource counter accounting

This patch reduces the resource counter overhead (mostly spinlock)
associated with the root cgroup. This is a part of the several
patches to reduce mem cgroup overhead. I had posted other
approaches earlier (including using percpu counters). Those
patches will be a natural addition and will be added iteratively
on top of these.

The patch stops resource counter accounting for the root cgroup.
The data for display is derived from the statistics we maintain
via mem_cgroup_charge_statistics (which is more scalable).

The test results I see on a 24 way show that

1. The lock contention disappears from /proc/lock_stats
2. The results of the test are comparable to running with
cgroup_disable=memory.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---
mm/memcontrol.c | 89 +++++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 70 insertions(+), 19 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 48a38e1..4f51885 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -70,6 +70,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
+ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_STAT_NSTATS,
};
@@ -478,6 +479,19 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
return mz;
}
+static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
+ bool charge)
+{
+ int val = (charge) ? 1 : -1;
+ struct mem_cgroup_stat *stat = &mem->stat;
+ struct mem_cgroup_stat_cpu *cpustat;
+ int cpu = get_cpu();
+
+ cpustat = &stat->cpustat[cpu];
+ __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
+ put_cpu();
+}
+
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
struct page_cgroup *pc,
bool charge)
@@ -1281,9 +1295,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
VM_BUG_ON(css_is_removed(&mem->css));
while (1) {
- int ret;
+ int ret = 0;
unsigned long flags = 0;
+ if (mem_cgroup_is_root(mem))
+ goto done;
ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
&soft_fail_res);
if (likely(!ret)) {
@@ -1343,6 +1359,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
mem_cgroup_update_tree(mem_over_soft_limit, page);
}
+done:
return 0;
nomem:
css_put(&mem->css);
@@ -1415,9 +1432,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(mem)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ if (do_swap_account)
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE,
+ NULL);
+ }
css_put(&mem->css);
return;
}
@@ -1494,7 +1514,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
if (pc->mem_cgroup != from)
goto out;
- res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(from))
+ res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
mem_cgroup_charge_statistics(from, pc, false);
page = pc->page;
@@ -1513,7 +1534,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1);
}
- if (do_swap_account)
+ if (do_swap_account && !mem_cgroup_is_root(from))
res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
css_put(&from->css);
@@ -1584,9 +1605,11 @@ uncharge:
/* drop extra refcnt by try_charge() */
css_put(&parent->css);
/* uncharge if move fails */
- res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(parent)) {
+ res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
+ if (do_swap_account)
+ res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+ }
return ret;
}
@@ -1775,7 +1798,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
* This recorded memcg can be obsolete one. So, avoid
* calling css_tryget
*/
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(memcg))
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
+ NULL);
+ mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
rcu_read_unlock();
@@ -1800,9 +1826,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
return;
if (!mem)
return;
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(mem)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ if (do_swap_account)
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ }
css_put(&mem->css);
}
@@ -1855,9 +1883,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
- if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(mem)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
+ if (do_swap_account &&
+ (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ }
+ if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT && mem_cgroup_is_root(mem))
+ mem_cgroup_swap_statistics(mem, true);
mem_cgroup_charge_statistics(mem, pc, false);
ClearPageCgroupUsed(pc);
@@ -1948,7 +1981,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
* We uncharge this because swap is freed.
* This memcg can be obsolete one. We avoid calling css_tryget
*/
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(memcg))
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+ mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
rcu_read_unlock();
@@ -2461,10 +2496,26 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
name = MEMFILE_ATTR(cft->private);
switch (type) {
case _MEM:
- val = res_counter_read_u64(&mem->res, name);
+ if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+ val = mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_RSS);
+ val <<= PAGE_SHIFT;
+ } else
+ val = res_counter_read_u64(&mem->res, name);
break;
case _MEMSWAP:
- val = res_counter_read_u64(&mem->memsw, name);
+ if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+ val = mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_RSS);
+ val += mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_SWAPOUT);
+ val <<= PAGE_SHIFT;
+ } else
+ val = res_counter_read_u64(&mem->memsw, name);
break;
default:
BUG();

--
Balbir
* Re: Help Resource Counters Scale better (v4)
  2009-08-11 14:44 Help Resource Counters Scale better (v4) Balbir Singh
  2009-08-11 14:54 ` Prarit Bhargava
  2009-08-11 16:52 ` Balbir Singh
@ 2009-08-11 23:31 ` Andrew Morton
  2009-08-12 2:34 ` Daisuke Nishimura
  ` (3 more replies)
  2 siblings, 4 replies; 20+ messages in thread
From: Andrew Morton @ 2009-08-11 23:31 UTC (permalink / raw)
To: balbir
Cc: kamezawa.hiroyu, nishimura, kosaki.motohiro, menage, prarit,
    andi.kleen, xemul, lizf, linux-kernel, linux-mm

On Tue, 11 Aug 2009 20:14:05 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> Enhancement: Remove the overhead of root based resource counter accounting
>
> From: Balbir Singh <balbir@linux.vnet.ibm.com>
>
> This patch reduces the resource counter overhead (mostly spinlock)
> associated with the root cgroup. This is a part of the several
> patches to reduce mem cgroup overhead. I had posted other
> approaches earlier (including using percpu counters). Those
> patches will be a natural addition and will be added iteratively
> on top of these.
>
> The patch stops resource counter accounting for the root cgroup.
> The data for display is derived from the statistics we maintain
> via mem_cgroup_charge_statistics (which is more scalable).
>
> The test results I see on a 24 way show that
>
> 1. The lock contention disappears from /proc/lock_stats
> 2. The results of the test are comparable to running with
> cgroup_disable=memory.
>
> Please test/review.

I don't get it.

The patch appears to skip accounting altogether for the root memcgroup
and then adds some accounting back in for swap. Or something like
that. How come? Do we actually not need the root memcgroup
accounting?

IOW, the changelog sucks ;)

Is this an alternative approach to using percpu_counters, or do we do
both or do we choose one or the other? res_counter_charge() really is
quite sucky.

The patch didn't have a signoff.

It would be nice to finalise those performance testing results and
include them in the new, improved patch description.
* Re: Help Resource Counters Scale better (v4)
  2009-08-11 23:31 ` Andrew Morton
@ 2009-08-12 2:34 ` Daisuke Nishimura
  0 siblings, 0 replies; 20+ messages in thread
From: Daisuke Nishimura @ 2009-08-12 2:34 UTC (permalink / raw)
To: Andrew Morton
Cc: balbir, kamezawa.hiroyu, kosaki.motohiro, menage, prarit,
    andi.kleen, xemul, lizf, nishimura, linux-kernel, linux-mm

On Tue, 11 Aug 2009 16:31:59 -0700, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Tue, 11 Aug 2009 20:14:05 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> > Enhancement: Remove the overhead of root based resource counter accounting
> >
<snip>
> > Please test/review.
>
> I don't get it.
>
> The patch appears to skip accounting altogether for the root memcgroup
> and then adds some accounting back in for swap. Or something like
> that. How come? Do we actually not need the root memcgroup
> accounting?
>
IIUC, this patch doesn't remove the root memcgroup accounting, it just
changes the counter for root memcgroup accounting from res_counter to
cpustat[cpu] to reduce the lock congestion of res_counter especially
on a big platform.
Using res_counter(lock, check limit, charge) for root memcgroup would
be overkill because root memcgroup has no limit now(by
memcg-remove-the-overhead-associated-with-the-root-cgroup.patch).

And, MEM_CGROUP_STAT_SWAPOUT would be needed to show
memsw.usage_in_bytes of root memcgroup. We didn't have cpustat[cpu]
counter for swap accounting so far.

> IOW, the changelog sucks ;)
>
> Is this an alternative approach to using percpu_counters, or do we do
> both or do we choose one or the other? res_counter_charge() really is
> quite sucky.
>
> The patch didn't have a signoff.
>
> It would be nice to finalise those performance testing results and
> include them in the new, improved patch description.
>
agreed.

Thanks,
Daisuke Nishimura.
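
In other words, the root group's memsw usage ends up being reconstructed
purely from the per-cpu statistics; a minimal sketch of that derivation
(the helper name is illustrative only, the actual code is the _MEMSWAP
branch of mem_cgroup_read() in the patch):

static u64 root_memsw_usage_in_bytes(struct mem_cgroup *mem)
{
	u64 val;

	/* pages charged as page cache or anon memory ... */
	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
	val += mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	/* ... plus pages swapped out, via the new per-cpu counter */
	val += mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);

	return val << PAGE_SHIFT;	/* pages -> bytes */
}
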
* Re: Help Resource Counters Scale better (v4)
  2009-08-11 23:31 ` Andrew Morton
  2009-08-12 2:34 ` Daisuke Nishimura
@ 2009-08-12 3:56 ` Balbir Singh
  2009-08-12 3:57 ` Balbir Singh
  2009-08-12 4:57 ` [PATCH] Help Resource Counters Scale better (v4.1) Balbir Singh
  3 siblings, 0 replies; 20+ messages in thread
From: Balbir Singh @ 2009-08-12 3:56 UTC (permalink / raw)
To: Andrew Morton
Cc: kamezawa.hiroyu, nishimura, kosaki.motohiro, menage, prarit,
    andi.kleen, xemul, lizf, linux-kernel, linux-mm

* Andrew Morton <akpm@linux-foundation.org> [2009-08-11 16:31:59]:

> On Tue, 11 Aug 2009 20:14:05 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> > Enhancement: Remove the overhead of root based resource counter accounting
> >
<snip>
> > Please test/review.
>
> I don't get it.
>
> The patch appears to skip accounting altogether for the root memcgroup
> and then adds some accounting back in for swap. Or something like
> that. How come? Do we actually not need the root memcgroup
> accounting?
>

The changelog mentions that the statistics are derived. For memsw as
Daisuke-San mentioned, the SWAP accounting is for memsw. We can derive
memory.usage_in_bytes from RSS+Cache fields in the memory.stat
accounting. For memsw, we needed SWAP accounting.

> IOW, the changelog sucks ;)
>
> Is this an alternative approach to using percpu_counters, or do we do
> both or do we choose one or the other? res_counter_charge() really is
> quite sucky.
>
> The patch didn't have a signoff.
>
> It would be nice to finalise those performance testing results and
> include them in the new, improved patch description.
>

I'll submit a new patch with better changelog, checkpatch.pl fixes and
test results.

--
Balbir
* Re: Help Resource Counters Scale better (v4)
  2009-08-11 23:31 ` Andrew Morton
  2009-08-12 2:34 ` Daisuke Nishimura
  2009-08-12 3:56 ` Balbir Singh
@ 2009-08-12 3:57 ` Balbir Singh
  2009-08-12 4:57 ` [PATCH] Help Resource Counters Scale better (v4.1) Balbir Singh
  3 siblings, 0 replies; 20+ messages in thread
From: Balbir Singh @ 2009-08-12 3:57 UTC (permalink / raw)
To: Andrew Morton
Cc: kamezawa.hiroyu, nishimura, kosaki.motohiro, menage, prarit,
    andi.kleen, xemul, lizf, linux-kernel, linux-mm

* Andrew Morton <akpm@linux-foundation.org> [2009-08-11 16:31:59]:

> Is this an alternative approach to using percpu_counters, or do we do
> both or do we choose one or the other? res_counter_charge() really is
> quite sucky.
>

This is an alternative approach, I'll still do the percpu counter
patches, but once the overhead for root is gone, most users not using
this functionality will not see any major overhead.

--
Balbir
* [PATCH] Help Resource Counters Scale better (v4.1)
  2009-08-11 23:31 ` Andrew Morton
  ` (2 preceding siblings ...)
  2009-08-12 3:57 ` Balbir Singh
@ 2009-08-12 4:57 ` Balbir Singh
  2009-08-12 10:53 ` Prarit Bhargava
  ` (2 more replies)
  3 siblings, 3 replies; 20+ messages in thread
From: Balbir Singh @ 2009-08-12 4:57 UTC (permalink / raw)
To: Andrew Morton
Cc: kamezawa.hiroyu, nishimura, kosaki.motohiro, menage, prarit,
    andi.kleen, xemul, lizf, linux-kernel, linux-mm

Hi, Andrew,

Does this look better, could you please replace the older patch with
this one.

1. I did a quick compile test
2. Ran scripts/checkpatch.pl

From: Balbir Singh <balbir@linux.vnet.ibm.com>

Enhancement: Remove the overhead of root based resource counter accounting

This patch reduces the resource counter overhead (mostly spinlock)
associated with the root cgroup. This is a part of the several
patches to reduce mem cgroup overhead. I had posted other
approaches earlier (including using percpu counters). Those
patches will be a natural addition and will be added iteratively
on top of these.

The patch stops resource counter accounting for the root cgroup.
The data for display is derived from the statistics we maintain
via mem_cgroup_charge_statistics (which is more scalable).
What happens today is that, we do double accounting, once using
res_counter_charge() and once using memory_cgroup_charge_statistics().
For the root, since we don't implement limits any more, we don't
need to track every charge via res_counter_charge() and check
for limit being exceeded and reclaim.

The main mem->res usage_in_bytes can be derived by summing
the cache and rss usage data from memory statistics
(MEM_CGROUP_STAT_RSS and MEM_CGROUP_STAT_CACHE). However, for
memsw->res usage_in_bytes, we need additional data about
swapped out memory. This patch adds a MEM_CGROUP_STAT_SWAPOUT
and uses that along with MEM_CGROUP_STAT_RSS and MEM_CGROUP_STAT_CACHE
to derive the memsw data.

The test results I see on a 24 way show that

1. The lock contention disappears from /proc/lock_stats
2. The results of the test are comparable to running with
cgroup_disable=memory.

Here is a sample of my program runs

Without Patch

Performance counter stats for '/home/balbir/parallel_pagefault':

7192804.124144 task-clock-msecs # 23.937 CPUs
424691 context-switches # 0.000 M/sec
267 CPU-migrations # 0.000 M/sec
28498113 page-faults # 0.004 M/sec
5826093739340 cycles # 809.989 M/sec
408883496292 instructions # 0.070 IPC
7057079452 cache-references # 0.981 M/sec
3036086243 cache-misses # 0.422 M/sec

300.485365680 seconds time elapsed

With cgroup_disable=memory

Performance counter stats for '/home/balbir/parallel_pagefault':

7182183.546587 task-clock-msecs # 23.915 CPUs
425458 context-switches # 0.000 M/sec
203 CPU-migrations # 0.000 M/sec
92545093 page-faults # 0.013 M/sec
6034363609986 cycles # 840.185 M/sec
437204346785 instructions # 0.072 IPC
6636073192 cache-references # 0.924 M/sec
2358117732 cache-misses # 0.328 M/sec

300.320905827 seconds time elapsed

With this patch applied

Performance counter stats for '/home/balbir/parallel_pagefault':

7191619.223977 task-clock-msecs # 23.955 CPUs
422579 context-switches # 0.000 M/sec
88 CPU-migrations # 0.000 M/sec
91946060 page-faults # 0.013 M/sec
5957054385619 cycles # 828.333 M/sec
1058117350365 instructions # 0.178 IPC
9161776218 cache-references # 1.274 M/sec
1920494280 cache-misses # 0.267 M/sec

300.218764862 seconds time elapsed

Data from Prarit (kernel compile with make -j64 on a 64 CPU/32G machine)

For a single run

Without patch

real 27m8.988s
user 87m24.916s
sys 382m6.037s

With patch

real 4m18.607s
user 84m58.943s
sys 50m52.682s

With config turned off

real 4m54.972s
user 90m13.456s
sys 50m19.711s

NOTE: The data looks counterintuitive due to the increased performance
with the patch, even over the config being turned off. We probably need
more runs, but so far all testing has shown that the patches definitely
help.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---
mm/memcontrol.c | 89 +++++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 70 insertions(+), 19 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 48a38e1..4f51885 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -70,6 +70,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
+ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_STAT_NSTATS,
};
@@ -478,6 +479,19 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
return mz;
}
+static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
+ bool charge)
+{
+ int val = (charge) ? 1 : -1;
+ struct mem_cgroup_stat *stat = &mem->stat;
+ struct mem_cgroup_stat_cpu *cpustat;
+ int cpu = get_cpu();
+
+ cpustat = &stat->cpustat[cpu];
+ __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
+ put_cpu();
+}
+
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
struct page_cgroup *pc,
bool charge)
@@ -1281,9 +1295,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
VM_BUG_ON(css_is_removed(&mem->css));
while (1) {
- int ret;
+ int ret = 0;
unsigned long flags = 0;
+ if (mem_cgroup_is_root(mem))
+ goto done;
ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
&soft_fail_res);
if (likely(!ret)) {
@@ -1343,6 +1359,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
mem_cgroup_update_tree(mem_over_soft_limit, page);
}
+done:
return 0;
nomem:
css_put(&mem->css);
@@ -1415,9 +1432,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(mem)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ if (do_swap_account)
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE,
+ NULL);
+ }
css_put(&mem->css);
return;
}
@@ -1494,7 +1514,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
if (pc->mem_cgroup != from)
goto out;
- res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(from))
+ res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
mem_cgroup_charge_statistics(from, pc, false);
page = pc->page;
@@ -1513,7 +1534,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1);
}
- if (do_swap_account)
+ if (do_swap_account && !mem_cgroup_is_root(from))
res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
css_put(&from->css);
@@ -1584,9 +1605,11 @@ uncharge:
/* drop extra refcnt by try_charge() */
css_put(&parent->css);
/* uncharge if move fails */
- res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(parent)) {
+ res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
+ if (do_swap_account)
+ res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+ }
return ret;
}
@@ -1775,7 +1798,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
* This recorded memcg can be obsolete one. So, avoid
* calling css_tryget
*/
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(memcg))
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
+ NULL);
+ mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
rcu_read_unlock();
@@ -1800,9 +1826,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
return;
if (!mem)
return;
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
- if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(mem)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ if (do_swap_account)
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ }
css_put(&mem->css);
}
@@ -1855,9 +1883,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
- if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(mem)) {
+ res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
+ if (do_swap_account &&
+ (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ }
+ if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT && mem_cgroup_is_root(mem))
+ mem_cgroup_swap_statistics(mem, true);
mem_cgroup_charge_statistics(mem, pc, false);
ClearPageCgroupUsed(pc);
@@ -1948,7 +1981,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
* We uncharge this because swap is freed.
* This memcg can be obsolete one. We avoid calling css_tryget
*/
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+ if (!mem_cgroup_is_root(memcg))
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+ mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
rcu_read_unlock();
@@ -2461,10 +2496,26 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
name = MEMFILE_ATTR(cft->private);
switch (type) {
case _MEM:
- val = res_counter_read_u64(&mem->res, name);
+ if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+ val = mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_RSS);
+ val <<= PAGE_SHIFT;
+ } else
+ val = res_counter_read_u64(&mem->res, name);
break;
case _MEMSWAP:
- val = res_counter_read_u64(&mem->memsw, name);
+ if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+ val = mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_RSS);
+ val += mem_cgroup_read_stat(&mem->stat,
+ MEM_CGROUP_STAT_SWAPOUT);
+ val <<= PAGE_SHIFT;
+ } else
+ val = res_counter_read_u64(&mem->memsw, name);
break;
default:
BUG();

--
Balbir
* Re: [PATCH] Help Resource Counters Scale better (v4.1)
  2009-08-12 4:57 ` [PATCH] Help Resource Counters Scale better (v4.1) Balbir Singh
@ 2009-08-12 10:53 ` Prarit Bhargava
  2009-08-12 16:19 ` KAMEZAWA Hiroyuki
  2009-08-12 16:28 ` KAMEZAWA Hiroyuki
  2009-08-13 1:03 ` Daisuke Nishimura
  2 siblings, 1 reply; 20+ messages in thread
From: Prarit Bhargava @ 2009-08-12 10:53 UTC (permalink / raw)
To: balbir
Cc: Andrew Morton, kamezawa.hiroyu, nishimura, kosaki.motohiro, menage,
    andi.kleen, xemul, lizf, linux-kernel, linux-mm

Balbir Singh wrote:
> Hi, Andrew,
>
> Does this look better, could you please replace the older patch with
> this one.
>
> 1. I did a quick compile test
> 2. Ran scripts/checkpatch.pl
>

Andi Kleen suggested I use kernbench to profile the kernel.

2.6.31-rc5-git2 w/ CONFIG_RESOURCE_COUNTERS on

Tue Aug 11 13:45:14 EDT 2009
2.6.31-rc5-git2-resources
Average Half load -j 32 Run (std deviation):
Elapsed Time 622.588 (119.243)
User Time 4820.8 (962.286)
System Time 9807.63 (2669.55)
Percent CPU 2324 (167.236)
Context Switches 2009606 (368703)
Sleeps 1.24949e+06 (118210)

Average Optimal load -j 256 Run (std deviation):
Elapsed Time 770.97 (90.8685)
User Time 5068.42 (750.933)
System Time 21499.8 (12822.3)
Percent CPU 3660 (1425.28)
Context Switches 2.86467e+06 (971764)
Sleeps 1.32784e+06 (129048)

Average Maximal load -j Run (std deviation):
Elapsed Time 757.018 (22.8371)
User Time 4958.85 (644.65)
System Time 24916.5 (11454.3)
Percent CPU 4046.93 (1279.6)
Context Switches 3.04894e+06 (826687)
Sleeps 1.26053e+06 (146073)

2.6.31-rc5-git2 w/ CONFIG_RESOURCE_COUNTERS off

Tue Aug 11 17:58:58 EDT 2009
2.6.31-rc5-git2-no-resources
Average Half load -j 32 Run (std deviation):
Elapsed Time 280.176 (21.1131)
User Time 3558.51 (389.488)
System Time 2393.87 (142.692)
Percent CPU 2122.6 (50.5104)
Context Switches 1.20474e+06 (131112)
Sleeps 1062507 (59366.3)

Average Optimal load -j 256 Run (std deviation):
Elapsed Time 223.192 (42.7007)
User Time 4243.19 (967.575)
System Time 2649.57 (344.462)
Percent CPU 2845.5 (856.217)
Context Switches 1.52187e+06 (391821)
Sleeps 1.28862e+06 (274222)

Average Maximal load -j Run (std deviation):
Elapsed Time 216.942 (45.4824)
User Time 3860.46 (966.452)
System Time 2782.17 (344.154)
Percent CPU 2862.47 (720.904)
Context Switches 1.43379e+06 (341021)
Sleeps 1184325 (269392)

2.6.31-rc5-git2 w/ CONFIG_RESOURCE_COUNTERS on + patch

Tue Aug 11 20:58:31 EDT 2009
2.6.31-rc5-git2-mem-patch
Average Half load -j 32 Run (std deviation):
Elapsed Time 285.788 (18.577)
User Time 3483.14 (346.56)
System Time 2426.37 (132.015)
Percent CPU 2066.8 (80.3754)
Context Switches 1.16588e+06 (134701)
Sleeps 1048810 (59891.2)

Average Optimal load -j 256 Run (std deviation):
Elapsed Time 239.81 (14.0759)
User Time 3797.7 (422.118)
System Time 2622.74 (225.361)
Percent CPU 2480.9 (446.735)
Context Switches 1.37301e+06 (238886)
Sleeps 1195957 (161659)

Average Maximal load -j Run (std deviation):
Elapsed Time 203.884 (8.59151)
User Time 3578.02 (482.79)
System Time 2759.9 (273.03)
Percent CPU 2663.53 (450.476)
Context Switches 1.33907e+06 (199658)
Sleeps 1119205 (172089)

... The odd thing is that the run with the patch is still less than the
run with CONFIG_RESOURCE_COUNTERS off. It was so odd that I double
checked that I actually built in RESOURCE_COUNTERS and had applied the
patch, both of which I had done.

P.
* Re: [PATCH] Help Resource Counters Scale better (v4.1)
  2009-08-12 10:53 ` Prarit Bhargava
@ 2009-08-12 16:19 ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 20+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-08-12 16:19 UTC (permalink / raw)
To: Prarit Bhargava
Cc: balbir, Andrew Morton, kamezawa.hiroyu, nishimura, kosaki.motohiro,
    menage, andi.kleen, xemul, lizf, linux-kernel, linux-mm

Prarit Bhargava wrote:
> Balbir Singh wrote:
>> Hi, Andrew,
>>
>> Does this look better, could you please replace the older patch with
>> this one.
>>
>> 1. I did a quick compile test
>> 2. Ran scripts/checkpatch.pl
>>
> Andi Kleen suggested I use kernbench to profile the kernel.
>
<snip>
>
> ... The odd thing is that the run with the patch is still less than the
> run with CONFIG_RESOURCE_COUNTERS off. It was so odd that I double
> checked that I actually built in RESOURCE_COUNTERS and had applied the
> patch, both of which I had done.
>
Hmm, CONFIG_RESOURCE_COUNTER actually means CONFIG_MEM_CGROUP ?

Then, more requests from me. (if you can)

Apply patch and set config ON. plz check your bench in following cases.

1. boot with cgroup_disable=memory
   (will have the same effect as config=off)

boot without above option...and
2. Run your bench without mount memcg.
3. Run your bench with memcg...as...
   #mount -tcgroup none /cgroups -omemory
   #mkdir /cgroups/test
   #echo $$ > /cgroups/test
   #kernbench.

Thanks,
-Kame

> P.
>
* Re: [PATCH] Help Resource Counters Scale better (v4.1)
  2009-08-12 4:57 ` [PATCH] Help Resource Counters Scale better (v4.1) Balbir Singh
  2009-08-12 10:53 ` Prarit Bhargava
@ 2009-08-12 16:28 ` KAMEZAWA Hiroyuki
  2009-08-12 17:19 ` Balbir Singh
  2009-08-13 1:03 ` Daisuke Nishimura
  2 siblings, 1 reply; 20+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-08-12 16:28 UTC (permalink / raw)
To: balbir
Cc: Andrew Morton, kamezawa.hiroyu, nishimura, kosaki.motohiro, menage,
    prarit, andi.kleen, xemul, lizf, linux-kernel, linux-mm

Balbir Singh wrote:
> Hi, Andrew,
>
> Does this look better, could you please replace the older patch with
> this one.
>
> 1. I did a quick compile test
> 2. Ran scripts/checkpatch.pl
>
In general, seems reasonable to me as quick hack for root cgroup.
thank you.

Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Finally, we'll have to do some rework for light-weight res_counter.
But yes, it will take some amount of time.

My only concern is account leak, but, if some leak, it's current
code's bug, not this patch.

And..hmm...I like following style other than open-coded.
==
int mem_counter_charge(struct mem_cgroup *mem)
{
	if (mem_cgroup_is_root(mem))
		return 0; // always success
	return res_counter_charge(....)
}
==
But maybe nitpick.

Thanks,
-Kame
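
Fleshed out a little, the wrapper style suggested above might look like
the sketch below. This is hypothetical only: the helper names are
illustrative, and the res_counter arguments are simply lifted from the
call sites in the patch; it is not part of the posted patch.

==
static int mem_counter_charge(struct mem_cgroup *mem,
			      struct res_counter **fail_res,
			      struct res_counter **soft_fail_res)
{
	if (mem_cgroup_is_root(mem))
		return 0;	/* root has no limit, always succeeds */
	return res_counter_charge(&mem->res, PAGE_SIZE, fail_res,
				  soft_fail_res);
}

static void mem_counter_uncharge(struct mem_cgroup *mem)
{
	if (mem_cgroup_is_root(mem))
		return;		/* nothing was charged for root */
	res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
}
==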
> > Here is a sample of my program runs > > Without Patch > > Performance counter stats for '/home/balbir/parallel_pagefault': > > 7192804.124144 task-clock-msecs # 23.937 CPUs > 424691 context-switches # 0.000 M/sec > 267 CPU-migrations # 0.000 M/sec > 28498113 page-faults # 0.004 M/sec > 5826093739340 cycles # 809.989 M/sec > 408883496292 instructions # 0.070 IPC > 7057079452 cache-references # 0.981 M/sec > 3036086243 cache-misses # 0.422 M/sec > > 300.485365680 seconds time elapsed > > With cgroup_disable=memory > > Performance counter stats for '/home/balbir/parallel_pagefault': > > 7182183.546587 task-clock-msecs # 23.915 CPUs > 425458 context-switches # 0.000 M/sec > 203 CPU-migrations # 0.000 M/sec > 92545093 page-faults # 0.013 M/sec > 6034363609986 cycles # 840.185 M/sec > 437204346785 instructions # 0.072 IPC > 6636073192 cache-references # 0.924 M/sec > 2358117732 cache-misses # 0.328 M/sec > > 300.320905827 seconds time elapsed > > With this patch applied > > Performance counter stats for '/home/balbir/parallel_pagefault': > > 7191619.223977 task-clock-msecs # 23.955 CPUs > 422579 context-switches # 0.000 M/sec > 88 CPU-migrations # 0.000 M/sec > 91946060 page-faults # 0.013 M/sec > 5957054385619 cycles # 828.333 M/sec > 1058117350365 instructions # 0.178 IPC > 9161776218 cache-references # 1.274 M/sec > 1920494280 cache-misses # 0.267 M/sec > > 300.218764862 seconds time elapsed > > > Data from Prarit (kernel compile with make -j64 on a 64 > CPU/32G machine) > > For a single run > > Without patch > > real 27m8.988s > user 87m24.916s > sys 382m6.037s > > With patch > > real 4m18.607s > user 84m58.943s > sys 50m52.682s > > > With config turned off > > real 4m54.972s > user 90m13.456s > sys 50m19.711s > > NOTE: The data looks counterintuitive due to the increased performance > with the patch, even over the config being turned off. We probably need > more runs, but so far all testing has shown that the patches definitely > help. > > > Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com> > --- > > mm/memcontrol.c | 89 > +++++++++++++++++++++++++++++++++++++++++++------------ > 1 files changed, 70 insertions(+), 19 deletions(-) > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 48a38e1..4f51885 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -70,6 +70,7 @@ enum mem_cgroup_stat_index { > MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ > MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ > MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ > + MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ > > MEM_CGROUP_STAT_NSTATS, > }; > @@ -478,6 +479,19 @@ mem_cgroup_largest_soft_limit_node(struct > mem_cgroup_tree_per_zone *mctz) > return mz; > } > > +static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, > + bool charge) > +{ > + int val = (charge) ? 
1 : -1; > + struct mem_cgroup_stat *stat = &mem->stat; > + struct mem_cgroup_stat_cpu *cpustat; > + int cpu = get_cpu(); > + > + cpustat = &stat->cpustat[cpu]; > + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); > + put_cpu(); > +} > + > static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, > struct page_cgroup *pc, > bool charge) > @@ -1281,9 +1295,11 @@ static int __mem_cgroup_try_charge(struct mm_struct > *mm, > VM_BUG_ON(css_is_removed(&mem->css)); > > while (1) { > - int ret; > + int ret = 0; > unsigned long flags = 0; > > + if (mem_cgroup_is_root(mem)) > + goto done; > ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res, > &soft_fail_res); > if (likely(!ret)) { > @@ -1343,6 +1359,7 @@ static int __mem_cgroup_try_charge(struct mm_struct > *mm, > if (mem_cgroup_soft_limit_check(mem_over_soft_limit)) > mem_cgroup_update_tree(mem_over_soft_limit, page); > } > +done: > return 0; > nomem: > css_put(&mem->css); > @@ -1415,9 +1432,12 @@ static void __mem_cgroup_commit_charge(struct > mem_cgroup *mem, > lock_page_cgroup(pc); > if (unlikely(PageCgroupUsed(pc))) { > unlock_page_cgroup(pc); > - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); > - if (do_swap_account) > - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); > + if (!mem_cgroup_is_root(mem)) { > + res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); > + if (do_swap_account) > + res_counter_uncharge(&mem->memsw, PAGE_SIZE, > + NULL); > + } > css_put(&mem->css); > return; > } > @@ -1494,7 +1514,8 @@ static int mem_cgroup_move_account(struct > page_cgroup *pc, > if (pc->mem_cgroup != from) > goto out; > > - res_counter_uncharge(&from->res, PAGE_SIZE, NULL); > + if (!mem_cgroup_is_root(from)) > + res_counter_uncharge(&from->res, PAGE_SIZE, NULL); > mem_cgroup_charge_statistics(from, pc, false); > > page = pc->page; > @@ -1513,7 +1534,7 @@ static int mem_cgroup_move_account(struct > page_cgroup *pc, > 1); > } > > - if (do_swap_account) > + if (do_swap_account && !mem_cgroup_is_root(from)) > res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL); > css_put(&from->css); > > @@ -1584,9 +1605,11 @@ uncharge: > /* drop extra refcnt by try_charge() */ > css_put(&parent->css); > /* uncharge if move fails */ > - res_counter_uncharge(&parent->res, PAGE_SIZE, NULL); > - if (do_swap_account) > - res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL); > + if (!mem_cgroup_is_root(parent)) { > + res_counter_uncharge(&parent->res, PAGE_SIZE, NULL); > + if (do_swap_account) > + res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL); > + } > return ret; > } > > @@ -1775,7 +1798,10 @@ __mem_cgroup_commit_charge_swapin(struct page > *page, struct mem_cgroup *ptr, > * This recorded memcg can be obsolete one. 
So, avoid > * calling css_tryget > */ > - res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); > + if (!mem_cgroup_is_root(memcg)) > + res_counter_uncharge(&memcg->memsw, PAGE_SIZE, > + NULL); > + mem_cgroup_swap_statistics(memcg, false); > mem_cgroup_put(memcg); > } > rcu_read_unlock(); > @@ -1800,9 +1826,11 @@ void mem_cgroup_cancel_charge_swapin(struct > mem_cgroup *mem) > return; > if (!mem) > return; > - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); > - if (do_swap_account) > - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); > + if (!mem_cgroup_is_root(mem)) { > + res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); > + if (do_swap_account) > + res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); > + } > css_put(&mem->css); > } > > @@ -1855,9 +1883,14 @@ __mem_cgroup_uncharge_common(struct page *page, > enum charge_type ctype) > break; > } > > - res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); > - if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) > - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); > + if (!mem_cgroup_is_root(mem)) { > + res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); > + if (do_swap_account && > + (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) > + res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); > + } > + if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT && mem_cgroup_is_root(mem)) > + mem_cgroup_swap_statistics(mem, true); > mem_cgroup_charge_statistics(mem, pc, false); > > ClearPageCgroupUsed(pc); > @@ -1948,7 +1981,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) > * We uncharge this because swap is freed. > * This memcg can be obsolete one. We avoid calling css_tryget > */ > - res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); > + if (!mem_cgroup_is_root(memcg)) > + res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); > + mem_cgroup_swap_statistics(memcg, false); > mem_cgroup_put(memcg); > } > rcu_read_unlock(); > @@ -2461,10 +2496,26 @@ static u64 mem_cgroup_read(struct cgroup *cont, > struct cftype *cft) > name = MEMFILE_ATTR(cft->private); > switch (type) { > case _MEM: > - val = res_counter_read_u64(&mem->res, name); > + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { > + val = mem_cgroup_read_stat(&mem->stat, > + MEM_CGROUP_STAT_CACHE); > + val += mem_cgroup_read_stat(&mem->stat, > + MEM_CGROUP_STAT_RSS); > + val <<= PAGE_SHIFT; > + } else > + val = res_counter_read_u64(&mem->res, name); > break; > case _MEMSWAP: > - val = res_counter_read_u64(&mem->memsw, name); > + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { > + val = mem_cgroup_read_stat(&mem->stat, > + MEM_CGROUP_STAT_CACHE); > + val += mem_cgroup_read_stat(&mem->stat, > + MEM_CGROUP_STAT_RSS); > + val += mem_cgroup_read_stat(&mem->stat, > + MEM_CGROUP_STAT_SWAPOUT); > + val <<= PAGE_SHIFT; > + } else > + val = res_counter_read_u64(&mem->memsw, name); > break; > default: > BUG(); > > -- > Balbir > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 20+ messages in thread
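To make the helper style KAMEZAWA suggests above a little more concrete, a minimal sketch might look like the following. The helper names are hypothetical, the argument shapes are inferred from the quoted diff, and the soft-limit bookkeeping argument of res_counter_uncharge() is simply passed as NULL for brevity:

==
/*
 * Sketch only: centralize the root-cgroup special case instead of
 * open-coding mem_cgroup_is_root() at every charge/uncharge site.
 */
static int mem_cgroup_res_charge(struct mem_cgroup *mem,
                                 struct res_counter *counter,
                                 struct res_counter **fail_res,
                                 struct res_counter **soft_fail_res)
{
        if (mem_cgroup_is_root(mem))
                return 0;       /* root has no limit; charging always succeeds */
        return res_counter_charge(counter, PAGE_SIZE, fail_res, soft_fail_res);
}

static void mem_cgroup_res_uncharge(struct mem_cgroup *mem,
                                    struct res_counter *counter)
{
        if (mem_cgroup_is_root(mem))
                return;         /* nothing was charged for root */
        res_counter_uncharge(counter, PAGE_SIZE, NULL);
}
==

Beyond tidying the call sites, a wrapper of this shape would also give a single point at which a lighter-weight counter could later be substituted.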
* Re: [PATCH] Help Resource Counters Scale better (v4.1) 2009-08-12 16:28 ` KAMEZAWA Hiroyuki @ 2009-08-12 17:19 ` Balbir Singh 0 siblings, 0 replies; 20+ messages in thread From: Balbir Singh @ 2009-08-12 17:19 UTC (permalink / raw) To: KAMEZAWA Hiroyuki Cc: Andrew Morton, nishimura, kosaki.motohiro, menage, prarit, andi.kleen, xemul, lizf, linux-kernel, linux-mm * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2009-08-13 01:28:57]: > Balbir Singh wrote: > > Hi, Andrew, > > > > Does this look better, could you please replace the older patch with > > this one. > > > > 1. I did a quick compile test > > 2. Ran scripts/checkpatch.pl > > > > In general, seems reasonable to me as quick hack for root cgroup. > thank you. > > Reviewed-by: KAMEAZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> > Thanks, yes, we still need to do the percpu counter work, but this will give us breathing space to do it correctly and define strict and non-strict accounting. > Finally, we'll have to do some rework for light-weight res_counter. > But yes, it will take some amount of time. > My only concern is account leak, but, if some leak, it's current code's > bug, not this patch. > Yeah.. we'll need to check for that. > And..hmm...I like following style other than open-coded. > == > int mem_coutner_charge(struct mem_cgroup *mem) > { > if (mem_cgroup_is_root(mem)) > return 0; // always success > return res_counter_charge(....) > } > == > But maybe nitpick. > Yes, we can refactor for simplication, but we'll need some exceptions. I'll do that as an add-on patch. -- Balbir -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 20+ messages in thread
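For background on the percpu-counter direction Balbir mentions, a rough and purely illustrative sketch of an approximate ("non-strict") usage count built on the kernel's existing percpu_counter API could look like this (the structure and function names below are made up for illustration and are not part of this series):

==
#include <linux/percpu_counter.h>

/* Illustrative only: an approximate, cheap-to-update usage counter. */
struct approx_usage {
        struct percpu_counter pages;    /* per-cpu batched page count */
};

static int approx_usage_init(struct approx_usage *u)
{
        return percpu_counter_init(&u->pages, 0);
}

static void approx_usage_charge(struct approx_usage *u, long nr_pages)
{
        /* stays in the per-cpu delta until it exceeds the batch size */
        percpu_counter_add(&u->pages, nr_pages);
}

static u64 approx_usage_read_bytes(struct approx_usage *u)
{
        /* may lag the true value by up to nr_cpus * batch pages */
        return (u64)percpu_counter_read_positive(&u->pages) << PAGE_SHIFT;
}
==

The catch, and presumably the reason for distinguishing strict from non-strict accounting, is that a reading obtained this way is only approximate, so hard-limit enforcement cannot rely on it directly.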
* Re: [PATCH] Help Resource Counters Scale better (v4.1) 2009-08-12 4:57 ` [PATCH] Help Resource Counters Scale better (v4.1) Balbir Singh 2009-08-12 10:53 ` Prarit Bhargava 2009-08-12 16:28 ` KAMEZAWA Hiroyuki @ 2009-08-13 1:03 ` Daisuke Nishimura 2009-08-13 3:33 ` Balbir Singh 2 siblings, 1 reply; 20+ messages in thread From: Daisuke Nishimura @ 2009-08-13 1:03 UTC (permalink / raw) To: balbir Cc: Andrew Morton, kamezawa.hiroyu, kosaki.motohiro, menage, prarit, andi.kleen, xemul, lizf, nishimura, linux-kernel, linux-mm > @@ -1855,9 +1883,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) > break; > } > > - res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); > - if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) > - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); > + if (!mem_cgroup_is_root(mem)) { > + res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); > + if (do_swap_account && > + (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) > + res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); > + } > + if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT && mem_cgroup_is_root(mem)) > + mem_cgroup_swap_statistics(mem, true); I think mem_cgroup_is_root(mem) would be unnecessary here. Otherwise, MEM_CGROUP_STAT_SWAPOUT of groups except root memcgroup wouldn't be counted properly. > @@ -2461,10 +2496,26 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) > name = MEMFILE_ATTR(cft->private); > switch (type) { > case _MEM: > - val = res_counter_read_u64(&mem->res, name); > + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { > + val = mem_cgroup_read_stat(&mem->stat, > + MEM_CGROUP_STAT_CACHE); > + val += mem_cgroup_read_stat(&mem->stat, > + MEM_CGROUP_STAT_RSS); > + val <<= PAGE_SHIFT; > + } else > + val = res_counter_read_u64(&mem->res, name); > break; > case _MEMSWAP: > - val = res_counter_read_u64(&mem->memsw, name); > + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { > + val = mem_cgroup_read_stat(&mem->stat, > + MEM_CGROUP_STAT_CACHE); > + val += mem_cgroup_read_stat(&mem->stat, > + MEM_CGROUP_STAT_RSS); > + val += mem_cgroup_read_stat(&mem->stat, > + MEM_CGROUP_STAT_SWAPOUT); > + val <<= PAGE_SHIFT; > + } else > + val = res_counter_read_u64(&mem->memsw, name); > break; > default: > BUG(); Considering use_hierarchy==1 case in the root memcgroup, shouldn't we use mem_cgroup_walk_tree() here to sum up all the children's usage ? *.usage_in_bytes show sum of all the children's usage now if use_hierarchy==1. Thanks, Daisuke Nishimura. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 20+ messages in thread
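To illustrate the use_hierarchy point, the root-group read could be structured roughly as below. This is only a sketch: the callback name is made up, and the mem_cgroup_walk_tree() calling convention is assumed to be the (root, data, callback) form used elsewhere in this series:

==
/* Sketch: fold each visited group's local stats into a running total. */
static int mem_cgroup_local_usage_cb(struct mem_cgroup *mem, void *data)
{
        s64 *usage = data;

        *usage += mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
        *usage += mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
        return 0;
}

        /* in mem_cgroup_read(), _MEM case, for the root group: */
        if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
                s64 usage = 0;

                /* with use_hierarchy=1 this also visits every descendant */
                mem_cgroup_walk_tree(mem, &usage, mem_cgroup_local_usage_cb);
                val = usage << PAGE_SHIFT;
        }
==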
* Re: [PATCH] Help Resource Counters Scale better (v4.1) 2009-08-13 1:03 ` Daisuke Nishimura @ 2009-08-13 3:33 ` Balbir Singh 2009-08-13 5:08 ` Daisuke Nishimura 0 siblings, 1 reply; 20+ messages in thread From: Balbir Singh @ 2009-08-13 3:33 UTC (permalink / raw) To: Daisuke Nishimura Cc: Andrew Morton, kamezawa.hiroyu, kosaki.motohiro, menage, prarit, andi.kleen, xemul, lizf, linux-kernel, linux-mm * nishimura@mxp.nes.nec.co.jp <nishimura@mxp.nes.nec.co.jp> [2009-08-13 10:03:50]: > > @@ -1855,9 +1883,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) > > break; > > } > > > > - res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); > > - if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) > > - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); > > + if (!mem_cgroup_is_root(mem)) { > > + res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); > > + if (do_swap_account && > > + (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) > > + res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); > > + } > > + if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT && mem_cgroup_is_root(mem)) > > + mem_cgroup_swap_statistics(mem, true); > I think mem_cgroup_is_root(mem) would be unnecessary here. > Otherwise, MEM_CGROUP_STAT_SWAPOUT of groups except root memcgroup wouldn't > be counted properly. > I think you have a valid point, but it will not impact us currently since we use SWAPOUT only for root accounting. > > > @@ -2461,10 +2496,26 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) > > name = MEMFILE_ATTR(cft->private); > > switch (type) { > > case _MEM: > > - val = res_counter_read_u64(&mem->res, name); > > + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { > > + val = mem_cgroup_read_stat(&mem->stat, > > + MEM_CGROUP_STAT_CACHE); > > + val += mem_cgroup_read_stat(&mem->stat, > > + MEM_CGROUP_STAT_RSS); > > + val <<= PAGE_SHIFT; > > + } else > > + val = res_counter_read_u64(&mem->res, name); > > break; > > case _MEMSWAP: > > - val = res_counter_read_u64(&mem->memsw, name); > > + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { > > + val = mem_cgroup_read_stat(&mem->stat, > > + MEM_CGROUP_STAT_CACHE); > > + val += mem_cgroup_read_stat(&mem->stat, > > + MEM_CGROUP_STAT_RSS); > > + val += mem_cgroup_read_stat(&mem->stat, > > + MEM_CGROUP_STAT_SWAPOUT); > > + val <<= PAGE_SHIFT; > > + } else > > + val = res_counter_read_u64(&mem->memsw, name); > > break; > > default: > > BUG(); > Considering use_hierarchy==1 case in the root memcgroup, shouldn't we use > mem_cgroup_walk_tree() here to sum up all the children's usage ? > *.usage_in_bytes show sum of all the children's usage now if use_hierarchy==1. If memory.use_hiearchy=1, we should use total_stats..right. Let me send out a newer version for review. -- Balbir -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 20+ messages in thread
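For the SWAPOUT point, if the statistic is later maintained for every group rather than only for root, the uncharge-path hunk quoted above would presumably shrink to something like (sketch only):

==
        /* count swap-out for every group, not only for root */
        if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
                mem_cgroup_swap_statistics(mem, true);
==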
* Re: [PATCH] Help Resource Counters Scale better (v4.1) 2009-08-13 3:33 ` Balbir Singh @ 2009-08-13 5:08 ` Daisuke Nishimura 0 siblings, 0 replies; 20+ messages in thread From: Daisuke Nishimura @ 2009-08-13 5:08 UTC (permalink / raw) To: balbir Cc: Andrew Morton, kamezawa.hiroyu, kosaki.motohiro, menage, prarit, andi.kleen, xemul, lizf, nishimura, linux-kernel, linux-mm On Thu, 13 Aug 2009 09:03:35 +0530, Balbir Singh <balbir@linux.vnet.ibm.com> wrote: > * nishimura@mxp.nes.nec.co.jp <nishimura@mxp.nes.nec.co.jp> [2009-08-13 10:03:50]: > > > > @@ -1855,9 +1883,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) > > > break; > > > } > > > > > > - res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); > > > - if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) > > > - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); > > > + if (!mem_cgroup_is_root(mem)) { > > > + res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); > > > + if (do_swap_account && > > > + (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) > > > + res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); > > > + } > > > + if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT && mem_cgroup_is_root(mem)) > > > + mem_cgroup_swap_statistics(mem, true); > > I think mem_cgroup_is_root(mem) would be unnecessary here. > > Otherwise, MEM_CGROUP_STAT_SWAPOUT of groups except root memcgroup wouldn't > > be counted properly. > > > > I think you have a valid point, but it will not impact us currently > since we use SWAPOUT only for root accounting. > > > > > > @@ -2461,10 +2496,26 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) > > > name = MEMFILE_ATTR(cft->private); > > > switch (type) { > > > case _MEM: > > > - val = res_counter_read_u64(&mem->res, name); > > > + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { > > > + val = mem_cgroup_read_stat(&mem->stat, > > > + MEM_CGROUP_STAT_CACHE); > > > + val += mem_cgroup_read_stat(&mem->stat, > > > + MEM_CGROUP_STAT_RSS); > > > + val <<= PAGE_SHIFT; > > > + } else > > > + val = res_counter_read_u64(&mem->res, name); > > > break; > > > case _MEMSWAP: > > > - val = res_counter_read_u64(&mem->memsw, name); > > > + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { > > > + val = mem_cgroup_read_stat(&mem->stat, > > > + MEM_CGROUP_STAT_CACHE); > > > + val += mem_cgroup_read_stat(&mem->stat, > > > + MEM_CGROUP_STAT_RSS); > > > + val += mem_cgroup_read_stat(&mem->stat, > > > + MEM_CGROUP_STAT_SWAPOUT); > > > + val <<= PAGE_SHIFT; > > > + } else > > > + val = res_counter_read_u64(&mem->memsw, name); > > > break; > > > default: > > > BUG(); > > Considering use_hierarchy==1 case in the root memcgroup, shouldn't we use > > mem_cgroup_walk_tree() here to sum up all the children's usage ? > > *.usage_in_bytes show sum of all the children's usage now if use_hierarchy==1. > > If memory.use_hiearchy=1, we should use total_stats..right. Let me > send out a newer version for review. > BTW, I don't think the patch title is suitable for this patch. This patch doesn't make res_counter scalable at all :) Of course, I think the patch using percpu_counter for scalability is also important. Thanks, Daisuke Nishimura. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2009-08-13 5:11 UTC | newest]

Thread overview: 20+ messages
2009-08-11 14:44 Help Resource Counters Scale better (v4) Balbir Singh
2009-08-11 14:54 ` Prarit Bhargava
2009-08-11 15:00 ` Balbir Singh
2009-08-11 15:11 ` Prarit Bhargava
2009-08-11 15:14 ` Prarit Bhargava
2009-08-11 15:20 ` Andi Kleen
2009-08-11 15:21 ` Prarit Bhargava
2009-08-11 16:52 ` Balbir Singh
2009-08-11 23:31 ` Andrew Morton
2009-08-12 2:34 ` Daisuke Nishimura
2009-08-12 3:56 ` Balbir Singh
2009-08-12 3:57 ` Balbir Singh
2009-08-12 4:57 ` [PATCH] Help Resource Counters Scale better (v4.1) Balbir Singh
2009-08-12 10:53 ` Prarit Bhargava
2009-08-12 16:19 ` KAMEZAWA Hiroyuki
2009-08-12 16:28 ` KAMEZAWA Hiroyuki
2009-08-12 17:19 ` Balbir Singh
2009-08-13 1:03 ` Daisuke Nishimura
2009-08-13 3:33 ` Balbir Singh
2009-08-13 5:08 ` Daisuke Nishimura