linux-mm.kvack.org archive mirror
* [RFC][PATCH] memcg remove css_get/put per pages
@ 2010-06-08  3:19 KAMEZAWA Hiroyuki
  2010-06-08  5:40 ` Balbir Singh
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-06-08  3:19 UTC (permalink / raw)
  To: linux-mm; +Cc: balbir, nishimura

Now, I think pre_destroy->force_empty() works very well and we can get rid of
css_get/put per page. This has a very big effect in some special cases.

This is a test result with a multi-threaded page fault program
(the one I used in the rwsem discussion).

[Before patch]
   25.72%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
     8.18%  multi-fault-all  [kernel.kallsyms]      [k] try_get_mem_cgroup_from_mm
     8.17%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
     8.03%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
     5.46%  multi-fault-all  [kernel.kallsyms]      [k] __css_put
     5.45%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
     4.36%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
     4.35%  multi-fault-all  [kernel.kallsyms]      [k] up_read
     3.59%  multi-fault-all  [kernel.kallsyms]      [k] css_put
     2.37%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock
     1.80%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
     1.78%  multi-fault-all  [kernel.kallsyms]      [k] __rmqueue
     1.65%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault

try_get_mem_cgroup_from_mm() is one of the heaviest ops because of false sharing
on the css's counter used by css_get/put.
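
Roughly, the per-page pattern is this (an illustrative sketch, not the exact
kernel code; css_get()/css_put() boil down to atomic ops on one counter shared
by every page charged to the cgroup):

	/* at every page fault, on every CPU: */
	mem = try_get_mem_cgroup_from_mm(mm);	/* atomic inc of mem->css refcnt */
	/* ... charge the page to mem ... */

	/* later, at uncharge, possibly on another CPU: */
	css_put(&mem->css);			/* atomic dec of the same counter */

With many threads faulting in parallel, every fault does an atomic RMW on the
same cache line, so that line bounces between CPUs.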

I removed that.

[After]
   26.16%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
    11.73%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock
     9.23%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
     9.07%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
     6.09%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
     5.57%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
     4.86%  multi-fault-all  [kernel.kallsyms]      [k] up_read
     2.54%  multi-fault-all  [kernel.kallsyms]      [k] __mem_cgroup_commit_charge
     2.29%  multi-fault-all  [kernel.kallsyms]      [k] _cond_resched
     2.04%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
     1.82%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault

Hmm, seems nice. But I'm not convinced my patch has no races.
I'll continue testing, but your help is welcome.

==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Now, memory cgroup increments the css (cgroup subsys state) reference
count per charged page, and the reference count is kept until
the page is uncharged. But this has two bad effects.

 1. Because css_get/put calls atomic_inc()/dec(), heavy use of them
    on a large SMP system will not scale well.
 2. Because the css's refcnt can never reach a "ready-to-release" state,
    cgroup's notify_on_release handler can't work with memcg.

This is a trial to remove the css refcnt per page. Even if we remove the
refcnt, pre_destroy() does enough synchronization.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 mm/memcontrol.c |   66 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 20 deletions(-)

Index: mmotm-2.6.34-Jun6/mm/memcontrol.c
===================================================================
--- mmotm-2.6.34-Jun6.orig/mm/memcontrol.c
+++ mmotm-2.6.34-Jun6/mm/memcontrol.c
@@ -1717,25 +1717,49 @@ static int __mem_cgroup_try_charge(struc
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	if (*memcg) {
+	if (!*memcg && !mm)
+		goto bypass;
+again:
+	if (*memcg) { /* css should be a valid one */
 		mem = *memcg;
+		VM_BUG_ON(css_is_removed(mem));
+		if (mem_cgroup_is_root(mem))
+			goto done;
+		if (consume_stock(mem))
+			goto done;
 		css_get(&mem->css);
 	} else {
-		mem = try_get_mem_cgroup_from_mm(mm);
-		if (unlikely(!mem))
-			return 0;
-		*memcg = mem;
-	}
+		struct task_struct *p;
 
-	VM_BUG_ON(css_is_removed(&mem->css));
-	if (mem_cgroup_is_root(mem))
-		goto done;
+		rcu_read_lock();
+		p = rcu_dereference(mm->owner);
+		VM_BUG_ON(!p);
+		/*
+ 		 * while task_lock, this task cannot be disconnected with
+ 		 * the cgroup we see.
+ 		 */
+		task_lock(p);
+		mem = mem_cgroup_from_task(p);
+		VM_BUG_ON(!mem);
+		if (mem_cgroup_is_root(mem)) {
+			task_unlock(p);
+			rcu_read_unlock();
+			goto done;
+		}
+		if (consume_stock(mem)) {
+			*memcg = mem;
+			task_unlock(p);
+			rcu_read_unlock();
+			goto done;
+		}
+		css_get(&mem->css);
+		task_unlock(p);
+		rcu_read_unlock();
+	}
 
 	do {
 		bool oom_check;
 
-		if (consume_stock(mem))
-			goto done; /* don't need to fill stock */
 		/* If killed, bypass charge */
 		if (fatal_signal_pending(current))
 			goto bypass;
@@ -1750,10 +1774,13 @@ static int __mem_cgroup_try_charge(struc
 
 		switch (ret) {
 		case CHARGE_OK:
+			*memcg = mem;
 			break;
 		case CHARGE_RETRY: /* not in OOM situation but retry */
 			csize = PAGE_SIZE;
-			break;
+			css_put(&mem->css);
+			mem = NULL;
+			goto again;
 		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
@@ -1769,6 +1796,7 @@ static int __mem_cgroup_try_charge(struc
 
 	if (csize > PAGE_SIZE)
 		refill_stock(mem, csize - PAGE_SIZE);
+	css_put(&mem->css);
 done:
 	return 0;
 nomem:
@@ -1795,7 +1823,6 @@ static void __mem_cgroup_cancel_charge(s
 			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
 		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
 		WARN_ON_ONCE(count > INT_MAX);
-		__css_put(&mem->css, (int)count);
 	}
 	/* we don't need css_put for root */
 }
@@ -2158,7 +2185,6 @@ int mem_cgroup_try_charge_swapin(struct 
 		goto charge_cur_mm;
 	*ptr = mem;
 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
-	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
@@ -2345,9 +2371,6 @@ __mem_cgroup_uncharge_common(struct page
 	unlock_page_cgroup(pc);
 
 	memcg_check_events(mem, page);
-	/* at swapout, this memcg will be accessed to record to swap */
-	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		css_put(&mem->css);
 
 	return mem;
 
@@ -2432,14 +2455,18 @@ mem_cgroup_uncharge_swapcache(struct pag
 	if (!swapout) /* this was a swap cache but the swap is unused ! */
 		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
 
-	memcg = __mem_cgroup_uncharge_common(page, ctype);
+	memcg = try_get_mem_cgroup_from_page(page);
+	if (!memcg)
+		return;
+
+	__mem_cgroup_uncharge_common(page, ctype);
 
 	/* record memcg information */
 	if (do_swap_account && swapout && memcg) {
 		swap_cgroup_record(ent, css_id(&memcg->css));
 		mem_cgroup_get(memcg);
 	}
-	if (swapout && memcg)
+	if (memcg)
 		css_put(&memcg->css);
 }
 #endif
@@ -4219,7 +4246,6 @@ static int mem_cgroup_do_precharge(unsig
 		mc.precharge += count;
 		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
 		WARN_ON_ONCE(count > INT_MAX);
-		__css_get(&mem->css, (int)count);
 		return ret;
 	}
 one_by_one:


* Re: [RFC][PATCH] memcg remove css_get/put per pages
  2010-06-08  3:19 [RFC][PATCH] memcg remove css_get/put per pages KAMEZAWA Hiroyuki
@ 2010-06-08  5:40 ` Balbir Singh
  2010-06-09  0:47   ` KAMEZAWA Hiroyuki
  2010-06-08  7:31 ` Daisuke Nishimura
  2010-06-09  6:59 ` [RFC][PATCH] memcg remove css_get/put per pages v2 KAMEZAWA Hiroyuki
  2 siblings, 1 reply; 15+ messages in thread
From: Balbir Singh @ 2010-06-08  5:40 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, nishimura

* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-06-08 12:19:01]:

> Now, I think pre_destroy->force_empty() works very well and we can get rid of
> css_put/get per pages. This has very big effect in some special case.
> 
> This is a test result with a multi-thread page fault program
> (I used at rwsem discussion.)
> 
> [Before patch]
>    25.72%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
>      8.18%  multi-fault-all  [kernel.kallsyms]      [k] try_get_mem_cgroup_from_mm
>      8.17%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
>      8.03%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
>      5.46%  multi-fault-all  [kernel.kallsyms]      [k] __css_put
>      5.45%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
>      4.36%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
>      4.35%  multi-fault-all  [kernel.kallsyms]      [k] up_read
>      3.59%  multi-fault-all  [kernel.kallsyms]      [k] css_put
>      2.37%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock
>      1.80%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
>      1.78%  multi-fault-all  [kernel.kallsyms]      [k] __rmqueue
>      1.65%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
> 
> try_get_mem_cgroup_from_mm() is a one of heavy ops because of false-sharing in
> css's counter for css_get/put.
> 
> I removed that.
> 
> [After]
>    26.16%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
>     11.73%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock
>      9.23%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
>      9.07%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
>      6.09%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
>      5.57%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
>      4.86%  multi-fault-all  [kernel.kallsyms]      [k] up_read
>      2.54%  multi-fault-all  [kernel.kallsyms]      [k] __mem_cgroup_commit_charge
>      2.29%  multi-fault-all  [kernel.kallsyms]      [k] _cond_resched
>      2.04%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
>      1.82%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
> 
> Hmm. seems nice. But I don't convince my patch has no race.
> I'll continue test but your help is welcome.
>

Looks nice. Kamezawa-San, could you please confirm the source of
raw_spin_lock_irqsave and trylock from /proc/lock_stat?
 
> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> Now, memory cgroup increments css(cgroup subsys state)'s reference
> count per a charged page. And the reference count is kept until
> the page is uncharged. But this has 2 bad effect. 
> 
>  1. Because css_get/put calls atoimic_inc()/dec, heavy call of them
>     on large smp will not scale well.
>  2. Because css's refcnt cannot be in a state as "ready-to-release",
>     cgroup's notify_on_release handler can't work with memcg.
> 
> This is a trial to remove css's refcnt per a page. Even if we remove
                                             ^^ (per page)
> refcnt, pre_destroy() does enough synchronization.

Could you also document what the rules for css_get/put now become? I
like the idea, but I am not sure if I understand the new rules
correctly by looking at the code.


-- 
	Three Cheers,
	Balbir


* Re: [RFC][PATCH] memcg remove css_get/put per pages
  2010-06-08  3:19 [RFC][PATCH] memcg remove css_get/put per pages KAMEZAWA Hiroyuki
  2010-06-08  5:40 ` Balbir Singh
@ 2010-06-08  7:31 ` Daisuke Nishimura
  2010-06-09  0:54   ` KAMEZAWA Hiroyuki
  2010-06-09  6:59 ` [RFC][PATCH] memcg remove css_get/put per pages v2 KAMEZAWA Hiroyuki
  2 siblings, 1 reply; 15+ messages in thread
From: Daisuke Nishimura @ 2010-06-08  7:31 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, Daisuke Nishimura

On Tue, 8 Jun 2010 12:19:01 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> Now, I think pre_destroy->force_empty() works very well and we can get rid of
> css_put/get per pages. This has very big effect in some special case.
> 
> This is a test result with a multi-thread page fault program
> (I used at rwsem discussion.)
> 
> [Before patch]
>    25.72%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
>      8.18%  multi-fault-all  [kernel.kallsyms]      [k] try_get_mem_cgroup_from_mm
>      8.17%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
>      8.03%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
>      5.46%  multi-fault-all  [kernel.kallsyms]      [k] __css_put
>      5.45%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
>      4.36%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
>      4.35%  multi-fault-all  [kernel.kallsyms]      [k] up_read
>      3.59%  multi-fault-all  [kernel.kallsyms]      [k] css_put
>      2.37%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock
>      1.80%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
>      1.78%  multi-fault-all  [kernel.kallsyms]      [k] __rmqueue
>      1.65%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
> 
> try_get_mem_cgroup_from_mm() is a one of heavy ops because of false-sharing in
> css's counter for css_get/put.
> 
I'm sorry, what do you mean by "false-sharing" ?
And I think it would be better to add these performance data to the commit log.

> I removed that.
> 
> [After]
>    26.16%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
>     11.73%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock
>      9.23%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
>      9.07%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
>      6.09%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
>      5.57%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
>      4.86%  multi-fault-all  [kernel.kallsyms]      [k] up_read
>      2.54%  multi-fault-all  [kernel.kallsyms]      [k] __mem_cgroup_commit_charge
>      2.29%  multi-fault-all  [kernel.kallsyms]      [k] _cond_resched
>      2.04%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
>      1.82%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
> 
> Hmm. seems nice. But I don't convince my patch has no race.
> I'll continue test but your help is welcome.
> 
> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> Now, memory cgroup increments css(cgroup subsys state)'s reference
> count per a charged page. And the reference count is kept until
> the page is uncharged. But this has 2 bad effect. 
> 
>  1. Because css_get/put calls atoimic_inc()/dec, heavy call of them
>     on large smp will not scale well.
I'm sorry if I'm asking a stupid question, but the number of css_get/put calls
would be:

	before:
		get:1 in charge
		put:1 in uncharge
	after:
		get:1, put:1 in charge
		no get/put in uncharge

right ?
Then, isn't there any change as a whole ?

>  2. Because css's refcnt cannot be in a state as "ready-to-release",
>     cgroup's notify_on_release handler can't work with memcg.
> 
Yes, 2 is one of the weak points of memcg, IMHO.

> This is a trial to remove css's refcnt per a page. Even if we remove
> refcnt, pre_destroy() does enough synchronization.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
>  mm/memcontrol.c |   66 +++++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 46 insertions(+), 20 deletions(-)
> 
> Index: mmotm-2.6.34-Jun6/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.34-Jun6.orig/mm/memcontrol.c
> +++ mmotm-2.6.34-Jun6/mm/memcontrol.c
> @@ -1717,25 +1717,49 @@ static int __mem_cgroup_try_charge(struc
>  	 * thread group leader migrates. It's possible that mm is not
>  	 * set, if so charge the init_mm (happens for pagecache usage).
>  	 */
> -	if (*memcg) {
> +	if (!*memcg && !mm)
> +		goto bypass;
Shouldn't it be VM_BUG_ON(!*memcg && !mm) ?

> +again:
> +	if (*memcg) { /* css should be a valid one */
>  		mem = *memcg;
> +		VM_BUG_ON(css_is_removed(mem));
> +		if (mem_cgroup_is_root(mem))
> +			goto done;
> +		if (consume_stock(mem))
> +			goto done;
>  		css_get(&mem->css);
>  	} else {
> -		mem = try_get_mem_cgroup_from_mm(mm);
> -		if (unlikely(!mem))
> -			return 0;
> -		*memcg = mem;
> -	}
> +		struct task_struct *p;
>  
> -	VM_BUG_ON(css_is_removed(&mem->css));
> -	if (mem_cgroup_is_root(mem))
> -		goto done;
> +		rcu_read_lock();
> +		p = rcu_dereference(mm->owner);
> +		VM_BUG_ON(!p);
> +		/*
> + 		 * while task_lock, this task cannot be disconnected with
> + 		 * the cgroup we see.
> + 		 */
> +		task_lock(p);
> +		mem = mem_cgroup_from_task(p);
> +		VM_BUG_ON(!mem);
> +		if (mem_cgroup_is_root(mem)) {
Shouldn't we do "*memcg = mem" here ?
hmm, how about doing:

	done:
		*memcg = mem;
		return 0;

instead of doing "*memcg = mem" in some places ?

> +			task_unlock(p);
> +			rcu_read_unlock();
> +			goto done;
> +		}
> +		if (consume_stock(mem)) {
> +			*memcg = mem;
> +			task_unlock(p);
> +			rcu_read_unlock();
> +			goto done;
> +		}
> +		css_get(&mem->css);
> +		task_unlock(p);
> +		rcu_read_unlock();
> +	}
>  
>  	do {
>  		bool oom_check;
>  
> -		if (consume_stock(mem))
> -			goto done; /* don't need to fill stock */
>  		/* If killed, bypass charge */
>  		if (fatal_signal_pending(current))
>  			goto bypass;
> @@ -1750,10 +1774,13 @@ static int __mem_cgroup_try_charge(struc
>  
>  		switch (ret) {
>  		case CHARGE_OK:
> +			*memcg = mem;
>  			break;
>  		case CHARGE_RETRY: /* not in OOM situation but retry */
>  			csize = PAGE_SIZE;
> -			break;
> +			css_put(&mem->css);
> +			mem = NULL;
> +			goto again;
>  		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
>  			goto nomem;
>  		case CHARGE_NOMEM: /* OOM routine works */
> @@ -1769,6 +1796,7 @@ static int __mem_cgroup_try_charge(struc
>  
>  	if (csize > PAGE_SIZE)
>  		refill_stock(mem, csize - PAGE_SIZE);
> +	css_put(&mem->css);
>  done:
>  	return 0;
>  nomem:
> @@ -1795,7 +1823,6 @@ static void __mem_cgroup_cancel_charge(s
>  			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
>  		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
>  		WARN_ON_ONCE(count > INT_MAX);
> -		__css_put(&mem->css, (int)count);
>  	}
>  	/* we don't need css_put for root */
>  }
These VM_BUG_ON() and WARN_ON_ONCE() will be unnecessary, too.

> @@ -2158,7 +2185,6 @@ int mem_cgroup_try_charge_swapin(struct 
>  		goto charge_cur_mm;
>  	*ptr = mem;
>  	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
> -	/* drop extra refcnt from tryget */
>  	css_put(&mem->css);
>  	return ret;
>  charge_cur_mm:
> @@ -2345,9 +2371,6 @@ __mem_cgroup_uncharge_common(struct page
>  	unlock_page_cgroup(pc);
>  
>  	memcg_check_events(mem, page);
> -	/* at swapout, this memcg will be accessed to record to swap */
> -	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
> -		css_put(&mem->css);
>  
>  	return mem;
>  
> @@ -2432,14 +2455,18 @@ mem_cgroup_uncharge_swapcache(struct pag
>  	if (!swapout) /* this was a swap cache but the swap is unused ! */
>  		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
>  
> -	memcg = __mem_cgroup_uncharge_common(page, ctype);
> +	memcg = try_get_mem_cgroup_from_page(page);
> +	if (!memcg)
> +		return;
> +
> +	__mem_cgroup_uncharge_common(page, ctype);
>  
>  	/* record memcg information */
>  	if (do_swap_account && swapout && memcg) {
>  		swap_cgroup_record(ent, css_id(&memcg->css));
>  		mem_cgroup_get(memcg);
>  	}
> -	if (swapout && memcg)
> +	if (memcg)
>  		css_put(&memcg->css);
>  }
>  #endif
"if (memcg)" is unnecessary(it's checked above).

> @@ -4219,7 +4246,6 @@ static int mem_cgroup_do_precharge(unsig
>  		mc.precharge += count;
>  		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
>  		WARN_ON_ONCE(count > INT_MAX);
> -		__css_get(&mem->css, (int)count);
>  		return ret;
>  	}
>  one_by_one:
> 
ditto.

IIUC this patch, we should remove css_put() in mem_cgroup_move_swap_account()
and __css_put() in mem_cgroup_clear_mc() too, and modify some comments.
Anyway, we must test these changes carefully.


Thanks,
Daisuke Nishimura.


* Re: [RFC][PATCH] memcg remove css_get/put per pages
  2010-06-08  5:40 ` Balbir Singh
@ 2010-06-09  0:47   ` KAMEZAWA Hiroyuki
  2010-06-09  5:14     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-06-09  0:47 UTC (permalink / raw)
  To: balbir; +Cc: linux-mm, nishimura

On Tue, 8 Jun 2010 11:10:04 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-06-08 12:19:01]:
> 
> > Now, I think pre_destroy->force_empty() works very well and we can get rid of
> > css_put/get per pages. This has very big effect in some special case.
> > 
> > This is a test result with a multi-thread page fault program
> > (I used at rwsem discussion.)
> > 
> > [Before patch]
> >    25.72%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
> >      8.18%  multi-fault-all  [kernel.kallsyms]      [k] try_get_mem_cgroup_from_mm
> >      8.17%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
> >      8.03%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
> >      5.46%  multi-fault-all  [kernel.kallsyms]      [k] __css_put
> >      5.45%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
> >      4.36%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
> >      4.35%  multi-fault-all  [kernel.kallsyms]      [k] up_read
> >      3.59%  multi-fault-all  [kernel.kallsyms]      [k] css_put
> >      2.37%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock
> >      1.80%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
> >      1.78%  multi-fault-all  [kernel.kallsyms]      [k] __rmqueue
> >      1.65%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
> > 
> > try_get_mem_cgroup_from_mm() is a one of heavy ops because of false-sharing in
> > css's counter for css_get/put.
> > 
> > I removed that.
> > 
> > [After]
> >    26.16%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
> >     11.73%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock
> >      9.23%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
> >      9.07%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
> >      6.09%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
> >      5.57%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
> >      4.86%  multi-fault-all  [kernel.kallsyms]      [k] up_read
> >      2.54%  multi-fault-all  [kernel.kallsyms]      [k] __mem_cgroup_commit_charge
> >      2.29%  multi-fault-all  [kernel.kallsyms]      [k] _cond_resched
> >      2.04%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
> >      1.82%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
> > 
> > Hmm. seems nice. But I don't convince my patch has no race.
> > I'll continue test but your help is welcome.
> >
> 
> Looks nice, Kamezawa-San could you please confirm the source of
> raw_spin_lock_irqsave and trylock from /proc/lock_stat?
>  
Sure. But the above result was taken with lockdep etc. turned off
(it increases lock overhead).

But yes, the new _raw_spin_lock seems strange.


> > ==
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > 
> > Now, memory cgroup increments css(cgroup subsys state)'s reference
> > count per a charged page. And the reference count is kept until
> > the page is uncharged. But this has 2 bad effect. 
> > 
> >  1. Because css_get/put calls atoimic_inc()/dec, heavy call of them
> >     on large smp will not scale well.
> >  2. Because css's refcnt cannot be in a state as "ready-to-release",
> >     cgroup's notify_on_release handler can't work with memcg.
> > 
> > This is a trial to remove css's refcnt per a page. Even if we remove
>                                              ^^ (per page)
> > refcnt, pre_destroy() does enough synchronization.
> 
> Could you also document what the rules for css_get/put now become? I
> like the idea, but I am not sure if I understand the new rules
> correctly by looking at the code.

Hm. I'll try... but this just removes css_get/put per page.

Thanks,
-Kame


* Re: [RFC][PATCH] memcg remove css_get/put per pages
  2010-06-08  7:31 ` Daisuke Nishimura
@ 2010-06-09  0:54   ` KAMEZAWA Hiroyuki
  2010-06-09  2:05     ` Daisuke Nishimura
  0 siblings, 1 reply; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-06-09  0:54 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: linux-mm, balbir

On Tue, 8 Jun 2010 16:31:29 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> On Tue, 8 Jun 2010 12:19:01 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > Now, I think pre_destroy->force_empty() works very well and we can get rid of
> > css_put/get per pages. This has very big effect in some special case.
> > 
> > This is a test result with a multi-thread page fault program
> > (I used at rwsem discussion.)
> > 
> > [Before patch]
> >    25.72%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
> >      8.18%  multi-fault-all  [kernel.kallsyms]      [k] try_get_mem_cgroup_from_mm
> >      8.17%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
> >      8.03%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
> >      5.46%  multi-fault-all  [kernel.kallsyms]      [k] __css_put
> >      5.45%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
> >      4.36%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
> >      4.35%  multi-fault-all  [kernel.kallsyms]      [k] up_read
> >      3.59%  multi-fault-all  [kernel.kallsyms]      [k] css_put
> >      2.37%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock
> >      1.80%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
> >      1.78%  multi-fault-all  [kernel.kallsyms]      [k] __rmqueue
> >      1.65%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
> > 
> > try_get_mem_cgroup_from_mm() is a one of heavy ops because of false-sharing in
> > css's counter for css_get/put.
> > 
> I'm sorry, what do you mean by "false-sharing" ?

Cacheline ping-pong among CPUs around the css's counter.

> And I think it would be better to add these performance data to commit log.
> 
Yes, I'll move these into the commit log when I remove the RFC tag.

> > I removed that.
> > 
> > [After]
> >    26.16%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
> >     11.73%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock
> >      9.23%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
> >      9.07%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
> >      6.09%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
> >      5.57%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
> >      4.86%  multi-fault-all  [kernel.kallsyms]      [k] up_read
> >      2.54%  multi-fault-all  [kernel.kallsyms]      [k] __mem_cgroup_commit_charge
> >      2.29%  multi-fault-all  [kernel.kallsyms]      [k] _cond_resched
> >      2.04%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
> >      1.82%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
> > 
> > Hmm. seems nice. But I don't convince my patch has no race.
> > I'll continue test but your help is welcome.
> > 
> > ==
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > 
> > Now, memory cgroup increments css(cgroup subsys state)'s reference
> > count per a charged page. And the reference count is kept until
> > the page is uncharged. But this has 2 bad effect. 
> > 
> >  1. Because css_get/put calls atoimic_inc()/dec, heavy call of them
> >     on large smp will not scale well.
> I'm sorry if I'm asking a stupid question, the number of css_get/put
> would be:
> 
> 	before:
> 		get:1 in charge
> 		put:1 in uncharge
> 	after:
> 		get:1, put:1 in charge
> 		no get/put in uncharge
> 
> right ?

No.

	before: get 1 at charge,
		put 1 at uncharge.

	after:
		no get at charge in the fast path (consume_stock() hits),
		get 1 when accessing the res_counter and reclaiming, put 1 after it,
		no get/put at uncharge.

> Then, isn't there any change as a whole ?
> 
We get much benefit when consume_stock() works. 
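
Conceptually, the per-cpu stock fast path looks like this (simplified sketch;
see consume_stock()/refill_stock() in mm/memcontrol.c for the real code):

	/* per-cpu cache of pre-charged bytes for one memcg */
	struct memcg_stock_pcp {
		struct mem_cgroup *cached;
		int charge;	/* bytes already charged to ->cached */
	};
	static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);

	static bool consume_stock(struct mem_cgroup *mem)
	{
		struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
		bool ret = true;

		if (mem == stock->cached && stock->charge)
			stock->charge -= PAGE_SIZE;	/* no res_counter, no refcnt */
		else
			ret = false;
		put_cpu_var(memcg_stock);
		return ret;
	}

As long as the faulting task stays in the memcg whose charge is cached on the
cpu, neither the res_counter nor the css refcount is touched.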

> >  2. Because css's refcnt cannot be in a state as "ready-to-release",
> >     cgroup's notify_on_release handler can't work with memcg.
> > 
> Yes, 2 is one of weak point of memcg, IMHO.
> 
> > This is a trial to remove css's refcnt per a page. Even if we remove
> > refcnt, pre_destroy() does enough synchronization.
> > 
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > ---
> >  mm/memcontrol.c |   66 +++++++++++++++++++++++++++++++++++++++-----------------
> >  1 file changed, 46 insertions(+), 20 deletions(-)
> > 
> > Index: mmotm-2.6.34-Jun6/mm/memcontrol.c
> > ===================================================================
> > --- mmotm-2.6.34-Jun6.orig/mm/memcontrol.c
> > +++ mmotm-2.6.34-Jun6/mm/memcontrol.c
> > @@ -1717,25 +1717,49 @@ static int __mem_cgroup_try_charge(struc
> >  	 * thread group leader migrates. It's possible that mm is not
> >  	 * set, if so charge the init_mm (happens for pagecache usage).
> >  	 */
> > -	if (*memcg) {
> > +	if (!*memcg && !mm)
> > +		goto bypass;
> Shouldn't it be VM_BUG_ON(!*memcg && !mm) ?
> 
IIUC, we were afraid of a thread with no mm coming here... at boot, etc.
I don't want to change that minor behavior in this set.


> > +again:
> > +	if (*memcg) { /* css should be a valid one */
> >  		mem = *memcg;
> > +		VM_BUG_ON(css_is_removed(mem));
> > +		if (mem_cgroup_is_root(mem))
> > +			goto done;
> > +		if (consume_stock(mem))
> > +			goto done;
> >  		css_get(&mem->css);
> >  	} else {
> > -		mem = try_get_mem_cgroup_from_mm(mm);
> > -		if (unlikely(!mem))
> > -			return 0;
> > -		*memcg = mem;
> > -	}
> > +		struct task_struct *p;
> >  
> > -	VM_BUG_ON(css_is_removed(&mem->css));
> > -	if (mem_cgroup_is_root(mem))
> > -		goto done;
> > +		rcu_read_lock();
> > +		p = rcu_dereference(mm->owner);
> > +		VM_BUG_ON(!p);
> > +		/*
> > + 		 * while task_lock, this task cannot be disconnected with
> > + 		 * the cgroup we see.
> > + 		 */
> > +		task_lock(p);
> > +		mem = mem_cgroup_from_task(p);
> > +		VM_BUG_ON(!mem);
> > +		if (mem_cgroup_is_root(mem)) {
> Shoudn't we do "*memcg = mem" here ?
> hmm, how about doing:
> 
> 	done:
> 		*memcg = mem;
> 		return 0;
> 
> instead of doing "*memcg = mem" in some places ?
> 
Ok, I'll consider that.



> > +			task_unlock(p);
> > +			rcu_read_unlock();
> > +			goto done;
> > +		}
> > +		if (consume_stock(mem)) {
> > +			*memcg = mem;
> > +			task_unlock(p);
> > +			rcu_read_unlock();
> > +			goto done;
> > +		}
> > +		css_get(&mem->css);
> > +		task_unlock(p);
> > +		rcu_read_unlock();
> > +	}
> >  
> >  	do {
> >  		bool oom_check;
> >  
> > -		if (consume_stock(mem))
> > -			goto done; /* don't need to fill stock */
> >  		/* If killed, bypass charge */
> >  		if (fatal_signal_pending(current))
> >  			goto bypass;
> > @@ -1750,10 +1774,13 @@ static int __mem_cgroup_try_charge(struc
> >  
> >  		switch (ret) {
> >  		case CHARGE_OK:
> > +			*memcg = mem;
> >  			break;
> >  		case CHARGE_RETRY: /* not in OOM situation but retry */
> >  			csize = PAGE_SIZE;
> > -			break;
> > +			css_put(&mem->css);
> > +			mem = NULL;
> > +			goto again;
> >  		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
> >  			goto nomem;
> >  		case CHARGE_NOMEM: /* OOM routine works */
> > @@ -1769,6 +1796,7 @@ static int __mem_cgroup_try_charge(struc
> >  
> >  	if (csize > PAGE_SIZE)
> >  		refill_stock(mem, csize - PAGE_SIZE);
> > +	css_put(&mem->css);
> >  done:
> >  	return 0;
> >  nomem:
> > @@ -1795,7 +1823,6 @@ static void __mem_cgroup_cancel_charge(s
> >  			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
> >  		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
> >  		WARN_ON_ONCE(count > INT_MAX);
> > -		__css_put(&mem->css, (int)count);
> >  	}
> >  	/* we don't need css_put for root */
> >  }
> These VM_BUG_ON() and WARN_ON_ONCE() will be unnecessary, too.
> 
ok.


> > @@ -2158,7 +2185,6 @@ int mem_cgroup_try_charge_swapin(struct 
> >  		goto charge_cur_mm;
> >  	*ptr = mem;
> >  	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
> > -	/* drop extra refcnt from tryget */
> >  	css_put(&mem->css);
> >  	return ret;
> >  charge_cur_mm:
> > @@ -2345,9 +2371,6 @@ __mem_cgroup_uncharge_common(struct page
> >  	unlock_page_cgroup(pc);
> >  
> >  	memcg_check_events(mem, page);
> > -	/* at swapout, this memcg will be accessed to record to swap */
> > -	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
> > -		css_put(&mem->css);
> >  
> >  	return mem;
> >  
> > @@ -2432,14 +2455,18 @@ mem_cgroup_uncharge_swapcache(struct pag
> >  	if (!swapout) /* this was a swap cache but the swap is unused ! */
> >  		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
> >  
> > -	memcg = __mem_cgroup_uncharge_common(page, ctype);
> > +	memcg = try_get_mem_cgroup_from_page(page);
> > +	if (!memcg)
> > +		return;
> > +
> > +	__mem_cgroup_uncharge_common(page, ctype);
> >  
> >  	/* record memcg information */
> >  	if (do_swap_account && swapout && memcg) {
> >  		swap_cgroup_record(ent, css_id(&memcg->css));
> >  		mem_cgroup_get(memcg);
> >  	}
> > -	if (swapout && memcg)
> > +	if (memcg)
> >  		css_put(&memcg->css);
> >  }
> >  #endif
> "if (memcg)" is unnecessary(it's checked above).
> 
Sure.



> > @@ -4219,7 +4246,6 @@ static int mem_cgroup_do_precharge(unsig
> >  		mc.precharge += count;
> >  		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
> >  		WARN_ON_ONCE(count > INT_MAX);
> > -		__css_get(&mem->css, (int)count);
> >  		return ret;
> >  	}
> >  one_by_one:
> > 
> ditto.
> 
ok.

> IIUC this patch, we should remove css_put() in mem_cgroup_move_swap_account()
> and __css_put() in mem_cgroup_clear_mc() too, and modify some comments.
> Anyway, we must test these changes carefully.
> 
will do in the next version.

Thanks,
-Kame




* Re: [RFC][PATCH] memcg remove css_get/put per pages
  2010-06-09  0:54   ` KAMEZAWA Hiroyuki
@ 2010-06-09  2:05     ` Daisuke Nishimura
  0 siblings, 0 replies; 15+ messages in thread
From: Daisuke Nishimura @ 2010-06-09  2:05 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, Daisuke Nishimura

On Wed, 9 Jun 2010 09:54:48 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Tue, 8 Jun 2010 16:31:29 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > On Tue, 8 Jun 2010 12:19:01 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
(snip)
> > >  1. Because css_get/put calls atoimic_inc()/dec, heavy call of them
> > >     on large smp will not scale well.
> > I'm sorry if I'm asking a stupid question, the number of css_get/put
> > would be:
> > 
> > 	before:
> > 		get:1 in charge
> > 		put:1 in uncharge
> > 	after:
> > 		get:1, put:1 in charge
> > 		no get/put in uncharge
> > 
> > right ?
> 
> No.
> 
> 	before: get 1 in charge.
> 		put 1 at charge
> 
> 	after:
> 		no get at charge in fast path (cunsume_stcok hits.)
> 		get 1 at accssing res_counter and reclaim, put 1 after it.
> 		no get/put in uncharge.
> 
> > Then, isn't there any change as a whole ?
> > 
> We get much benefit when consume_stock() works. 
> 
Ah, I missed consume_stock(). The number of get/put calls would be decreased very much.
Thank you for your explanation.


Thanks,
Daisuke Nishimura.


* Re: [RFC][PATCH] memcg remove css_get/put per pages
  2010-06-09  0:47   ` KAMEZAWA Hiroyuki
@ 2010-06-09  5:14     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-06-09  5:14 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: balbir, linux-mm, nishimura

On Wed, 9 Jun 2010 09:47:34 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> > Looks nice, Kamezawa-San could you please confirm the source of
> > raw_spin_lock_irqsave and trylock from /proc/lock_stat?
> >  
> Sure. But above result can be got when lockdep etc..are off.
> (it increase lock overhead)
> 
> But yes, new _raw_spin_lock seems strange.
> 
Here.
==
         ------------------------------
          &(&mm->page_table_lock)->rlock       20812995          [<ffffffff81124019>] handle_mm_fault+0x7a9/0x9b0
          &(&mm->page_table_lock)->rlock              9          [<ffffffff81120c5b>] __pte_alloc+0x4b/0xf0
          &(&mm->page_table_lock)->rlock              4          [<ffffffff8112c70d>] anon_vma_prepare+0xad/0x180
          &(&mm->page_table_lock)->rlock          83395          [<ffffffff811204b4>] unmap_vmas+0x3c4/0xa60
          ------------------------------
          &(&mm->page_table_lock)->rlock              7          [<ffffffff81120c5b>] __pte_alloc+0x4b/0xf0
          &(&mm->page_table_lock)->rlock       20812987          [<ffffffff81124019>] handle_mm_fault+0x7a9/0x9b0
          &(&mm->page_table_lock)->rlock              2          [<ffffffff8112c70d>] anon_vma_prepare+0xad/0x180
          &(&mm->page_table_lock)->rlock          83408          [<ffffffff811204b4>] unmap_vmas+0x3c4/0xa60

                &(&p->alloc_lock)->rlock:       6304532        6308276           0.14        1772.97     7098177.74       23165904       23222238           0.00        1980.76    12445023.62
                ------------------------
                &(&p->alloc_lock)->rlock        6308277          [<ffffffff81153e17>] __mem_cgroup_try_charge+0x327/0x590
                ------------------------
                &(&p->alloc_lock)->rlock        6308277          [<ffffffff81153e17>] __mem_cgroup_try_charge+0x327/0x590



==

So the new raw_spin_lock is task_lock(). This is because task_lock(mm->owner)
causes cacheline ping-pong ;( (task_lock() is just spin_lock(&p->alloc_lock),
and every thread of the program shares the same mm->owner, so all CPUs hammer
the same lock cache line.)

So, this is not a very good patch for multi-threaded programs, sigh...
I'll consider how I can get safe access without locks again.

Thanks,
-Kame


* [RFC][PATCH] memcg remove css_get/put per pages v2
  2010-06-08  3:19 [RFC][PATCH] memcg remove css_get/put per pages KAMEZAWA Hiroyuki
  2010-06-08  5:40 ` Balbir Singh
  2010-06-08  7:31 ` Daisuke Nishimura
@ 2010-06-09  6:59 ` KAMEZAWA Hiroyuki
  2010-06-10  2:34   ` Daisuke Nishimura
                     ` (2 more replies)
  2 siblings, 3 replies; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-06-09  6:59 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, nishimura, linux-kernel

Still RFC, added lkml to CC: list.
==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Now, memory cgroup increments the css (cgroup subsys state) reference
count per charged page, and the reference count is kept until
the page is uncharged. But this has two bad effects.

 1. Because css_get/put calls atomic_inc()/dec(), heavy use of them
    on a large SMP system will not scale well.
 2. Because the css's refcnt can never reach a "ready-to-release" state,
    cgroup's notify_on_release handler can't work with memcg.

This is a trial to remove the css refcnt per page. Even if we remove the
refcnt, pre_destroy() does enough synchronization.

After this patch, it seems css_get() is still called in try_charge().
But the logic is:

  1. task_lock(mm->owner)
  2. If the memcg of mm->owner is the cached one, consume_stock() will work.
     If it returns true, task_unlock() and return immediately.
  3. If the memcg doesn't hit the cached one, css_get(), access the res_counter
     and do the charge.

So, in the fast path, we don't call css_get() and can avoid accessing the
shared counter.
The point is that while there is resource in use, the memcg can't be freed.
So, if it's cached, we don't have to take css_get().
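
In code terms, the new fast path is roughly the following (condensed from the
try_charge hunk below; error handling and the *memcg bookkeeping are omitted):

	rcu_read_lock();
	p = rcu_dereference(mm->owner);
	mem = mem_cgroup_from_task(p);
	if (mem_cgroup_is_root(mem) || consume_stock(mem)) {
		rcu_read_unlock();
		goto done;		/* no css_get(), no res_counter access */
	}
	if (!css_tryget(&mem->css)) {	/* slow path: pin the memcg first */
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();
	/* ... res_counter charge / reclaim, then css_put(&mem->css) ... */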

Brief test result on mmotm-2.6.32-Jul-06 + Nishimura-san's cleanup.
Tested with a multi-threaded page fault program on an 8-cpu system.
(Each thread touches 2M of pages and releases them with madvise().)

[Before]

           39522457  page-faults
          549387860  cache-misses   (cache-miss/page-fault 13.90)

       60.003715887  seconds time elapsed

    25.75%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
     8.59%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
     8.22%  multi-fault-all  [kernel.kallsyms]      [k] try_get_mem_cgroup_from_mm
     8.01%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
     5.41%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
     5.41%  multi-fault-all  [kernel.kallsyms]      [k] __css_put
     4.48%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
     4.41%  multi-fault-all  [kernel.kallsyms]      [k] up_read
     3.60%  multi-fault-all  [kernel.kallsyms]      [k] css_put
     1.93%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
     1.91%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock
     1.89%  multi-fault-all  [kernel.kallsyms]      [k] _cond_resched
     1.78%  multi-fault-all  [kernel.kallsyms]      [k] __rmqueue
     1.69%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
     1.63%  multi-fault-all  [kernel.kallsyms]      [k] page_fault
     1.45%  multi-fault-all  [kernel.kallsyms]      [k] __mem_cgroup_commit_charge
     1.34%  multi-fault-all  [kernel.kallsyms]      [k] find_vma

[After]

           43253862  page-faults
          505750203  cache-misses   (cache-miss/page-fault 11.69)

       60.004123555  seconds time elapsed

    27.98%  multi-fault-all  [kernel.kallsyms]      [k] clear_page_c
     9.89%  multi-fault-all  [kernel.kallsyms]      [k] down_read_trylock
     9.88%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irq
     9.37%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock_irqsave
     5.91%  multi-fault-all  [kernel.kallsyms]      [k] __alloc_pages_nodemask
     5.69%  multi-fault-all  [kernel.kallsyms]      [k] up_read
     2.94%  multi-fault-all  [kernel.kallsyms]      [k] _raw_spin_lock
     2.70%  multi-fault-all  [kernel.kallsyms]      [k] handle_mm_fault
     2.66%  multi-fault-all  [kernel.kallsyms]      [k] mem_cgroup_add_lru_list
     2.25%  multi-fault-all  [kernel.kallsyms]      [k] __mem_cgroup_commit_charge
     1.90%  multi-fault-all  [kernel.kallsyms]      [k] __rmqueue
     1.74%  multi-fault-all  [kernel.kallsyms]      [k] page_fault
     1.52%  multi-fault-all  [kernel.kallsyms]      [k] find_vma

So the overhead is reduced.

Note: because this is an extreme, unrealistic test, I'm not sure how much
this change will benefit usual applications.
 

Changelog 20100609:
 - clean up try_charge().
 - fixed mem_cgroup_clear_mc
 - removed unnecessary warnings.
 - removed task_lock, added more comments.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 mm/memcontrol.c |   96 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 63 insertions(+), 33 deletions(-)

Index: mmotm-2.6.34-Jun6/mm/memcontrol.c
===================================================================
--- mmotm-2.6.34-Jun6.orig/mm/memcontrol.c
+++ mmotm-2.6.34-Jun6/mm/memcontrol.c
@@ -1717,28 +1717,61 @@ static int __mem_cgroup_try_charge(struc
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	if (*memcg) {
+	if (!*memcg && !mm)
+		goto bypass;
+again:
+	if (*memcg) { /* css should be a valid one */
 		mem = *memcg;
+		VM_BUG_ON(css_is_removed(&mem->css));
+		if (mem_cgroup_is_root(mem))
+			goto done;
+		if (consume_stock(mem))
+			goto done;
 		css_get(&mem->css);
 	} else {
-		mem = try_get_mem_cgroup_from_mm(mm);
-		if (unlikely(!mem))
-			return 0;
-		*memcg = mem;
-	}
+		struct task_struct *p;
 
-	VM_BUG_ON(css_is_removed(&mem->css));
-	if (mem_cgroup_is_root(mem))
-		goto done;
+		rcu_read_lock();
+		p = rcu_dereference(mm->owner);
+		VM_BUG_ON(!p);
+		/*
+		 * It seems there are some races when 'p' exits. But, at exit(),
+		 * p->cgroups will be reset to init_css_set. So, we'll just
+		 * find the root cgroup even if 'p' gets to be obsolete.
+ 		 */
+		mem = mem_cgroup_from_task(p);
+		VM_BUG_ON(!mem);
+		if (mem_cgroup_is_root(mem)) {
+			rcu_read_unlock();
+			goto done;
+		}
+		if (consume_stock(mem)) {
+			/*
+			 * It seems dagerous to access memcg without css_get().
+			 * It seems dangerous to access memcg without css_get().
+			 * But considering how consume_stock works, it's not
+			 * necessary. If consume_stock succeeds, some charges
+			 * don't need to call css_get()/css_tryget() before
+			 * calling consume_stock().
+			 */
+			rcu_read_unlock();
+			goto done;
+		}
+		if (!css_tryget(&mem->css)) {
+			rcu_read_unlock();
+			goto again;
+		}
+		rcu_read_unlock();
+	}
 
 	do {
 		bool oom_check;
 
-		if (consume_stock(mem))
-			goto done; /* don't need to fill stock */
 		/* If killed, bypass charge */
-		if (fatal_signal_pending(current))
+		if (fatal_signal_pending(current)) {
+			css_put(&mem->css);
 			goto bypass;
+		}
 
 		oom_check = false;
 		if (oom && !nr_oom_retries) {
@@ -1753,30 +1786,36 @@ static int __mem_cgroup_try_charge(struc
 			break;
 		case CHARGE_RETRY: /* not in OOM situation but retry */
 			csize = PAGE_SIZE;
-			break;
+			css_put(&mem->css);
+			mem = NULL;
+			goto again;
 		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+			css_put(&mem->css);
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom)
+			if (!oom) {
+				css_put(&mem->css);
 				goto nomem;
+			}
 			/* If oom, we never return -ENOMEM */
 			nr_oom_retries--;
 			break;
 		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
+			css_put(&mem->css);
 			goto bypass;
 		}
 	} while (ret != CHARGE_OK);
 
 	if (csize > PAGE_SIZE)
 		refill_stock(mem, csize - PAGE_SIZE);
+	css_put(&mem->css);
 done:
+	*memcg = mem;
 	return 0;
 nomem:
-	css_put(&mem->css);
+	*memcg = NULL;
 	return -ENOMEM;
 bypass:
-	if (mem)
-		css_put(&mem->css);
 	*memcg = NULL;
 	return 0;
 }
@@ -1793,11 +1832,7 @@ static void __mem_cgroup_cancel_charge(s
 		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
 		if (do_swap_account)
 			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_put(&mem->css, (int)count);
 	}
-	/* we don't need css_put for root */
 }
 
 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -2158,7 +2193,6 @@ int mem_cgroup_try_charge_swapin(struct 
 		goto charge_cur_mm;
 	*ptr = mem;
 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
-	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
@@ -2345,9 +2379,6 @@ __mem_cgroup_uncharge_common(struct page
 	unlock_page_cgroup(pc);
 
 	memcg_check_events(mem, page);
-	/* at swapout, this memcg will be accessed to record to swap */
-	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		css_put(&mem->css);
 
 	return mem;
 
@@ -2432,15 +2463,18 @@ mem_cgroup_uncharge_swapcache(struct pag
 	if (!swapout) /* this was a swap cache but the swap is unused ! */
 		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
 
-	memcg = __mem_cgroup_uncharge_common(page, ctype);
+	memcg = try_get_mem_cgroup_from_page(page);
+	if (!memcg)
+		return;
+
+	__mem_cgroup_uncharge_common(page, ctype);
 
 	/* record memcg information */
-	if (do_swap_account && swapout && memcg) {
+	if (do_swap_account && swapout) {
 		swap_cgroup_record(ent, css_id(&memcg->css));
 		mem_cgroup_get(memcg);
 	}
-	if (swapout && memcg)
-		css_put(&memcg->css);
+	css_put(&memcg->css);
 }
 #endif
 
@@ -2518,7 +2552,6 @@ static int mem_cgroup_move_swap_account(
 			 */
 			if (!mem_cgroup_is_root(to))
 				res_counter_uncharge(&to->res, PAGE_SIZE);
-			css_put(&to->css);
 		}
 		return 0;
 	}
@@ -4219,7 +4252,6 @@ static int mem_cgroup_do_precharge(unsig
 		mc.precharge += count;
 		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
 		WARN_ON_ONCE(count > INT_MAX);
-		__css_get(&mem->css, (int)count);
 		return ret;
 	}
 one_by_one:
@@ -4469,8 +4501,6 @@ static void mem_cgroup_clear_mc(void)
 			 */
 			res_counter_uncharge(&mc.to->res,
 						PAGE_SIZE * mc.moved_swap);
-			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
-			__css_put(&mc.to->css, mc.moved_swap);
 		}
 		/* we've already done mem_cgroup_get(mc.to) */
 


* Re: [RFC][PATCH] memcg remove css_get/put per pages v2
  2010-06-09  6:59 ` [RFC][PATCH] memcg remove css_get/put per pages v2 KAMEZAWA Hiroyuki
@ 2010-06-10  2:34   ` Daisuke Nishimura
  2010-06-10  2:49     ` KAMEZAWA Hiroyuki
  2010-06-11  4:37   ` Daisuke Nishimura
  2010-06-11  6:11   ` Balbir Singh
  2 siblings, 1 reply; 15+ messages in thread
From: Daisuke Nishimura @ 2010-06-10  2:34 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, linux-kernel, Daisuke Nishimura

I can't find any trivial bugs from my review at the moment.
I'll do some tests.

Some minor comments.

On Wed, 9 Jun 2010 15:59:40 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> Still RFC, added lkml to CC: list.
> ==
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> Now, memory cgroup increments css(cgroup subsys state)'s reference
> count per a charged page. And the reference count is kept until
> the page is uncharged. But this has 2 bad effect. 
> 
>  1. Because css_get/put calls atoimic_inc()/dec, heavy call of them
>     on large smp will not scale well.
>  2. Because css's refcnt cannot be in a state as "ready-to-release",
>     cgroup's notify_on_release handler can't work with memcg.
> 
> This is a trial to remove css's refcnt per a page. Even if we remove
> refcnt, pre_destroy() does enough synchronization.
> 
> After this patch, it seems css_get() is still called in try_charge().
> But the logic is.
> 
>   1. task_lock(mm->owner)
There is no task_lock() in this version :)

(snip)
> @@ -4219,7 +4252,6 @@ static int mem_cgroup_do_precharge(unsig
>  		mc.precharge += count;
>  		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
>  		WARN_ON_ONCE(count > INT_MAX);
> -		__css_get(&mem->css, (int)count);
>  		return ret;
>  	}
>  one_by_one:
You can remove VM_BUG_ON() and WARN_ON_ONCE() here, too.

> @@ -4469,8 +4501,6 @@ static void mem_cgroup_clear_mc(void)
>  			 */
>  			res_counter_uncharge(&mc.to->res,
>  						PAGE_SIZE * mc.moved_swap);
> -			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
> -			__css_put(&mc.to->css, mc.moved_swap);
>  		}
>  		/* we've already done mem_cgroup_get(mc.to) */
>  
> 
And, you can remove "WARN_ON_ONCE(mc.moved_swap > INT_MAX)" at the beginning
of this block, too.


Thanks,
Daisuke Nishimura.


* Re: [RFC][PATCH] memcg remove css_get/put per pages v2
  2010-06-10  2:34   ` Daisuke Nishimura
@ 2010-06-10  2:49     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-06-10  2:49 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: linux-mm, balbir, linux-kernel

On Thu, 10 Jun 2010 11:34:24 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> I can't find any trivial bugs from my review at the moment.
> I'll do some tests.
> 

Thank you for review.

> Some minor commens.
> 
> On Wed, 9 Jun 2010 15:59:40 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > Still RFC, added lkml to CC: list.
> > ==
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > 
> > Now, memory cgroup increments css(cgroup subsys state)'s reference
> > count per a charged page. And the reference count is kept until
> > the page is uncharged. But this has 2 bad effect. 
> > 
> >  1. Because css_get/put calls atoimic_inc()/dec, heavy call of them
> >     on large smp will not scale well.
> >  2. Because css's refcnt cannot be in a state as "ready-to-release",
> >     cgroup's notify_on_release handler can't work with memcg.
> > 
> > This is a trial to remove css's refcnt per a page. Even if we remove
> > refcnt, pre_destroy() does enough synchronization.
> > 
> > After this patch, it seems css_get() is still called in try_charge().
> > But the logic is.
> > 
> >   1. task_lock(mm->owner)
> There is no task_lock() in this version :)
> 
yes.


> (snip)
> > @@ -4219,7 +4252,6 @@ static int mem_cgroup_do_precharge(unsig
> >  		mc.precharge += count;
> >  		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
> >  		WARN_ON_ONCE(count > INT_MAX);
> > -		__css_get(&mem->css, (int)count);
> >  		return ret;
> >  	}
> >  one_by_one:
> You can remove VM_BUG_ON() and WARN_ON_ONCE() here, too.
> 
ok.


> > @@ -4469,8 +4501,6 @@ static void mem_cgroup_clear_mc(void)
> >  			 */
> >  			res_counter_uncharge(&mc.to->res,
> >  						PAGE_SIZE * mc.moved_swap);
> > -			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
> > -			__css_put(&mc.to->css, mc.moved_swap);
> >  		}
> >  		/* we've already done mem_cgroup_get(mc.to) */
> >  
> > 
> And, you can remove "WARN_ON_ONCE(mc.moved_swap > INT_MAX)" at the beginning
> of this block, too.
> 
Hmm, ok.
Ah... then, this patch also removes the limitation coming from the "css's refcnt
is an int" problem (the WARN_ON_ONCE(count > INT_MAX) checks). Sounds nice.

Thanks,
-Kame


* Re: [RFC][PATCH] memcg remove css_get/put per pages v2
  2010-06-09  6:59 ` [RFC][PATCH] memcg remove css_get/put per pages v2 KAMEZAWA Hiroyuki
  2010-06-10  2:34   ` Daisuke Nishimura
@ 2010-06-11  4:37   ` Daisuke Nishimura
  2010-06-11  4:52     ` KAMEZAWA Hiroyuki
  2010-06-11  6:11   ` Balbir Singh
  2 siblings, 1 reply; 15+ messages in thread
From: Daisuke Nishimura @ 2010-06-11  4:37 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, linux-kernel, Daisuke Nishimura

> @@ -2432,15 +2463,18 @@ mem_cgroup_uncharge_swapcache(struct pag
>  	if (!swapout) /* this was a swap cache but the swap is unused ! */
>  		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
>  
> -	memcg = __mem_cgroup_uncharge_common(page, ctype);
> +	memcg = try_get_mem_cgroup_from_page(page);
> +	if (!memcg)
> +		return;
> +
> +	__mem_cgroup_uncharge_common(page, ctype);
>  
>  	/* record memcg information */
> -	if (do_swap_account && swapout && memcg) {
> +	if (do_swap_account && swapout) {
>  		swap_cgroup_record(ent, css_id(&memcg->css));
>  		mem_cgroup_get(memcg);
>  	}
> -	if (swapout && memcg)
> -		css_put(&memcg->css);
> +	css_put(&memcg->css);
>  }
>  #endif
>  
hmm, this change seems to cause a problem.
I can see underflow of mem->memsw and of the "swap" field in memory.stat.

I think doing swap_cgroup_record() against a mem_cgroup that was not returned
by __mem_cgroup_uncharge_common() is bad behavior.

How about doing it like this? We can safely access the mem_cgroup while it
still holds memory.usage, i.e. before we call res_counter_uncharge().
After this change, it seems to work well.
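
To make the intended ordering explicit, the net effect of the two hunks below
at the tail of __mem_cgroup_uncharge_common() is roughly this (a simplified
sketch, not the exact code):

	memcg_check_events(mem, page);
	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
		/* take our own reference while the page's charge still
		 * pins memory.usage, so the memcg cannot be torn down yet */
		mem_cgroup_swap_statistics(mem, true);
		mem_cgroup_get(mem);
	}
	/* only now give the charge back; after this point a concurrent
	 * rmdir/pre_destroy may free the memcg, so "mem" is not touched
	 * again in this function */
	if (!mem_cgroup_is_root(mem))
		__do_uncharge(mem, ctype);

	return mem;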

---
 mm/memcontrol.c |   22 +++++++++-------------
 1 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6e7c1c9..2fae26f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2362,10 +2362,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem))
-		__do_uncharge(mem, ctype);
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
@@ -2379,6 +2375,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	unlock_page_cgroup(pc);
 
 	memcg_check_events(mem, page);
+	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+		mem_cgroup_swap_statistics(mem, true);
+		mem_cgroup_get(mem);
+	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 
 	return mem;
 
@@ -2463,18 +2465,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 	if (!swapout) /* this was a swap cache but the swap is unused ! */
 		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
 
-	memcg = try_get_mem_cgroup_from_page(page);
-	if (!memcg)
-		return;
-
-	__mem_cgroup_uncharge_common(page, ctype);
+	memcg = __mem_cgroup_uncharge_common(page, ctype);
 
 	/* record memcg information */
-	if (do_swap_account && swapout) {
+	if (do_swap_account && swapout && memcg)
+		/* We've already done mem_cgroup_get() in uncharge_common(). */
 		swap_cgroup_record(ent, css_id(&memcg->css));
-		mem_cgroup_get(memcg);
-	}
-	css_put(&memcg->css);
 }
 #endif
 


* Re: [RFC][PATCH] memcg remove css_get/put per pages v2
  2010-06-11  4:37   ` Daisuke Nishimura
@ 2010-06-11  4:52     ` KAMEZAWA Hiroyuki
  2010-06-11  4:59       ` Daisuke Nishimura
  0 siblings, 1 reply; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-06-11  4:52 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: linux-mm, balbir, linux-kernel

On Fri, 11 Jun 2010 13:37:44 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> > @@ -2432,15 +2463,18 @@ mem_cgroup_uncharge_swapcache(struct pag
> >  	if (!swapout) /* this was a swap cache but the swap is unused ! */
> >  		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
> >  
> > -	memcg = __mem_cgroup_uncharge_common(page, ctype);
> > +	memcg = try_get_mem_cgroup_from_page(page);
> > +	if (!memcg)
> > +		return;
> > +
> > +	__mem_cgroup_uncharge_common(page, ctype);
> >  
> >  	/* record memcg information */
> > -	if (do_swap_account && swapout && memcg) {
> > +	if (do_swap_account && swapout) {
> >  		swap_cgroup_record(ent, css_id(&memcg->css));
> >  		mem_cgroup_get(memcg);
> >  	}
> > -	if (swapout && memcg)
> > -		css_put(&memcg->css);
> > +	css_put(&memcg->css);
> >  }
> >  #endif
> >  
> hmm, this change seems to cause a problem.
> I can see underflow of mem->memsw and of the "swap" field in memory.stat.
> 
> I think doing swap_cgroup_record() against a mem_cgroup that was not returned
> by __mem_cgroup_uncharge_common() is bad behavior.
> 
> How about doing it like this? We can safely access the mem_cgroup while it
> still holds memory.usage, i.e. before we call res_counter_uncharge().
> After this change, it seems to work well.
> 

Thank you! It seems to work. I'll merge your change.
Can I add your Signed-off-by?

Thanks,
-Kame

> ---
>  mm/memcontrol.c |   22 +++++++++-------------
>  1 files changed, 9 insertions(+), 13 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 6e7c1c9..2fae26f 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2362,10 +2362,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  		break;
>  	}
>  
> -	if (!mem_cgroup_is_root(mem))
> -		__do_uncharge(mem, ctype);
> -	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
> -		mem_cgroup_swap_statistics(mem, true);
>  	mem_cgroup_charge_statistics(mem, pc, false);
>  
>  	ClearPageCgroupUsed(pc);
> @@ -2379,6 +2375,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  	unlock_page_cgroup(pc);
>  
>  	memcg_check_events(mem, page);
> +	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
> +		mem_cgroup_swap_statistics(mem, true);
> +		mem_cgroup_get(mem);
> +	}
> +	if (!mem_cgroup_is_root(mem))
> +		__do_uncharge(mem, ctype);
>  
>  	return mem;
>  
> @@ -2463,18 +2465,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
>  	if (!swapout) /* this was a swap cache but the swap is unused ! */
>  		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
>  
> -	memcg = try_get_mem_cgroup_from_page(page);
> -	if (!memcg)
> -		return;
> -
> -	__mem_cgroup_uncharge_common(page, ctype);
> +	memcg = __mem_cgroup_uncharge_common(page, ctype);
>  
>  	/* record memcg information */
> -	if (do_swap_account && swapout) {
> +	if (do_swap_account && swapout && memcg)
> +		/* We've already done mem_cgroup_get() in uncharge_common(). */
>  		swap_cgroup_record(ent, css_id(&memcg->css));
> -		mem_cgroup_get(memcg);
> -	}
> -	css_put(&memcg->css);
>  }
>  #endif
>  


* Re: [RFC][PATCH] memcg remove css_get/put per pages v2
  2010-06-11  4:52     ` KAMEZAWA Hiroyuki
@ 2010-06-11  4:59       ` Daisuke Nishimura
  0 siblings, 0 replies; 15+ messages in thread
From: Daisuke Nishimura @ 2010-06-11  4:59 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, linux-kernel, Daisuke Nishimura

On Fri, 11 Jun 2010 13:52:02 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Fri, 11 Jun 2010 13:37:44 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > > @@ -2432,15 +2463,18 @@ mem_cgroup_uncharge_swapcache(struct pag
> > >  	if (!swapout) /* this was a swap cache but the swap is unused ! */
> > >  		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
> > >  
> > > -	memcg = __mem_cgroup_uncharge_common(page, ctype);
> > > +	memcg = try_get_mem_cgroup_from_page(page);
> > > +	if (!memcg)
> > > +		return;
> > > +
> > > +	__mem_cgroup_uncharge_common(page, ctype);
> > >  
> > >  	/* record memcg information */
> > > -	if (do_swap_account && swapout && memcg) {
> > > +	if (do_swap_account && swapout) {
> > >  		swap_cgroup_record(ent, css_id(&memcg->css));
> > >  		mem_cgroup_get(memcg);
> > >  	}
> > > -	if (swapout && memcg)
> > > -		css_put(&memcg->css);
> > > +	css_put(&memcg->css);
> > >  }
> > >  #endif
> > >  
> > hmm, this change seems to cause a problem.
> > I can see underflow of mem->memsw and of the "swap" field in memory.stat.
> > 
> > I think doing swap_cgroup_record() against a mem_cgroup that was not returned
> > by __mem_cgroup_uncharge_common() is bad behavior.
> > 
> > How about doing it like this? We can safely access the mem_cgroup while it
> > still holds memory.usage, i.e. before we call res_counter_uncharge().
> > After this change, it seems to work well.
> > 
> 
> Thank you! It seems to work. I'll merge your change.
> Can I add your Signed-off-by?
> 
Sure.

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>


* Re: [RFC][PATCH] memcg remove css_get/put per pages v2
  2010-06-09  6:59 ` [RFC][PATCH] memcg remove css_get/put per pages v2 KAMEZAWA Hiroyuki
  2010-06-10  2:34   ` Daisuke Nishimura
  2010-06-11  4:37   ` Daisuke Nishimura
@ 2010-06-11  6:11   ` Balbir Singh
  2010-06-11  6:21     ` KAMEZAWA Hiroyuki
  2 siblings, 1 reply; 15+ messages in thread
From: Balbir Singh @ 2010-06-11  6:11 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, nishimura, linux-kernel

* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-06-09 15:59:40]:

> +		if (consume_stock(mem)) {
> +			/*
> +			 * It seems dagerous to access memcg without css_get().
> +			 * But considering how consume_stok works, it's not
> +			 * necessary. If consume_stock success, some charges
> +			 * from this memcg are cached on this cpu. So, we
> +			 * don't need to call css_get()/css_tryget() before
> +			 * calling consume_stock().
> +			 */
> +			rcu_read_unlock();
> +			goto done;
> +		}
> +		if (!css_tryget(&mem->css)) {

If tryget fails, can one assume that this is due to a race and the mem is
about to be freed?


-- 
	Three Cheers,
	Balbir


* Re: [RFC][PATCH] memcg remove css_get/put per pages v2
  2010-06-11  6:11   ` Balbir Singh
@ 2010-06-11  6:21     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 15+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-06-11  6:21 UTC (permalink / raw)
  To: balbir; +Cc: linux-mm, nishimura, linux-kernel

On Fri, 11 Jun 2010 11:41:02 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2010-06-09 15:59:40]:
> 
> > +		if (consume_stock(mem)) {
> > +			/*
> > +			 * It seems dagerous to access memcg without css_get().
> > +			 * But considering how consume_stok works, it's not
> > +			 * necessary. If consume_stock success, some charges
> > +			 * from this memcg are cached on this cpu. So, we
> > +			 * don't need to call css_get()/css_tryget() before
> > +			 * calling consume_stock().
> > +			 */
> > +			rcu_read_unlock();
> > +			goto done;
> > +		}
> > +		if (!css_tryget(&mem->css)) {
> 
> If tryget fails, can one assume that this is due to a race and the mem is
> about to be freed?
> 
Yes, it's due to a race and "mem" will no longer be used.
This does the same thing that try_get_mem_cgroup_from_mm() does now.
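
Roughly, the lookup at the top of __mem_cgroup_try_charge() then looks like
this (a simplified sketch under the patch's assumptions, not the exact code;
the mem_cgroup_from_task()/mm->owner lookup is as in the current
try_get_mem_cgroup_from_mm(), and the "again"/"done" labels stand for the
retry loop and the exit path):

again:
	rcu_read_lock();
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!mem)) {
		rcu_read_unlock();
		goto done;	/* NULL handling simplified here */
	}
	if (consume_stock(mem)) {
		/* the per-cpu stock already caches charges from this memcg,
		 * so it cannot be destroyed under us: no css ref needed */
		rcu_read_unlock();
		goto done;
	}
	if (!css_tryget(&mem->css)) {
		/* lost the race with removal: mem is about to be freed,
		 * so drop the rcu lock and look the memcg up again */
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();
	/* from here on, the css reference keeps "mem" alive */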

Thanks,
-Kame


Thread overview: 15+ messages
2010-06-08  3:19 [RFC][PATCH] memcg remove css_get/put per pages KAMEZAWA Hiroyuki
2010-06-08  5:40 ` Balbir Singh
2010-06-09  0:47   ` KAMEZAWA Hiroyuki
2010-06-09  5:14     ` KAMEZAWA Hiroyuki
2010-06-08  7:31 ` Daisuke Nishimura
2010-06-09  0:54   ` KAMEZAWA Hiroyuki
2010-06-09  2:05     ` Daisuke Nishimura
2010-06-09  6:59 ` [RFC][PATCH] memcg remove css_get/put per pages v2 KAMEZAWA Hiroyuki
2010-06-10  2:34   ` Daisuke Nishimura
2010-06-10  2:49     ` KAMEZAWA Hiroyuki
2010-06-11  4:37   ` Daisuke Nishimura
2010-06-11  4:52     ` KAMEZAWA Hiroyuki
2010-06-11  4:59       ` Daisuke Nishimura
2010-06-11  6:11   ` Balbir Singh
2010-06-11  6:21     ` KAMEZAWA Hiroyuki
