* [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
@ 2008-11-24 19:50 Rik van Riel
  2008-11-24 20:53 ` Andrew Morton
                   ` (3 more replies)
  0 siblings, 4 replies; 23+ messages in thread
From: Rik van Riel @ 2008-11-24 19:50 UTC (permalink / raw)
  To: linux-mm; +Cc: linux-kernel, mel, KOSAKI Motohiro, akpm

Sometimes the VM spends the first few priority rounds rotating back
referenced pages and submitting IO.  Once we get to a lower priority,
sometimes the VM ends up freeing way too many pages.

The fix is relatively simple: in shrink_zone() we can check how many
pages we have already freed; direct reclaim tasks break out of the
scanning loop once they have freed enough pages and have reached
a lower priority level.

However, in order to do this we do need to know how many pages we already
freed, so move nr_reclaimed into scan_control.

Signed-off-by: Rik van Riel <riel@redhat.com>
---
Kosaki, this should address the zone scanning pressure issue.

Nick, this includes the cleanups suggested by you.

 mm/vmscan.c |   62 +++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

Index: linux-2.6.28-rc5/mm/vmscan.c
===================================================================
--- linux-2.6.28-rc5.orig/mm/vmscan.c	2008-11-17 15:22:22.000000000 -0500
+++ linux-2.6.28-rc5/mm/vmscan.c	2008-11-24 14:47:17.000000000 -0500
@@ -52,6 +52,9 @@ struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
+	/* Number of pages freed so far during a call to shrink_zones() */
+	unsigned long nr_reclaimed;
+
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
@@ -1405,12 +1408,11 @@ static void get_scan_ratio(struct zone *
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static unsigned long shrink_zone(int priority, struct zone *zone,
+static void shrink_zone(int priority, struct zone *zone,
 				struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
-	unsigned long nr_reclaimed = 0;
 	unsigned long percent[2];	/* anon @ 0; file @ 1 */
 	enum lru_list l;
 
@@ -1451,10 +1453,21 @@ static unsigned long shrink_zone(int pri
 					(unsigned long)sc->swap_cluster_max);
 				nr[l] -= nr_to_scan;
 
-				nr_reclaimed += shrink_list(l, nr_to_scan,
+				sc->nr_reclaimed += shrink_list(l, nr_to_scan,
 							zone, sc, priority);
 			}
 		}
+		/*
+		 * On large memory systems, scan >> priority can become
+		 * really large. This is fine for the starting priority;
+		 * we want to put equal scanning pressure on each zone.
+		 * However, if the VM has a harder time of freeing pages,
+		 * with multiple processes reclaiming pages, the total
+		 * freeing target can get unreasonably large.
+		 */
+		if (sc->nr_reclaimed > sc->swap_cluster_max &&
+			sc->priority < DEF_PRIORITY && !current_is_kswapd())
+			break;
 	}
 
 	/*
@@ -1467,7 +1480,6 @@ static unsigned long shrink_zone(int pri
 		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
 	throttle_vm_writeout(sc->gfp_mask);
-	return nr_reclaimed;
 }
 
 /*
@@ -1481,16 +1493,13 @@ static unsigned long shrink_zone(int pri
  * b) The zones may be over pages_high but they must go *over* pages_high to
  *    satisfy the `incremental min' zone defense algorithm.
  *
- * Returns the number of reclaimed pages.
- *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
 {
 	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
-	unsigned long nr_reclaimed = 0;
 	struct zoneref *z;
 	struct zone *zone;
 
@@ -1521,10 +1530,8 @@ static unsigned long shrink_zones(int pr
 							priority);
 		}
 
-		nr_reclaimed += shrink_zone(priority, zone, sc);
+		shrink_zone(priority, zone, sc);
 	}
-
-	return nr_reclaimed;
 }
 
 /*
@@ -1549,7 +1556,6 @@ static unsigned long do_try_to_free_page
 	int priority;
 	unsigned long ret = 0;
 	unsigned long total_scanned = 0;
-	unsigned long nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	struct zoneref *z;
@@ -1577,7 +1583,7 @@ static unsigned long do_try_to_free_page
 		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
-		nr_reclaimed += shrink_zones(priority, zonelist, sc);
+		shrink_zones(priority, zonelist, sc);
 		/*
 		 * Don't shrink slabs when reclaiming memory from
 		 * over limit cgroups
@@ -1585,13 +1591,13 @@ static unsigned long do_try_to_free_page
 		if (scan_global_lru(sc)) {
 			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
 			if (reclaim_state) {
-				nr_reclaimed += reclaim_state->reclaimed_slab;
+				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
 				reclaim_state->reclaimed_slab = 0;
 			}
 		}
 		total_scanned += sc->nr_scanned;
-		if (nr_reclaimed >= sc->swap_cluster_max) {
-			ret = nr_reclaimed;
+		if (sc->nr_reclaimed >= sc->swap_cluster_max) {
+			ret = sc->nr_reclaimed;
 			goto out;
 		}
 
@@ -1614,7 +1620,7 @@ static unsigned long do_try_to_free_page
 	}
 	/* top priority shrink_zones still had more to do? don't OOM, then */
 	if (!sc->all_unreclaimable && scan_global_lru(sc))
-		ret = nr_reclaimed;
+		ret = sc->nr_reclaimed;
 out:
 	/*
 	 * Now that we've scanned all the zones at this priority level, note
@@ -1709,7 +1715,6 @@ static unsigned long balance_pgdat(pg_da
 	int priority;
 	int i;
 	unsigned long total_scanned;
-	unsigned long nr_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
@@ -1728,7 +1733,7 @@ static unsigned long balance_pgdat(pg_da
 
 loop_again:
 	total_scanned = 0;
-	nr_reclaimed = 0;
+	sc.nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
@@ -1814,11 +1819,11 @@ loop_again:
 			 */
 			if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
 						end_zone, 0))
-				nr_reclaimed += shrink_zone(priority, zone, &sc);
+				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
-			nr_reclaimed += reclaim_state->reclaimed_slab;
+			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
 			if (zone_is_all_unreclaimable(zone))
 				continue;
@@ -1832,7 +1837,7 @@ loop_again:
 			 * even in laptop mode
 			 */
 			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
+			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 		}
 		if (all_zones_ok)
@@ -1850,7 +1855,7 @@ loop_again:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
+		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
 			break;
 	}
 out:
@@ -1872,7 +1877,7 @@ out:
 		goto loop_again;
 	}
 
-	return nr_reclaimed;
+	return sc.nr_reclaimed;
 }
 
 /*
@@ -2224,7 +2229,6 @@ static int __zone_reclaim(struct zone *z
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
 	int priority;
-	unsigned long nr_reclaimed = 0;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2257,9 +2261,9 @@ static int __zone_reclaim(struct zone *z
 		priority = ZONE_RECLAIM_PRIORITY;
 		do {
 			note_zone_scanning_priority(zone, priority);
-			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			shrink_zone(priority, zone, &sc);
 			priority--;
-		} while (priority >= 0 && nr_reclaimed < nr_pages);
+		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 	}
 
 	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2283,13 +2287,13 @@ static int __zone_reclaim(struct zone *z
 		 * Update nr_reclaimed by the number of slab pages we
 		 * reclaimed from this zone.
 		 */
-		nr_reclaimed += slab_reclaimable -
+		sc.nr_reclaimed += slab_reclaimable -
 			zone_page_state(zone, NR_SLAB_RECLAIMABLE);
 	}
 
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
-	return nr_reclaimed >= nr_pages;
+	return sc.nr_reclaimed >= nr_pages;
 }
 
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)


* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
  2008-11-24 19:50 [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Rik van Riel
@ 2008-11-24 20:53 ` Andrew Morton
  2008-11-25 11:35 ` KOSAKI Motohiro
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 23+ messages in thread
From: Andrew Morton @ 2008-11-24 20:53 UTC (permalink / raw)
  To: Rik van Riel; +Cc: linux-mm, linux-kernel, mel, kosaki.motohiro

On Mon, 24 Nov 2008 14:50:57 -0500
Rik van Riel <riel@redhat.com> wrote:

> Sometimes the VM spends the first few priority rounds rotating back
> referenced pages and submitting IO.  Once we get to a lower priority,
> sometimes the VM ends up freeing way too many pages.

It would help (a lot) if we had a much more specific and detailed
description of the problem which is being fixed.  Nobody has noticed it
in half a decade, so it can't be very serious?

> The fix is relatively simple: in shrink_zone() we can check how many
> pages we have already freed, direct reclaim tasks break out of the
> scanning loop if they have already freed enough pages and have reached
> a lower priority level.

So in the common scenario where there's a lot of dirty highmem and
little dirty lowmem, the kernel will start reclaiming highmem at a
vastly higher rate than lowmem.  iirc, this was the reason why this
change was tried then reverted.

Please demonstrate that this regression is not worse than the problem
which is being fixed!

> However, in order to do this we do need to know how many pages we already
> freed, so move nr_reclaimed into scan_control.

Thus carrying the state across the *entire* scanning pass: all zones.

So as soon as sc.nr_reclaimed exceeds swap_cluster_max, the scanner
will fall into a different mode for the remaining zones wherein it will
scan only swap_cluster_max pages from them, then will bale.

This will heavily bias scanning onto the zones at the start of the zone
list.  In fact it probably means that the zone at the head of the
zonelist gets thrashed and the remaining zones will just sit there
doing almost nothing.  Where's the sense in that?

Has any testing been done to demonstrate and quantify this effect?
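
A rough user-space sketch of the effect being described (not the kernel
code itself; the zone sizes, the priority, and the fraction of each
scanned batch that gets freed are invented purely for illustration):

	#include <stdio.h>

	#define SWAP_CLUSTER_MAX	32
	#define DEF_PRIORITY		12

	int main(void)
	{
		/* zonelist order: a large zone first, smaller zones after it */
		unsigned long zone_pages[] = { 262144, 65536, 4096 };
		unsigned long nr_reclaimed = 0;	/* stands in for sc->nr_reclaimed */
		int priority = 9;		/* some priority below DEF_PRIORITY */
		int i;

		for (i = 0; i < 3; i++) {
			unsigned long to_scan = zone_pages[i] >> priority;
			unsigned long scanned = 0;

			while (scanned < to_scan) {
				scanned += SWAP_CLUSTER_MAX;
				/* assume a quarter of each batch gets freed */
				nr_reclaimed += SWAP_CLUSTER_MAX / 4;
				/* the per-zone bail-out check from the patch */
				if (nr_reclaimed > SWAP_CLUSTER_MAX &&
				    priority < DEF_PRIORITY)
					break;
			}
			printf("zone %d: wanted %lu, scanned %lu\n",
			       i, to_scan, scanned);
		}
		return 0;
	}

Once the first zone has pushed nr_reclaimed past the threshold, every
remaining zone scans roughly one batch regardless of its size.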

> Signed-off-by: Rik van Riel <riel@redhat.com>
> ---
> Kosaki, this should address the zone scanning pressure issue.

What is the "zone scanning pressure issue"?

Please don't put "should" in a vmscan changelog :( Either it does, or
it does not?


This should look familiar:

	commit e468e46a9bea3297011d5918663ce6d19094cf87
	Author: akpm <akpm>
	Date:   Thu Jun 24 15:53:52 2004 +0000

	[PATCH] vmscan.c: dont reclaim too many pages
	    
	    The shrink_zone() logic can, under some circumstances, cause far too many
	    pages to be reclaimed.  Say, we're scanning at high priority and suddenly hit
	    a large number of reclaimable pages on the LRU.                                                                        
	    Change things so we bale out when SWAP_CLUSTER_MAX pages have been reclaimed.
	    
	    Signed-off-by: Andrew Morton <akpm@osdl.org>
	    Signed-off-by: Linus Torvalds <torvalds@osdl.org>
	    
	    BKrev: 40daf910sac4yN_aUhhJF2U8Upx1ww


And here is where it was reverted.  Note that this was nearly two years
later!  It takes that long for these things to be discovered, analysed
and fixed.



	commit 210fe530305ee50cd889fe9250168228b2994f32
	Author: Andrew Morton <akpm@osdl.org>
	Date:   Fri Jan 6 00:11:14 2006 -0800

	    [PATCH] vmscan: balancing fix
	    
	    Revert a patch which went into 2.6.8-rc1.  The changelog for that patch was:
	    
	      The shrink_zone() logic can, under some circumstances, cause far too many
	      pages to be reclaimed.  Say, we're scanning at high priority and suddenly
	      hit a large number of reclaimable pages on the LRU.
	    
	      Change things so we bale out when SWAP_CLUSTER_MAX pages have been
	      reclaimed.
	    
	    Problem is, this change caused significant imbalance in inter-zone scan
	    balancing by truncating scans of larger zones.
	    
	    Suppose, for example, ZONE_HIGHMEM is 10x the size of ZONE_NORMAL.  The zone
	    balancing algorithm would require that if we're scanning 100 pages of
	    ZONE_HIGHMEM, we should scan 10 pages of ZONE_NORMAL.  But this logic will
	    cause the scanning of ZONE_HIGHMEM to bale out after only 32 pages are
	    reclaimed.  Thus effectively causing smaller zones to be scanned relatively
	    harder than large ones.
	    
	    Now I need to remember what the workload was which caused me to write this
	    patch originally, then fix it up in a different way...
	    
	    Signed-off-by: Andrew Morton <akpm@osdl.org>
	    Signed-off-by: Linus Torvalds <torvalds@osdl.org>


* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
  2008-11-24 19:50 [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Rik van Riel
  2008-11-24 20:53 ` Andrew Morton
@ 2008-11-25 11:35 ` KOSAKI Motohiro
  2008-11-25 13:32   ` Rik van Riel
  2008-11-28  7:02   ` KOSAKI Motohiro
  2008-11-26  2:24 ` KOSAKI Motohiro
  2008-11-27 17:36 ` [rfc] vmscan: serialize aggressive reclaimers Johannes Weiner
  3 siblings, 2 replies; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-25 11:35 UTC (permalink / raw)
  To: Rik van Riel; +Cc: kosaki.motohiro, linux-mm, linux-kernel, mel, akpm

> Sometimes the VM spends the first few priority rounds rotating back
> referenced pages and submitting IO.  Once we get to a lower priority,
> sometimes the VM ends up freeing way too many pages.
> 
> The fix is relatively simple: in shrink_zone() we can check how many
> pages we have already freed, direct reclaim tasks break out of the
> scanning loop if they have already freed enough pages and have reached
> a lower priority level.
> 
> However, in order to do this we do need to know how many pages we already
> freed, so move nr_reclaimed into scan_control.
> 
> Signed-off-by: Rik van Riel <riel@redhat.com>
> ---
> Kosaki, this should address the zone scanning pressure issue.

Hmmmm. I still don't like the behavior when priority==DEF_PRIORITY,
but I should also back that up with code and a benchmark.

Therefore, I'll try to measure this patch this week.

thanks.



* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
  2008-11-25 11:35 ` KOSAKI Motohiro
@ 2008-11-25 13:32   ` Rik van Riel
  2008-11-25 14:30     ` KOSAKI Motohiro
  2008-11-28  7:02   ` KOSAKI Motohiro
  1 sibling, 1 reply; 23+ messages in thread
From: Rik van Riel @ 2008-11-25 13:32 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: linux-mm, linux-kernel, mel, akpm

KOSAKI Motohiro wrote:
>> Sometimes the VM spends the first few priority rounds rotating back
>> referenced pages and submitting IO.  Once we get to a lower priority,
>> sometimes the VM ends up freeing way too many pages.
>>
>> The fix is relatively simple: in shrink_zone() we can check how many
>> pages we have already freed, direct reclaim tasks break out of the
>> scanning loop if they have already freed enough pages and have reached
>> a lower priority level.
>>
>> However, in order to do this we do need to know how many pages we already
>> freed, so move nr_reclaimed into scan_control.
>>
>> Signed-off-by: Rik van Riel <riel@redhat.com>
>> ---
>> Kosaki, this should address the zone scanning pressure issue.
> 
> hmmmm. I still don't like the behavior when priority==DEF_PRIORITY.
> but I also should explain by code and benchmark.

Well, the behaviour when priority==DEF_PRIORITY is the
same as the kernel's behaviour without the patch...

> therefore, I'll try to mesure this patch in this week.

Looking forward to it.

-- 
All rights reversed.


* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
  2008-11-25 13:32   ` Rik van Riel
@ 2008-11-25 14:30     ` KOSAKI Motohiro
  0 siblings, 0 replies; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-25 14:30 UTC (permalink / raw)
  To: Rik van Riel; +Cc: linux-mm, linux-kernel, mel, akpm

2008/11/25 Rik van Riel <riel@redhat.com>:
> KOSAKI Motohiro wrote:
>>>
>>> Sometimes the VM spends the first few priority rounds rotating back
>>> referenced pages and submitting IO.  Once we get to a lower priority,
>>> sometimes the VM ends up freeing way too many pages.
>>>
>>> The fix is relatively simple: in shrink_zone() we can check how many
>>> pages we have already freed, direct reclaim tasks break out of the
>>> scanning loop if they have already freed enough pages and have reached
>>> a lower priority level.
>>>
>>> However, in order to do this we do need to know how many pages we already
>>> freed, so move nr_reclaimed into scan_control.
>>>
>>> Signed-off-by: Rik van Riel <riel@redhat.com>
>>> ---
>>> Kosaki, this should address the zone scanning pressure issue.
>>
>> hmmmm. I still don't like the behavior when priority==DEF_PRIORITY.
>> but I also should explain by code and benchmark.
>
> Well, the behaviour when priority==DEF_PRIORITY is the
> same as the kernel's behaviour without the patch...


Yes, but I think it decreases this patch's value...



>> therefore, I'll try to mesure this patch in this week.
>
> Looking forward to it.

thank you.


* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
  2008-11-24 19:50 [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Rik van Riel
  2008-11-24 20:53 ` Andrew Morton
  2008-11-25 11:35 ` KOSAKI Motohiro
@ 2008-11-26  2:24 ` KOSAKI Motohiro
  2008-11-27 17:36 ` [rfc] vmscan: serialize aggressive reclaimers Johannes Weiner
  3 siblings, 0 replies; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-26  2:24 UTC (permalink / raw)
  To: Rik van Riel; +Cc: kosaki.motohiro, linux-mm, linux-kernel, mel, akpm

> +		/*
> +		 * On large memory systems, scan >> priority can become
> +		 * really large. This is fine for the starting priority;
> +		 * we want to put equal scanning pressure on each zone.
> +		 * However, if the VM has a harder time of freeing pages,
> +		 * with multiple processes reclaiming pages, the total
> +		 * freeing target can get unreasonably large.
> +		 */
> +		if (sc->nr_reclaimed > sc->swap_cluster_max &&
> +			sc->priority < DEF_PRIORITY && !current_is_kswapd())
> +			break;

Typo: this patch doesn't compile.
shrink_zone() gets the priority as a function argument; scan_control has
no ->priority member.

---
 mm/vmscan.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1469,7 +1469,7 @@ static void shrink_zone(int priority, st
 		 * freeing target can get unreasonably large.
 		 */
 		if (sc->nr_reclaimed > sc->swap_cluster_max &&
-		    sc->priority < DEF_PRIORITY && !current_is_kswapd())
+		    priority < DEF_PRIORITY && !current_is_kswapd())
 			break;
 	}
 



* [rfc] vmscan: serialize aggressive reclaimers
  2008-11-24 19:50 [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Rik van Riel
                   ` (2 preceding siblings ...)
  2008-11-26  2:24 ` KOSAKI Motohiro
@ 2008-11-27 17:36 ` Johannes Weiner
  2008-11-29  7:46   ` KOSAKI Motohiro
  3 siblings, 1 reply; 23+ messages in thread
From: Johannes Weiner @ 2008-11-27 17:36 UTC (permalink / raw)
  To: Rik van Riel; +Cc: linux-mm, linux-kernel, mel, KOSAKI Motohiro, akpm

Since we have to pull through a reclaim cycle once we have committed to it,
what do you think about serializing the lower priority levels
completely?

The idea is that when one reclaimer has done a low priority level
iteration with a huge reclaim target, chances are that succeeding
reclaimers don't even need to drop to lower levels at all because
enough memory has already been freed.

My testprogram maps and faults in a file that is about as large as my
physical memory.  Then it spawns off n processes that try allocate
1/2n of total memory in anon pages, i.e. half of it in sum.  After it
ran, I check how much memory has been reclaimed.  But my zone sizes
are too small to induce enormous reclaim targets so I don't see vast
over-reclaims.
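
For reference, a minimal sketch of a test program along those lines (the
file path, the assumed memory size, and the process count are placeholders;
the real program may differ in detail):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/mman.h>
	#include <sys/wait.h>

	int main(void)
	{
		size_t total = (size_t)2 << 30;	/* pretend physical memory: 2GB */
		int nproc = 4, fd, i;
		unsigned long sum = 0;
		size_t off;
		char *map;

		/* map and fault in a file about as large as physical memory */
		fd = open("/tmp/bigfile", O_RDONLY);
		if (fd < 0) { perror("open"); return 1; }
		map = mmap(NULL, total, PROT_READ, MAP_SHARED, fd, 0);
		if (map == MAP_FAILED) { perror("mmap"); return 1; }
		for (off = 0; off < total; off += 4096)
			sum += map[off];	/* touch every page */

		/* spawn n processes, each allocating 1/2n of total memory */
		for (i = 0; i < nproc; i++) {
			if (fork() == 0) {
				size_t sz = total / (2 * nproc);
				char *anon = malloc(sz);
				if (anon)
					memset(anon, 0xaa, sz);	/* dirty anon pages */
				sleep(10);	/* hold on to the memory */
				_exit(0);
			}
		}
		for (i = 0; i < nproc; i++)
			wait(NULL);

		/* check pgscan and pgsteal counters in /proc/vmstat afterwards */
		printf("checksum %lu\n", sum);
		return 0;
	}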

I have measured the time of other tests on an SMP machine with 4 cores
and the following patch applied.  I couldn't see any performance
degradation.  But since the bug is not triggerable here, I can not
prove it helps the original problem, either.

The level where it starts serializing is chosen pretty arbitrarily.
Suggestions welcome :)

	Hannes

---

Prevent over-reclaiming by serializing direct reclaimers below a
certain priority level.

Over-reclaiming happens when the sum of the reclaim targets of all
reclaiming processes is larger than the sum of the needed free pages,
thus leading to excessive eviction of more cache and anonymous pages
than required.

A scan iteration over all zones cannot be aborted intermittently when
enough pages are reclaimed because that would mess up the scan balance
between the zones.  Instead, prevent too many processes from
simultaneously committing themselves to lower priority level scans in the
first place.

Chances are that after the exclusive reclaimer has finished, enough
memory has been freed that succeeding scanners don't need to drop to
lower priority levels at all anymore.

Signed-off-by: Johannes Weiner <hannes@saeurebad.de>
---
 mm/vmscan.c |   20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -35,6 +35,7 @@
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
+#include <linux/wait.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
@@ -42,6 +43,7 @@
 #include <linux/sysctl.h>
 
 #include <asm/tlbflush.h>
+#include <asm/atomic.h>
 #include <asm/div64.h>
 
 #include <linux/swapops.h>
@@ -1546,10 +1548,15 @@ static unsigned long shrink_zones(int pr
  * returns:	0, if no pages reclaimed
  * 		else, the number of pages reclaimed
  */
+
+static DECLARE_WAIT_QUEUE_HEAD(reclaim_wait);
+static atomic_t reclaim_exclusive = ATOMIC_INIT(0);
+
 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 					struct scan_control *sc)
 {
 	int priority;
+	int exclusive = 0;
 	unsigned long ret = 0;
 	unsigned long total_scanned = 0;
 	unsigned long nr_reclaimed = 0;
@@ -1580,6 +1587,14 @@ static unsigned long do_try_to_free_page
 		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
+		/*
+		 * Serialize aggressive reclaimers
+		 */
+		if (priority <= DEF_PRIORITY / 2 && !exclusive) {
+			wait_event(reclaim_wait,
+				!atomic_cmpxchg(&reclaim_exclusive, 0, 1));
+			exclusive = 1;
+		}
 		nr_reclaimed += shrink_zones(priority, zonelist, sc);
 		/*
 		 * Don't shrink slabs when reclaiming memory from
@@ -1629,6 +1644,11 @@ out:
 	if (priority < 0)
 		priority = 0;
 
+	if (exclusive) {
+		atomic_set(&reclaim_exclusive, 0);
+		wake_up(&reclaim_wait);
+	}
+
 	if (scan_global_lru(sc)) {
 		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
 


* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
  2008-11-25 11:35 ` KOSAKI Motohiro
  2008-11-25 13:32   ` Rik van Riel
@ 2008-11-28  7:02   ` KOSAKI Motohiro
  2008-11-28 11:03     ` Rik van Riel
  1 sibling, 1 reply; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-28  7:02 UTC (permalink / raw)
  To: Rik van Riel; +Cc: kosaki.motohiro, linux-mm, linux-kernel, mel, akpm

Hi

I measured some data this week and got some interesting results.


> > Kosaki, this should address the zone scanning pressure issue.
> 
> hmmmm. I still don't like the behavior when priority==DEF_PRIORITY.
> but I also should explain by code and benchmark.
> 
> therefore, I'll try to mesure this patch in this week.

1. Many processes reclaiming at the same time

I measured the following benchmark ten times:

$ hackbench 140 process 300


rc6+stream: 2.6.28-rc6 + 
            vmscan-evict-streaming-io-first.patch (in -mm)
rvr:        above + Rik's bailing out patch
kosaki:     above + my modification (attached at the end of this mail)


result (unit: second)

	rc6+stream	rvr		+kosaki patch
-----------------------------------------------------------
	175.457		62.514		104.87
	168.409		225.698		133.128
	154.658		114.694		194.867
	46.148		179.108		11.82
	289.575		111.08		60.779
	146.871		189.796		86.515
	305.036		114.124		54.009
	225.21		112.999		273.841
	224.674		227.842		166.547
	118.071		81.869		84.431
	------------------------------------------
avg	185.4109	141.9724	117.0807
std	74.18484	55.93676126	73.28439987
min	46.148		62.514		11.82
max	305.036		227.842		273.841


OK.
Rik's patch improves things by about 30% and my patch improves them by another 20%.
In total, we get about a 50% improvement.



2. "communicate each other application" conflict the other

console A
  $ dbench 100

console B
  $ hackbench 130 process 300


hackbench result (unit: second)

	rc6+stream	rvr		+kosaki
====================================================
	588.74		57.084		139.448
	569.876		325.063		52.233
	427.078		295.259		53.492
	65.264		132.873		59.009
	136.636		136.367		319.115
	221.538		76.352		187.937
	244.881		125.774		158.846
	37.523		115.77		122.17
	182.485		382.376		105.195
	273.983		299.577		130.478
	----------------------------------------
avg	274.8004	194.6495	132.7923
std	184.4902365	111.5699478	75.88299814
min	37.523		57.084		52.233
max	588.74		382.376		319.115



That's more interesting.

-rc6 wins on the min score, but it also has the worst max score. Why?

Traditional reclaim assumes the following two cases are equivalent:

  case (1)
    - task (a) spent 1 sec for reclaim.
    - task (b) spent 1 sec for reclaim.
    - task (c) spent 1 sec for reclaim.

  case (2)
    - task (a) spent 3 sec for reclaim.
    - task (b) spent 0 sec for reclaim.
    - task (c) spent 0 sec for reclaim.

However, when these tasks communicate with each other, that assumption doesn't hold.

If task (2)-(a) is a dbench process, you are lucky.
dbench processes don't communicate with each other, so performance is
determined by the average, and one slow process doesn't slow down the
whole system.

Then both hackbench and dbench get good results.


On the other hand, if task (2)-(a) is a hackbench process, you are unlucky.
hackbench processes communicate with each other,
so performance is determined by the slowest process.

Then hackbench performance drops dramatically while dbench
performance barely improves.

Therefore, I think case (1) is better.
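
A toy calculation of this point, using the made-up per-task reclaim times
from the two cases above (independent tasks see the average delay,
communicating tasks see the worst one):

	#include <stdio.h>

	int main(void)
	{
		double reclaim[2][3] = {
			{ 1.0, 1.0, 1.0 },	/* case (1): seconds per task */
			{ 3.0, 0.0, 0.0 },	/* case (2) */
		};
		int c, i;

		for (c = 0; c < 2; c++) {
			double sum = 0.0, worst = 0.0;

			for (i = 0; i < 3; i++) {
				sum += reclaim[c][i];
				if (reclaim[c][i] > worst)
					worst = reclaim[c][i];
			}
			printf("case (%d): average %.1fs, worst %.1fs\n",
			       c + 1, sum / 3, worst);
		}
		return 0;
	}

The average is the same in both cases, but the worst delay triples in
case (2), which is what a communicating workload like hackbench feels.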



So, Rik's patch and my patch improve completely different aspects of reclaim.
In general, kernel reclaim has several key goals:

 (1) if the system has droppable cache, it shouldn't OOM kill.
 (2) if the system has available swap space, it should avoid OOM killing
     as far as possible.
 (3) if the system has enough free memory, it shouldn't reclaim any pages
     at all.
 (4) if memory pressure is light, it shouldn't inflict heavy reclaim
     latency on applications.

Rik's patch improves (3); my modification (in this mail) improves (4).


BTW, a reclaim throttle patch would bring yet another improvement, along
the lines of Hannes's "vmscan: serialize aggressive reclaimers" patch.
Rik's patch does improve (3), but it isn't perfect: if 10000 threads enter
reclaim at the same time, the system reclaims 32*10000 pages,
which is definitely too much.
But that is obviously off-topic :)


Rik, could you please merge my modification into your patch?

======
In the past, HPC folks have asked multiple times for lighter reclaim latency.
(e.g. http://marc.info/?l=linux-mm&m=121418258514542&w=2)

That is because their workloads have the following characteristics:
  - the workload creates many processes or many threads.
  - typically, write() is called at the end of a job,
    so much of the file cache is never reused.
  - in a parallel job, the processes communicate with each other,
    so system performance is determined by the slowest thread,
    and a large reclaim latency directly reduces system performance.

Actually, kswapd background reclaim and direct reclaim have completely
different purposes and goals.

background reclaim
  - kswapd doesn't need its latency reduced;
    it isn't observed by the end user.
  - kswapd shouldn't only increase free pages, it should also
    keep the zones balanced.

foreground reclaim
  - it runs in the application's task context.
  - as far as possible, it shouldn't add latency overhead.
  - the purpose of this reclaim is to make memory for the task's _own_ use;
    it doesn't need to care about other tasks' memory.
    kswapd does that.


Most developers didn't pay attention to HPC in the past.
However, these days the number of CPUs is increasing rapidly and
parallel programming techniques are becoming common.
(e.g. gcc now supports OpenMP directly)


Therefore, we shouldn't ignore parallel applications these days.


o remove the priority==DEF_PRIORITY condition
o shrink_zones() should also bail out.

---
 mm/vmscan.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1469,7 +1469,7 @@ static void shrink_zone(int priority, st
 		 * freeing target can get unreasonably large.
 		 */
 		if (sc->nr_reclaimed > sc->swap_cluster_max &&
-		    priority < DEF_PRIORITY && !current_is_kswapd())
+		    !current_is_kswapd())
 			break;
 	}
 
@@ -1534,6 +1534,8 @@ static void shrink_zones(int priority, s
 		}
 
 		shrink_zone(priority, zone, sc);
+		if (sc->nr_reclaimed > sc->swap_cluster_max)
+			break;
 	}
 }
 



* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
  2008-11-28  7:02   ` KOSAKI Motohiro
@ 2008-11-28 11:03     ` Rik van Riel
  2008-11-29 10:53       ` KOSAKI Motohiro
  0 siblings, 1 reply; 23+ messages in thread
From: Rik van Riel @ 2008-11-28 11:03 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: linux-mm, linux-kernel, mel, akpm

KOSAKI Motohiro wrote:
> Hi
> 
> I mesured some data in this week and I got some interesting data.

> Rik patch improve about 30% and my patch improve 20% more.
> totally, We got about 50% improvement.

Very interesting indeed!   I did not know there was this easy
a reproducer of the problem that my patch is trying to solve.

> 	rc6+stream	rvr		+kosaki
> 	----------------------------------------
> avg	274.8004	194.6495	132.7923
> std	184.4902365	111.5699478	75.88299814
> min	37.523		57.084		52.233
> max	588.74		382.376		319.115

Impressive.

> So, rik patch and my patch improve perfectly different reclaim aspect.
> In general, kernel reclaim processing has several key goals.
> 
>  (1) if system has droppable cache, system shouldn't happen oom kill.
>  (2) if system has avaiable swap space, system shouldn't happen 
>      oom kill as poosible as.
>  (3) if system has enough free memory, system shouldn't reclaim any page
>      at all.
>  (4) if memory pressure is lite, system shouldn't cause heavy reclaim 
>      latency to application.
> 
> rik patch improve (3), my (this mail) modification improve (4).

Actually, to achieve (3) we would want to skip zones with way
more than enough free memory in shrink_zones().  Kswapd already
skips zones like this in balance_pgdat(), so we definitely want
this change:

@@ -1519,6 +1519,9 @@ static void shrink_zones(int priority, s
                         if (zone_is_all_unreclaimable(zone) &&
                                                 priority != DEF_PRIORITY)
                                 continue;       /* Let kswapd poll it */
+                       if (zone_watermark_ok(zone, order, 4*zone->pages_high,
+                                               end_zone, 0))
+                               continue;       /* Lots free already */
                         sc->all_unreclaimable = 0;
                 } else {
                         /*

I'm sending a patch with this right now :)

> Actually, kswapd background reclaim and direct reclaim have perfectly
> different purpose and goal.
> 
> background reclaim
>   - kswapd don't need latency overhead reducing.
>     it isn't observe from end user.
>   - kswapd shoudn't only increase free pages, but also should 
>     zone balancing.
> 
> foreground reclaim
>   - it used application task context.
>   - as possible as, it shouldn't increase latency overhead.
>   - this reclaiming purpose is to make the memory for _own_ taks.
>     for other tasks memory don't need to concern.
>     kswap does it.

I am not entirely convinced that breaking out of the loop early
in a zone is not harmful for direct reclaimers.  Maybe it works
fine, maybe it won't.

Or maybe direct reclaimers should start scanning the largest zone
first, so your change can be done with the lowest risk possible?

Having said that, the 20% additional performance achieved with
your changes is impressive.

> o remove priority==DEF_PRIORITY condision

This one could definitely be worth considering.

However, looking at the changeset that was backed out in the
early 2.6 series suggests that it may not be the best idea.

> o shrink_zones() also should have bailing out feature.

This one is similar.  What are the downsides of skipping a
zone entirely, when that zone has pages that should be freed?

If it can lead to the VM reclaiming new pages from one zone,
while leaving old pages from another zone in memory, we can
greatly reduce the caching efficiency of the page cache.

> ---
>  mm/vmscan.c |    4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> Index: b/mm/vmscan.c
> ===================================================================
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1469,7 +1469,7 @@ static void shrink_zone(int priority, st
>  		 * freeing target can get unreasonably large.
>  		 */
>  		if (sc->nr_reclaimed > sc->swap_cluster_max &&
> -		    priority < DEF_PRIORITY && !current_is_kswapd())
> +		    !current_is_kswapd())
>  			break;
>  	}
>  
> @@ -1534,6 +1534,8 @@ static void shrink_zones(int priority, s
>  		}
>  
>  		shrink_zone(priority, zone, sc);
> +		if (sc->nr_reclaimed > sc->swap_cluster_max)
> +			break;
>  	}
>  }

-- 
All rights reversed.


* Re: [rfc] vmscan: serialize aggressive reclaimers
  2008-11-27 17:36 ` [rfc] vmscan: serialize aggressive reclaimers Johannes Weiner
@ 2008-11-29  7:46   ` KOSAKI Motohiro
  2008-11-29 15:39     ` Johannes Weiner
  0 siblings, 1 reply; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-29  7:46 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: kosaki.motohiro, Rik van Riel, linux-mm, linux-kernel, mel, akpm

> Since we have to pull through a reclaim cycle once we commited to it,
> what do you think about serializing the lower priority levels
> completely?
> 
> The idea is that when one reclaimer has done a low priority level
> iteration with a huge reclaim target, chances are that succeeding
> reclaimers don't even need to drop to lower levels at all because
> enough memory has already been freed.
> 
> My testprogram maps and faults in a file that is about as large as my
> physical memory.  Then it spawns off n processes that try allocate
> 1/2n of total memory in anon pages, i.e. half of it in sum.  After it
> ran, I check how much memory has been reclaimed.  But my zone sizes
> are too small to induce enormous reclaim targets so I don't see vast
> over-reclaims.
> 
> I have measured the time of other tests on an SMP machine with 4 cores
> and the following patch applied.  I couldn't see any performance
> degradation.  But since the bug is not triggerable here, I can not
> prove it helps the original problem, either.

I wonder why none of the vmscan folks put actual performance improvement
numbers in their patch descriptions.

I think this patch points in the right direction,
but unfortunately, as I measured it, this implementation isn't fast.


> 
> The level where it starts serializing is chosen pretty arbitrarily.
> Suggestions welcome :)
> 
> 	Hannes
> 
> ---
> 
> Prevent over-reclaiming by serializing direct reclaimers below a
> certain priority level.
> 
> Over-reclaiming happens when the sum of the reclaim targets of all
> reclaiming processes is larger than the sum of the needed free pages,
> thus leading to excessive eviction of more cache and anonymous pages
> than required.
> 
> A scan iteration over all zones can not be aborted intermittently when
> enough pages are reclaimed because that would mess up the scan balance
> between the zones.  Instead, prevent that too many processes
> simultaneously commit themselves to lower priority level scans in the
> first place.
> 
> Chances are that after the exclusive reclaimer has finished, enough
> memory has been freed that succeeding scanners don't need to drop to
> lower priority levels at all anymore.
> 
> Signed-off-by: Johannes Weiner <hannes@saeurebad.de>
> ---
>  mm/vmscan.c |   20 ++++++++++++++++++++
>  1 file changed, 20 insertions(+)
> 
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -35,6 +35,7 @@
>  #include <linux/notifier.h>
>  #include <linux/rwsem.h>
>  #include <linux/delay.h>
> +#include <linux/wait.h>
>  #include <linux/kthread.h>
>  #include <linux/freezer.h>
>  #include <linux/memcontrol.h>
> @@ -42,6 +43,7 @@
>  #include <linux/sysctl.h>
>  
>  #include <asm/tlbflush.h>
> +#include <asm/atomic.h>
>  #include <asm/div64.h>
>  
>  #include <linux/swapops.h>
> @@ -1546,10 +1548,15 @@ static unsigned long shrink_zones(int pr
>   * returns:	0, if no pages reclaimed
>   * 		else, the number of pages reclaimed
>   */
> +
> +static DECLARE_WAIT_QUEUE_HEAD(reclaim_wait);
> +static atomic_t reclaim_exclusive = ATOMIC_INIT(0);
> +
>  static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
>  					struct scan_control *sc)
>  {
>  	int priority;
> +	int exclusive = 0;
>  	unsigned long ret = 0;
>  	unsigned long total_scanned = 0;
>  	unsigned long nr_reclaimed = 0;
> @@ -1580,6 +1587,14 @@ static unsigned long do_try_to_free_page
>  		sc->nr_scanned = 0;
>  		if (!priority)
>  			disable_swap_token();
> +		/*
> +		 * Serialize aggressive reclaimers
> +		 */
> +		if (priority <= DEF_PRIORITY / 2 && !exclusive) {

On a large machine, DEF_PRIORITY / 2 is already a catastrophic situation:
2^6 = 64,
so if a zone has 64GB of memory, that means a 1GB reclaim target.
I think restricting earlier is better.
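
A back-of-the-envelope check of those numbers, assuming 4KB pages:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long zone_bytes = 64ULL << 30;	  /* a 64GB zone */
		unsigned long long zone_pages = zone_bytes >> 12; /* 4KB pages */
		int priority = 12 / 2;	/* DEF_PRIORITY / 2 */
		unsigned long long target = zone_pages >> priority;

		printf("scan target: %llu pages (about %llu MB)\n",
		       target, target * 4 / 1024);
		return 0;
	}

That prints 262144 pages, about 1024 MB, i.e. roughly the 1GB quoted above.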


> +			wait_event(reclaim_wait,
> +				!atomic_cmpxchg(&reclaim_exclusive, 0, 1));
> +			exclusive = 1;
> +		}

If you want to restrict this to one task, you can use a mutex.
Also, this wait queue shouldn't be a global variable; it should be per-zone.

In addition, you don't consider recursive reclaim, and some tasks can't sleep there.


Please believe me, I have the richest reclaim-throttling experience on the planet.





* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
  2008-11-28 11:03     ` Rik van Riel
@ 2008-11-29 10:53       ` KOSAKI Motohiro
  2008-11-29 16:24         ` Rik van Riel
  0 siblings, 1 reply; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-29 10:53 UTC (permalink / raw)
  To: Rik van Riel, akpm; +Cc: kosaki.motohiro, linux-mm, linux-kernel, mel

> > So, rik patch and my patch improve perfectly different reclaim aspect.
> > In general, kernel reclaim processing has several key goals.
> > 
> >  (1) if system has droppable cache, system shouldn't happen oom kill.
> >  (2) if system has avaiable swap space, system shouldn't happen 
> >      oom kill as poosible as.
> >  (3) if system has enough free memory, system shouldn't reclaim any page
> >      at all.
> >  (4) if memory pressure is lite, system shouldn't cause heavy reclaim 
> >      latency to application.
> > 
> > rik patch improve (3), my (this mail) modification improve (4).
> 
> Actually, to achieve (3) we would want to skip zones with way
> more than enough free memory in shrink_zones().  Kswapd already
> skips zones like this in shrink_pgdat(), so we definately want
> this change:
> 
> @@ -1519,6 +1519,9 @@ static void shrink_zones(int priority, s
>                          if (zone_is_all_unreclaimable(zone) &&
>                                                  priority != DEF_PRIORITY)
>                                  continue;       /* Let kswapd poll it */
> +                       if (zone_watermark_ok(zone, order, 
> 4*zone->pages_high,
> +                                               end_zone, 0))
> +                               continue;       /* Lots free already */
>                          sc->all_unreclaimable = 0;
>                  } else {
>                          /*
> 
> I'm sending a patch with this right now :)

Please wait a few days.

Actually, I made a similar patch half a year ago,
but I dropped it because I observed a performance regression.

My recollection isn't clear, though;
I should measure it again.

My guess is that
zone_watermark_ok() is a very slow function: it doesn't only check
the number of free pages, it also checks memory fragmentation.
So if it is called under light memory pressure, we violate rule (4) above.




> > Actually, kswapd background reclaim and direct reclaim have perfectly
> > different purpose and goal.
> > 
> > background reclaim
> >   - kswapd don't need latency overhead reducing.
> >     it isn't observe from end user.
> >   - kswapd shoudn't only increase free pages, but also should 
> >     zone balancing.
> > 
> > foreground reclaim
> >   - it used application task context.
> >   - as possible as, it shouldn't increase latency overhead.
> >   - this reclaiming purpose is to make the memory for _own_ taks.
> >     for other tasks memory don't need to concern.
> >     kswap does it.
> 
> I am not entirely convinced that breaking out of the loop early
> in a zone is not harmful for direct reclaimers.  Maybe it works
> fine, maybe it won't.
> 
> Or maybe direct reclaimers should start scanning the largest zone
> first, so your change can be done with the lowest risk possible?
> 
> Having said that, the 20% additional performance achieved with
> your changes is impressive.
> 
> > o remove priority==DEF_PRIORITY condision
> 
> This one could definately be worth considering.
> 
> However, looking at the changeset that was backed out in the
> early 2.6 series suggests that it may not be the best idea.
> 
> > o shrink_zones() also should have bailing out feature.
> 
> This one is similar.  What are the downsides of skipping a
> zone entirely, when that zone has pages that should be freed?
> 
> If it can lead to the VM reclaiming new pages from one zone,
> while leaving old pages from another zone in memory, we can
> greatly reduce the caching efficiency of the page cache.

I think I can explain this logically.

First, please see the ML archive URL below;
it describes why akpm's "vmscan.c: dont reclaim too many pages" was dropped.

http://groups.google.co.jp/group/linux.kernel/browse_thread/thread/383853cdce059d1f/f13d5f87d726e325?hl=ja%3Fhl&lnk=gst&q=vmscan%3A+balancing+fix+akpm#f13d5f87d726e325


The old akpm patch restricted both direct reclaim and background reclaim,
so the discussion at that URL didn't separate the two cases either.
Your patch and mine restrict direct reclaim only.

At that time, Marcelo Tosatti reported that the akpm patch caused reclaim
imbalance in the FFSB benchmark.

I measured ffsb on 2.6.28-rc6 and with our patches.


Measured machine spec:
  CPU:   IA64 x 8
  MEM
    Node0: DMA ZONE:    2GB
           NORMAL ZONE: 2GB
    Node1: DMA ZONE:    4GB
                 -----------------
                 total  8GB

Configuration file used (the same as Marcelo's):

---------------------------------------------------
directio=0
time=300

[filesystem0]
location=/mnt/sdb1/kosaki/ffsb
num_files=20
num_dirs=10
max_filesize=91534338
min_filesize=65535
[end0]

[threadgroup0]
num_threads=10
write_size=2816
write_blocksize=4096
read_size=2816
read_blocksize=4096
create_weight=100
write_weight=30
read_weight=100
[end0]
--------------------------------------------------------

result:
------------  2.6.28-rc6 ----------------------------------

pgscan_kswapd_dma 10624
pgscan_kswapd_normal 20640

        -> normal/dma ratio 20640 / 10624 = 1.9

pgscan_direct_dma 576
pgscan_direct_normal 2528

        -> normal/dma ratio 2528 / 576 = 4.38

kswapd+direct dma    11200
kswapd+direct normal 23168

	-> normal/dma ratio 2.0

------------  rvr bail out ---------------------------------

pgscan_kswapd_dma 15552
pgscan_kswapd_normal 31936

        -> normal/dma ratio 2.05

pgscan_direct_dma 1216
pgscan_direct_normal 3968

        -> normal/dma ratio 3.26

kswapd+direct dma    16768
kswapd+direct normal 35904

	-> normal/dma ratio 2.1


------------  +kosaki ---------------------------------

pgscan_kswapd_dma 14208
pgscan_kswapd_normal 31616

        -> normal/dma ratio 31616/14208 = 2.25

pgscan_direct_dma 1024
pgscan_direct_normal 3328

        -> normal/dma ratio 3328/1024 = 3.25

kswapd+direct dma    15232
kswapd+direct normal 34944

	-> normal/dma ratio 2.2

----------------------------------------------------------

The results show three things:

  - rvr's and my patches do increase direct reclaim imbalance, indeed.
  - However, background reclaim scans far more pages than direct reclaim,
    so the direct reclaim imbalance is negligible in the big picture.
    rvr's patch doesn't reintroduce the zone imbalance issue.
  - rvr's priority==DEF_PRIORITY condition check doesn't improve
    zone balancing at all;
    we can drop it.

Again, I believe my patch improves VM scanning overall.

Any comments?


Andrew, I hope this measurement result can be added to rvr's bail-out patch description too.
Please let me know what I should do.




* Re: [rfc] vmscan: serialize aggressive reclaimers
  2008-11-29  7:46   ` KOSAKI Motohiro
@ 2008-11-29 15:39     ` Johannes Weiner
  0 siblings, 0 replies; 23+ messages in thread
From: Johannes Weiner @ 2008-11-29 15:39 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Rik van Riel, linux-mm, linux-kernel, mel, akpm

On Sat, Nov 29, 2008 at 04:46:24PM +0900, KOSAKI Motohiro wrote:
> > Since we have to pull through a reclaim cycle once we commited to it,
> > what do you think about serializing the lower priority levels
> > completely?
> > 
> > The idea is that when one reclaimer has done a low priority level
> > iteration with a huge reclaim target, chances are that succeeding
> > reclaimers don't even need to drop to lower levels at all because
> > enough memory has already been freed.
> > 
> > My testprogram maps and faults in a file that is about as large as my
> > physical memory.  Then it spawns off n processes that try allocate
> > 1/2n of total memory in anon pages, i.e. half of it in sum.  After it
> > ran, I check how much memory has been reclaimed.  But my zone sizes
> > are too small to induce enormous reclaim targets so I don't see vast
> > over-reclaims.
> > 
> > I have measured the time of other tests on an SMP machine with 4 cores
> > and the following patch applied.  I couldn't see any performance
> > degradation.  But since the bug is not triggerable here, I can not
> > prove it helps the original problem, either.
> 
> I wonder why nobody of vmscan folks write actual performance improvement value
> in patch description.

That's why I made it an RFC.  I haven't seriously tested it; I just
wanted to know what people who understand this better than I do think of
the idea.

> I think this patch point to right direction.
> but, unfortunately, this implementation isn't fast as I mesured as.

Fair enough.

> > The level where it starts serializing is chosen pretty arbitrarily.
> > Suggestions welcome :)
> > 
> > 	Hannes
> > 
> > ---
> > 
> > Prevent over-reclaiming by serializing direct reclaimers below a
> > certain priority level.
> > 
> > Over-reclaiming happens when the sum of the reclaim targets of all
> > reclaiming processes is larger than the sum of the needed free pages,
> > thus leading to excessive eviction of more cache and anonymous pages
> > than required.
> > 
> > A scan iteration over all zones can not be aborted intermittently when
> > enough pages are reclaimed because that would mess up the scan balance
> > between the zones.  Instead, prevent that too many processes
> > simultaneously commit themselves to lower priority level scans in the
> > first place.
> > 
> > Chances are that after the exclusive reclaimer has finished, enough
> > memory has been freed that succeeding scanners don't need to drop to
> > lower priority levels at all anymore.
> > 
> > Signed-off-by: Johannes Weiner <hannes@saeurebad.de>
> > ---
> >  mm/vmscan.c |   20 ++++++++++++++++++++
> >  1 file changed, 20 insertions(+)
> > 
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -35,6 +35,7 @@
> >  #include <linux/notifier.h>
> >  #include <linux/rwsem.h>
> >  #include <linux/delay.h>
> > +#include <linux/wait.h>
> >  #include <linux/kthread.h>
> >  #include <linux/freezer.h>
> >  #include <linux/memcontrol.h>
> > @@ -42,6 +43,7 @@
> >  #include <linux/sysctl.h>
> >  
> >  #include <asm/tlbflush.h>
> > +#include <asm/atomic.h>
> >  #include <asm/div64.h>
> >  
> >  #include <linux/swapops.h>
> > @@ -1546,10 +1548,15 @@ static unsigned long shrink_zones(int pr
> >   * returns:	0, if no pages reclaimed
> >   * 		else, the number of pages reclaimed
> >   */
> > +
> > +static DECLARE_WAIT_QUEUE_HEAD(reclaim_wait);
> > +static atomic_t reclaim_exclusive = ATOMIC_INIT(0);
> > +
> >  static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> >  					struct scan_control *sc)
> >  {
> >  	int priority;
> > +	int exclusive = 0;
> >  	unsigned long ret = 0;
> >  	unsigned long total_scanned = 0;
> >  	unsigned long nr_reclaimed = 0;
> > @@ -1580,6 +1587,14 @@ static unsigned long do_try_to_free_page
> >  		sc->nr_scanned = 0;
> >  		if (!priority)
> >  			disable_swap_token();
> > +		/*
> > +		 * Serialize aggressive reclaimers
> > +		 */
> > +		if (priority <= DEF_PRIORITY / 2 && !exclusive) {
> 
> On large machine, DEF_PRIORITY / 2 is really catastrophe situation.
> 2^6 = 64. 
> if zone has 64GB memory, it mean 1GB reclaim.
> I think more early restriction is better.

I am just afraid that it kills parallelism.

> > +			wait_event(reclaim_wait,
> > +				!atomic_cmpxchg(&reclaim_exclusive, 0, 1));
> > +			exclusive = 1;
> > +		}
> 
> if you want to restrict to one task, you can use mutex.
> and this wait_queue should put on global variable. it should be zone variable.

Hm, global or per-zone?  Rik suggested doing it per-node and I like
that idea.

> In addision, you don't consider recursive relaim and several task can't sleep there.
> 
> 
> please believe me. I have richest experience about reclaim throttling in the planet.

Hehe, okay.  Then I am glad you don't hate the idea completely.  Do
you have any patches flying around that do something similar?

	Hannes


* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
  2008-11-29 10:53       ` KOSAKI Motohiro
@ 2008-11-29 16:24         ` Rik van Riel
  2008-11-30  6:30           ` KOSAKI Motohiro
  2008-12-01 13:40           ` [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Christoph Lameter
  0 siblings, 2 replies; 23+ messages in thread
From: Rik van Riel @ 2008-11-29 16:24 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: akpm, linux-mm, linux-kernel, mel

KOSAKI Motohiro wrote:

> The result talk about three things.
> 
>   - rvr and mine patch increase direct reclaim imbalancing, indeed.
>   - However, background reclaim scanning is _very_ much than direct reclaim.
>     Then, direct reclaim imbalancing is ignorable on the big view.
>     rvr patch doesn't reintroduce zone imbalancing issue.
>   - rvr's priority==DEF_PRIORITY condition checking doesn't improve
>     zone balancing at all.
>     we can drop it.
> 
> Again, I believe my patch improve vm scanning totally.
> 
> Any comments?

Reclaiming is very easy when the workload is just page cache,
because the application will be throttled when too many page
cache pages are dirty.

When using mmap or memory hogs writing to swap, applications
will not be throttled by the "too many dirty pages" logic,
but may instead end up being throttled in the direct reclaim
path instead.

At that point direct reclaim may become a lot more common,
making the imbalance more significant.

I'll run a few tests.

> Andrew, I hope add this mesurement result to rvr bailing out patch description too.

So far the performance numbers you have measured are very
encouraging and do indeed suggest that the priority==DEF_PRIORITY
thing does not make a difference.

-- 
All rights reversed.


* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
  2008-11-29 16:24         ` Rik van Riel
@ 2008-11-30  6:30           ` KOSAKI Motohiro
  2008-12-03  5:26             ` [PATCH] vmscan: improve reclaim throuput to bail out patch KOSAKI Motohiro
  2008-12-01 13:40           ` [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Christoph Lameter
  1 sibling, 1 reply; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-30  6:30 UTC (permalink / raw)
  To: Rik van Riel; +Cc: kosaki.motohiro, akpm, linux-mm, linux-kernel, mel

> Reclaiming is very easy when the workload is just page cache,
> because the application will be throttled when too many page
> cache pages are dirty.
> 
> When using mmap or memory hogs writing to swap, applications
> will not be throttled by the "too many dirty pages" logic,
> but may instead end up being throttled in the direct reclaim
> path.
> 
> At that point direct reclaim may become a lot more common,
> making the imbalance more significant.

fair enough.


> I'll run a few tests.

Great.
I'm looking forward to your mail :)


> > Andrew, I hope this measurement result gets added to rvr's bail-out patch description too.
> 
> So far the performance numbers you have measured are very
> encouraging and do indeed suggest that the priority==DEF_PRIORITY
> thing does not make a difference.

thank you.

I believe reducing reclaim latency doesn't only improve HPC, but also
improves several multimedia and desktop applications.




^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
  2008-11-29 16:24         ` Rik van Riel
  2008-11-30  6:30           ` KOSAKI Motohiro
@ 2008-12-01 13:40           ` Christoph Lameter
  1 sibling, 0 replies; 23+ messages in thread
From: Christoph Lameter @ 2008-12-01 13:40 UTC (permalink / raw)
  To: Rik van Riel; +Cc: KOSAKI Motohiro, akpm, linux-mm, linux-kernel, mel

On Sat, 29 Nov 2008, Rik van Riel wrote:

> When using mmap or memory hogs writing to swap, applications
> will not be throttled by the "too many dirty pages" logic,
> but may instead end up being throttled in the direct reclaim
> path.

The too many dirty pages logic will throttle applications dirtying
mmapped pages these days.




^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] vmscan: improve reclaim throuput to bail out patch
  2008-11-30  6:30           ` KOSAKI Motohiro
@ 2008-12-03  5:26             ` KOSAKI Motohiro
  2008-12-03 13:46               ` Rik van Riel
  0 siblings, 1 reply; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-12-03  5:26 UTC (permalink / raw)
  To: Rik van Riel, akpm, linux-mm, linux-kernel, mel; +Cc: kosaki.motohiro

Hi

I have been evaluating rvr's bail-out and skip-freeing patches continuously this week.
I'd like to dump the first output here.



Rik, could you please review the following?
==
The vmscan bail-out patch moved the nr_reclaimed variable into struct scan_control.
Unfortunately, the indirect access can easily cause cache misses.
More unfortunately, some architectures (e.g. ia64) don't access global
variables so fast.

If memory pressure is heavy, that's OK: cache misses are already
plentiful, so it is not observable.

But if memory pressure is light, the performance regression is observable.


I compared the following three patterns (each was measured 10 times)

hackbench 125 process 3000
hackbench 130 process 3000
hackbench 135 process 3000

            2.6.28-rc6                       bail-out

	125	130	135		125	130	135  
      ==============================================================
	71.866	75.86	81.274		93.414	73.254	193.382
	74.145	78.295	77.27		74.897	75.021	80.17
	70.305	77.643	75.855		70.134	77.571	79.896
	74.288	73.986	75.955		77.222	78.48	80.619
	72.029	79.947	78.312		75.128	82.172	79.708
	71.499	77.615	77.042		74.177	76.532	77.306
	76.188	74.471	83.562		73.839	72.43	79.833
	73.236	75.606	78.743		76.001	76.557	82.726
	69.427	77.271	76.691		76.236	79.371	103.189
	72.473	76.978	80.643		69.128	78.932	75.736

avg	72.545	76.767	78.534		76.017	77.03	93.256
std	1.89	1.71	2.41		6.29	2.79	34.16
min	69.427	73.986	75.855		69.128	72.43	75.736
max	76.188	79.947	83.562		93.414	82.172	193.382


about a 4-5% regression.

This patch therefore introduces a temporary local variable.

result:

            2.6.28-rc6                       this patch

num	125	130	135		125	130	135  
      ==============================================================
	71.866	75.86	81.274		67.302	68.269	77.161
	74.145	78.295	77.27   	72.616	72.712	79.06
	70.305	77.643	75.855  	72.475	75.712	77.735
	74.288	73.986	75.955  	69.229	73.062	78.814
	72.029	79.947	78.312  	71.551	74.392	78.564
	71.499	77.615	77.042  	69.227	74.31	78.837
	76.188	74.471	83.562  	70.759	75.256	76.6
	73.236	75.606	78.743  	69.966	76.001	78.464
	69.427	77.271	76.691  	69.068	75.218	80.321
	72.473	76.978	80.643  	72.057	77.151	79.068
                        
avg	72.545	76.767	78.534 		70.425	74.2083	78.462
std 	1.89	1.71	2.41    	1.66	2.34	1.00
min 	69.427	73.986	75.855  	67.302	68.269	76.6
max 	76.188	79.947	83.562  	72.616	77.151	80.321


OK, the regression has disappeared.



Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
 mm/vmscan.c |   15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1418,6 +1418,8 @@ static void shrink_zone(int priority, st
 	unsigned long nr_to_scan;
 	unsigned long percent[2];	/* anon @ 0; file @ 1 */
 	enum lru_list l;
+	unsigned long nr_reclaimed = sc->nr_reclaimed;
+	unsigned long swap_cluster_max = sc->swap_cluster_max;
 
 	get_scan_ratio(zone, sc, percent);
 
@@ -1433,7 +1435,7 @@ static void shrink_zone(int priority, st
 			}
 			zone->lru[l].nr_scan += scan;
 			nr[l] = zone->lru[l].nr_scan;
-			if (nr[l] >= sc->swap_cluster_max)
+			if (nr[l] >= swap_cluster_max)
 				zone->lru[l].nr_scan = 0;
 			else
 				nr[l] = 0;
@@ -1452,12 +1454,11 @@ static void shrink_zone(int priority, st
 					nr[LRU_INACTIVE_FILE]) {
 		for_each_evictable_lru(l) {
 			if (nr[l]) {
-				nr_to_scan = min(nr[l],
-					(unsigned long)sc->swap_cluster_max);
+				nr_to_scan = min(nr[l], swap_cluster_max);
 				nr[l] -= nr_to_scan;
 
-				sc->nr_reclaimed += shrink_list(l, nr_to_scan,
-							zone, sc, priority);
+				nr_reclaimed += shrink_list(l, nr_to_scan,
+							    zone, sc, priority);
 			}
 		}
 		/*
@@ -1468,11 +1469,13 @@ static void shrink_zone(int priority, st
 		 * with multiple processes reclaiming pages, the total
 		 * freeing target can get unreasonably large.
 		 */
-		if (sc->nr_reclaimed > sc->swap_cluster_max &&
+		if (nr_reclaimed > swap_cluster_max &&
 		    priority < DEF_PRIORITY && !current_is_kswapd())
 			break;
 	}
 
+	sc->nr_reclaimed = nr_reclaimed;
+
 	/*
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] vmscan: improve reclaim throuput to bail out patch
  2008-12-03  5:26             ` [PATCH] vmscan: improve reclaim throuput to bail out patch KOSAKI Motohiro
@ 2008-12-03 13:46               ` Rik van Riel
  2008-12-03 15:12                 ` KOSAKI Motohiro
  0 siblings, 1 reply; 23+ messages in thread
From: Rik van Riel @ 2008-12-03 13:46 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: akpm, linux-mm, linux-kernel, mel

KOSAKI Motohiro wrote:
> Hi
> 
> I have been evaluating rvr's bail-out and skip-freeing patches continuously this week.
> I'd like to dump the first output here.
> 
> 
> 
> Rik, could you please review the following?
> ==
> The vmscan bail-out patch moved the nr_reclaimed variable into struct scan_control.
> Unfortunately, the indirect access can easily cause cache misses.
> More unfortunately, some architectures (e.g. ia64) don't access global
> variables so fast.

That is amazing.  Especially considering that the scan_control
is a local variable on the stack.

> If memory pressure is heavy, that's OK: cache misses are already
> plentiful, so it is not observable.
> 
> But if memory pressure is light, the performance regression is observable.

> about a 4-5% regression.
> 
> This patch therefore introduces a temporary local variable.

> OK, the regression has disappeared.

I can't argue with the numbers, though :)

Maybe all the scanning we do ends up evicting the cache lines
with the scan_control struct in it from the fast part of the
CPU cache?

> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>

Acked-by: Rik van Riel <riel@redhat.com>

-- 
All rights reversed.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] vmscan: improve reclaim throuput to bail out patch
  2008-12-03 13:46               ` Rik van Riel
@ 2008-12-03 15:12                 ` KOSAKI Motohiro
  2008-12-04  1:28                   ` [PATCH] vmscan: improve reclaim throuput to bail out patch take2 KOSAKI Motohiro
  0 siblings, 1 reply; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-12-03 15:12 UTC (permalink / raw)
  To: Rik van Riel; +Cc: akpm, linux-mm, linux-kernel, mel

>> I have been evaluating rvr's bail-out and skip-freeing patches continuously this week.
>> I'd like to dump the first output here.
>>
>>
>>
>> Rik, could you please review the following?
>> ==
>> The vmscan bail-out patch moved the nr_reclaimed variable into struct scan_control.
>> Unfortunately, the indirect access can easily cause cache misses.
>> More unfortunately, some architectures (e.g. ia64) don't access global
>> variables so fast.
>
> That is amazing.  Especially considering that the scan_control
> is a local variable on the stack.

Ahhhhh, I meant to write "indirect access (or the like of a global variable)",
but my brain slipped. sorry.

I'll post a fixed description soon. thanks.


>> If memory pressure is heavy, that's OK: cache misses are already
>> plentiful, so it is not observable.
>>
>> But if memory pressure is light, the performance regression is observable.
>
>> about a 4-5% regression.
>>
>> This patch therefore introduces a temporary local variable.
>
>> OK, the regression has disappeared.
>
> I can't argue with the numbers, though :)
>
> Maybe all the scanning we do ends up evicting the cache lines
> with the scan_control struct in it from the fast part of the
> CPU cache?

Yeah, I think so.


>
>> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
>
> Acked-by: Rik van Riel <riel@redhat.com>


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH] vmscan: improve reclaim throuput to bail out patch take2
  2008-12-03 15:12                 ` KOSAKI Motohiro
@ 2008-12-04  1:28                   ` KOSAKI Motohiro
  2008-12-04  4:20                     ` MinChan Kim
  2008-12-07  3:28                     ` Andrew Morton
  0 siblings, 2 replies; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-12-04  1:28 UTC (permalink / raw)
  To: Rik van Riel, akpm, linux-mm, linux-kernel, mel; +Cc: kosaki.motohiro

The vmscan bail-out patch moved the nr_reclaimed variable into struct scan_control.
Unfortunately, the indirect access can easily cause cache misses.

If memory pressure is heavy, that's OK: cache misses are already
plentiful, so it is not observable.

But if memory pressure is light, the performance regression is observable.


I compared the following three patterns (each was measured 10 times)

hackbench 125 process 3000
hackbench 130 process 3000
hackbench 135 process 3000

            2.6.28-rc6                       bail-out

	125	130	135		125	130	135  
      ==============================================================
	71.866	75.86	81.274		93.414	73.254	193.382
	74.145	78.295	77.27		74.897	75.021	80.17
	70.305	77.643	75.855		70.134	77.571	79.896
	74.288	73.986	75.955		77.222	78.48	80.619
	72.029	79.947	78.312		75.128	82.172	79.708
	71.499	77.615	77.042		74.177	76.532	77.306
	76.188	74.471	83.562		73.839	72.43	79.833
	73.236	75.606	78.743		76.001	76.557	82.726
	69.427	77.271	76.691		76.236	79.371	103.189
	72.473	76.978	80.643		69.128	78.932	75.736

avg	72.545	76.767	78.534		76.017	77.03	93.256
std	1.89	1.71	2.41		6.29	2.79	34.16
min	69.427	73.986	75.855		69.128	72.43	75.736
max	76.188	79.947	83.562		93.414	82.172	193.382


about a 4-5% regression.

This patch therefore introduces a temporary local variable.

result:

            2.6.28-rc6                       this patch

num	125	130	135		125	130	135  
      ==============================================================
	71.866	75.86	81.274		67.302	68.269	77.161
	74.145	78.295	77.27   	72.616	72.712	79.06
	70.305	77.643	75.855  	72.475	75.712	77.735
	74.288	73.986	75.955  	69.229	73.062	78.814
	72.029	79.947	78.312  	71.551	74.392	78.564
	71.499	77.615	77.042  	69.227	74.31	78.837
	76.188	74.471	83.562  	70.759	75.256	76.6
	73.236	75.606	78.743  	69.966	76.001	78.464
	69.427	77.271	76.691  	69.068	75.218	80.321
	72.473	76.978	80.643  	72.057	77.151	79.068
                        
avg	72.545	76.767	78.534 		70.425	74.2083	78.462
std 	1.89	1.71	2.41    	1.66	2.34	1.00
min 	69.427	73.986	75.855  	67.302	68.269	76.6
max 	76.188	79.947	83.562  	72.616	77.151	80.321


OK, the regression has disappeared.



Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Rik van Riel <riel@redhat.com>
---
 mm/vmscan.c |   15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1418,6 +1418,8 @@ static void shrink_zone(int priority, st
 	unsigned long nr_to_scan;
 	unsigned long percent[2];	/* anon @ 0; file @ 1 */
 	enum lru_list l;
+	unsigned long nr_reclaimed = sc->nr_reclaimed;
+	unsigned long swap_cluster_max = sc->swap_cluster_max;
 
 	get_scan_ratio(zone, sc, percent);
 
@@ -1433,7 +1435,7 @@ static void shrink_zone(int priority, st
 			}
 			zone->lru[l].nr_scan += scan;
 			nr[l] = zone->lru[l].nr_scan;
-			if (nr[l] >= sc->swap_cluster_max)
+			if (nr[l] >= swap_cluster_max)
 				zone->lru[l].nr_scan = 0;
 			else
 				nr[l] = 0;
@@ -1452,12 +1454,11 @@ static void shrink_zone(int priority, st
 					nr[LRU_INACTIVE_FILE]) {
 		for_each_evictable_lru(l) {
 			if (nr[l]) {
-				nr_to_scan = min(nr[l],
-					(unsigned long)sc->swap_cluster_max);
+				nr_to_scan = min(nr[l], swap_cluster_max);
 				nr[l] -= nr_to_scan;
 
-				sc->nr_reclaimed += shrink_list(l, nr_to_scan,
-							zone, sc, priority);
+				nr_reclaimed += shrink_list(l, nr_to_scan,
+							    zone, sc, priority);
 			}
 		}
 		/*
@@ -1468,11 +1469,13 @@ static void shrink_zone(int priority, st
 		 * with multiple processes reclaiming pages, the total
 		 * freeing target can get unreasonably large.
 		 */
-		if (sc->nr_reclaimed > sc->swap_cluster_max &&
+		if (nr_reclaimed > swap_cluster_max &&
 		    priority < DEF_PRIORITY && !current_is_kswapd())
 			break;
 	}
 
+	sc->nr_reclaimed = nr_reclaimed;
+
 	/*
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] vmscan: improve reclaim throuput to bail out patch take2
  2008-12-04  1:28                   ` [PATCH] vmscan: improve reclaim throuput to bail out patch take2 KOSAKI Motohiro
@ 2008-12-04  4:20                     ` MinChan Kim
  2008-12-04  5:04                       ` KOSAKI Motohiro
  2008-12-07  3:28                     ` Andrew Morton
  1 sibling, 1 reply; 23+ messages in thread
From: MinChan Kim @ 2008-12-04  4:20 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Rik van Riel, akpm, linux-mm, linux-kernel, mel

Hi, Kosaki-san.

It's a greater improvement from just one variable than I expected. :)
What is your test environment? (CPU, L1/L2 cache sizes, and so on)
Just out of curiosity.


On Thu, Dec 4, 2008 at 10:28 AM, KOSAKI Motohiro
<kosaki.motohiro@jp.fujitsu.com> wrote:
> The vmscan bail-out patch moved the nr_reclaimed variable into struct scan_control.
> Unfortunately, the indirect access can easily cause cache misses.
>
> If memory pressure is heavy, that's OK: cache misses are already
> plentiful, so it is not observable.
>
> But if memory pressure is light, the performance regression is observable.
>
>
> I compared the following three patterns (each was measured 10 times)
>
> hackbench 125 process 3000
> hackbench 130 process 3000
> hackbench 135 process 3000
>
>            2.6.28-rc6                       bail-out
>
>        125     130     135             125     130     135
>      ==============================================================
>        71.866  75.86   81.274          93.414  73.254  193.382
>        74.145  78.295  77.27           74.897  75.021  80.17
>        70.305  77.643  75.855          70.134  77.571  79.896
>        74.288  73.986  75.955          77.222  78.48   80.619
>        72.029  79.947  78.312          75.128  82.172  79.708
>        71.499  77.615  77.042          74.177  76.532  77.306
>        76.188  74.471  83.562          73.839  72.43   79.833
>        73.236  75.606  78.743          76.001  76.557  82.726
>        69.427  77.271  76.691          76.236  79.371  103.189
>        72.473  76.978  80.643          69.128  78.932  75.736
>
> avg     72.545  76.767  78.534          76.017  77.03   93.256
> std     1.89    1.71    2.41            6.29    2.79    34.16
> min     69.427  73.986  75.855          69.128  72.43   75.736
> max     76.188  79.947  83.562          93.414  82.172  193.382
>
>
> about a 4-5% regression.
>
> This patch therefore introduces a temporary local variable.
>
> result:
>
>            2.6.28-rc6                       this patch
>
> num     125     130     135             125     130     135
>      ==============================================================
>        71.866  75.86   81.274          67.302  68.269  77.161
>        74.145  78.295  77.27           72.616  72.712  79.06
>        70.305  77.643  75.855          72.475  75.712  77.735
>        74.288  73.986  75.955          69.229  73.062  78.814
>        72.029  79.947  78.312          71.551  74.392  78.564
>        71.499  77.615  77.042          69.227  74.31   78.837
>        76.188  74.471  83.562          70.759  75.256  76.6
>        73.236  75.606  78.743          69.966  76.001  78.464
>        69.427  77.271  76.691          69.068  75.218  80.321
>        72.473  76.978  80.643          72.057  77.151  79.068
>
> avg     72.545  76.767  78.534          70.425  74.2083 78.462
> std     1.89    1.71    2.41            1.66    2.34    1.00
> min     69.427  73.986  75.855          67.302  68.269  76.6
> max     76.188  79.947  83.562          72.616  77.151  80.321
>
>
> OK, the regression has disappeared.
>
>
>
> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> Acked-by: Rik van Riel <riel@redhat.com>
> ---
>  mm/vmscan.c |   15 +++++++++------
>  1 file changed, 9 insertions(+), 6 deletions(-)
>
> Index: b/mm/vmscan.c
> ===================================================================
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1418,6 +1418,8 @@ static void shrink_zone(int priority, st
>        unsigned long nr_to_scan;
>        unsigned long percent[2];       /* anon @ 0; file @ 1 */
>        enum lru_list l;
> +       unsigned long nr_reclaimed = sc->nr_reclaimed;
> +       unsigned long swap_cluster_max = sc->swap_cluster_max;
>
>        get_scan_ratio(zone, sc, percent);
>
> @@ -1433,7 +1435,7 @@ static void shrink_zone(int priority, st
>                        }
>                        zone->lru[l].nr_scan += scan;
>                        nr[l] = zone->lru[l].nr_scan;
> -                       if (nr[l] >= sc->swap_cluster_max)
> +                       if (nr[l] >= swap_cluster_max)
>                                zone->lru[l].nr_scan = 0;
>                        else
>                                nr[l] = 0;
> @@ -1452,12 +1454,11 @@ static void shrink_zone(int priority, st
>                                        nr[LRU_INACTIVE_FILE]) {
>                for_each_evictable_lru(l) {
>                        if (nr[l]) {
> -                               nr_to_scan = min(nr[l],
> -                                       (unsigned long)sc->swap_cluster_max);
> +                               nr_to_scan = min(nr[l], swap_cluster_max);
>                                nr[l] -= nr_to_scan;
>
> -                               sc->nr_reclaimed += shrink_list(l, nr_to_scan,
> -                                                       zone, sc, priority);
> +                               nr_reclaimed += shrink_list(l, nr_to_scan,
> +                                                           zone, sc, priority);
>                        }
>                }
>                /*
> @@ -1468,11 +1469,13 @@ static void shrink_zone(int priority, st
>                 * with multiple processes reclaiming pages, the total
>                 * freeing target can get unreasonably large.
>                 */
> -               if (sc->nr_reclaimed > sc->swap_cluster_max &&
> +               if (nr_reclaimed > swap_cluster_max &&
>                    priority < DEF_PRIORITY && !current_is_kswapd())
>                        break;
>        }
>
> +       sc->nr_reclaimed = nr_reclaimed;
> +
>        /*
>         * Even if we did not try to evict anon pages at all, we want to
>         * rebalance the anon lru active/inactive ratio.
>
>
>



-- 
Kind regards,
MinChan Kim


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] vmscan: improve reclaim throuput to bail out patch take2
  2008-12-04  4:20                     ` MinChan Kim
@ 2008-12-04  5:04                       ` KOSAKI Motohiro
  0 siblings, 0 replies; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-12-04  5:04 UTC (permalink / raw)
  To: MinChan Kim
  Cc: kosaki.motohiro, Rik van Riel, akpm, linux-mm, linux-kernel, mel

Hi

> Hi, Kosaki-san.
> 
> It's a great improvement with only one variable than I expected. :)
> What is your test environment ? (CPU, L1, L2 cache size and so )
> Just out of curiosity.

CPU: ia64x8
L1: 16KB
L2: 512KB
L3: 24MB





^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] vmscan: improve reclaim throuput to bail out patch take2
  2008-12-04  1:28                   ` [PATCH] vmscan: improve reclaim throuput to bail out patch take2 KOSAKI Motohiro
  2008-12-04  4:20                     ` MinChan Kim
@ 2008-12-07  3:28                     ` Andrew Morton
  2008-12-08  2:49                       ` KOSAKI Motohiro
  1 sibling, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2008-12-07  3:28 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Rik van Riel, linux-mm, linux-kernel, mel

On Thu,  4 Dec 2008 10:28:39 +0900 (JST) KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:

> The vmscan bail-out patch moved the nr_reclaimed variable into struct scan_control.
> Unfortunately, the indirect access can easily cause cache misses.
> 
> If memory pressure is heavy, that's OK: cache misses are already
> plentiful, so it is not observable.
> 
> But if memory pressure is light, the performance regression is observable.
> 
> 
> I compared the following three patterns (each was measured 10 times)
> 
> hackbench 125 process 3000
> hackbench 130 process 3000
> hackbench 135 process 3000
> 
>             2.6.28-rc6                       bail-out
> 
> 	125	130	135		125	130	135  
>       ==============================================================
> 	71.866	75.86	81.274		93.414	73.254	193.382
> 	74.145	78.295	77.27		74.897	75.021	80.17
> 	70.305	77.643	75.855		70.134	77.571	79.896
> 	74.288	73.986	75.955		77.222	78.48	80.619
> 	72.029	79.947	78.312		75.128	82.172	79.708
> 	71.499	77.615	77.042		74.177	76.532	77.306
> 	76.188	74.471	83.562		73.839	72.43	79.833
> 	73.236	75.606	78.743		76.001	76.557	82.726
> 	69.427	77.271	76.691		76.236	79.371	103.189
> 	72.473	76.978	80.643		69.128	78.932	75.736
> 
> avg	72.545	76.767	78.534		76.017	77.03	93.256
> std	1.89	1.71	2.41		6.29	2.79	34.16
> min	69.427	73.986	75.855		69.128	72.43	75.736
> max	76.188	79.947	83.562		93.414	82.172	193.382
> 
> 
> about a 4-5% regression.
> 
> This patch therefore introduces a temporary local variable.
> 
> result:
> 
>             2.6.28-rc6                       this patch
> 
> num	125	130	135		125	130	135  
>       ==============================================================
> 	71.866	75.86	81.274		67.302	68.269	77.161
> 	74.145	78.295	77.27   	72.616	72.712	79.06
> 	70.305	77.643	75.855  	72.475	75.712	77.735
> 	74.288	73.986	75.955  	69.229	73.062	78.814
> 	72.029	79.947	78.312  	71.551	74.392	78.564
> 	71.499	77.615	77.042  	69.227	74.31	78.837
> 	76.188	74.471	83.562  	70.759	75.256	76.6
> 	73.236	75.606	78.743  	69.966	76.001	78.464
> 	69.427	77.271	76.691  	69.068	75.218	80.321
> 	72.473	76.978	80.643  	72.057	77.151	79.068
>                         
> avg	72.545	76.767	78.534 		70.425	74.2083	78.462
> std 	1.89	1.71	2.41    	1.66	2.34	1.00
> min 	69.427	73.986	75.855  	67.302	68.269	76.6
> max 	76.188	79.947	83.562  	72.616	77.151	80.321
> 
> 
> OK, the regression has disappeared.
> 

Yes, this is a very surprising result.  Suspicious, in fact.

> 
> 
> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> Acked-by: Rik van Riel <riel@redhat.com>
> ---
>  mm/vmscan.c |   15 +++++++++------
>  1 file changed, 9 insertions(+), 6 deletions(-)
> 
> Index: b/mm/vmscan.c
> ===================================================================
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1418,6 +1418,8 @@ static void shrink_zone(int priority, st
>  	unsigned long nr_to_scan;
>  	unsigned long percent[2];	/* anon @ 0; file @ 1 */
>  	enum lru_list l;
> +	unsigned long nr_reclaimed = sc->nr_reclaimed;
> +	unsigned long swap_cluster_max = sc->swap_cluster_max;
>  
>  	get_scan_ratio(zone, sc, percent);
>  
> @@ -1433,7 +1435,7 @@ static void shrink_zone(int priority, st
>  			}
>  			zone->lru[l].nr_scan += scan;
>  			nr[l] = zone->lru[l].nr_scan;
> -			if (nr[l] >= sc->swap_cluster_max)
> +			if (nr[l] >= swap_cluster_max)
>  				zone->lru[l].nr_scan = 0;
>  			else
>  				nr[l] = 0;
> @@ -1452,12 +1454,11 @@ static void shrink_zone(int priority, st
>  					nr[LRU_INACTIVE_FILE]) {
>  		for_each_evictable_lru(l) {
>  			if (nr[l]) {
> -				nr_to_scan = min(nr[l],
> -					(unsigned long)sc->swap_cluster_max);
> +				nr_to_scan = min(nr[l], swap_cluster_max);
>  				nr[l] -= nr_to_scan;
>  
> -				sc->nr_reclaimed += shrink_list(l, nr_to_scan,
> -							zone, sc, priority);
> +				nr_reclaimed += shrink_list(l, nr_to_scan,
> +							    zone, sc, priority);
>  			}
>  		}
>  		/*
> @@ -1468,11 +1469,13 @@ static void shrink_zone(int priority, st
>  		 * with multiple processes reclaiming pages, the total
>  		 * freeing target can get unreasonably large.
>  		 */
> -		if (sc->nr_reclaimed > sc->swap_cluster_max &&
> +		if (nr_reclaimed > swap_cluster_max &&
>  		    priority < DEF_PRIORITY && !current_is_kswapd())
>  			break;
>  	}
>  
> +	sc->nr_reclaimed = nr_reclaimed;
> +
>  	/*
>  	 * Even if we did not try to evict anon pages at all, we want to
>  	 * rebalance the anon lru active/inactive ratio.

If this improved the throughput of direct-reclaim callers then one
would expect it to make larger improvements for kswapd (assuming 
that all other things are equal for those tasks, which they are not).

What is your direct-reclaim to kswapd-reclaim ratio for that workload?
(grep pgscan /proc/vmstat)

Does that patch make any change to the amount of CPU time which kswapd
consumed?


Or you can not bother doing this work ;) The patch looks sensible
anyway.  It's just that the numbers look whacky.


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH] vmscan: improve reclaim throuput to bail out patch take2
  2008-12-07  3:28                     ` Andrew Morton
@ 2008-12-08  2:49                       ` KOSAKI Motohiro
  0 siblings, 0 replies; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-12-08  2:49 UTC (permalink / raw)
  To: Andrew Morton; +Cc: kosaki.motohiro, Rik van Riel, linux-mm, linux-kernel, mel


I think my last explanation was too poor.


> If this improved the throughput of direct-reclaim callers then one
> would expect it to make larger improvements for kswapd (assuming 
> that all other things are equal for those tasks, which they are not).
> 
> What is your direct-reclaim to kswapd-reclaim ratio for that workload?
> (grep pgscan /proc/vmstat)
> 

Because that benchmark is a direct-reclaim torture workload.

The /proc/vmstat changes were:

<before>
pgscan_kswapd_dma 1152
pgscan_kswapd_normal 2400
pgscan_kswapd_movable 0
pgscan_direct_dma 32
pgscan_direct_normal 512
pgscan_direct_movable 0

<after>
pgscan_kswapd_dma 3520
pgscan_kswapd_normal 12160
pgscan_kswapd_movable 0
pgscan_direct_dma 10048
pgscan_direct_normal 31904
pgscan_direct_movable 0

	-> kswapd:direct = 1 : 3.4
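
(For reference, this ratio seems to come from the deltas between the two
snapshots above; the derivation is a reading of the numbers, not stated
explicitly:

	kswapd delta: (3520 - 1152) + (12160 - 2400) = 12128 pages scanned
	direct delta: (10048 - 32)  + (31904 - 512)  = 41408 pages scanned

	41408 / 12128 ~= 3.4, i.e. kswapd : direct ~= 1 : 3.4)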


Why do I test a non-typical, extreme workload?
I have two reasons.

1. nobody wants a regression, even if the workload isn't typical.

2. if the patch keeps performance even in the extreme case,
   it will of course also work well on a lightweight workload.


If my patch caused any regression it would definitely be valueless,
since it only addresses the extreme case.
But I don't think it does.


> Does that patch make any change to the amount of CPU time which kswapd
> consumed?

I haven't measured it yet.
But at least the top command didn't show any increase in CPU consumption.


> 
> Or you can not bother doing this work ;) The patch looks sensible
> anyway.  It's just that the numbers look whacky.




^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2008-12-08  2:49 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-11-24 19:50 [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Rik van Riel
2008-11-24 20:53 ` Andrew Morton
2008-11-25 11:35 ` KOSAKI Motohiro
2008-11-25 13:32   ` Rik van Riel
2008-11-25 14:30     ` KOSAKI Motohiro
2008-11-28  7:02   ` KOSAKI Motohiro
2008-11-28 11:03     ` Rik van Riel
2008-11-29 10:53       ` KOSAKI Motohiro
2008-11-29 16:24         ` Rik van Riel
2008-11-30  6:30           ` KOSAKI Motohiro
2008-12-03  5:26             ` [PATCH] vmscan: improve reclaim throuput to bail out patch KOSAKI Motohiro
2008-12-03 13:46               ` Rik van Riel
2008-12-03 15:12                 ` KOSAKI Motohiro
2008-12-04  1:28                   ` [PATCH] vmscan: improve reclaim throuput to bail out patch take2 KOSAKI Motohiro
2008-12-04  4:20                     ` MinChan Kim
2008-12-04  5:04                       ` KOSAKI Motohiro
2008-12-07  3:28                     ` Andrew Morton
2008-12-08  2:49                       ` KOSAKI Motohiro
2008-12-01 13:40           ` [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Christoph Lameter
2008-11-26  2:24 ` KOSAKI Motohiro
2008-11-27 17:36 ` [rfc] vmscan: serialize aggressive reclaimers Johannes Weiner
2008-11-29  7:46   ` KOSAKI Motohiro
2008-11-29 15:39     ` Johannes Weiner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox