* [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
@ 2008-11-24 19:50 Rik van Riel
2008-11-24 20:53 ` Andrew Morton
` (3 more replies)
0 siblings, 4 replies; 23+ messages in thread
From: Rik van Riel @ 2008-11-24 19:50 UTC (permalink / raw)
To: linux-mm; +Cc: linux-kernel, mel, KOSAKI Motohiro, akpm
Sometimes the VM spends the first few priority rounds rotating back
referenced pages and submitting IO. Once we get to a lower priority,
sometimes the VM ends up freeing way too many pages.
The fix is relatively simple: in shrink_zone() we can check how many
pages we have already freed; direct reclaim tasks break out of the
scanning loop if they have already freed enough pages and have reached
a lower priority level.
However, in order to do this we do need to know how many pages we already
freed, so move nr_reclaimed into scan_control.
Signed-off-by: Rik van Riel <riel@redhat.com>
---
Kosaki, this should address the zone scanning pressure issue.
Nick, this includes the cleanups suggested by you.
mm/vmscan.c | 62 +++++++++++++++++++++++++++++++-----------------------------
1 file changed, 33 insertions(+), 29 deletions(-)
Index: linux-2.6.28-rc5/mm/vmscan.c
===================================================================
--- linux-2.6.28-rc5.orig/mm/vmscan.c 2008-11-17 15:22:22.000000000 -0500
+++ linux-2.6.28-rc5/mm/vmscan.c 2008-11-24 14:47:17.000000000 -0500
@@ -52,6 +52,9 @@ struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
+ /* Number of pages freed so far during a call to shrink_zones() */
+ unsigned long nr_reclaimed;
+
/* This context's GFP mask */
gfp_t gfp_mask;
@@ -1405,12 +1408,11 @@ static void get_scan_ratio(struct zone *
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static unsigned long shrink_zone(int priority, struct zone *zone,
+static void shrink_zone(int priority, struct zone *zone,
struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
- unsigned long nr_reclaimed = 0;
unsigned long percent[2]; /* anon @ 0; file @ 1 */
enum lru_list l;
@@ -1451,10 +1453,21 @@ static unsigned long shrink_zone(int pri
(unsigned long)sc->swap_cluster_max);
nr[l] -= nr_to_scan;
- nr_reclaimed += shrink_list(l, nr_to_scan,
+ sc->nr_reclaimed += shrink_list(l, nr_to_scan,
zone, sc, priority);
}
}
+ /*
+ * On large memory systems, scan >> priority can become
+ * really large. This is fine for the starting priority;
+ * we want to put equal scanning pressure on each zone.
+ * However, if the VM has a harder time of freeing pages,
+ * with multiple processes reclaiming pages, the total
+ * freeing target can get unreasonably large.
+ */
+ if (sc->nr_reclaimed > sc->swap_cluster_max &&
+ sc->priority < DEF_PRIORITY && !current_is_kswapd())
+ break;
}
/*
@@ -1467,7 +1480,6 @@ static unsigned long shrink_zone(int pri
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
throttle_vm_writeout(sc->gfp_mask);
- return nr_reclaimed;
}
/*
@@ -1481,16 +1493,13 @@ static unsigned long shrink_zone(int pri
* b) The zones may be over pages_high but they must go *over* pages_high to
* satisfy the `incremental min' zone defense algorithm.
*
- * Returns the number of reclaimed pages.
- *
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
- unsigned long nr_reclaimed = 0;
struct zoneref *z;
struct zone *zone;
@@ -1521,10 +1530,8 @@ static unsigned long shrink_zones(int pr
priority);
}
- nr_reclaimed += shrink_zone(priority, zone, sc);
+ shrink_zone(priority, zone, sc);
}
-
- return nr_reclaimed;
}
/*
@@ -1549,7 +1556,6 @@ static unsigned long do_try_to_free_page
int priority;
unsigned long ret = 0;
unsigned long total_scanned = 0;
- unsigned long nr_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long lru_pages = 0;
struct zoneref *z;
@@ -1577,7 +1583,7 @@ static unsigned long do_try_to_free_page
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
- nr_reclaimed += shrink_zones(priority, zonelist, sc);
+ shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -1585,13 +1591,13 @@ static unsigned long do_try_to_free_page
if (scan_global_lru(sc)) {
shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
if (reclaim_state) {
- nr_reclaimed += reclaim_state->reclaimed_slab;
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
}
total_scanned += sc->nr_scanned;
- if (nr_reclaimed >= sc->swap_cluster_max) {
- ret = nr_reclaimed;
+ if (sc->nr_reclaimed >= sc->swap_cluster_max) {
+ ret = sc->nr_reclaimed;
goto out;
}
@@ -1614,7 +1620,7 @@ static unsigned long do_try_to_free_page
}
/* top priority shrink_zones still had more to do? don't OOM, then */
if (!sc->all_unreclaimable && scan_global_lru(sc))
- ret = nr_reclaimed;
+ ret = sc->nr_reclaimed;
out:
/*
* Now that we've scanned all the zones at this priority level, note
@@ -1709,7 +1715,6 @@ static unsigned long balance_pgdat(pg_da
int priority;
int i;
unsigned long total_scanned;
- unsigned long nr_reclaimed;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
@@ -1728,7 +1733,7 @@ static unsigned long balance_pgdat(pg_da
loop_again:
total_scanned = 0;
- nr_reclaimed = 0;
+ sc.nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);
@@ -1814,11 +1819,11 @@ loop_again:
*/
if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
end_zone, 0))
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
- nr_reclaimed += reclaim_state->reclaimed_slab;
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
if (zone_is_all_unreclaimable(zone))
continue;
@@ -1832,7 +1837,7 @@ loop_again:
* even in laptop mode
*/
if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
- total_scanned > nr_reclaimed + nr_reclaimed / 2)
+ total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
}
if (all_zones_ok)
@@ -1850,7 +1855,7 @@ loop_again:
* matches the direct reclaim path behaviour in terms of impact
* on zone->*_priority.
*/
- if (nr_reclaimed >= SWAP_CLUSTER_MAX)
+ if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
break;
}
out:
@@ -1872,7 +1877,7 @@ out:
goto loop_again;
}
- return nr_reclaimed;
+ return sc.nr_reclaimed;
}
/*
@@ -2224,7 +2229,6 @@ static int __zone_reclaim(struct zone *z
struct task_struct *p = current;
struct reclaim_state reclaim_state;
int priority;
- unsigned long nr_reclaimed = 0;
struct scan_control sc = {
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2257,9 +2261,9 @@ static int __zone_reclaim(struct zone *z
priority = ZONE_RECLAIM_PRIORITY;
do {
note_zone_scanning_priority(zone, priority);
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc);
priority--;
- } while (priority >= 0 && nr_reclaimed < nr_pages);
+ } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
}
slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2283,13 +2287,13 @@ static int __zone_reclaim(struct zone *z
* Update nr_reclaimed by the number of slab pages we
* reclaimed from this zone.
*/
- nr_reclaimed += slab_reclaimable -
+ sc.nr_reclaimed += slab_reclaimable -
zone_page_state(zone, NR_SLAB_RECLAIMABLE);
}
p->reclaim_state = NULL;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
- return nr_reclaimed >= nr_pages;
+ return sc.nr_reclaimed >= nr_pages;
}
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
--
* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
2008-11-24 19:50 [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Rik van Riel
@ 2008-11-24 20:53 ` Andrew Morton
2008-11-25 11:35 ` KOSAKI Motohiro
` (2 subsequent siblings)
3 siblings, 0 replies; 23+ messages in thread
From: Andrew Morton @ 2008-11-24 20:53 UTC (permalink / raw)
To: Rik van Riel; +Cc: linux-mm, linux-kernel, mel, kosaki.motohiro
On Mon, 24 Nov 2008 14:50:57 -0500
Rik van Riel <riel@redhat.com> wrote:
> Sometimes the VM spends the first few priority rounds rotating back
> referenced pages and submitting IO. Once we get to a lower priority,
> sometimes the VM ends up freeing way too many pages.
It would help (a lot) if we had a much more specific and detailed
description of the problem which is being fixed. Nobody has noticed it
in half a decade, so it can't be very serious?
> The fix is relatively simple: in shrink_zone() we can check how many
> pages we have already freed, direct reclaim tasks break out of the
> scanning loop if they have already freed enough pages and have reached
> a lower priority level.
So in the common scenario where there's a lot of dirty highmem and
little dirty lowmem, the kernel will start reclaiming highmem at a
vastly higher rate than lowmem. iirc, this was the reason why this
change was tried then reverted.
Please demonstrate that this regression is not worse than the problem
which is being fixed!
> However, in order to do this we do need to know how many pages we already
> freed, so move nr_reclaimed into scan_control.
Thus carrying the state across the *entire* scanning pass: all zones.
So as soon as sc.nr_reclaimed exceeds swap_cluster_max, the scanner
will fall into a different mode for the remaining zones wherein it will
scan only swap_cluster_max pages from them, then will bale.
This will heavily bias scanning onto the zones at the start of the zone
list. In fact it probably means that the zone at the head of the
zonelist gets thrashed and the remaining zones will just sit there
doing almost nothing. Where's the sense in that?
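To illustrate with made-up numbers: say swap_cluster_max is 32 and the
zonelist is (HighMem, Normal, DMA). If the HighMem pass alone reclaims more
than 32 pages, then at any priority below DEF_PRIORITY a direct reclaimer's
Normal and DMA passes will each do at most one 32-page batch per LRU list
before hitting the new break, no matter how much stale cache those zones hold.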
Has any testing been done to demonstrate and quantify this effect?
> Signed-off-by: Rik van Riel <riel@redhat.com>
> ---
> Kosaki, this should address the zone scanning pressure issue.
What is the "zone scanning pressure issue"?
Please don't put "should" in a vmscan changelog :( Either it does, or
it does not?
This should look familiar:
commit e468e46a9bea3297011d5918663ce6d19094cf87
Author: akpm <akpm>
Date: Thu Jun 24 15:53:52 2004 +0000
[PATCH] vmscan.c: dont reclaim too many pages
The shrink_zone() logic can, under some circumstances, cause far too many
pages to be reclaimed. Say, we're scanning at high priority and suddenly hit
a large number of reclaimable pages on the LRU.
Change things so we bale out when SWAP_CLUSTER_MAX pages have been reclaimed.
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
BKrev: 40daf910sac4yN_aUhhJF2U8Upx1ww
And here is where it was reverted. Note that this was nearly two years
later! It takes that long for these things to be discovered, analysed
and fixed.
commit 210fe530305ee50cd889fe9250168228b2994f32
Author: Andrew Morton <akpm@osdl.org>
Date: Fri Jan 6 00:11:14 2006 -0800
[PATCH] vmscan: balancing fix
Revert a patch which went into 2.6.8-rc1. The changelog for that patch was:
The shrink_zone() logic can, under some circumstances, cause far too many
pages to be reclaimed. Say, we're scanning at high priority and suddenly
hit a large number of reclaimable pages on the LRU.
Change things so we bale out when SWAP_CLUSTER_MAX pages have been
reclaimed.
Problem is, this change caused significant imbalance in inter-zone scan
balancing by truncating scans of larger zones.
Suppose, for example, ZONE_HIGHMEM is 10x the size of ZONE_NORMAL. The zone
balancing algorithm would require that if we're scanning 100 pages of
ZONE_HIGHMEM, we should scan 10 pages of ZONE_NORMAL. But this logic will
cause the scanning of ZONE_HIGHMEM to bale out after only 32 pages are
reclaimed. Thus effectively causing smaller zones to be scanned relatively
harder than large ones.
Now I need to remember what the workload was which caused me to write this
patch originally, then fix it up in a different way...
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
--
* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
2008-11-24 19:50 [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Rik van Riel
2008-11-24 20:53 ` Andrew Morton
@ 2008-11-25 11:35 ` KOSAKI Motohiro
2008-11-25 13:32 ` Rik van Riel
2008-11-28 7:02 ` KOSAKI Motohiro
2008-11-26 2:24 ` KOSAKI Motohiro
2008-11-27 17:36 ` [rfc] vmscan: serialize aggressive reclaimers Johannes Weiner
3 siblings, 2 replies; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-25 11:35 UTC (permalink / raw)
To: Rik van Riel; +Cc: kosaki.motohiro, linux-mm, linux-kernel, mel, akpm
> Sometimes the VM spends the first few priority rounds rotating back
> referenced pages and submitting IO. Once we get to a lower priority,
> sometimes the VM ends up freeing way too many pages.
>
> The fix is relatively simple: in shrink_zone() we can check how many
> pages we have already freed, direct reclaim tasks break out of the
> scanning loop if they have already freed enough pages and have reached
> a lower priority level.
>
> However, in order to do this we do need to know how many pages we already
> freed, so move nr_reclaimed into scan_control.
>
> Signed-off-by: Rik van Riel <riel@redhat.com>
> ---
> Kosaki, this should address the zone scanning pressure issue.
Hmmm. I still don't like the behavior when priority==DEF_PRIORITY,
but I should also explain that with code and a benchmark.
Therefore, I'll try to measure this patch this week.
Thanks.
--
* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
2008-11-25 11:35 ` KOSAKI Motohiro
@ 2008-11-25 13:32 ` Rik van Riel
2008-11-25 14:30 ` KOSAKI Motohiro
2008-11-28 7:02 ` KOSAKI Motohiro
1 sibling, 1 reply; 23+ messages in thread
From: Rik van Riel @ 2008-11-25 13:32 UTC (permalink / raw)
To: KOSAKI Motohiro; +Cc: linux-mm, linux-kernel, mel, akpm
KOSAKI Motohiro wrote:
>> Sometimes the VM spends the first few priority rounds rotating back
>> referenced pages and submitting IO. Once we get to a lower priority,
>> sometimes the VM ends up freeing way too many pages.
>>
>> The fix is relatively simple: in shrink_zone() we can check how many
>> pages we have already freed, direct reclaim tasks break out of the
>> scanning loop if they have already freed enough pages and have reached
>> a lower priority level.
>>
>> However, in order to do this we do need to know how many pages we already
>> freed, so move nr_reclaimed into scan_control.
>>
>> Signed-off-by: Rik van Riel <riel@redhat.com>
>> ---
>> Kosaki, this should address the zone scanning pressure issue.
>
> hmmmm. I still don't like the behavior when priority==DEF_PRIORITY.
> but I also should explain by code and benchmark.
Well, the behaviour when priority==DEF_PRIORITY is the
same as the kernel's behaviour without the patch...
> therefore, I'll try to mesure this patch in this week.
Looking forward to it.
--
All rights reversed.
--
* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
2008-11-25 13:32 ` Rik van Riel
@ 2008-11-25 14:30 ` KOSAKI Motohiro
0 siblings, 0 replies; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-25 14:30 UTC (permalink / raw)
To: Rik van Riel; +Cc: linux-mm, linux-kernel, mel, akpm
2008/11/25 Rik van Riel <riel@redhat.com>:
> KOSAKI Motohiro wrote:
>>>
>>> Sometimes the VM spends the first few priority rounds rotating back
>>> referenced pages and submitting IO. Once we get to a lower priority,
>>> sometimes the VM ends up freeing way too many pages.
>>>
>>> The fix is relatively simple: in shrink_zone() we can check how many
>>> pages we have already freed, direct reclaim tasks break out of the
>>> scanning loop if they have already freed enough pages and have reached
>>> a lower priority level.
>>>
>>> However, in order to do this we do need to know how many pages we already
>>> freed, so move nr_reclaimed into scan_control.
>>>
>>> Signed-off-by: Rik van Riel <riel@redhat.com>
>>> ---
>>> Kosaki, this should address the zone scanning pressure issue.
>>
>> hmmmm. I still don't like the behavior when priority==DEF_PRIORITY.
>> but I also should explain by code and benchmark.
>
> Well, the behaviour when priority==DEF_PRIORITY is the
> same as the kernel's behaviour without the patch...
Yes, but I think it decreases this patch's value...
>> therefore, I'll try to mesure this patch in this week.
>
> Looking forward to it.
thank you.
--
* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
2008-11-24 19:50 [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Rik van Riel
2008-11-24 20:53 ` Andrew Morton
2008-11-25 11:35 ` KOSAKI Motohiro
@ 2008-11-26 2:24 ` KOSAKI Motohiro
2008-11-27 17:36 ` [rfc] vmscan: serialize aggressive reclaimers Johannes Weiner
3 siblings, 0 replies; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-26 2:24 UTC (permalink / raw)
To: Rik van Riel; +Cc: kosaki.motohiro, linux-mm, linux-kernel, mel, akpm
> + /*
> + * On large memory systems, scan >> priority can become
> + * really large. This is fine for the starting priority;
> + * we want to put equal scanning pressure on each zone.
> + * However, if the VM has a harder time of freeing pages,
> + * with multiple processes reclaiming pages, the total
> + * freeing target can get unreasonably large.
> + */
> + if (sc->nr_reclaimed > sc->swap_cluster_max &&
> + sc->priority < DEF_PRIORITY && !current_is_kswapd())
> + break;
Typo.
This patch doesn't compile.
---
mm/vmscan.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1469,7 +1469,7 @@ static void shrink_zone(int priority, st
* freeing target can get unreasonably large.
*/
if (sc->nr_reclaimed > sc->swap_cluster_max &&
- sc->priority < DEF_PRIORITY && !current_is_kswapd())
+ priority < DEF_PRIORITY && !current_is_kswapd())
break;
}
--
* [rfc] vmscan: serialize aggressive reclaimers
2008-11-24 19:50 [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Rik van Riel
` (2 preceding siblings ...)
2008-11-26 2:24 ` KOSAKI Motohiro
@ 2008-11-27 17:36 ` Johannes Weiner
2008-11-29 7:46 ` KOSAKI Motohiro
3 siblings, 1 reply; 23+ messages in thread
From: Johannes Weiner @ 2008-11-27 17:36 UTC (permalink / raw)
To: Rik van Riel; +Cc: linux-mm, linux-kernel, mel, KOSAKI Motohiro, akpm
Since we have to pull through a reclaim cycle once we have committed to it,
what do you think about serializing the lower priority levels
completely?
The idea is that when one reclaimer has done a low priority level
iteration with a huge reclaim target, chances are that succeeding
reclaimers don't even need to drop to lower levels at all because
enough memory has already been freed.
My testprogram maps and faults in a file that is about as large as my
physical memory. Then it spawns off n processes that each try to allocate
1/(2n) of total memory in anon pages, i.e. half of it in sum. After it
ran, I check how much memory has been reclaimed. But my zone sizes
are too small to induce enormous reclaim targets so I don't see vast
over-reclaims.
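Roughly, the test does something like this (a simplified sketch rather
than the exact program I ran; "testfile" has to be pre-created at about
the size of RAM, and error handling is mostly omitted):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/wait.h>

int main(int argc, char **argv)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	unsigned long total = (unsigned long)sysconf(_SC_PHYS_PAGES) * pagesz;
	int n = argc > 1 ? atoi(argv[1]) : 4;
	unsigned long off, sum = 0;
	char *map;
	int fd, i;

	/* Fill the page cache with roughly one RAM's worth of file pages. */
	fd = open("testfile", O_RDONLY);
	if (fd < 0 || (map = mmap(NULL, total, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED)
		return 1;
	for (off = 0; off < total; off += pagesz)
		sum += map[off];		/* fault every page in */

	/* n children allocate 1/(2n) of RAM each in anon pages. */
	for (i = 0; i < n; i++) {
		if (fork() == 0) {
			unsigned long sz = total / (2 * n);
			char *anon = malloc(sz);

			if (anon)
				memset(anon, 1, sz);	/* force real pages */
			sleep(10);			/* hold on to them */
			_exit(0);
		}
	}
	while (wait(NULL) > 0)
		;
	printf("%lu\n", sum);	/* keep the read loop from being optimized away */
	return 0;
}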
I have measured the time of other tests on an SMP machine with 4 cores
and the following patch applied. I couldn't see any performance
degradation. But since the bug is not triggerable here, I can not
prove it helps the original problem, either.
The level where it starts serializing is chosen pretty arbitrarily.
Suggestions welcome :)
Hannes
---
Prevent over-reclaiming by serializing direct reclaimers below a
certain priority level.
Over-reclaiming happens when the sum of the reclaim targets of all
reclaiming processes is larger than the sum of the needed free pages,
thus leading to excessive eviction of more cache and anonymous pages
than required.
A scan iteration over all zones can not be aborted intermittently when
enough pages are reclaimed because that would mess up the scan balance
between the zones. Instead, prevent too many processes from
simultaneously committing themselves to lower priority level scans in the
first place.
Chances are that after the exclusive reclaimer has finished, enough
memory has been freed that succeeding scanners don't need to drop to
lower priority levels at all anymore.
Signed-off-by: Johannes Weiner <hannes@saeurebad.de>
---
mm/vmscan.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -35,6 +35,7 @@
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
+#include <linux/wait.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
@@ -42,6 +43,7 @@
#include <linux/sysctl.h>
#include <asm/tlbflush.h>
+#include <asm/atomic.h>
#include <asm/div64.h>
#include <linux/swapops.h>
@@ -1546,10 +1548,15 @@ static unsigned long shrink_zones(int pr
* returns: 0, if no pages reclaimed
* else, the number of pages reclaimed
*/
+
+static DECLARE_WAIT_QUEUE_HEAD(reclaim_wait);
+static atomic_t reclaim_exclusive = ATOMIC_INIT(0);
+
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int priority;
+ int exclusive = 0;
unsigned long ret = 0;
unsigned long total_scanned = 0;
unsigned long nr_reclaimed = 0;
@@ -1580,6 +1587,14 @@ static unsigned long do_try_to_free_page
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
+ /*
+ * Serialize aggressive reclaimers
+ */
+ if (priority <= DEF_PRIORITY / 2 && !exclusive) {
+ wait_event(reclaim_wait,
+ !atomic_cmpxchg(&reclaim_exclusive, 0, 1));
+ exclusive = 1;
+ }
nr_reclaimed += shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
@@ -1629,6 +1644,11 @@ out:
if (priority < 0)
priority = 0;
+ if (exclusive) {
+ atomic_set(&reclaim_exclusive, 0);
+ wake_up(&reclaim_wait);
+ }
+
if (scan_global_lru(sc)) {
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
--
* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
2008-11-25 11:35 ` KOSAKI Motohiro
2008-11-25 13:32 ` Rik van Riel
@ 2008-11-28 7:02 ` KOSAKI Motohiro
2008-11-28 11:03 ` Rik van Riel
1 sibling, 1 reply; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-28 7:02 UTC (permalink / raw)
To: Rik van Riel; +Cc: kosaki.motohiro, linux-mm, linux-kernel, mel, akpm
Hi,
I measured some data this week and got some interesting results.
> > Kosaki, this should address the zone scanning pressure issue.
>
> hmmmm. I still don't like the behavior when priority==DEF_PRIORITY.
> but I also should explain by code and benchmark.
>
> therefore, I'll try to mesure this patch in this week.
1. Many processes reclaiming at the same time
I measured the following benchmark ten times.
$ hackbench 140 process 300
rc6+stream: 2.6.28-rc6 +
vmscan-evict-streaming-io-first.patch (in -mm)
rvr: above + Rik's bailing out patch
kosaki: above + my modification (attached at the end of this mail)
result (unit: second)
rc6+stream rvr +kosaki patch
-----------------------------------------------------------
175.457 62.514 104.87
168.409 225.698 133.128
154.658 114.694 194.867
46.148 179.108 11.82
289.575 111.08 60.779
146.871 189.796 86.515
305.036 114.124 54.009
225.21 112.999 273.841
224.674 227.842 166.547
118.071 81.869 84.431
------------------------------------------
avg 185.4109 141.9724 117.0807
std 74.18484 55.93676126 73.28439987
min 46.148 62.514 11.82
max 305.036 227.842 273.841
OK.
Rik's patch improves things by about 30%, and my patch improves them by a further 20%.
In total, we got about a 50% improvement.
2. "communicate each other application" conflict the other
console A
$ dbench 100
console B
$ hackbench 130 process 300
hackbench result (unit: second)
rc6+stream rvr +kosaki
====================================================
588.74 57.084 139.448
569.876 325.063 52.233
427.078 295.259 53.492
65.264 132.873 59.009
136.636 136.367 319.115
221.538 76.352 187.937
244.881 125.774 158.846
37.523 115.77 122.17
182.485 382.376 105.195
273.983 299.577 130.478
----------------------------------------
avg 274.8004 194.6495 132.7923
std 184.4902365 111.5699478 75.88299814
min 37.523 57.084 52.233
max 588.74 382.376 319.115
That's more interesting.
-rc6 wins on the min score, but it also has the worst max score. Why?
Traditional reclaim assumes the following two cases are equivalent:
case (1)
- task (a) spent 1 sec for reclaim.
- task (b) spent 1 sec for reclaim.
- task (c) spent 1 sec for reclaim.
case (2)
- task (a) spent 3 sec for reclaim.
- task (b) spent 0 sec for reclaim.
- task (c) spent 0 sec for reclaim.
However, when these tasks communicate with each other, that assumption isn't correct.
If task (2)-(a) is a dbench process, OK, you are lucky.
dbench processes don't communicate with each other, so their performance is
determined by the average performance;
one slow process doesn't slow down the whole system.
Then both hackbench and dbench get good results.
On the other hand, if task (2)-(a) is a hackbench process, you are unlucky.
hackbench processes communicate with each other,
so the performance is determined by the slowest process.
Then hackbench performance decreases dramatically although dbench
performance barely increases.
Therefore, I think case (1) is better.
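To put made-up numbers on it: the job completion time is governed by the
slowest task, so case (1) adds max(1, 1, 1) = 1 sec to the job while case (2)
adds max(3, 0, 0) = 3 sec, even though the total reclaim time is 3 sec in
both cases (ignoring any serialization between the stalls).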
So, Rik's patch and my patch improve completely different aspects of reclaim.
In general, kernel reclaim has several key goals:
(1) if the system has droppable cache, it shouldn't OOM kill.
(2) if the system has available swap space, it should avoid OOM kills
as far as possible.
(3) if the system has enough free memory, it shouldn't reclaim any pages
at all.
(4) if memory pressure is light, the system shouldn't impose heavy reclaim
latency on applications.
Rik's patch improves (3); my modification (in this mail) improves (4).
BTW, a reclaim throttle patch, like Hannes's "vmscan: serialize aggressive
reclaimers" patch, would be yet another improvement.
Actually, Rik's patch improves (3), but it isn't perfect:
if 10000 threads call reclaim at the same time, the system reclaims
32*10000 pages, which is definitely too much.
But that is obviously off topic :)
Rik, could you please merge my modification into your patch?
======
In the past, HPC folks have asked several times to improve reclaim latency
under light pressure
(e.g. http://marc.info/?l=linux-mm&m=121418258514542&w=2)
because their workloads have the following characteristics:
- the workload creates many processes or threads.
- typically, write() is called at the end of a job,
so much of the file cache isn't reused.
- in a parallel job, the processes communicate with each other,
so system performance is determined by the slowest thread,
and a large reclaim latency directly decreases system performance.
Actually, kswapd background reclaim and direct reclaim have completely
different purposes and goals.
background reclaim
- kswapd doesn't need to reduce latency overhead;
it isn't observed by the end user.
- kswapd shouldn't only increase free pages, it should also
keep the zones balanced.
foreground reclaim
- it runs in the application's task context.
- as far as possible, it shouldn't add latency overhead.
- its purpose is to make memory available for its _own_ task;
it doesn't need to worry about other tasks' memory.
kswapd does that.
Most developers didn't pay attention to HPC in the past.
However, these days the number of CPUs is increasing rapidly, and
parallel processing techniques are becoming common
(e.g. gcc now supports OpenMP directly).
Therefore, we shouldn't ignore parallel applications nowadays.
o remove the priority==DEF_PRIORITY condition
o shrink_zones() should also have the bail-out feature.
---
mm/vmscan.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1469,7 +1469,7 @@ static void shrink_zone(int priority, st
* freeing target can get unreasonably large.
*/
if (sc->nr_reclaimed > sc->swap_cluster_max &&
- priority < DEF_PRIORITY && !current_is_kswapd())
+ !current_is_kswapd())
break;
}
@@ -1534,6 +1534,8 @@ static void shrink_zones(int priority, s
}
shrink_zone(priority, zone, sc);
+ if (sc->nr_reclaimed > sc->swap_cluster_max)
+ break;
}
}
--
* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
2008-11-28 7:02 ` KOSAKI Motohiro
@ 2008-11-28 11:03 ` Rik van Riel
2008-11-29 10:53 ` KOSAKI Motohiro
0 siblings, 1 reply; 23+ messages in thread
From: Rik van Riel @ 2008-11-28 11:03 UTC (permalink / raw)
To: KOSAKI Motohiro; +Cc: linux-mm, linux-kernel, mel, akpm
KOSAKI Motohiro wrote:
> Hi
>
> I mesured some data in this week and I got some interesting data.
> Rik patch improve about 30% and my patch improve 20% more.
> totally, We got about 50% improvement.
Very interesting indeed! I did not know there was this easy
a reproducer of the problem that my patch is trying to solve.
> rc6+stream rvr +kosaki
> ----------------------------------------
> avg 274.8004 194.6495 132.7923
> std 184.4902365 111.5699478 75.88299814
> min 37.523 57.084 52.233
> max 588.74 382.376 319.115
Impressive.
> So, rik patch and my patch improve perfectly different reclaim aspect.
> In general, kernel reclaim processing has several key goals.
>
> (1) if system has droppable cache, system shouldn't happen oom kill.
> (2) if system has avaiable swap space, system shouldn't happen
> oom kill as poosible as.
> (3) if system has enough free memory, system shouldn't reclaim any page
> at all.
> (4) if memory pressure is lite, system shouldn't cause heavy reclaim
> latency to application.
>
> rik patch improve (3), my (this mail) modification improve (4).
Actually, to achieve (3) we would want to skip zones with way
more than enough free memory in shrink_zones(). Kswapd already
skips zones like this in shrink_pgdat(), so we definitely want
this change:
@@ -1519,6 +1519,9 @@ static void shrink_zones(int priority, s
if (zone_is_all_unreclaimable(zone) &&
priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ if (zone_watermark_ok(zone, order, 4*zone->pages_high,
+ end_zone, 0))
+ continue; /* Lots free already */
sc->all_unreclaimable = 0;
} else {
/*
I'm sending a patch with this right now :)
> Actually, kswapd background reclaim and direct reclaim have perfectly
> different purpose and goal.
>
> background reclaim
> - kswapd don't need latency overhead reducing.
> it isn't observe from end user.
> - kswapd shoudn't only increase free pages, but also should
> zone balancing.
>
> foreground reclaim
> - it used application task context.
> - as possible as, it shouldn't increase latency overhead.
> - this reclaiming purpose is to make the memory for _own_ taks.
> for other tasks memory don't need to concern.
> kswap does it.
I am not entirely convinced that breaking out of the loop early
in a zone is not harmful for direct reclaimers. Maybe it works
fine, maybe it won't.
Or maybe direct reclaimers should start scanning the largest zone
first, so your change can be done with the lowest risk possible?
Having said that, the 20% additional performance achieved with
your changes is impressive.
> o remove priority==DEF_PRIORITY condision
This one could definitely be worth considering.
However, looking at the changeset that was backed out in the
early 2.6 series suggests that it may not be the best idea.
> o shrink_zones() also should have bailing out feature.
This one is similar. What are the downsides of skipping a
zone entirely, when that zone has pages that should be freed?
If it can lead to the VM reclaiming new pages from one zone,
while leaving old pages from another zone in memory, we can
greatly reduce the caching efficiency of the page cache.
> ---
> mm/vmscan.c | 4 +++-
> 1 file changed, 3 insertions(+), 1 deletion(-)
>
> Index: b/mm/vmscan.c
> ===================================================================
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1469,7 +1469,7 @@ static void shrink_zone(int priority, st
> * freeing target can get unreasonably large.
> */
> if (sc->nr_reclaimed > sc->swap_cluster_max &&
> - priority < DEF_PRIORITY && !current_is_kswapd())
> + !current_is_kswapd())
> break;
> }
>
> @@ -1534,6 +1534,8 @@ static void shrink_zones(int priority, s
> }
>
> shrink_zone(priority, zone, sc);
> + if (sc->nr_reclaimed > sc->swap_cluster_max)
> + break;
> }
> }
--
All rights reversed.
--
* Re: [rfc] vmscan: serialize aggressive reclaimers
2008-11-27 17:36 ` [rfc] vmscan: serialize aggressive reclaimers Johannes Weiner
@ 2008-11-29 7:46 ` KOSAKI Motohiro
2008-11-29 15:39 ` Johannes Weiner
0 siblings, 1 reply; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-29 7:46 UTC (permalink / raw)
To: Johannes Weiner
Cc: kosaki.motohiro, Rik van Riel, linux-mm, linux-kernel, mel, akpm
> Since we have to pull through a reclaim cycle once we commited to it,
> what do you think about serializing the lower priority levels
> completely?
>
> The idea is that when one reclaimer has done a low priority level
> iteration with a huge reclaim target, chances are that succeeding
> reclaimers don't even need to drop to lower levels at all because
> enough memory has already been freed.
>
> My testprogram maps and faults in a file that is about as large as my
> physical memory. Then it spawns off n processes that try allocate
> 1/2n of total memory in anon pages, i.e. half of it in sum. After it
> ran, I check how much memory has been reclaimed. But my zone sizes
> are too small to induce enormous reclaim targets so I don't see vast
> over-reclaims.
>
> I have measured the time of other tests on an SMP machine with 4 cores
> and the following patch applied. I couldn't see any performance
> degradation. But since the bug is not triggerable here, I can not
> prove it helps the original problem, either.
I wonder why none of the vmscan folks put actual performance improvement
numbers in their patch descriptions.
I think this patch points in the right direction,
but unfortunately this implementation isn't as fast, according to my measurements.
>
> The level where it starts serializing is chosen pretty arbitrarily.
> Suggestions welcome :)
>
> Hannes
>
> ---
>
> Prevent over-reclaiming by serializing direct reclaimers below a
> certain priority level.
>
> Over-reclaiming happens when the sum of the reclaim targets of all
> reclaiming processes is larger than the sum of the needed free pages,
> thus leading to excessive eviction of more cache and anonymous pages
> than required.
>
> A scan iteration over all zones can not be aborted intermittently when
> enough pages are reclaimed because that would mess up the scan balance
> between the zones. Instead, prevent that too many processes
> simultaneously commit themselves to lower priority level scans in the
> first place.
>
> Chances are that after the exclusive reclaimer has finished, enough
> memory has been freed that succeeding scanners don't need to drop to
> lower priority levels at all anymore.
>
> Signed-off-by: Johannes Weiner <hannes@saeurebad.de>
> ---
> mm/vmscan.c | 20 ++++++++++++++++++++
> 1 file changed, 20 insertions(+)
>
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -35,6 +35,7 @@
> #include <linux/notifier.h>
> #include <linux/rwsem.h>
> #include <linux/delay.h>
> +#include <linux/wait.h>
> #include <linux/kthread.h>
> #include <linux/freezer.h>
> #include <linux/memcontrol.h>
> @@ -42,6 +43,7 @@
> #include <linux/sysctl.h>
>
> #include <asm/tlbflush.h>
> +#include <asm/atomic.h>
> #include <asm/div64.h>
>
> #include <linux/swapops.h>
> @@ -1546,10 +1548,15 @@ static unsigned long shrink_zones(int pr
> * returns: 0, if no pages reclaimed
> * else, the number of pages reclaimed
> */
> +
> +static DECLARE_WAIT_QUEUE_HEAD(reclaim_wait);
> +static atomic_t reclaim_exclusive = ATOMIC_INIT(0);
> +
> static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> struct scan_control *sc)
> {
> int priority;
> + int exclusive = 0;
> unsigned long ret = 0;
> unsigned long total_scanned = 0;
> unsigned long nr_reclaimed = 0;
> @@ -1580,6 +1587,14 @@ static unsigned long do_try_to_free_page
> sc->nr_scanned = 0;
> if (!priority)
> disable_swap_token();
> + /*
> + * Serialize aggressive reclaimers
> + */
> + if (priority <= DEF_PRIORITY / 2 && !exclusive) {
On a large machine, DEF_PRIORITY / 2 already means a real catastrophe.
2^6 = 64:
if a zone has 64GB of memory, that means a 1GB reclaim target.
I think restricting earlier is better.
> + wait_event(reclaim_wait,
> + !atomic_cmpxchg(&reclaim_exclusive, 0, 1));
> + exclusive = 1;
> + }
If you want to restrict this to one task, you can use a mutex.
Also, this wait_queue shouldn't be a global variable; it should be per zone.
In addition, you don't consider recursive reclaim, and several tasks can't sleep there.
Please believe me, I have the richest experience with reclaim throttling on the planet.
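For example, something like this untested sketch (keeping your
DEF_PRIORITY / 2 threshold and "exclusive" flag just for illustration;
a per-zone or per-node lock would follow the same pattern):

static DEFINE_MUTEX(reclaim_mutex);

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		...
		/* serialize aggressive direct reclaimers */
		if (priority <= DEF_PRIORITY / 2 && !exclusive) {
			mutex_lock(&reclaim_mutex);
			exclusive = 1;
		}
		...
	}
out:
	...
	if (exclusive)
		mutex_unlock(&reclaim_mutex);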
--
* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
2008-11-28 11:03 ` Rik van Riel
@ 2008-11-29 10:53 ` KOSAKI Motohiro
2008-11-29 16:24 ` Rik van Riel
0 siblings, 1 reply; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-29 10:53 UTC (permalink / raw)
To: Rik van Riel, akpm; +Cc: kosaki.motohiro, linux-mm, linux-kernel, mel
> > So, rik patch and my patch improve perfectly different reclaim aspect.
> > In general, kernel reclaim processing has several key goals.
> >
> > (1) if system has droppable cache, system shouldn't happen oom kill.
> > (2) if system has avaiable swap space, system shouldn't happen
> > oom kill as poosible as.
> > (3) if system has enough free memory, system shouldn't reclaim any page
> > at all.
> > (4) if memory pressure is lite, system shouldn't cause heavy reclaim
> > latency to application.
> >
> > rik patch improve (3), my (this mail) modification improve (4).
>
> Actually, to achieve (3) we would want to skip zones with way
> more than enough free memory in shrink_zones(). Kswapd already
> skips zones like this in shrink_pgdat(), so we definately want
> this change:
>
> @@ -1519,6 +1519,9 @@ static void shrink_zones(int priority, s
> if (zone_is_all_unreclaimable(zone) &&
> priority != DEF_PRIORITY)
> continue; /* Let kswapd poll it */
> + if (zone_watermark_ok(zone, order, 4*zone->pages_high,
> + end_zone, 0))
> + continue; /* Lots free already */
> sc->all_unreclaimable = 0;
> } else {
> /*
>
> I'm sending a patch with this right now :)
Please wait a few days.
Actually, I made a similar patch half a year ago,
but I dropped it because I observed a performance regression.
My recollection isn't clear, though;
I should measure it again.
My guess is:
zone_watermark_ok() is a very slow function; it doesn't only check the
number of free pages, it also checks memory fragmentation.
So if it is called under light memory pressure, we violate rule (4) above.
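From memory, the shape of that function is roughly like this (a condensed
sketch, not the exact 2.6.28 code):

int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		      int classzone_idx, int alloc_flags)
{
	long free = zone_page_state(z, NR_FREE_PAGES);
	long min = mark;
	int o;

	if (free <= min + z->lowmem_reserve[classzone_idx])
		return 0;
	/*
	 * For order > 0 it also walks the per-order free lists, so the
	 * check accounts for fragmentation and gets more expensive.
	 */
	for (o = 0; o < order; o++) {
		free -= z->free_area[o].nr_free << o;
		min >>= 1;
		if (free <= min)
			return 0;
	}
	return 1;
}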
> > Actually, kswapd background reclaim and direct reclaim have perfectly
> > different purpose and goal.
> >
> > background reclaim
> > - kswapd don't need latency overhead reducing.
> > it isn't observe from end user.
> > - kswapd shoudn't only increase free pages, but also should
> > zone balancing.
> >
> > foreground reclaim
> > - it used application task context.
> > - as possible as, it shouldn't increase latency overhead.
> > - this reclaiming purpose is to make the memory for _own_ taks.
> > for other tasks memory don't need to concern.
> > kswap does it.
>
> I am not entirely convinced that breaking out of the loop early
> in a zone is not harmful for direct reclaimers. Maybe it works
> fine, maybe it won't.
>
> Or maybe direct reclaimers should start scanning the largest zone
> first, so your change can be done with the lowest risk possible?
>
> Having said that, the 20% additional performance achieved with
> your changes is impressive.
>
> > o remove priority==DEF_PRIORITY condision
>
> This one could definately be worth considering.
>
> However, looking at the changeset that was backed out in the
> early 2.6 series suggests that it may not be the best idea.
>
> > o shrink_zones() also should have bailing out feature.
>
> This one is similar. What are the downsides of skipping a
> zone entirely, when that zone has pages that should be freed?
>
> If it can lead to the VM reclaiming new pages from one zone,
> while leaving old pages from another zone in memory, we can
> greatly reduce the caching efficiency of the page cache.
I think I can explain this logically.
First, please see the ML archive URL below;
it describes why akpm's "vmscan.c: dont reclaim too many pages" was dropped.
http://groups.google.co.jp/group/linux.kernel/browse_thread/thread/383853cdce059d1f/f13d5f87d726e325?hl=ja%3Fhl&lnk=gst&q=vmscan%3A+balancing+fix+akpm#f13d5f87d726e325
Again, the old akpm patch restricted both direct reclaim and background reclaim,
so that discussion didn't separate the two things either,
but your patch and mine restrict direct reclaim only.
At that time, Marcelo Tosatti reported that the akpm patch caused reclaim
imbalance in the FFSB benchmark.
I measured ffsb on 2.6.28-rc6 and on our patches.
Measured machine spec:
CPU: IA64 x 8
MEM
Node0: DMA ZONE: 2GB
NORMAL ZONE: 2GB
Node1: DMA ZONE: 4GB
-----------------
total 8GB
Configuration file used (the same as Marcelo's conf)
---------------------------------------------------
directio=0
time=300
[filesystem0]
location=/mnt/sdb1/kosaki/ffsb
num_files=20
num_dirs=10
max_filesize=91534338
min_filesize=65535
[end0]
[threadgroup0]
num_threads=10
write_size=2816
write_blocksize=4096
read_size=2816
read_blocksize=4096
create_weight=100
write_weight=30
read_weight=100
[end0]
--------------------------------------------------------
result:
------------ 2.6.28-rc6 ----------------------------------
pgscan_kswapd_dma 10624
pgscan_kswapd_normal 20640
-> normal/dma ratio 20640 / 10624 = 1.9
pgscan_direct_dma 576
pgscan_direct_normal 2528
-> normal/dma ratio 2528 / 576 = 4.38
kswapd+direct dma 11200
kswapd+direct normal 23168
-> normal/dma ratio 2.0
------------ rvr bail out ---------------------------------
pgscan_kswapd_dma 15552
pgscan_kswapd_normal 31936
-> normal/dma ratio 2.05
pgscan_direct_dma 1216
pgscan_direct_normal 3968
-> normal/dma ratio 3.26
kswapd+direct dma 16768
kswapd+direct normal 35904
-> normal/dma ratio 2.1
------------ +kosaki ---------------------------------
pgscan_kswapd_dma 14208
pgscan_kswapd_normal 31616
-> normal/dma ratio 31616/14208 = 2.25
pgscan_direct_dma 1024
pgscan_direct_normal 3328
-> normal/dma ratio 3328/1024 = 3.25
kswapd+direct dma 15232
kswapd+direct normal 34944
-> normal/dma ratio 2.2
----------------------------------------------------------
The results tell us three things:
- rvr's patch and mine do indeed increase the direct reclaim imbalance.
- However, background reclaim scans _much_ more than direct reclaim,
so the direct reclaim imbalance is negligible in the big picture;
rvr's patch doesn't reintroduce the zone imbalance issue.
- rvr's priority==DEF_PRIORITY condition check doesn't improve
zone balancing at all;
we can drop it.
Again, I believe my patch improves VM scanning overall.
Any comments?
Andrew, I hope this measurement result gets added to rvr's bail-out patch description too.
Please let me know what I should do.
--
* Re: [rfc] vmscan: serialize aggressive reclaimers
2008-11-29 7:46 ` KOSAKI Motohiro
@ 2008-11-29 15:39 ` Johannes Weiner
0 siblings, 0 replies; 23+ messages in thread
From: Johannes Weiner @ 2008-11-29 15:39 UTC (permalink / raw)
To: KOSAKI Motohiro; +Cc: Rik van Riel, linux-mm, linux-kernel, mel, akpm
On Sat, Nov 29, 2008 at 04:46:24PM +0900, KOSAKI Motohiro wrote:
> > Since we have to pull through a reclaim cycle once we commited to it,
> > what do you think about serializing the lower priority levels
> > completely?
> >
> > The idea is that when one reclaimer has done a low priority level
> > iteration with a huge reclaim target, chances are that succeeding
> > reclaimers don't even need to drop to lower levels at all because
> > enough memory has already been freed.
> >
> > My testprogram maps and faults in a file that is about as large as my
> > physical memory. Then it spawns off n processes that try allocate
> > 1/2n of total memory in anon pages, i.e. half of it in sum. After it
> > ran, I check how much memory has been reclaimed. But my zone sizes
> > are too small to induce enormous reclaim targets so I don't see vast
> > over-reclaims.
> >
> > I have measured the time of other tests on an SMP machine with 4 cores
> > and the following patch applied. I couldn't see any performance
> > degradation. But since the bug is not triggerable here, I can not
> > prove it helps the original problem, either.
>
> I wonder why nobody of vmscan folks write actual performance improvement value
> in patch description.
That's why I made it an RFC. I haven't seriously tested it, I just
wanted to know what people who understand more than I do think of the
idea.
> I think this patch point to right direction.
> but, unfortunately, this implementation isn't fast as I mesured as.
Fair enough.
> > The level where it starts serializing is chosen pretty arbitrarily.
> > Suggestions welcome :)
> >
> > Hannes
> >
> > ---
> >
> > Prevent over-reclaiming by serializing direct reclaimers below a
> > certain priority level.
> >
> > Over-reclaiming happens when the sum of the reclaim targets of all
> > reclaiming processes is larger than the sum of the needed free pages,
> > thus leading to excessive eviction of more cache and anonymous pages
> > than required.
> >
> > A scan iteration over all zones can not be aborted intermittently when
> > enough pages are reclaimed because that would mess up the scan balance
> > between the zones. Instead, prevent that too many processes
> > simultaneously commit themselves to lower priority level scans in the
> > first place.
> >
> > Chances are that after the exclusive reclaimer has finished, enough
> > memory has been freed that succeeding scanners don't need to drop to
> > lower priority levels at all anymore.
> >
> > Signed-off-by: Johannes Weiner <hannes@saeurebad.de>
> > ---
> > mm/vmscan.c | 20 ++++++++++++++++++++
> > 1 file changed, 20 insertions(+)
> >
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -35,6 +35,7 @@
> > #include <linux/notifier.h>
> > #include <linux/rwsem.h>
> > #include <linux/delay.h>
> > +#include <linux/wait.h>
> > #include <linux/kthread.h>
> > #include <linux/freezer.h>
> > #include <linux/memcontrol.h>
> > @@ -42,6 +43,7 @@
> > #include <linux/sysctl.h>
> >
> > #include <asm/tlbflush.h>
> > +#include <asm/atomic.h>
> > #include <asm/div64.h>
> >
> > #include <linux/swapops.h>
> > @@ -1546,10 +1548,15 @@ static unsigned long shrink_zones(int pr
> > * returns: 0, if no pages reclaimed
> > * else, the number of pages reclaimed
> > */
> > +
> > +static DECLARE_WAIT_QUEUE_HEAD(reclaim_wait);
> > +static atomic_t reclaim_exclusive = ATOMIC_INIT(0);
> > +
> > static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
> > struct scan_control *sc)
> > {
> > int priority;
> > + int exclusive = 0;
> > unsigned long ret = 0;
> > unsigned long total_scanned = 0;
> > unsigned long nr_reclaimed = 0;
> > @@ -1580,6 +1587,14 @@ static unsigned long do_try_to_free_page
> > sc->nr_scanned = 0;
> > if (!priority)
> > disable_swap_token();
> > + /*
> > + * Serialize aggressive reclaimers
> > + */
> > + if (priority <= DEF_PRIORITY / 2 && !exclusive) {
>
> On large machine, DEF_PRIORITY / 2 is really catastrophe situation.
> 2^6 = 64.
> if zone has 64GB memory, it mean 1GB reclaim.
> I think more early restriction is better.
I am just afraid that it kills parallelism.
> > + wait_event(reclaim_wait,
> > + !atomic_cmpxchg(&reclaim_exclusive, 0, 1));
> > + exclusive = 1;
> > + }
>
> if you want to restrict to one task, you can use mutex.
> and this wait_queue should put on global variable. it should be zone variable.
Hm, global or per-zone? Rik suggested to do it per-node and I like
that idea.
> In addision, you don't consider recursive relaim and several task can't sleep there.
>
>
> please believe me. I have richest experience about reclaim throttling in the planet.
Hehe, okay. Then I am glad you don't hate the idea completely. Do
you have any patches flying around that do something similar?
Hannes
--
* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
2008-11-29 10:53 ` KOSAKI Motohiro
@ 2008-11-29 16:24 ` Rik van Riel
2008-11-30 6:30 ` KOSAKI Motohiro
2008-12-01 13:40 ` [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Christoph Lameter
0 siblings, 2 replies; 23+ messages in thread
From: Rik van Riel @ 2008-11-29 16:24 UTC (permalink / raw)
To: KOSAKI Motohiro; +Cc: akpm, linux-mm, linux-kernel, mel
KOSAKI Motohiro wrote:
> The result talk about three things.
>
> - rvr and mine patch increase direct reclaim imbalancing, indeed.
> - However, background reclaim scanning is _very_ much than direct reclaim.
> Then, direct reclaim imbalancing is ignorable on the big view.
> rvr patch doesn't reintroduce zone imbalancing issue.
> - rvr's priority==DEF_PRIORITY condition checking doesn't improve
> zone balancing at all.
> we can drop it.
>
> Again, I believe my patch improve vm scanning totally.
>
> Any comments?
Reclaiming is very easy when the workload is just page cache,
because the application will be throttled when too many page
cache pages are dirty.
When using mmap or memory hogs writing to swap, applications
will not be throttled by the "too many dirty pages" logic,
but may instead end up being throttled in the direct reclaim
path instead.
At that point direct reclaim may become a lot more common,
making the imbalance more significant.
I'll run a few tests.
> Andrew, I hope add this mesurement result to rvr bailing out patch description too.
So far the performance numbers you have measured are very
encouraging and do indeed suggest that the priority==DEF_PRIORITY
thing does not make a difference.
--
All rights reversed.
--
* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
2008-11-29 16:24 ` Rik van Riel
@ 2008-11-30 6:30 ` KOSAKI Motohiro
2008-12-03 5:26 ` [PATCH] vmscan: improve reclaim throughput to bail out patch KOSAKI Motohiro
2008-12-01 13:40 ` [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Christoph Lameter
1 sibling, 1 reply; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-11-30 6:30 UTC (permalink / raw)
To: Rik van Riel; +Cc: kosaki.motohiro, akpm, linux-mm, linux-kernel, mel
> Reclaiming is very easy when the workload is just page cache,
> because the application will be throttled when too many page
> cache pages are dirty.
>
> When using mmap or memory hogs writing to swap, applications
> will not be throttled by the "too many dirty pages" logic,
> but may instead end up being throttled in the direct reclaim
> path instead.
>
> At that point direct reclaim may become a lot more common,
> making the imbalance more significant.
fair enough.
> I'll run a few tests.
Great.
I'm looking forward to your mail :)
> > Andrew, I hope add this mesurement result to rvr bailing out patch description too.
>
> So far the performance numbers you have measured are very
> encouraging and do indeed suggest that the priority==DEF_PRIORITY
> thing does not make a difference.
thank you.
I believe that reducing reclaim latency doesn't only improve HPC, it also
improves many multimedia and desktop applications.
--
* Re: [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages
2008-11-29 16:24 ` Rik van Riel
2008-11-30 6:30 ` KOSAKI Motohiro
@ 2008-12-01 13:40 ` Christoph Lameter
1 sibling, 0 replies; 23+ messages in thread
From: Christoph Lameter @ 2008-12-01 13:40 UTC (permalink / raw)
To: Rik van Riel; +Cc: KOSAKI Motohiro, akpm, linux-mm, linux-kernel, mel
On Sat, 29 Nov 2008, Rik van Riel wrote:
> When using mmap or memory hogs writing to swap, applications
> will not be throttled by the "too many dirty pages" logic,
> but may instead end up being throttled in the direct reclaim
> path instead.
The too many dirty pages logic will throttle applications dirtying
mmapped pages these days.
--
* [PATCH] vmscan: improve reclaim throughput to bail out patch
2008-11-30 6:30 ` KOSAKI Motohiro
@ 2008-12-03 5:26 ` KOSAKI Motohiro
2008-12-03 13:46 ` Rik van Riel
0 siblings, 1 reply; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-12-03 5:26 UTC (permalink / raw)
To: Rik van Riel, akpm, linux-mm, linux-kernel, mel; +Cc: kosaki.motohiro
Hi,
I have been continuously evaluating the rvr bail-out and skip-freeing patches this week.
I'd like to dump the first results here.
Rik, could you please review the following?
==
The vmscan bail-out patch moves the nr_reclaimed variable into struct scan_control.
Unfortunately, the indirect access can easily cause cache misses.
Even more unfortunately, some architectures (e.g. ia64) don't access global
variables very fast.
If memory pressure is heavy, that's OK:
cache misses are already plentiful, so it is not observable.
But if memory pressure is light, the performance regression is observable.
I compared the following three patterns (each was measured 10 times):
hackbench 125 process 3000
hackbench 130 process 3000
hackbench 135 process 3000
2.6.28-rc6 bail-out
125 130 135 125 130 135
==============================================================
71.866 75.86 81.274 93.414 73.254 193.382
74.145 78.295 77.27 74.897 75.021 80.17
70.305 77.643 75.855 70.134 77.571 79.896
74.288 73.986 75.955 77.222 78.48 80.619
72.029 79.947 78.312 75.128 82.172 79.708
71.499 77.615 77.042 74.177 76.532 77.306
76.188 74.471 83.562 73.839 72.43 79.833
73.236 75.606 78.743 76.001 76.557 82.726
69.427 77.271 76.691 76.236 79.371 103.189
72.473 76.978 80.643 69.128 78.932 75.736
avg 72.545 76.767 78.534 76.017 77.03 93.256
std 1.89 1.71 2.41 6.29 2.79 34.16
min 69.427 73.986 75.855 69.128 72.43 75.736
max 76.188 79.947 83.562 93.414 82.172 193.382
That is about a 4-5% degradation.
This patch therefore introduces a temporary local variable.
result:
2.6.28-rc6 this patch
num 125 130 135 125 130 135
==============================================================
71.866 75.86 81.274 67.302 68.269 77.161
74.145 78.295 77.27 72.616 72.712 79.06
70.305 77.643 75.855 72.475 75.712 77.735
74.288 73.986 75.955 69.229 73.062 78.814
72.029 79.947 78.312 71.551 74.392 78.564
71.499 77.615 77.042 69.227 74.31 78.837
76.188 74.471 83.562 70.759 75.256 76.6
73.236 75.606 78.743 69.966 76.001 78.464
69.427 77.271 76.691 69.068 75.218 80.321
72.473 76.978 80.643 72.057 77.151 79.068
avg 72.545 76.767 78.534 70.425 74.2083 78.462
std 1.89 1.71 2.41 1.66 2.34 1.00
min 69.427 73.986 75.855 67.302 68.269 76.6
max 76.188 79.947 83.562 72.616 77.151 80.321
OK, the degradation has disappeared.
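In isolation, the fix is just the usual trick: hoist the struct fields into
locals for the duration of the hot loop and write the result back once.
A stand-alone user-space sketch of that pattern (the names below are made up
for illustration; it is not the kernel code):

    #include <stdio.h>

    struct ctl {
            unsigned long nr_done;   /* accumulated result, like sc->nr_reclaimed */
            unsigned long chunk_max; /* loop-invariant limit, like sc->swap_cluster_max */
    };

    /* Stand-in for shrink_list(); in the real code that is a separate function
     * which also takes sc, so the field generally has to be re-read around the
     * call unless the compiler can prove it is unchanged. */
    static unsigned long do_chunk(struct ctl *c, unsigned long n)
    {
            (void)c;
            return n / 2;
    }

    static void hot_loop(struct ctl *c, unsigned long todo)
    {
            /* Hoist the fields into locals; they can usually stay in registers
             * instead of being re-read from memory on every iteration. */
            unsigned long nr_done = c->nr_done;
            unsigned long chunk_max = c->chunk_max;

            while (todo) {
                    unsigned long n = todo < chunk_max ? todo : chunk_max;

                    todo -= n;
                    nr_done += do_chunk(c, n);

                    /* bail-out check, like the one in shrink_zone() */
                    if (nr_done > chunk_max)
                            break;
            }

            /* Write back once instead of once per iteration. */
            c->nr_done = nr_done;
    }

    int main(void)
    {
            struct ctl c = { .nr_done = 0, .chunk_max = 32 };

            hot_loop(&c, 1000);
            printf("done %lu\n", c.nr_done);
            return 0;
    }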
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
mm/vmscan.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1418,6 +1418,8 @@ static void shrink_zone(int priority, st
unsigned long nr_to_scan;
unsigned long percent[2]; /* anon @ 0; file @ 1 */
enum lru_list l;
+ unsigned long nr_reclaimed = sc->nr_reclaimed;
+ unsigned long swap_cluster_max = sc->swap_cluster_max;
get_scan_ratio(zone, sc, percent);
@@ -1433,7 +1435,7 @@ static void shrink_zone(int priority, st
}
zone->lru[l].nr_scan += scan;
nr[l] = zone->lru[l].nr_scan;
- if (nr[l] >= sc->swap_cluster_max)
+ if (nr[l] >= swap_cluster_max)
zone->lru[l].nr_scan = 0;
else
nr[l] = 0;
@@ -1452,12 +1454,11 @@ static void shrink_zone(int priority, st
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(l) {
if (nr[l]) {
- nr_to_scan = min(nr[l],
- (unsigned long)sc->swap_cluster_max);
+ nr_to_scan = min(nr[l], swap_cluster_max);
nr[l] -= nr_to_scan;
- sc->nr_reclaimed += shrink_list(l, nr_to_scan,
- zone, sc, priority);
+ nr_reclaimed += shrink_list(l, nr_to_scan,
+ zone, sc, priority);
}
}
/*
@@ -1468,11 +1469,13 @@ static void shrink_zone(int priority, st
* with multiple processes reclaiming pages, the total
* freeing target can get unreasonably large.
*/
- if (sc->nr_reclaimed > sc->swap_cluster_max &&
+ if (nr_reclaimed > swap_cluster_max &&
priority < DEF_PRIORITY && !current_is_kswapd())
break;
}
+ sc->nr_reclaimed = nr_reclaimed;
+
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
* Re: [PATCH] vmscan: improve reclaim throuput to bail out patch
2008-12-03 5:26 ` [PATCH] vmscan: improve reclaim throuput to bail out patch KOSAKI Motohiro
@ 2008-12-03 13:46 ` Rik van Riel
2008-12-03 15:12 ` KOSAKI Motohiro
0 siblings, 1 reply; 23+ messages in thread
From: Rik van Riel @ 2008-12-03 13:46 UTC (permalink / raw)
To: KOSAKI Motohiro; +Cc: akpm, linux-mm, linux-kernel, mel
KOSAKI Motohiro wrote:
> Hi
>
> I have been continuously evaluating the rvr bail-out and skip-freeing patches this week.
> I'd like to post the first results here.
>
>
>
> Rik, could you please review the following?
> ==
> The vmscan bail-out patch moved the nr_reclaimed variable into struct scan_control.
> Unfortunately, that indirect access can easily cause cache misses.
> More unfortunately, some architectures (e.g. ia64) don't access global
> variables very fast.
That is amazing. Especially considering that the scan_control
is a local variable on the stack.
> If heavy memory pressure happens, that's OK: cache misses are already
> plentiful, so it is not observable.
>
> But if memory pressure is light, the performance degradation is observable.
> That is about a 4-5% degradation.
>
> This patch therefore introduces a temporary local variable.
> OK, the degradation has disappeared.
I can't argue with the numbers, though :)
Maybe all the scanning we do ends up evicting the cache lines
with the scan_control struct in it from the fast part of the
CPU cache?
> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Rik van Riel <riel@redhat.com>
--
All rights reversed.
* Re: [PATCH] vmscan: improve reclaim throuput to bail out patch
2008-12-03 13:46 ` Rik van Riel
@ 2008-12-03 15:12 ` KOSAKI Motohiro
2008-12-04 1:28 ` [PATCH] vmscan: improve reclaim throuput to bail out patch take2 KOSAKI Motohiro
0 siblings, 1 reply; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-12-03 15:12 UTC (permalink / raw)
To: Rik van Riel; +Cc: akpm, linux-mm, linux-kernel, mel
>> I have been continuously evaluating the rvr bail-out and skip-freeing patches this week.
>> I'd like to post the first results here.
>>
>>
>>
>> Rik, could you please review the following?
>> ==
>> The vmscan bail-out patch moved the nr_reclaimed variable into struct scan_control.
>> Unfortunately, that indirect access can easily cause cache misses.
>> More unfortunately, some architectures (e.g. ia64) don't access global
>> variables very fast.
>
> That is amazing. Especially considering that the scan_control
> is a local variable on the stack.
Ahhhh, I meant to write "indirect access (or something like a global variable)",
but my brain slipped. Sorry.
I'll post a version with a fixed description soon. Thanks.
>> If heavy memory pressure happens, that's OK: cache misses are already
>> plentiful, so it is not observable.
>>
>> But if memory pressure is light, the performance degradation is observable.
>
>> That is about a 4-5% degradation.
>>
>> This patch therefore introduces a temporary local variable.
>
>> OK, the degradation has disappeared.
>
> I can't argue with the numbers, though :)
>
> Maybe all the scanning we do ends up evicting the cache lines
> with the scan_control struct in it from the fast part of the
> CPU cache?
Yeah, I think so.
>
>> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
>
> Acked-by: Rik van Riel <riel@redhat.com>
* [PATCH] vmscan: improve reclaim throuput to bail out patch take2
2008-12-03 15:12 ` KOSAKI Motohiro
@ 2008-12-04 1:28 ` KOSAKI Motohiro
2008-12-04 4:20 ` MinChan Kim
2008-12-07 3:28 ` Andrew Morton
0 siblings, 2 replies; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-12-04 1:28 UTC (permalink / raw)
To: Rik van Riel, akpm, linux-mm, linux-kernel, mel; +Cc: kosaki.motohiro
The vmscan bail-out patch moved the nr_reclaimed variable into struct scan_control.
Unfortunately, that indirect access can easily cause cache misses.
If heavy memory pressure happens, that's OK: cache misses are already
plentiful, so it is not observable.
But if memory pressure is light, the performance degradation is observable.
I compared the following three patterns (each was measured 10 times):
hackbench 125 process 3000
hackbench 130 process 3000
hackbench 135 process 3000
2.6.28-rc6 bail-out
125 130 135 125 130 135
==============================================================
71.866 75.86 81.274 93.414 73.254 193.382
74.145 78.295 77.27 74.897 75.021 80.17
70.305 77.643 75.855 70.134 77.571 79.896
74.288 73.986 75.955 77.222 78.48 80.619
72.029 79.947 78.312 75.128 82.172 79.708
71.499 77.615 77.042 74.177 76.532 77.306
76.188 74.471 83.562 73.839 72.43 79.833
73.236 75.606 78.743 76.001 76.557 82.726
69.427 77.271 76.691 76.236 79.371 103.189
72.473 76.978 80.643 69.128 78.932 75.736
avg 72.545 76.767 78.534 76.017 77.03 93.256
std 1.89 1.71 2.41 6.29 2.79 34.16
min 69.427 73.986 75.855 69.128 72.43 75.736
max 76.188 79.947 83.562 93.414 82.172 193.382
That is about a 4-5% degradation.
This patch therefore introduces a temporary local variable.
result:
2.6.28-rc6 this patch
num 125 130 135 125 130 135
==============================================================
71.866 75.86 81.274 67.302 68.269 77.161
74.145 78.295 77.27 72.616 72.712 79.06
70.305 77.643 75.855 72.475 75.712 77.735
74.288 73.986 75.955 69.229 73.062 78.814
72.029 79.947 78.312 71.551 74.392 78.564
71.499 77.615 77.042 69.227 74.31 78.837
76.188 74.471 83.562 70.759 75.256 76.6
73.236 75.606 78.743 69.966 76.001 78.464
69.427 77.271 76.691 69.068 75.218 80.321
72.473 76.978 80.643 72.057 77.151 79.068
avg 72.545 76.767 78.534 70.425 74.2083 78.462
std 1.89 1.71 2.41 1.66 2.34 1.00
min 69.427 73.986 75.855 67.302 68.269 76.6
max 76.188 79.947 83.562 72.616 77.151 80.321
OK, the degradation has disappeared.
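(The avg/std/min/max rows are plain per-column statistics; a small sketch that
reproduces them for one column, assuming the std row uses the divide-by-N
population form, which matches the printed values up to rounding:)

    #include <math.h>
    #include <stdio.h>

    /* the 2.6.28-rc6 "125" column from the table above */
    static const double runs[] = {
            71.866, 74.145, 70.305, 74.288, 72.029,
            71.499, 76.188, 73.236, 69.427, 72.473,
    };

    int main(void)
    {
            const size_t n = sizeof(runs) / sizeof(runs[0]);
            double sum = 0.0, min = runs[0], max = runs[0];
            double avg, ss = 0.0;
            size_t i;

            for (i = 0; i < n; i++) {
                    sum += runs[i];
                    if (runs[i] < min) min = runs[i];
                    if (runs[i] > max) max = runs[i];
            }
            avg = sum / n;

            for (i = 0; i < n; i++)
                    ss += (runs[i] - avg) * (runs[i] - avg);

            /* population (divide-by-N) standard deviation */
            printf("avg %.3f std %.2f min %.3f max %.3f\n",
                   avg, sqrt(ss / n), min, max);
            return 0;
    }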
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Rik van Riel <riel@redhat.com>
---
mm/vmscan.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1418,6 +1418,8 @@ static void shrink_zone(int priority, st
unsigned long nr_to_scan;
unsigned long percent[2]; /* anon @ 0; file @ 1 */
enum lru_list l;
+ unsigned long nr_reclaimed = sc->nr_reclaimed;
+ unsigned long swap_cluster_max = sc->swap_cluster_max;
get_scan_ratio(zone, sc, percent);
@@ -1433,7 +1435,7 @@ static void shrink_zone(int priority, st
}
zone->lru[l].nr_scan += scan;
nr[l] = zone->lru[l].nr_scan;
- if (nr[l] >= sc->swap_cluster_max)
+ if (nr[l] >= swap_cluster_max)
zone->lru[l].nr_scan = 0;
else
nr[l] = 0;
@@ -1452,12 +1454,11 @@ static void shrink_zone(int priority, st
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(l) {
if (nr[l]) {
- nr_to_scan = min(nr[l],
- (unsigned long)sc->swap_cluster_max);
+ nr_to_scan = min(nr[l], swap_cluster_max);
nr[l] -= nr_to_scan;
- sc->nr_reclaimed += shrink_list(l, nr_to_scan,
- zone, sc, priority);
+ nr_reclaimed += shrink_list(l, nr_to_scan,
+ zone, sc, priority);
}
}
/*
@@ -1468,11 +1469,13 @@ static void shrink_zone(int priority, st
* with multiple processes reclaiming pages, the total
* freeing target can get unreasonably large.
*/
- if (sc->nr_reclaimed > sc->swap_cluster_max &&
+ if (nr_reclaimed > swap_cluster_max &&
priority < DEF_PRIORITY && !current_is_kswapd())
break;
}
+ sc->nr_reclaimed = nr_reclaimed;
+
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
* Re: [PATCH] vmscan: improve reclaim throuput to bail out patch take2
2008-12-04 1:28 ` [PATCH] vmscan: improve reclaim throuput to bail out patch take2 KOSAKI Motohiro
@ 2008-12-04 4:20 ` MinChan Kim
2008-12-04 5:04 ` KOSAKI Motohiro
2008-12-07 3:28 ` Andrew Morton
1 sibling, 1 reply; 23+ messages in thread
From: MinChan Kim @ 2008-12-04 4:20 UTC (permalink / raw)
To: KOSAKI Motohiro; +Cc: Rik van Riel, akpm, linux-mm, linux-kernel, mel
Hi, Kosaki-san.
It's a bigger improvement from just one local variable than I expected. :)
What is your test environment? (CPU, L1/L2 cache size, and so on)
Just out of curiosity.
On Thu, Dec 4, 2008 at 10:28 AM, KOSAKI Motohiro
<kosaki.motohiro@jp.fujitsu.com> wrote:
> The vmscan bail-out patch moved the nr_reclaimed variable into struct scan_control.
> Unfortunately, that indirect access can easily cause cache misses.
>
> If heavy memory pressure happens, that's OK: cache misses are already
> plentiful, so it is not observable.
>
> But if memory pressure is light, the performance degradation is observable.
>
>
> I compared the following three patterns (each was measured 10 times):
>
> hackbench 125 process 3000
> hackbench 130 process 3000
> hackbench 135 process 3000
>
> 2.6.28-rc6 bail-out
>
> 125 130 135 125 130 135
> ==============================================================
> 71.866 75.86 81.274 93.414 73.254 193.382
> 74.145 78.295 77.27 74.897 75.021 80.17
> 70.305 77.643 75.855 70.134 77.571 79.896
> 74.288 73.986 75.955 77.222 78.48 80.619
> 72.029 79.947 78.312 75.128 82.172 79.708
> 71.499 77.615 77.042 74.177 76.532 77.306
> 76.188 74.471 83.562 73.839 72.43 79.833
> 73.236 75.606 78.743 76.001 76.557 82.726
> 69.427 77.271 76.691 76.236 79.371 103.189
> 72.473 76.978 80.643 69.128 78.932 75.736
>
> avg 72.545 76.767 78.534 76.017 77.03 93.256
> std 1.89 1.71 2.41 6.29 2.79 34.16
> min 69.427 73.986 75.855 69.128 72.43 75.736
> max 76.188 79.947 83.562 93.414 82.172 193.382
>
>
> That is about a 4-5% degradation.
>
> This patch therefore introduces a temporary local variable.
>
> result:
>
> 2.6.28-rc6 this patch
>
> num 125 130 135 125 130 135
> ==============================================================
> 71.866 75.86 81.274 67.302 68.269 77.161
> 74.145 78.295 77.27 72.616 72.712 79.06
> 70.305 77.643 75.855 72.475 75.712 77.735
> 74.288 73.986 75.955 69.229 73.062 78.814
> 72.029 79.947 78.312 71.551 74.392 78.564
> 71.499 77.615 77.042 69.227 74.31 78.837
> 76.188 74.471 83.562 70.759 75.256 76.6
> 73.236 75.606 78.743 69.966 76.001 78.464
> 69.427 77.271 76.691 69.068 75.218 80.321
> 72.473 76.978 80.643 72.057 77.151 79.068
>
> avg 72.545 76.767 78.534 70.425 74.2083 78.462
> std 1.89 1.71 2.41 1.66 2.34 1.00
> min 69.427 73.986 75.855 67.302 68.269 76.6
> max 76.188 79.947 83.562 72.616 77.151 80.321
>
>
> OK, the degradation has disappeared.
>
>
>
> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> Acked-by: Rik van Riel <riel@redhat.com>
> ---
> mm/vmscan.c | 15 +++++++++------
> 1 file changed, 9 insertions(+), 6 deletions(-)
>
> Index: b/mm/vmscan.c
> ===================================================================
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1418,6 +1418,8 @@ static void shrink_zone(int priority, st
> unsigned long nr_to_scan;
> unsigned long percent[2]; /* anon @ 0; file @ 1 */
> enum lru_list l;
> + unsigned long nr_reclaimed = sc->nr_reclaimed;
> + unsigned long swap_cluster_max = sc->swap_cluster_max;
>
> get_scan_ratio(zone, sc, percent);
>
> @@ -1433,7 +1435,7 @@ static void shrink_zone(int priority, st
> }
> zone->lru[l].nr_scan += scan;
> nr[l] = zone->lru[l].nr_scan;
> - if (nr[l] >= sc->swap_cluster_max)
> + if (nr[l] >= swap_cluster_max)
> zone->lru[l].nr_scan = 0;
> else
> nr[l] = 0;
> @@ -1452,12 +1454,11 @@ static void shrink_zone(int priority, st
> nr[LRU_INACTIVE_FILE]) {
> for_each_evictable_lru(l) {
> if (nr[l]) {
> - nr_to_scan = min(nr[l],
> - (unsigned long)sc->swap_cluster_max);
> + nr_to_scan = min(nr[l], swap_cluster_max);
> nr[l] -= nr_to_scan;
>
> - sc->nr_reclaimed += shrink_list(l, nr_to_scan,
> - zone, sc, priority);
> + nr_reclaimed += shrink_list(l, nr_to_scan,
> + zone, sc, priority);
> }
> }
> /*
> @@ -1468,11 +1469,13 @@ static void shrink_zone(int priority, st
> * with multiple processes reclaiming pages, the total
> * freeing target can get unreasonably large.
> */
> - if (sc->nr_reclaimed > sc->swap_cluster_max &&
> + if (nr_reclaimed > swap_cluster_max &&
> priority < DEF_PRIORITY && !current_is_kswapd())
> break;
> }
>
> + sc->nr_reclaimed = nr_reclaimed;
> +
> /*
> * Even if we did not try to evict anon pages at all, we want to
> * rebalance the anon lru active/inactive ratio.
>
>
--
Kind regards,
MinChan Kim
* Re: [PATCH] vmscan: improve reclaim throuput to bail out patch take2
2008-12-04 4:20 ` MinChan Kim
@ 2008-12-04 5:04 ` KOSAKI Motohiro
0 siblings, 0 replies; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-12-04 5:04 UTC (permalink / raw)
To: MinChan Kim
Cc: kosaki.motohiro, Rik van Riel, akpm, linux-mm, linux-kernel, mel
Hi
> Hi, Kosaki-san.
>
> It's a bigger improvement from just one local variable than I expected. :)
> What is your test environment? (CPU, L1/L2 cache size, and so on)
> Just out of curiosity.
CPU: ia64x8
L1: 16KB
L2: 512KB
L3: 24MB
* Re: [PATCH] vmscan: improve reclaim throuput to bail out patch take2
2008-12-04 1:28 ` [PATCH] vmscan: improve reclaim throuput to bail out patch take2 KOSAKI Motohiro
2008-12-04 4:20 ` MinChan Kim
@ 2008-12-07 3:28 ` Andrew Morton
2008-12-08 2:49 ` KOSAKI Motohiro
1 sibling, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2008-12-07 3:28 UTC (permalink / raw)
To: KOSAKI Motohiro; +Cc: Rik van Riel, linux-mm, linux-kernel, mel
On Thu, 4 Dec 2008 10:28:39 +0900 (JST) KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:
> The vmscan bail-out patch moved the nr_reclaimed variable into struct scan_control.
> Unfortunately, that indirect access can easily cause cache misses.
>
> If heavy memory pressure happens, that's OK: cache misses are already
> plentiful, so it is not observable.
>
> But if memory pressure is light, the performance degradation is observable.
>
>
> I compared the following three patterns (each was measured 10 times):
>
> hackbench 125 process 3000
> hackbench 130 process 3000
> hackbench 135 process 3000
>
> 2.6.28-rc6 bail-out
>
> 125 130 135 125 130 135
> ==============================================================
> 71.866 75.86 81.274 93.414 73.254 193.382
> 74.145 78.295 77.27 74.897 75.021 80.17
> 70.305 77.643 75.855 70.134 77.571 79.896
> 74.288 73.986 75.955 77.222 78.48 80.619
> 72.029 79.947 78.312 75.128 82.172 79.708
> 71.499 77.615 77.042 74.177 76.532 77.306
> 76.188 74.471 83.562 73.839 72.43 79.833
> 73.236 75.606 78.743 76.001 76.557 82.726
> 69.427 77.271 76.691 76.236 79.371 103.189
> 72.473 76.978 80.643 69.128 78.932 75.736
>
> avg 72.545 76.767 78.534 76.017 77.03 93.256
> std 1.89 1.71 2.41 6.29 2.79 34.16
> min 69.427 73.986 75.855 69.128 72.43 75.736
> max 76.188 79.947 83.562 93.414 82.172 193.382
>
>
> That is about a 4-5% degradation.
>
> This patch therefore introduces a temporary local variable.
>
> result:
>
> 2.6.28-rc6 this patch
>
> num 125 130 135 125 130 135
> ==============================================================
> 71.866 75.86 81.274 67.302 68.269 77.161
> 74.145 78.295 77.27 72.616 72.712 79.06
> 70.305 77.643 75.855 72.475 75.712 77.735
> 74.288 73.986 75.955 69.229 73.062 78.814
> 72.029 79.947 78.312 71.551 74.392 78.564
> 71.499 77.615 77.042 69.227 74.31 78.837
> 76.188 74.471 83.562 70.759 75.256 76.6
> 73.236 75.606 78.743 69.966 76.001 78.464
> 69.427 77.271 76.691 69.068 75.218 80.321
> 72.473 76.978 80.643 72.057 77.151 79.068
>
> avg 72.545 76.767 78.534 70.425 74.2083 78.462
> std 1.89 1.71 2.41 1.66 2.34 1.00
> min 69.427 73.986 75.855 67.302 68.269 76.6
> max 76.188 79.947 83.562 72.616 77.151 80.321
>
>
> OK, the degradation has disappeared.
>
Yes, this is a very surprising result. Suspicious, in fact.
>
>
> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> Acked-by: Rik van Riel <riel@redhat.com>
> ---
> mm/vmscan.c | 15 +++++++++------
> 1 file changed, 9 insertions(+), 6 deletions(-)
>
> Index: b/mm/vmscan.c
> ===================================================================
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1418,6 +1418,8 @@ static void shrink_zone(int priority, st
> unsigned long nr_to_scan;
> unsigned long percent[2]; /* anon @ 0; file @ 1 */
> enum lru_list l;
> + unsigned long nr_reclaimed = sc->nr_reclaimed;
> + unsigned long swap_cluster_max = sc->swap_cluster_max;
>
> get_scan_ratio(zone, sc, percent);
>
> @@ -1433,7 +1435,7 @@ static void shrink_zone(int priority, st
> }
> zone->lru[l].nr_scan += scan;
> nr[l] = zone->lru[l].nr_scan;
> - if (nr[l] >= sc->swap_cluster_max)
> + if (nr[l] >= swap_cluster_max)
> zone->lru[l].nr_scan = 0;
> else
> nr[l] = 0;
> @@ -1452,12 +1454,11 @@ static void shrink_zone(int priority, st
> nr[LRU_INACTIVE_FILE]) {
> for_each_evictable_lru(l) {
> if (nr[l]) {
> - nr_to_scan = min(nr[l],
> - (unsigned long)sc->swap_cluster_max);
> + nr_to_scan = min(nr[l], swap_cluster_max);
> nr[l] -= nr_to_scan;
>
> - sc->nr_reclaimed += shrink_list(l, nr_to_scan,
> - zone, sc, priority);
> + nr_reclaimed += shrink_list(l, nr_to_scan,
> + zone, sc, priority);
> }
> }
> /*
> @@ -1468,11 +1469,13 @@ static void shrink_zone(int priority, st
> * with multiple processes reclaiming pages, the total
> * freeing target can get unreasonably large.
> */
> - if (sc->nr_reclaimed > sc->swap_cluster_max &&
> + if (nr_reclaimed > swap_cluster_max &&
> priority < DEF_PRIORITY && !current_is_kswapd())
> break;
> }
>
> + sc->nr_reclaimed = nr_reclaimed;
> +
> /*
> * Even if we did not try to evict anon pages at all, we want to
> * rebalance the anon lru active/inactive ratio.
If this improved the throughput of direct-reclaim callers then one
would expect it to make larger improvements for kswapd (assuming
that all other things are equal for those tasks, which they are not).
What is your direct-reclaim to kswapd-reclaim ratio for that workload?
(grep pgscan /proc/vmstat)
Does that patch make any change to the amount of CPU time which kswapd
consumed?
Or you can not bother doing this work ;) The patch looks sensible
anyway. It's just that the numbers look whacky.
* Re: [PATCH] vmscan: improve reclaim throuput to bail out patch take2
2008-12-07 3:28 ` Andrew Morton
@ 2008-12-08 2:49 ` KOSAKI Motohiro
0 siblings, 0 replies; 23+ messages in thread
From: KOSAKI Motohiro @ 2008-12-08 2:49 UTC (permalink / raw)
To: Andrew Morton; +Cc: kosaki.motohiro, Rik van Riel, linux-mm, linux-kernel, mel
I think my last explanation was too poor.
> If this improved the throughput of direct-reclaim callers then one
> would expect it to make larger improvements for kswapd (assuming
> that all other things are equal for those tasks, which they are not).
>
> What is your direct-reclaim to kswapd-reclaim ratio for that workload?
> (grep pgscan /proc/vmstat)
>
That is because this benchmark is a direct-reclaim torture workload.
The /proc/vmstat changes were:
<before>
pgscan_kswapd_dma 1152
pgscan_kswapd_normal 2400
pgscan_kswapd_movable 0
pgscan_direct_dma 32
pgscan_direct_normal 512
pgscan_direct_movable 0
<after>
pgscan_kswapd_dma 3520
pgscan_kswapd_normal 12160
pgscan_kswapd_movable 0
pgscan_direct_dma 10048
pgscan_direct_normal 31904
pgscan_direct_movable 0
-> kswapd:direct = 1 : 3.4
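(A minimal check of that ratio from the deltas above:)

    #include <stdio.h>

    int main(void)
    {
            /* pgscan deltas (after - before) from the /proc/vmstat snapshots above */
            unsigned long kswapd = (3520 - 1152) + (12160 - 2400) + (0 - 0);
            unsigned long direct = (10048 - 32) + (31904 - 512) + (0 - 0);

            /* 12128 pages scanned by kswapd vs 41408 scanned by direct reclaim */
            printf("kswapd:direct = 1 : %.1f\n", (double)direct / kswapd);
            return 0;
    }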
Why do I test a non-typical, extreme workload?
I have two reasons.
1. Nobody wants a regression, even if the workload isn't typical.
2. If the patch keeps performance up even in the extreme case,
it of course also works well on lightweight workloads.
If my patch caused any regression it would definitely be worthless,
since it only addresses the extreme case.
But I don't think it does.
> Does that patch make any change to the amount of CPU time which kswapd
> consumed?
I haven't measured it yet.
But at least, the top command didn't show any increase in CPU consumption.
>
> Or you can not bother doing this work ;) The patch looks sensible
> anyway. It's just that the numbers look whacky.
end of thread, other threads:[~2008-12-08 2:49 UTC | newest]
Thread overview: 23+ messages
2008-11-24 19:50 [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Rik van Riel
2008-11-24 20:53 ` Andrew Morton
2008-11-25 11:35 ` KOSAKI Motohiro
2008-11-25 13:32 ` Rik van Riel
2008-11-25 14:30 ` KOSAKI Motohiro
2008-11-28 7:02 ` KOSAKI Motohiro
2008-11-28 11:03 ` Rik van Riel
2008-11-29 10:53 ` KOSAKI Motohiro
2008-11-29 16:24 ` Rik van Riel
2008-11-30 6:30 ` KOSAKI Motohiro
2008-12-03 5:26 ` [PATCH] vmscan: improve reclaim throuput to bail out patch KOSAKI Motohiro
2008-12-03 13:46 ` Rik van Riel
2008-12-03 15:12 ` KOSAKI Motohiro
2008-12-04 1:28 ` [PATCH] vmscan: improve reclaim throuput to bail out patch take2 KOSAKI Motohiro
2008-12-04 4:20 ` MinChan Kim
2008-12-04 5:04 ` KOSAKI Motohiro
2008-12-07 3:28 ` Andrew Morton
2008-12-08 2:49 ` KOSAKI Motohiro
2008-12-01 13:40 ` [PATCH] vmscan: bail out of page reclaim after swap_cluster_max pages Christoph Lameter
2008-11-26 2:24 ` KOSAKI Motohiro
2008-11-27 17:36 ` [rfc] vmscan: serialize aggressive reclaimers Johannes Weiner
2008-11-29 7:46 ` KOSAKI Motohiro
2008-11-29 15:39 ` Johannes Weiner