* [for -mm][PATCH][1/2] page reclaim throttle take3
From: KOSAKI Motohiro @ 2008-03-22 10:45 UTC
To: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, Balbir Singh,
Rik van Riel, David Rientjes
Cc: kosaki.motohiro
Hi,

this is the latest version of the page reclaim throttle patch series.
I will explain the performance results in a separate mail.
(I am still working on increasing the coverage of the measurements.)
At least in some measurements, considerably good results have come out.
---------------------------------------------------------------------
changelog
========================================
v2 -> v3:
 o use wake_up() instead of wake_up_all()
 o the maximum number of reclaimers can be changed via a Kconfig
   option and a sysctl
 o some cleanups

v1 -> v2:
 o make the throttle per-zone
description
========================================
The current VM implementation has no limit on the number of parallel
reclaimers. Under a heavy workload this causes two bad things:
 - heavy lock contention
 - unnecessary swap out

In Dec 2007, KAMEZAWA Hiroyuki proposed a page reclaim throttle patch
and explained that it improves reclaim time:
http://marc.info/?l=linux-mm&m=119667465917215&w=2
Unfortunately, it only works for memory cgroup reclaim.
This time I implemented it again with support for global reclaim, and
measured it.
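The throttle itself is a small counting gate built from wait_event()
and atomic_add_unless(); stripped of the surrounding code, the
enter/exit pattern in shrink_zone() below is:

	/* enter: sleep until one of the per-zone slots is free */
	wait_event(zone->reclaim_throttle_waitq,
		   atomic_add_unless(&zone->nr_reclaimers, 1,
				     CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));

	/* ... shrink the zone ... */

	/* exit: release the slot and wake up one waiter */
	atomic_dec(&zone->nr_reclaimers);
	wake_up(&zone->reclaim_throttle_waitq);

atomic_add_unless() only increments nr_reclaimers while it is below
the limit, so at most that many tasks can reclaim from one zone at
the same time.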
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
include/linux/mmzone.h | 2 +
mm/Kconfig | 10 ++++++
mm/page_alloc.c | 4 ++
mm/vmscan.c | 73 ++++++++++++++++++++++++++++++++++++++++---------
4 files changed, 76 insertions(+), 13 deletions(-)
Index: b/include/linux/mmzone.h
===================================================================
--- a/include/linux/mmzone.h 2008-03-14 21:51:36.000000000 +0900
+++ b/include/linux/mmzone.h 2008-03-14 21:58:52.000000000 +0900
@@ -335,6 +335,8 @@ struct zone {
unsigned long spanned_pages; /* total size, including holes */
unsigned long present_pages; /* amount of memory (excluding holes) */
+ atomic_t nr_reclaimers;
+ wait_queue_head_t reclaim_throttle_waitq;
/*
* rarely used fields:
*/
Index: b/mm/page_alloc.c
===================================================================
--- a/mm/page_alloc.c 2008-03-14 21:52:19.000000000 +0900
+++ b/mm/page_alloc.c 2008-03-14 21:58:52.000000000 +0900
@@ -3473,6 +3473,10 @@ static void __paginginit free_area_init_
zone->nr_scan_inactive = 0;
zap_zone_vm_stats(zone);
zone->flags = 0;
+
+	atomic_set(&zone->nr_reclaimers, 0);
+ init_waitqueue_head(&zone->reclaim_throttle_waitq);
+
if (!size)
continue;
Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c 2008-03-14 21:52:18.000000000 +0900
+++ b/mm/vmscan.c 2008-03-21 22:35:14.000000000 +0900
@@ -1190,13 +1190,30 @@ static void shrink_active_list(unsigned
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static unsigned long shrink_zone(int priority, struct zone *zone,
- struct scan_control *sc)
+static int shrink_zone(int priority, struct zone *zone,
+ struct scan_control *sc, unsigned long *ret_reclaimed)
{
unsigned long nr_active;
unsigned long nr_inactive;
unsigned long nr_to_scan;
unsigned long nr_reclaimed = 0;
+ unsigned long start_time = jiffies;
+ int ret = 0;
+
+ wait_event(zone->reclaim_throttle_waitq,
+ atomic_add_unless(&zone->nr_reclaimers, 1,
+ CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));
+
+	/* is more reclaim still needed? */
+ if (scan_global_lru(sc) &&
+ !(current->flags & PF_KSWAPD) &&
+ time_after(jiffies, start_time + HZ/10)) {
+ if (zone_watermark_ok(zone, sc->order, 4*zone->pages_high,
+ MAX_NR_ZONES-1, 0)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
if (scan_global_lru(sc)) {
/*
@@ -1248,9 +1265,13 @@ static unsigned long shrink_zone(int pri
sc);
}
}
-
+out:
+ *ret_reclaimed += nr_reclaimed;
+ atomic_dec(&zone->nr_reclaimers);
+ wake_up(&zone->reclaim_throttle_waitq);
throttle_vm_writeout(sc->gfp_mask);
- return nr_reclaimed;
+
+ return ret;
}
/*
@@ -1269,13 +1290,13 @@ static unsigned long shrink_zone(int pri
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
- struct scan_control *sc)
+static int shrink_zones(int priority, struct zonelist *zonelist,
+ struct scan_control *sc, unsigned long *ret_reclaimed)
{
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
- unsigned long nr_reclaimed = 0;
struct zoneref *z;
struct zone *zone;
+ int ret;
sc->all_unreclaimable = 1;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
@@ -1304,10 +1325,14 @@ static unsigned long shrink_zones(int pr
priority);
}
- nr_reclaimed += shrink_zone(priority, zone, sc);
+ ret = shrink_zone(priority, zone, sc, ret_reclaimed);
+ if (ret == -EAGAIN)
+ goto out;
}
+ ret = 0;
- return nr_reclaimed;
+out:
+ return ret;
}
/*
@@ -1335,6 +1360,8 @@ static unsigned long do_try_to_free_page
struct zoneref *z;
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ unsigned long last_check_time = jiffies;
+ int err;
if (scan_global_lru(sc))
count_vm_event(ALLOCSTALL);
@@ -1357,7 +1384,12 @@ static unsigned long do_try_to_free_page
sc->nr_io_pages = 0;
if (!priority)
disable_swap_token();
- nr_reclaimed += shrink_zones(priority, zonelist, sc);
+ err = shrink_zones(priority, zonelist, sc, &nr_reclaimed);
+ if (err == -EAGAIN) {
+ ret = 1;
+ goto out;
+ }
+
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -1390,8 +1422,23 @@ static unsigned long do_try_to_free_page
/* Take a nap, wait for some writeback to complete */
if (sc->nr_scanned && priority < DEF_PRIORITY - 2 &&
- sc->nr_io_pages > sc->swap_cluster_max)
+ sc->nr_io_pages > sc->swap_cluster_max)
congestion_wait(WRITE, HZ/10);
+
+ if (scan_global_lru(sc) &&
+ time_after(jiffies, last_check_time+HZ)) {
+ last_check_time = jiffies;
+
+	/* is more reclaim still needed? */
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ if (zone_watermark_ok(zone, sc->order,
+ 4 * zone->pages_high,
+ high_zoneidx, 0)) {
+ ret = 1;
+ goto out;
+ }
+ }
+ }
}
/* top priority shrink_caches still had more to do? don't OOM, then */
if (!sc->all_unreclaimable && scan_global_lru(sc))
@@ -1589,7 +1636,7 @@ loop_again:
*/
if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
end_zone, 0))
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc, &nr_reclaimed);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
@@ -2034,7 +2081,7 @@ static int __zone_reclaim(struct zone *z
priority = ZONE_RECLAIM_PRIORITY;
do {
note_zone_scanning_priority(zone, priority);
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc, &nr_reclaimed);
priority--;
} while (priority >= 0 && nr_reclaimed < nr_pages);
}
Index: b/mm/Kconfig
===================================================================
--- a/mm/Kconfig 2008-03-14 21:52:16.000000000 +0900
+++ b/mm/Kconfig 2008-03-14 22:25:02.000000000 +0900
@@ -193,3 +193,13 @@ config NR_QUICK
config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
+
+config NR_MAX_RECLAIM_TASKS_PER_ZONE
+ int "maximum number of reclaiming tasks at the same time"
+ default 3
+ help
+	  This value determines the number of tasks which can do page reclaim
+	  in a zone simultaneously.  If this is too large, performance under
+	  heavy memory pressure will degrade.
+	  If unsure, use the default.
+

* [for -mm][PATCH][2/2] page reclaim throttle take3
From: KOSAKI Motohiro @ 2008-03-22 10:51 UTC
To: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, Balbir Singh,
Rik van Riel, David Rientjes
Cc: kosaki.motohiro
This patch adds a sysctl to change the maximum number of reclaiming
tasks at run time.
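With this applied, the limit can be changed at run time, e.g.
(the value 8 is only an example):

	# sysctl -w vm.vm_max_nr_task_per_zone=8

or

	# echo 8 > /proc/sys/vm/vm_max_nr_task_per_zone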
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
include/linux/swap.h | 2 ++
kernel/sysctl.c | 9 +++++++++
mm/vmscan.c | 7 +++++--
3 files changed, 16 insertions(+), 2 deletions(-)
Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c 2008-03-21 22:36:10.000000000 +0900
+++ b/mm/vmscan.c 2008-03-21 22:36:12.000000000 +0900
@@ -127,6 +127,8 @@ long vm_total_pages; /* The total number
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
+int vm_max_nr_task_per_zone = CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE;
+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#define scan_global_lru(sc) (!(sc)->mem_cgroup)
#else
@@ -1202,7 +1204,7 @@ static int shrink_zone(int priority, str
wait_event(zone->reclaim_throttle_waitq,
atomic_add_unless(&zone->nr_reclaimers, 1,
- CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));
+ vm_max_nr_task_per_zone));
/* is more reclaim still needed? */
if (scan_global_lru(sc) &&
@@ -1430,7 +1432,8 @@ static unsigned long do_try_to_free_page
last_check_time = jiffies;
/* is more reclaim still needed? */
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ for_each_zone_zonelist(zone, z, zonelist,
+ high_zoneidx) {
if (zone_watermark_ok(zone, sc->order,
4 * zone->pages_high,
high_zoneidx, 0)) {
Index: b/include/linux/swap.h
===================================================================
--- a/include/linux/swap.h 2008-03-14 21:51:36.000000000 +0900
+++ b/include/linux/swap.h 2008-03-14 22:31:35.000000000 +0900
@@ -206,6 +206,8 @@ static inline int zone_reclaim(struct zo
extern int kswapd_run(int nid);
+extern int vm_max_nr_task_per_zone;
+
#ifdef CONFIG_MMU
/* linux/mm/shmem.c */
extern int shmem_unuse(swp_entry_t entry, struct page *page);
Index: b/kernel/sysctl.c
===================================================================
--- a/kernel/sysctl.c 2008-03-14 22:23:09.000000000 +0900
+++ b/kernel/sysctl.c 2008-03-14 22:32:08.000000000 +0900
@@ -1141,6 +1141,15 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "vm_max_nr_task_per_zone",
+ .data = &vm_max_nr_task_per_zone,
+ .maxlen = sizeof(vm_max_nr_task_per_zone),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt

* Re: [for -mm][PATCH][1/2] page reclaim throttle take3
From: Rik van Riel @ 2008-03-22 14:55 UTC
To: KOSAKI Motohiro
Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, Balbir Singh, David Rientjes
On Sat, 22 Mar 2008 19:45:54 +0900
KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:
> + wait_event(zone->reclaim_throttle_waitq,
> + atomic_add_unless(&zone->nr_reclaimers, 1,
> + CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));
I like your patch, but can see one potential problem. Sometimes
tasks that go into page reclaim with GFP_HIGHUSER end up recursing
back into page reclaim without __GFP_FS and/or __GFP_IO.
In that scenario, a task could end up waiting on itself and
deadlocking.
Maybe we should only let tasks with __GFP_FS, __GFP_IO and other
"I can do everything" flags wait on this waitqueue, letting the
tasks that cannot do IO (and are just here to reclaim clean pages)
bypass this waitqueue.
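Schematically, something like this at the top of shrink_zone()
(just a sketch of the idea, not tested):

	/* only fully capable reclaimers wait for a slot; GFP_NOFS
	 * and GFP_NOIO callers can only reclaim clean pages, so
	 * let them through instead of risking a deadlock */
	if ((sc->gfp_mask & (__GFP_FS|__GFP_IO)) == (__GFP_FS|__GFP_IO))
		wait_event(zone->reclaim_throttle_waitq,
			   atomic_add_unless(&zone->nr_reclaimers, 1,
					     CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));
	else
		atomic_inc(&zone->nr_reclaimers);

The unconditional atomic_inc() keeps the count balanced with the
atomic_dec() on the exit path, at the price of sometimes exceeding
the limit.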
--
All rights reversed.
* Re: [for -mm][PATCH][1/2] page reclaim throttle take3
From: KOSAKI Motohiro @ 2008-03-22 16:01 UTC
To: Rik van Riel
Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, Balbir Singh, David Rientjes
Hi Rik,
> On Sat, 22 Mar 2008 19:45:54 +0900
> KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:
>
> > + wait_event(zone->reclaim_throttle_waitq,
> > + atomic_add_unless(&zone->nr_reclaimers, 1,
> > + CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));
>
> I like your patch, but can see one potential problem. Sometimes
> tasks that go into page reclaim with GFP_HIGHUSER end up recursing
> back into page reclaim without __GFP_FS and/or __GFP_IO.
>
> In that scenario, a task could end up waiting on itself and
> deadlocking.
Interesting point.
Unfortunately I don't understand it yet, sorry ;)

I think recursive reclaim doesn't happen, because the call graph
at that time looks like the following:
__alloc_pages_internal (turn on PF_MEMALLOC)
+- try_to_free_pages
+- (skip)
+- pageout
+- (skip)
+- __alloc_pages_internal
In the second __alloc_pages_internal, PF_MEMALLOC is already set,
so try_to_free_pages is bypassed.

Am I misunderstanding anything?
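For reference, the check I mean in __alloc_pages_internal looks
roughly like this (simplified from memory, not the exact code):

	p = current;
	if ((p->flags & PF_MEMALLOC) && !in_interrupt()) {
		/* recursing allocation from inside reclaim: take
		 * pages from the reserves, ignore the watermarks,
		 * and do not enter try_to_free_pages() again */
		goto nofail_alloc;
	}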

* Re: [for -mm][PATCH][1/2] page reclaim throttle take3
From: Rik van Riel @ 2008-03-22 16:15 UTC
To: KOSAKI Motohiro
Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, Balbir Singh, David Rientjes
On Sun, 23 Mar 2008 01:01:04 +0900
"KOSAKI Motohiro" <kosaki.motohiro@jp.fujitsu.com> wrote:
> Hi Rik,
>
> > On Sat, 22 Mar 2008 19:45:54 +0900
> > KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:
> >
> > > + wait_event(zone->reclaim_throttle_waitq,
> > > + atomic_add_unless(&zone->nr_reclaimers, 1,
> > > + CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));
> >
> > I like your patch, but can see one potential problem. Sometimes
> > tasks that go into page reclaim with GFP_HIGHUSER end up recursing
> > back into page reclaim without __GFP_FS and/or __GFP_IO.
> >
> > In that scenario, a task could end up waiting on itself and
> > deadlocking.
>
> Interesting point.
> Unfortunately I don't understand it yet, sorry ;)
>
> I think recursive reclaim doesn't happen, because the call graph
> at that time looks like the following:
>
> __alloc_pages_internal (turn on PF_MEMALLOC)
> +- try_to_free_pages
> +- (skip)
> +- pageout
> +- (skip)
> +- __alloc_pages_internal
>
> In the second __alloc_pages_internal, PF_MEMALLOC is already set,
> so try_to_free_pages is bypassed.
>
> Am I misunderstanding anything?
Look at free_more_memory() in fs/buffer.c.
There used to be a path through which try_to_free_pages can
recurse once. I am not sure if it is still there.
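From memory it is something like this (details may have changed):

	static void free_more_memory(void)
	{
		struct zone **zones;
		pg_data_t *pgdat;

		wakeup_pdflush(1024);
		yield();

		for_each_online_pgdat(pgdat) {
			zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
			if (*zones)
				try_to_free_pages(zones, 0, GFP_NOFS);
		}
	}

Note the GFP_NOFS: a task that gets here from inside reclaim would
re-enter shrink_zone() without __GFP_FS and sleep on the same
waitqueue while still holding one of its slots.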
Even if it is not, having !__GFP_IO or !__GFP_FS tasks wait
on tasks that have those GFP flags is a problem, because
the tasks that have __GFP_IO and __GFP_FS set may be waiting
on some lock that the task sleeping on the waitqueue holds...
--
All rights reversed.
* Re: [for -mm][PATCH][1/2] page reclaim throttle take3
From: KOSAKI Motohiro @ 2008-03-22 16:43 UTC
To: Rik van Riel
Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, Balbir Singh, David Rientjes
> > __alloc_pages_internal (turn on PF_MEMALLOC)
> > +- try_to_free_pages
> > +- (skip)
> > +- pageout
> > +- (skip)
> > +- __alloc_pages_internal
> >
> > In the second __alloc_pages_internal, PF_MEMALLOC is already set,
> > so try_to_free_pages is bypassed.
> >
> > Am I misunderstanding anything?
>
> Look at free_more_memory() in fs/buffer.c.
Oh, I see.
Thanks.

I will add a mechanism to avoid the recursive throttling problem.
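One possible shape, combining your bypass idea with a PF_MEMALLOC
check (just a sketch of my current idea, untested):

	int may_throttle = !(current->flags & PF_MEMALLOC) &&
			   (sc->gfp_mask & __GFP_FS) &&
			   (sc->gfp_mask & __GFP_IO);

	if (may_throttle)
		wait_event(zone->reclaim_throttle_waitq,
			   atomic_add_unless(&zone->nr_reclaimers, 1,
					     vm_max_nr_task_per_zone));
	else
		atomic_inc(&zone->nr_reclaimers);

A task that is already reclaiming or that cannot do I/O never sleeps
on the throttle, so it cannot wait on itself or on a lock holder;
the atomic_dec()/wake_up() on the exit path stays unchanged.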