* [for -mm][PATCH][1/2] page reclaim throttle take3
From: KOSAKI Motohiro @ 2008-03-22 10:45 UTC
To: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, Balbir Singh,
Rik van Riel, David Rientjes
Cc: kosaki.motohiro
Hi,

this is the latest version of the page reclaim throttle patch series.
I will explain the performance results in a separate mail.
(I am still working on increasing the coverage of the measurements.)
At least in some measurements, considerably good results have come out.
---------------------------------------------------------------------
changelog
========================================
v2 -> v3:
 o use wake_up() instead of wake_up_all()
 o the maximum number of reclaimers can be changed via a Kconfig
   option and a sysctl
 o some cleanups

v1 -> v2:
 o make the throttle per-zone
description
========================================
The current VM implementation has no limit on the number of parallel
reclaimers. Under a heavy workload this causes two bad things:
 - heavy lock contention
 - unnecessary swap out

In Dec 2007, KAMEZAWA Hiroyuki proposed a page reclaim throttle patch
and explained that it improves reclaim time:
http://marc.info/?l=linux-mm&m=119667465917215&w=2
Unfortunately, it only works for memory cgroup reclaim.
This time I implemented it again with support for global reclaim, and
measured it.
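The throttle itself is a small counting gate built from wait_event()
and atomic_add_unless(); stripped of the surrounding code, the
enter/exit pattern in shrink_zone() below is:

	/* enter: sleep until one of the per-zone slots is free */
	wait_event(zone->reclaim_throttle_waitq,
		   atomic_add_unless(&zone->nr_reclaimers, 1,
				     CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));

	/* ... shrink the zone ... */

	/* exit: release the slot and wake up one waiter */
	atomic_dec(&zone->nr_reclaimers);
	wake_up(&zone->reclaim_throttle_waitq);

atomic_add_unless() only increments nr_reclaimers while it is below
the limit, so at most that many tasks can reclaim from one zone at
the same time.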
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
include/linux/mmzone.h | 2 +
mm/Kconfig | 10 ++++++
mm/page_alloc.c | 4 ++
mm/vmscan.c | 73 ++++++++++++++++++++++++++++++++++++++++---------
4 files changed, 76 insertions(+), 13 deletions(-)
Index: b/include/linux/mmzone.h
===================================================================
--- a/include/linux/mmzone.h 2008-03-14 21:51:36.000000000 +0900
+++ b/include/linux/mmzone.h 2008-03-14 21:58:52.000000000 +0900
@@ -335,6 +335,8 @@ struct zone {
unsigned long spanned_pages; /* total size, including holes */
unsigned long present_pages; /* amount of memory (excluding holes) */
+ atomic_t nr_reclaimers;
+ wait_queue_head_t reclaim_throttle_waitq;
/*
* rarely used fields:
*/
Index: b/mm/page_alloc.c
===================================================================
--- a/mm/page_alloc.c 2008-03-14 21:52:19.000000000 +0900
+++ b/mm/page_alloc.c 2008-03-14 21:58:52.000000000 +0900
@@ -3473,6 +3473,10 @@ static void __paginginit free_area_init_
zone->nr_scan_inactive = 0;
zap_zone_vm_stats(zone);
zone->flags = 0;
+
+	atomic_set(&zone->nr_reclaimers, 0);
+ init_waitqueue_head(&zone->reclaim_throttle_waitq);
+
if (!size)
continue;
Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c 2008-03-14 21:52:18.000000000 +0900
+++ b/mm/vmscan.c 2008-03-21 22:35:14.000000000 +0900
@@ -1190,13 +1190,30 @@ static void shrink_active_list(unsigned
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static unsigned long shrink_zone(int priority, struct zone *zone,
- struct scan_control *sc)
+static int shrink_zone(int priority, struct zone *zone,
+ struct scan_control *sc, unsigned long *ret_reclaimed)
{
unsigned long nr_active;
unsigned long nr_inactive;
unsigned long nr_to_scan;
unsigned long nr_reclaimed = 0;
+ unsigned long start_time = jiffies;
+ int ret = 0;
+
+ wait_event(zone->reclaim_throttle_waitq,
+ atomic_add_unless(&zone->nr_reclaimers, 1,
+ CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));
+
+	/* is more reclaim still needed? */
+ if (scan_global_lru(sc) &&
+ !(current->flags & PF_KSWAPD) &&
+ time_after(jiffies, start_time + HZ/10)) {
+ if (zone_watermark_ok(zone, sc->order, 4*zone->pages_high,
+ MAX_NR_ZONES-1, 0)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
if (scan_global_lru(sc)) {
/*
@@ -1248,9 +1265,13 @@ static unsigned long shrink_zone(int pri
sc);
}
}
-
+out:
+ *ret_reclaimed += nr_reclaimed;
+ atomic_dec(&zone->nr_reclaimers);
+ wake_up(&zone->reclaim_throttle_waitq);
throttle_vm_writeout(sc->gfp_mask);
- return nr_reclaimed;
+
+ return ret;
}
/*
@@ -1269,13 +1290,13 @@ static unsigned long shrink_zone(int pri
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
- struct scan_control *sc)
+static int shrink_zones(int priority, struct zonelist *zonelist,
+ struct scan_control *sc, unsigned long *ret_reclaimed)
{
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
- unsigned long nr_reclaimed = 0;
struct zoneref *z;
struct zone *zone;
+ int ret;
sc->all_unreclaimable = 1;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
@@ -1304,10 +1325,14 @@ static unsigned long shrink_zones(int pr
priority);
}
- nr_reclaimed += shrink_zone(priority, zone, sc);
+ ret = shrink_zone(priority, zone, sc, ret_reclaimed);
+ if (ret == -EAGAIN)
+ goto out;
}
+ ret = 0;
- return nr_reclaimed;
+out:
+ return ret;
}
/*
@@ -1335,6 +1360,8 @@ static unsigned long do_try_to_free_page
struct zoneref *z;
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ unsigned long last_check_time = jiffies;
+ int err;
if (scan_global_lru(sc))
count_vm_event(ALLOCSTALL);
@@ -1357,7 +1384,12 @@ static unsigned long do_try_to_free_page
sc->nr_io_pages = 0;
if (!priority)
disable_swap_token();
- nr_reclaimed += shrink_zones(priority, zonelist, sc);
+ err = shrink_zones(priority, zonelist, sc, &nr_reclaimed);
+ if (err == -EAGAIN) {
+ ret = 1;
+ goto out;
+ }
+
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -1390,8 +1422,23 @@ static unsigned long do_try_to_free_page
/* Take a nap, wait for some writeback to complete */
if (sc->nr_scanned && priority < DEF_PRIORITY - 2 &&
- sc->nr_io_pages > sc->swap_cluster_max)
+ sc->nr_io_pages > sc->swap_cluster_max)
congestion_wait(WRITE, HZ/10);
+
+ if (scan_global_lru(sc) &&
+ time_after(jiffies, last_check_time+HZ)) {
+ last_check_time = jiffies;
+
+	/* is more reclaim still needed? */
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ if (zone_watermark_ok(zone, sc->order,
+ 4 * zone->pages_high,
+ high_zoneidx, 0)) {
+ ret = 1;
+ goto out;
+ }
+ }
+ }
}
/* top priority shrink_caches still had more to do? don't OOM, then */
if (!sc->all_unreclaimable && scan_global_lru(sc))
@@ -1589,7 +1636,7 @@ loop_again:
*/
if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
end_zone, 0))
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc, &nr_reclaimed);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
@@ -2034,7 +2081,7 @@ static int __zone_reclaim(struct zone *z
priority = ZONE_RECLAIM_PRIORITY;
do {
note_zone_scanning_priority(zone, priority);
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc, &nr_reclaimed);
priority--;
} while (priority >= 0 && nr_reclaimed < nr_pages);
}
Index: b/mm/Kconfig
===================================================================
--- a/mm/Kconfig 2008-03-14 21:52:16.000000000 +0900
+++ b/mm/Kconfig 2008-03-14 22:25:02.000000000 +0900
@@ -193,3 +193,13 @@ config NR_QUICK
config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
+
+config NR_MAX_RECLAIM_TASKS_PER_ZONE
+ int "maximum number of reclaiming tasks at the same time"
+ default 3
+ help
+	  This value determines the number of tasks which can do page reclaim
+	  in a zone simultaneously.  If this is too large, performance under
+	  heavy memory pressure will degrade.
+	  If unsure, use the default.
+

* [for -mm][PATCH][2/2] page reclaim throttle take3
From: KOSAKI Motohiro @ 2008-03-22 10:51 UTC
To: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, Balbir Singh,
Rik van Riel, David Rientjes
Cc: kosaki.motohiro
This patch adds a sysctl to change the maximum number of reclaiming
tasks at run time.
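With this applied, the limit can be changed at run time, e.g.
(the value 8 is only an example):

	# sysctl -w vm.vm_max_nr_task_per_zone=8

or

	# echo 8 > /proc/sys/vm/vm_max_nr_task_per_zone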
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
include/linux/swap.h | 2 ++
kernel/sysctl.c | 9 +++++++++
mm/vmscan.c | 7 +++++--
3 files changed, 16 insertions(+), 2 deletions(-)
Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c 2008-03-21 22:36:10.000000000 +0900
+++ b/mm/vmscan.c 2008-03-21 22:36:12.000000000 +0900
@@ -127,6 +127,8 @@ long vm_total_pages; /* The total number
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
+int vm_max_nr_task_per_zone = CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE;
+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#define scan_global_lru(sc) (!(sc)->mem_cgroup)
#else
@@ -1202,7 +1204,7 @@ static int shrink_zone(int priority, str
wait_event(zone->reclaim_throttle_waitq,
atomic_add_unless(&zone->nr_reclaimers, 1,
- CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));
+ vm_max_nr_task_per_zone));
/* is more reclaim still needed? */
if (scan_global_lru(sc) &&
@@ -1430,7 +1432,8 @@ static unsigned long do_try_to_free_page
last_check_time = jiffies;
/* is more reclaim still needed? */
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ for_each_zone_zonelist(zone, z, zonelist,
+ high_zoneidx) {
if (zone_watermark_ok(zone, sc->order,
4 * zone->pages_high,
high_zoneidx, 0)) {
Index: b/include/linux/swap.h
===================================================================
--- a/include/linux/swap.h 2008-03-14 21:51:36.000000000 +0900
+++ b/include/linux/swap.h 2008-03-14 22:31:35.000000000 +0900
@@ -206,6 +206,8 @@ static inline int zone_reclaim(struct zo
extern int kswapd_run(int nid);
+extern int vm_max_nr_task_per_zone;
+
#ifdef CONFIG_MMU
/* linux/mm/shmem.c */
extern int shmem_unuse(swp_entry_t entry, struct page *page);
Index: b/kernel/sysctl.c
===================================================================
--- a/kernel/sysctl.c 2008-03-14 22:23:09.000000000 +0900
+++ b/kernel/sysctl.c 2008-03-14 22:32:08.000000000 +0900
@@ -1141,6 +1141,15 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "vm_max_nr_task_per_zone",
+ .data = &vm_max_nr_task_per_zone,
+ .maxlen = sizeof(vm_max_nr_task_per_zone),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ .strategy = &sysctl_intvec,
+ },
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt

* Re: [for -mm][PATCH][1/2] page reclaim throttle take3
From: Rik van Riel @ 2008-03-22 14:55 UTC
To: KOSAKI Motohiro
Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, Balbir Singh, David Rientjes
On Sat, 22 Mar 2008 19:45:54 +0900
KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:
> + wait_event(zone->reclaim_throttle_waitq,
> + atomic_add_unless(&zone->nr_reclaimers, 1,
> + CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));
I like your patch, but can see one potential problem. Sometimes
tasks that go into page reclaim with GFP_HIGHUSER end up recursing
back into page reclaim without __GFP_FS and/or __GFP_IO.
In that scenario, a task could end up waiting on itself and
deadlocking.
Maybe we should only let tasks with __GFP_FS, __GFP_IO and other
"I can do everything" flags wait on this waitqueue, letting the
tasks that cannot do IO (and are just here to reclaim clean pages)
bypass this waitqueue.
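Schematically, something like this at the top of shrink_zone()
(just a sketch of the idea, not tested):

	/* only fully capable reclaimers wait for a slot; GFP_NOFS
	 * and GFP_NOIO callers can only reclaim clean pages, so
	 * let them through instead of risking a deadlock */
	if ((sc->gfp_mask & (__GFP_FS|__GFP_IO)) == (__GFP_FS|__GFP_IO))
		wait_event(zone->reclaim_throttle_waitq,
			   atomic_add_unless(&zone->nr_reclaimers, 1,
					     CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));
	else
		atomic_inc(&zone->nr_reclaimers);

The unconditional atomic_inc() keeps the count balanced with the
atomic_dec() on the exit path, at the price of sometimes exceeding
the limit.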
--
All rights reversed.
* Re: [for -mm][PATCH][1/2] page reclaim throttle take3
From: KOSAKI Motohiro @ 2008-03-22 16:01 UTC
To: Rik van Riel
Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, Balbir Singh, David Rientjes
Hi Rik,
> On Sat, 22 Mar 2008 19:45:54 +0900
> KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:
>
> > + wait_event(zone->reclaim_throttle_waitq,
> > + atomic_add_unless(&zone->nr_reclaimers, 1,
> > + CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));
>
> I like your patch, but can see one potential problem. Sometimes
> tasks that go into page reclaim with GFP_HIGHUSER end up recursing
> back into page reclaim without __GFP_FS and/or __GFP_IO.
>
> In that scenario, a task could end up waiting on itself and
> deadlocking.
Interesting point.
Unfortunately I don't understand it yet, sorry ;)

I think recursive reclaim doesn't happen, because the call graph
at that time looks like the following:
__alloc_pages_internal (turn on PF_MEMALLOC)
+- try_to_free_pages
+- (skip)
+- pageout
+- (skip)
+- __alloc_pages_internal
In the second __alloc_pages_internal, PF_MEMALLOC is already set,
so try_to_free_pages is bypassed.

Am I misunderstanding anything?
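For reference, the check I mean in __alloc_pages_internal looks
roughly like this (simplified from memory, not the exact code):

	p = current;
	if ((p->flags & PF_MEMALLOC) && !in_interrupt()) {
		/* recursing allocation from inside reclaim: take
		 * pages from the reserves, ignore the watermarks,
		 * and do not enter try_to_free_pages() again */
		goto nofail_alloc;
	}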

* Re: [for -mm][PATCH][1/2] page reclaim throttle take3
From: Rik van Riel @ 2008-03-22 16:15 UTC
To: KOSAKI Motohiro
Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, Balbir Singh, David Rientjes
On Sun, 23 Mar 2008 01:01:04 +0900
"KOSAKI Motohiro" <kosaki.motohiro@jp.fujitsu.com> wrote:
> Hi Rik,
>
> > On Sat, 22 Mar 2008 19:45:54 +0900
> > KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:
> >
> > > + wait_event(zone->reclaim_throttle_waitq,
> > > + atomic_add_unless(&zone->nr_reclaimers, 1,
> > > + CONFIG_NR_MAX_RECLAIM_TASKS_PER_ZONE));
> >
> > I like your patch, but can see one potential problem. Sometimes
> > tasks that go into page reclaim with GFP_HIGHUSER end up recursing
> > back into page reclaim without __GFP_FS and/or __GFP_IO.
> >
> > In that scenario, a task could end up waiting on itself and
> > deadlocking.
>
> Interesting point.
> Unfortunately I don't understand it yet, sorry ;)
>
> I think recursive reclaim doesn't happen, because the call graph
> at that time looks like the following:
>
> __alloc_pages_internal (turn on PF_MEMALLOC)
> +- try_to_free_pages
> +- (skip)
> +- pageout
> +- (skip)
> +- __alloc_pages_internal
>
> In the second __alloc_pages_internal, PF_MEMALLOC is already set,
> so try_to_free_pages is bypassed.
>
> Am I misunderstanding anything?
Look at free_more_memory() in fs/buffer.c.
There used to be a path through which try_to_free_pages can
recurse once. I am not sure if it is still there.
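From memory it is something like this (details may have changed):

	static void free_more_memory(void)
	{
		struct zone **zones;
		pg_data_t *pgdat;

		wakeup_pdflush(1024);
		yield();

		for_each_online_pgdat(pgdat) {
			zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
			if (*zones)
				try_to_free_pages(zones, 0, GFP_NOFS);
		}
	}

Note the GFP_NOFS: a task that gets here from inside reclaim would
re-enter shrink_zone() without __GFP_FS and sleep on the same
waitqueue while still holding one of its slots.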
Even if it is not, having !__GFP_IO or !__GFP_FS tasks wait
on tasks that have those GFP flags is a problem, because
the tasks that have __GFP_IO and __GFP_FS set may be waiting
on some lock that the task sleeping on the waitqueue holds...
--
All rights reversed.
* Re: [for -mm][PATCH][1/2] page reclaim throttle take3
From: KOSAKI Motohiro @ 2008-03-22 16:43 UTC
To: Rik van Riel
Cc: KAMEZAWA Hiroyuki, Andrew Morton, linux-mm, Balbir Singh, David Rientjes
> > __alloc_pages_internal (turn on PF_MEMALLOC)
> > +- try_to_free_pages
> > +- (skip)
> > +- pageout
> > +- (skip)
> > +- __alloc_pages_internal
> >
> > In the second __alloc_pages_internal, PF_MEMALLOC is already set,
> > so try_to_free_pages is bypassed.
> >
> > Am I misunderstanding anything?
>
> Look at free_more_memory() in fs/buffer.c.
Oh, I see.
Thanks.

I will add a mechanism to avoid the recursive throttling problem.
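One possible shape, combining your bypass idea with a PF_MEMALLOC
check (just a sketch of my current idea, untested):

	int may_throttle = !(current->flags & PF_MEMALLOC) &&
			   (sc->gfp_mask & __GFP_FS) &&
			   (sc->gfp_mask & __GFP_IO);

	if (may_throttle)
		wait_event(zone->reclaim_throttle_waitq,
			   atomic_add_unless(&zone->nr_reclaimers, 1,
					     vm_max_nr_task_per_zone));
	else
		atomic_inc(&zone->nr_reclaimers);

A task that is already reclaiming or that cannot do I/O never sleeps
on the throttle, so it cannot wait on itself or on a lock holder;
the atomic_dec()/wake_up() on the exit path stays unchanged.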