Get rid of scan

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* Get rid of scan_control
@ 2006-02-10  5:02 Christoph Lameter
  2006-02-11  4:53 ` Marcelo Tosatti
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Lameter @ 2006-02-10  5:02 UTC (permalink / raw)
  To: nickpiggin; +Cc: akpm

This is done through a variety of measures:

1. nr_reclaimed is the return value of functions and each function
   does the summing of the reclaimed pages on its own.

2. nr_scanned is passed as a reference parameter (sigh... the only leftover)

3. nr_mapped is calculated on each invocation of refill_inactive_list. 
   This is not that optimal but then swapping is not that performance 
   critical.

4. gfp_mask is passed as a parameter. OR flags to gfp_mask for may_swap 
   and may_writepage.

5. Pass swap_cluster_max as a parameter

Most of the parameters passed through scan_control become local variables.
Therefore the compilers are able to generate better code.

And we do no longer have the problem of initializing scan control the
right way.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.16-rc2-mm1/mm/vmscan.c
===================================================================
--- linux-2.6.16-rc2-mm1.orig/mm/vmscan.c	2006-02-09 20:04:37.000000000 -0800
+++ linux-2.6.16-rc2-mm1/mm/vmscan.c	2006-02-09 20:26:32.000000000 -0800
@@ -51,29 +51,12 @@ typedef enum {
 	PAGE_CLEAN,
 } pageout_t;
 
-struct scan_control {
-	/* Incremented by the number of inactive pages that were scanned */
-	unsigned long nr_scanned;
-
-	/* Incremented by the number of pages reclaimed */
-	unsigned long nr_reclaimed;
-
-	unsigned long nr_mapped;	/* From page_state */
-
-	/* This context's GFP mask */
-	gfp_t gfp_mask;
-
-	int may_writepage;
-
-	/* Can pages be swapped as part of reclaim? */
-	int may_swap;
-
-	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
-	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
-	 * In this context, it doesn't matter that we scan the
-	 * whole list at once. */
-	int swap_cluster_max;
-};
+/*
+ * Some additional flags to be added to gfp_t in the context
+ * of swap processing.
+ */
+#define MAY_SWAP      (1 << __GFP_BITS_SHIFT)
+#define MAY_WRITEPAGE (1 << (__GFP_BITS_SHIFT + 1))
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 
@@ -409,12 +392,14 @@ cannot_free:
 /*
  * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
  */
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static int shrink_list(struct list_head *page_list, gfp_t gfp_mask,
+				unsigned long *total_scanned)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
 	int reclaimed = 0;
+	int nr_scanned = 0;
 
 	cond_resched();
 
@@ -435,10 +420,10 @@ static int shrink_list(struct list_head 
 
 		BUG_ON(PageActive(page));
 
-		sc->nr_scanned++;
+		nr_scanned++;
 		/* Double the slab pressure for mapped and swapcache pages */
 		if (page_mapped(page) || PageSwapCache(page))
-			sc->nr_scanned++;
+			nr_scanned++;
 
 		if (PageWriteback(page))
 			goto keep_locked;
@@ -454,7 +439,7 @@ static int shrink_list(struct list_head 
 		 * Try to allocate it some swap space here.
 		 */
 		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!sc->may_swap)
+			if (!(gfp_mask & MAY_SWAP))
 				goto keep_locked;
 			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
@@ -462,8 +447,8 @@ static int shrink_list(struct list_head 
 #endif /* CONFIG_SWAP */
 
 		mapping = page_mapping(page);
-		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
-			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+		may_enter_fs = (gfp_mask & __GFP_FS) ||
+			(PageSwapCache(page) && (gfp_mask & __GFP_IO));
 
 		/*
 		 * The page is mapped into the page tables of one or more
@@ -473,7 +458,7 @@ static int shrink_list(struct list_head 
 			/*
 			 * No unmapping if we do not swap
 			 */
-			if (!sc->may_swap)
+			if (!(gfp_mask & MAY_SWAP))
 				goto keep_locked;
 
 			switch (try_to_unmap(page, 0)) {
@@ -491,7 +476,7 @@ static int shrink_list(struct list_head 
 				goto keep_locked;
 			if (!may_enter_fs)
 				goto keep_locked;
-			if (!sc->may_writepage)
+			if (!(gfp_mask & MAY_WRITEPAGE))
 				goto keep_locked;
 
 			/* Page is dirty, try to write it out here */
@@ -539,7 +524,7 @@ static int shrink_list(struct list_head 
 		 * Otherwise, leave the page on the LRU so it is swappable.
 		 */
 		if (PagePrivate(page)) {
-			if (!try_to_release_page(page, sc->gfp_mask))
+			if (!try_to_release_page(page, gfp_mask))
 				goto activate_locked;
 			/*
 			 * file system may manually remove page from the page
@@ -573,7 +558,7 @@ keep:
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
 	mod_page_state(pgactivate, pgactivate);
-	sc->nr_reclaimed += reclaimed;
+	*total_scanned += nr_scanned;
 	return reclaimed;
 }
 
@@ -1085,10 +1070,12 @@ static int isolate_lru_pages(int nr_to_s
 /*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
-static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *sc)
+static int shrink_cache(int max_scan, struct zone *zone, gfp_t gfp_mask,
+			int swap_cluster_max, unsigned long *nr_scanned)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
+	int nr_reclaimed = 0;
 
 	pagevec_init(&pvec, 1);
 
@@ -1100,7 +1087,7 @@ static void shrink_cache(int max_scan, s
 		int nr_scan;
 		int nr_freed;
 
-		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
+		nr_taken = isolate_lru_pages(swap_cluster_max,
 					     &zone->inactive_list,
 					     &page_list, &nr_scan);
 		zone->nr_inactive -= nr_taken;
@@ -1111,7 +1098,7 @@ static void shrink_cache(int max_scan, s
 			goto done;
 
 		max_scan -= nr_scan;
-		nr_freed = shrink_list(&page_list, sc);
+		nr_reclaimed += nr_freed = shrink_list(&page_list, gfp_mask, nr_scanned);
 
 		local_irq_disable();
 		if (current_is_kswapd()) {
@@ -1144,6 +1131,7 @@ static void shrink_cache(int max_scan, s
 	spin_unlock_irq(&zone->lru_lock);
 done:
 	pagevec_release(&pvec);
+	return nr_reclaimed;
 }
 
 /*
@@ -1164,7 +1152,7 @@ done:
  * But we had to alter page->flags anyway.
  */
 static void
-refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc)
+refill_inactive_zone(int nr_pages, struct zone *zone)
 {
 	int pgmoved;
 	int pgdeactivate = 0;
@@ -1198,7 +1186,7 @@ refill_inactive_zone(int nr_pages, struc
 	 * mapped memory instead of just pagecache.  Work out how much memory
 	 * is mapped.
 	 */
-	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+	mapped_ratio = (read_page_state(nr_mapped) * 100) / total_memory;
 
 	/*
 	 * Now decide how much we really want to unmap some pages.  The mapped
@@ -1295,12 +1283,14 @@ refill_inactive_zone(int nr_pages, struc
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void
-shrink_zone(int priority, struct zone *zone, struct scan_control *sc)
+static int
+shrink_zone(int priority, struct zone *zone, gfp_t gfp_mask,
+		unsigned long swap_cluster_max, unsigned long *nr_scanned)
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
 	unsigned long nr_to_scan;
+	unsigned long nr_reclaimed = 0;
 
 	atomic_inc(&zone->reclaim_in_progress);
 
@@ -1310,37 +1300,37 @@ shrink_zone(int priority, struct zone *z
 	 */
 	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
 	nr_active = zone->nr_scan_active;
-	if (nr_active >= sc->swap_cluster_max)
+	if (nr_active >= swap_cluster_max)
 		zone->nr_scan_active = 0;
 	else
 		nr_active = 0;
 
 	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
 	nr_inactive = zone->nr_scan_inactive;
-	if (nr_inactive >= sc->swap_cluster_max)
+	if (nr_inactive >= swap_cluster_max)
 		zone->nr_scan_inactive = 0;
 	else
 		nr_inactive = 0;
 
 	while (nr_active || nr_inactive) {
 		if (nr_active) {
-			nr_to_scan = min(nr_active,
-					(unsigned long)sc->swap_cluster_max);
+			nr_to_scan = min(nr_active, swap_cluster_max);
 			nr_active -= nr_to_scan;
-			refill_inactive_zone(nr_to_scan, zone, sc);
+			refill_inactive_zone(nr_to_scan, zone);
 		}
 
 		if (nr_inactive) {
-			nr_to_scan = min(nr_inactive,
-					(unsigned long)sc->swap_cluster_max);
+			nr_to_scan = min(nr_inactive, swap_cluster_max);
 			nr_inactive -= nr_to_scan;
-			shrink_cache(nr_to_scan, zone, sc);
+			nr_reclaimed += shrink_cache(nr_to_scan, zone, gfp_mask,
+							swap_cluster_max, nr_scanned);
 		}
 	}
 
 	throttle_vm_writeout();
 
 	atomic_dec(&zone->reclaim_in_progress);
+	return nr_reclaimed;
 }
 
 /*
@@ -1359,10 +1349,12 @@ shrink_zone(int priority, struct zone *z
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static void
-shrink_caches(int priority, struct zone **zones, struct scan_control *sc)
+static int
+shrink_caches(int priority, struct zone **zones, gfp_t gfp_mask,
+			int swap_cluster_max, unsigned long *nr_scanned)
 {
 	int i;
+	int nr_reclaimed = 0;
 
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
@@ -1380,8 +1372,10 @@ shrink_caches(int priority, struct zone 
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
-		shrink_zone(priority, zone, sc);
+		nr_reclaimed += shrink_zone(priority, zone, gfp_mask,
+						swap_cluster_max, nr_scanned);
 	}
+	return nr_reclaimed;
 }
  
 /*
@@ -1403,13 +1397,12 @@ int try_to_free_pages(struct zone **zone
 	int ret = 0;
 	int total_scanned = 0, total_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct scan_control sc;
 	unsigned long lru_pages = 0;
 	int i;
 
-	sc.gfp_mask = gfp_mask;
-	sc.may_writepage = !laptop_mode;
-	sc.may_swap = 1;
+	gfp_mask |= MAY_SWAP;
+	if (!laptop_mode)
+		gfp_mask |= MAY_WRITEPAGE;
 
 	inc_page_state(allocstall);
 
@@ -1424,21 +1417,19 @@ int try_to_free_pages(struct zone **zone
 	}
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-		sc.nr_mapped = read_page_state(nr_mapped);
-		sc.nr_scanned = 0;
-		sc.nr_reclaimed = 0;
-		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+		unsigned long nr_scanned = 0;
+
 		if (!priority)
 			disable_swap_token();
-		shrink_caches(priority, zones, &sc);
-		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
+		total_reclaimed += shrink_caches(priority, zones, gfp_mask,
+						SWAP_CLUSTER_MAX, &nr_scanned);
+		shrink_slab(nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
-			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+			total_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
 		}
-		total_scanned += sc.nr_scanned;
-		total_reclaimed += sc.nr_reclaimed;
-		if (total_reclaimed >= sc.swap_cluster_max) {
+		total_scanned += nr_scanned;
+		if (total_reclaimed >= SWAP_CLUSTER_MAX) {
 			ret = 1;
 			goto out;
 		}
@@ -1450,13 +1441,13 @@ int try_to_free_pages(struct zone **zone
 		 * that's undesirable in laptop mode, where we *want* lumpy
 		 * writeout.  So in laptop mode, write out the whole world.
 		 */
-		if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
+		if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX/2) {
 			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
-			sc.may_writepage = 1;
+			gfp_mask |= MAY_WRITEPAGE;
 		}
 
 		/* Take a nap, wait for some writeback to complete */
-		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
+		if (nr_scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
 out:
@@ -1504,15 +1495,14 @@ static int balance_pgdat(pg_data_t *pgda
 	int i;
 	int total_scanned, total_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct scan_control sc;
+	gfp_t gfp_mask;
 
 loop_again:
 	total_scanned = 0;
 	total_reclaimed = 0;
-	sc.gfp_mask = GFP_KERNEL;
-	sc.may_writepage = !laptop_mode;
-	sc.may_swap = 1;
-	sc.nr_mapped = read_page_state(nr_mapped);
+	gfp_mask = GFP_KERNEL | MAY_SWAP;
+	if (!laptop_mode)
+		gfp_mask |= MAY_WRITEPAGE;
 
 	inc_page_state(pageoutrun);
 
@@ -1575,6 +1565,7 @@ scan:
 		 */
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
+			unsigned long nr_scanned = 0;
 			int nr_slab;
 
 			if (!populated_zone(zone))
@@ -1591,18 +1582,16 @@ scan:
 			zone->temp_priority = priority;
 			if (zone->prev_priority > priority)
 				zone->prev_priority = priority;
-			sc.nr_scanned = 0;
-			sc.nr_reclaimed = 0;
-			sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
+
 			atomic_inc(&zone->reclaim_in_progress);
-			shrink_zone(priority, zone, &sc);
+			total_reclaimed += shrink_zone(priority, zone, gfp_mask,
+						max(nr_pages, SWAP_CLUSTER_MAX), &nr_scanned);
 			atomic_dec(&zone->reclaim_in_progress);
 			reclaim_state->reclaimed_slab = 0;
-			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
+			nr_slab = shrink_slab(nr_scanned, GFP_KERNEL,
 						lru_pages);
-			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-			total_reclaimed += sc.nr_reclaimed;
-			total_scanned += sc.nr_scanned;
+			total_reclaimed += reclaim_state->reclaimed_slab;
+			total_scanned += nr_scanned;
 			if (zone->all_unreclaimable)
 				continue;
 			if (nr_slab == 0 && zone->pages_scanned >=
@@ -1615,7 +1604,7 @@ scan:
 			 */
 			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
 			    total_scanned > total_reclaimed+total_reclaimed/2)
-				sc.may_writepage = 1;
+				gfp_mask |= MAY_WRITEPAGE;
 		}
 		if (nr_pages && to_free > total_reclaimed)
 			continue;	/* swsusp: need to do more work */
@@ -1848,10 +1837,11 @@ int zone_reclaim(struct zone *zone, gfp_
 	int nr_pages;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	struct scan_control sc;
 	cpumask_t mask;
 	int node_id;
 	int priority;
+	int total_reclaimed;
+	unsigned long nr_scanned;
 
 	if (time_before(jiffies,
 		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
@@ -1867,20 +1857,16 @@ int zone_reclaim(struct zone *zone, gfp_
 	if (!cpus_empty(mask) && node_id != numa_node_id())
 		return 0;
 
-	sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
-	sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
-	sc.nr_scanned = 0;
-	sc.nr_reclaimed = 0;
-	sc.nr_mapped = read_page_state(nr_mapped);
-	sc.gfp_mask = gfp_mask;
+	if (zone_reclaim_mode & RECLAIM_WRITE)
+		gfp_mask |= MAY_WRITEPAGE;
+	if (zone_reclaim_mode & RECLAIM_SWAP)
+		gfp_mask |= MAY_SWAP;
 
 	disable_swap_token();
 
 	nr_pages = 1 << order;
-	if (nr_pages > SWAP_CLUSTER_MAX)
-		sc.swap_cluster_max = nr_pages;
-	else
-		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+	nr_scanned = 0;
+	total_reclaimed = 0;
 
 	cond_resched();
 	p->flags |= PF_MEMALLOC;
@@ -1893,11 +1879,12 @@ int zone_reclaim(struct zone *zone, gfp_
 	 */
 	priority = ZONE_RECLAIM_PRIORITY;
 	do {
-		shrink_zone(priority, zone, &sc);
+		total_reclaimed += shrink_zone(priority, zone, gfp_mask,
+					max(nr_pages, SWAP_CLUSTER_MAX), &nr_scanned);
 		priority--;
-	} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
+	} while (priority >= 0 && total_reclaimed < nr_pages);
 
-	if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+	if (total_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
 		/*
 		 * shrink_slab does not currently allow us to determine
 		 * how many pages were freed in the zone. So we just
@@ -1906,17 +1893,17 @@ int zone_reclaim(struct zone *zone, gfp_
 		 * shrink_slab will free memory on all zones and may take
 		 * a long time.
 		 */
-		shrink_slab(sc.nr_scanned, gfp_mask, order);
-		sc.nr_reclaimed = 1;    /* Avoid getting the off node timeout */
+		shrink_slab(nr_scanned, gfp_mask, order);
+		total_reclaimed = 1;    /* Avoid getting the off node timeout */
 	}
 
 	p->reclaim_state = NULL;
 	current->flags &= ~PF_MEMALLOC;
 
-	if (sc.nr_reclaimed == 0)
+	if (total_reclaimed == 0)
 		zone->last_unsuccessful_zone_reclaim = jiffies;
 
-	return sc.nr_reclaimed >= nr_pages;
+	return total_reclaimed >= nr_pages;
 }
 #endif
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-10  5:02 Get rid of scan_control Christoph Lameter
@ 2006-02-11  4:53 ` Marcelo Tosatti
  2006-02-11  9:32   ` Andrew Morton
  0 siblings, 1 reply; 18+ messages in thread
From: Marcelo Tosatti @ 2006-02-11  4:53 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: nickpiggin, akpm, linux-mm

Hi Christoph,

On Thu, Feb 09, 2006 at 09:02:00PM -0800, Christoph Lameter wrote:
> This is done through a variety of measures:
> 
> 1. nr_reclaimed is the return value of functions and each function
>    does the summing of the reclaimed pages on its own.
> 
> 2. nr_scanned is passed as a reference parameter (sigh... the only leftover)
> 
> 3. nr_mapped is calculated on each invocation of refill_inactive_list. 
>    This is not that optimal but then swapping is not that performance 
>    critical.

But refill_inactive_list() is not used for swapping only. All evicted 
pages go through that path - it can be _very_ hot.

> 4. gfp_mask is passed as a parameter. OR flags to gfp_mask for may_swap 
>    and may_writepage.
> 
> 5. Pass swap_cluster_max as a parameter
> 
> Most of the parameters passed through scan_control become local variables.
> Therefore the compilers are able to generate better code.
> 
> And we do no longer have the problem of initializing scan control the
> right way.

I don't think its worth doing that - do you have any performance
measurement and analysis of the generated code? Is the current code a
bottleneck for any of your applications?

One very nice thing about "scan_control" is that it aggregates all
parameters related to reclaim procedure instances, making the code
much clearer, easier to understand, and elegant. Moreover it allows
expandability: new parameters can be contained within the structure.

At first, my opinion is that the possible benefits for performance do
not outweight the readability and expandability advantages it provides.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-11  4:53 ` Marcelo Tosatti
@ 2006-02-11  9:32   ` Andrew Morton
  2006-02-11  9:46     ` Andrew Morton
  2006-02-11 19:01     ` Christoph Lameter
  0 siblings, 2 replies; 18+ messages in thread
From: Andrew Morton @ 2006-02-11  9:32 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: clameter, nickpiggin, linux-mm

Marcelo Tosatti <marcelo.tosatti@cyclades.com> wrote:
>
> 
> Hi Christoph,
> 
> On Thu, Feb 09, 2006 at 09:02:00PM -0800, Christoph Lameter wrote:
> > This is done through a variety of measures:
> > 
> > 1. nr_reclaimed is the return value of functions and each function
> >    does the summing of the reclaimed pages on its own.
> > 
> > 2. nr_scanned is passed as a reference parameter (sigh... the only leftover)
> > 
> > 3. nr_mapped is calculated on each invocation of refill_inactive_list. 
> >    This is not that optimal but then swapping is not that performance 
> >    critical.
> 
> But refill_inactive_list() is not used for swapping only. All evicted 
> pages go through that path - it can be _very_ hot.

A bit hot.  I guess it's worth fixing.

> > 4. gfp_mask is passed as a parameter. OR flags to gfp_mask for may_swap 
> >    and may_writepage.
> > 
> > 5. Pass swap_cluster_max as a parameter
> > 
> > Most of the parameters passed through scan_control become local variables.
> > Therefore the compilers are able to generate better code.
> > 
> > And we do no longer have the problem of initializing scan control the
> > right way.
> 
> I don't think its worth doing that - do you have any performance
> measurement and analysis of the generated code? Is the current code a
> bottleneck for any of your applications?

This patch isn't a performance thing.  I found that scan_control wasn't
really a success.  We had one bug due to failing to initialise something in
it, and we're fiddling with fields all over the place.  It just seemed to
obfuscate the code, make it harder to work with, harder to check that
everything was correct.

Kind of like global variables, really.

> One very nice thing about "scan_control" is that it aggregates all
> parameters related to reclaim procedure instances, making the code
> much clearer, easier to understand, and elegant. Moreover it allows
> expandability: new parameters can be contained within the structure.

Well yes, it allows for expansion.  But my experience was the exact
opposite of what you're saying.

scan_control was modelled on writeback_control.  But writeback_control
works, and scan_control doesn't.  I think this is because a)
writeback_control instances are always initialised at the declaration site
and b) writeback_control is just a lot simpler.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-11  9:32   ` Andrew Morton
@ 2006-02-11  9:46     ` Andrew Morton
  2006-02-12  3:33       ` Nick Piggin
  2006-02-11 19:01     ` Christoph Lameter
  1 sibling, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2006-02-11  9:46 UTC (permalink / raw)
  To: marcelo.tosatti, clameter, nickpiggin, linux-mm

Andrew Morton <akpm@osdl.org> wrote:
>
> I found that scan_control wasn't
>  really a success.  We had one bug due to failing to initialise something in
>  it, and we're fiddling with fields all over the place.  It just seemed to
>  obfuscate the code, make it harder to work with, harder to check that
>  everything was correct.

I spose we could do this, which is a bit of an improvement.

But the problems do remain, really.  The one which creeps me out is looking
at a piece of code which does:


	foo(&sc);
	if (sc.bar ...)

and just not knowing whether foo() altered sc.bar.


diff -puN mm/vmscan.c~vmscan-scan_control-cleanup mm/vmscan.c
--- devel/mm/vmscan.c~vmscan-scan_control-cleanup	2006-02-11 01:34:04.000000000 -0800
+++ devel-akpm/mm/vmscan.c	2006-02-11 01:41:57.000000000 -0800
@@ -1414,13 +1414,14 @@ int try_to_free_pages(struct zone **zone
 	int ret = 0;
 	int total_scanned = 0, total_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct scan_control sc;
 	unsigned long lru_pages = 0;
 	int i;
-
-	sc.gfp_mask = gfp_mask;
-	sc.may_writepage = !laptop_mode;
-	sc.may_swap = 1;
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.may_writepage = !laptop_mode,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.may_swap = 1,
+	};
 
 	inc_page_state(allocstall);
 
@@ -1438,7 +1439,6 @@ int try_to_free_pages(struct zone **zone
 		sc.nr_mapped = read_page_state(nr_mapped);
 		sc.nr_scanned = 0;
 		sc.nr_reclaimed = 0;
-		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
 		if (!priority)
 			disable_swap_token();
 		shrink_caches(priority, zones, &sc);
@@ -1461,7 +1461,8 @@ int try_to_free_pages(struct zone **zone
 		 * that's undesirable in laptop mode, where we *want* lumpy
 		 * writeout.  So in laptop mode, write out the whole world.
 		 */
-		if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
+		if (total_scanned > sc.swap_cluster_max +
+					sc.swap_cluster_max / 2) {
 			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
 			sc.may_writepage = 1;
 		}
@@ -1515,14 +1516,16 @@ static int balance_pgdat(pg_data_t *pgda
 	int i;
 	int total_scanned, total_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct scan_control sc;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_writepage = !laptop_mode,
+		.may_swap = 1,
+		.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+	};
 
 loop_again:
 	total_scanned = 0;
 	total_reclaimed = 0;
-	sc.gfp_mask = GFP_KERNEL;
-	sc.may_writepage = !laptop_mode;
-	sc.may_swap = 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
 
 	inc_page_state(pageoutrun);
@@ -1604,7 +1607,6 @@ scan:
 				zone->prev_priority = priority;
 			sc.nr_scanned = 0;
 			sc.nr_reclaimed = 0;
-			sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
 			atomic_inc(&zone->reclaim_in_progress);
 			shrink_zone(priority, zone, &sc);
 			atomic_dec(&zone->reclaim_in_progress);
@@ -1856,13 +1858,19 @@ int zone_reclaim_interval __read_mostly 
  */
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-	int nr_pages;
+	int nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	struct scan_control sc;
 	cpumask_t mask;
 	int node_id;
 	int priority;
+	struct scan_control sc = {
+		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+		.nr_mapped = read_page_state(nr_mapped),
+		.swap_cluster_max = max(nr_pages, SWAP_CLUSTER_MAX),
+		.gfp_mask = gfp_mask,
+	};
 
 	if (time_before(jiffies,
 		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
@@ -1878,21 +1886,8 @@ int zone_reclaim(struct zone *zone, gfp_
 	if (!cpus_empty(mask) && node_id != numa_node_id())
 		return 0;
 
-	sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
-	sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
-	sc.nr_scanned = 0;
-	sc.nr_reclaimed = 0;
-	sc.nr_mapped = read_page_state(nr_mapped);
-	sc.gfp_mask = gfp_mask;
-
 	disable_swap_token();
 
-	nr_pages = 1 << order;
-	if (nr_pages > SWAP_CLUSTER_MAX)
-		sc.swap_cluster_max = nr_pages;
-	else
-		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
-
 	cond_resched();
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
_

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-11  9:32   ` Andrew Morton
  2006-02-11  9:46     ` Andrew Morton
@ 2006-02-11 19:01     ` Christoph Lameter
  2006-02-11 21:13       ` Andrew Morton
  1 sibling, 1 reply; 18+ messages in thread
From: Christoph Lameter @ 2006-02-11 19:01 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Marcelo Tosatti, nickpiggin, linux-mm

On Sat, 11 Feb 2006, Andrew Morton wrote:

> > But refill_inactive_list() is not used for swapping only. All evicted 
> > pages go through that path - it can be _very_ hot.
> 
> A bit hot.  I guess it's worth fixing.

There is another issue of the anon_vma lock getting very hot during 
zone_reclaim() because refill_inactive_list calls page_referenced(). So 
does shrink_list(). zone_reclaim is only interested in unmapped pages and 
thus checking for references is useless.

> scan_control was modelled on writeback_control.  But writeback_control
> works, and scan_control doesn't.  I think this is because a)
> writeback_control instances are always initialised at the declaration site
> and b) writeback_control is just a lot simpler.

The zoned counter patchset eliminates at least the wbs structure.

Patch to fix the calling of page_referenced() follows. This is against 
2.6.16-rc2. We probably need another patch for current mm. In the case
of VMSCAN_MAY_SWAP not set, we may just want to bypass the whole 
calculation thing for reclaim_mapped.

Do not check references to a page during zone reclaim

Shrink_list and refill_inactive() check all ptes pointing to a page
for reference bits in order to decide if the page should be put on
the active list. This is not necessary for zone_reclaim since we
are only interested in removing unmapped pages. Skip the checks in both
functions.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux/mm/vmscan.c
===================================================================
--- linux.orig/mm/vmscan.c	2006-02-10 12:15:37.293298891 -0800
+++ linux/mm/vmscan.c	2006-02-10 12:26:19.453541327 -0800
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head 
 		BUG_ON(PageActive(page));

 		sc->nr_scanned++;
+
+		if (!sc->may_swap && page_mapped(page))
+			goto keep_locked;
+
 		/* Double the slab pressure for mapped and swapcache pages */
 		if (page_mapped(page) || PageSwapCache(page))
 			sc->nr_scanned++;
@@ -983,7 +987,7 @@ refill_inactive_zone(struct zone *zone, 
 	 * Now use this metric to decide whether to start moving mapped memory
 	 * onto the inactive list.
 	 */
-	if (swap_tendency >= 100)
+	if (swap_tendency >= 100 && sc->may_swap)
 		reclaim_mapped = 1;

 	while (!list_empty(&l_hold)) {

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-11 19:01     ` Christoph Lameter
@ 2006-02-11 21:13       ` Andrew Morton
  2006-02-11 21:27         ` Christoph Lameter
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2006-02-11 21:13 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: marcelo.tosatti, nickpiggin, linux-mm

Christoph Lameter <clameter@engr.sgi.com> wrote:
>
> On Sat, 11 Feb 2006, Andrew Morton wrote:
> 
> > > But refill_inactive_list() is not used for swapping only. All evicted 
> > > pages go through that path - it can be _very_ hot.
> > 
> > A bit hot.  I guess it's worth fixing.
> 
> There is another issue of the anon_vma lock getting very hot during 
> zone_reclaim() because refill_inactive_list calls page_referenced(). So 
> does shrink_list(). zone_reclaim is only interested in unmapped pages and 
> thus checking for references is useless.
> 
> > scan_control was modelled on writeback_control.  But writeback_control
> > works, and scan_control doesn't.  I think this is because a)
> > writeback_control instances are always initialised at the declaration site
> > and b) writeback_control is just a lot simpler.
> 
> The zoned counter patchset eliminates at least the wbs structure.

Does that refer to writeback_state?

> Patch to fix the calling of page_referenced() follows. This is against 
> 2.6.16-rc2. We probably need another patch for current mm. In the case
> of VMSCAN_MAY_SWAP not set, we may just want to bypass the whole 
> calculation thing for reclaim_mapped.
> 

What's VMSCAN_MAY_SWAP?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-11 21:13       ` Andrew Morton
@ 2006-02-11 21:27         ` Christoph Lameter
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Lameter @ 2006-02-11 21:27 UTC (permalink / raw)
  To: Andrew Morton; +Cc: marcelo.tosatti, nickpiggin, linux-mm

On Sat, 11 Feb 2006, Andrew Morton wrote:

> > Patch to fix the calling of page_referenced() follows. This is against 
> > 2.6.16-rc2. We probably need another patch for current mm. In the case
> > of VMSCAN_MAY_SWAP not set, we may just want to bypass the whole 
> > calculation thing for reclaim_mapped.
> > 
> 
> What's VMSCAN_MAY_SWAP?

Heh. Its gone!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-11  9:46     ` Andrew Morton
@ 2006-02-12  3:33       ` Nick Piggin
  2006-02-12  3:47         ` Christoph Lameter
  0 siblings, 1 reply; 18+ messages in thread
From: Nick Piggin @ 2006-02-12  3:33 UTC (permalink / raw)
  To: Andrew Morton; +Cc: marcelo.tosatti, clameter, linux-mm

Andrew Morton wrote:
> Andrew Morton <akpm@osdl.org> wrote:
> 
>>I found that scan_control wasn't
>> really a success.  We had one bug due to failing to initialise something in
>> it, and we're fiddling with fields all over the place.  It just seemed to
>> obfuscate the code, make it harder to work with, harder to check that
>> everything was correct.
> 

I agree with Marcelo, I prefer scan_control. I'm not sure if it was
modelled on writeback_control or not, but it is certianly very different:
writeback_control is spread over many files and subsystems. scan_control
is vmscan local and is simply used to alleviate the passing of many
values back and forth between vmscan functions.

> 
> I spose we could do this, which is a bit of an improvement.
> 
> But the problems do remain, really.  The one which creeps me out is looking
> at a piece of code which does:
> 
> 
> 	foo(&sc);
> 	if (sc.bar ...)
> 
> and just not knowing whether foo() altered sc.bar.
> 

Luckily there are very limited call stacks which modify this stuff so it isn't
too hard to keep all in your head at once after you start doing a bit of work
in vmscan. That said, we could implement a commenting convention to help things.

/*
  * refill_inactive_list
  * input:
  * sc.nr_scan - specifies the number of ...
  * sc.blah ...
  *
  * modifies:
  * sc.nr_scan - blah blah
  */

?

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-12  3:33       ` Nick Piggin
@ 2006-02-12  3:47         ` Christoph Lameter
  2006-02-12  4:08           ` Nick Piggin
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Lameter @ 2006-02-12  3:47 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, marcelo.tosatti, linux-mm

On Sun, 12 Feb 2006, Nick Piggin wrote:

> I agree with Marcelo, I prefer scan_control. I'm not sure if it was
> modelled on writeback_control or not, but it is certianly very different:
> writeback_control is spread over many files and subsystems. scan_control
> is vmscan local and is simply used to alleviate the passing of many
> values back and forth between vmscan functions.

The trouble with scan_control is that it contains diverse variables. For 
example it caches nr_mapped, its used to pass results back to the caller 
etc. 

> Luckily there are very limited call stacks which modify this stuff so it isn't
> too hard to keep all in your head at once after you start doing a bit of work
> in vmscan. That said, we could implement a commenting convention to help
> things.
> 
> /*
>  * refill_inactive_list
>  * input:
>  * sc.nr_scan - specifies the number of ...
>  * sc.blah ...
>  *
>  * modifies:
>  * sc.nr_scan - blah blah
>  */

Could we at least pass the number of pages reclaimed back as the return 
value of the functions? I believe most of the savings that Andrew saw was 
due to the number of reclaimed pages being processed directly in 
registers.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-12  3:47         ` Christoph Lameter
@ 2006-02-12  4:08           ` Nick Piggin
  2006-02-12  4:41             ` Christoph Lameter
  0 siblings, 1 reply; 18+ messages in thread
From: Nick Piggin @ 2006-02-12  4:08 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andrew Morton, marcelo.tosatti, linux-mm

Christoph Lameter wrote:
> On Sun, 12 Feb 2006, Nick Piggin wrote:
> 
> 
>>I agree with Marcelo, I prefer scan_control. I'm not sure if it was
>>modelled on writeback_control or not, but it is certianly very different:
>>writeback_control is spread over many files and subsystems. scan_control
>>is vmscan local and is simply used to alleviate the passing of many
>>values back and forth between vmscan functions.
> 
> 
> The trouble with scan_control is that it contains diverse variables. For 
> example it caches nr_mapped, its used to pass results back to the caller 
> etc. 
> 

But I don't see why that is trouble if you would otherwise need to do
it by passing pointers to variables individually.

Sure you don't get an idea (from the call signature) of exactly what is
modified or how things are used... but you never really got (a complete
picture of) that anyway.

> 
>>Luckily there are very limited call stacks which modify this stuff so it isn't
>>too hard to keep all in your head at once after you start doing a bit of work
>>in vmscan. That said, we could implement a commenting convention to help
>>things.
>>
>>/*
>> * refill_inactive_list
>> * input:
>> * sc.nr_scan - specifies the number of ...
>> * sc.blah ...
>> *
>> * modifies:
>> * sc.nr_scan - blah blah
>> */
> 
> 
> Could we at least pass the number of pages reclaimed back as the return 
> value of the functions? I believe most of the savings that Andrew saw was 
> due to the number of reclaimed pages being processed directly in 
> registers.

What savings are you interested in, exactly? Your initial patch
would definitely have slowed down page reclaim on big systems
due to the read_page_state...

I think most of the cost apart from locking (because that will
depend on contention) is hitting random cachelines of struct pages
then hitting random radix tree cachelines to remove them. Not
much you can do about that.

That said I'm never against microoptimisations provided they
weigh in on the right side of the (subjective) complexity /
improvement ratio.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-12  4:08           ` Nick Piggin
@ 2006-02-12  4:41             ` Christoph Lameter
  2006-02-12  5:01               ` Nick Piggin
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Lameter @ 2006-02-12  4:41 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, marcelo.tosatti, linux-mm

On Sun, 12 Feb 2006, Nick Piggin wrote:

> > Could we at least pass the number of pages reclaimed back as the return
> > value of the functions? I believe most of the savings that Andrew saw was
> > due to the number of reclaimed pages being processed directly in registers.
> 
> What savings are you interested in, exactly? Your initial patch
> would definitely have slowed down page reclaim on big systems
> due to the read_page_state...

The patch that put the whole calculation into a separate block that 
is only executed for the swap case would have taken care of that.


> I think most of the cost apart from locking (because that will
> depend on contention) is hitting random cachelines of struct pages
> then hitting random radix tree cachelines to remove them. Not
> much you can do about that.
> 
> That said I'm never against microoptimisations provided they
> weigh in on the right side of the (subjective) complexity /
> improvement ratio.

Its a bit strange if you call a function and then access a structure 
member to get the result. Locating parameter in a structure makes it
impossible to see what is passed to a function when it is 
called.

It is also something that will make it difficult for compilers to do
a good job. Flow control is easier to optimize for a local variable
than for a pointer into a struct that may have been modified elsewhere.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-12  4:41             ` Christoph Lameter
@ 2006-02-12  5:01               ` Nick Piggin
  2006-02-12  5:14                 ` Andrew Morton
  2006-02-12  6:25                 ` Christoph Lameter
  0 siblings, 2 replies; 18+ messages in thread
From: Nick Piggin @ 2006-02-12  5:01 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andrew Morton, marcelo.tosatti, linux-mm

Christoph Lameter wrote:
> On Sun, 12 Feb 2006, Nick Piggin wrote:

>>I think most of the cost apart from locking (because that will
>>depend on contention) is hitting random cachelines of struct pages
>>then hitting random radix tree cachelines to remove them. Not
>>much you can do about that.
>>
>>That said I'm never against microoptimisations provided they
>>weigh in on the right side of the (subjective) complexity /
>>improvement ratio.
> 
> 
> Its a bit strange if you call a function and then access a structure 
> member to get the result. Locating parameter in a structure makes it
> impossible to see what is passed to a function when it is 
> called.
> 

Sometimes there is more than one result though :\

> It is also something that will make it difficult for compilers to do
> a good job. Flow control is easier to optimize for a local variable
> than for a pointer into a struct that may have been modified elsewhere.
> 

There are downsides to it. I was basically on the fence with its
removal from mainline, because the complexity of parameters going
to/from functions make the improvement borderline.

But I would have kept it for my internal work, and given Marcelo
is also interested in it I guess it could stay for now (unless
you trump that with some performance numbers I guess).

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-12  5:01               ` Nick Piggin
@ 2006-02-12  5:14                 ` Andrew Morton
  2006-02-12  5:37                   ` Andrew Morton
  2006-02-12  6:25                 ` Christoph Lameter
  1 sibling, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2006-02-12  5:14 UTC (permalink / raw)
  To: Nick Piggin; +Cc: clameter, marcelo.tosatti, linux-mm

Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> There are downsides to it. I was basically on the fence with its
>  removal from mainline, because the complexity of parameters going
>  to/from functions make the improvement borderline.
> 
>  But I would have kept it for my internal work, and given Marcelo
>  is also interested in it I guess it could stay for now (unless
>  you trump that with some performance numbers I guess).

I'm wobbly too.  I still hate the thing, but I hate it less after I fixed
up some of its straggliness.

Returning nr_reclaimed up and down the stack makes sense too - I'll try that.

btw, it'd be nice to think of some better function names too.  We have:

	try_to_free_pages
	->shrink_caches
	  ->shrink_zone
	    ->shrink_cache
	      ->shrink_list

which is fairly irrational.

Something like

	try_to_free_pages
	->shrink_zones(struct zone **zones, ..)
	  ->shrink_zone(struct zone *, ...)
	    ->do_shrink_zone(struct zone *, ...)
	      ->shrink_page_list(struct list_head *, ...)

perhaps.  Maybe s/shrink/reclaim/ just to confuse everyone more.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-12  5:14                 ` Andrew Morton
@ 2006-02-12  5:37                   ` Andrew Morton
  2006-02-12  6:49                     ` Christoph Lameter
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2006-02-12  5:37 UTC (permalink / raw)
  To: nickpiggin, clameter, marcelo.tosatti, linux-mm

Andrew Morton <akpm@osdl.org> wrote:
>
>  Returning nr_reclaimed up and down the stack makes sense too - I'll try that.

wtf does this, in zone_reclaim() do?

		sc.nr_reclaimed = 1;    /* Avoid getting the off node timeout */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-12  5:01               ` Nick Piggin
  2006-02-12  5:14                 ` Andrew Morton
@ 2006-02-12  6:25                 ` Christoph Lameter
  1 sibling, 0 replies; 18+ messages in thread
From: Christoph Lameter @ 2006-02-12  6:25 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, marcelo.tosatti, linux-mm

On Sun, 12 Feb 2006, Nick Piggin wrote:

> > Its a bit strange if you call a function and then access a structure member
> > to get the result. Locating parameter in a structure makes it
> > impossible to see what is passed to a function when it is called.
> Sometimes there is more than one result though :\

Well there are basically only two: The number of scanned and the number of 
reclaimed pages. My patch passed the number of scanned as a reference 
parameter.

> > It is also something that will make it difficult for compilers to do
> > a good job. Flow control is easier to optimize for a local variable
> > than for a pointer into a struct that may have been modified elsewhere.
> There are downsides to it. I was basically on the fence with its
> removal from mainline, because the complexity of parameters going
> to/from functions make the improvement borderline.

Yes, that may weigh in on the other side of this.

> But I would have kept it for my internal work, and given Marcelo
> is also interested in it I guess it could stay for now (unless
> you trump that with some performance numbers I guess).

The compiler optimization are mostly interesting for platforms that are 
short on memory or need highly efficient code. So sorry, no performance 
numbers for me on that one. For NUMA platforms the clarity of the code is 
what is of interest.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-12  5:37                   ` Andrew Morton
@ 2006-02-12  6:49                     ` Christoph Lameter
  2006-02-12  7:53                       ` Andrew Morton
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Lameter @ 2006-02-12  6:49 UTC (permalink / raw)
  To: Andrew Morton; +Cc: nickpiggin, marcelo.tosatti, linux-mm

On Sat, 11 Feb 2006, Andrew Morton wrote:

> Andrew Morton <akpm@osdl.org> wrote:
> >
> >  Returning nr_reclaimed up and down the stack makes sense too - I'll try that.
> 
> wtf does this, in zone_reclaim() do?
> 
> 		sc.nr_reclaimed = 1;    /* Avoid getting the off node timeout */

The number of pages returned from slab_reclaim is global and not per zone. 
(we need to find a way to fix that if we want to enable per zone slab 
reclaim during zone reclaim operations by defaut... and if we had zoned vm 
counters we could also avoid this situation)

So the number of reclaimed slab pages cannot be used to determine if we 
should go off node. sc.nr_reclaimed has some weird value after shrink_slab
is through.

Hmmm. Setting this to one means that we will rescan and shrink the slab
for each allocation if we are out of zone memory and RECLAIM_SLAB is set.
Plus if we do an order 0 allocation we do not go off node as intended.

We better set this to zero. This means the allocation will go offnode
despite us having potentially freed lots of memory on the zone.
Future allocations can then again be done from this zone.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.16-rc2/mm/vmscan.c
===================================================================
--- linux-2.6.16-rc2.orig/mm/vmscan.c	2006-02-11 14:22:07.000000000 -0800
+++ linux-2.6.16-rc2/mm/vmscan.c	2006-02-11 22:42:13.000000000 -0800
@@ -1909,13 +1909,13 @@ int zone_reclaim(struct zone *zone, gfp_
 		/*
 		 * shrink_slab does not currently allow us to determine
 		 * how many pages were freed in the zone. So we just
-		 * shake the slab and then go offnode for a single allocation.
+		 * shake the slab.
 		 *
 		 * shrink_slab will free memory on all zones and may take
 		 * a long time.
 		 */
 		shrink_slab(sc.nr_scanned, gfp_mask, order);
-		sc.nr_reclaimed = 1;    /* Avoid getting the off node timeout */
+		sc.nr_reclaimed = 0;
 	}

 	p->reclaim_state = NULL;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-12  6:49                     ` Christoph Lameter
@ 2006-02-12  7:53                       ` Andrew Morton
  2006-02-13 17:54                         ` Christoph Lameter
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2006-02-12  7:53 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: nickpiggin, marcelo.tosatti, linux-mm

Christoph Lameter <clameter@engr.sgi.com> wrote:
>
> On Sat, 11 Feb 2006, Andrew Morton wrote:
> 
> > Andrew Morton <akpm@osdl.org> wrote:
> > >
> > >  Returning nr_reclaimed up and down the stack makes sense too - I'll try that.
> > 
> > wtf does this, in zone_reclaim() do?
> > 
> > 		sc.nr_reclaimed = 1;    /* Avoid getting the off node timeout */
> 
> The number of pages returned from slab_reclaim is global and not per zone. 
> (we need to find a way to fix that if we want to enable per zone slab 
> reclaim during zone reclaim operations by defaut... and if we had zoned vm 
> counters we could also avoid this situation)
>  
> So the number of reclaimed slab pages cannot be used to determine if we 
> should go off node. sc.nr_reclaimed has some weird value after shrink_slab
> is through.

?  sc.nr_reclaimed doesn't have anything to do with shrink_slab()?

Do you mean reclaim_state.reclaimed_slab?  If so, why does it have a weird
value?  What's wrong with it?

> Hmmm. Setting this to one means that we will rescan and shrink the slab
> for each allocation if we are out of zone memory and RECLAIM_SLAB is set.
> Plus if we do an order 0 allocation we do not go off node as intended.
>
> We better set this to zero. This means the allocation will go offnode
> despite us having potentially freed lots of memory on the zone.
> Future allocations can then again be done from this zone.

uh, OK, if you say so.

zone_reclaim() is pretty obscure and could do with some comments.  What's
it _really_ trying to do, and how does it do it?  What is that timer there
for and how is it supposed to work?  Why on earth does it set PF_MEMALLOC,
things like that.

I'd have thought that looking at the zone's free_pages thingies would give
a pretty good approximation to "how much memory did shrink_slab() give us".

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Get rid of scan_control
  2006-02-12  7:53                       ` Andrew Morton
@ 2006-02-13 17:54                         ` Christoph Lameter
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Lameter @ 2006-02-13 17:54 UTC (permalink / raw)
  To: Andrew Morton; +Cc: nickpiggin, marcelo.tosatti, linux-mm

On Sat, 11 Feb 2006, Andrew Morton wrote:

> zone_reclaim() is pretty obscure and could do with some comments.  What's
> it _really_ trying to do, and how does it do it?  What is that timer there
> for and how is it supposed to work?  Why on earth does it set PF_MEMALLOC,
> things like that.
> 
> I'd have thought that looking at the zone's free_pages thingies would give
> a pretty good approximation to "how much memory did shrink_slab() give us".

But that would mean we need to apply the same set of criteria as in 
__alloc_pages(). Its just happening for one allocation and so I thought 
it is not worth dragging all the stuff from __alloc_pages() into vmscan.c.

Here is a cleanup patch with more comments:




zone_reclaim additional comments and cleanup

This patch adds some comments to explain how zone reclaim works.
And it fixes the following issues:

- PF_SWAPWRITE needs to be set for RECLAIM_SWAP to be able to write
  out pages to swap. Currently RECLAIM_SWAP may not do that.

- remove setting sc.nr_reclaimed pages after slab reclaim since the
  slab shrinking code does not use that and the nr_reclaimed pages
  is just right for the intended follow up action.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.16-rc3/mm/vmscan.c
===================================================================
--- linux-2.6.16-rc3.orig/mm/vmscan.c	2006-02-12 16:27:25.000000000 -0800
+++ linux-2.6.16-rc3/mm/vmscan.c	2006-02-13 09:45:05.000000000 -0800
@@ -1870,22 +1870,37 @@ int zone_reclaim_interval __read_mostly 
  */
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-	int nr_pages;
+	int nr_pages;	/* Minimum pages needed in order to stay on node */
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
 	struct scan_control sc;
 	cpumask_t mask;
 	int node_id;
 
+	/*
+	 * Do not reclaim if there was a recent unsuccessful attempt at
+	 * zone reclaim. In that case we let allocations go off node for
+	 * the zone_reclaim_interval. Otherwise we would scan for each off
+	 * node page allocation.
+	 */
 	if (time_before(jiffies,
 		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
 			return 0;
 
+	/*
+	 * Avoid concurrent zone reclaims, do not reclaim in a zone that
+	 * does not have reclaimable pages and if we should not delay
+	 * the allocation then do not scan.
+	 */
 	if (!(gfp_mask & __GFP_WAIT) ||
 		zone->all_unreclaimable ||
 		atomic_read(&zone->reclaim_in_progress) > 0)
 			return 0;
 
+	/*
+	 * Only reclaim in the zones that are local or in zones
+	 * that are on nodes without processors.
+	 */
 	node_id = zone->zone_pgdat->node_id;
 	mask = node_to_cpumask(node_id);
 	if (!cpus_empty(mask) && node_id != numa_node_id())
@@ -1908,7 +1923,12 @@ int zone_reclaim(struct zone *zone, gfp_
 		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
 
 	cond_resched();
-	p->flags |= PF_MEMALLOC;
+	/*
+	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
+	 * and we also need to be able to write out pages for RECLAIM_WRITE
+	 * and RECLAIM_SWAP.
+	 */
+	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
@@ -1922,23 +1942,29 @@ int zone_reclaim(struct zone *zone, gfp_
 
 	} while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
 
-	if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+	if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB))
 		/*
 		 * shrink_slab does not currently allow us to determine
-		 * how many pages were freed in the zone. So we just
-		 * shake the slab and then go offnode for a single allocation.
+		 * how many pages were freed in this zone. So we just
+		 * shake the slab a bit and then go off node for this
+		 * particular allocation despite possibly having freed enough
+		 * memory to allocate in this zone. If we freed local memory
+		 * then the next allocations will be local again.
 		 *
 		 * shrink_slab will free memory on all zones and may take
 		 * a long time.
 		 */
 		shrink_slab(sc.nr_scanned, gfp_mask, order);
-		sc.nr_reclaimed = 1;    /* Avoid getting the off node timeout */
-	}
 
 	p->reclaim_state = NULL;
-	current->flags &= ~PF_MEMALLOC;
+	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 
 	if (sc.nr_reclaimed == 0)
+		/*
+		 * We were unable to reclaim enough pages to stay on node.
+		 * We now allow off node accesses for a certain time period
+		 * before trying again to reclaim pages from the local zone.
+		 */
 		zone->last_unsuccessful_zone_reclaim = jiffies;
 
 	return sc.nr_reclaimed >= nr_pages;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2006-02-13 17:54 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-02-10  5:02 Get rid of scan_control Christoph Lameter
2006-02-11  4:53 ` Marcelo Tosatti
2006-02-11  9:32   ` Andrew Morton
2006-02-11  9:46     ` Andrew Morton
2006-02-12  3:33       ` Nick Piggin
2006-02-12  3:47         ` Christoph Lameter
2006-02-12  4:08           ` Nick Piggin
2006-02-12  4:41             ` Christoph Lameter
2006-02-12  5:01               ` Nick Piggin
2006-02-12  5:14                 ` Andrew Morton
2006-02-12  5:37                   ` Andrew Morton
2006-02-12  6:49                     ` Christoph Lameter
2006-02-12  7:53                       ` Andrew Morton
2006-02-13 17:54                         ` Christoph Lameter
2006-02-12  6:25                 ` Christoph Lameter
2006-02-11 19:01     ` Christoph Lameter
2006-02-11 21:13       ` Andrew Morton
2006-02-11 21:27         ` Christoph Lameter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox