* [PATCH] 1/4: rework alloc_pages
@ 2004-08-06 4:57 Nick Piggin
2004-08-06 4:57 ` [PATCH] 2/4: highmem watermarks Nick Piggin
` (2 more replies)
0 siblings, 3 replies; 17+ messages in thread
From: Nick Piggin @ 2004-08-06 4:57 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Memory Management
[-- Attachment #1: Type: text/plain, Size: 165 bytes --]
Here are a few of the more harmless mm patches I have been sitting on
for a while. They've had some testing in my tree (which does get used
by a handful of people).
[-- Attachment #2: vm-rework-alloc_pages.patch --]
[-- Type: text/x-patch, Size: 5914 bytes --]
This reworks alloc_pages a bit.
Previously the ->protection[] logic was broken. It was difficult to follow
and basically didn't use the asynch reclaim watermarks properly.
This one uses ->protection only for lower-zone protection, and gives the
allocator flexibility to add the watermarks as desired.
---
linux-2.6-npiggin/mm/page_alloc.c | 115 ++++++++++++++++----------------------
1 files changed, 51 insertions(+), 64 deletions(-)
diff -puN mm/page_alloc.c~vm-rework-alloc_pages mm/page_alloc.c
--- linux-2.6/mm/page_alloc.c~vm-rework-alloc_pages 2004-08-06 14:43:40.000000000 +1000
+++ linux-2.6-npiggin/mm/page_alloc.c 2004-08-06 14:43:40.000000000 +1000
@@ -606,7 +606,7 @@ __alloc_pages(unsigned int gfp_mask, uns
{
const int wait = gfp_mask & __GFP_WAIT;
unsigned long min;
- struct zone **zones;
+ struct zone **zones, *z;
struct page *page;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
@@ -617,72 +617,56 @@ __alloc_pages(unsigned int gfp_mask, uns
might_sleep_if(wait);
zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
- if (zones[0] == NULL) /* no zones in the zonelist */
+
+ if (unlikely(zones[0] == NULL)) {
+ /* Should this ever happen?? */
return NULL;
+ }
alloc_type = zone_idx(zones[0]);
/* Go through the zonelist once, looking for a zone with enough free */
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *z = zones[i];
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ min = z->pages_low + (1<<order) + z->protection[alloc_type];
- min = (1<<order) + z->protection[alloc_type];
-
- /*
- * We let real-time tasks dip their real-time paws a little
- * deeper into reserves.
- */
- if (rt_task(p))
- min -= z->pages_low >> 1;
+ if (z->free_pages < min)
+ continue;
- if (z->free_pages >= min ||
- (!wait && z->free_pages >= z->pages_high)) {
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page) {
- zone_statistics(zonelist, z);
- goto got_pg;
- }
- }
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ goto got_pg;
}
- /* we're somewhat low on memory, failed to find what we needed */
- for (i = 0; zones[i] != NULL; i++)
- wakeup_kswapd(zones[i]);
-
- /* Go through the zonelist again, taking __GFP_HIGH into account */
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *z = zones[i];
-
- min = (1<<order) + z->protection[alloc_type];
+ for (i = 0; (z = zones[i]) != NULL; i++)
+ wakeup_kswapd(z);
+ /*
+ * Go through the zonelist again. Let __GFP_HIGH and allocations
+ * coming from realtime tasks to go deeper into reserves
+ */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ min = z->pages_min;
if (gfp_mask & __GFP_HIGH)
- min -= z->pages_low >> 2;
- if (rt_task(p))
- min -= z->pages_low >> 1;
+ min -= min>>1;
+ if (unlikely(rt_task(p)) && !in_interrupt())
+ min -= min>>1;
+ min += (1<<order) + z->protection[alloc_type];
- if (z->free_pages >= min ||
- (!wait && z->free_pages >= z->pages_high)) {
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page) {
- zone_statistics(zonelist, z);
- goto got_pg;
- }
- }
- }
+ if (z->free_pages < min)
+ continue;
- /* here we're in the low on memory slow path */
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ goto got_pg;
+ }
-rebalance:
+ /* This allocation should allow future memory freeing. */
if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
/* go through the zonelist yet again, ignoring mins */
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *z = zones[i];
-
+ for (i = 0; (z = zones[i]) != NULL; i++) {
page = buffered_rmqueue(z, order, gfp_mask);
- if (page) {
- zone_statistics(zonelist, z);
+ if (page)
goto got_pg;
- }
}
goto nopage;
}
@@ -691,6 +675,8 @@ rebalance:
if (!wait)
goto nopage;
+rebalance:
+ /* We now go into synchronous reclaim */
p->flags |= PF_MEMALLOC;
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -701,27 +687,28 @@ rebalance:
p->flags &= ~PF_MEMALLOC;
/* go through the zonelist yet one more time */
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *z = zones[i];
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ min = z->pages_min;
+ if (gfp_mask & __GFP_HIGH)
+ min -= min>>2;
+ if (unlikely(rt_task(p)))
+ min -= min>>2;
+ min += (1<<order) + z->protection[alloc_type];
- min = (1UL << order) + z->protection[alloc_type];
+ if (z->free_pages < min)
+ continue;
- if (z->free_pages >= min ||
- (!wait && z->free_pages >= z->pages_high)) {
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page) {
- zone_statistics(zonelist, z);
- goto got_pg;
- }
- }
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ goto got_pg;
}
/*
* Don't let big-order allocations loop unless the caller explicitly
* requests that. Wait for some write requests to complete then retry.
*
- * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that
- * may not be true in other implementations.
+ * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
+ * <= 3, but that may not be true in other implementations.
*/
do_retry = 0;
if (!(gfp_mask & __GFP_NORETRY)) {
@@ -744,6 +731,7 @@ nopage:
}
return NULL;
got_pg:
+ zone_statistics(zonelist, z);
kernel_map_pages(page, 1 << order, 1);
return page;
}
@@ -1861,7 +1849,7 @@ static void setup_per_zone_protection(vo
* zero because the lower zones take
* contributions from the higher zones.
*/
- if (j > max_zone || j > i) {
+ if (j > max_zone || j >= i) {
zone->protection[i] = 0;
continue;
}
@@ -1870,7 +1858,6 @@ static void setup_per_zone_protection(vo
*/
zone->protection[i] = higherzone_val(zone,
max_zone, i);
- zone->protection[i] += zone->pages_low;
}
}
}
_
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH] 2/4: highmem watermarks
2004-08-06 4:57 [PATCH] 1/4: rework alloc_pages Nick Piggin
@ 2004-08-06 4:57 ` Nick Piggin
2004-08-06 5:03 ` [PATCH] 3/4: writeout watermarks Nick Piggin
2004-08-06 5:12 ` [PATCH] 1/4: rework alloc_pages Nick Piggin
2004-08-06 5:19 ` Andrew Morton
2 siblings, 1 reply; 17+ messages in thread
From: Nick Piggin @ 2004-08-06 4:57 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Memory Management
[-- Attachment #1: Type: text/plain, Size: 4 bytes --]
2/4
[-- Attachment #2: vm-highmem-watermarks.patch --]
[-- Type: text/x-patch, Size: 2335 bytes --]
The pages_high - pages_low and pages_low - pages_min deltas are the asynch
reclaim watermarks. As such, they should be in the same ratios as any other
zone for highmem zones. It is the pages_min - 0 delta which is the PF_MEMALLOC
reserve, and this is the region that isn't very useful for highmem.
This patch ensures highmem systems have similar characteristics as non highmem
ones with the same amount of memory, and also that highmem zones get similar
reclaim pressures to other zones.
Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
---
linux-2.6-npiggin/mm/page_alloc.c | 23 ++++++++++++++---------
1 files changed, 14 insertions(+), 9 deletions(-)
diff -puN mm/page_alloc.c~vm-highmem-watermarks mm/page_alloc.c
--- linux-2.6/mm/page_alloc.c~vm-highmem-watermarks 2004-08-06 14:44:29.000000000 +1000
+++ linux-2.6-npiggin/mm/page_alloc.c 2004-08-06 14:44:29.000000000 +1000
@@ -1882,13 +1882,18 @@ static void setup_per_zone_pages_min(voi
}
for_each_zone(zone) {
+ unsigned long tmp;
spin_lock_irqsave(&zone->lru_lock, flags);
+ tmp = (pages_min * zone->present_pages) / lowmem_pages;
if (is_highmem(zone)) {
/*
- * Often, highmem doesn't need to reserve any pages.
- * But the pages_min/low/high values are also used for
- * batching up page reclaim activity so we need a
- * decent value here.
+ * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+ * need highmem pages, so cap pages_min to a small
+ * value here.
+ *
+ * The (pages_high-pages_low) and (pages_low-pages_min)
* deltas control asynch page reclaim, and so should
+ * not be capped for highmem.
*/
int min_pages;
@@ -1899,15 +1904,15 @@ static void setup_per_zone_pages_min(voi
min_pages = 128;
zone->pages_min = min_pages;
} else {
- /* if it's a lowmem zone, reserve a number of pages
+ /*
+ * If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->pages_min = (pages_min * zone->present_pages) /
- lowmem_pages;
+ zone->pages_min = tmp;
}
- zone->pages_low = zone->pages_min * 2;
- zone->pages_high = zone->pages_min * 3;
+ zone->pages_low = zone->pages_min + tmp;
+ zone->pages_high = zone->pages_low + tmp;
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
}
_
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH] 3/4: writeout watermarks
2004-08-06 4:57 ` [PATCH] 2/4: highmem watermarks Nick Piggin
@ 2004-08-06 5:03 ` Nick Piggin
2004-08-06 5:04 ` [PATCH] 4/4: incremental min aware kswapd Nick Piggin
2004-08-06 5:27 ` [PATCH] 3/4: writeout watermarks Andrew Morton
0 siblings, 2 replies; 17+ messages in thread
From: Nick Piggin @ 2004-08-06 5:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Memory Management
[-- Attachment #1: Type: text/plain, Size: 221 bytes --]
3/4
3rd attempt for this patch ;)
I have since addressed your concerns.
So for example, with a 10/40 async/sync ratio, if the sync
watermark is moved down to 20, the async mark will be moved
to 5, preserving the ratio.
[-- Attachment #2: vm-tune-writeout.patch --]
[-- Type: text/x-patch, Size: 1208 bytes --]
Slightly change the writeout watermark calculations so we keep background
and synchronous writeout watermarks in the same ratios after adjusting them.
This ensures we should always attempt to start background writeout before
synchronous writeout.
Signed-off-by: Nick Piggin <nickpiggin@cyberone.com.au>
---
linux-2.6-npiggin/mm/page-writeback.c | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)
diff -puN mm/page-writeback.c~vm-tune-writeout mm/page-writeback.c
--- linux-2.6/mm/page-writeback.c~vm-tune-writeout 2004-08-06 14:48:45.000000000 +1000
+++ linux-2.6-npiggin/mm/page-writeback.c 2004-08-06 14:48:45.000000000 +1000
@@ -153,9 +153,11 @@ get_dirty_limits(struct writeback_state
if (dirty_ratio < 5)
dirty_ratio = 5;
- background_ratio = dirty_background_ratio;
- if (background_ratio >= dirty_ratio)
- background_ratio = dirty_ratio / 2;
+ /*
+ * Keep the ratio between dirty_ratio and background_ratio roughly
+ * what the sysctls are after dirty_ratio has been scaled (above).
+ */
+ background_ratio = dirty_background_ratio * dirty_ratio/vm_dirty_ratio;
background = (background_ratio * total_pages) / 100;
dirty = (dirty_ratio * total_pages) / 100;
_
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH] 4/4: incremental min aware kswapd
2004-08-06 5:03 ` [PATCH] 3/4: writeout watermarks Nick Piggin
@ 2004-08-06 5:04 ` Nick Piggin
2004-08-06 5:27 ` [PATCH] 3/4: writeout watermarks Andrew Morton
1 sibling, 0 replies; 17+ messages in thread
From: Nick Piggin @ 2004-08-06 5:04 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Memory Management, Shantanu Goel
[-- Attachment #1: Type: text/plain, Size: 4 bytes --]
4/4
[-- Attachment #2: vm-kswapd-incmin.patch --]
[-- Type: text/x-patch, Size: 4548 bytes --]
Explicitly teach kswapd about the incremental min logic instead of just scanning
all zones under the first low zone. This should keep more even pressure applied
on the zones.
Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
---
linux-2.6-npiggin/mm/vmscan.c | 98 +++++++++++++++++-------------------------
1 files changed, 40 insertions(+), 58 deletions(-)
diff -puN mm/vmscan.c~vm-kswapd-incmin mm/vmscan.c
--- linux-2.6/mm/vmscan.c~vm-kswapd-incmin 2004-08-06 14:49:48.000000000 +1000
+++ linux-2.6-npiggin/mm/vmscan.c 2004-08-06 14:49:48.000000000 +1000
@@ -1011,80 +1011,63 @@ static int balance_pgdat(pg_data_t *pgda
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int all_zones_ok = 1;
- int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
+ int first_low = 0;
- if (nr_pages == 0) {
- /*
- * Scan in the highmem->dma direction for the highest
- * zone which needs scanning
- */
- for (i = pgdat->nr_zones - 1; i >= 0; i--) {
- struct zone *zone = pgdat->node_zones + i;
+ sc.nr_scanned = 0;
+ sc.nr_reclaimed = 0;
- if (zone->all_unreclaimable &&
- priority != DEF_PRIORITY)
- continue;
-
- if (zone->free_pages <= zone->pages_high) {
- end_zone = i;
- goto scan;
- }
- }
- goto out;
- } else {
- end_zone = pgdat->nr_zones - 1;
- }
-scan:
- for (i = 0; i <= end_zone; i++) {
+ /* Scan in the highmem->dma direction */
+ for (i = pgdat->nr_zones - 1; i >= 0; i--) {
struct zone *zone = pgdat->node_zones + i;
- lru_pages += zone->nr_active + zone->nr_inactive;
- }
+ if (nr_pages == 0) { /* Not software suspend */
+ unsigned long pgfree = zone->free_pages;
+ unsigned long pghigh = zone->pages_high;
- /*
- * Now scan the zone in the dma->highmem direction, stopping
- * at the last zone which needs scanning.
- *
- * We do this because the page allocator works in the opposite
- * direction. This prevents the page allocator from allocating
- * pages behind kswapd's direction of progress, which would
- * cause too much scanning of the lower zones.
- */
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
+ /*
+ * This satisfies the "incremental min" or
+ * lower zone protection logic in the allocator
+ */
+ if (first_low > i)
+ pghigh += zone->protection[first_low];
+ if (pgfree >= pghigh)
+ continue;
+ if (first_low < i)
+ first_low = i;
+
+ all_zones_ok = 0;
+ sc.nr_to_reclaim = pghigh - pgfree;
+ } else
+ sc.nr_to_reclaim = INT_MAX; /* Software susp */
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
-
- if (nr_pages == 0) { /* Not software suspend */
- if (zone->free_pages <= zone->pages_high)
- all_zones_ok = 0;
- }
zone->temp_priority = priority;
if (zone->prev_priority > priority)
zone->prev_priority = priority;
- sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
sc.priority = priority;
+ lru_pages += zone->nr_active + zone->nr_inactive;
shrink_zone(zone, &sc);
- reclaim_state->reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- total_reclaimed += sc.nr_reclaimed;
- if (zone->all_unreclaimable)
- continue;
if (zone->pages_scanned > zone->present_pages * 2)
zone->all_unreclaimable = 1;
- /*
- * If we've done a decent amount of scanning and
- * the reclaim ratio is low, start doing writepage
- * even in laptop mode
- */
- if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
- total_scanned > total_reclaimed+total_reclaimed/2)
- sc.may_writepage = 1;
}
+ reclaim_state->reclaimed_slab = 0;
+ shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+
+ total_reclaimed += sc.nr_reclaimed;
+ total_scanned += sc.nr_scanned;
+
+ /*
+ * If we've done a decent amount of scanning and
+ * the reclaim ratio is low, start doing writepage
+ * even in laptop mode
+ */
+ if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+ total_scanned > total_reclaimed+total_reclaimed/2)
+ sc.may_writepage = 1;
+
if (nr_pages && to_free > total_reclaimed)
continue; /* swsusp: need to do more work */
if (all_zones_ok)
@@ -1096,7 +1079,6 @@ scan:
if (total_scanned && priority < DEF_PRIORITY - 2)
blk_congestion_wait(WRITE, HZ/10);
}
-out:
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
_
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 1/4: rework alloc_pages
2004-08-06 4:57 [PATCH] 1/4: rework alloc_pages Nick Piggin
2004-08-06 4:57 ` [PATCH] 2/4: highmem watermarks Nick Piggin
@ 2004-08-06 5:12 ` Nick Piggin
2004-08-06 5:19 ` Andrew Morton
2 siblings, 0 replies; 17+ messages in thread
From: Nick Piggin @ 2004-08-06 5:12 UTC (permalink / raw)
To: Nick Piggin; +Cc: Andrew Morton, Linux Memory Management
Nick Piggin wrote:
> Here are a few of the more harmless mm patches I have been sitting on
> for a while. They've had some testing in my tree (which does get used
> by a handful of people).
>
>
> ------------------------------------------------------------------------
>
>
> This reworks alloc_pages a bit.
>
> Previously the ->protection[] logic was broken. It was difficult to follow
> and basically didn't use the asynch reclaim watermarks properly.
>
> This one uses ->protection only for lower-zone protection, and gives the
> allocator flexibility to add the watermarks as desired.
>
Note that this patch strictly enforces the lower zone protection (which is
currently disabled anyway) instead of allowing GFP_ATOMIC allocations to
get at them.
It also does a few minor things like not taking rt_task into account during
the first loop (because the kswapd reclaim watermarks shouldn't depend on
that), and also only checking rt_task if !in_interrupt.
The biggest thing it does is use the kswapd watermarks correctly - and I've
generally observed lower allocstall, and kswapd being more productive.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 1/4: rework alloc_pages
2004-08-06 4:57 [PATCH] 1/4: rework alloc_pages Nick Piggin
2004-08-06 4:57 ` [PATCH] 2/4: highmem watermarks Nick Piggin
2004-08-06 5:12 ` [PATCH] 1/4: rework alloc_pages Nick Piggin
@ 2004-08-06 5:19 ` Andrew Morton
2004-08-06 5:29 ` Nick Piggin
2 siblings, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2004-08-06 5:19 UTC (permalink / raw)
To: Nick Piggin; +Cc: linux-mm
Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> Previously the ->protection[] logic was broken. It was difficult to follow
> and basically didn't use the asynch reclaim watermarks properly.
eh?
Broken how?
What is an "asynch reclaim watermark"?
> This one uses ->protection only for lower-zone protection, and gives the
> allocator flexibility to add the watermarks as desired.
eh?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 3/4: writeout watermarks
2004-08-06 5:03 ` [PATCH] 3/4: writeout watermarks Nick Piggin
2004-08-06 5:04 ` [PATCH] 4/4: incremental min aware kswapd Nick Piggin
@ 2004-08-06 5:27 ` Andrew Morton
2004-08-06 5:34 ` Nick Piggin
1 sibling, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2004-08-06 5:27 UTC (permalink / raw)
To: Nick Piggin; +Cc: linux-mm
Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> 3rd attempt for this patch ;)
> I have since addressed your concerns.
>
> So for example, with a 10/40 async/sync ratio, if the sync
> watermark is moved down to 20, the async mark will be moved
> to 5, preserving the ratio.
Disagree.
>
> [vm-tune-writeout.patch text/x-patch (1365 bytes)]
>
> Slightly change the writeout watermark calculations so we keep background
> and synchronous writeout watermarks in the same ratios after adjusting them.
> This ensures we should always attempt to start background writeout before
> synchronous writeout.
>
> Signed-off-by: Nick Piggin <nickpiggin@cyberone.com.au>
>
>
> ---
>
> linux-2.6-npiggin/mm/page-writeback.c | 8 +++++---
> 1 files changed, 5 insertions(+), 3 deletions(-)
>
> diff -puN mm/page-writeback.c~vm-tune-writeout mm/page-writeback.c
> --- linux-2.6/mm/page-writeback.c~vm-tune-writeout 2004-08-06 14:48:45.000000000 +1000
> +++ linux-2.6-npiggin/mm/page-writeback.c 2004-08-06 14:48:45.000000000 +1000
> @@ -153,9 +153,11 @@ get_dirty_limits(struct writeback_state
> if (dirty_ratio < 5)
> dirty_ratio = 5;
>
> - background_ratio = dirty_background_ratio;
> - if (background_ratio >= dirty_ratio)
> - background_ratio = dirty_ratio / 2;
> + /*
> + * Keep the ratio between dirty_ratio and background_ratio roughly
> + * what the sysctls are after dirty_ratio has been scaled (above).
> + */
> + background_ratio = dirty_background_ratio * dirty_ratio/vm_dirty_ratio;
>
> background = (background_ratio * total_pages) / 100;
> dirty = (dirty_ratio * total_pages) / 100;
Look, these are sysadmin-settable sysctls. The admin can set them to
whatever wild and whacky values he wants - it's his computer.
The only reason the check is there at all is because background_ratio >
dirty_ratio has never been even tested, and could explode, and I don't want
to have to test and support it. Plus if the admin is in the process of
setting both tunables there might be a transient period of time when
they're in a bad state.
That's all! Please, just pretend the code isn't there at all. What the
admin sets, the admin gets, end of story.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 1/4: rework alloc_pages
2004-08-06 5:19 ` Andrew Morton
@ 2004-08-06 5:29 ` Nick Piggin
2004-08-06 5:37 ` Andrew Morton
0 siblings, 1 reply; 17+ messages in thread
From: Nick Piggin @ 2004-08-06 5:29 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-mm
Andrew Morton wrote:
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
>>Previously the ->protection[] logic was broken. It was difficult to follow
>> and basically didn't use the asynch reclaim watermarks properly.
>
>
> eh?
>
> Broken how?
>
min = (1<<order) + z->protection[alloc_type];
This value is used both as the condition for waking kswapd, and
whether or not to enter synch reclaim.
What should happen is kswapd gets woken at pages_low, and synch
reclaim is started at pages_min.
> What is an "asynch reclaim watermark"?
>
pages_low and pages_high.
>
>> This one uses ->protection only for lower-zone protection, and gives the
>> allocator flexibility to add the watermarks as desired.
>
>
> eh?
>
pages_low + protection and pages_min + protection, etc.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 3/4: writeout watermarks
2004-08-06 5:27 ` [PATCH] 3/4: writeout watermarks Andrew Morton
@ 2004-08-06 5:34 ` Nick Piggin
2004-08-06 5:49 ` Andrew Morton
0 siblings, 1 reply; 17+ messages in thread
From: Nick Piggin @ 2004-08-06 5:34 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-mm
Andrew Morton wrote:
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>> background = (background_ratio * total_pages) / 100;
>> dirty = (dirty_ratio * total_pages) / 100;
>
>
> Look, these are sysadmin-settable sysctls. The admin can set them to
> whatever wild and whacky values he wants - it's his computer.
Yes I know. That was the problem with my earlier patches.
>
> The only reason the check is there at all is because background_ratio >
> dirty_ratio has never been even tested, and could explode, and I don't want
> to have to test and support it. Plus if the admin is in the process of
> setting both tunables there might be a transient period of time when
> they're in a bad state.
>
> That's all! Please, just pretend the code isn't there at all. What the
> admin sets, the admin gets, end of story.
>
No, it is not that code I am worried about, you're actually doing
this too (disregarding the admin's wishes):
dirty_ratio = vm_dirty_ratio;
if (dirty_ratio > unmapped_ratio / 2)
dirty_ratio = unmapped_ratio / 2;
if (dirty_ratio < 5)
dirty_ratio = 5;
So if the admin wants a dirty_ratio of 40 and dirty_background_ratio of 10
then that's good, but I'm sure if they knew you're moving dirty_ratio to 10
here, they'd want something like 2 for the dirty_background_ratio.
I contend that the ratio between these two values is more important than
their absolute values -- especially considering one gets twiddled here.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 1/4: rework alloc_pages
2004-08-06 5:29 ` Nick Piggin
@ 2004-08-06 5:37 ` Andrew Morton
2004-08-06 6:05 ` Nick Piggin
0 siblings, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2004-08-06 5:37 UTC (permalink / raw)
To: Nick Piggin; +Cc: linux-mm
Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> Andrew Morton wrote:
> > Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> >
> >>Previously the ->protection[] logic was broken. It was difficult to follow
> >> and basically didn't use the asynch reclaim watermarks properly.
> >
> >
> > eh?
> >
> > Broken how?
> >
>
> min = (1<<order) + z->protection[alloc_type];
>
> This value is used both as the condition for waking kswapd, and
> whether or not to enter synch reclaim.
>
> What should happen is kswapd gets woken at pages_low, and synch
> reclaim is started at pages_min.
Are you aware of this:
void wakeup_kswapd(struct zone *zone)
{
if (zone->free_pages > zone->pages_low)
return;
?
>
> pages_low + protection and pages_min + protection, etc.
Nick, sorry, but I shouldn't have to expend these many braincells
decrypting your work. Please: much better explanations, more testing
results. This stuff is fiddly, sensitive and has a habit of blowing up in
our faces weeks later. We need to be cautious. The barriers are higher
nowadays.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 3/4: writeout watermarks
2004-08-06 5:34 ` Nick Piggin
@ 2004-08-06 5:49 ` Andrew Morton
2004-08-06 6:13 ` Nick Piggin
0 siblings, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2004-08-06 5:49 UTC (permalink / raw)
To: Nick Piggin; +Cc: linux-mm
Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> No, it is not that code I am worried about, you're actually doing
> this too (disregarding the admin's wishes):
>
> dirty_ratio = vm_dirty_ratio;
> if (dirty_ratio > unmapped_ratio / 2)
> dirty_ratio = unmapped_ratio / 2;
>
> if (dirty_ratio < 5)
> dirty_ratio = 5;
>
hm, OK, that's some "try to avoid writeback off the LRU" stuff.
But you said "This ensures we should always attempt to start background
writeout before synchronous writeout.". Does not the current code do that?
> So if the admin wants a dirty_ratio of 40 and dirty_background_ratio of 10
> then that's good, but I'm sure if they knew you're moving dirty_ratio to 10
> here, they'd want something like 2 for the dirty_background_ratio.
>
> I contend that the ratio between these two values is more important than
> their absolute values -- especially considering one gets twiddled here.
Maybe true, maybe false. These things are demonstrable via testing, no?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 1/4: rework alloc_pages
2004-08-06 5:37 ` Andrew Morton
@ 2004-08-06 6:05 ` Nick Piggin
2004-08-06 6:17 ` Andrew Morton
2004-08-06 12:01 ` Rik van Riel
0 siblings, 2 replies; 17+ messages in thread
From: Nick Piggin @ 2004-08-06 6:05 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-mm
Andrew Morton wrote:
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
>>Andrew Morton wrote:
>>
>>>Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>>>
>>>
>>>>Previously the ->protection[] logic was broken. It was difficult to follow
>>>>and basically didn't use the asynch reclaim watermarks properly.
>>>
>>>
>>>eh?
>>>
>>>Broken how?
>>>
>>
>>min = (1<<order) + z->protection[alloc_type];
>>
>>This value is used both as the condition for waking kswapd, and
>>whether or not to enter synch reclaim.
>>
>>What should happen is kswapd gets woken at pages_low, and synch
>>reclaim is started at pages_min.
>
>
> Are you aware of this:
>
> void wakeup_kswapd(struct zone *zone)
> {
> if (zone->free_pages > zone->pages_low)
> return;
>
> ?
>
Err, yes?
>
>>pages_low + protection and pages_min + protection, etc.
>
>
> Nick, sorry, but I shouldn't have to expend these many braincells
> decrypting your work. Please: much better explanations, more testing
> results. This stuff is fiddly, sensitive and has a habit of blowing up in
> our faces weeks later. We need to be cautious. The barriers are higher
> nowadays.
>
>
OK previously, in a nutshell:
for_each_zone(z) {
if (z->free_pages < z->protection)
continue;
else
goto got_pg;
}
for_each_zone(z)
wakeup_kswapd(z);
for_each_zone(z) {
if (z->free_pages < z->protection)
continue;
else
goto got_pg;
}
try_to_free_pages();
try again;
After my patch:
for_each_zone(z) {
if (z->free_pages < z->pages_low + z->protection)
continue;
else
goto got_pg;
}
for_each_zone(z)
wakeup_kswapd(z);
for_each_zone(z) {
if (z->free_pages < z->pages_min + z->protection)
continue;
else
goto got_pg;
}
try_to_free_pages();
try again;
Ie, we have the (pages_low - pages_min) buffer after waking kswapd
before entering synch reclaim. Previously there was no buffer. I thought
this was the point of background reclaim. I don't know if I can explain
it any better than that sorry.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 3/4: writeout watermarks
2004-08-06 5:49 ` Andrew Morton
@ 2004-08-06 6:13 ` Nick Piggin
2004-08-06 6:19 ` Andrew Morton
0 siblings, 1 reply; 17+ messages in thread
From: Nick Piggin @ 2004-08-06 6:13 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-mm
Andrew Morton wrote:
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
>>No, it is not that code I am worried about, you're actually doing
>> this too (disregarding the admin's wishes):
>>
>> dirty_ratio = vm_dirty_ratio;
>> if (dirty_ratio > unmapped_ratio / 2)
>> dirty_ratio = unmapped_ratio / 2;
>>
>> if (dirty_ratio < 5)
>> dirty_ratio = 5;
>>
>
>
> hm, OK, that's some "try to avoid writeback off the LRU" stuff.
>
Yep
> But you said "This ensures we should always attempt to start background
> writeout before synchronous writeout.". Does not the current code do that?
>
Basically what the above code does is scale the dirty_ratio with the
amount of unmapped pages, however it doesn't also scale the
dirty_background_ratio (it does after my patch).
So it isn't difficult to imagine this causing dirty_ratio to become
very close to dirty_background_ratio.
The crude check prevents the values from becoming exactly equal.
if (background_ratio >= dirty_ratio)
background_ratio = dirty_ratio / 2;
>
>> So if the admin wants a dirty_ratio of 40 and dirty_background_ratio of 10
>> then that's good, but I'm sure if they knew you're moving dirty_ratio to 10
>> here, they'd want something like 2 for the dirty_background_ratio.
>>
>> I contend that the ratio between these two values is more important than
>> their absolue values -- especially considering one gets twiddled here.
>
>
> Maybe true, maybe false. These things are demonstrable via testing, no?
>
>
Might be, I don't know how. Seemed straightforward (to me).
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 1/4: rework alloc_pages
2004-08-06 6:05 ` Nick Piggin
@ 2004-08-06 6:17 ` Andrew Morton
2004-08-06 12:01 ` Rik van Riel
1 sibling, 0 replies; 17+ messages in thread
From: Andrew Morton @ 2004-08-06 6:17 UTC (permalink / raw)
To: Nick Piggin; +Cc: linux-mm
Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> Ie, we have the (pages_low - pages_min) buffer after waking kswapd
> before entering synch reclaim. Previously there was no buffer. I thought
> this was the point of background reclaim. I don't know if I can explain
> it any better than that sorry.
Yes, that is the point. I was wondering yesterday why pages_min was no
longer used for anything. We must have screwed things up when
doing the lower zone protection stuff for NUMA. Bugger.
Wanna send that patch again, with a fit-for-human-consumption
description?
Thanks.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:aart@kvack.org">aart@kvack.org</a>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 3/4: writeout watermarks
2004-08-06 6:13 ` Nick Piggin
@ 2004-08-06 6:19 ` Andrew Morton
2004-08-06 6:36 ` Nick Piggin
0 siblings, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2004-08-06 6:19 UTC (permalink / raw)
To: Nick Piggin; +Cc: linux-mm
Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> Basically what the above code does is scale the dirty_ratio with the
> amount of unmapped pages; however, it doesn't also scale the
> dirty_background_ratio (it does after my patch).
OK, that makes sense.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:aart@kvack.org">aart@kvack.org</a>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 3/4: writeout watermarks
2004-08-06 6:19 ` Andrew Morton
@ 2004-08-06 6:36 ` Nick Piggin
0 siblings, 0 replies; 17+ messages in thread
From: Nick Piggin @ 2004-08-06 6:36 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-mm
Andrew Morton wrote:
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
>> Basically what the above code does is scale the dirty_ratio with the
>> amount of unmapped pages; however, it doesn't also scale the
>> dirty_background_ratio (it does after my patch).
>
>
> OK, that makes sense.
>
Ah good - I couldn't understand why you didn't like it, but in retrospect
I wasn't being very clear about what it was supposed to do.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:aart@kvack.org">aart@kvack.org</a>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] 1/4: rework alloc_pages
2004-08-06 6:05 ` Nick Piggin
2004-08-06 6:17 ` Andrew Morton
@ 2004-08-06 12:01 ` Rik van Riel
1 sibling, 0 replies; 17+ messages in thread
From: Rik van Riel @ 2004-08-06 12:01 UTC (permalink / raw)
To: Nick Piggin; +Cc: Andrew Morton, linux-mm
On Fri, 6 Aug 2004, Nick Piggin wrote:
> for_each_zone(z) {
> if (z->free_pages < z->pages_low + z->protection)
> continue;
> else
> goto got_pg;
> }
>
> for_each_zone(z)
> wakeup_kswapd(z);
Note that since kswapd does NOT take z->protection into account,
you could end up doing too much asynchronous page recycling from
the highmem zone while having stale pages sitting around in the
normal zone.
As long as we have the lowmem protection switched off by default
we should be fine, though. Either that, or wakeup_kswapd should
tell kswapd what the threshold is ...
--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:aart@kvack.org">aart@kvack.org</a>
^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2004-08-06 12:01 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-08-06 4:57 [PATCH] 1/4: rework alloc_pages Nick Piggin
2004-08-06 4:57 ` [PATCH] 2/4: highmem watermarks Nick Piggin
2004-08-06 5:03 ` [PATCH] 3/4: writeout watermarks Nick Piggin
2004-08-06 5:04 ` [PATCH] 4/4: incremental min aware kswapd Nick Piggin
2004-08-06 5:27 ` [PATCH] 3/4: writeout watermarks Andrew Morton
2004-08-06 5:34 ` Nick Piggin
2004-08-06 5:49 ` Andrew Morton
2004-08-06 6:13 ` Nick Piggin
2004-08-06 6:19 ` Andrew Morton
2004-08-06 6:36 ` Nick Piggin
2004-08-06 5:12 ` [PATCH] 1/4: rework alloc_pages Nick Piggin
2004-08-06 5:19 ` Andrew Morton
2004-08-06 5:29 ` Nick Piggin
2004-08-06 5:37 ` Andrew Morton
2004-08-06 6:05 ` Nick Piggin
2004-08-06 6:17 ` Andrew Morton
2004-08-06 12:01 ` Rik van Riel
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox