[RFC] 2.3.39 zone balancing

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [RFC] 2.3.39 zone balancing
@ 2000-01-12 21:11 Kanoj Sarcar
  2000-01-13 13:40 ` Rik van Riel
  2000-01-13 17:12 ` Andrea Arcangeli
  0 siblings, 2 replies; 40+ messages in thread
From: Kanoj Sarcar @ 2000-01-12 21:11 UTC (permalink / raw)
  To: torvalds, mingo, andrea, alan; +Cc: Kanoj Sarcar, linux-mm, linux-kernel

Folks,

This is a note and a patch about memory balancing. Please read
the new file Documentation/vm/balance in the patch that explains
the logic behind the patch. 

Comments and feedback welcome. Thanks.

Kanoj

--- mm/page_alloc.c	Tue Jan 11 11:00:31 2000
+++ mm/page_alloc.c	Tue Jan 11 23:59:35 2000
@@ -6,6 +6,7 @@
  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
  */
 
 #include <linux/config.h>
@@ -197,11 +198,25 @@
 #define ZONE_BALANCED(zone) \
 	(((zone)->free_pages > (zone)->pages_low) && (!(zone)->low_on_memory))
 
+static inline unsigned long classfree(zone_t *zone)
+{
+	unsigned long free = 0;
+	zone_t *z = zone->zone_pgdat->node_zones;
+
+	while (z != zone) {
+		free += z->free_pages;
+		z++;
+	}
+	free += zone->free_pages;
+	return(free);
+}
+
 static inline int zone_balance_memory (zone_t *zone, int gfp_mask)
 {
 	int freed;
+	unsigned long free = classfree(zone);
 
-	if (zone->free_pages >= zone->pages_low) {
+	if (free >= zone->pages_low) {
 		if (!zone->low_on_memory)
 			return 1;
 		/*
@@ -208,7 +223,7 @@
 		 * Simple hysteresis: exit 'low memory mode' if
 		 * the upper limit has been reached:
 		 */
-		if (zone->free_pages >= zone->pages_high) {
+		if (free >= zone->pages_high) {
 			zone->low_on_memory = 0;
 			return 1;
 		}
@@ -220,12 +235,7 @@
 	 * state machine, but do not try to free pages
 	 * ourselves.
 	 */
-	if (!(gfp_mask & __GFP_WAIT))
-		return 1;
-
-	current->flags |= PF_MEMALLOC;
 	freed = try_to_free_pages(gfp_mask, zone);
-	current->flags &= ~PF_MEMALLOC;
 
 	if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
 		return 0;
@@ -232,6 +242,7 @@
 	return 1;
 }
 
+#if 0
 /*
  * We are still balancing memory in a global way:
  */
@@ -260,17 +271,13 @@
 	 * state machine, but do not try to free pages
 	 * ourselves.
 	 */
-	if (!(gfp_mask & __GFP_WAIT))
-		return 1;
-
-	current->flags |= PF_MEMALLOC;
 	freed = try_to_free_pages(gfp_mask, zone);
-	current->flags &= ~PF_MEMALLOC;
 
 	if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
 		return 0;
 	return 1;
 }
+#endif
 
 /*
  * This is the 'heart' of the zoned buddy allocator:
@@ -340,7 +347,7 @@
  * The main chunk of the balancing code is in this offline branch:
  */
 balance:
-	if (!balance_memory(z, gfp_mask))
+	if (!zone_balance_memory(z, gfp_mask))
 		goto nopage;
 	goto ready;
 }
@@ -513,6 +520,7 @@
 	unsigned long i, j;
 	unsigned long map_size;
 	unsigned int totalpages, offset;
+	unsigned int cumulative = 0;
 
 	totalpages = 0;
 	for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -565,7 +573,7 @@
 	offset = lmem_map - mem_map;	
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		zone_t *zone = pgdat->node_zones + j;
-		unsigned long mask = -1;
+		unsigned long mask;
 		unsigned long size;
 
 		size = zones_size[j];
@@ -579,13 +587,11 @@
 			continue;
 
 		zone->offset = offset;
-		/*
-		 * It's unnecessery to balance the high memory zone
-		 */
-		if (j != ZONE_HIGHMEM) {
-			zone->pages_low = freepages.low;
-			zone->pages_high = freepages.high;
-		}
+		cumulative += size;
+		mask = (cumulative >> 7);
+		if (mask < 1) mask = 1;
+		zone->pages_low = mask*2;
+		zone->pages_high = mask*3;
 		zone->low_on_memory = 0;
 
 		for (i = 0; i < size; i++) {
@@ -598,6 +604,7 @@
 		}
 
 		offset += size;
+		mask = -1;
 		for (i = 0; i < MAX_ORDER; i++) {
 			unsigned long bitmap_size;
 
--- mm/vmscan.c	Tue Jan 11 11:00:31 2000
+++ mm/vmscan.c	Tue Jan 11 23:29:41 2000
@@ -534,8 +534,11 @@
 	int retval = 1;
 
 	wake_up_process(kswapd_process);
-	if (gfp_mask & __GFP_WAIT)
+	if (gfp_mask & __GFP_WAIT) {
+		current->flags |= PF_MEMALLOC;
 		retval = do_try_to_free_pages(gfp_mask, zone);
+		current->flags &= ~PF_MEMALLOC;
+	}
 	return retval;
 }
 
--- Documentation/vm/balance	Wed Jan 12 13:05:36 2000
+++ Documentation/vm/balance	Wed Jan 12 13:05:29 2000
@@ -0,0 +1,60 @@
+Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
+
+Memory balancing is _only_ needed for non __GFP_WAIT allocations.
+
+There are two reasons to be requesting non __GFP_WAIT allocations:
+the caller can not sleep (typically intr context), or does not want
+to incur cost overheads of page stealing and possible swap io.
+
+In the absence of non sleepable allocation requests, it seems detrimental
+to be doing balancing. Page reclamation can be kicked off lazily, that
+is, only when needed (aka zone free memory is 0), instead of making it
+a proactive process.
+
+That being said, the kernel should try to fulfill requests for direct
+mapped pages from the direct mapped pool, instead of falling back on
+the dma pool, so as to keep the dma pool filled for dma requests (atomic
+or not). A similar argument applies to highmem and direct mapped pages.
+OTOH, if there is a lot of free dma pages, it is preferable to satisfy
+regular memory requests by allocating one from the dma pool, instead
+of incurring the overhead of regular zone balancing.
+
+In 2.2, memory balancing/page reclamation would kick off only when the
+_total_ number of free pages fell below 1/64th of total memory. With the
+right ratio of dma and regular memory, it is quite possible that balancing
+would not be done even when the dma zone was completely empty. 2.2 has
+been running production machines of varying memory sizes, and seems to be
+doing fine even with the presence of this problem. In 2.3, due to
+HIGHMEM, this problem is aggravated.
+
+In 2.3, zone balancing can be done in one of two ways: depending on the
+zone size (and possibly on the size of lower class zones), we can decide
+at init time how many free pages we should aim for while balancing any
+zone. The good part is, while balancing, we do not need to look at sizes
+of lower class zones, the bad part is, we might do too frequent balancing
+due to ignoring possibly lower usage in the lower class zones. Also,
+with a slight change in the allocation routine, it is possible to reduce
+the memclass() macro to be a simple equality.
+
+Another possible solution is that we balance only when the free memory
+of a zone _and_ all its lower class zones falls below 1/64th of the
+total memory in the zone and its lower class zones. This fixes the 2.2
+balancing problem, and stays as close to 2.2 behavior as possible. Also,
+the balancing algorithm works the same way on the various architectures,
+which have different numbers and types of zones. If we wanted to get
+fancy, we could assign different weights to free pages in different
+zones in the future.
+
+Note that if the size of the regular zone is huge compared to dma zone,
+it becomes less significant to consider the free dma pages while
+deciding whether to balance the regular zone. The first solution
+becomes more attractive then.
+
+The appended patch implements the second solution. It also "fixes" two
+problems: first, kswapd is woken up as in 2.2 on low memory conditions
+for non-sleepable allocations. Second, the HIGHMEM zone is also balanced,
+so as to give a fighting chance for replace_with_highmem() to get a
+HIGHMEM page, as well as to ensure that HIGHMEM allocations do not
+fall back into regular zone. This also makes sure that HIGHMEM pages
+are not leaked (for example, in situations where a HIGHMEM page is in
+the swapcache but is not being used by anyone).
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-12 21:11 [RFC] 2.3.39 zone balancing Kanoj Sarcar
@ 2000-01-13 13:40 ` Rik van Riel
  2000-01-13 17:06   ` Andrea Arcangeli
                     ` (2 more replies)
  2000-01-13 17:12 ` Andrea Arcangeli
  1 sibling, 3 replies; 40+ messages in thread
From: Rik van Riel @ 2000-01-13 13:40 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: torvalds, mingo, andrea, alan, linux-mm, linux-kernel

On Wed, 12 Jan 2000, Kanoj Sarcar wrote:

> --- mm/page_alloc.c	Tue Jan 11 11:00:31 2000
> +++ mm/page_alloc.c	Tue Jan 11 23:59:35 2000
> +		cumulative += size;
> +		mask = (cumulative >> 7);
> +		if (mask < 1) mask = 1;
> +		zone->pages_low = mask*2;
> +		zone->pages_high = mask*3;
>  		zone->low_on_memory = 0;

I think that busier machines probably have a larger need
for DMA memory than this code fragment will give us. I
have the gut feeling that we'll want to keep about 512kB
or more free in the lower 16MB of busy machines...

(if only because such a large amount of free pages in
such a small part of the address space will give us
higher-order free pages)

> --- mm/vmscan.c	Tue Jan 11 11:00:31 2000
> +++ mm/vmscan.c	Tue Jan 11 23:29:41 2000
> @@ -534,8 +534,11 @@
>  	int retval = 1;
>  
>  	wake_up_process(kswapd_process);
> -	if (gfp_mask & __GFP_WAIT)
> +	if (gfp_mask & __GFP_WAIT) {
> +		current->flags |= PF_MEMALLOC;
>  		retval = do_try_to_free_pages(gfp_mask, zone);
> +		current->flags &= ~PF_MEMALLOC;
> +	}
>  	return retval;
>  }

Please note that kswapd still exits when the total number
of free pages in the system is high enough. Balancing can
probably better be done in the background by kswapd than
by applications that happen to stumble across a nonbalanced
zone...

regards,

Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 13:40 ` Rik van Riel
@ 2000-01-13 17:06   ` Andrea Arcangeli
  2000-01-13 17:18   ` Alan Cox
  2000-01-13 18:52   ` Kanoj Sarcar
  2 siblings, 0 replies; 40+ messages in thread
From: Andrea Arcangeli @ 2000-01-13 17:06 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Kanoj Sarcar, torvalds, mingo, alan, linux-mm, linux-kernel

On Thu, 13 Jan 2000, Rik van Riel wrote:

>On Wed, 12 Jan 2000, Kanoj Sarcar wrote:
>
>> --- mm/page_alloc.c	Tue Jan 11 11:00:31 2000
>> +++ mm/page_alloc.c	Tue Jan 11 23:59:35 2000
>> +		cumulative += size;
>> +		mask = (cumulative >> 7);
>> +		if (mask < 1) mask = 1;
>> +		zone->pages_low = mask*2;
>> +		zone->pages_high = mask*3;
>>  		zone->low_on_memory = 0;
>
>I think that busier machines probably have a larger need
>for DMA memory than this code fragment will give us. I
>have the gut feeling that we'll want to keep about 512kB
>or more free in the lower 16MB of busy machines...
>
>(if only because such a large amount of free pages in
>such a small part of the address space will give us
>higher-order free pages)

That's only a workaround because the page-freeing mechanism is currently
not aware about fragmentation and about the order of the request we asked
for.

So such code shouldn't be written assuming the page-freeing is as weak as it is now.

Supposing it's smart we don't need to take lots of memory free in the dma
zone to allow high order allocations to succeed.

Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-12 21:11 [RFC] 2.3.39 zone balancing Kanoj Sarcar
  2000-01-13 13:40 ` Rik van Riel
@ 2000-01-13 17:12 ` Andrea Arcangeli
  2000-01-13 18:30   ` Kanoj Sarcar
  1 sibling, 1 reply; 40+ messages in thread
From: Andrea Arcangeli @ 2000-01-13 17:12 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Linus Torvalds, Alan Cox, linux-mm, linux-kernel

On Wed, 12 Jan 2000, Kanoj Sarcar wrote:

>+There are two reasons to be requesting non __GFP_WAIT allocations:
>+the caller can not sleep (typically intr context), or does not want
>+to incur cost overheads of page stealing and possible swap io.

You may be in a place where you can sleep but you can't do I/O to avoid
deadlocking and so you shouldn't use __GFP_IO and nothing more (it has
nothing to do with __GFP_WAIT).

But if it can sleep and there aren't deadlock conditions going on and it
doesn't use __GFP_WAIT, it means it's buggy and has to be fixed.

I have not read the rest and the patch yet (I'll continue ASAP).

Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 13:40 ` Rik van Riel
  2000-01-13 17:06   ` Andrea Arcangeli
@ 2000-01-13 17:18   ` Alan Cox
  2000-01-13 18:37     ` Rik van Riel
  2000-01-13 18:52   ` Kanoj Sarcar
  2 siblings, 1 reply; 40+ messages in thread
From: Alan Cox @ 2000-01-13 17:18 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Kanoj Sarcar, torvalds, mingo, andrea, alan, linux-mm, linux-kernel

> I think that busier machines probably have a larger need
> for DMA memory than this code fragment will give us. I
> have the gut feeling that we'll want to keep about 512kB
> or more free in the lower 16MB of busy machines...

2.2.x  uses a simple algorithm. Normally allocations come from the main pool
if it fails we use the DMA pool. That seems to work just fine.

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 17:12 ` Andrea Arcangeli
@ 2000-01-13 18:30   ` Kanoj Sarcar
  2000-01-13 19:22     ` Andrea Arcangeli
  0 siblings, 1 reply; 40+ messages in thread
From: Kanoj Sarcar @ 2000-01-13 18:30 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Linus Torvalds, Alan Cox, linux-mm, linux-kernel

> 
> On Wed, 12 Jan 2000, Kanoj Sarcar wrote:
> 
> >+There are two reasons to be requesting non __GFP_WAIT allocations:
> >+the caller can not sleep (typically intr context), or does not want
> >+to incur cost overheads of page stealing and possible swap io.
> 
> You may be in a place where you can sleep but you can't do I/O to avoid
> deadlocking and so you shouldn't use __GFP_IO and nothing more (it has
> nothing to do with __GFP_WAIT).

You are right: the documentation should read:

+Memory balancing is _only_ needed for non __GFP_WAIT and non __GFP_IO allocations.
+
+There are two reasons to be requesting non __GFP_WAIT allocations:
+the caller can not sleep (typically intr context), or does not want
+to incur cost overheads of page stealing and possible swap io.
+
+Non __GFP_IO allocations are requested to prevent filesystem deadlocks.

But I would not say __GFP_WAIT and __GFP_IO have no relationship. __GFP_IO
does not make sense if __GFP_WAIT is not set. 

> 
> But if it can sleep and there aren't deadlock conditons going on and it
> doesn't use __GFP_WAIT, it means it's buggy and has to be fixed.
> 

Well, I thought about that while coding the patch: you can not try to 
outsmart the programmer who writes that code. For example, I was 
looking at replace_with_highmem() which makes __GFP_HIGHMEM|__GFP_HIGH
requests, although I _think_ it can do __GFP_WAIT|__GFP_IO without
any problems. I just assumed that whoever coded it (you/Mingo?) had
some logic, like not wanting to waste time scanning for stealable pages
or incur disk swap to implement this performance optimization (that
would defeat the optimization).

Kanoj

> I have not read the rest and the patch yet (I'll continue ASAP).
> 
> Andrea
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.nl.linux.org/Linux-MM/
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 17:18   ` Alan Cox
@ 2000-01-13 18:37     ` Rik van Riel
  2000-01-13 20:13       ` Andrea Arcangeli
  0 siblings, 1 reply; 40+ messages in thread
From: Rik van Riel @ 2000-01-13 18:37 UTC (permalink / raw)
  To: Alan Cox; +Cc: Kanoj Sarcar, torvalds, mingo, andrea, linux-mm, linux-kernel

On Thu, 13 Jan 2000, Alan Cox wrote:

> > I think that busier machines probably have a larger need
> > for DMA memory than this code fragment will give us. I
> > have the gut feeling that we'll want to keep about 512kB
> > or more free in the lower 16MB of busy machines...
> 
> 2.2.x uses a simple algorithm. Normally allocations come from the
> main pool if it fails we use the DMA pool. That seems to work just
> fine.

Of course, I should have thought of that.

Our `high-to-low' allocation strategy should make
sure that the free pages `propagate down'...

Now we'll only want to build something into kswapd
so that rebalancing the high memory zones is done in
the background.

regards,

Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 13:40 ` Rik van Riel
  2000-01-13 17:06   ` Andrea Arcangeli
  2000-01-13 17:18   ` Alan Cox
@ 2000-01-13 18:52   ` Kanoj Sarcar
  2000-01-13 19:59     ` Andrea Arcangeli
  2 siblings, 1 reply; 40+ messages in thread
From: Kanoj Sarcar @ 2000-01-13 18:52 UTC (permalink / raw)
  To: Rik van Riel; +Cc: torvalds, mingo, andrea, alan, linux-mm, linux-kernel

> 
> On Wed, 12 Jan 2000, Kanoj Sarcar wrote:
> 
> > --- mm/page_alloc.c	Tue Jan 11 11:00:31 2000
> > +++ mm/page_alloc.c	Tue Jan 11 23:59:35 2000
> > +		cumulative += size;
> > +		mask = (cumulative >> 7);
> > +		if (mask < 1) mask = 1;
> > +		zone->pages_low = mask*2;
> > +		zone->pages_high = mask*3;
> >  		zone->low_on_memory = 0;
> 
> I think that busier machines probably have a larger need
> for DMA memory than this code fragment will give us. I
> have the gut feeling that we'll want to keep about 512kB
> or more free in the lower 16MB of busy machines...

Note that as I point out in my documentation, and as Alan
also points out, 2.2 is doing fine. The 2.2 code does not
guarantee dma-zone balancing even if it is empty (if there
is enough regular free pages). Which means all dma requests
will fail. I have tried to fix that, since with HIGHMEM, 
the problem is actually more aggravated.

My aim is to fix a couple of problems, move to a zone based
balancing, and then maybe finetune it. For example, the 
>> 7 part can be replaced with >> N, where N is dependent 
on the zone type, or size of lower zones, etc. I mention this
in the doc too. The only problem is, if N < 7, you will probably
have degraded performance in certain cases due to more frequent
balancing.

> 
> (if only because such a large amount of free pages in
> such a small part of the address space will give us
> higher-order free pages)

I note that Andrea also commented about this. I am also
of the same opinion as him, we should not (as far as possible)
try to intermingle unrelated issues. In this case though,
I have no idea how having a large number of free dma pages
ensures more higher-order free pages. Can someone give me
the logic for this claim?

> 
> > --- mm/vmscan.c	Tue Jan 11 11:00:31 2000
> > +++ mm/vmscan.c	Tue Jan 11 23:29:41 2000
> > @@ -534,8 +534,11 @@
> >  	int retval = 1;
> >  
> >  	wake_up_process(kswapd_process);
> > -	if (gfp_mask & __GFP_WAIT)
> > +	if (gfp_mask & __GFP_WAIT) {
> > +		current->flags |= PF_MEMALLOC;
> >  		retval = do_try_to_free_pages(gfp_mask, zone);
> > +		current->flags &= ~PF_MEMALLOC;
> > +	}
> >  	return retval;
> >  }
> 
> Please note that kswapd still exits when the total number
> of free pages in the system is high enough. Balancing can
> probably better be done in the background by kswapd than
> by applications that happen to stumble across a nonbalanced
> zone...

Yes, we need to decide whether kswapd needs modification too. It's
just that I want to do incremental fixes, instead of change a 
huge bunch of code all at once. The question is, if I had a Linux
2.3 kernel, where I had completely deleted kswapd(), what problems 
would the kernel face? Ie, what is kswapd()'s purpose?

Kanoj

> 
> regards,
> 
> Rik
> --
> The Internet is not a network of computers. It is a network
> of people. That is its real strength.
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 18:30   ` Kanoj Sarcar
@ 2000-01-13 19:22     ` Andrea Arcangeli
  0 siblings, 0 replies; 40+ messages in thread
From: Andrea Arcangeli @ 2000-01-13 19:22 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Linus Torvalds, Alan Cox, linux-mm, linux-kernel

On Thu, 13 Jan 2000, Kanoj Sarcar wrote:

>[..] For example, I was 
>looking at replace_with_highmem() which makes __GFP_HIGHMEM|__GFP_HIGH
>requests, although I _think_ it can do __GFP_WAIT|__GFP_IO without
>any problems. I just assumed that whoever coded it (you/Mingo?) had

I coded it.

>some logic, like not wanting to waste time scanning for stealable pages
>or incur disk swap to implement this performance optimization (that
>would defeat the optimization).

replace_with_highmem is a _memory-usage_ optimization (not a performance
optimization).

The reason of my GFP_ATOMIC choice is that I don't want to steal pages at
all. Not to go faster but because if the system is just low in memory it
means the high memory has been used completely as well so there's no point
in trying to put the anonymous data into highmem in such case.

Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 18:52   ` Kanoj Sarcar
@ 2000-01-13 19:59     ` Andrea Arcangeli
  2000-01-13 21:02       ` Kanoj Sarcar
  0 siblings, 1 reply; 40+ messages in thread
From: Andrea Arcangeli @ 2000-01-13 19:59 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Rik van Riel, torvalds, mingo, alan, linux-mm, linux-kernel

On Thu, 13 Jan 2000, Kanoj Sarcar wrote:

>Note that as I point out in my documentation, and as Alan
>also points out, 2.2 is doing fine. The 2.2 code does not
>guarantee dma-zone balancing even if it is empty (if there
>is enough regular free pages). Which means all dma requests
>will fail. I have tried to fix that, since with HIGHMEM, 
>the problem is actually more aggravated.

It's not more aggravated. You fallback in the ISA-DMA zone in the same way
as before.

>I have no idea how having a large number of free dma pages
>ensures more higher-order free pages. Can someone give me
>the logic for this claim?

Probability.

Suppose you have 100mbyte of physical memory. Suppose all 100mbyte are
free. Suppose you want to do a 100mbyte allocation of physically contiguous
memory. You'll succeed.

If you have 100mbyte of memory and only half of memory is free. You may
not succeed in allocating 50mbyte of contiguous memory. So the more memory
is free, the more probability you have to succeed in allocating a large
chunk of physically contiguous memory.

>Yes, we need to decide whether kswapd needs modification too. Its
>just that I want to do incremental fixes, instead of change a 
>huge bunch of code all at once. The question is, if I had a Linux
>2.3 kernel, where I had completely deleted kswapd(), what problems 
>would the kernel face? Ie, what is kswapd()'s purpose?

I had a pre-2.2.x kernel without kswapd too :). You need kswapd for
machines where no process ever runs and the only thing that runs are
interrupts and bh handlers (e.g. a router).

Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 18:37     ` Rik van Riel
@ 2000-01-13 20:13       ` Andrea Arcangeli
  2000-01-13 21:12         ` Rik van Riel
  2000-01-13 21:40         ` Kanoj Sarcar
  0 siblings, 2 replies; 40+ messages in thread
From: Andrea Arcangeli @ 2000-01-13 20:13 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Alan Cox, Kanoj Sarcar, Linus Torvalds, linux-mm, linux-kernel

On Thu, 13 Jan 2000, Rik van Riel wrote:

>Now we'll only want to build something into kswapd
>so that rebalancing the high memory zones is done in
>the background.

You never need to rebalance the bigmem between 1g and 64g within kswapd.
This because bh/irq handlers are not going to use it. So kswapd has to
care only about the memory below the bigmem boundary.

BTW I just noticed currently (2.3.40pre1) kswapd is completely
screwed up. kswapd should still do:

		while (nr_free_pages - nr_free_bigpages < freepages.high)

exactly like in our early 2.3.18 bigmem code because _nothing_ has changed
in the basic MM design since that time.

The fix against 2.3.40pre1 to re-activate kswapd is this:

--- 2.3.40pre1/mm/vmscan.c	Sun Jan  9 20:45:31 2000
+++ /tmp/vmscan.c	Thu Jan 13 21:09:33 2000
@@ -503,7 +503,7 @@
 		do {
 			/* kswapd is critical to provide GFP_ATOMIC
 			   allocations (not GFP_HIGHMEM ones). */
-			if (nr_free_buffer_pages() >= freepages.high)
+			if (nr_free_pages() - nr_free_highpages() >= freepages.high)
 				break;
 			if (!do_try_to_free_pages(GFP_KSWAPD, 0))
 				break;


Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 19:59     ` Andrea Arcangeli
@ 2000-01-13 21:02       ` Kanoj Sarcar
  2000-01-13 21:34         ` Benjamin C.R. LaHaise
  2000-01-13 21:42         ` Alan Cox
  0 siblings, 2 replies; 40+ messages in thread
From: Kanoj Sarcar @ 2000-01-13 21:02 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Rik van Riel, torvalds, mingo, alan, linux-mm, linux-kernel

> 
> On Thu, 13 Jan 2000, Kanoj Sarcar wrote:
> 
> >Note that as I point out in my documentation, and as Alan
> >also points out, 2.2 is doing fine. The 2.2 code does not
> >guarantee dma-zone balancing even if it is empty (if there
> >is enough regular free pages). Which means all dma requests
> >will fail. I have tried to fix that, since with HIGHMEM, 
> >the problem is actually more aggravated.
> 
> It's not more aggravated. You fallback in the ISA-DMA zone in the same way
> as before.
>

No, I am referring to a different problem that I mentioned in the
doc. If you have a large number of free regular pages, and the dma
zone is completely exhausted, the 2.2 decision of balancing the dma
zone might never fetch an "yes" answer, because it is based on total
number of free pages, not also the per zone free pages. Right? Things 
will get worse the more non-dma pages there are.


> >I have no idea how having a large number of free dma pages
> >ensures more higher-order free pages. Can someone give me
> >the logic for this claim?
> 
> Probability.
> 
> Suppose you have 100mbyte of physical memory. Suppose all 100mbyte are
> free. Suppose you want to do a 100mbyte allocation of physically contigous
> memory. You'll succeed.
> 
> If you have 100mbyte of memory and only half of memory is free. You may
> not succeed in allocating 50mbyte of contiguous memory. So the more memory
> is free, the more probability you have to succeed in allocating a large
> chunk of physically contigous memory.
> 

Oh, okay I see. There is nothing about the dma zone then, you could 
make the balancing more aggressive for the other zones too. Basically,
these kinds of tuning should be controlled by sysctls (instead of 
>>7, do >> N), so while most sites will prefer to run with the least
aggressive balancing, there may be sites with drivers that need 
many high-order pages, they would be willing to sacrifice some 
performance by doing more aggressive balancing. Comes under finetuning 
in the doc.

> >Yes, we need to decide whether kswapd needs modification too. Its
> >just that I want to do incremental fixes, instead of change a 
> >huge bunch of code all at once. The question is, if I had a Linux
> >2.3 kernel, where I had completely deleted kswapd(), what problems 
> >would the kernel face? Ie, what is kswapd()'s purpose?
> 
> I had a pre-2.2.x kernel without kswapd too :). You need kswapd for
> machines where noone process ever run and the only thing that runs are
> interrupts and bh handlers (e.g. a router).
> 

Oh yes, I was forgetting, that is the reason you need an independent
memory freer in any os. It shouldn't be too hard to teach kswapd about
zones.

Kanoj

> Andrea
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.nl.linux.org/Linux-MM/
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 20:13       ` Andrea Arcangeli
@ 2000-01-13 21:12         ` Rik van Riel
  2000-01-13 21:40         ` Kanoj Sarcar
  1 sibling, 0 replies; 40+ messages in thread
From: Rik van Riel @ 2000-01-13 21:12 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Alan Cox, Kanoj Sarcar, Linus Torvalds, linux-mm, linux-kernel

On Thu, 13 Jan 2000, Andrea Arcangeli wrote:

> --- 2.3.40pre1/mm/vmscan.c	Sun Jan  9 20:45:31 2000
> +++ /tmp/vmscan.c	Thu Jan 13 21:09:33 2000
> @@ -503,7 +503,7 @@
>  		do {
>  			/* kswapd is critical to provide GFP_ATOMIC
>  			   allocations (not GFP_HIGHMEM ones). */
> -			if (nr_free_buffer_pages() >= freepages.high)
> +			if (nr_free_pages() - nr_free_highpages() >= freepages.high)
>  				break;
>  			if (!do_try_to_free_pages(GFP_KSWAPD, 0))
>  				break;

Indeed. Linus, please apply this patch...

Btw, shouldn't we make do_try_to_free_pages() a bit smarter
so that it doesn't free high memory pages when there are
enough free pages in that part of memory.

regards,

Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 21:02       ` Kanoj Sarcar
@ 2000-01-13 21:34         ` Benjamin C.R. LaHaise
  2000-01-13 21:48           ` Kanoj Sarcar
  2000-01-13 21:42         ` Alan Cox
  1 sibling, 1 reply; 40+ messages in thread
From: Benjamin C.R. LaHaise @ 2000-01-13 21:34 UTC (permalink / raw)
  To: Kanoj Sarcar
  Cc: Andrea Arcangeli, Rik van Riel, torvalds, mingo, alan, linux-mm,
	linux-kernel

On Thu, 13 Jan 2000, Kanoj Sarcar wrote:

> No, I am referring to a different problem that I mentioned in the
> doc. If you have a large number of free regular pages, and the dma
> zone is completely exhausted, the 2.2 decision of balacing the dma
> zone might never fetch an "yes" answer, because it is based on total
> number of free pages, not also the per zone free pages. Right? Things 
> will get worse the more non-dma pages there are.

Kanoj, you're wrong.  2.2 works quite well because of the fact that the
low memory mark will tend to consist almost entirely of DMAable pages.
The only allocations that regularly eat into them on a loaded machine are
interrupts, which tend to be short term allocations anyways.  But as soon
as DMAable memory is freed, it tends not to be allocated until interrupts
consume all memory again.

> Oh, okay I see. There is nothing about the dma zone then, you could 
> make the balancing more aggressive for the other zones too. Basically,
> these kinds of tuning should be controlled by sysctls (instead of 
> >>7, do >> N), so while most sites will prefer to run with the least
> aggressive balancing, there may be sites with drivers that need 
> many high-order pages, they would be willing to sacrifice some 
> performance by doing more aggressive balancing. Comes under finetuning 
> in the doc.

Whoa, hold on here.  Last time we tried to do more aggressive balancing, it
was a complete and total disaster that resulted in completely random swap
storms, resulting in spectacularly unusable systems on the lower end
(iirc 64mb was around the breakeven point).  Before harder limits are
placed on memory types and orders, the behaviour of both kswapd and the
allocator needs to be tweaked.  So put in the mechanism, but don't start
enforcing it too aggressively.

		-ben

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 20:13       ` Andrea Arcangeli
  2000-01-13 21:12         ` Rik van Riel
@ 2000-01-13 21:40         ` Kanoj Sarcar
  2000-01-14 12:25           ` Jamie Lokier
  1 sibling, 1 reply; 40+ messages in thread
From: Kanoj Sarcar @ 2000-01-13 21:40 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Rik van Riel, Alan Cox, Linus Torvalds, linux-mm, linux-kernel

> 
> On Thu, 13 Jan 2000, Rik van Riel wrote:
> 
> >Now we'll only want to build something into kswapd
> >so that rebalancing the high memory zones is done in
> >the background.
> 
> You never need to rebance the bigmem between 1g and 64g withing kswapd.
> This because bh/irq handlers are not going to use it. So kswapd has to
> care only about the memory below the bigmem boundary.
>

Ohh, then there's another problem. Note that try_to_swap_out() currently
does
        if (PageReserved(page)
            || PageLocked(page)
            || (zone && (!memclass(page->zone, zone))))
                goto out_failed;

kswapd passes in a zone = 0 argument. 

This (and all similar places) will need to be changed to
        if (PageReserved(page)
            || PageLocked(page)
            || (zone && (!memclass(page->zone, zone)))
            || ((zone == 0) && PageHighMem(page)))

Either that, or we should teach kswapd about zones, then kswapd can
pass in the zone pointer it is trying to get balanced. For 2.3, kswapd
will not pass in the highmem zone pointer. That would also mean that
Andrea's patch below will not be needed.

I will create a zone-aware kswapd patch, built on top of the one I
already put out, and send that out asap.

Kanoj
 
> BTW I just noticed currently (2.3.40pre1) kswapd is completly
> screwedup. kswapd should still do:
> 
> 		while (nr_free_pages - nr_free_bigpages < freepages.high)
> 
> exactly like in our early 2.3.18 bigmem code because _nothing_ is changed
> is the basic MM design since that time.
> 
> The fix against 2.3.40pre1 to re-activate kswapd is this:
> 
> --- 2.3.40pre1/mm/vmscan.c	Sun Jan  9 20:45:31 2000
> +++ /tmp/vmscan.c	Thu Jan 13 21:09:33 2000
> @@ -503,7 +503,7 @@
>  		do {
>  			/* kswapd is critical to provide GFP_ATOMIC
>  			   allocations (not GFP_HIGHMEM ones). */
> -			if (nr_free_buffer_pages() >= freepages.high)
> +			if (nr_free_pages() - nr_free_highpages() >= freepages.high)
>  				break;
>  			if (!do_try_to_free_pages(GFP_KSWAPD, 0))
>  				break;
> 
> 
> Andrea
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 21:02       ` Kanoj Sarcar
  2000-01-13 21:34         ` Benjamin C.R. LaHaise
@ 2000-01-13 21:42         ` Alan Cox
  2000-01-13 21:50           ` Kanoj Sarcar
  2000-01-13 22:01           ` Linus Torvalds
  1 sibling, 2 replies; 40+ messages in thread
From: Alan Cox @ 2000-01-13 21:42 UTC (permalink / raw)
  To: Kanoj Sarcar
  Cc: Andrea Arcangeli, Rik van Riel, torvalds, mingo, alan, linux-mm,
	linux-kernel

> doc. If you have a large number of free regular pages, and the dma
> zone is completely exhausted, the 2.2 decision of balacing the dma
> zone might never fetch an "yes" answer, because it is based on total
> number of free pages, not also the per zone free pages. Right? Things 
> will get worse the more non-dma pages there are.

We might not make good choices to free ISA DMA pages, you are correct yes

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 21:34         ` Benjamin C.R. LaHaise
@ 2000-01-13 21:48           ` Kanoj Sarcar
  0 siblings, 0 replies; 40+ messages in thread
From: Kanoj Sarcar @ 2000-01-13 21:48 UTC (permalink / raw)
  To: Benjamin C.R. LaHaise
  Cc: Andrea Arcangeli, Rik van Riel, torvalds, mingo, alan, linux-mm,
	linux-kernel

> 
> On Thu, 13 Jan 2000, Kanoj Sarcar wrote:
> 
> > No, I am referring to a different problem that I mentioned in the
> > doc. If you have a large number of free regular pages, and the dma
> > zone is completely exhausted, the 2.2 decision of balacing the dma
> > zone might never fetch an "yes" answer, because it is based on total
> > number of free pages, not also the per zone free pages. Right? Things 
> > will get worse the more non-dma pages there are.
> 
> Kanoj, you're wrong.  2.2 works quite well because of the fact that the
> low memory mark will tend to consist almost entirely of DMAable pages.
> The only allocations that regularly eat into them on a loaded machine are
> interrupts, which tend to be short term allocations anyways.  But as soon
> as DMAable memory is freed, it tends not to be allocated until interrupts
> consume all memory again.

Okay, you are telling me what _mostly_ happens, the problem I have pointed
out is one that can _probably_ happen under the right conditions of
temperature and pressure. It's a good idea to design against boundary 
conditions, and then improve the design ...

> 
> > Oh, okay I see. There is nothing about the dma zone then, you could 
> > make the balancing more aggressive for the other zones too. Basically,
> > these kinds of tuning should be controlled by sysctls (instead of 
> > >>7, do >> N), so while most sites will prefer to run with the least
> > aggressive balancing, there may be sites with drivers that need 
> > many high-order pages, they would be willing to sacrifice some 
> > performance by doing more aggressive balancing. Comes under finetuning 
> > in the doc.
> 
> Whoa, hold on here.  Last time we tried to do more aggresive balancing, it
> was a complete and total disaster that resulted in completely random swap
> storms, resulting in spectacularly unusable systems on the lower end
> (iirc 64mb was around the breakeven point).  Before harder limits are
> placed on memory types and orders, the behaviour of both kswapd and the
> allocator need to be tweaked.  so put in the mechanism, but don't start
> enforcing it too aggresively.

Absolutely. I am _not_ suggesting doing anything much different than 
in 2.2. All I am saying is that we can provide sysctls (with their 
default values to mimic 2.2 behavior), then individual developers
can tweak those and do performance experiments. 

Kanoj

> 
> 		-ben
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.nl.linux.org/Linux-MM/
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 21:42         ` Alan Cox
@ 2000-01-13 21:50           ` Kanoj Sarcar
  2000-01-13 21:53             ` Alan Cox
  2000-01-13 22:01           ` Linus Torvalds
  1 sibling, 1 reply; 40+ messages in thread
From: Kanoj Sarcar @ 2000-01-13 21:50 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andrea Arcangeli, Rik van Riel, torvalds, mingo, linux-mm, linux-kernel

> 
> > doc. If you have a large number of free regular pages, and the dma
> > zone is completely exhausted, the 2.2 decision of balacing the dma
> > zone might never fetch an "yes" answer, because it is based on total
> > number of free pages, not also the per zone free pages. Right? Things 
> > will get worse the more non-dma pages there are.
> 
> We might not make good choices to free ISA DMA pages, you are correct yes

And given a huge enough HIGHMEM zone, we might not make good choices to free
regular memory too, right?

My patch would fix this problem. I am going to make the patch bigger to
fix kswapd too, then put it out in the next few hours.

Kanoj

> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.nl.linux.org/Linux-MM/
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 21:50           ` Kanoj Sarcar
@ 2000-01-13 21:53             ` Alan Cox
  0 siblings, 0 replies; 40+ messages in thread
From: Alan Cox @ 2000-01-13 21:53 UTC (permalink / raw)
  To: Kanoj Sarcar
  Cc: Alan Cox, Andrea Arcangeli, Rik van Riel, torvalds, mingo,
	linux-mm, linux-kernel

> > We might not make good choices to free ISA DMA pages, you are correct yes
> 
> And given a huge enough HIGHMEM zone, we might not make good choices to free
> regular memory too, right?

I've got no empirical evidence from 2.2.x that the theoretical case occurs.

Alan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 21:42         ` Alan Cox
  2000-01-13 21:50           ` Kanoj Sarcar
@ 2000-01-13 22:01           ` Linus Torvalds
  2000-01-13 22:13             ` Kanoj Sarcar
  1 sibling, 1 reply; 40+ messages in thread
From: Linus Torvalds @ 2000-01-13 22:01 UTC (permalink / raw)
  To: Alan Cox
  Cc: Kanoj Sarcar, Andrea Arcangeli, Rik van Riel, mingo, linux-mm,
	linux-kernel

On Thu, 13 Jan 2000, Alan Cox wrote:
>
> > doc. If you have a large number of free regular pages, and the dma
> > zone is completely exhausted, the 2.2 decision of balacing the dma
> > zone might never fetch an "yes" answer, because it is based on total
> > number of free pages, not also the per zone free pages. Right? Things 
> > will get worse the more non-dma pages there are.
> 
> We might not make good choices to free ISA DMA pages, you are correct yes

What I think needs to happen is something like
 - global page table aging logic (it would be surreal to try to age the
   page table entries on a per-zone basis, because page tables do not have
   zones)
 - per-zone page freeing logic

Right now we do neither. Our page table aging thing (swap_out()) looks at
the zone (which I don't think it should), while our shrink_mmap() is often
completely zone-unaware (ie kswapd uses a NULL zone).

The reason swap_out() looks at the zone is that a long time ago the logic
was that you should avoid swapping normal pages out if you really only
needed DMA pages. I think that logic is broken in the larger picture (when
there are multiple kinds of zones), and is unnecessary even in the old
sense, because these days the swap cache works just fine for us, and
should make the impact of "wrong zone" swapouts be insignificant.

So, I'd like somebody to _try_ to (a) rip out the zone-awareness from
swap_out() completely and (b) make kswapd do something more like

	more_work = 0;
	for (i = 0; i < NR_ZONES; i++) {
		more_work |= balance_zone(zone+i)
	}		
	if (!more_work)
		sleep()

where "balance_zone()" would really be a per-zone "shrink_mmap()" with the
free page logic taken into account.

Sounds reasonable?

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 22:01           ` Linus Torvalds
@ 2000-01-13 22:13             ` Kanoj Sarcar
  2000-01-13 22:28               ` Rik van Riel
  2000-01-13 22:30               ` Linus Torvalds
  0 siblings, 2 replies; 40+ messages in thread
From: Kanoj Sarcar @ 2000-01-13 22:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Andrea Arcangeli, Rik van Riel, mingo, linux-mm, linux-kernel

> 
> 
> On Thu, 13 Jan 2000, Alan Cox wrote:
> >
> > > doc. If you have a large number of free regular pages, and the dma
> > > zone is completely exhausted, the 2.2 decision of balacing the dma
> > > zone might never fetch an "yes" answer, because it is based on total
> > > number of free pages, not also the per zone free pages. Right? Things 
> > > will get worse the more non-dma pages there are.
> > 
> > We might not make good choices to free ISA DMA pages, you are correct yes
> 
> What I think needs to happen is something like
>  - global page table aging logic (it would be surreal to try to age the
>    page table entries on a per-zone basis, because page tables do not have
>    zones)
>  - per-zone page freeing logic
> 
> Right now we do neither. Out page table aging thing (swap_out()) looks at
> the zone (which I don't think it should), while our shrink_mmap() is often
> completely zone-unaware (ie kswapd uses a NULL zone).
> 
> The reason swap_out() looks at the zone is that a long time ago the logic
> was that you should avoid swapping normal pages out if you really only
> needed DMA pages. I think that logic is broken in the larger picture (when
> there are multiple kinds of zones), and is unnecessary even in the old
> sense, because these days the swap cache works just fine for us, and
> should make the impact of "wrong zone" swapouts be insignificant.

Okay, no big code change there. try_to_swap_out() can probably still accept 
a "zone" argument, like it currently does, except that it does not need
to use it. I assume this would also hold for shm_swap()?

> 
> So, I'd like somebody to _try_ to (a) rip out the zone-awareness from
> swap_out() completely and (b) make kswapd do something more like
> 
> 	more_work = 0;
> 	for (i = 0; i < NR_ZONES; i++) {
> 		more_work |= balance_zone(zone+i)
> 	}		
> 	if (!more_work)
> 		sleep()
> 
> where "balance_zone()" would really be a per-zone "shrink_mmap()" with the
> free page logic taken into account.
>

Yes, that's what everyone seems to be pointing at. As I mentioned, I am
looking into this as I type. The only thing is, as Andrea points out, 
2.3 bh/irq handlers do not request HIGHMEM pages, so shouldn't the
2.3 kswapd do something more like: 

       more_work = 0;
       for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i != ZONE_HIGHMEM)
               		more_work |= balance_zone(zone+i)
       }
       if (!more_work)
               sleep()


Kanoj

> Sounds reasonable?
> 
> 		Linus
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.nl.linux.org/Linux-MM/
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 22:13             ` Kanoj Sarcar
@ 2000-01-13 22:28               ` Rik van Riel
  2000-01-13 22:30               ` Linus Torvalds
  1 sibling, 0 replies; 40+ messages in thread
From: Rik van Riel @ 2000-01-13 22:28 UTC (permalink / raw)
  To: Kanoj Sarcar
  Cc: Linus Torvalds, Alan Cox, Andrea Arcangeli, mingo, linux-mm,
	linux-kernel

On Thu, 13 Jan 2000, Kanoj Sarcar wrote:

	[snip Linus' winning idea]

> Yes, that's what everyone seems to be pointing at. As I mentioned, I am
> looking into this as I type. The only thing is, as Andrea points out, 
> 2.3 bh/irq handlers do not request HIGHMEM pages, so shouldn't the
> 2.3 kswapd do something more like: 
> 
>        more_work = 0;
>        for (i = 0; i < MAX_NR_ZONES; i++) {
> 		if (i != ZONE_HIGHMEM)
>                		more_work |= balance_zone(zone+i)
>        }
>        if (!more_work)
>                sleep()

Nope. We want to do page aging and reclamation in ZONE_HIGHMEM
too, otherwise all memory `rotation' is going to happen in the
other zones and the system can thrash in the remaining 1G of
memory while there's 3G of unused data in ZONE_HIGHMEM...

But I agree, we probably don't have to reclaim that many pages
in ZONE_HIGHMEM, something like freepages.min should be enough.

regards,

Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 22:13             ` Kanoj Sarcar
  2000-01-13 22:28               ` Rik van Riel
@ 2000-01-13 22:30               ` Linus Torvalds
  2000-01-13 23:53                 ` Ingo Molnar
  2000-01-14  0:28                 ` [RFC] 2.3.39 zone balancing Andrea Arcangeli
  1 sibling, 2 replies; 40+ messages in thread
From: Linus Torvalds @ 2000-01-13 22:30 UTC (permalink / raw)
  To: Kanoj Sarcar
  Cc: Alan Cox, Andrea Arcangeli, Rik van Riel, mingo, linux-mm, linux-kernel


On Thu, 13 Jan 2000, Kanoj Sarcar wrote:
> 
> Yes, that's what everyone seems to be pointing at. As I mentioned, I am
> looking into this as I type. The only thing is, as Andrea points out, 
> 2.3 bh/irq handlers do not request HIGHMEM pages, so shouldn't the
> 2.3 kswapd do something more like: 
> 
>        more_work = 0;
>        for (i = 0; i < MAX_NR_ZONES; i++) {
> 		if (i != ZONE_HIGHMEM)
>                		more_work |= balance_zone(zone+i)

No, the other reason for kswapd is to get "smoother" behaviour, by trying
to keep some memory free. Also, while we don't use high-memory pages right
now in BH and irq contexts, I don't think that is something we need to
codify, and it may change in the future. There's no real reason per se for
not using them (except for complexity), so I'd hate to have a special case
for that case.

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 23:53                 ` Ingo Molnar
@ 2000-01-13 23:29                   ` Linus Torvalds
  2000-01-14  0:33                     ` Andrea Arcangeli
  2000-01-15  2:03                     ` Reworked 2.3.39 zone balancing - v1 Kanoj Sarcar
  0 siblings, 2 replies; 40+ messages in thread
From: Linus Torvalds @ 2000-01-13 23:29 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Kanoj Sarcar, Alan Cox, Andrea Arcangeli, Rik van Riel, linux-mm,
	linux-kernel

On Fri, 14 Jan 2000, Ingo Molnar wrote:
> 
> so why cant swap_out (conceptually) accept a 'zones under pressure'
> bitmask as an input, and calculate zones from the physical address it sees
> in the page table.

Because swap_out() is going to look at the page tables _anyway_.

Basically, my argument is that there is no way "swap_out()" can really
target any special zone, except by avoiding to do the final stage in a
long sequence of stages that it has already done. I think that's just
completely wasteful - doing all the work, and then at the last minute
deciding to not use the work after all. Especially as we don't really have
any good reason to believe that it's the right thing in the first place.

I suspect we're much better off just having a simple "age the page tables"
thing that doesn't care about zones at all, and when a page table entry
has been aged enough, it gets pushed into the page/swap cache. It's
reasonably cheap to fault it in again, and because we use aging on the
page tables we've selected a page that isn't supposed to be very active
anyway.

So that's why I think the page table walker should be completely
zone-blind, and just not care. It's likely to be more "balanced" that way
anyway.

The "shrink_mmap()" stage is another matter entirely. shrink_mmap() has
complete control over which zone it looks at, and can do a good (perfect)
job of balancing the amount of work it does to how much it wants to
accomplish.

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 22:30               ` Linus Torvalds
@ 2000-01-13 23:53                 ` Ingo Molnar
  2000-01-13 23:29                   ` Linus Torvalds
  2000-01-14  0:28                 ` [RFC] 2.3.39 zone balancing Andrea Arcangeli
  1 sibling, 1 reply; 40+ messages in thread
From: Ingo Molnar @ 2000-01-13 23:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Kanoj Sarcar, Alan Cox, Andrea Arcangeli, Rik van Riel, linux-mm,
	linux-kernel

On Thu, 13 Jan 2000, Linus Torvalds wrote:

> >        more_work = 0;
> >        for (i = 0; i < MAX_NR_ZONES; i++) {
> > 		if (i != ZONE_HIGHMEM)
> >                		more_work |= balance_zone(zone+i)
> 
> No, the other reason for kswapd is to get "smoother" behaviour, by trying
> to keep some memory free. Also, while we don't use high-memory pages right
> now in BH and irq contexts, I don't think that is something we need to
> codify, and it may change in the future. There's no real reason per se for
> not using them (except for complexity), so I'd hate to have a special case
> for that case.

one more thing, i think there is a real possibility for the following
scenario to happen: well used server, pagecache takes up all the RAM, as
it should. Application just happens to run out of free RAM and we allocate
from the DMA zone. Then the application happens to use these DMA pages
heavily, so those pages become unlikely to get freed. Ie. kswapd
will feel the memory pressure in the DMA zone, without being able to help
the situation. Just running kswapd for a long time will not help the
situation, because the DMA pages are highly used.

so why can't swap_out (conceptually) accept a 'zones under pressure'
bitmask as an input, and calculate zones from the physical address it sees
in the page table. Some per-architecture thing like:

	static inline pte_in_zonemask (pte, unsigned long mask)
	{
		idx = pte_to_pagenr(pte);

		/*
		 * Pages are more likely to be in the highest zone
		 */
		for (i = ZONE_MAX-1; i--; ) {
			struct zone_t *zone = zones + i;

			if (zone->offset < idx)
				return (1 << (zone-zones)) & mask;
		}
	}

since ZONE_MAX is 2 or 3 typically, this will likely be unrolled. It's not
going to be as fast as now, but it's simple nevertheless. (and swapping
out is never fast in the first place)

so if kswapd generated a memory pressure 'zone bitmask' instead of a
single zone (single zone is definitely broken), then we could solve such
situations as well. This is at the price of kswapd looping through
pagetables, but i think we should be ready to pay this price for
predictability. Only GFP_DMA16 will pay such price, GFP_NORMAL is likely
to succeed in typical systems. Once highmem_pages/normal_pages is getting
larger, this cost goes up as well.

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 22:30               ` Linus Torvalds
  2000-01-13 23:53                 ` Ingo Molnar
@ 2000-01-14  0:28                 ` Andrea Arcangeli
  1 sibling, 0 replies; 40+ messages in thread
From: Andrea Arcangeli @ 2000-01-14  0:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Kanoj Sarcar, Alan Cox, Rik van Riel, mingo, linux-mm, linux-kernel

On Thu, 13 Jan 2000, Linus Torvalds wrote:

>to keep some memory free. Also, while we don't use high-memory pages right
>now in BH and irq contexts, I don't think that is something we need to
>codify, and it may change in the future. There's no real reason per se for

Yes, it will change on 64bit platforms.

>not using them (except for complexity), so I'd hate to have a special case
>for that case.

With the current code the special case is necessary but a rewrite should
be able to get rid of it cleanly. Anyway actually adding the number of
freeable pages to the free pages when checking the watermark is completely
buggy (this has nothing to do with the special case).

Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 23:29                   ` Linus Torvalds
@ 2000-01-14  0:33                     ` Andrea Arcangeli
  2000-01-14  0:52                       ` Linus Torvalds
  2000-01-14  1:13                       ` Kanoj Sarcar
  2000-01-15  2:03                     ` Reworked 2.3.39 zone balancing - v1 Kanoj Sarcar
  1 sibling, 2 replies; 40+ messages in thread
From: Andrea Arcangeli @ 2000-01-14  0:33 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Kanoj Sarcar, Alan Cox, Rik van Riel, linux-mm,
	linux-kernel

On Thu, 13 Jan 2000, Linus Torvalds wrote:

>Basically, my argument is that there is no way "swap_out()" can really
>target any special zone, except by avoiding to do the final stage in a
>long sequence of stages that it has already done. I think that's just
>completely wasteful - doing all the work, and then at the last minute
>deciding to not use the work after all. Especially as we don't really have
>any good reason to believe that it's the right thing in the first place.

The only problem with what you are suggesting is that you may end up
swapping out the wrong pages too. Suppose you want to allocate 4k of DMA
memory. Why should the machine swap out lots of mbytes of data when it
could swap out only 4k? And after each swapout we have to restart from the
vma because to swapout we have to drop the pagetable lock and so the
mappings can be changed from under us.

>So that's why I think the page table walker should be completely
>zone-blind, and just not care. It's likely to be more "balanced" that way
>anyway.

The swapout will definitely be more balanced, but we may end up doing
unnecessary swapouts.

Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-14  0:33                     ` Andrea Arcangeli
@ 2000-01-14  0:52                       ` Linus Torvalds
  2000-01-14  1:08                         ` Rik van Riel
  2000-01-14  2:13                         ` Ingo Molnar
  2000-01-14  1:13                       ` Kanoj Sarcar
  1 sibling, 2 replies; 40+ messages in thread
From: Linus Torvalds @ 2000-01-14  0:52 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Ingo Molnar, Kanoj Sarcar, Alan Cox, Rik van Riel, linux-mm,
	linux-kernel

On Fri, 14 Jan 2000, Andrea Arcangeli wrote:
> 
> The only problem in what you are suggesting is that you may end swapping
> out also the wrong pages. Suppose you want to allocate 4k of DMA
> memory.

I agree.

HOWEVER, I don't think this is going to be a huge issue in most cases. And
if people don't need non-DMA memory, then the pages we "swapped" out are
going to stay in RAM anyway, so it's not going to hurt us.

Anyway, I obviously do agree that I may well be wrong, and that real life
is going to come back and bite us, and we'll end up having to not do it
this way. However, I'd prefer trying the "conceptually simple" path first,
and only if it turns out that yes, I was completely wrong, do we try to
fix it up with magic heuristics etc.

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-14  0:52                       ` Linus Torvalds
@ 2000-01-14  1:08                         ` Rik van Riel
  2000-01-14  2:13                         ` Ingo Molnar
  1 sibling, 0 replies; 40+ messages in thread
From: Rik van Riel @ 2000-01-14  1:08 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrea Arcangeli, Ingo Molnar, Kanoj Sarcar, Alan Cox, linux-mm,
	linux-kernel

On Thu, 13 Jan 2000, Linus Torvalds wrote:
> On Fri, 14 Jan 2000, Andrea Arcangeli wrote:
> > 
> > The only problem in what you are suggesting is that you may end swapping
> > out also the wrong pages. Suppose you want to allocate 4k of DMA
> > memory.
> 
> I agree.
> 
> HOWEVER, I don't think this is going to be a huge issue in most cases. And
> if people don't need non-DMA memory, then the pages we "swapped" out are
> going to stay in RAM anyway, so it's not going to hurt us.

If the page is not dirtied after we swapped it out last time,
it won't matter one bit for performance. If the page is dirtied
continuously we won't swap out that page either.

All the swap-everything, free later thing means is that we'll
incur a little extra I/O (in the background) and that it might
be easier/faster to free pages in the foreground, when we really
need them.

> Anyway, I obviously do agree that I may well be wrong, and that
> real life is going to come back and bite us, and we'll end up
> having to not do it this way. However, I'd prefer trying the
> "conceptually simple" path first, and only if it turns out that
> yes, I was completely wrong, do we try to fix it up with magic
> heuristics etc.

I don't think there will be that many side effects, except
perhaps a bit higher swap usage...

Of course, under certain workloads the two different tactics
will make a difference, but that can swing either way.

regards,

Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-14  0:33                     ` Andrea Arcangeli
  2000-01-14  0:52                       ` Linus Torvalds
@ 2000-01-14  1:13                       ` Kanoj Sarcar
  2000-01-14  2:27                         ` Ingo Molnar
  2000-01-14  2:46                         ` Ingo Molnar
  1 sibling, 2 replies; 40+ messages in thread
From: Kanoj Sarcar @ 2000-01-14  1:13 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Linus Torvalds, Ingo Molnar, Alan Cox, Rik van Riel, linux-mm,
	linux-kernel

> 
> On Thu, 13 Jan 2000, Linus Torvalds wrote:
> 
> >Basically, my argument is that there is no way "swap_out()" can really
> >target any special zone, except by avoiding to do the final stage in a
> >long sequence of stages that it has already done. I think that's just
> >completely wasteful - doing all the work, and then at the last minute
> >deciding to not use the work after all. Especially as we don't really have
> >any good reason to believe that it's the right thing in the first place.
> 
> The only problem in what you are suggesting is that you may end swapping
> out also the wrong pages. Suppose you want to allocate 4k of DMA
> memory. Why should the machine swapout lots of mbytes of data while it
> could only swapout 4k? And after each swapout we have to restart from the
> vma because to swapout we have to drop the pagetable lock and so the
> mappings can be changed from under us.

Yes, I am worried about this a little too. Especially when you are
hunting for a dma page, and the machine happens to have gigs of
highmem and regular pages, chances are that you will end up stealing
a lot of pages unnecessarily.

But as Linus points out, recovering from that is not that costly
(the page will be in the swapcache mostly, it's just the cost of 
the page fault).

What about stealing the page only if the corresponding zone is
also running unbalanced?

Kanoj

> 
> >So that's why I think the page table walker should be completely
> >zone-blind, and just not care. It's likely to be more "balanced" that way
> >anyway.
> 
> The swapout will be definitely more balanced but we may end doing not
> necesary swapouts.
> 
> Andrea
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.nl.linux.org/Linux-MM/
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-14  2:13                         ` Ingo Molnar
@ 2000-01-14  1:17                           ` Kanoj Sarcar
  2000-01-14  2:36                             ` Ingo Molnar
  0 siblings, 1 reply; 40+ messages in thread
From: Kanoj Sarcar @ 2000-01-14  1:17 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Andrea Arcangeli, Alan Cox, Rik van Riel,
	linux-mm, linux-kernel

> 
> 
> On Thu, 13 Jan 2000, Linus Torvalds wrote:
> 
> > HOWEVER, I don't think this is going to be a huge issue in most cases. And
> > if people don't need non-DMA memory, then the pages we "swapped" out are
> > going to stay in RAM anyway, so it's not going to hurt us.
> > 
> > Anyway, I obviously do agree that I may well be wrong, and that real life
> > is going to come back and bite us, and we'll end up having to not do it
> > this way. However, I'd prefer trying the "conceptually simple" path first,
> > and only if it turns out that yes, I was completely wrong, do we try to
> > fix it up with magic heuristics etc.
> 
> hm., i think we'll see this with ISA soundcards (still the majority) if
> used as modules. Right now kswapd just gives up too easy and says 'no such
> page', on a box with lots of RAM and all DMA allocated in process VM
> space.
> 
> Anyway, the patch and suggestion of passing in a single zone is i believe
> completely wrong, because it advances mm->swap_address, which unfairly
> selects a given range to be checked for only one zone. So i think it's
> either zone-bitmaps (or equivalent multi-zone logic) or what you
> suggested, to have no zone-awareness in swap_out() for now at all.
> 
> (i believe this is also going to bite us with the IA64 port - kswapd will
> have no information to free pages from the right node, we could solve this
> already with a zone bitmap, or by starting per-zone kswapds. The latter

If you are talking about the discontig memory support, yes, I have thought
about that and arrived at the conclusion that rather than overdesign
right now, we will have to see how things work out on a real machine. 

There's been some arguments against per-zone, or per-node kswapd's, 
so the other alternative is to pass the list of unbalanced zones to
kswapd, which can then scan only the unbalanced ones. This is the 
best solution when there are fairly large number of nodes.

Kanoj

> one looks like overkill to me, but it's conceptually cleaner than bitmaps
> and and does not have a limitation on the number of zones. Might not be a
> highprio issue though.)
> 
> -- mingo
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-14  0:52                       ` Linus Torvalds
  2000-01-14  1:08                         ` Rik van Riel
@ 2000-01-14  2:13                         ` Ingo Molnar
  2000-01-14  1:17                           ` Kanoj Sarcar
  1 sibling, 1 reply; 40+ messages in thread
From: Ingo Molnar @ 2000-01-14  2:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrea Arcangeli, Kanoj Sarcar, Alan Cox, Rik van Riel, linux-mm,
	linux-kernel

On Thu, 13 Jan 2000, Linus Torvalds wrote:

> HOWEVER, I don't think this is going to be a huge issue in most cases. And
> if people don't need non-DMA memory, then the pages we "swapped" out are
> going to stay in RAM anyway, so it's not going to hurt us.
> 
> Anyway, I obviously do agree that I may well be wrong, and that real life
> is going to come back and bite us, and we'll end up having to not do it
> this way. However, I'd prefer trying the "conceptually simple" path first,
> and only if it turns out that yes, I was completely wrong, do we try to
> fix it up with magic heuristics etc.

hm., i think we'll see this with ISA soundcards (still the majority) if
used as modules. Right now kswapd just gives up too easily and says 'no such
page', on a box with lots of RAM and all DMA allocated in process VM
space.

Anyway, the patch and suggestion of passing in a single zone is i believe
completely wrong, because it advances mm->swap_address, which unfairly
selects a given range to be checked for only one zone. So i think it's
either zone-bitmaps (or equivalent multi-zone logic) or what you
suggested, to have no zone-awareness in swap_out() for now at all.

(i believe this is also going to bite us with the IA64 port - kswapd will
have no information to free pages from the right node, we could solve this
already with a zone bitmap, or by starting per-zone kswapds. The latter
one looks like overkill to me, but it's conceptually cleaner than bitmaps
and does not have a limitation on the number of zones. Might not be a
highprio issue though.)

-- mingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-14  1:13                       ` Kanoj Sarcar
@ 2000-01-14  2:27                         ` Ingo Molnar
  2000-01-14  2:46                         ` Ingo Molnar
  1 sibling, 0 replies; 40+ messages in thread
From: Ingo Molnar @ 2000-01-14  2:27 UTC (permalink / raw)
  To: Kanoj Sarcar
  Cc: Andrea Arcangeli, Linus Torvalds, Alan Cox, Rik van Riel,
	linux-mm, linux-kernel

On Thu, 13 Jan 2000, Kanoj Sarcar wrote:

> But as Linus points out, recovering from that is not that costly
> (the page will be in the swapcache mostly, its just the cost of 
> the page fault).
> 
> What about stealing the page only if the corresponding zone is
> also running unbalanced?

(this is exactly what i'm talking about.)

-- mingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-14  1:17                           ` Kanoj Sarcar
@ 2000-01-14  2:36                             ` Ingo Molnar
  2000-01-14 20:33                               ` Peter Rival
  0 siblings, 1 reply; 40+ messages in thread
From: Ingo Molnar @ 2000-01-14  2:36 UTC (permalink / raw)
  To: Kanoj Sarcar
  Cc: Linus Torvalds, Andrea Arcangeli, Alan Cox, Rik van Riel,
	linux-mm, linux-kernel

On Thu, 13 Jan 2000, Kanoj Sarcar wrote:

> There's been some arguments against per-zone, or per-node kswapd's, 
> so the other alternative is to pass the list of unbalanced zones to
> kswapd, which can then scan only the unbalanced ones. This is the 
> best solution when there are fairly large number of nodes.

the current kswapd is not quite suited to go per-zone and/or per-node, i
agree. But the swap_out() logic itself i believe has to be per-node in the
long term. Especially as we are already able to allocate from a given
node. Thus it would be natural to be able to do swap_out() from a given
node - both page tables and pages will likely be bound to a node. Per-node
kswapds are simple - they only have to take a look at p->node or
p->processor to pick up the right mm. This means that every kswapd would
pick up preferred mm's from its own node.

the pagecache and other 'global' freeing stuff should only run on one of
the kswapds. (or in a separate 'global' freeing daemon, just like bdflush
- although that i think would be an overkill.)

-- mingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-14  1:13                       ` Kanoj Sarcar
  2000-01-14  2:27                         ` Ingo Molnar
@ 2000-01-14  2:46                         ` Ingo Molnar
  2000-01-14  6:22                           ` Kanoj Sarcar
  1 sibling, 1 reply; 40+ messages in thread
From: Ingo Molnar @ 2000-01-14  2:46 UTC (permalink / raw)
  To: Kanoj Sarcar
  Cc: Andrea Arcangeli, Linus Torvalds, Alan Cox, Rik van Riel,
	linux-mm, linux-kernel

On Thu, 13 Jan 2000, Kanoj Sarcar wrote:

> But as Linus points out, recovering from that is not that costly
> (the page will be in the swapcache mostly, its just the cost of 
> the page fault).

note that i was not worried a bit about swapping performance. Swapping is
slow, conceptually. I'm worried about the current pte_young() logic and
the fact that pages can evade swap_out() completely just by being used
(read access) at least once per scan. This not only makes the system slow
(which we don't care about), but also unusable in certain cases. This is an
existing problem, Alan got 2.2 reports of frequent GFP_DMA failures on 1GB
boxes. (weird combination of hardware i agree) The zone rewrite already
made the situation much better by ordering zones, and i'll be completely
happy if we make the pte_young() branch in try_to_swap_out() at least
conditional on memory pressure :-)

-- mingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-14  2:46                         ` Ingo Molnar
@ 2000-01-14  6:22                           ` Kanoj Sarcar
  0 siblings, 0 replies; 40+ messages in thread
From: Kanoj Sarcar @ 2000-01-14  6:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrea Arcangeli, Linus Torvalds, Alan Cox, Rik van Riel,
	linux-mm, linux-kernel

> 
> On Thu, 13 Jan 2000, Kanoj Sarcar wrote:
> 
> > But as Linus points out, recovering from that is not that costly
> > (the page will be in the swapcache mostly, its just the cost of 
> > the page fault).
> 
> note that i was not worried a bit about swapping performance. Swapping is
> slow, conceptually. I'm worried about the current pte_young() logic and
> the fact that pages can evade swap_out() completely just by being used
> (read access) at least once per scan. This not only makes the system slow
> (which we dont care), but also unusable in certain cases. This is an
> existing problem, Alan got 2.2 reports of frequent GFP_DMA failures on 1GB
> boxes. (weird combination of hardware i agree) The zone rewrite already
> made the situation much better by ordering zones, and i'll be completely
> happy if we make the pte_young() branch in try_to_swap_out() at least
> conditional on memory pressure :-)
> 
> -- mingo
>

Might there be another reason for this possibly? As I point out in
the zone balance doc, if there are way too many non-dma pages, then
even when the dma zone is near empty, it will not be balanced because
the total number of free pages is above the watermark.

Alan did acknowledge this as a theoretical case, although he mentioned
that it was not happening in real life.

Kanoj
 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.nl.linux.org/Linux-MM/
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-13 21:40         ` Kanoj Sarcar
@ 2000-01-14 12:25           ` Jamie Lokier
  2000-01-14 13:43             ` Andrea Arcangeli
  0 siblings, 1 reply; 40+ messages in thread
From: Jamie Lokier @ 2000-01-14 12:25 UTC (permalink / raw)
  To: Kanoj Sarcar
  Cc: Andrea Arcangeli, Rik van Riel, Alan Cox, Linus Torvalds,
	linux-mm, linux-kernel

Speaking of kswapd zone balancing.  Can swap cache pages be moved from
one zone to another, or even duplicated according to need later on?

It would seem logical that when a page in the DMA zone is only held for
swap cache, it's worth copying it to the regular zone and using the copy
when the page is needed again to free up DMA pages without hitting the
disk.

I guess this sort of preemptive duplication is in a similar category to
pre-zeroing.  

just a thought,
-- Jamie

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-14 12:25           ` Jamie Lokier
@ 2000-01-14 13:43             ` Andrea Arcangeli
  0 siblings, 0 replies; 40+ messages in thread
From: Andrea Arcangeli @ 2000-01-14 13:43 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Kanoj Sarcar, Rik van Riel, Alan Cox, Linus Torvalds, linux-mm,
	linux-kernel

On Fri, 14 Jan 2000, Jamie Lokier wrote:

>It would seem logical that when a page in the DMA zone is only held for
>swap cache, it's worth copying it to the regular zone and using the copy
>when the page is needed again to free up DMA pages without hitting the
>disk.

That's basically what I am just doing for preserving regular pages w.r.t.
high pages in replace_with_highmem but currently I am not graceful against
DMA pages yet.

Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC] 2.3.39 zone balancing
  2000-01-14  2:36                             ` Ingo Molnar
@ 2000-01-14 20:33                               ` Peter Rival
  0 siblings, 0 replies; 40+ messages in thread
From: Peter Rival @ 2000-01-14 20:33 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Kanoj Sarcar, Linus Torvalds, Andrea Arcangeli, Alan Cox,
	Rik van Riel, linux-mm, linux-kernel

Sorry to be late on this thread...

Ingo Molnar wrote:

> On Thu, 13 Jan 2000, Kanoj Sarcar wrote:
>
> > There's been some arguments against per-zone, or per-node kswapd's,
> > so the other alternative is to pass the list of unbalanced zones to
> > kswapd, which can then scan only the unbalanced ones. This is the
> > best solution when there are fairly large number of nodes.
>
> the current kswapd is not quite suited to go per-zone and/or per-node, i
> agree. But the swap_out() logic itself i believe has to be per-node in the
> long term. Especially as we are already able to allocate from a given
> node. Thus it would be natural to be able to do swap_out() from a given
> node - both page tables and pages will likely be bound to a node. Per-node
> kswapds are simple - they only have to take a look at p->node or
> p->processor to pick up the right mm. This means that every kswapd would
> pick up preferred mm's from it's own node.
>

Just don't forget about memory-only nodes (i.e., don't use p->processor :)

 - Pete

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Reworked 2.3.39 zone balancing - v1
  2000-01-13 23:29                   ` Linus Torvalds
  2000-01-14  0:33                     ` Andrea Arcangeli
@ 2000-01-15  2:03                     ` Kanoj Sarcar
  1 sibling, 0 replies; 40+ messages in thread
From: Kanoj Sarcar @ 2000-01-15  2:03 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Alan Cox, Andrea Arcangeli, Rik van Riel, linux-mm,
	linux-kernel, Kanoj Sarcar

Okay folks, here's what I have now. I am still testing it, but I am 
sending it out for comments. 

* kswapd uses a list of zones to be balanced. Useful when the number
of zones is high (in numa, or discontigmem machines). Linus, I have
an alternate version where kswapd goes thru all zones in all pgdats
too, let me know if you would prefer that or want to take a look at 
it.

* any deallocator can decide whether freeing a page will benefit a
zone that has fallen below its watermarks by using PG_ZONE_BALANCED().
Thus shm_swap() and try_to_swap_out() use this to prevent "unneeded"
swapouts. Linus, I am open to deleting this check and going with
what you suggested initially. Added due to Andrea's and my concerns,
originally suggested by Ingo.

* Ingo's idea about victimizing young ptes in try_to_swap_out() under
high pressure not yet in this patch, will be added if this version looks
okay.

Comments/feedback welcome.

Kanoj

--- Documentation/vm/balance	Fri Jan 14 16:32:22 2000
+++ Documentation/vm/balance	Fri Jan 14 16:07:28 2000
@@ -0,0 +1,87 @@
+Started Jan 2000 by Kanoj Sarcar <kanoj@sgi.com>
+
+Memory balancing is needed for non __GFP_WAIT as well as for non
+__GFP_IO allocations.
+
+There are two reasons to be requesting non __GFP_WAIT allocations:
+the caller can not sleep (typically intr context), or does not want
+to incur cost overheads of page stealing and possible swap io for
+whatever reasons.
+
+Non __GFP_IO allocation requests are made to prevent file system deadlocks.
+
+In the absence of non sleepable allocation requests, it seems detrimental
+to be doing balancing. Page reclamation can be kicked off lazily, that
+is, only when needed (aka zone free memory is 0), instead of making it
+a proactive process.
+
+That being said, the kernel should try to fulfill requests for direct
+mapped pages from the direct mapped pool, instead of falling back on
+the dma pool, so as to keep the dma pool filled for dma requests (atomic
+or not). A similar argument applies to highmem and direct mapped pages.
+OTOH, if there is a lot of free dma pages, it is preferable to satisfy
+regular memory requests by allocating one from the dma pool, instead
+of incurring the overhead of regular zone balancing.
+
+In 2.2, memory balancing/page reclamation would kick off only when the
+_total_ number of free pages fell below 1/64th of total memory. With the
+right ratio of dma and regular memory, it is quite possible that balancing
+would not be done even when the dma zone was completely empty. 2.2 has
+been running production machines of varying memory sizes, and seems to be
+doing fine even with the presence of this problem. In 2.3, due to
+HIGHMEM, this problem is aggravated.
+
+In 2.3, zone balancing can be done in one of two ways: depending on the
+zone size (and possibly of the size of lower class zones), we can decide
+at init time how many free pages we should aim for while balancing any
+zone. The good part is, while balancing, we do not need to look at sizes
+of lower class zones, the bad part is, we might do too frequent balancing
+due to ignoring possibly lower usage in the lower class zones. Also,
+with a slight change in the allocation routine, it is possible to reduce
+the memclass() macro to be a simple equality.
+
+Another possible solution is that we balance only when the free memory
+of a zone _and_ all its lower class zones falls below 1/64th of the
+total memory in the zone and its lower class zones. This fixes the 2.2
+balancing problem, and stays as close to 2.2 behavior as possible. Also,
+the balancing algorithm works the same way on the various architectures,
+which have different numbers and types of zones. If we wanted to get
+fancy, we could assign different weights to free pages in different
+zones in the future.
+
+Note that if the size of the regular zone is huge compared to dma zone,
+it becomes less significant to consider the free dma pages while
+deciding whether to balance the regular zone. The first solution
+becomes more attractive then.
+
+The appended patch implements the second solution. It also "fixes" two
+problems: first, kswapd is woken up as in 2.2 on low memory conditions
+for non-sleepable allocations. Second, the HIGHMEM zone is also balanced,
+so as to give a fighting chance for replace_with_highmem() to get a
+HIGHMEM page, as well as to ensure that HIGHMEM allocations do not
+fall back into regular zone. This also makes sure that HIGHMEM pages
+are not leaked (for example, in situations where a HIGHMEM page is in 
+the swapcache but is not being used by anyone)
+
+kswapd also needs to know about the zones it should balance. kswapd is
+primarily needed in a situation where balancing can not be done, 
+probably because all allocation requests are coming from intr context
+and all process contexts are sleeping. For 2.3, kswapd does not really
+need to balance the highmem zone, since intr context does not request
+highmem pages. So as not to spend too much time searching for the zones
+that need balancing (especially in a numa or discontig machine with multiple
+zones), kswapd expects to see the zones that it needs to balance in a list.
+Page alloc requests add zones to the list, kswapd deletes zones from the
+list once they are balanced (kswapd could also delete zones from the list
+once it has had a go at it, whether the zone ends up balanced or not), and
+kswapd scans the list without the list lock.
+
+Page stealing from process memory and shm is done if stealing the page would
+alleviate memory pressure on any zone in the page's node that has fallen below
+its watermark.
+
+(Good) Ideas that I have heard:
+1. Dynamic experience should influence balancing: number of failed requests
+for a zone can be tracked and fed into the balancing scheme (jalvo@mbay.net)
+2. Implement a replace_with_highmem()-like replace_with_regular() to preserve
+dma pages. (lkd@tantalophile.demon.co.uk)
--- fs/dcache.c	Tue Jan 11 11:00:25 2000
+++ fs/dcache.c	Thu Jan 13 13:59:18 2000
@@ -412,20 +412,18 @@
  */
 int shrink_dcache_memory(int priority, unsigned int gfp_mask, zone_t * zone)
 {
-	if (gfp_mask & __GFP_IO) {
-		int count = 0;
-		lock_kernel();
-		if (priority)
-			count = dentry_stat.nr_unused / priority;
-		prune_dcache(count);
-		unlock_kernel();
-		/* FIXME: kmem_cache_shrink here should tell us
-		   the number of pages freed, and it should
-		   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
-		   to free only the interesting pages in
-		   function of the needs of the current allocation. */
-		kmem_cache_shrink(dentry_cache);
-	}
+	int count = 0;
+	lock_kernel();
+	if (priority)
+		count = dentry_stat.nr_unused / priority;
+	prune_dcache(count);
+	unlock_kernel();
+	/* FIXME: kmem_cache_shrink here should tell us
+	   the number of pages freed, and it should
+	   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
+	   to free only the interesting pages in
+	   function of the needs of the current allocation. */
+	kmem_cache_shrink(dentry_cache);
 
 	return 0;
 }
--- fs/inode.c	Tue Jan 11 11:00:25 2000
+++ fs/inode.c	Thu Jan 13 13:59:34 2000
@@ -398,20 +398,17 @@
 
 int shrink_icache_memory(int priority, int gfp_mask, zone_t *zone)
 {
-	if (gfp_mask & __GFP_IO)
-	{
-		int count = 0;
+	int count = 0;
 		
-		if (priority)
-			count = inodes_stat.nr_unused / priority;
-		prune_icache(count);
-		/* FIXME: kmem_cache_shrink here should tell us
-		   the number of pages freed, and it should
-		   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
-		   to free only the interesting pages in
-		   function of the needs of the current allocation. */
-		kmem_cache_shrink(inode_cachep);
-	}
+	if (priority)
+		count = inodes_stat.nr_unused / priority;
+	prune_icache(count);
+	/* FIXME: kmem_cache_shrink here should tell us
+	   the number of pages freed, and it should
+	   work in a __GFP_DMA/__GFP_HIGHMEM behaviour
+	   to free only the interesting pages in
+	   function of the needs of the current allocation. */
+	kmem_cache_shrink(inode_cachep);
 
 	return 0;
 }
--- include/linux/mmzone.h	Tue Jan 11 11:00:28 2000
+++ include/linux/mmzone.h	Fri Jan 14 15:59:18 2000
@@ -7,6 +7,7 @@
 #include <linux/config.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <asm/bitops.h>
 
 /*
  * Free memory management - zoned buddy allocator.
@@ -37,6 +38,7 @@
 	int low_on_memory;
 	unsigned long pages_low, pages_high;
 	struct pglist_data *zone_pgdat;
+	struct list_head balance_list;
 
 	/*
 	 * free areas of different sizes
@@ -80,13 +82,24 @@
 	struct page *node_mem_map;
 	unsigned long *valid_addr_bitmap;
 	struct bootmem_data *bdata;
+	unsigned int balance_mask;
 } pg_data_t;
 
 extern int numnodes;
+extern struct list_head global_balance_list;
 
+#define zone_index(zone)	((zone) - (zone)->zone_pgdat->node_zones)
+#define zone_mask(zone)		(1 << zone_index(zone))
 #define memclass(pgzone, tzone)	(((pgzone)->zone_pgdat == (tzone)->zone_pgdat) \
-			&& (((pgzone) - (pgzone)->zone_pgdat->node_zones) <= \
-			((tzone) - (pgzone)->zone_pgdat->node_zones)))
+				&& (zone_index(pgzone) <= zone_index(tzone)))
+#define MARK_ZONE_UNBALANCED(zone) \
+		set_bit(zone_index(zone), &(zone)->zone_pgdat->balance_mask)
+#define MARK_ZONE_BALANCED(zone) \
+		clear_bit(zone_index(zone), &(zone)->zone_pgdat->balance_mask)
+#define PG_ZONE_BALANCED(zone) \
+			(zone_mask(zone) > (zone)->zone_pgdat->balance_mask)
+
+extern int zone_balance_memory(zone_t *zone, int gfp_mask);
 
 #ifndef CONFIG_DISCONTIGMEM
 
--- ipc/shm.c	Tue Jan 11 11:00:31 2000
+++ ipc/shm.c	Fri Jan 14 14:57:55 2000
@@ -958,7 +958,7 @@
 	if (!pte_present(page))
 		goto check_table;
 	page_map = pte_page(page);
-	if (zone && (!memclass(page_map->zone, zone)))
+	if (PG_ZONE_BALANCED(page_map->zone))
 		goto check_table;
 	swap_attempts++;
 
--- mm/page_alloc.c	Tue Jan 11 11:00:31 2000
+++ mm/page_alloc.c	Fri Jan 14 15:32:40 2000
@@ -6,6 +6,7 @@
  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
  */
 
 #include <linux/config.h>
@@ -194,26 +195,84 @@
 	return NULL;
 }
 
+static spinlock_t balance_lock = SPIN_LOCK_UNLOCKED;
+
+static inline void del_balance_list(zone_t *zone, int gfp_mask)
+{
+	unsigned long flags;
+
+	/*
+	 * Only kswapd deletes.
+	 */
+	if (gfp_mask != GFP_KSWAPD) return;
+	spin_lock_irqsave(&balance_lock, flags);
+	if (!list_empty(&zone->balance_list)) {
+		list_del(&zone->balance_list);
+		INIT_LIST_HEAD(&zone->balance_list);
+	}
+	spin_unlock_irqrestore(&balance_lock, flags);
+}
+
+static inline void add_balance_list(zone_t *zone, int gfp_mask)
+{
+	unsigned long flags;
+
+	/*
+	 * kswapd never adds.
+	 */
+	if (gfp_mask == GFP_KSWAPD) return;
+	spin_lock_irqsave(&balance_lock, flags);
+	if (list_empty(&zone->balance_list)) {
+		list_add_tail(&zone->balance_list, &global_balance_list);
+	}
+	spin_unlock_irqrestore(&balance_lock, flags);
+}
+
+static inline unsigned long classfree(zone_t *zone)
+{
+	unsigned long free = 0;
+	zone_t *z = zone->zone_pgdat->node_zones;
+
+	while (z != zone) {
+		free += z->free_pages;
+		z++;
+	}
+	free += zone->free_pages;
+	return(free);
+}
+
 #define ZONE_BALANCED(zone) \
 	(((zone)->free_pages > (zone)->pages_low) && (!(zone)->low_on_memory))
 
-static inline int zone_balance_memory (zone_t *zone, int gfp_mask)
+int zone_balance_memory (zone_t *zone, int gfp_mask)
 {
 	int freed;
+	unsigned long flags;
+	unsigned long free = classfree(zone);
 
-	if (zone->free_pages >= zone->pages_low) {
-		if (!zone->low_on_memory)
+	spin_lock_irqsave(&zone->lock, flags);
+	if (free >= zone->pages_low) {
+		if (!zone->low_on_memory) {
+			spin_unlock_irqrestore(&zone->lock, flags);
 			return 1;
+		}
 		/*
 		 * Simple hysteresis: exit 'low memory mode' if
 		 * the upper limit has been reached:
 		 */
-		if (zone->free_pages >= zone->pages_high) {
+		if (free >= zone->pages_high) {
 			zone->low_on_memory = 0;
+			del_balance_list(zone, gfp_mask);
+			MARK_ZONE_BALANCED(zone);
+			spin_unlock_irqrestore(&zone->lock, flags);
 			return 1;
 		}
-	} else
+	} else {
+		add_balance_list(zone, gfp_mask);
+		MARK_ZONE_UNBALANCED(zone);
 		zone->low_on_memory = 1;
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
 
 	/*
 	 * In the atomic allocation case we only 'kick' the
@@ -220,12 +279,7 @@
 	 * state machine, but do not try to free pages
 	 * ourselves.
 	 */
-	if (!(gfp_mask & __GFP_WAIT))
-		return 1;
-
-	current->flags |= PF_MEMALLOC;
 	freed = try_to_free_pages(gfp_mask, zone);
-	current->flags &= ~PF_MEMALLOC;
 
 	if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
 		return 0;
@@ -232,6 +286,7 @@
 	return 1;
 }
 
+#if 0
 /*
  * We are still balancing memory in a global way:
  */
@@ -260,17 +315,13 @@
 	 * state machine, but do not try to free pages
 	 * ourselves.
 	 */
-	if (!(gfp_mask & __GFP_WAIT))
-		return 1;
-
-	current->flags |= PF_MEMALLOC;
 	freed = try_to_free_pages(gfp_mask, zone);
-	current->flags &= ~PF_MEMALLOC;
 
 	if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
 		return 0;
 	return 1;
 }
+#endif
 
 /*
  * This is the 'heart' of the zoned buddy allocator:
@@ -340,7 +391,7 @@
  * The main chunk of the balancing code is in this offline branch:
  */
 balance:
-	if (!balance_memory(z, gfp_mask))
+	if (!zone_balance_memory(z, gfp_mask))
 		goto nopage;
 	goto ready;
 }
@@ -513,6 +564,7 @@
 	unsigned long i, j;
 	unsigned long map_size;
 	unsigned int totalpages, offset;
+	unsigned int cumulative = 0;
 
 	totalpages = 0;
 	for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -565,7 +617,7 @@
 	offset = lmem_map - mem_map;	
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		zone_t *zone = pgdat->node_zones + j;
-		unsigned long mask = -1;
+		unsigned long mask;
 		unsigned long size;
 
 		size = zones_size[j];
@@ -579,14 +631,13 @@
 			continue;
 
 		zone->offset = offset;
-		/*
-		 * It's unnecessery to balance the high memory zone
-		 */
-		if (j != ZONE_HIGHMEM) {
-			zone->pages_low = freepages.low;
-			zone->pages_high = freepages.high;
-		}
+		cumulative += size;
+		mask = (cumulative >> 7);
+		if (mask < 1) mask = 1;
+		zone->pages_low = mask*2;
+		zone->pages_high = mask*3;
 		zone->low_on_memory = 0;
+		INIT_LIST_HEAD(&zone->balance_list);
 
 		for (i = 0; i < size; i++) {
 			struct page *page = mem_map + offset + i;
@@ -598,6 +649,7 @@
 		}
 
 		offset += size;
+		mask = -1;
 		for (i = 0; i < MAX_ORDER; i++) {
 			unsigned long bitmap_size;
 
--- mm/vmscan.c	Tue Jan 11 11:00:31 2000
+++ mm/vmscan.c	Fri Jan 14 15:00:39 2000
@@ -8,6 +8,7 @@
  *  Removed kswapd_ctl limits, and swap out as many pages as needed
  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
+ *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
  */
 
 #include <linux/slab.h>
@@ -58,9 +59,7 @@
 		goto out_failed;
 	}
 
-	if (PageReserved(page)
-	    || PageLocked(page)
-	    || (zone && (!memclass(page->zone, zone))))
+	if (PageReserved(page) || PageLocked(page) || PG_ZONE_BALANCED(page->zone))
 		goto out_failed;
 
 	/*
@@ -424,16 +423,19 @@
 				goto done;
 		}
 
-		/* don't be too light against the d/i cache since
-		   shrink_mmap() almost never fail when there's
-		   really plenty of memory free. */
-		count -= shrink_dcache_memory(priority, gfp_mask, zone);
-		count -= shrink_icache_memory(priority, gfp_mask, zone);
-		if (count <= 0)
-			goto done;
-
-		/* Try to get rid of some shared memory pages.. */
 		if (gfp_mask & __GFP_IO) {
+
+			/* 
+			 * don't be too light against the d/i cache since
+		   	 * shrink_mmap() almost never fails when there's
+		   	 * really plenty of memory free. 
+			 */
+			count -= shrink_dcache_memory(priority, gfp_mask, zone);
+			count -= shrink_icache_memory(priority, gfp_mask, zone);
+			if (count <= 0)
+				goto done;
+
+			/* Try to get rid of some shared memory pages.. */
 			while (shm_swap(priority, gfp_mask, zone)) {
 				if (!--count)
 					goto done;
@@ -467,8 +469,13 @@
  * If there are applications that are active memory-allocators
  * (most normal use), this basically shouldn't matter.
  */
+
+struct list_head global_balance_list = LIST_HEAD_INIT(global_balance_list);
+
 int kswapd(void *unused)
 {
+	zone_t	*zone;
+	struct list_head *lhd;
 	struct task_struct *tsk = current;
 
 	kswapd_process = tsk;
@@ -489,7 +496,6 @@
 	 * us from recursively trying to free more memory as we're
 	 * trying to free the first piece of memory in the first place).
 	 */
-	tsk->flags |= PF_MEMALLOC;
 
 	while (1) {
 		/*
@@ -503,10 +509,18 @@
 		do {
 			/* kswapd is critical to provide GFP_ATOMIC
 			   allocations (not GFP_HIGHMEM ones). */
-			if (nr_free_buffer_pages() >= freepages.high)
-				break;
-			if (!do_try_to_free_pages(GFP_KSWAPD, 0))
-				break;
+			/*
+			 * kswapd can scan the chain without lock since
+			 * it is the only chain deleter. New elements
+			 * are added at end of list.
+			 */
+			lhd = global_balance_list.next;
+			while (lhd != &global_balance_list) {
+				zone = list_entry(lhd, zone_t, balance_list);
+				lhd = lhd->next;
+				zone_balance_memory(zone, GFP_KSWAPD);
+			}
+			tsk->flags |= PF_MEMALLOC;
 			run_task_queue(&tq_disk);
 		} while (!tsk->need_resched);
 		tsk->state = TASK_INTERRUPTIBLE;
@@ -533,9 +547,13 @@
 {
 	int retval = 1;
 
-	wake_up_process(kswapd_process);
-	if (gfp_mask & __GFP_WAIT)
+	if (gfp_mask != GFP_KSWAPD)
+		wake_up_process(kswapd_process);
+	if (gfp_mask & __GFP_WAIT) {
+		current->flags |= PF_MEMALLOC;
 		retval = do_try_to_free_pages(gfp_mask, zone);
+		current->flags &= ~PF_MEMALLOC;
+	}
 	return retval;
 }
 
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.nl.linux.org/Linux-MM/

^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2000-01-15  2:03 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2000-01-12 21:11 [RFC] 2.3.39 zone balancing Kanoj Sarcar
2000-01-13 13:40 ` Rik van Riel
2000-01-13 17:06   ` Andrea Arcangeli
2000-01-13 17:18   ` Alan Cox
2000-01-13 18:37     ` Rik van Riel
2000-01-13 20:13       ` Andrea Arcangeli
2000-01-13 21:12         ` Rik van Riel
2000-01-13 21:40         ` Kanoj Sarcar
2000-01-14 12:25           ` Jamie Lokier
2000-01-14 13:43             ` Andrea Arcangeli
2000-01-13 18:52   ` Kanoj Sarcar
2000-01-13 19:59     ` Andrea Arcangeli
2000-01-13 21:02       ` Kanoj Sarcar
2000-01-13 21:34         ` Benjamin C.R. LaHaise
2000-01-13 21:48           ` Kanoj Sarcar
2000-01-13 21:42         ` Alan Cox
2000-01-13 21:50           ` Kanoj Sarcar
2000-01-13 21:53             ` Alan Cox
2000-01-13 22:01           ` Linus Torvalds
2000-01-13 22:13             ` Kanoj Sarcar
2000-01-13 22:28               ` Rik van Riel
2000-01-13 22:30               ` Linus Torvalds
2000-01-13 23:53                 ` Ingo Molnar
2000-01-13 23:29                   ` Linus Torvalds
2000-01-14  0:33                     ` Andrea Arcangeli
2000-01-14  0:52                       ` Linus Torvalds
2000-01-14  1:08                         ` Rik van Riel
2000-01-14  2:13                         ` Ingo Molnar
2000-01-14  1:17                           ` Kanoj Sarcar
2000-01-14  2:36                             ` Ingo Molnar
2000-01-14 20:33                               ` Peter Rival
2000-01-14  1:13                       ` Kanoj Sarcar
2000-01-14  2:27                         ` Ingo Molnar
2000-01-14  2:46                         ` Ingo Molnar
2000-01-14  6:22                           ` Kanoj Sarcar
2000-01-15  2:03                     ` Reworked 2.3.39 zone balancing - v1 Kanoj Sarcar
2000-01-14  0:28                 ` [RFC] 2.3.39 zone balancing Andrea Arcangeli
2000-01-13 17:12 ` Andrea Arcangeli
2000-01-13 18:30   ` Kanoj Sarcar
2000-01-13 19:22     ` Andrea Arcangeli

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox