linux-mm.kvack.org archive mirror
* [RFC] [RFT] [PATCH] memory zone balancing
@ 1999-12-15 19:50 Kanoj Sarcar
  1999-12-17  7:27 ` Mike Galbraith
  1999-12-27 16:06 ` Andrea Arcangeli
  0 siblings, 2 replies; 12+ messages in thread
From: Kanoj Sarcar @ 1999-12-15 19:50 UTC (permalink / raw)
  To: linux-kernel, linux-mm; +Cc: Kanoj Sarcar

Hi,

Could people try out this patch against 2.3.33 and send comments/feedback.

The patch cleans up the way we do memory balancing in the core page alloc
and free routines. 
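
In rough outline, every reclaim routine gains a zone_t * argument and
skips pages that do not belong to the zone being balanced. A minimal
sketch of the filter (not the patch itself; identifiers as in 2.3.33):

	/* Before: per-page checks driven by the allocation's gfp_mask. */
	if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
		goto skip;
	if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page))
		goto skip;

	/* After: check against the zone being balanced; a NULL zone
	 * means any page is acceptable (e.g. kswapd's global pass). */
	if (zone && page->zone != zone)
		goto skip;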

Thanks.

Kanoj

--- /usr/tmp/p_rdiff_a000os/dcache.c	Wed Dec 15 11:37:12 1999
+++ fs/dcache.c	Tue Dec 14 21:44:20 1999
@@ -410,7 +410,7 @@
  *  ...
  *   6 - base-level: try to shrink a bit.
  */
-int shrink_dcache_memory(int priority, unsigned int gfp_mask)
+int shrink_dcache_memory(int priority, unsigned int gfp_mask, zone_t * zone)
 {
 	if (gfp_mask & __GFP_IO) {
 		int count = 0;
--- /usr/tmp/p_rdiff_a000p1/inode.c	Wed Dec 15 11:37:20 1999
+++ fs/inode.c	Tue Dec 14 21:45:14 1999
@@ -392,7 +392,7 @@
 	dispose_list(freeable);
 }
 
-int shrink_icache_memory(int priority, int gfp_mask)
+int shrink_icache_memory(int priority, int gfp_mask, zone_t *zone)
 {
 	if (gfp_mask & __GFP_IO)
 	{
--- /usr/tmp/p_rdiff_a000pA/dcache.h	Wed Dec 15 11:37:28 1999
+++ include/linux/dcache.h	Tue Dec 14 22:10:55 1999
@@ -136,13 +136,13 @@
 extern int d_invalidate(struct dentry *);
 
 #define shrink_dcache() prune_dcache(0)
-
+struct zone_struct;
 /* dcache memory management */
-extern int shrink_dcache_memory(int, unsigned int);
+extern int shrink_dcache_memory(int, unsigned int, struct zone_struct *);
 extern void prune_dcache(int);
 
 /* icache memory management (defined in linux/fs/inode.c) */
-extern int shrink_icache_memory(int, int);
+extern int shrink_icache_memory(int, int, struct zone_struct *);
 extern void prune_icache(int);
 
 /* only used at mount-time */
--- /usr/tmp/p_rdiff_a000pJ/mm.h	Wed Dec 15 11:37:36 1999
+++ include/linux/mm.h	Wed Dec 15 11:12:36 1999
@@ -183,7 +183,6 @@
 #define ClearPageError(page)	clear_bit(PG_error, &(page)->flags)
 #define PageReferenced(page)	test_bit(PG_referenced, &(page)->flags)
 #define PageDecrAfter(page)	test_bit(PG_decr_after, &(page)->flags)
-#define PageDMA(page)		(contig_page_data.node_zones + ZONE_DMA == (page)->zone)
 #define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
 #define PageSwapCache(page)	test_bit(PG_swap_cache, &(page)->flags)
 #define PageReserved(page)	test_bit(PG_reserved, &(page)->flags)
@@ -434,10 +433,11 @@
 extern int do_munmap(unsigned long, size_t);
 extern unsigned long do_brk(unsigned long, unsigned long);
 
+struct zone_t;
 /* filemap.c */
 extern void remove_inode_page(struct page *);
 extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int shrink_mmap(int, int, zone_t *);
 extern void truncate_inode_pages(struct inode *, loff_t);
 
 /*
--- /usr/tmp/p_rdiff_a000pS/swap.h	Wed Dec 15 11:37:43 1999
+++ include/linux/swap.h	Tue Dec 14 22:11:01 1999
@@ -78,14 +78,15 @@
 struct vm_area_struct;
 struct sysinfo;
 
+struct zone_t;
 /* linux/ipc/shm.c */
-extern int shm_swap (int, int);
+extern int shm_swap (int, int, zone_t *);
 
 /* linux/mm/swap.c */
 extern void swap_setup (void);
 
 /* linux/mm/vmscan.c */
-extern int try_to_free_pages(unsigned int gfp_mask);
+extern int try_to_free_pages(unsigned int gfp_mask, zone_t *zone);
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, struct page *, int);
--- /usr/tmp/p_rdiff_a000p-/shm.c	Wed Dec 15 11:37:52 1999
+++ ipc/shm.c	Wed Dec 15 02:24:27 1999
@@ -799,7 +799,7 @@
 static unsigned long swap_id = 0; /* currently being swapped */
 static unsigned long swap_idx = 0; /* next to swap */
 
-int shm_swap (int prio, int gfp_mask)
+int shm_swap (int prio, int gfp_mask, zone_t *zone)
 {
 	pte_t page;
 	struct shmid_kernel *shp;
@@ -849,9 +849,7 @@
 	if (!pte_present(page))
 		goto check_table;
 	page_map = pte_page(page);
-	if ((gfp_mask & __GFP_DMA) && !PageDMA(page_map))
-		goto check_table;
-	if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page_map))
+	if (zone && (page_map->zone != zone))
 		goto check_table;
 	swap_attempts++;
 
--- /usr/tmp/p_rdiff_a000pi/util.c	Wed Dec 15 11:38:00 1999
+++ ipc/util.c	Tue Dec 14 21:46:31 1999
@@ -214,7 +214,7 @@
     return;
 }
 
-int shm_swap (int prio, int gfp_mask)
+int shm_swap (int prio, int gfp_mask, zone_t *zone)
 {
     return 0;
 }
--- /usr/tmp/p_rdiff_a000pr/filemap.c	Wed Dec 15 11:38:14 1999
+++ mm/filemap.c	Wed Dec 15 00:23:44 1999
@@ -211,7 +211,7 @@
 	spin_unlock(&pagecache_lock);
 }
 
-int shrink_mmap(int priority, int gfp_mask)
+int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
 {
 	int ret = 0, count;
 	LIST_HEAD(young);
@@ -239,9 +239,7 @@
 
 		dispose = &old;
 		/* don't account passes over not DMA pages */
-		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
-			goto dispose_continue;
-		if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page))
+		if (zone && (page->zone != zone))
 			goto dispose_continue;
 
 		count--;
--- /usr/tmp/p_rdiff_a000q0/page_alloc.c	Wed Dec 15 11:38:22 1999
+++ mm/page_alloc.c	Wed Dec 15 03:01:00 1999
@@ -224,7 +224,7 @@
 		return 1;
 
 	current->flags |= PF_MEMALLOC;
-	freed = try_to_free_pages(gfp_mask);
+	freed = try_to_free_pages(gfp_mask, zone);
 	current->flags &= ~PF_MEMALLOC;
 
 	if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
@@ -264,7 +264,7 @@
 		return 1;
 
 	current->flags |= PF_MEMALLOC;
-	freed = try_to_free_pages(gfp_mask);
+	freed = try_to_free_pages(gfp_mask, 0);
 	current->flags &= ~PF_MEMALLOC;
 
 	if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
@@ -340,7 +340,7 @@
  * The main chunk of the balancing code is in this offline branch:
  */
 balance:
-	if (!balance_memory(gfp_mask))
+	if (!zone_balance_memory(z, gfp_mask))
 		goto nopage;
 	goto ready;
 }
@@ -533,9 +533,9 @@
 		i = 10;
 	if (i > 256)
 		i = 256;
-	freepages.min = i;
-	freepages.low = i * 2;
-	freepages.high = i * 3;
+	freepages.min += i;
+	freepages.low += i * 2;
+	freepages.high += i * 3;
 
 	/*
 	 * Some architectures (with lots of mem and discontinous memory
@@ -565,10 +565,9 @@
 	offset = lmem_map - mem_map;	
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		zone_t *zone = pgdat->node_zones + j;
-		unsigned long mask = -1;
-		unsigned long size;
+		unsigned long mask, size;
 
-		size = zones_size[j];
+		mask = size = zones_size[j];
 
 		printk("zone(%ld): %ld pages.\n", j, size);
 		zone->size = size;
@@ -578,13 +577,13 @@
 			continue;
 
 		zone->offset = offset;
-		/*
-		 * It's unnecessery to balance the high memory zone
-		 */
-		if (j != ZONE_HIGHMEM) {
-			zone->pages_low = freepages.low;
-			zone->pages_high = freepages.high;
-		}
+		mask = mask >> 7;
+		if (mask < 10)
+			mask = 10;
+		if (mask > 256)
+			mask = 256;
+		zone->pages_low = mask * 2;
+		zone->pages_high = mask * 3;
 		zone->low_on_memory = 0;
 
 		for (i = 0; i < size; i++) {
@@ -597,6 +596,7 @@
 		}
 
 		offset += size;
+		mask = -1;
 		for (i = 0; i < MAX_ORDER; i++) {
 			unsigned long bitmap_size;
 
--- /usr/tmp/p_rdiff_a000q9/slab.c	Wed Dec 15 11:38:38 1999
+++ mm/slab.c	Wed Dec 15 11:25:44 1999
@@ -503,6 +503,11 @@
 {
 	void	*addr;
 
+	/*
+	 * If we requested dmaable memory, we will get it. Even if we 
+	 * did not request dmaable memory, we might get it, but that
+	 * would be relatively rare and ignorable.
+	 */
 	*dma = flags & SLAB_DMA;
 	addr = (void*) __get_free_pages(flags, cachep->c_gfporder);
 	/* Assume that now we have the pages no one else can legally
@@ -511,18 +516,6 @@
 	 * it is a named-page or buffer-page.  The members it tests are
 	 * of no interest here.....
 	 */
-	if (!*dma && addr) {
-		/* Need to check if can dma. */
-		struct page *page = mem_map + MAP_NR(addr);
-		*dma = 1<<cachep->c_gfporder;
-		while ((*dma)--) {
-			if (!PageDMA(page)) {
-				*dma = 0;
-				break;
-			}
-			page++;
-		}
-	}
 	return addr;
 }
 
--- /usr/tmp/p_rdiff_a000qI/vmscan.c	Wed Dec 15 11:38:48 1999
+++ mm/vmscan.c	Wed Dec 15 02:23:07 1999
@@ -33,7 +33,7 @@
  * using a process that no longer actually exists (it might
  * have died while we slept).
  */
-static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
+static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask, zone_t *zone)
 {
 	pte_t pte;
 	swp_entry_t entry;
@@ -60,8 +60,7 @@
 
 	if (PageReserved(page)
 	    || PageLocked(page)
-	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page))
-	    || (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page)))
+	    || (zone && (page->zone != zone)))
 		goto out_failed;
 
 	/*
@@ -195,7 +194,7 @@
  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
  */
 
-static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask, zone_t *zone)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -217,7 +216,7 @@
 	do {
 		int result;
 		vma->vm_mm->swap_address = address + PAGE_SIZE;
-		result = try_to_swap_out(vma, address, pte, gfp_mask);
+		result = try_to_swap_out(vma, address, pte, gfp_mask, zone);
 		if (result)
 			return result;
 		address += PAGE_SIZE;
@@ -226,7 +225,7 @@
 	return 0;
 }
 
-static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask, zone_t *zone)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -246,7 +245,7 @@
 		end = pgd_end;
 	
 	do {
-		int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
+		int result = swap_out_pmd(vma, pmd, address, end, gfp_mask, zone);
 		if (result)
 			return result;
 		address = (address + PMD_SIZE) & PMD_MASK;
@@ -255,7 +254,7 @@
 	return 0;
 }
 
-static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
+static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask, zone_t *zone)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -270,7 +269,7 @@
 	if (address >= end)
 		BUG();
 	do {
-		int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
+		int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask, zone);
 		if (result)
 			return result;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
@@ -279,7 +278,7 @@
 	return 0;
 }
 
-static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
+static int swap_out_mm(struct mm_struct * mm, int gfp_mask, zone_t *zone)
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
@@ -300,7 +299,7 @@
 			address = vma->vm_start;
 
 		for (;;) {
-			int result = swap_out_vma(vma, address, gfp_mask);
+			int result = swap_out_vma(vma, address, gfp_mask, zone);
 			if (result)
 				return result;
 			vma = vma->vm_next;
@@ -322,7 +321,7 @@
  * N.B. This function returns only 0 or 1.  Return values != 1 from
  * the lower level routines result in continued processing.
  */
-static int swap_out(unsigned int priority, int gfp_mask)
+static int swap_out(unsigned int priority, int gfp_mask, zone_t *zone)
 {
 	struct task_struct * p;
 	int counter;
@@ -383,7 +382,7 @@
 			int ret;
 
 			atomic_inc(&best->mm_count);
-			ret = swap_out_mm(best, gfp_mask);
+			ret = swap_out_mm(best, gfp_mask, zone);
 			mmdrop(best);
 
 			if (!ret)
@@ -409,7 +408,7 @@
  * cluster them so that we get good swap-out behaviour. See
  * the "free_memory()" macro for details.
  */
-static int do_try_to_free_pages(unsigned int gfp_mask)
+static int do_try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
 {
 	int priority;
 	int count = SWAP_CLUSTER_MAX;
@@ -419,7 +418,7 @@
 
 	priority = 6;
 	do {
-		while (shrink_mmap(priority, gfp_mask)) {
+		while (shrink_mmap(priority, gfp_mask, zone)) {
 			if (!--count)
 				goto done;
 		}
@@ -427,14 +426,14 @@
 		/* don't be too light against the d/i cache since
 		   shrink_mmap() almost never fail when there's
 		   really plenty of memory free. */
-		count -= shrink_dcache_memory(priority, gfp_mask);
-		count -= shrink_icache_memory(priority, gfp_mask);
+		count -= shrink_dcache_memory(priority, gfp_mask, zone);
+		count -= shrink_icache_memory(priority, gfp_mask, zone);
 		if (count <= 0)
 			goto done;
 
 		/* Try to get rid of some shared memory pages.. */
 		if (gfp_mask & __GFP_IO) {
-			while (shm_swap(priority, gfp_mask)) {
+			while (shm_swap(priority, gfp_mask, zone)) {
 				if (!--count)
 					goto done;
 			}
@@ -441,7 +440,7 @@
 		}
 
 		/* Then, try to page stuff out.. */
-		while (swap_out(priority, gfp_mask)) {
+		while (swap_out(priority, gfp_mask, zone)) {
 			if (!--count)
 				goto done;
 		}
@@ -505,7 +504,7 @@
 			   allocations (not GFP_HIGHMEM ones). */
 			if (nr_free_buffer_pages() >= freepages.high)
 				break;
-			if (!do_try_to_free_pages(GFP_KSWAPD))
+			if (!do_try_to_free_pages(GFP_KSWAPD, 0))
 				break;
 			run_task_queue(&tq_disk);
 		} while (!tsk->need_resched);
@@ -529,13 +528,13 @@
  * can be done by just dropping cached pages without having
  * any deadlock issues.
  */
-int try_to_free_pages(unsigned int gfp_mask)
+int try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
 {
 	int retval = 1;
 
 	wake_up_process(kswapd_process);
 	if (gfp_mask & __GFP_WAIT)
-		retval = do_try_to_free_pages(gfp_mask);
+		retval = do_try_to_free_pages(gfp_mask, zone);
 	return retval;
 }
 


* Re: [RFC] [RFT] [PATCH] memory zone balancing
  1999-12-15 19:50 [RFC] [RFT] [PATCH] memory zone balancing Kanoj Sarcar
@ 1999-12-17  7:27 ` Mike Galbraith
  1999-12-27 16:00   ` Andrea Arcangeli
  1999-12-27 16:06 ` Andrea Arcangeli
  1 sibling, 1 reply; 12+ messages in thread
From: Mike Galbraith @ 1999-12-17  7:27 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: linux-kernel, linux-mm

On Wed, 15 Dec 1999, Kanoj Sarcar wrote:

> Hi,
> 
> Could people try out this patch against 2.3.33 and send comments/feedback.

Works fine on my x86/UP/SCSI box.  Subjectively, it feels smoother.
(that could well be imagination at work)

	-Mike



* Re: [RFC] [RFT] [PATCH] memory zone balancing
  1999-12-17  7:27 ` Mike Galbraith
@ 1999-12-27 16:00   ` Andrea Arcangeli
  0 siblings, 0 replies; 12+ messages in thread
From: Andrea Arcangeli @ 1999-12-27 16:00 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Kanoj Sarcar, linux-kernel, linux-mm

On Fri, 17 Dec 1999, Mike Galbraith wrote:

>Works fine on my x86/UP/SCSI box.  Subjectively, it feels smoother.
>(that could well be imagination at work)

it's imagination ;)

Andrea



* Re: [RFC] [RFT] [PATCH] memory zone balancing
  1999-12-15 19:50 [RFC] [RFT] [PATCH] memory zone balancing Kanoj Sarcar
  1999-12-17  7:27 ` Mike Galbraith
@ 1999-12-27 16:06 ` Andrea Arcangeli
  2000-01-04  2:27   ` Kanoj Sarcar
  1 sibling, 1 reply; 12+ messages in thread
From: Andrea Arcangeli @ 1999-12-27 16:06 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: linux-kernel, linux-mm

On Wed, 15 Dec 1999, Kanoj Sarcar wrote:

>-	if ((gfp_mask & __GFP_DMA) && !PageDMA(page_map))
>-		goto check_table;
>-	if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page_map))
>+	if (zone && (page_map->zone != zone))
> 		goto check_table;
> 	swap_attempts++;

> 		/* don't account passes over not DMA pages */
>-		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
>-			goto dispose_continue;
>-		if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page))
>+		if (zone && (page->zone != zone))
> 			goto dispose_continue;

> 	if (PageReserved(page)
> 	    || PageLocked(page)
>-	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page))
>-	    || (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page)))
>+	    || (zone && (page->zone != zone)))
> 		goto out_failed;


The above changes you proposed will cause you to go OOM even if you still
have 16 MB of pure (ISA-DMA) RAM free in a no-bigmem kernel. With a
bigmem kernel you'll go OOM even if you still have 1 GB of memory free.

As I just told you privately in a previous email, the checks should be a kind
of ">" (or a more complex operation implemented in a function) and not a
"!=".

Andrea



* Re: [RFC] [RFT] [PATCH] memory zone balancing
  1999-12-27 16:06 ` Andrea Arcangeli
@ 2000-01-04  2:27   ` Kanoj Sarcar
  2000-01-04  3:23     ` Rik van Riel
  2000-01-06 16:05     ` Ingo Molnar
  0 siblings, 2 replies; 12+ messages in thread
From: Kanoj Sarcar @ 2000-01-04  2:27 UTC (permalink / raw)
  To: Andrea Arcangeli, torvalds, mingo, Kanoj Sarcar; +Cc: linux-kernel, linux-mm

> 
> On Wed, 15 Dec 1999, Kanoj Sarcar wrote:
> 
> >-	if ((gfp_mask & __GFP_DMA) && !PageDMA(page_map))
> >-		goto check_table;
> >-	if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page_map))
> >+	if (zone && (page_map->zone != zone))
> > 		goto check_table;
> > 	swap_attempts++;
> 
> > 		/* don't account passes over not DMA pages */
> >-		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
> >-			goto dispose_continue;
> >-		if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page))
> >+		if (zone && (page->zone != zone))
> > 			goto dispose_continue;
> 
> > 	if (PageReserved(page)
> > 	    || PageLocked(page)
> >-	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page))
> >-	    || (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page)))
> >+	    || (zone && (page->zone != zone)))
> > 		goto out_failed;
> 
> 
> The above changes you proposed will cause you to go OOM even if you still
> have 16 MB of pure (ISA-DMA) RAM free in a no-bigmem kernel. With a
> bigmem kernel you'll go OOM even if you still have 1 GB of memory free.
> 
> As I just told you privately in a previous email, the checks should be a kind
> of ">" (or a more complex operation implemented in a function) and not a
> "!=".

Okay, here is a reworked version. Note that this version does not
do per-zone balancing, since experiments show that we need to tune
per-zone watermarks properly before we can start doing that. I am 
working on coming up with a good estimate of per-zone watermarks.
Basically, I am trying to answer the question: if there are d DMA pages,
r regular pages and h highmem pages, for a total of d+r+h pages, 
what should the watermarks be for each zone?
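
The key new piece is the memclass() test added to mmzone.h below: a
page now counts for a target zone if it lives in the same node and in
a zone of equal or lower index, which is the ">"-style comparison
suggested above. Written out as a function (a sketch equivalent to
the macro; the function name is made up):

	/* pgzone qualifies for tzone when both sit in the same node
	 * and pgzone's index is <= tzone's (DMA < normal < highmem). */
	static inline int memclass_fn(zone_t *pgzone, zone_t *tzone)
	{
		if (pgzone->zone_pgdat != tzone->zone_pgdat)
			return 0;
		return (pgzone - pgzone->zone_pgdat->node_zones) <=
		       (tzone - tzone->zone_pgdat->node_zones);
	}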

Appended is the new (2.3.34) form of the patch.

Thanks.

Kanoj

--- fs/dcache.c	Tue Dec 21 10:27:45 1999
+++ fs/dcache.c	Mon Dec 27 16:05:31 1999
@@ -410,7 +410,7 @@
  *  ...
  *   6 - base-level: try to shrink a bit.
  */
-int shrink_dcache_memory(int priority, unsigned int gfp_mask)
+int shrink_dcache_memory(int priority, unsigned int gfp_mask, zone_t * zone)
 {
 	if (gfp_mask & __GFP_IO) {
 		int count = 0;
--- fs/inode.c	Tue Dec 21 10:27:45 1999
+++ fs/inode.c	Mon Dec 27 16:05:31 1999
@@ -392,7 +392,7 @@
 	dispose_list(freeable);
 }
 
-int shrink_icache_memory(int priority, int gfp_mask)
+int shrink_icache_memory(int priority, int gfp_mask, zone_t *zone)
 {
 	if (gfp_mask & __GFP_IO)
 	{
--- include/linux/dcache.h	Tue Dec 21 10:27:49 1999
+++ include/linux/dcache.h	Mon Dec 27 16:05:31 1999
@@ -136,13 +136,13 @@
 extern int d_invalidate(struct dentry *);
 
 #define shrink_dcache() prune_dcache(0)
-
+struct zone_struct;
 /* dcache memory management */
-extern int shrink_dcache_memory(int, unsigned int);
+extern int shrink_dcache_memory(int, unsigned int, struct zone_struct *);
 extern void prune_dcache(int);
 
 /* icache memory management (defined in linux/fs/inode.c) */
-extern int shrink_icache_memory(int, int);
+extern int shrink_icache_memory(int, int, struct zone_struct *);
 extern void prune_icache(int);
 
 /* only used at mount-time */
--- include/linux/mm.h	Tue Dec 21 10:27:50 1999
+++ include/linux/mm.h	Mon Jan  3 12:31:07 2000
@@ -183,7 +183,6 @@
 #define ClearPageError(page)	clear_bit(PG_error, &(page)->flags)
 #define PageReferenced(page)	test_bit(PG_referenced, &(page)->flags)
 #define PageDecrAfter(page)	test_bit(PG_decr_after, &(page)->flags)
-#define PageDMA(page)		(contig_page_data.node_zones + ZONE_DMA == (page)->zone)
 #define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
 #define PageSwapCache(page)	test_bit(PG_swap_cache, &(page)->flags)
 #define PageReserved(page)	test_bit(PG_reserved, &(page)->flags)
@@ -434,10 +433,11 @@
 extern int do_munmap(unsigned long, size_t);
 extern unsigned long do_brk(unsigned long, unsigned long);
 
+struct zone_t;
 /* filemap.c */
 extern void remove_inode_page(struct page *);
 extern unsigned long page_unuse(struct page *);
-extern int shrink_mmap(int, int);
+extern int shrink_mmap(int, int, zone_t *);
 extern void truncate_inode_pages(struct inode *, loff_t);
 
 /*
--- include/linux/mmzone.h	Tue Dec 21 10:27:51 1999
+++ include/linux/mmzone.h	Mon Jan  3 12:29:34 2000
@@ -25,6 +25,8 @@
 	unsigned int * map;
 } free_area_t;
 
+struct pglist_data;
+
 typedef struct zone_struct {
 	/*
 	 * Commonly accessed fields:
@@ -34,6 +36,7 @@
 	unsigned long free_pages;
 	int low_on_memory;
 	unsigned long pages_low, pages_high;
+	struct pglist_data *zone_pgdat;
 
 	/*
 	 * free areas of different sizes
@@ -80,6 +83,10 @@
 } pg_data_t;
 
 extern int numnodes;
+
+#define memclass(pgzone, tzone)	(((pgzone)->zone_pgdat == (tzone)->zone_pgdat) \
+			&& (((pgzone) - (pgzone)->zone_pgdat->node_zones) <= \
+			((tzone) - (pgzone)->zone_pgdat->node_zones)))
 
 #ifndef CONFIG_DISCONTIGMEM
 
--- include/linux/swap.h	Tue Dec 21 10:27:50 1999
+++ include/linux/swap.h	Mon Dec 27 16:05:31 1999
@@ -78,14 +78,15 @@
 struct vm_area_struct;
 struct sysinfo;
 
+struct zone_t;
 /* linux/ipc/shm.c */
-extern int shm_swap (int, int);
+extern int shm_swap (int, int, zone_t *);
 
 /* linux/mm/swap.c */
 extern void swap_setup (void);
 
 /* linux/mm/vmscan.c */
-extern int try_to_free_pages(unsigned int gfp_mask);
+extern int try_to_free_pages(unsigned int gfp_mask, zone_t *zone);
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, struct page *, int);
--- ipc/shm.c	Tue Dec 21 10:37:12 1999
+++ ipc/shm.c	Mon Dec 27 16:05:31 1999
@@ -799,7 +799,7 @@
 static unsigned long swap_id = 0; /* currently being swapped */
 static unsigned long swap_idx = 0; /* next to swap */
 
-int shm_swap (int prio, int gfp_mask)
+int shm_swap (int prio, int gfp_mask, zone_t *zone)
 {
 	pte_t page;
 	struct shmid_kernel *shp;
@@ -849,9 +849,7 @@
 	if (!pte_present(page))
 		goto check_table;
 	page_map = pte_page(page);
-	if ((gfp_mask & __GFP_DMA) && !PageDMA(page_map))
-		goto check_table;
-	if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page_map))
+	if (zone && (!memclass(page_map->zone, zone)))
 		goto check_table;
 	swap_attempts++;
 
--- ipc/util.c	Tue Dec 21 10:27:53 1999
+++ ipc/util.c	Mon Dec 27 16:05:31 1999
@@ -214,7 +214,7 @@
     return;
 }
 
-int shm_swap (int prio, int gfp_mask)
+int shm_swap (int prio, int gfp_mask, zone_t *zone)
 {
     return 0;
 }
--- mm/filemap.c	Tue Dec 21 10:37:12 1999
+++ mm/filemap.c	Mon Dec 27 16:05:31 1999
@@ -211,7 +211,7 @@
 	spin_unlock(&pagecache_lock);
 }
 
-int shrink_mmap(int priority, int gfp_mask)
+int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
 {
 	int ret = 0, count;
 	LIST_HEAD(young);
@@ -239,9 +239,7 @@
 
 		dispose = &old;
 		/* don't account passes over not DMA pages */
-		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
-			goto dispose_continue;
-		if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page))
+		if (zone && (!memclass(page->zone, zone)))
 			goto dispose_continue;
 
 		count--;
--- mm/page_alloc.c	Tue Dec 21 10:27:53 1999
+++ mm/page_alloc.c	Mon Jan  3 17:18:54 2000
@@ -224,7 +224,7 @@
 		return 1;
 
 	current->flags |= PF_MEMALLOC;
-	freed = try_to_free_pages(gfp_mask);
+	freed = try_to_free_pages(gfp_mask, zone);
 	current->flags &= ~PF_MEMALLOC;
 
 	if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
@@ -235,7 +235,7 @@
 /*
  * We are still balancing memory in a global way:
  */
-static inline int balance_memory (int gfp_mask)
+static inline int balance_memory (zone_t *zone, int gfp_mask)
 {
 	unsigned long free = nr_free_pages();
 	static int low_on_memory = 0;
@@ -264,7 +264,7 @@
 		return 1;
 
 	current->flags |= PF_MEMALLOC;
-	freed = try_to_free_pages(gfp_mask);
+	freed = try_to_free_pages(gfp_mask, zone);
 	current->flags &= ~PF_MEMALLOC;
 
 	if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
@@ -340,7 +340,7 @@
  * The main chunk of the balancing code is in this offline branch:
  */
 balance:
-	if (!balance_memory(gfp_mask))
+	if (!balance_memory(z, gfp_mask))
 		goto nopage;
 	goto ready;
 }
@@ -533,9 +533,9 @@
 		i = 10;
 	if (i > 256)
 		i = 256;
-	freepages.min = i;
-	freepages.low = i * 2;
-	freepages.high = i * 3;
+	freepages.min += i;
+	freepages.low += i * 2;
+	freepages.high += i * 3;
 
 	/*
 	 * Some architectures (with lots of mem and discontinous memory
@@ -574,6 +574,7 @@
 		zone->size = size;
 		zone->name = zone_names[j];
 		zone->lock = SPIN_LOCK_UNLOCKED;
+		zone->zone_pgdat = pgdat;
 		if (!size)
 			continue;
 
--- mm/slab.c	Tue Dec 21 10:27:53 1999
+++ mm/slab.c	Mon Jan  3 12:05:23 2000
@@ -503,6 +503,11 @@
 {
 	void	*addr;
 
+	/*
+	 * If we requested dmaable memory, we will get it. Even if we 
+	 * did not request dmaable memory, we might get it, but that
+	 * would be relatively rare and ignorable.
+	 */
 	*dma = flags & SLAB_DMA;
 	addr = (void*) __get_free_pages(flags, cachep->c_gfporder);
 	/* Assume that now we have the pages no one else can legally
@@ -511,18 +516,6 @@
 	 * it is a named-page or buffer-page.  The members it tests are
 	 * of no interest here.....
 	 */
-	if (!*dma && addr) {
-		/* Need to check if can dma. */
-		struct page *page = mem_map + MAP_NR(addr);
-		*dma = 1<<cachep->c_gfporder;
-		while ((*dma)--) {
-			if (!PageDMA(page)) {
-				*dma = 0;
-				break;
-			}
-			page++;
-		}
-	}
 	return addr;
 }
 
--- mm/vmscan.c	Tue Dec 21 10:27:53 1999
+++ mm/vmscan.c	Mon Jan  3 12:30:05 2000
@@ -33,7 +33,7 @@
  * using a process that no longer actually exists (it might
  * have died while we slept).
  */
-static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
+static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask, zone_t *zone)
 {
 	pte_t pte;
 	swp_entry_t entry;
@@ -60,8 +60,7 @@
 
 	if (PageReserved(page)
 	    || PageLocked(page)
-	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page))
-	    || (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(page)))
+	    || (zone && (!memclass(page->zone, zone))))
 		goto out_failed;
 
 	/*
@@ -195,7 +194,7 @@
  * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
  */
 
-static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask, zone_t *zone)
 {
 	pte_t * pte;
 	unsigned long pmd_end;
@@ -217,7 +216,7 @@
 	do {
 		int result;
 		vma->vm_mm->swap_address = address + PAGE_SIZE;
-		result = try_to_swap_out(vma, address, pte, gfp_mask);
+		result = try_to_swap_out(vma, address, pte, gfp_mask, zone);
 		if (result)
 			return result;
 		address += PAGE_SIZE;
@@ -226,7 +225,7 @@
 	return 0;
 }
 
-static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
+static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask, zone_t *zone)
 {
 	pmd_t * pmd;
 	unsigned long pgd_end;
@@ -246,7 +245,7 @@
 		end = pgd_end;
 	
 	do {
-		int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
+		int result = swap_out_pmd(vma, pmd, address, end, gfp_mask, zone);
 		if (result)
 			return result;
 		address = (address + PMD_SIZE) & PMD_MASK;
@@ -255,7 +254,7 @@
 	return 0;
 }
 
-static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
+static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask, zone_t *zone)
 {
 	pgd_t *pgdir;
 	unsigned long end;
@@ -270,7 +269,7 @@
 	if (address >= end)
 		BUG();
 	do {
-		int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
+		int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask, zone);
 		if (result)
 			return result;
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
@@ -279,7 +278,7 @@
 	return 0;
 }
 
-static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
+static int swap_out_mm(struct mm_struct * mm, int gfp_mask, zone_t *zone)
 {
 	unsigned long address;
 	struct vm_area_struct* vma;
@@ -300,7 +299,7 @@
 			address = vma->vm_start;
 
 		for (;;) {
-			int result = swap_out_vma(vma, address, gfp_mask);
+			int result = swap_out_vma(vma, address, gfp_mask, zone);
 			if (result)
 				return result;
 			vma = vma->vm_next;
@@ -322,7 +321,7 @@
  * N.B. This function returns only 0 or 1.  Return values != 1 from
  * the lower level routines result in continued processing.
  */
-static int swap_out(unsigned int priority, int gfp_mask)
+static int swap_out(unsigned int priority, int gfp_mask, zone_t *zone)
 {
 	struct task_struct * p;
 	int counter;
@@ -383,7 +382,7 @@
 			int ret;
 
 			atomic_inc(&best->mm_count);
-			ret = swap_out_mm(best, gfp_mask);
+			ret = swap_out_mm(best, gfp_mask, zone);
 			mmdrop(best);
 
 			if (!ret)
@@ -409,7 +408,7 @@
  * cluster them so that we get good swap-out behaviour. See
  * the "free_memory()" macro for details.
  */
-static int do_try_to_free_pages(unsigned int gfp_mask)
+static int do_try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
 {
 	int priority;
 	int count = SWAP_CLUSTER_MAX;
@@ -419,7 +418,7 @@
 
 	priority = 6;
 	do {
-		while (shrink_mmap(priority, gfp_mask)) {
+		while (shrink_mmap(priority, gfp_mask, zone)) {
 			if (!--count)
 				goto done;
 		}
@@ -427,14 +426,14 @@
 		/* don't be too light against the d/i cache since
 		   shrink_mmap() almost never fail when there's
 		   really plenty of memory free. */
-		count -= shrink_dcache_memory(priority, gfp_mask);
-		count -= shrink_icache_memory(priority, gfp_mask);
+		count -= shrink_dcache_memory(priority, gfp_mask, zone);
+		count -= shrink_icache_memory(priority, gfp_mask, zone);
 		if (count <= 0)
 			goto done;
 
 		/* Try to get rid of some shared memory pages.. */
 		if (gfp_mask & __GFP_IO) {
-			while (shm_swap(priority, gfp_mask)) {
+			while (shm_swap(priority, gfp_mask, zone)) {
 				if (!--count)
 					goto done;
 			}
@@ -441,7 +440,7 @@
 		}
 
 		/* Then, try to page stuff out.. */
-		while (swap_out(priority, gfp_mask)) {
+		while (swap_out(priority, gfp_mask, zone)) {
 			if (!--count)
 				goto done;
 		}
@@ -505,7 +504,7 @@
 			   allocations (not GFP_HIGHMEM ones). */
 			if (nr_free_buffer_pages() >= freepages.high)
 				break;
-			if (!do_try_to_free_pages(GFP_KSWAPD))
+			if (!do_try_to_free_pages(GFP_KSWAPD, 0))
 				break;
 			run_task_queue(&tq_disk);
 		} while (!tsk->need_resched);
@@ -529,13 +528,13 @@
  * can be done by just dropping cached pages without having
  * any deadlock issues.
  */
-int try_to_free_pages(unsigned int gfp_mask)
+int try_to_free_pages(unsigned int gfp_mask, zone_t *zone)
 {
 	int retval = 1;
 
 	wake_up_process(kswapd_process);
 	if (gfp_mask & __GFP_WAIT)
-		retval = do_try_to_free_pages(gfp_mask);
+		retval = do_try_to_free_pages(gfp_mask, zone);
 	return retval;
 }
 


* Re: [RFC] [RFT] [PATCH] memory zone balancing
  2000-01-04  2:27   ` Kanoj Sarcar
@ 2000-01-04  3:23     ` Rik van Riel
  2000-01-04 15:47       ` Rik van Riel
  2000-01-06 16:05     ` Ingo Molnar
  1 sibling, 1 reply; 12+ messages in thread
From: Rik van Riel @ 2000-01-04  3:23 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Andrea Arcangeli, torvalds, mingo, linux-kernel, linux-mm

On Mon, 3 Jan 2000, Kanoj Sarcar wrote:

> Okay, here is a reworked version. Note that this version does not
> do per-zone balancing, since experiments show that we need to tune
> per-zone watermarks properly before we can start doing that. I am 
> working on coming up with a good estimate of per-zone watermarks.
> Basically, I am trying to answer the question: if there are d DMA pages,
> r regular pages and h highmem pages, for a total of d+r+h pages, 
> what should the watermarks be for each zone?

d+r+h > limit
d     > limit/2
r     > limit/4
h     > limit/8

DMA pages should always be present, regular pages for
storing pagetables and stuff need to be there too, highmem
pages we don't really care about.

Btw, I think we probably want to increase freepages.min
to 512 or even more on machines that have >1GB of memory.
The current limit of 256 was really intended for machines
with a single zone of memory...

(but on <1GB machines I don't know if it makes sense to
raise the limit much more ... maybe we should raise the
limit automagically if the page alloc/io rate is too
high?)

regards,

Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.



* Re: [RFC] [RFT] [PATCH] memory zone balancing
  2000-01-04  3:23     ` Rik van Riel
@ 2000-01-04 15:47       ` Rik van Riel
  0 siblings, 0 replies; 12+ messages in thread
From: Rik van Riel @ 2000-01-04 15:47 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Andrea Arcangeli, torvalds, mingo, linux-kernel, linux-mm

On Tue, 4 Jan 2000, Rik van Riel wrote:

> d+r+h > limit
> d     > limit/2
> r     > limit/4
> h     > limit/8

Ehmmm, wait. I messed up on this one.
We probably want to have a bit more freedom
so the page freeing algorithm doesn't do too
much scanning for nothing.

d+r+h 	> limit
d	> limit/4
r	> limit/4
h	don't care, even on a 1.5GB machine h will
	be 1/3 of total memory, so we'll usually
	have a few free pages in here
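
Expressed as a check (a sketch only; d, r and h stand for the free
page counts of the DMA, regular and highmem zones, limit for the
freepages.* value being tested):

	/* The targets above: total free pages over the limit, plus a
	 * floor on DMA and regular free pages; highmem free pages
	 * only count toward the total. */
	static int zones_balanced(long d, long r, long h, long limit)
	{
		return d + r + h > limit &&
		       d > limit / 4 &&
		       r > limit / 4;
	}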

> DMA pages should always be present, regular pages for
> storing pagetables and stuff need to be there too, highmem
> pages we don't really care about.
> 
> Btw, I think we probably want to increase freepages.min
> to 512 or even more on machines that have >1GB of memory.
> The current limit of 256 was really intended for machines
> with a single zone of memory...
> 
> (but on <1GB machines I don't know if it makes sense to
> raise the limit much more ... maybe we should raise the
> limit automagically if the page alloc/io rate is too
> high?)

Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.



* Re: [RFC] [RFT] [PATCH] memory zone balancing
  2000-01-06 16:05     ` Ingo Molnar
@ 2000-01-06 15:28       ` David S. Miller
  2000-01-06 18:36         ` Kanoj Sarcar
  2000-01-06 19:14         ` Rik van Riel
  0 siblings, 2 replies; 12+ messages in thread
From: David S. Miller @ 2000-01-06 15:28 UTC (permalink / raw)
  To: mingo; +Cc: kanoj, andrea, torvalds, linux-kernel, linux-mm

   i think this is pretty much 'type-dependent'. In earlier versions
   of the zone allocator i added a zone->memory_balanced() function
   (but removed it later because it first needed the things your patch
   adds). Then every zone can decide for itself whether it's
   balanced. Eg. the DMA zone is rather critical and we want to keep
   it free aggressively (part of that is already achieved by placing
   it at the end of the zone chain), the highmem zone might not need
   any balancing at all, the normal zone wants some high/low watermark
   stuff.

Let's be careful not to design any balancing heuristics which will
fall apart on architectures where only one zone ever exists (because
GFP_DMA is completely meaningless).

Later,
David S. Miller
davem@redhat.com


* Re: [RFC] [RFT] [PATCH] memory zone balancing
  2000-01-04  2:27   ` Kanoj Sarcar
  2000-01-04  3:23     ` Rik van Riel
@ 2000-01-06 16:05     ` Ingo Molnar
  2000-01-06 15:28       ` David S. Miller
  1 sibling, 1 reply; 12+ messages in thread
From: Ingo Molnar @ 2000-01-06 16:05 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Andrea Arcangeli, torvalds, linux-kernel, linux-mm

On Mon, 3 Jan 2000, Kanoj Sarcar wrote:

> Okay, here is a reworked version. Note that this version does not
> do per-zone balancing, since experiments show that we need to tune
> per-zone watermarks properly before we can start doing that. I am 
> working on coming up with a good estimate of per-zone watermarks.
> Basically, I am trying to answer the question: if there are d DMA pages,
> r regular pages and h highmem pages, for a total of d+r+h pages, 
> what should the watermarks be for each zone?

i think this is pretty much 'type-dependent'. In earlier versions of the
zone allocator i added a zone->memory_balanced() function (but removed it
later because it first needed the things your patch adds). Then every zone
can decide for itself whether it's balanced. Eg. the DMA zone is rather
critical and we want to keep it free aggressively (part of that is already
achieved by placing it at the end of the zone chain), the highmem zone
might not need any balancing at all, the normal zone wants some high/low
watermark stuff.
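
A sketch of the shape Ingo describes (the callback existed only in
his earlier versions of the allocator; the names and bodies here are
illustrative):

	/* added to zone_struct: */
	int (*memory_balanced)(struct zone_struct *);

	/* normal zone: classic low/high watermark check */
	static int normal_balanced(struct zone_struct *z)
	{
		return z->free_pages > z->pages_low;
	}

	/* highmem zone: never needs balancing */
	static int highmem_balanced(struct zone_struct *z)
	{
		return 1;
	}

	/* the allocator would then simply ask: */
	if (!zone->memory_balanced(zone))
		try_to_free_pages(gfp_mask, zone);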

	Ingo



* Re: [RFC] [RFT] [PATCH] memory zone balancing
  2000-01-06 15:28       ` David S. Miller
@ 2000-01-06 18:36         ` Kanoj Sarcar
  2000-01-06 19:14         ` Rik van Riel
  1 sibling, 0 replies; 12+ messages in thread
From: Kanoj Sarcar @ 2000-01-06 18:36 UTC (permalink / raw)
  To: David S. Miller; +Cc: mingo, andrea, torvalds, linux-kernel, linux-mm

> 
>    Date:   Thu, 6 Jan 2000 17:05:41 +0100 (CET)
>    From: Ingo Molnar <mingo@chiara.csoma.elte.hu>
> 
>    i think this is pretty much 'type-dependent'. In earlier versions
>    of the zone allocator i added a zone->memory_balanced() function
>    (but removed it later because it first needed the things your patch
>    adds). Then every zone can decide for itself whether it's
>    balanced. Eg. the DMA zone is rather critical and we want to keep
>    it free aggressively (part of that is already achieved by placing
>    it at the end of the zone chain), the highmem zone might not need
>    any balancing at all, the normal zone wants some high/low watermark
>    stuff.

After thinking about this more, I came to the conclusion that the
ZONE_BALANCED macro is just a quick way of checking whether we might
be unbalanced. If so, then we need to see if we are _really_
unbalanced, and do appropriate freeing. To determine whether a zone
is _really_ unbalanced, I need to look at the number of free pages
in the lower-order zones too, then compare against the zone's
watermarks. I.e., the "balancing" is not just about the absolute
number of free pages in the zone itself, but about the free pages
in the class that the zone represents.
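
In code, the free memory of the class a zone represents would look
something like this (a sketch; the helper name is made up):

	/* Free pages usable by an allocation targeting 'zone': the
	 * zone's own free pages plus those of every lower zone in
	 * the same node. */
	static unsigned long class_free_pages(zone_t *zone)
	{
		zone_t *z = zone->zone_pgdat->node_zones;
		unsigned long free = 0;

		for (; z <= zone; z++)
			free += z->free_pages;
		return free;
	}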

I am waiting to see if Linus takes in my previous patch, before 
putting too much work into the balancing heuristics.

> 
> Let's be careful not to design any balancing heuristics which will
> fall apart on architectures where only one zone ever exists (because
> GFP_DMA is completely meaningless).

Yes, with all the different types of machines out there, we need to be
able to provide enough hooks to the arch code to tune the watermarks and
balancing frequency. Luckily, the zone structure is exposed via the
pg_data_t, so this should be no problem. Some sysctls are probably also 
called for.

Kanoj
> 
> Later,
> David S. Miller
> davem@redhat.com
> 



* Re: [RFC] [RFT] [PATCH] memory zone balancing
  2000-01-06 15:28       ` David S. Miller
  2000-01-06 18:36         ` Kanoj Sarcar
@ 2000-01-06 19:14         ` Rik van Riel
  1 sibling, 0 replies; 12+ messages in thread
From: Rik van Riel @ 2000-01-06 19:14 UTC (permalink / raw)
  To: David S. Miller; +Cc: mingo, kanoj, andrea, torvalds, linux-kernel, linux-mm

On Thu, 6 Jan 2000, David S. Miller wrote:
>    Date:   Thu, 6 Jan 2000 17:05:41 +0100 (CET)
>    From: Ingo Molnar <mingo@chiara.csoma.elte.hu>
> 
>    i think this is pretty much 'type-dependent'. In earlier versions
>    of the zone allocator i added a zone->memory_balanced() function
>    (but removed it later because it first needed the things your patch
>    adds). Then every zone can decide for itself whether it's
>    balanced. Eg. the DMA zone is rather critical and we want to keep
>    it free aggressively (part of that is already achieved by placing
>    it at the end of the zone chain), the highmem zone might not need
>    any balancing at all, the normal zone wants some high/low watermark
>    stuff.
> 
> Let's be careful not to design any balancing heuristics which will
> fall apart on architectures where only one zone ever exists (because
> GFP_DMA is completely meaningless).

We'll have to keep the current constraint (freepages.{min,low,high})
stuff as our overall target anyway. We simply need to add some limits
that the special zones have to aim for.

And it won't just be tested from kswapd(); swap_tick() and maybe
even __get_pages() will have to check for the constraints too and
wake up kswapd() or take action themselves...

Possible heuristics (using d for dma, n for normal and h for
highmem free mem; limit = freepages.*) could use the following targets:

d + n > limit
d     > limit/4
n     > limit/4
h     < limit * 2 (because we don't care about free high pages
                   and we do care about free normal and dma pages)

Another possibility is to include the highmem (d + n + h > limit)
and keep the single low-mem limits.

IMHO there should be a number of constraints for a heuristic
like this:
- simple
- not force too much memory free
- the algorithm should have freedom in what to free, in order
  to avoid excessive scanning (alt: separate queues per zone)
- the limits for dma and normal memory should be high enough
  to be able to fulfill kernel allocations
- should have no impact on non-zoned machines

The (extremely simple) constraints above should probably be
enough to actually get the job done. At least, I wouldn't
know why we'd need more...

cheers,

Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.



