linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC] [Patch] For booting a i386 numa system with no memory in a node
       [not found]                             ` <1108686742.6482.51.camel@localhost>
@ 2005-02-21 20:17                               ` keith
  2005-02-21 20:39                                 ` Dave Hansen
  0 siblings, 1 reply; 7+ messages in thread
From: keith @ 2005-02-21 20:17 UTC (permalink / raw)
  To: linux-mm
  Cc: Martin J. Bligh, matt dobson, john stultz, Andy Whitcroft, Dave Hansen

[-- Attachment #1: Type: text/plain, Size: 1408 bytes --]

  Attached is a patch that allows an i386 numa based system to boot without
memory in a node.  It deals with the assumption that all nodes have
memory.  
  
  In a summit based system (IBM x440/x445) it is possible to configure a
box with no memory in a node.  While this is not an ideal performance
setup it is a valid configuration for the box and the kernel should be
able to deal with it.  

 This "memory free node" must not be node 0.  Node 0 must still contain
memory (there are tons of hard coded 0's in the mm code so I am steering
around this for now).

  The assumption that all nodes have memory is not always true.  I have
introduced a simple node_has_online_mem functionality in the topology
code.  This check is based on 
node_start_pfn[nid] == node_end_pfn[nid] 
and as such the node_start/end_pfn must only contain physically present
memory.  

 I presented a patch a while ago that allowed non-present memory
reported from the srat to be ignored at the numa KVA level.  This patch
takes that a step further.  Ignore the memory above max_pfn altogether.

  The main issue this patch addresses is fixing the numa_kva code, as it
was built without this no-memory node case in mind.

  It was tested with 2.6.11-rc4 on a 8-way x445 (summit) with no memory
in the 2nd node.  It supports both a physically empty and  SRAT based
hot-add empty zones just fine.  

Thanks,
  Keith Mannthey
  LTC xSeries   

[-- Attachment #2: patch-2.6.11-rc4-fix_nomem_on_node-v1 --]
[-- Type: text/x-patch, Size: 4929 bytes --]

diff -urN linux-2.6.11-rc4-fix7/arch/i386/kernel/srat.c linux-2.6.11-rc4.orig/arch/i386/kernel/srat.c
--- linux-2.6.11-rc4-fix7/arch/i386/kernel/srat.c	2005-02-21 11:39:59.000000000 -0800
+++ linux-2.6.11-rc4.orig/arch/i386/kernel/srat.c	2005-02-16 17:23:52.000000000 -0800
@@ -273,14 +273,6 @@
 		int been_here_before = 0;
 
 		for (j = 0; j < num_memory_chunks; j++){
-			/*
-			 *Only add present memroy to node_end/start_pfn 
-			 *There is no guarantee from the srat that the memory is present
-			 */
-			if (node_memory_chunk[j].start_pfn >= max_pfn) {
-				printk ("Ignoring chunk of memory reported in the SRAT (could be hot-add zone?)\n");
-				continue;
-			}
 			if (node_memory_chunk[j].nid == nid) {
 				if (been_here_before == 0) {
 					node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
diff -urN linux-2.6.11-rc4-fix7/arch/i386/mm/discontig.c linux-2.6.11-rc4.orig/arch/i386/mm/discontig.c
--- linux-2.6.11-rc4-fix7/arch/i386/mm/discontig.c	2005-02-21 11:40:28.000000000 -0800
+++ linux-2.6.11-rc4.orig/arch/i386/mm/discontig.c	2005-02-16 17:23:52.000000000 -0800
@@ -128,7 +128,7 @@
  */
 static void __init allocate_pgdat(int nid)
 {
-	if (nid && node_has_online_mem(nid))
+	if (nid)
 		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
 	else {
 		NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
@@ -204,10 +204,8 @@
 		if (nid == 0)
 			continue;
 		/* calculate the size of the mem_map needed in bytes */
-		size = node_end_pfn[nid] - node_start_pfn[nid];
-		if (size)
-			size = (size + 1) * sizeof(struct page) + sizeof(pg_data_t);
-		
+		size = (node_end_pfn[nid] - node_start_pfn[nid] + 1) 
+			* sizeof(struct page) + sizeof(pg_data_t);
 		/* convert size to large (pmd size) pages, rounding up */
 		size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
 		/* now the roundup is correct, convert to PAGE_SIZE pages */
@@ -244,7 +242,6 @@
 	unsigned long bootmap_size, system_start_pfn, system_max_low_pfn;
 	unsigned long reserve_pages, pfn;
 
-	find_max_pfn();
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
 	 * from node local memory.  They are then mapped directly into KVA
@@ -273,6 +270,7 @@
 	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
+	find_max_pfn();
 	system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages;
 	printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n",
 			reserve_pages, max_low_pfn + reserve_pages);
@@ -401,27 +399,24 @@
 
 		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
-		if (node_has_online_mem(nid)){
-			if (start > low) {
+		if (start > low) {
 #ifdef CONFIG_HIGHMEM
-				BUG_ON(start > high);
-				zones_size[ZONE_HIGHMEM] = high - start;
+			BUG_ON(start > high);
+			zones_size[ZONE_HIGHMEM] = high - start;
 #endif
-			} else {
-				if (low < max_dma)
-					zones_size[ZONE_DMA] = low;
-				else {
-					BUG_ON(max_dma > low);
-					BUG_ON(low > high);
-					zones_size[ZONE_DMA] = max_dma;
-					zones_size[ZONE_NORMAL] = low - max_dma;
+		} else {
+			if (low < max_dma)
+				zones_size[ZONE_DMA] = low;
+			else {
+				BUG_ON(max_dma > low);
+				BUG_ON(low > high);
+				zones_size[ZONE_DMA] = max_dma;
+				zones_size[ZONE_NORMAL] = low - max_dma;
 #ifdef CONFIG_HIGHMEM
-					zones_size[ZONE_HIGHMEM] = high - low;
-#endif	
-				}
+				zones_size[ZONE_HIGHMEM] = high - low;
+#endif
 			}
 		}
-		
 		zholes_size = get_zholes_size(nid);
 		/*
 		 * We let the lmem_map for node 0 be allocated from the
diff -urN linux-2.6.11-rc4-fix7/include/asm-i386/topology.h linux-2.6.11-rc4.orig/include/asm-i386/topology.h
--- linux-2.6.11-rc4-fix7/include/asm-i386/topology.h	2005-02-21 11:32:10.000000000 -0800
+++ linux-2.6.11-rc4.orig/include/asm-i386/topology.h	2005-02-16 17:23:58.000000000 -0800
@@ -88,16 +88,6 @@
 	.nr_balance_failed	= 0,			\
 }
 
-extern unsigned long node_start_pfn[];
-extern unsigned long node_end_pfn[];
-
-#define node_has_online_mem(nid) !(node_start_pfn[nid] == node_end_pfn[nid])                 
-/*                                                                            
-inline int __node_has_online_mem(int nid) {
-        return !(node_start_pfn[nid]== node_end_pfn[nid]);
-}
-*/
-
 #else /* !CONFIG_NUMA */
 /*
  * Other i386 platforms should define their own version of the 
diff -urN linux-2.6.11-rc4-fix7/include/linux/topology.h linux-2.6.11-rc4.orig/include/linux/topology.h
--- linux-2.6.11-rc4-fix7/include/linux/topology.h	2005-02-21 11:32:10.000000000 -0800
+++ linux-2.6.11-rc4.orig/include/linux/topology.h	2005-02-16 17:23:58.000000000 -0800
@@ -31,11 +31,8 @@
 #include <linux/bitops.h>
 #include <linux/mmzone.h>
 #include <linux/smp.h>
-#include <asm/topology.h>
 
-#ifndef node_has_online_mem
-#define node_has_online_mem(nid) (1)
-#endif
+#include <asm/topology.h>
 
 #ifndef nr_cpus_node
 #define nr_cpus_node(node)							\

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] [Patch] For booting a i386 numa system with no memory in a node
  2005-02-21 20:17                               ` [RFC] [Patch] For booting a i386 numa system with no memory in a node keith
@ 2005-02-21 20:39                                 ` Dave Hansen
  2005-02-21 22:03                                   ` keith
  0 siblings, 1 reply; 7+ messages in thread
From: Dave Hansen @ 2005-02-21 20:39 UTC (permalink / raw)
  To: keith; +Cc: linux-mm, Martin J. Bligh, matt dobson, John Stultz, Andy Whitcroft

On Mon, 2005-02-21 at 12:17 -0800, keith wrote:
>   Attach is a patch that allows a i386 numa based system to boot without
> memory in a node.  It deals with the assumption that all nodes have
> memory.  

The diff is backwards :)

> -                       if (node_memory_chunk[j].start_pfn >= max_pfn)
> {
> -                               printk ("Ignoring chunk of memory
> reported in the SRAT (could be hot-add zone?)\n");
> -                               continue;
> -                       }

Could you print out the memory ranges, or sizes here?  Also, please add
a KERN_* level to it.  We might not want this unless the user has booted
with "debug".

> +               if (node_has_online_mem(nid)){
> +                       if (start > low) {

Instead of indenting another level, can you just put a continue in the
loop?  I think it makes it much easier to read.  

-- Dave

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] [Patch] For booting a i386 numa system with no memory in a node
  2005-02-21 20:39                                 ` Dave Hansen
@ 2005-02-21 22:03                                   ` keith
  2005-02-21 22:24                                     ` Dave Hansen
  0 siblings, 1 reply; 7+ messages in thread
From: keith @ 2005-02-21 22:03 UTC (permalink / raw)
  To: Dave Hansen
  Cc: linux-mm, Martin J. Bligh, matt dobson, john stultz, Andy Whitcroft

[-- Attachment #1: Type: text/plain, Size: 1164 bytes --]

On Mon, 2005-02-21 at 12:39, Dave Hansen wrote:
> On Mon, 2005-02-21 at 12:17 -0800, keith wrote:
> >   Attach is a patch that allows a i386 numa based system to boot without
> > memory in a node.  It deals with the assumption that all nodes have
> > memory.  
> 
> The diff is backwards :)
Oops!  See new attached patch :)

> 
> > -                       if (node_memory_chunk[j].start_pfn >= max_pfn)
> > {
> > -                               printk ("Ignoring chunk of memory
> > reported in the SRAT (could be hot-add zone?)\n");
> > -                               continue;
> > -                       }
> 
> Could you print out the memory ranges, or sizes here?  Also, please add
> a KERN_* level to it.  We might not want this unless the user has booted
> with "debug".
Done.

> 
> > +               if (node_has_online_mem(nid)){
> > +                       if (start > low) {
> 
> Instead of indenting another level, can you just put a continue in the
> loop?  I think it makes it much easier to read.  

I cannot put a continue here.  I know it makes ugly code worse but we
have to call free_area_init_node in all cases.


Keith Mannthey
LTC xSeries

[-- Attachment #2: patch-2.6.11-rc4-fix_nomem_on_node-v2 --]
[-- Type: text/x-patch, Size: 5088 bytes --]

diff -urN linux-2.6.11-rc4.orig/arch/i386/kernel/srat.c linux-2.6.11-rc4-fix7/arch/i386/kernel/srat.c
--- linux-2.6.11-rc4.orig/arch/i386/kernel/srat.c	2005-02-16 17:23:52.000000000 -0800
+++ linux-2.6.11-rc4-fix7/arch/i386/kernel/srat.c	2005-02-21 13:56:28.000000000 -0800
@@ -273,6 +273,17 @@
 		int been_here_before = 0;
 
 		for (j = 0; j < num_memory_chunks; j++){
+			/*
+			 *Only add present memroy to node_end/start_pfn 
+			 *There is no guarantee from the srat that the memory 
+			 *is present at boot time. 
+			 */
+			if (node_memory_chunk[j].start_pfn >= max_pfn) {
+				printk (KERN_INFO "Ignoring chunk of memory reported in the SRAT (could be hot-add zone?)\n");
+				printk (KERN_INFO "chunk is reported from pfn %04x to %04x\n",
+					node_memory_chunk[j].start_pfn, node_memory_chunk[j].end_pfn);
+				continue;
+			}
 			if (node_memory_chunk[j].nid == nid) {
 				if (been_here_before == 0) {
 					node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
diff -urN linux-2.6.11-rc4.orig/arch/i386/mm/discontig.c linux-2.6.11-rc4-fix7/arch/i386/mm/discontig.c
--- linux-2.6.11-rc4.orig/arch/i386/mm/discontig.c	2005-02-16 17:23:52.000000000 -0800
+++ linux-2.6.11-rc4-fix7/arch/i386/mm/discontig.c	2005-02-21 11:40:28.000000000 -0800
@@ -128,7 +128,7 @@
  */
 static void __init allocate_pgdat(int nid)
 {
-	if (nid)
+	if (nid && node_has_online_mem(nid))
 		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
 	else {
 		NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
@@ -204,8 +204,10 @@
 		if (nid == 0)
 			continue;
 		/* calculate the size of the mem_map needed in bytes */
-		size = (node_end_pfn[nid] - node_start_pfn[nid] + 1) 
-			* sizeof(struct page) + sizeof(pg_data_t);
+		size = node_end_pfn[nid] - node_start_pfn[nid];
+		if (size)
+			size = (size + 1) * sizeof(struct page) + sizeof(pg_data_t);
+		
 		/* convert size to large (pmd size) pages, rounding up */
 		size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
 		/* now the roundup is correct, convert to PAGE_SIZE pages */
@@ -242,6 +244,7 @@
 	unsigned long bootmap_size, system_start_pfn, system_max_low_pfn;
 	unsigned long reserve_pages, pfn;
 
+	find_max_pfn();
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
 	 * from node local memory.  They are then mapped directly into KVA
@@ -270,7 +273,6 @@
 	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
-	find_max_pfn();
 	system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages;
 	printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n",
 			reserve_pages, max_low_pfn + reserve_pages);
@@ -399,24 +401,27 @@
 
 		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
-		if (start > low) {
+		if (node_has_online_mem(nid)){
+			if (start > low) {
 #ifdef CONFIG_HIGHMEM
-			BUG_ON(start > high);
-			zones_size[ZONE_HIGHMEM] = high - start;
+				BUG_ON(start > high);
+				zones_size[ZONE_HIGHMEM] = high - start;
 #endif
-		} else {
-			if (low < max_dma)
-				zones_size[ZONE_DMA] = low;
-			else {
-				BUG_ON(max_dma > low);
-				BUG_ON(low > high);
-				zones_size[ZONE_DMA] = max_dma;
-				zones_size[ZONE_NORMAL] = low - max_dma;
+			} else {
+				if (low < max_dma)
+					zones_size[ZONE_DMA] = low;
+				else {
+					BUG_ON(max_dma > low);
+					BUG_ON(low > high);
+					zones_size[ZONE_DMA] = max_dma;
+					zones_size[ZONE_NORMAL] = low - max_dma;
 #ifdef CONFIG_HIGHMEM
-				zones_size[ZONE_HIGHMEM] = high - low;
-#endif
+					zones_size[ZONE_HIGHMEM] = high - low;
+#endif	
+				}
 			}
 		}
+		
 		zholes_size = get_zholes_size(nid);
 		/*
 		 * We let the lmem_map for node 0 be allocated from the
diff -urN linux-2.6.11-rc4.orig/include/asm-i386/topology.h linux-2.6.11-rc4-fix7/include/asm-i386/topology.h
--- linux-2.6.11-rc4.orig/include/asm-i386/topology.h	2005-02-16 17:23:58.000000000 -0800
+++ linux-2.6.11-rc4-fix7/include/asm-i386/topology.h	2005-02-21 11:32:10.000000000 -0800
@@ -88,6 +88,16 @@
 	.nr_balance_failed	= 0,			\
 }
 
+extern unsigned long node_start_pfn[];
+extern unsigned long node_end_pfn[];
+
+#define node_has_online_mem(nid) !(node_start_pfn[nid] == node_end_pfn[nid])                 
+/*                                                                            
+inline int __node_has_online_mem(int nid) {
+        return !(node_start_pfn[nid]== node_end_pfn[nid]);
+}
+*/
+
 #else /* !CONFIG_NUMA */
 /*
  * Other i386 platforms should define their own version of the 
diff -urN linux-2.6.11-rc4.orig/include/linux/topology.h linux-2.6.11-rc4-fix7/include/linux/topology.h
--- linux-2.6.11-rc4.orig/include/linux/topology.h	2005-02-16 17:23:58.000000000 -0800
+++ linux-2.6.11-rc4-fix7/include/linux/topology.h	2005-02-21 11:32:10.000000000 -0800
@@ -31,9 +31,12 @@
 #include <linux/bitops.h>
 #include <linux/mmzone.h>
 #include <linux/smp.h>
-
 #include <asm/topology.h>
 
+#ifndef node_has_online_mem
+#define node_has_online_mem(nid) (1)
+#endif
+
 #ifndef nr_cpus_node
 #define nr_cpus_node(node)							\
 	({									\

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] [Patch] For booting a i386 numa system with no memory in a node
  2005-02-21 22:03                                   ` keith
@ 2005-02-21 22:24                                     ` Dave Hansen
  2005-02-21 23:46                                       ` keith
  0 siblings, 1 reply; 7+ messages in thread
From: Dave Hansen @ 2005-02-21 22:24 UTC (permalink / raw)
  To: keith; +Cc: linux-mm, Martin J. Bligh, matt dobson, John Stultz, Andy Whitcroft

On Mon, 2005-02-21 at 14:03 -0800, keith wrote:
> On Mon, 2005-02-21 at 12:39, Dave Hansen wrote:
> > On Mon, 2005-02-21 at 12:17 -0800, keith wrote:
> > > +               if (node_has_online_mem(nid)){
> > > +                       if (start > low) {
> > 
> > Instead of indenting another level, can you just put a continue in the
> > loop?  I think it makes it much easier to read.  
> 
> I cannot put a continue here.  I know it makes ugly code worse but we
> have to call free area_init_node in all cases.   

If !node_has_online_mem(nid), then (node_start_pfn[nid] ==
node_end_pfn[nid]), and running through this if() won't hurt anything
here:

>                         if (start > low) {
> #ifdef CONFIG_HIGHMEM
>                                 BUG_ON(start > high);
>                                 zones_size[ZONE_HIGHMEM] = high - start;
> #endif
>                         }

high==start, so the bug won't trip, and it will set
zones_size[ZONE_HIGHMEM]=0, which is also OK.  Can you do this?

-               if (start > low) {
+               if (node_has_online_mem(nid) || (start > low)) {


> +#define node_has_online_mem(nid) !(node_start_pfn[nid] == node_end_pfn[nid]) 
> +/*
> +inline int __node_has_online_mem(int nid) {
> +        return !(node_start_pfn[nid]== node_end_pfn[nid]);
> +}
> +*/

You probably want to kill the extra definition.  Also, I prefer

	(node_start_pfn[nid] != node_end_pfn[nid])

to

	!(node_start_pfn[nid] == node_end_pfn[nid])

But, that's the most minor of nits.  

-- Dave

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] [Patch] For booting a i386 numa system with no memory in a node
  2005-02-21 22:24                                     ` Dave Hansen
@ 2005-02-21 23:46                                       ` keith
  2005-02-21 23:55                                         ` Dave Hansen
  2005-02-22  0:09                                         ` Dave Hansen
  0 siblings, 2 replies; 7+ messages in thread
From: keith @ 2005-02-21 23:46 UTC (permalink / raw)
  To: Dave Hansen
  Cc: linux-mm, Martin J. Bligh, matt dobson, john stultz, Andy Whitcroft

[-- Attachment #1: Type: text/plain, Size: 2337 bytes --]

On Mon, 2005-02-21 at 14:24, Dave Hansen wrote:
> On Mon, 2005-02-21 at 14:03 -0800, keith wrote:
> > On Mon, 2005-02-21 at 12:39, Dave Hansen wrote:
> > > On Mon, 2005-02-21 at 12:17 -0800, keith wrote:
> > > > +               if (node_has_online_mem(nid)){
> > > > +                       if (start > low) {
> > > 
> > > Instead of indenting another level, can you just put a continue in the
> > > loop?  I think it makes it much easier to read.  
> > 
> > I cannot put a continue here.  I know it makes ugly code worse but we
> > have to call free area_init_node in all cases.   
> 
> If !node_has_online_mem(nid), then (node_start_pfn[nid] ==
> node_end_pfn[nid]), and running through this if() won't hurt anything
> here:

node_start_pfn[nid] == node_end_pfn[nid] == 0 
start and high are both 0.  That blows the chunk of code up :)

In the no memory in a node case things look like:
start  = 0
high = 0
low = max_low_pfn. 

> >                         if (start > low) {
> > #ifdef CONFIG_HIGHMEM
> >                                 BUG_ON(start > high);
> >                                 zones_size[ZONE_HIGHMEM] = high - start;
> > #endif
> >                         }

 start is 0 and low is max_low_pfn, so (start < low) and I hit
BUG_ON(low > high) in the else part of the if.
 
Since the right zone_sizes is 0 for everything I think it is best just
to skip that section of code altogether. 

> high==start, so the bug won't trip, and it will set
> zones_size[ZONE_HIGHMEM]=0, which is also OK.  Can you do this?
> 
> -               if (start > low) {
> +               if (node_has_online_mem(nid) || (start > low)) {

No, it is the else of that "if" that kills the kernel.  start < low.
The zone_sizes will all be 0 in the !node_has_online_mem case. They are
initialized to 0 and they stay that way when free_area_init_node is called.

> > +#define node_has_online_mem(nid) !(node_start_pfn[nid] == node_end_pfn[nid]) 
> > +/*
> > +inline int __node_has_online_mem(int nid) {
> > +        return !(node_start_pfn[nid]== node_end_pfn[nid]);
> > +}
> > +*/
> 
> You probably want to kill the extra definition.  Also, I prefer

thanks for catching that :)
> 
> 	(node_start_pfn[nid] != node_end_pfn[nid])
> 
> to
> 
> 	!(node_start_pfn[nid] == node_end_pfn[nid])
> 
> But, that's the most minor of nits.  
easy to do. 


Keith 

[-- Attachment #2: patch-2.6.11-rc4-fix_nomem_on_node-v3 --]
[-- Type: text/x-patch, Size: 5016 bytes --]

diff -urN linux-2.6.11-rc4.orig/arch/i386/kernel/srat.c linux-2.6.11-rc4-fix7/arch/i386/kernel/srat.c
--- linux-2.6.11-rc4.orig/arch/i386/kernel/srat.c	2005-02-16 17:23:52.000000000 -0800
+++ linux-2.6.11-rc4-fix7/arch/i386/kernel/srat.c	2005-02-21 13:56:28.000000000 -0800
@@ -273,6 +273,17 @@
 		int been_here_before = 0;
 
 		for (j = 0; j < num_memory_chunks; j++){
+			/*
+			 *Only add present memroy to node_end/start_pfn 
+			 *There is no guarantee from the srat that the memory 
+			 *is present at boot time. 
+			 */
+			if (node_memory_chunk[j].start_pfn >= max_pfn) {
+				printk (KERN_INFO "Ignoring chunk of memory reported in the SRAT (could be hot-add zone?)\n");
+				printk (KERN_INFO "chunk is reported from pfn %04x to %04x\n",
+					node_memory_chunk[j].start_pfn, node_memory_chunk[j].end_pfn);
+				continue;
+			}
 			if (node_memory_chunk[j].nid == nid) {
 				if (been_here_before == 0) {
 					node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
Files linux-2.6.11-rc4.orig/arch/i386/mm/.discontig.c.swp and linux-2.6.11-rc4-fix7/arch/i386/mm/.discontig.c.swp differ
diff -urN linux-2.6.11-rc4.orig/arch/i386/mm/discontig.c linux-2.6.11-rc4-fix7/arch/i386/mm/discontig.c
--- linux-2.6.11-rc4.orig/arch/i386/mm/discontig.c	2005-02-16 17:23:52.000000000 -0800
+++ linux-2.6.11-rc4-fix7/arch/i386/mm/discontig.c	2005-02-21 11:40:28.000000000 -0800
@@ -128,7 +128,7 @@
  */
 static void __init allocate_pgdat(int nid)
 {
-	if (nid)
+	if (nid && node_has_online_mem(nid))
 		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
 	else {
 		NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT));
@@ -204,8 +204,10 @@
 		if (nid == 0)
 			continue;
 		/* calculate the size of the mem_map needed in bytes */
-		size = (node_end_pfn[nid] - node_start_pfn[nid] + 1) 
-			* sizeof(struct page) + sizeof(pg_data_t);
+		size = node_end_pfn[nid] - node_start_pfn[nid];
+		if (size)
+			size = (size + 1) * sizeof(struct page) + sizeof(pg_data_t);
+		
 		/* convert size to large (pmd size) pages, rounding up */
 		size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
 		/* now the roundup is correct, convert to PAGE_SIZE pages */
@@ -242,6 +244,7 @@
 	unsigned long bootmap_size, system_start_pfn, system_max_low_pfn;
 	unsigned long reserve_pages, pfn;
 
+	find_max_pfn();
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
 	 * from node local memory.  They are then mapped directly into KVA
@@ -270,7 +273,6 @@
 	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
-	find_max_pfn();
 	system_max_low_pfn = max_low_pfn = find_max_low_pfn() - reserve_pages;
 	printk("reserve_pages = %ld find_max_low_pfn() ~ %ld\n",
 			reserve_pages, max_low_pfn + reserve_pages);
@@ -399,24 +401,27 @@
 
 		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
-		if (start > low) {
+		if (node_has_online_mem(nid)){
+			if (start > low) {
 #ifdef CONFIG_HIGHMEM
-			BUG_ON(start > high);
-			zones_size[ZONE_HIGHMEM] = high - start;
+				BUG_ON(start > high);
+				zones_size[ZONE_HIGHMEM] = high - start;
 #endif
-		} else {
-			if (low < max_dma)
-				zones_size[ZONE_DMA] = low;
-			else {
-				BUG_ON(max_dma > low);
-				BUG_ON(low > high);
-				zones_size[ZONE_DMA] = max_dma;
-				zones_size[ZONE_NORMAL] = low - max_dma;
+			} else {
+				if (low < max_dma)
+					zones_size[ZONE_DMA] = low;
+				else {
+					BUG_ON(max_dma > low);
+					BUG_ON(low > high);
+					zones_size[ZONE_DMA] = max_dma;
+					zones_size[ZONE_NORMAL] = low - max_dma;
 #ifdef CONFIG_HIGHMEM
-				zones_size[ZONE_HIGHMEM] = high - low;
-#endif
+					zones_size[ZONE_HIGHMEM] = high - low;
+#endif	
+				}
 			}
 		}
+		
 		zholes_size = get_zholes_size(nid);
 		/*
 		 * We let the lmem_map for node 0 be allocated from the
diff -urN linux-2.6.11-rc4.orig/include/asm-i386/topology.h linux-2.6.11-rc4-fix7/include/asm-i386/topology.h
--- linux-2.6.11-rc4.orig/include/asm-i386/topology.h	2005-02-16 17:23:58.000000000 -0800
+++ linux-2.6.11-rc4-fix7/include/asm-i386/topology.h	2005-02-21 15:39:12.000000000 -0800
@@ -88,6 +88,11 @@
 	.nr_balance_failed	= 0,			\
 }
 
+extern unsigned long node_start_pfn[];
+extern unsigned long node_end_pfn[];
+
+#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])                 
+
 #else /* !CONFIG_NUMA */
 /*
  * Other i386 platforms should define their own version of the 
diff -urN linux-2.6.11-rc4.orig/include/linux/topology.h linux-2.6.11-rc4-fix7/include/linux/topology.h
--- linux-2.6.11-rc4.orig/include/linux/topology.h	2005-02-16 17:23:58.000000000 -0800
+++ linux-2.6.11-rc4-fix7/include/linux/topology.h	2005-02-21 11:32:10.000000000 -0800
@@ -31,9 +31,12 @@
 #include <linux/bitops.h>
 #include <linux/mmzone.h>
 #include <linux/smp.h>
-
 #include <asm/topology.h>
 
+#ifndef node_has_online_mem
+#define node_has_online_mem(nid) (1)
+#endif
+
 #ifndef nr_cpus_node
 #define nr_cpus_node(node)							\
 	({									\

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] [Patch] For booting a i386 numa system with no memory in a node
  2005-02-21 23:46                                       ` keith
@ 2005-02-21 23:55                                         ` Dave Hansen
  2005-02-22  0:09                                         ` Dave Hansen
  1 sibling, 0 replies; 7+ messages in thread
From: Dave Hansen @ 2005-02-21 23:55 UTC (permalink / raw)
  To: keith; +Cc: linux-mm, Martin J. Bligh, matt dobson, John Stultz, Andy Whitcroft

[-- Attachment #1: Type: text/plain, Size: 123 bytes --]

I think you interpreted my suggestion about the if() backwards.  Is
there a reason the attached patch won't work?

-- Dave

[-- Attachment #2: collapse-if.patch --]
[-- Type: text/x-patch, Size: 1281 bytes --]



---

 sparse-dave/arch/i386/mm/discontig.c |   22 ++++++++++------------
 1 files changed, 10 insertions(+), 12 deletions(-)

diff -puN arch/i386/mm/discontig.c~collapse-if arch/i386/mm/discontig.c
--- sparse/arch/i386/mm/discontig.c~collapse-if	2005-02-21 15:53:54.000000000 -0800
+++ sparse-dave/arch/i386/mm/discontig.c	2005-02-21 15:54:03.000000000 -0800
@@ -401,24 +401,22 @@ void __init zone_sizes_init(void)
 
 		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
-		if (node_has_online_mem(nid)){
-			if (start > low) {
+		if ((start > low) || !node_has_online_mem(nid)) {
 #ifdef CONFIG_HIGHMEM
 				BUG_ON(start > high);
 				zones_size[ZONE_HIGHMEM] = high - start;
 #endif
-			} else {
-				if (low < max_dma)
-					zones_size[ZONE_DMA] = low;
-				else {
-					BUG_ON(max_dma > low);
-					BUG_ON(low > high);
-					zones_size[ZONE_DMA] = max_dma;
-					zones_size[ZONE_NORMAL] = low - max_dma;
+		} else {
+			if (low < max_dma)
+				zones_size[ZONE_DMA] = low;
+			else {
+				BUG_ON(max_dma > low);
+				BUG_ON(low > high);
+				zones_size[ZONE_DMA] = max_dma;
+				zones_size[ZONE_NORMAL] = low - max_dma;
 #ifdef CONFIG_HIGHMEM
-					zones_size[ZONE_HIGHMEM] = high - low;
+				zones_size[ZONE_HIGHMEM] = high - low;
 #endif
-				}
 			}
 		}
 
_

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] [Patch] For booting a i386 numa system with no memory in a node
  2005-02-21 23:46                                       ` keith
  2005-02-21 23:55                                         ` Dave Hansen
@ 2005-02-22  0:09                                         ` Dave Hansen
  1 sibling, 0 replies; 7+ messages in thread
From: Dave Hansen @ 2005-02-22  0:09 UTC (permalink / raw)
  To: keith; +Cc: linux-mm, Martin J. Bligh, matt dobson, John Stultz, Andy Whitcroft

This patch somewhat simplifies the code that you're working with.  You
may want to apply it first because it solves a few of the same problems.

http://www.sr71.net/patches/2.6.11/2.6.11-rc3-mhp1/broken-out/B-sparse-080-alloc_remap-i386.patch

-- Dave

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2005-02-22  0:09 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <1106881119.2040.122.camel@cog.beaverton.ibm.com>
     [not found] ` <1106882150.2040.126.camel@cog.beaverton.ibm.com>
     [not found]   ` <1106937253.27125.6.camel@knk>
     [not found]     ` <1106938993.14330.65.camel@localhost>
     [not found]       ` <1106941547.27125.25.camel@knk>
     [not found]         ` <1106942832.17936.3.camel@arrakis>
     [not found]           ` <1108611260.9817.1227.camel@knk>
     [not found]             ` <1108654782.19395.9.camel@localhost>
     [not found]               ` <1108664637.9817.1259.camel@knk>
     [not found]                 ` <1108666091.19395.29.camel@localhost>
     [not found]                   ` <1108671423.9817.1266.camel@knk>
     [not found]                     ` <421510E9.3000901@us.ibm.com>
     [not found]                       ` <1108677113.32193.8.camel@localhost>
     [not found]                         ` <42152690.4030508@us.ibm.com>
     [not found]                           ` <9230000.1108666127@flay>
     [not found]                             ` <1108686742.6482.51.camel@localhost>
2005-02-21 20:17                               ` [RFC] [Patch] For booting a i386 numa system with no memory in a node keith
2005-02-21 20:39                                 ` Dave Hansen
2005-02-21 22:03                                   ` keith
2005-02-21 22:24                                     ` Dave Hansen
2005-02-21 23:46                                       ` keith
2005-02-21 23:55                                         ` Dave Hansen
2005-02-22  0:09                                         ` Dave Hansen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox