So, this patch started out with me trying to keep from passing contiguous, node-specific mem_map into free_area_init_node() and cousins. Instead, I relied on some calls to pfn_to_page(). This works fine and dandy when all you need is the pgdat->node_mem_map to do pfn_to_page(). However, the non-NUMA/DISCONTIG architectures use the real, global mem_map[] instead of a node_mem_map in the pfn_to_page() calculation. So, I ended up effectively trying to initialize mem_map from itself, when it was NULL. That was bad, and caused some very pretty colors on someone's screen when he tested it. So, I had to make sure to initialize the global mem_map[] before calling into free_area_init_node(). Then, I realized how many architectures do this on their own, and have comments like this: /* XXX: MRB-remove - this doesn't seem sane, should this be done somewhere else ?*/ mem_map = NODE_DATA(0)->node_mem_map; Also, ppc64 has some interesting hacks^Wcode in this area to make up for any empty-memory NUMA nodes, which I think can go away now. The following patch does what my first one did (don't pass mem_map into the init functions), incorporates Jesse Barnes' ia64 fixes on top of that, and gets rid of all but one of the global mem_map initializations (parisc is weird). It also magically removes more code than it adds. It could be smaller, but I shamelessly added some comments. Signed-off-by: Dave Hansen --- diff -urp clean/arch/arm/mm/init.c btest/arch/arm/mm/init.c --- clean/arch/arm/mm/init.c 2004-08-23 08:08:11.000000000 -0700 +++ btest/arch/arm/mm/init.c 2004-08-24 10:47:18.000000000 -0700 @@ -499,10 +499,6 @@ void __init paging_init(struct meminfo * bdata->node_boot_start >> PAGE_SHIFT, zhole_size); } -#ifndef CONFIG_DISCONTIGMEM - mem_map = contig_page_data.node_mem_map; -#endif - /* * finish off the bad pages once * the mem_map is initialised diff -urp clean/arch/arm26/mm/init.c btest/arch/arm26/mm/init.c --- clean/arch/arm26/mm/init.c 2004-08-23 08:07:29.000000000 -0700 +++ btest/arch/arm26/mm/init.c 2004-08-24 10:47:18.000000000 -0700 @@ -306,11 +306,9 @@ void __init paging_init(struct meminfo * if (!zone_size[0]) BUG(); - free_area_init_node(0, pgdat, 0, zone_size, + free_area_init_node(0, pgdat, zone_size, bdata->node_boot_start >> PAGE_SHIFT, zhole_size); - mem_map = NODE_DATA(0)->node_mem_map; - /* * finish off the bad pages once * the mem_map is initialised diff -urp clean/arch/cris/arch-v10/mm/init.c btest/arch/cris/arch-v10/mm/init.c --- clean/arch/cris/arch-v10/mm/init.c 2004-08-23 08:07:28.000000000 -0700 +++ btest/arch/cris/arch-v10/mm/init.c 2004-08-24 10:47:18.000000000 -0700 @@ -183,8 +183,7 @@ paging_init(void) * mem_map page array. */ - free_area_init_node(0, &contig_page_data, 0, zones_size, PAGE_OFFSET >> PAGE_SHIFT, 0); - mem_map = contig_page_data.node_mem_map; + free_area_init_node(0, &contig_page_data, zones_size, PAGE_OFFSET >> PAGE_SHIFT, 0); } /* Initialize remaps of some I/O-ports. It is important that this diff -urp clean/arch/i386/mm/discontig.c btest/arch/i386/mm/discontig.c --- clean/arch/i386/mm/discontig.c 2004-08-23 08:08:11.000000000 -0700 +++ btest/arch/i386/mm/discontig.c 2004-08-24 10:47:18.000000000 -0700 @@ -417,18 +417,15 @@ void __init zone_sizes_init(void) * normal bootmem allocator, but other nodes come from the * remapped KVA area - mbligh */ - if (!nid) - free_area_init_node(nid, NODE_DATA(nid), - zones_size, start, zholes_size); - else { + if (nid) { unsigned long lmem_map; lmem_map = (unsigned long)node_remap_start_vaddr[nid]; lmem_map += sizeof(pg_data_t) + PAGE_SIZE - 1; lmem_map &= PAGE_MASK; NODE_DATA(nid)->node_mem_map = (struct page *)lmem_map; - free_area_init_node(nid, NODE_DATA(nid), zones_size, - start, zholes_size); } + free_area_init_node(nid, NODE_DATA(nid), zones_size, + start, zholes_size); } return; } diff -urp clean/arch/ia64/mm/contig.c btest/arch/ia64/mm/contig.c --- clean/arch/ia64/mm/contig.c 2004-08-23 08:08:11.000000000 -0700 +++ btest/arch/ia64/mm/contig.c 2004-08-24 10:47:18.000000000 -0700 @@ -269,7 +269,6 @@ paging_init (void) vmem_map = (struct page *) 0; free_area_init_node(0, &contig_page_data, zones_size, 0, zholes_size); - mem_map = contig_page_data.node_mem_map; } else { unsigned long map_size; @@ -280,11 +279,10 @@ paging_init (void) vmem_map = (struct page *) vmalloc_end; efi_memmap_walk(create_mem_map_page_table, 0); - contig_page_data.node_mem_map = vmem_map; + NODE_DATA(0)->node_mem_map = vmem_map; free_area_init_node(0, &contig_page_data, zones_size, 0, zholes_size); - mem_map = contig_page_data.node_mem_map; printk("Virtual mem_map starts at 0x%p\n", mem_map); } #else /* !CONFIG_VIRTUAL_MEM_MAP */ diff -urp clean/arch/ia64/mm/discontig.c btest/arch/ia64/mm/discontig.c --- clean/arch/ia64/mm/discontig.c 2004-08-23 08:08:11.000000000 -0700 +++ btest/arch/ia64/mm/discontig.c 2004-08-24 10:47:18.000000000 -0700 @@ -665,6 +665,7 @@ void paging_init(void) pfn_offset = mem_data[node].min_pfn; + NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset; free_area_init_node(node, NODE_DATA(node), zones_size, pfn_offset, zholes_size); } diff -urp clean/arch/ia64/mm/init.c btest/arch/ia64/mm/init.c --- clean/arch/ia64/mm/init.c 2004-08-23 08:08:11.000000000 -0700 +++ btest/arch/ia64/mm/init.c 2004-08-24 10:47:18.000000000 -0700 @@ -429,7 +429,7 @@ virtual_memmap_init (u64 start, u64 end, / sizeof(struct page)); if (map_start < map_end) - memmap_init_zone((unsigned long)(map_end - map_start), + memmap_init_zone(map_start, (unsigned long) (map_end - map_start), args->nid, args->zone, page_to_pfn(map_start)); return 0; } diff -urp clean/arch/ppc64/mm/init.c btest/arch/ppc64/mm/init.c --- clean/arch/ppc64/mm/init.c 2004-08-23 08:08:11.000000000 -0700 +++ btest/arch/ppc64/mm/init.c 2004-08-24 10:47:18.000000000 -0700 @@ -615,7 +615,6 @@ void __init paging_init(void) free_area_init_node(0, &contig_page_data, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, zholes_size); - mem_map = contig_page_data.node_mem_map; } #endif /* CONFIG_DISCONTIGMEM */ diff -urp clean/arch/sh/mm/init.c btest/arch/sh/mm/init.c --- clean/arch/sh/mm/init.c 2004-08-23 08:08:11.000000000 -0700 +++ btest/arch/sh/mm/init.c 2004-08-24 10:47:28.000000000 -0700 @@ -215,9 +215,7 @@ void __init paging_init(void) disable_mmu(); #endif - free_area_init_node(0, NODE_DATA(0), 0, zones_size, __MEMORY_START >> PAGE_SHIFT, 0); - /* XXX: MRB-remove - this doesn't seem sane, should this be done somewhere else ?*/ - mem_map = NODE_DATA(0)->node_mem_map; + free_area_init_node(0, NODE_DATA(0), zones_size, __MEMORY_START >> PAGE_SHIFT, 0); #ifdef CONFIG_DISCONTIGMEM /* diff -urp clean/arch/sh64/mm/init.c btest/arch/sh64/mm/init.c --- clean/arch/sh64/mm/init.c 2004-08-23 08:07:29.000000000 -0700 +++ btest/arch/sh64/mm/init.c 2004-08-24 10:47:28.000000000 -0700 @@ -123,10 +123,7 @@ void __init paging_init(void) */ zones_size[ZONE_DMA] = MAX_LOW_PFN - START_PFN; - free_area_init_node(0, NODE_DATA(0), 0, zones_size, __MEMORY_START >> PAGE_SHIFT, 0); - - /* XXX: MRB-remove - this doesn't seem sane, should this be done somewhere else ?*/ - mem_map = NODE_DATA(0)->node_mem_map; + free_area_init_node(0, NODE_DATA(0), zones_size, __MEMORY_START >> PAGE_SHIFT, 0); } void __init mem_init(void) diff -urp clean/arch/sparc/mm/srmmu.c btest/arch/sparc/mm/srmmu.c --- clean/arch/sparc/mm/srmmu.c 2004-08-23 08:08:11.000000000 -0700 +++ btest/arch/sparc/mm/srmmu.c 2004-08-24 10:47:28.000000000 -0700 @@ -1343,7 +1343,6 @@ void __init srmmu_paging_init(void) free_area_init_node(0, &contig_page_data, zones_size, pfn_base, zholes_size); - mem_map = contig_page_data.node_mem_map; } } diff -urp clean/arch/sparc/mm/sun4c.c btest/arch/sparc/mm/sun4c.c --- clean/arch/sparc/mm/sun4c.c 2004-08-23 08:08:11.000000000 -0700 +++ btest/arch/sparc/mm/sun4c.c 2004-08-24 10:47:28.000000000 -0700 @@ -2116,7 +2116,6 @@ void __init sun4c_paging_init(void) free_area_init_node(0, &contig_page_data, zones_size, pfn_base, zholes_size); - mem_map = contig_page_data.node_mem_map; } cnt = 0; diff -urp clean/arch/sparc64/mm/init.c btest/arch/sparc64/mm/init.c --- clean/arch/sparc64/mm/init.c 2004-08-23 08:08:11.000000000 -0700 +++ btest/arch/sparc64/mm/init.c 2004-08-24 10:47:28.000000000 -0700 @@ -1504,7 +1504,6 @@ void __init paging_init(void) free_area_init_node(0, &contig_page_data, zones_size, phys_base >> PAGE_SHIFT, zholes_size); - mem_map = contig_page_data.node_mem_map; } device_scan(); diff -urp clean/arch/v850/kernel/setup.c btest/arch/v850/kernel/setup.c --- clean/arch/v850/kernel/setup.c 2004-08-23 08:07:29.000000000 -0700 +++ btest/arch/v850/kernel/setup.c 2004-08-24 10:47:29.000000000 -0700 @@ -281,7 +281,6 @@ init_mem_alloc (unsigned long ram_start, #error MAX_ORDER is too large for given PAGE_OFFSET (use CONFIG_FORCE_MAX_ZONEORDER to change it) #endif - free_area_init_node (0, NODE_DATA(0), 0, zones_size, + free_area_init_node (0, NODE_DATA(0), zones_size, ADDR_TO_PAGE (PAGE_OFFSET), 0); - mem_map = NODE_DATA(0)->node_mem_map; } diff -urp clean/mm/page_alloc.c btest/mm/page_alloc.c --- clean/mm/page_alloc.c 2004-08-23 08:08:23.000000000 -0700 +++ btest/mm/page_alloc.c 2004-08-24 10:47:28.000000000 -0700 @@ -1690,14 +1690,25 @@ static void __init free_area_init_core(s } } -void __init node_alloc_mem_map(struct pglist_data *pgdat) +static void __init alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long size; + /* + * Make sure that the architecture hasn't already allocated + * a node_mem_map, and that the node contains memory. + */ + if (pgdat->node_mem_map || !pgdat->node_spanned_pages) + return; + size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); #ifndef CONFIG_DISCONTIGMEM - mem_map = contig_page_data.node_mem_map; + /* + * With no DISCONTIG, the global mem_map is just set as node 0's + */ + if (pgdat == NODE_DATA(0)) + mem_map = NODE_DATA(0)->node_mem_map; #endif } @@ -1709,8 +1720,7 @@ void __init free_area_init_node(int nid, pgdat->node_start_pfn = node_start_pfn; calculate_zone_totalpages(pgdat, zones_size, zholes_size); - if (!pfn_to_page(node_start_pfn)) - node_alloc_mem_map(pgdat); + alloc_node_mem_map(pgdat); free_area_init_core(pgdat, zones_size, zholes_size); }