This patch remaps the lmem_map (struct page) arrays for each node onto their own nodes. This is non-trivial, since all of ZONE_NORMAL, and hence permanently mapped KVA resides on node 0. Very early in the boot sequence, it calculates the size of the lmem_map arrays (rounding up to the nearest large page size), and reserves a suitable amount of permanent KVA by shifting down max_low_pfn to create a gap between max_low_pfn and highstart_pfn (both of which are normally about 896Mb). It then uses the new set_pmd_pfn function to set up the pmds correctly so that the large pages point at the physical addresses reserved from the remote nodes.

Tested on NUMA-Q and some ratty old i386 PC kicking around under my desk (on 2.5.36-mm1). Was good for a 20% improvement in system time on kernel compile when I initially benchmarked it against 2.5.32 or something - due to a reduction in inter-node traffic, better interconnect cache usage and locality. Should have no effect on any system other than i386 NUMA systems.

M.

diff -urN -X /home/mbligh/.diff.exclude 11-numafixes2/arch/i386/mm/discontig.c 20-numamap/arch/i386/mm/discontig.c
--- 11-numafixes2/arch/i386/mm/discontig.c	Wed Sep 18 20:41:11 2002
+++ 20-numamap/arch/i386/mm/discontig.c	Thu Sep 19 16:07:10 2002
@@ -1,5 +1,6 @@
 /*
- * Written by: Patricia Gaughen, IBM Corporation
+ * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
+ * August 2002: added remote node KVA remap - Martin J. Bligh
  *
  * Copyright (C) 2002, IBM Corp.
  *
@@ -19,8 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <gone@us.ibm.com>
  */
 
 #include <linux/config.h>
@@ -113,35 +112,98 @@
 	}
 }
 
+#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
+
+unsigned long node_remap_start_pfn[MAX_NUMNODES];
+unsigned long node_remap_size[MAX_NUMNODES];
+unsigned long node_remap_offset[MAX_NUMNODES];
+void *node_remap_start_vaddr[MAX_NUMNODES];
+extern void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+
+void __init remap_numa_kva(void)
+{
+	void *vaddr;
+	unsigned long pfn;
+	int node;
+
+	for (node = 1; node < numnodes; ++node) {
+		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
+			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+			set_pmd_pfn((ulong) vaddr,
+				node_remap_start_pfn[node] + pfn,
+				PAGE_KERNEL_LARGE);
+		}
+	}
+}
+
+static unsigned long calculate_numa_remap_pages(void)
+{
+	int nid;
+	unsigned long size, reserve_pages = 0;
+
+	for (nid = 1; nid < numnodes; nid++) {
+		/* calculate the size of the lmem_map needed in bytes */
+		size = (node_end_pfn[nid] - node_start_pfn[nid] + 1)
+			* sizeof(struct page) + sizeof(pg_data_t);
+		/* convert size to large (pmd size) pages, rounding up */
+		size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
+		/* now the roundup is correct, convert to PAGE_SIZE pages */
+		size = size * PTRS_PER_PTE;
+		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
+				size, nid);
+		node_remap_size[nid] = size;
+		node_remap_offset[nid] = reserve_pages;
+		reserve_pages += size;
+		node_end_pfn[nid] -= size;
+		node_remap_start_pfn[nid] = node_end_pfn[nid];
+	}
+	printk("Reserving total of %ld pages for numa KVA remap\n",
+			reserve_pages);
+	return reserve_pages;
+}
+
 unsigned long __init setup_memory(void)
 {
 	int nid;
-	unsigned long bootmap_size, system_start_pfn, system_max_low_pfn;
+	unsigned long bootmap_size, system_start_pfn, system_max_low_pfn;
+	unsigned long reserve_pages;
 
 	get_memcfg_numa();
+	reserve_pages = calculate_numa_remap_pages();
 
 	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
 	find_max_pfn();
 	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
 
 #ifdef CONFIG_HIGHMEM
-	highstart_pfn = highend_pfn = max_pfn;
-	if (max_pfn > system_max_low_pfn) {
-		highstart_pfn = system_max_low_pfn;
-	}
-	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
-		pages_to_mb(highend_pfn - highstart_pfn));
+	highstart_pfn = highend_pfn = max_pfn;
+	if (max_pfn > system_max_low_pfn)
+		highstart_pfn = system_max_low_pfn;
+	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+		pages_to_mb(highend_pfn - highstart_pfn));
 #endif
+	system_max_low_pfn = max_low_pfn = max_low_pfn - reserve_pages;
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
			pages_to_mb(system_max_low_pfn));
-
-	for (nid = 0; nid < numnodes; nid++)
+	printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
+		min_low_pfn, max_low_pfn, highstart_pfn);
+
+	printk("Low memory ends at vaddr %08lx\n",
+		(ulong) pfn_to_kaddr(max_low_pfn));
+	for (nid = 0; nid < numnodes; nid++) {
 		allocate_pgdat(nid);
+		node_remap_start_vaddr[nid] = pfn_to_kaddr(
+			highstart_pfn - node_remap_offset[nid]);
+		printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+			(ulong) node_remap_start_vaddr[nid],
+			(ulong) pfn_to_kaddr(highstart_pfn
+			- node_remap_offset[nid] + node_remap_size[nid]));
+	}
+	printk("High memory starts at vaddr %08lx\n",
+			(ulong) pfn_to_kaddr(highstart_pfn));
 	for (nid = 0; nid < numnodes; nid++)
 		find_max_pfn_node(nid);
 
@@ -244,7 +306,18 @@
 #endif
 		}
 	}
-	free_area_init_node(nid, NODE_DATA(nid), 0, zones_size, start, 0);
+
/*
+	 * We let the lmem_map for node 0 be allocated from the
+	 * normal bootmem allocator, but other nodes come from the
+	 * remapped KVA area - mbligh
+	 */
+	if (nid)
+		free_area_init_node(nid, NODE_DATA(nid),
+			node_remap_start_vaddr[nid], zones_size,
+			start, 0);
+	else
+		free_area_init_node(nid, NODE_DATA(nid), 0,
+			zones_size, start, 0);
 	}
 	return;
 }
diff -urN -X /home/mbligh/.diff.exclude 11-numafixes2/arch/i386/mm/init.c 20-numamap/arch/i386/mm/init.c
--- 11-numafixes2/arch/i386/mm/init.c	Wed Sep 18 20:41:11 2002
+++ 20-numamap/arch/i386/mm/init.c	Thu Sep 19 16:07:10 2002
@@ -245,6 +245,12 @@
 
 unsigned long __PAGE_KERNEL = _PAGE_KERNEL;
 
+#ifndef CONFIG_DISCONTIGMEM
+#define remap_numa_kva() do {} while (0)
+#else
+extern void __init remap_numa_kva(void);
+#endif
+
 static void __init pagetable_init (void)
 {
 	unsigned long vaddr;
@@ -269,6 +275,7 @@
 	}
 
 	kernel_physical_mapping_init(pgd_base);
+	remap_numa_kva();
 
 	/*
 	 * Fixed mappings, only the page table structure has to be
@@ -449,7 +456,11 @@
 
 	set_max_mapnr_init();
 
+#ifdef CONFIG_HIGHMEM
+	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE);
+#else
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+#endif
 
 	/* clear the zero-page */
 	memset(empty_zero_page, 0, PAGE_SIZE);
diff -urN -X /home/mbligh/.diff.exclude 11-numafixes2/arch/i386/mm/pgtable.c 20-numamap/arch/i386/mm/pgtable.c
--- 11-numafixes2/arch/i386/mm/pgtable.c	Wed Sep 18 20:41:11 2002
+++ 20-numamap/arch/i386/mm/pgtable.c	Thu Sep 19 16:07:10 2002
@@ -84,6 +84,39 @@
 	__flush_tlb_one(vaddr);
 }
 
+/*
+ * Associate a large virtual page frame with a given physical page frame
+ * and protection flags for that frame. pfn is for the base of the page,
+ * vaddr is what the page gets mapped to - both must be properly aligned.
+ * The pmd must already be instantiated. Assumes PAE mode.
+ */
+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+	pgd_t *pgd;
+	pmd_t *pmd;
+
+	if (vaddr & (PMD_SIZE-1)) {		/* vaddr is misaligned */
+		printk ("set_pmd_pfn: vaddr misaligned\n");
+		return; /* BUG(); */
+	}
+	if (pfn & (PTRS_PER_PTE-1)) {		/* pfn is misaligned */
+		printk ("set_pmd_pfn: pfn misaligned\n");
+		return; /* BUG(); */
+	}
+	pgd = swapper_pg_dir + __pgd_offset(vaddr);
+	if (pgd_none(*pgd)) {
+		printk ("set_pmd_pfn: pgd_none\n");
+		return; /* BUG(); */
+	}
+	pmd = pmd_offset(pgd, vaddr);
+	set_pmd(pmd, pfn_pmd(pfn, flags));
+	/*
+	 * It's enough to flush this one mapping.
+	 * (PGE mappings get flushed as well)
+	 */
+	__flush_tlb_one(vaddr);
+}
+
 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
 {
 	unsigned long address = __fix_to_virt(idx);
diff -urN -X /home/mbligh/.diff.exclude 11-numafixes2/include/asm-i386/page.h 20-numamap/include/asm-i386/page.h
--- 11-numafixes2/include/asm-i386/page.h	Wed Sep 18 20:41:12 2002
+++ 20-numamap/include/asm-i386/page.h	Thu Sep 19 16:07:10 2002
@@ -142,6 +142,7 @@
 #define MAXMEM	((unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE))
 #define __pa(x)		((unsigned long)(x)-PAGE_OFFSET)
 #define __va(x)		((void *)((unsigned long)(x)+PAGE_OFFSET))
+#define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)
 #ifndef CONFIG_DISCONTIGMEM
 #define pfn_to_page(pfn)	(mem_map + (pfn))
 #define page_to_pfn(page)	((unsigned long)((page) - mem_map))