From: "Martin J. Bligh" <mbligh@aracnet.com>
To: Andrew Morton <akpm@digeo.com>
Cc: linux-mm mailing list <linux-mm@kvack.org>
Subject: [PATCH] relocate lmem_maps for i386 discontigmem onto their own nodes
Date: Sat, 21 Sep 2002 14:55:36 -0700
Message-ID: <8826642.1032620136@[10.10.2.3]>


This patch remaps the lmem_map (struct page) arrays for each node
onto their own nodes. This is non-trivial, since all of ZONE_NORMAL,
and hence all permanently mapped KVA, resides on node 0.

Very early in the boot sequence, it calculates the size of the
lmem_map arrays (rounding up to the nearest large page size) and
reserves a suitable amount of permanent KVA by shifting max_low_pfn
down to create a gap between max_low_pfn and highstart_pfn
(both of which normally sit at about 896MB).
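
Just to illustrate the arithmetic, here's a throwaway userspace sketch
of that calculation (the node size and sizeof(struct page) are invented
example numbers, and PTRS_PER_PTE/PAGE_SIZE assume PAE i386):

/* Sketch of the per-node KVA reservation size calculation
 * (mirrors calculate_numa_remap_pages() in the patch below).
 * All input values are assumed examples, not real measurements. */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PTRS_PER_PTE	512UL			/* PAE: 512 ptes per pmd */
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)	/* 2MB large page */

int main(void)
{
	unsigned long node_pages = 1048576UL;	/* assume a 4GB node */
	unsigned long page_struct = 40UL;	/* assumed sizeof(struct page) */
	unsigned long size;

	size = node_pages * page_struct;	/* bytes of lmem_map needed */
	size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; /* round up to large pages */
	size *= PTRS_PER_PTE;			/* express as 4K pages of KVA */
	printf("reserve %lu pages (%luMB) of KVA for this node\n",
	       size, (size * PAGE_SIZE) >> 20);
	return 0;
}

With those assumed numbers, each 4GB node costs about 40MB of
permanently mapped KVA.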

It then uses the new set_pmd_pfn function to set up the pmds 
correctly so that the large pages point at the physical addresses
reserved from the remote nodes.
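
To make the stride concrete, here's a trivial userspace sketch of the
loop that feeds set_pmd_pfn - one call per large page, stepping
PTRS_PER_PTE pfns (2MB of KVA) at a time. The base vaddr and pfn are
invented example values, not taken from a real box:

/* Sketch of the remap_numa_kva() stride for one remote node.
 * The starting vaddr/pfn are made-up, but both satisfy the alignment
 * set_pmd_pfn() demands (2MB-aligned vaddr, pfn a multiple of 512). */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PTRS_PER_PTE	512UL		/* PAE: one pmd maps 512 pages */

int main(void)
{
	unsigned long vaddr_base = 0xf5000000UL; /* assumed remap KVA for node 1 */
	unsigned long pfn_base   = 0x000fae00UL; /* assumed pfn at top of node 1 */
	unsigned long remap_size = 2048UL;	 /* pages of KVA reserved (8MB) */
	unsigned long pfn;

	for (pfn = 0; pfn < remap_size; pfn += PTRS_PER_PTE)
		printf("set_pmd_pfn(%08lx, %08lx, PAGE_KERNEL_LARGE)\n",
		       vaddr_base + (pfn << PAGE_SHIFT), pfn_base + pfn);
	return 0;
}

The real code does this in remap_numa_kva(), called from
pagetable_init() once the kernel page tables exist.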

Tested on NUMA-Q and a ratty old i386 PC kicking around under
my desk (on 2.5.36-mm1). It was good for a 20% improvement in system
time on a kernel compile when I initially benchmarked it against
2.5.32 or something, due to a reduction in inter-node traffic and
better interconnect cache usage and locality. It should have no effect
on any system other than i386 NUMA systems.

M.

diff -urN -X /home/mbligh/.diff.exclude 11-numafixes2/arch/i386/mm/discontig.c 20-numamap/arch/i386/mm/discontig.c
--- 11-numafixes2/arch/i386/mm/discontig.c	Wed Sep 18 20:41:11 2002
+++ 20-numamap/arch/i386/mm/discontig.c	Thu Sep 19 16:07:10 2002
@@ -1,5 +1,6 @@
 /*
- * Written by: Patricia Gaughen, IBM Corporation
+ * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
+ * August 2002: added remote node KVA remap - Martin J. Bligh 
  *
  * Copyright (C) 2002, IBM Corp.
  *
@@ -19,8 +20,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <gone@us.ibm.com>
  */
 
 #include <linux/config.h>
@@ -113,35 +112,98 @@
 	}
 }
 
+#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
+
+unsigned long node_remap_start_pfn[MAX_NUMNODES];
+unsigned long node_remap_size[MAX_NUMNODES];
+unsigned long node_remap_offset[MAX_NUMNODES];
+void *node_remap_start_vaddr[MAX_NUMNODES];
+extern void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+
+void __init remap_numa_kva(void)
+{
+	void *vaddr;
+	unsigned long pfn;
+	int node;
+
+	for (node = 1; node < numnodes; ++node) {
+		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
+			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+			set_pmd_pfn((ulong) vaddr, 
+				node_remap_start_pfn[node] + pfn, 
+				PAGE_KERNEL_LARGE);
+		}
+	}
+}
+
+static unsigned long calculate_numa_remap_pages(void)
+{
+	int nid;
+	unsigned long size, reserve_pages = 0;
+
+	for (nid = 1; nid < numnodes; nid++) {
+		/* calculate the size of the mem_map needed in bytes */
+		size = (node_end_pfn[nid] - node_start_pfn[nid] + 1) 
+			* sizeof(struct page);
+		/* convert size to large (pmd size) pages, rounding up */
+		size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
+		/* now the roundup is correct, convert to PAGE_SIZE pages */
+		size = size * PTRS_PER_PTE;
+		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
+				size, nid);
+		node_remap_size[nid] = size;
+		reserve_pages += size;
+		node_remap_offset[nid] = reserve_pages;
+		printk("Shrinking node %d from %ld pages to %ld pages\n",
+			nid, node_end_pfn[nid], node_end_pfn[nid] - size);
+		node_end_pfn[nid] -= size;
+		node_remap_start_pfn[nid] = node_end_pfn[nid];
+	}
+	printk("Reserving total of %ld pages for numa KVA remap\n",
+			reserve_pages);
+	return reserve_pages;
+}
+
 unsigned long __init setup_memory(void)
 {
 	int nid;
 	unsigned long bootmap_size, system_start_pfn, system_max_low_pfn;
+	unsigned long reserve_pages;
 
 	get_memcfg_numa();
+	reserve_pages = calculate_numa_remap_pages();
 
-	/*
-	 * partially used pages are not usable - thus
-	 * we are rounding upwards:
-	 */
+	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(__pa(&_end));
 
 	find_max_pfn();
 	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
-
 #ifdef CONFIG_HIGHMEM
-		highstart_pfn = highend_pfn = max_pfn;
-		if (max_pfn > system_max_low_pfn) {
-			highstart_pfn = system_max_low_pfn;
-		}
-		printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
-		       pages_to_mb(highend_pfn - highstart_pfn));
+	highstart_pfn = highend_pfn = max_pfn;
+	if (max_pfn > system_max_low_pfn)
+		highstart_pfn = system_max_low_pfn;
+	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+	       pages_to_mb(highend_pfn - highstart_pfn));
 #endif
+	system_max_low_pfn = max_low_pfn = max_low_pfn - reserve_pages;
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
 			pages_to_mb(system_max_low_pfn));
-	
-	for (nid = 0; nid < numnodes; nid++)
+	printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", 
+			min_low_pfn, max_low_pfn, highstart_pfn);
+
+	printk("Low memory ends at vaddr %08lx\n",
+			(ulong) pfn_to_kaddr(max_low_pfn));
+	for (nid = 0; nid < numnodes; nid++) {
 		allocate_pgdat(nid);
+		node_remap_start_vaddr[nid] = pfn_to_kaddr(
+			highstart_pfn - node_remap_offset[nid]);
+		printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+			(ulong) node_remap_start_vaddr[nid],
+			(ulong) pfn_to_kaddr(highstart_pfn
+			    - node_remap_offset[nid] + node_remap_size[nid]));
+	}
+	printk("High memory starts at vaddr %08lx\n",
+			(ulong) pfn_to_kaddr(highstart_pfn));
 	for (nid = 0; nid < numnodes; nid++)
 		find_max_pfn_node(nid);
 
@@ -244,7 +306,18 @@
 #endif
 			}
 		}
-		free_area_init_node(nid, NODE_DATA(nid), 0, zones_size, start, 0);
+		/*
+		 * We let the lmem_map for node 0 be allocated from the
+		 * normal bootmem allocator, but other nodes come from the
+		 * remapped KVA area - mbligh
+		 */
+		if (nid)
+			free_area_init_node(nid, NODE_DATA(nid), 
+				node_remap_start_vaddr[nid], zones_size, 
+				start, 0);
+		else
+			free_area_init_node(nid, NODE_DATA(nid), 0, 
+				zones_size, start, 0);
 	}
 	return;
 }
diff -urN -X /home/mbligh/.diff.exclude 11-numafixes2/arch/i386/mm/init.c 20-numamap/arch/i386/mm/init.c
--- 11-numafixes2/arch/i386/mm/init.c	Wed Sep 18 20:41:11 2002
+++ 20-numamap/arch/i386/mm/init.c	Thu Sep 19 16:07:10 2002
@@ -245,6 +245,12 @@
 
 unsigned long __PAGE_KERNEL = _PAGE_KERNEL;
 
+#ifndef CONFIG_DISCONTIGMEM
+#define remap_numa_kva() do {} while (0)
+#else
+extern void __init remap_numa_kva(void);
+#endif
+
 static void __init pagetable_init (void)
 {
 	unsigned long vaddr;
@@ -269,6 +275,7 @@
 	}
 
 	kernel_physical_mapping_init(pgd_base);
+	remap_numa_kva();
 
 	/*
 	 * Fixed mappings, only the page table structure has to be
@@ -449,7 +456,11 @@
 
 	set_max_mapnr_init();
 
+#ifdef CONFIG_HIGHMEM
+	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE);
+#else
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+#endif
 
 	/* clear the zero-page */
 	memset(empty_zero_page, 0, PAGE_SIZE);
diff -urN -X /home/mbligh/.diff.exclude 11-numafixes2/arch/i386/mm/pgtable.c 20-numamap/arch/i386/mm/pgtable.c
--- 11-numafixes2/arch/i386/mm/pgtable.c	Wed Sep 18 20:41:11 2002
+++ 20-numamap/arch/i386/mm/pgtable.c	Thu Sep 19 16:07:10 2002
@@ -84,6 +84,39 @@
 	__flush_tlb_one(vaddr);
 }
 
+/*
+ * Associate a large virtual page frame with a given physical page frame 
+ * and protection flags for that frame. pfn is for the base of the page,
+ * vaddr is what the page gets mapped to - both must be properly aligned. 
+ * The pmd must already be instantiated. Assumes PAE mode.
+ */ 
+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+	pgd_t *pgd;
+	pmd_t *pmd;
+
+	if (vaddr & (PMD_SIZE-1)) {		/* vaddr is misaligned */
+		printk ("set_pmd_pfn: vaddr misaligned\n");
+		return; /* BUG(); */
+	}
+	if (pfn & (PTRS_PER_PTE-1)) {		/* pfn is misaligned */
+		printk ("set_pmd_pfn: pfn misaligned\n");
+		return; /* BUG(); */
+	}
+	pgd = swapper_pg_dir + __pgd_offset(vaddr);
+	if (pgd_none(*pgd)) {
+		printk ("set_pmd_pfn: pgd_none\n");
+		return; /* BUG(); */
+	}
+	pmd = pmd_offset(pgd, vaddr);
+	set_pmd(pmd, pfn_pmd(pfn, flags));
+	/*
+	 * It's enough to flush this one mapping.
+	 * (PGE mappings get flushed as well)
+	 */
+	__flush_tlb_one(vaddr);
+}
+
 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
 {
 	unsigned long address = __fix_to_virt(idx);
diff -urN -X /home/mbligh/.diff.exclude 11-numafixes2/include/asm-i386/page.h 20-numamap/include/asm-i386/page.h
--- 11-numafixes2/include/asm-i386/page.h	Wed Sep 18 20:41:12 2002
+++ 20-numamap/include/asm-i386/page.h	Thu Sep 19 16:07:10 2002
@@ -142,6 +142,7 @@
 #define MAXMEM			((unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE))
 #define __pa(x)			((unsigned long)(x)-PAGE_OFFSET)
 #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
+#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
 #ifndef CONFIG_DISCONTIGMEM
 #define pfn_to_page(pfn)	(mem_map + (pfn))
 #define page_to_pfn(page)	((unsigned long)((page) - mem_map))
