* [PATCH] relocate lmem_maps for i386 discontigmem onto their own nodes
@ 2002-09-21 21:55 Martin J. Bligh
0 siblings, 0 replies; only message in thread
From: Martin J. Bligh @ 2002-09-21 21:55 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-mm mailing list
This patch remaps the lmem_map (struct page) arrays for each node
onto their own nodes. This is non-trivial, since all of ZONE_NORMAL,
and hence all permanently mapped KVA, resides on node 0.
Very early in the boot sequence, it calculates the size of the
lmem_map arrays (rounding up to the next large-page boundary)
and reserves a suitable amount of permanent KVA by shifting down
max_low_pfn, creating a gap between max_low_pfn and highstart_pfn
(both of which normally sit at about 896MB).
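For illustration, here is a minimal user-space sketch of that size
calculation; the 4KB page size, PAE-style 512 PTEs per pmd, 4GB node
and 44-byte struct page are assumed values for the example, not taken
from the patch:

#include <stdio.h>

/* Illustrative values: 4KB base pages, 512 PTEs per pmd (PAE) => 2MB large pages */
#define PAGE_SIZE        4096UL
#define PTRS_PER_PTE     512UL
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)

/*
 * Round one node's lmem_map up to whole large pages and return the number
 * of 4KB pages of KVA to reserve for it - the same arithmetic as
 * calculate_numa_remap_pages() in the patch below.  struct_page_size
 * stands in for sizeof(struct page), which varies by kernel version.
 */
static unsigned long lmem_map_reserve_pages(unsigned long node_pages,
                                            unsigned long struct_page_size)
{
        unsigned long bytes = node_pages * struct_page_size;
        unsigned long large = (bytes + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;

        return large * PTRS_PER_PTE;    /* back to 4KB page units */
}

int main(void)
{
        /* e.g. a 4GB node (1M pages): prints "11264 pages of KVA", i.e. 44MB */
        printf("%lu pages of KVA\n", lmem_map_reserve_pages(1UL << 20, 44));
        return 0;
}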
It then uses the new set_pmd_pfn function to set up the pmds so that
the large pages point at the physical memory reserved at the top of
each remote node.
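To make the stride concrete, here is a small user-space sketch (with
made-up start addresses) of the vaddr/pfn pairs the remap loop hands
to set_pmd_pfn, one pmd-sized step at a time:

#include <stdio.h>

#define PAGE_SHIFT   12
#define PTRS_PER_PTE 512UL      /* PAE: one pmd maps 512 * 4KB = 2MB */

/*
 * Walk a node's reserved KVA in pmd-sized (2MB) steps and show which
 * physical frame each step would be pointed at.  The start values are
 * made-up examples, chosen only to be 2MB-aligned.
 */
int main(void)
{
        unsigned long start_vaddr = 0xf7800000UL;   /* assumed KVA base        */
        unsigned long start_pfn   = 0x3fa00UL;      /* assumed remote-node pfn */
        unsigned long size_pages  = 2048;           /* 8MB worth of lmem_map   */
        unsigned long pfn;

        for (pfn = 0; pfn < size_pages; pfn += PTRS_PER_PTE)
                printf("set_pmd_pfn(vaddr=%08lx, pfn=%05lx)\n",
                       start_vaddr + (pfn << PAGE_SHIFT),
                       start_pfn + pfn);
        return 0;
}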
Tested on NUMA-Q and a ratty old i386 PC kicking around under
my desk (on 2.5.36-mm1). It was good for a 20% improvement in system
time on a kernel compile when I first benchmarked it (against
2.5.32 or so), due to reduced inter-node traffic and better
interconnect cache usage and locality. It should have no effect
on any system other than i386 NUMA machines.
M.
diff -urN -X /home/mbligh/.diff.exclude 11-numafixes2/arch/i386/mm/discontig.c 20-numamap/arch/i386/mm/discontig.c
--- 11-numafixes2/arch/i386/mm/discontig.c Wed Sep 18 20:41:11 2002
+++ 20-numamap/arch/i386/mm/discontig.c Thu Sep 19 16:07:10 2002
@@ -1,5 +1,6 @@
/*
- * Written by: Patricia Gaughen, IBM Corporation
+ * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
+ * August 2002: added remote node KVA remap - Martin J. Bligh
*
* Copyright (C) 2002, IBM Corp.
*
@@ -19,8 +20,6 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <gone@us.ibm.com>
*/
#include <linux/config.h>
@@ -113,35 +112,98 @@
}
}
+#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
+
+unsigned long node_remap_start_pfn[MAX_NUMNODES];
+unsigned long node_remap_size[MAX_NUMNODES];
+unsigned long node_remap_offset[MAX_NUMNODES];
+void *node_remap_start_vaddr[MAX_NUMNODES];
+extern void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+
+void __init remap_numa_kva(void)
+{
+ void *vaddr;
+ unsigned long pfn;
+ int node;
+
+ for (node = 1; node < numnodes; ++node) {
+ for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
+ vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+ set_pmd_pfn((ulong) vaddr,
+ node_remap_start_pfn[node] + pfn,
+ PAGE_KERNEL_LARGE);
+ }
+ }
+}
+
+static unsigned long calculate_numa_remap_pages(void)
+{
+ int nid;
+ unsigned long size, reserve_pages = 0;
+
+ for (nid = 1; nid < numnodes; nid++) {
+ /* calculate the size of the mem_map needed in bytes */
+ size = (node_end_pfn[nid] - node_start_pfn[nid] + 1)
+ * sizeof(struct page);
+ /* convert size to large (pmd size) pages, rounding up */
+ size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
+ /* now the roundup is correct, convert to PAGE_SIZE pages */
+ size = size * PTRS_PER_PTE;
+ printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
+ size, nid);
+ node_remap_size[nid] = size;
+ reserve_pages += size;
+ node_remap_offset[nid] = reserve_pages;
+ printk("Shrinking node %d from %ld pages to %ld pages\n",
+
nid, node_end_pfn[nid], node_end_pfn[nid] - size);
+ node_end_pfn[nid] -= size;
+ node_remap_start_pfn[nid] = node_end_pfn[nid];
+ }
+ printk("Reserving total of %ld pages for numa KVA remap\n",
+ reserve_pages);
+ return reserve_pages;
+}
+
unsigned long __init setup_memory(void)
{
int nid;
unsigned long bootmap_size, system_start_pfn, system_max_low_pfn;
+ unsigned long reserve_pages;
get_memcfg_numa();
+ reserve_pages = calculate_numa_remap_pages();
- /*
- * partially used pages are not usable - thus
- * we are rounding upwards:
- */
+ /* partially used pages are not usable - thus round upwards */
system_start_pfn = min_low_pfn = PFN_UP(__pa(&_end));
find_max_pfn();
system_max_low_pfn = max_low_pfn = find_max_low_pfn();
-
#ifdef CONFIG_HIGHMEM
- highstart_pfn = highend_pfn = max_pfn;
- if (max_pfn > system_max_low_pfn) {
- highstart_pfn = system_max_low_pfn;
- }
- printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
- pages_to_mb(highend_pfn - highstart_pfn));
+ highstart_pfn = highend_pfn = max_pfn;
+ if (max_pfn > system_max_low_pfn)
+ highstart_pfn = system_max_low_pfn;
+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+ pages_to_mb(highend_pfn - highstart_pfn));
#endif
+ system_max_low_pfn = max_low_pfn = max_low_pfn - reserve_pages;
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(system_max_low_pfn));
-
- for (nid = 0; nid < numnodes; nid++)
+ printk("min_low_pfn = %ld, max_low_pfn = %ld,
highstart_pfn = %ld\n",
+ min_low_pfn, max_low_pfn, highstart_pfn);
+
+ printk("Low memory ends at vaddr %08lx\n",
+ (ulong) pfn_to_kaddr(max_low_pfn));
+ for (nid = 0; nid < numnodes; nid++) {
allocate_pgdat(nid);
+ node_remap_start_vaddr[nid] = pfn_to_kaddr(
+ highstart_pfn - node_remap_offset[nid]);
+ printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+ (ulong) node_remap_start_vaddr[nid],
+ (ulong) pfn_to_kaddr(highstart_pfn
+ - node_remap_offset[nid] + node_remap_size[nid]));
+ }
+ printk("High memory starts at vaddr %08lx\n",
+ (ulong) pfn_to_kaddr(highstart_pfn));
for (nid = 0; nid < numnodes; nid++)
find_max_pfn_node(nid);
@@ -244,7 +306,18 @@
#endif
}
}
- free_area_init_node(nid, NODE_DATA(nid), 0, zones_size, start, 0);
+ /*
+ * We let the lmem_map for node 0 be allocated from the
+ * normal bootmem allocator, but other nodes come from the
+ * remapped KVA area - mbligh
+ */
+ if (nid)
+ free_area_init_node(nid, NODE_DATA(nid),
+ node_remap_start_vaddr[nid], zones_size,
+ start, 0);
+ else
+ free_area_init_node(nid, NODE_DATA(nid), 0,
+ zones_size, start, 0);
}
return;
}
diff -urN -X /home/mbligh/.diff.exclude 11-numafixes2/arch/i386/mm/init.c 20-numamap/arch/i386/mm/init.c
--- 11-numafixes2/arch/i386/mm/init.c Wed Sep 18 20:41:11 2002
+++ 20-numamap/arch/i386/mm/init.c Thu Sep 19 16:07:10 2002
@@ -245,6 +245,12 @@
unsigned long __PAGE_KERNEL = _PAGE_KERNEL;
+#ifndef CONFIG_DISCONTIGMEM
+#define remap_numa_kva() do {} while (0)
+#else
+extern void __init remap_numa_kva(void);
+#endif
+
static void __init pagetable_init (void)
{
unsigned long vaddr;
@@ -269,6 +275,7 @@
}
kernel_physical_mapping_init(pgd_base);
+ remap_numa_kva();
/*
* Fixed mappings, only the page table structure has to be
@@ -449,7 +456,11 @@
set_max_mapnr_init();
+#ifdef CONFIG_HIGHMEM
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE);
+#else
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+#endif
/* clear the zero-page */
memset(empty_zero_page, 0, PAGE_SIZE);
diff -urN -X /home/mbligh/.diff.exclude 11-numafixes2/arch/i386/mm/pgtable.c 20-numamap/arch/i386/mm/pgtable.c
--- 11-numafixes2/arch/i386/mm/pgtable.c Wed Sep 18 20:41:11 2002
+++ 20-numamap/arch/i386/mm/pgtable.c Thu Sep 19 16:07:10 2002
@@ -84,6 +84,39 @@
__flush_tlb_one(vaddr);
}
+/*
+ * Associate a large virtual page frame with a given physical page frame
+ * and protection flags for that frame. pfn is for the base of the page,
+ * vaddr is what the page gets mapped to - both must be properly aligned.
+ * The pmd must already be instantiated. Assumes PAE mode.
+ */
+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+ pgd_t *pgd;
+ pmd_t *pmd;
+
+ if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
+ printk ("set_pmd_pfn: vaddr misaligned\n");
+ return; /* BUG(); */
+ }
+ if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
+ printk ("set_pmd_pfn: pfn
misaligned\n");
+ return; /* BUG(); */
+ }
+ pgd = swapper_pg_dir + __pgd_offset(vaddr);
+ if (pgd_none(*pgd)) {
+ printk ("set_pmd_pfn: pgd_none\n");
+ return; /* BUG(); */
+ }
+ pmd = pmd_offset(pgd, vaddr);
+ set_pmd(pmd, pfn_pmd(pfn, flags));
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
unsigned long address = __fix_to_virt(idx);
diff -urN -X /home/mbligh/.diff.exclude 11-numafixes2/include/asm-i386/page.h 20-numamap/include/asm-i386/page.h
--- 11-numafixes2/include/asm-i386/page.h Wed Sep 18 20:41:12 2002
+++ 20-numamap/include/asm-i386/page.h Thu Sep 19 16:07:10 2002
@@ -142,6 +142,7 @@
#define MAXMEM ((unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE))
#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET)
#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
+#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
#ifndef CONFIG_DISCONTIGMEM
#define pfn_to_page(pfn) (mem_map + (pfn))
#define page_to_pfn(page) ((unsigned long)((page) - mem_map))