[PATCH 03/05] x86_64: NUMA emulation

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Magnus Damm <magnus@valinux.co.jp>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Magnus Damm <magnus@valinux.co.jp>, pj@sgi.com, ak@suse.de
Subject: [PATCH 03/05] x86_64: NUMA emulation
Date: Thu, 10 Nov 2005 18:08:18 +0900 (JST)	[thread overview]
Message-ID: <20051110090936.8083.13572.sendpatchset@cherry.local> (raw)
In-Reply-To: <20051110090920.8083.54147.sendpatchset@cherry.local>

Improve the x86_64 NUMA emulation code to use the generic NUMA emulation code.

This patch replaces the current x86_64 CONFIG_NUMA_EMU code with a more
advanced implementation that uses the generic NUMA emulation code. Today the
x86_64 NUMA emulation can only split a single-node system into emulated nodes;
this implementation supports both single-node systems and larger systems
with multiple NUMA nodes. With the patch, each real NUMA node is divided
into several smaller nodes during boot if requested on the kernel command line.

Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
---

 arch/x86_64/Kconfig           |   11 +--
 arch/x86_64/mm/numa.c         |  119 +++++++++++++++++++++---------------------
 include/asm-x86_64/numa.h     |    1
 include/asm-x86_64/numnodes.h |    6 +-
 4 files changed, 68 insertions(+), 69 deletions(-)

--- from-0005/arch/x86_64/Kconfig
+++ to-work/arch/x86_64/Kconfig	2005-11-09 11:50:03.000000000 +0900
@@ -258,14 +258,6 @@ config X86_64_ACPI_NUMA
        help
 	 Enable ACPI SRAT based node topology detection.
 
-config NUMA_EMU
-	bool "NUMA emulation"
-	depends on NUMA
-	help
-	  Enable NUMA emulation. A flat machine will be split
-	  into virtual nodes when booted with "numa=fake=N", where N is the
-	  number of nodes. This is only useful for debugging.
-
 config ARCH_DISCONTIGMEM_ENABLE
        bool
        depends on NUMA
@@ -288,6 +280,9 @@ config ARCH_FLATMEM_ENABLE
 	def_bool y
 	depends on !NUMA
 
+config ARCH_NUMA_EMU_ENABLE
+	def_bool y
+
 source "mm/Kconfig"
 
 config HAVE_ARCH_EARLY_PFN_TO_NID
--- from-0006/arch/x86_64/mm/numa.c
+++ to-work/arch/x86_64/mm/numa.c	2005-11-09 17:50:54.000000000 +0900
@@ -48,15 +48,14 @@ int numa_off __initdata;
  * 0 if memnodmap[] too small (of shift too small)
  * -1 if node overlap or lost ram (shift too big)
  */
-static int __init populate_memnodemap(
-	const struct node *nodes, int numnodes, int shift)
+static int __init populate_memnodemap(const struct node *nodes, int shift)
 {
 	int i; 
 	int res = -1;
 	unsigned long addr, end;
 
 	memset(memnodemap, 0xff, sizeof(memnodemap));
-	for (i = 0; i < numnodes; i++) {
+	for_each_online_node(i) {
 		addr = nodes[i].start;
 		end = nodes[i].end;
 		if (addr >= end)
@@ -74,17 +73,17 @@ static int __init populate_memnodemap(
 	return res;
 }
 
-int __init compute_hash_shift(struct node *nodes, int numnodes)
+int __init compute_hash_shift(struct node *nodes)
 {
 	int shift = 20;
 
-	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
+	while (populate_memnodemap(nodes, shift + 1) >= 0)
 		shift++;
 
 	printk(KERN_DEBUG "Using %d for the hash shift.\n",
 		shift);
 
-	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+	if (populate_memnodemap(nodes, shift) != 1) {
 		printk(KERN_INFO
 	"Your memory is not aligned you need to rebuild your kernel "
 	"with a bigger NODEMAPSIZE shift=%d\n",
@@ -186,36 +185,53 @@ void __init numa_init_array(void)
 }
 
 #ifdef CONFIG_NUMA_EMU
-int numa_fake __initdata = 0;
-
-/* Numa emulation */
-static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+void __init numa_emu_setup_nid(int real_nid)
 {
- 	int i;
- 	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+	unsigned long start_pfn, end_pfn;
+	int real_max = 1 << NODES_SHIFT_HW;
+	int nid, new_nodes;
+
+	if (real_nid >= real_max)
+		return;
+
+	/* setup emulated nodes */
+
+	new_nodes = 0;
+
+	for (nid = real_nid + real_max; nid < MAX_NUMNODES; nid += real_max) { 
+		if (numa_emu_new(nid, nodes[real_nid].start >> PAGE_SHIFT, 
+				 nodes[real_nid].end >> PAGE_SHIFT,
+				 &start_pfn, &end_pfn) != 0)
+		     break;
+
+		nodes[nid].start = start_pfn << PAGE_SHIFT;
+		nodes[nid].end = end_pfn << PAGE_SHIFT;
+		new_nodes++;
+	}
+
+	if (!new_nodes)
+		return;
+
+	/* shrink real node */
+
+	if (numa_emu_shrink(real_nid, new_nodes,
+			    nodes[real_nid].start >> PAGE_SHIFT, 
+			    nodes[real_nid].end >> PAGE_SHIFT,
+			    &start_pfn, &end_pfn) != 0)
+		return;
+
+	nodes[real_nid].start = start_pfn << PAGE_SHIFT;
+	nodes[real_nid].end = end_pfn << PAGE_SHIFT;
+
+	/* set emulated nodes online */
+
+	for (nid = real_nid + real_max; nid < MAX_NUMNODES; nid += real_max) { 
+		node_set_online(nid);
+
+		if (!--new_nodes)
+			break;
+	}
 
- 	/* Kludge needed for the hash function */
- 	if (hweight64(sz) > 1) {
- 		unsigned long x = 1;
- 		while ((x << 1) < sz)
- 			x <<= 1;
- 		if (x < sz/2)
- 			printk("Numa emulation unbalanced. Complain to maintainer\n");
- 		sz = x;
- 	}
-
- 	for (i = 0; i < numa_fake; i++) {
- 		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
- 		if (i == numa_fake-1)
- 			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
- 		nodes[i].end = nodes[i].start + sz;
- 		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
- 		       i,
- 		       nodes[i].start, nodes[i].end,
- 		       (nodes[i].end - nodes[i].start) >> 20);
-		node_set_online(i);
- 	}
- 	return 0;
 }
 #endif
 
@@ -223,16 +239,10 @@ void __init numa_initmem_doinit(unsigned
 { 
 	int i;
 
+#ifdef CONFIG_ACPI_NUMA
+	nodes_clear(node_online_map);
 	memset(&nodes,0,sizeof(nodes)); 
 
-#ifdef CONFIG_NUMA_EMU
-	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
- 		return;
-#endif
-
-	memset(&nodes,0,sizeof(nodes)); 
-
-#ifdef CONFIG_ACPI_NUMA
 	/*
 	 * Parse SRAT to discover nodes.
 	 */
@@ -243,28 +253,26 @@ void __init numa_initmem_doinit(unsigned
  		return;
 #endif
 
+#ifdef CONFIG_K8_NUMA
+	nodes_clear(node_online_map);
 	memset(&nodes,0,sizeof(nodes)); 
 
-#ifdef CONFIG_K8_NUMA
 	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
 		return;
 #endif
 
+	nodes_clear(node_online_map);
 	memset(&nodes,0,sizeof(nodes)); 
 
 	printk(KERN_INFO "%s\n",
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
 
-	printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 
+	printk(KERN_INFO "Single node at %016lx-%016lx\n", 
 	       start_pfn << PAGE_SHIFT,
 	       end_pfn << PAGE_SHIFT); 
 		/* setup dummy node covering all memory */ 
-	memnodemap[0] = 0;
-	nodes_clear(node_online_map);
+
 	node_set_online(0);
-	for (i = 0; i < NR_CPUS; i++)
-		numa_set_node(i, 0);
-	node_to_cpumask[0] = cpumask_of_cpu(0);
 	nodes[0].start = start_pfn << PAGE_SHIFT;
 	nodes[0].end = end_pfn << PAGE_SHIFT;
 }
@@ -275,7 +283,10 @@ void __init numa_initmem_init(unsigned l
 
 	numa_initmem_doinit(start_pfn, end_pfn);
 
-	memnode_shift = compute_hash_shift(nodes, num_online_nodes());
+	for_each_online_node(i)
+		numa_emu_setup_nid(i);
+
+	memnode_shift = compute_hash_shift(nodes);
 	if (memnode_shift < 0) { 
 		printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 
 		return; 
@@ -321,13 +332,7 @@ __init int numa_setup(char *opt) 
 { 
 	if (!strncmp(opt,"off",3))
 		numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
-	if(!strncmp(opt, "fake=", 5)) {
-		numa_fake = simple_strtoul(opt+5,NULL,0); ;
-		if (numa_fake >= MAX_NUMNODES)
-			numa_fake = MAX_NUMNODES;
-	}
-#endif
+        numa_emu_setup(opt);
 #ifdef CONFIG_ACPI_NUMA
  	if (!strncmp(opt,"noacpi",6))
  		acpi_numa = -1;
--- from-0002/include/asm-x86_64/numa.h
+++ to-work/include/asm-x86_64/numa.h	2005-11-09 11:50:03.000000000 +0900
@@ -8,7 +8,6 @@ struct node { 
 	u64 start,end; 
 };
 
-extern int compute_hash_shift(struct node *nodes, int numnodes);
 extern int pxm_to_node(int nid);
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
--- from-0001/include/asm-x86_64/numnodes.h
+++ to-work/include/asm-x86_64/numnodes.h	2005-11-09 11:50:03.000000000 +0900
@@ -3,10 +3,10 @@
 
 #include <linux/config.h>
 
-#ifdef CONFIG_NUMA
-#define NODES_SHIFT	6
+#if defined(CONFIG_K8_NUMA) || defined(CONFIG_ACPI_NUMA)
+#define NODES_SHIFT_HW	3
 #else
-#define NODES_SHIFT	0
+#define NODES_SHIFT_HW	0
 #endif
 
 #endif

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

next prev parent reply	other threads:[~2005-11-10  9:08 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-11-10  9:08 [PATCH 00/05][RFC] NUMA emulation update Magnus Damm
2005-11-10  9:08 ` [PATCH 01/05] NUMA: Generic code Magnus Damm
2005-11-11  4:16   ` Andi Kleen
2005-11-15  8:34     ` Magnus Damm
2005-11-15 14:15       ` Andi Kleen
2005-11-16  5:22         ` Magnus Damm
2005-11-16  7:48           ` Andi Kleen
2005-11-16  7:57             ` Magnus Damm
2005-11-16  8:38               ` Andi Kleen
2005-11-16 11:31               ` Werner Almesberger
2005-11-10  9:08 ` [PATCH 02/05] x86_64: NUMA cleanup Magnus Damm
2005-11-10  9:08 ` Magnus Damm [this message]
2005-11-10  9:08 ` [PATCH 04/05] x86_64: NUMA without SMP Magnus Damm
2005-11-10  9:08 ` [PATCH 05/05] NUMA: find_next_best_node fix Magnus Damm

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20051110090936.8083.13572.sendpatchset@cherry.local \
    --to=magnus@valinux.co.jp \
    --cc=ak@suse.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=pj@sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.