linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Magnus Damm <magnus@valinux.co.jp>
From: Isaku Yamahata <yamahata@valinux.co.jp>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Cc: Magnus Damm <magnus@valinux.co.jp>
Subject: [PATCH 07/07] i386: numa emulation on pc
Date: Fri, 30 Sep 2005 16:33:51 +0900 (JST)	[thread overview]
Message-ID: <20050930073308.10631.24247.sendpatchset@cherry.local> (raw)
In-Reply-To: <20050930073232.10631.63786.sendpatchset@cherry.local>

This patch adds NUMA emulation for i386 on top of the fixes for sparsemem and
discontigmem. NUMA emulation already exists for x86_64, and this patch adds
the same feature using the same config option CONFIG_NUMA_EMU. The kernel
command line option used is also the same as for x86_64.

Pass "numa=fake=N" to the kernel where N is the number of nodes to emulate.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Signed-off-by: Magnus Damm <magnus@valinux.co.jp>
---

 arch/i386/Kconfig           |   20 +++++++-
 arch/i386/kernel/setup.c    |   34 +++++++++-----
 arch/i386/mm/numa.c         |  100 ++++++++++++++++++++++++++++++++++++++++++++
 include/asm-i386/mmzone.h   |    7 +++
 include/asm-i386/numnodes.h |    2 +-
 5 files changed, 145 insertions(+), 18 deletions(-)

--- from-0009/arch/i386/Kconfig
+++ to-work/arch/i386/Kconfig	2005-09-30 13:31:13.000000000 +0900
@@ -134,7 +134,7 @@ endchoice
 config ACPI_SRAT
 	bool
 	default y
-	depends on NUMA && (X86_SUMMIT || X86_GENERICARCH)
+	depends on NUMA && (X86_SUMMIT || X86_GENERICARCH || NUMA_EMU)
 
 config X86_SUMMIT_NUMA
 	bool
@@ -756,12 +756,21 @@ config X86_PAE
 	depends on HIGHMEM64G
 	default y
 
+config NUMA_EMU
+	bool "Numa Memory Nodes Emulation"
+	depends on X86_PC
+	default n
+	help
+	  Enable NUMA emulation. A regular single-node PC machine will be
+	  split into virtual nodes when booted with "numa=fake=N", where
+	  N is the number of nodes.
+
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support"
-	depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI))
+	depends on (NUMA_EMU && ACPI && HIGHMEM) || (SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)))
 	default n if X86_PC
-	default y if (X86_NUMAQ || X86_SUMMIT)
+	default y if (X86_NUMAQ || X86_SUMMIT || NUMA_EMU)
 
 # Need comments to help the hapless user trying to turn on NUMA support
 comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support"
@@ -770,6 +779,9 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
 comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
 	depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI)
 
+comment "NUMA (Emulation on PC) requires highmem support and ACPI"
+	depends on X86_PC && (!HIGHMEM || !ACPI)
+
 config HAVE_ARCH_BOOTMEM_NODE
 	bool
 	depends on NUMA
@@ -916,7 +928,7 @@ config IRQBALANCE
 # Summit needs it only when NUMA is on
 config BOOT_IOREMAP
 	bool
-	depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI))
+	depends on (((X86_SUMMIT || X86_GENERICARCH || NUMA_EMU) && NUMA) || (X86 && EFI))
 	default y
 
 config REGPARM
--- from-0008/arch/i386/kernel/setup.c
+++ to-work/arch/i386/kernel/setup.c	2005-09-28 17:49:53.000000000 +0900
@@ -931,6 +931,13 @@ static void __init parse_cmdline_early (
 			elfcorehdr_addr = memparse(from+11, &from);
 #endif
 
+#ifdef CONFIG_NUMA_EMU
+		// virtual numa setup
+		else if (!memcmp(from, "numa=", 5)) {
+			extern void numa_setup(char*, char**);
+			numa_setup(from+5, &from);
+		}
+#endif
 		/*
 		 * highmem=size forces highmem to be exactly 'size' bytes.
 		 * This works even on boxes that have no highmem otherwise.
@@ -1211,26 +1218,22 @@ static inline unsigned long  nid_size_pa
 {
 	return node_end_pfn[nid] - node_start_pfn[nid];
 }
-static inline int nid_starts_in_highmem(int nid)
-{
-	return node_start_pfn[nid] >= max_low_pfn;
-}
-
 void __init nid_zone_sizes_init(int nid)
 {
 	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
-	unsigned long max_dma;
+	unsigned long max_dma = min(max_hardware_dma_pfn(), max_low_pfn);
 	unsigned long start = node_start_pfn[nid];
 	unsigned long end = node_end_pfn[nid];
 
 	if (node_has_online_mem(nid)){
-		if (nid_starts_in_highmem(nid)) {
-			zones_size[ZONE_HIGHMEM] = nid_size_pages(nid);
-		} else {
-			max_dma = min(max_hardware_dma_pfn(), max_low_pfn);
-			zones_size[ZONE_DMA] = max_dma;
-			zones_size[ZONE_NORMAL] = max_low_pfn - max_dma;
-			zones_size[ZONE_HIGHMEM] = end - max_low_pfn;
+		if (start < max_dma) {
+			zones_size[ZONE_DMA] = min(end, max_dma) - start;
+		}
+		if (start < max_low_pfn && max_dma < end) {
+			zones_size[ZONE_NORMAL] = min(end, max_low_pfn) - max(start, max_dma);
+		}
+		if (max_low_pfn <= end) {
+			zones_size[ZONE_HIGHMEM] = end - max(start, max_low_pfn);
 		}
 	}
 
@@ -1270,7 +1273,12 @@ void __init setup_bootmem_allocator(void
 	/*
 	 * Initialize the boot-time allocator (with low memory only):
 	 */
+#ifdef CONFIG_NUMA_EMU
+	bootmap_size = init_bootmem(max(min_low_pfn, node_start_pfn[0]),
+				    min(max_low_pfn, node_end_pfn[0]));
+#else
 	bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
+#endif
 
 	register_bootmem_low_pages(max_low_pfn);
 
--- from-0006/arch/i386/mm/numa.c
+++ to-work/arch/i386/mm/numa.c	2005-09-28 17:49:53.000000000 +0900
@@ -165,3 +165,103 @@ int early_pfn_to_nid(unsigned long pfn)
 
 	return 0;
 }
+
+#ifdef CONFIG_NUMA_EMU
+int numa_fake __initdata = 0;
+
+extern unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
+extern unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
+
+int
+get_memcfg_numa_emu(void)
+{
+	unsigned long node_size;
+	unsigned long shift;
+	int i;
+	
+	if (numa_fake == 0)
+		return 0;
+	node_size = max_pfn / numa_fake;
+	if (node_size == 0)
+		return 0;
+	
+	printk("NUMA - single node, flat memory mode, broken into %d nodes\n",
+	       numa_fake);
+	shift = 1;
+	while ((1 << shift) < node_size) {
+		shift++;
+	}
+	node_size = 1 << shift;
+	if (node_size * PAGE_SIZE < (1UL << SECTION_SIZE_BITS)) {
+		printk("node_size %ld is too small.(it must be >= %ld)\n",
+		       node_size * PAGE_SIZE, (1UL << SECTION_SIZE_BITS));
+		printk("consider descreas # of nodes "
+		       "(or decreas SECTIONS_SIZE_BITS %d)\n",
+		       SECTION_SIZE_BITS);
+		printk("kernel will panic!\n");
+		// Don't panic here.
+		// Early printk is not enabled yet at this point, so this
+		// message would not be shown if we panicked right here.
+		// Let the kernel go on, print this message and then panic.
+	}
+	printk("block size %ld shift %ld\n", node_size, shift);
+
+        nodes_clear(node_online_map);
+	for (i = 0; i < numa_fake; i++) {
+		unsigned long size;
+		unsigned long pfn;
+		node_start_pfn[i] = node_size * i;
+		node_end_pfn[i] = min(node_start_pfn[i] + node_size, max_pfn);
+
+		node_remap_size[i] = node_memmap_size_bytes(i,
+							    node_start_pfn[i],
+							    node_end_pfn[i]);
+
+		//XXX see calculate_numa_remap_pages()
+		size = node_remap_size[i] + sizeof(pg_data_t);
+		size = (size + PMD_SIZE - 1) / PMD_SIZE;
+		size = size * PTRS_PER_PTE;
+		for (pfn = node_end_pfn[i] - size;
+		     pfn < node_end_pfn[i]; pfn++)
+			if (!page_is_ram(pfn))
+				break;
+		if (pfn != node_end_pfn[i])
+			size = 0;
+		if (node_end_pfn[i] & (PTRS_PER_PTE - 1)) {
+			size += node_end_pfn[i] & (PTRS_PER_PTE - 1);
+		}
+		
+		if (node_start_pfn[i] + size >= node_end_pfn[i]) {
+			printk("last memory segment %d has too few pages "
+			       "%ld = %ld - %ld\n",
+			       i, 
+			       node_end_pfn[i] - node_start_pfn[i],
+			       node_start_pfn[i],
+			       node_end_pfn[i]);
+			node_start_pfn[i] = 0;
+			node_end_pfn[i] = 0;
+			node_remap_size[i] = 0;
+			break;
+		} else {
+			node_set_online(i);
+			memory_present(i, node_start_pfn[i], node_end_pfn[i]);
+		}
+	}
+	printk("total %d blocks, max %ld\n", i, max_pfn);
+	return 1;
+}
+#endif
+
+void __init
+numa_setup(char* opt, char** retptr)
+{
+#ifdef CONFIG_NUMA_EMU
+	if (!memcmp(opt, "fake=", 5) && (*(opt + 5))) {
+		numa_fake = simple_strtoul(opt + 5, retptr, 0);
+		numa_fake = min(numa_fake, MAX_NUMNODES);
+		printk("fake numa nodes = %d/%d\n", numa_fake, MAX_NUMNODES);
+	} else {
+		*retptr = opt;
+	}
+#endif
+}
--- from-0009/include/asm-i386/mmzone.h
+++ to-work/include/asm-i386/mmzone.h	2005-09-30 13:53:35.000000000 +0900
@@ -18,6 +18,9 @@ extern struct pglist_data *node_data[];
 	#include <asm/srat.h>
 #endif
 
+#ifdef CONFIG_NUMA_EMU
+extern int get_memcfg_numa_emu(void);
+#endif
 extern int get_memcfg_numa_flat(void );
 /*
  * This allows any one NUMA architecture to be compiled
@@ -33,6 +36,10 @@ static inline void get_memcfg_numa(void)
 	if (get_memcfg_from_srat())
 		return;
 #endif
+#ifdef CONFIG_NUMA_EMU
+	if (get_memcfg_numa_emu())
+		return;
+#endif
 
 	get_memcfg_numa_flat();
 }
--- from-0001/include/asm-i386/numnodes.h
+++ to-work/include/asm-i386/numnodes.h	2005-09-28 17:49:53.000000000 +0900
@@ -8,7 +8,7 @@
 /* Max 16 Nodes */
 #define NODES_SHIFT	4
 
-#elif defined(CONFIG_ACPI_SRAT)
+#elif defined(CONFIG_ACPI_SRAT) || defined(CONFIG_NUMA_EMU)
 
 /* Max 8 Nodes */
 #define NODES_SHIFT	3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2005-09-30  7:33 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-09-30  7:33 [PATCH 00/07][RFC] i386: NUMA emulation Magnus Damm
2005-09-30  7:33 ` [PATCH 01/07] i386: srat non acpi Magnus Damm, Magnus Damm
2005-09-30  7:33 ` [PATCH 02/07] i386: numa on non-smp Magnus Damm, Magnus Damm
2005-09-30  7:33 ` [PATCH 03/07] cpuset: smp or numa Magnus Damm, Magnus Damm
2005-09-30  7:33 ` [PATCH 04/07] i386: numa warning fix Magnus Damm, Isaku Yamahata
2005-09-30  7:33 ` [PATCH 05/07] i386: sparsemem on pc Magnus Damm, Magnus Damm
2005-09-30 15:25   ` Dave Hansen
2005-10-01  0:32     ` Magnus Damm
2005-09-30  7:33 ` [PATCH 06/07] i386: discontigmem " Magnus Damm, Magnus Damm
2005-09-30  7:33 ` Magnus Damm, Isaku Yamahata [this message]
2005-09-30 18:55   ` [PATCH 07/07] i386: numa emulation " Dave Hansen
2005-10-03  9:59     ` Magnus Damm
2005-10-03 16:16       ` Dave Hansen
2005-10-04  5:06         ` Magnus Damm
2005-10-04  7:52   ` Hirokazu Takahashi
2005-10-04  9:49     ` Magnus Damm
2005-09-30 15:23 ` [PATCH 00/07][RFC] i386: NUMA emulation Dave Hansen
2005-10-03  2:08   ` Magnus Damm
2005-10-03  7:34     ` David Lang
2005-10-03 10:02       ` Magnus Damm
2005-10-03 13:33         ` David Lang
2005-10-03 14:59           ` Martin J. Bligh
2005-10-03 15:03             ` David Lang
2005-10-03 15:08               ` Martin J. Bligh
2005-10-03 15:13                 ` David Lang
2005-10-03 15:25                   ` Martin J. Bligh
2005-10-03 15:32                     ` David Lang
2005-10-03 15:54                       ` Martin J. Bligh
2005-10-03 16:44                         ` David Lang
2005-10-03 14:45       ` Martin J. Bligh
2005-10-03 14:49         ` David Lang
2005-10-03  3:21   ` Paul Jackson
2005-10-03  5:05     ` Magnus Damm
2005-10-03  5:26       ` Hirokazu Takahashi
2005-10-03  5:33       ` Paul Jackson
2005-10-03  5:59         ` Magnus Damm
2005-10-03  7:26           ` Paul Jackson
2005-10-03  5:34       ` Paul Jackson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20050930073308.10631.24247.sendpatchset@cherry.local \
    --to=magnus@valinux.co.jp \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox