* memory hotremove prototype, take 3
@ 2003-12-01  3:41 IWAMOTO Toshihiro
  2003-12-01 19:56 ` Pavel Machek
  2003-12-03 19:41 ` Martin J. Bligh
  0 siblings, 2 replies; 10+ messages in thread
From: IWAMOTO Toshihiro @ 2003-12-01  3:41 UTC (permalink / raw)
  To: linux-kernel, linux-mm

Hi,

this is a new version of my memory hotplug prototype patch, against
linux-2.6.0-test11.

Freeing 100% of a specified memory zone is non-trivial and necessary
for memory hot removal.  This patch splits memory into 1GB zones, and
implements complete zone memory freeing using kswapd or "remapping".

A bit more detailed explanation and some test scripts are at:
	http://people.valinux.co.jp/~iwamoto/mh.html
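
For reference, the patch below adds a /proc/memhotplug control file;
the command strings it accepts can be read off mhtest_write().  A rough
removal sequence for zone N (a sketch, not copied from the test
scripts) looks like:

	echo "disable N" > /proc/memhotplug	# refuse new allocations
	echo "purge N" > /proc/memhotplug	# wake kswapd to empty the zone
	echo "remap N" > /proc/memhotplug	# remap pages kswapd couldn't free
	cat /proc/memhotplug			# done when free == present
	echo "enable N" > /proc/memhotplug	# bring the zone back online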

Main changes against previous versions are:
- Stability is greatly improved.  Kernel crashes (probably related
  to kswapd) still happen, but they are now rare enough that I'm
  having difficulty reproducing them.
  Page remapping under a simultaneous tar + rm -rf works.
- Implemented a solution to a deadlock caused by ext2_rename, which
  increments the refcount of a directory page twice.

Questions and comments are welcome.

$Id: memoryhotplug.patch,v 1.26 2003/11/28 09:12:12 iwamoto Exp $

diff -dpur linux-2.6.0-test11/arch/i386/Kconfig linux-2.6.0-test11-mh/arch/i386/Kconfig
--- linux-2.6.0-test11/arch/i386/Kconfig	Thu Nov 27 05:43:07 2003
+++ linux-2.6.0-test11-mh/arch/i386/Kconfig	Fri Nov 28 17:45:42 2003
@@ -706,14 +706,18 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
 comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI"
 	depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY)
 
+config MEMHOTPLUGTEST
+       bool "Memory hotplug test"
+       default n
+
 config DISCONTIGMEM
 	bool
-	depends on NUMA
+	depends on NUMA || MEMHOTPLUGTEST
 	default y
 
 config HAVE_ARCH_BOOTMEM_NODE
 	bool
-	depends on NUMA
+	depends on NUMA || MEMHOTPLUGTEST
 	default y
 
 config HIGHPTE
diff -dpur linux-2.6.0-test11/arch/i386/mm/discontig.c linux-2.6.0-test11-mh/arch/i386/mm/discontig.c
--- linux-2.6.0-test11/arch/i386/mm/discontig.c	Thu Nov 27 05:44:20 2003
+++ linux-2.6.0-test11-mh/arch/i386/mm/discontig.c	Fri Nov 28 17:45:42 2003
@@ -28,6 +28,7 @@
 #include <linux/mmzone.h>
 #include <linux/highmem.h>
 #include <linux/initrd.h>
+#include <linux/proc_fs.h>
 #include <asm/e820.h>
 #include <asm/setup.h>
 #include <asm/mmzone.h>
@@ -111,6 +112,49 @@ int __init get_memcfg_numa_flat(void)
 	return 1;
 }
 
+int __init get_memcfg_numa_blks(void)
+{
+	int i, pfn;
+
+	printk("NUMA - single node, flat memory mode, but broken in several blocks\n");
+
+	/* Run the memory configuration and find the top of memory. */
+	find_max_pfn();
+	if (max_pfn & (PTRS_PER_PTE - 1)) {
+		pfn = max_pfn & ~(PTRS_PER_PTE - 1);
+		printk("Rounding down maxpfn %d -> %d\n", max_pfn, pfn);
+		max_pfn = pfn;
+	}
+	for(i = 0; i < MAX_NUMNODES; i++) {
+		pfn = PFN_DOWN(1 << 30) * i;
+		node_start_pfn[i]  = pfn;
+		pfn += PFN_DOWN(1 << 30);
+		if (pfn < max_pfn)
+			node_end_pfn[i]	  = pfn;
+		else {
+			node_end_pfn[i]	  = max_pfn;
+			i++;
+			printk("total %d blocks, max %d\n", i, max_pfn);
+			break;
+		}
+	}
+
+	/* Fill in the physnode_map for our block-based memory model:
+	 * the node (block) number is simply pfn / (1GB in pages).
+	 */
+	for (pfn = node_start_pfn[0]; pfn <= max_pfn;
+	       pfn += PAGES_PER_ELEMENT)
+	{
+		physnode_map[pfn / PAGES_PER_ELEMENT] = pfn / PFN_DOWN(1 << 30);
+	}
+
+	/* Mark node 0 online and record the number of blocks. */
+	node_set_online(0);
+	numnodes = i;
+
+	return 1;
+}
+
 /*
  * Find the highest page frame number we have available for the node
  */
@@ -183,6 +227,8 @@ static void __init register_bootmem_low_
 	}
 }
 
+static struct kcore_list numa_kc;
+
 void __init remap_numa_kva(void)
 {
 	void *vaddr;
@@ -196,7 +242,11 @@ void __init remap_numa_kva(void)
 				node_remap_start_pfn[node] + pfn, 
 				PAGE_KERNEL_LARGE);
 		}
+		memset(node_remap_start_vaddr[node], 0,
+		    node_remap_size[node] * PAGE_SIZE);
 	}
+	kclist_add(&numa_kc, node_remap_start_vaddr[numnodes - 1],
+	    node_remap_offset[numnodes - 1] << PAGE_SHIFT);
 }
 
 static unsigned long calculate_numa_remap_pages(void)
diff -dpur linux-2.6.0-test11/include/asm-i386/kmap_types.h linux-2.6.0-test11-mh/include/asm-i386/kmap_types.h
--- linux-2.6.0-test11/include/asm-i386/kmap_types.h	Thu Nov 27 05:44:56 2003
+++ linux-2.6.0-test11-mh/include/asm-i386/kmap_types.h	Fri Nov 28 17:52:08 2003
@@ -24,7 +24,13 @@ D(10)	KM_IRQ0,
 D(11)	KM_IRQ1,
 D(12)	KM_SOFTIRQ0,
 D(13)	KM_SOFTIRQ1,
+#ifdef CONFIG_MEMHOTPLUGTEST
+D(14)	KM_REMAP0,
+D(15)	KM_REMAP1,
+D(16)	KM_TYPE_NR,
+#else
 D(14)	KM_TYPE_NR
+#endif
 };
 
 #undef D
diff -dpur linux-2.6.0-test11/include/asm-i386/mmzone.h linux-2.6.0-test11-mh/include/asm-i386/mmzone.h
--- linux-2.6.0-test11/include/asm-i386/mmzone.h	Thu Nov 27 05:44:10 2003
+++ linux-2.6.0-test11-mh/include/asm-i386/mmzone.h	Fri Nov 28 17:45:42 2003
@@ -128,6 +128,10 @@ static inline struct pglist_data *pfn_to
 #endif /* CONFIG_X86_NUMAQ */
 
 extern int get_memcfg_numa_flat(void );
+#ifdef CONFIG_MEMHOTPLUGTEST
+extern int get_memcfg_numa_blks(void);
+#endif
+
 /*
  * This allows any one NUMA architecture to be compiled
  * for, and still fall back to the flat function if it
@@ -143,6 +147,10 @@ static inline void get_memcfg_numa(void)
 		return;
 #endif
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+	get_memcfg_numa_blks();
+	return;
+#endif
 	get_memcfg_numa_flat();
 }
 
diff -dpur linux-2.6.0-test11/include/asm-i386/numnodes.h linux-2.6.0-test11-mh/include/asm-i386/numnodes.h
--- linux-2.6.0-test11/include/asm-i386/numnodes.h	Thu Nov 27 05:43:09 2003
+++ linux-2.6.0-test11-mh/include/asm-i386/numnodes.h	Fri Nov 28 17:45:42 2003
@@ -13,6 +13,10 @@
 /* Max 8 Nodes */
 #define NODES_SHIFT	3
 
+#elif defined(CONFIG_MEMHOTPLUGTEST)
+
+#define NODES_SHIFT	3
+
 #endif /* CONFIG_X86_NUMAQ */
 
 #endif /* _ASM_MAX_NUMNODES_H */
diff -dpur linux-2.6.0-test11/include/linux/mm.h linux-2.6.0-test11-mh/include/linux/mm.h
--- linux-2.6.0-test11/include/linux/mm.h	Thu Nov 27 05:42:55 2003
+++ linux-2.6.0-test11-mh/include/linux/mm.h	Fri Nov 28 17:45:42 2003
@@ -219,7 +219,14 @@ struct page {
  */
 #define put_page_testzero(p)				\
 	({						\
-		BUG_ON(page_count(p) == 0);		\
+		if (page_count(p) == 0) {		\
+			int i;						\
+			printk("Page: %lx ", (long)p);			\
+			for(i = 0; i < sizeof(struct page); i++)	\
+				printk(" %02x", ((unsigned char *)p)[i]); \
+			printk("\n");					\
+			BUG();				\
+		}					\
 		atomic_dec_and_test(&(p)->count);	\
 	})
 
diff -dpur linux-2.6.0-test11/include/linux/mmzone.h linux-2.6.0-test11-mh/include/linux/mmzone.h
--- linux-2.6.0-test11/include/linux/mmzone.h	Thu Nov 27 05:44:20 2003
+++ linux-2.6.0-test11-mh/include/linux/mmzone.h	Fri Nov 28 17:45:42 2003
@@ -360,6 +360,10 @@ static inline unsigned int num_online_me
 	return num;
 }
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+int zone_activep(const struct zone *);
+int remapd(void *p);
+#endif
 #else /* !CONFIG_DISCONTIGMEM && !CONFIG_NUMA */
 
 #define node_online(node) \
diff -dpur linux-2.6.0-test11/include/linux/page-flags.h linux-2.6.0-test11-mh/include/linux/page-flags.h
--- linux-2.6.0-test11/include/linux/page-flags.h	Thu Nov 27 05:44:52 2003
+++ linux-2.6.0-test11-mh/include/linux/page-flags.h	Fri Nov 28 17:45:42 2003
@@ -76,6 +76,8 @@
 #define PG_reclaim		18	/* To be reclaimed asap */
 #define PG_compound		19	/* Part of a compound page */
 
+#define	PG_again		20
+
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -268,6 +270,10 @@ extern void get_full_page_state(struct p
 #define PageCompound(page)	test_bit(PG_compound, &(page)->flags)
 #define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
 #define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
+
+#define PageAgain(page)	test_bit(PG_again, &(page)->flags)
+#define SetPageAgain(page)	set_bit(PG_again, &(page)->flags)
+#define ClearPageAgain(page)	clear_bit(PG_again, &(page)->flags)
 
 /*
  * The PageSwapCache predicate doesn't use a PG_flag at this time,
diff -dpur linux-2.6.0-test11/mm/filemap.c linux-2.6.0-test11-mh/mm/filemap.c
--- linux-2.6.0-test11/mm/filemap.c	Thu Nov 27 05:43:33 2003
+++ linux-2.6.0-test11-mh/mm/filemap.c	Fri Nov 28 17:45:42 2003
@@ -448,7 +448,8 @@ repeat:
 			spin_lock(&mapping->page_lock);
 
 			/* Has the page been truncated while we slept? */
-			if (page->mapping != mapping || page->index != offset) {
+			if (page->mapping != mapping || page->index != offset ||
+			    PageAgain(page)) {
 				unlock_page(page);
 				page_cache_release(page);
 				goto repeat;
@@ -677,6 +678,12 @@ page_not_up_to_date:
 			goto page_ok;
 		}
 
+		if (PageAgain(page)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto find_page;
+		}
+
 readpage:
 		/* ... and start the actual read. The read will unlock the page. */
 		error = mapping->a_ops->readpage(filp, page);
@@ -1120,6 +1127,12 @@ page_not_uptodate:
 		goto success;
 	}
 
+	if (PageAgain(page)) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto retry_find;
+	}
+
 	if (!mapping->a_ops->readpage(file, page)) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
@@ -1228,6 +1241,12 @@ page_not_uptodate:
 		goto success;
 	}
 
+	if (PageAgain(page)) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto retry_find;
+	}
+
 	if (!mapping->a_ops->readpage(file, page)) {
 		wait_on_page_locked(page);
 		if (PageUptodate(page))
@@ -1436,6 +1455,11 @@ retry:
 	if (PageUptodate(page)) {
 		unlock_page(page);
 		goto out;
+	}
+	if (PageAgain(page)) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto retry;
 	}
 	err = filler(data, page);
 	if (err < 0) {
diff -dpur linux-2.6.0-test11/mm/page_alloc.c linux-2.6.0-test11-mh/mm/page_alloc.c
--- linux-2.6.0-test11/mm/page_alloc.c	Thu Nov 27 05:42:56 2003
+++ linux-2.6.0-test11-mh/mm/page_alloc.c	Fri Nov 28 17:45:42 2003
@@ -31,6 +31,7 @@
 #include <linux/topology.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
+#include <linux/proc_fs.h>
 
 #include <asm/tlbflush.h>
 
@@ -52,6 +53,9 @@ EXPORT_SYMBOL(nr_swap_pages);
  */
 struct zone *zone_table[MAX_NR_ZONES*MAX_NUMNODES];
 EXPORT_SYMBOL(zone_table);
+#ifdef CONFIG_MEMHOTPLUGTEST
+static char zone_active[MAX_NR_ZONES*MAX_NUMNODES];
+#endif
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
@@ -411,7 +415,9 @@ int is_head_of_free_region(struct page *
 	spin_unlock_irqrestore(&zone->lock, flags);
         return 0;
 }
+#endif
 
+#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_MEMHOTPLUGTEST)
 /*
  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
  */
@@ -512,9 +518,28 @@ static struct page *buffered_rmqueue(str
 		mod_page_state(pgalloc, 1 << order);
 		prep_new_page(page, order);
 	}
+#ifdef CONFIG_MEMHOTPLUGTEST
+	if (page != NULL && ! zone_active[page->flags >> ZONE_SHIFT])
+		printk("alloc_page from disabled zone: %p\n", page);
+#endif
 	return page;
 }
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+int
+zone_activep(const struct zone *z)
+{
+	int i;
+
+	for(i = 0; ; i++) {
+		if (zone_table[i] == z)
+			return zone_active[i];
+		if (zone_table[i] == NULL)
+			BUG();
+	}
+}
+#endif
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  *
@@ -562,6 +587,10 @@ __alloc_pages(unsigned int gfp_mask, uns
 		struct zone *z = zones[i];
 		unsigned long local_low;
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+		if (! zone_activep(z))
+			continue;
+#endif
 		/*
 		 * This is the fabled 'incremental min'. We let real-time tasks
 		 * dip their real-time paws a little deeper into reserves.
@@ -590,6 +619,10 @@ __alloc_pages(unsigned int gfp_mask, uns
 		unsigned long local_min;
 		struct zone *z = zones[i];
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+		if (! zone_activep(z))
+			continue;
+#endif
 		local_min = z->pages_min;
 		if (gfp_mask & __GFP_HIGH)
 			local_min >>= 2;
@@ -613,6 +646,10 @@ rebalance:
 		for (i = 0; zones[i] != NULL; i++) {
 			struct zone *z = zones[i];
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+			if (! zone_activep(z))
+				continue;
+#endif
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				goto got_pg;
@@ -638,6 +675,10 @@ rebalance:
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
+#ifdef CONFIG_MEMHOTPLUGTEST
+		if (! zone_activep(z))
+			continue;
+#endif
 		min += z->pages_min;
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
@@ -1076,6 +1117,9 @@ static int __init build_zonelists_node(p
 static void __init build_zonelists(pg_data_t *pgdat)
 {
 	int i, j, k, node, local_node;
+#ifdef CONFIG_MEMHOTPLUGTEST
+	struct zone *zone;
+#endif
 
 	local_node = pgdat->node_id;
 	printk("Building zonelist for node : %d\n", local_node);
@@ -1091,7 +1135,7 @@ static void __init build_zonelists(pg_da
 			k = ZONE_HIGHMEM;
 		if (i & __GFP_DMA)
 			k = ZONE_DMA;
-
+#ifndef CONFIG_MEMHOTPLUGTEST
  		j = build_zonelists_node(pgdat, zonelist, j, k);
  		/*
  		 * Now we build the zonelist so that it contains the zones
@@ -1107,6 +1151,23 @@ static void __init build_zonelists(pg_da
  			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
  
 		zonelist->zones[j++] = NULL;
+#else
+		for(; k >= 0; k--) {
+			zone = pgdat->node_zones + k;
+			if (zone->present_pages)
+				zonelist->zones[j++] = zone;
+			for (node = local_node + 1; node < numnodes; node++) {
+				zone = NODE_DATA(node)->node_zones + k;
+				if (zone->present_pages)
+					zonelist->zones[j++] = zone;
+			}
+			for (node = 0; node < local_node; node++) {
+				zone = NODE_DATA(node)->node_zones + k;
+				if (zone->present_pages)
+					zonelist->zones[j++] = zone;
+			}
+		}
+#endif
 	} 
 }
 
@@ -1252,6 +1313,9 @@ static void __init free_area_init_core(s
 		unsigned long batch;
 
 		zone_table[nid * MAX_NR_ZONES + j] = zone;
+#ifdef CONFIG_MEMHOTPLUGTEST
+		zone_active[nid * MAX_NR_ZONES + j] = 1;
+#endif
 		realsize = size = zones_size[j];
 		if (zholes_size)
 			realsize -= zholes_size[j];
@@ -1644,3 +1708,145 @@ int min_free_kbytes_sysctl_handler(ctl_t
 	setup_per_zone_pages_min();
 	return 0;
 }
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static int mhtest_read(char *page, char **start, off_t off, int count,
+    int *eof, void *data)
+{
+	char *p;
+	int i, len;
+	const struct zone *z;
+
+	p = page;
+	for(i = 0; ; i++) {
+		z = zone_table[i];
+		if (z == NULL)
+			break;
+		if (! z->present_pages)
+			/* skip empty zone */
+			continue;
+		len = sprintf(p, "Zone %d: %sabled free %lu, active %lu, present %lu\n", i,
+		    zone_active[i] ? "en" : "dis", z->free_pages, z->nr_active,
+		    z->present_pages);
+		p += len;
+	}
+	len = p - page;
+
+	if (len <= off + count)
+		*eof = 1;
+	*start = page + off;
+	len -= off;
+	if (len < 0)
+		len = 0;
+	if (len > count)
+		len = count;
+
+	return len;
+}
+
+static int mhtest_write(struct file *file, const char *buffer,
+    unsigned long count, void *data)
+{
+	unsigned long idx;
+	char buf[64], *p;
+	int i;
+	struct list_head *l;
+
+	if (count > sizeof(buf) - 1)
+		count = sizeof(buf) - 1;
+	if (copy_from_user(buf, buffer, count))
+		return -EFAULT;
+
+	buf[count] = 0;
+
+	p = strchr(buf, ' ');
+	if (p == NULL)
+		goto out;
+
+	*p++ = '\0';
+	idx = simple_strtoul(p, NULL, 0);
+
+	if (idx >= MAX_NR_ZONES*MAX_NUMNODES) {
+		printk("Argument out of range\n");
+		goto out;
+	}
+	if (strcmp(buf, "disable") == 0) {
+		printk("disable %d\n", idx);
+		/* XXX */
+		for (i = 0; i < NR_CPUS; i++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &zone_table[idx]->pageset[i].pcp[0];	/* hot */
+			pcp->low = pcp->high = 0;
+
+			pcp = &zone_table[idx]->pageset[i].pcp[1];	/* cold */
+			pcp->low = pcp->high = 0;
+		}
+		zone_active[idx] = 0;
+		zone_table[idx]->pages_high = zone_table[idx]->present_pages;
+	} else if (strcmp(buf, "purge") == 0) {
+		if (zone_active[idx])
+			printk("Zone %d still active (proceeding anyway)\n",
+			    idx);
+		printk("purge %d\n", idx);
+		wake_up_interruptible(&zone_table[idx]->zone_pgdat->kswapd_wait);
+		/* XXX overkill, but who cares? */
+		on_each_cpu(drain_local_pages, NULL, 1, 1);
+	} else if (strcmp(buf, "enable") == 0) {
+		printk("enable %d\n", idx);
+		zone_active[idx] = 1;
+		zone_table[idx]->pages_high = 
+		    zone_table[idx]->pages_min * 3;
+		/* XXX */
+		for (i = 0; i < NR_CPUS; i++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &zone_table[idx]->pageset[i].pcp[0];	/* hot */
+			pcp->low = 2 * pcp->batch;
+			pcp->high = 6 * pcp->batch;
+
+			pcp = &zone_table[idx]->pageset[i].pcp[1];	/* cold */
+			pcp->high = 2 * pcp->batch;
+		}
+	} else if (strcmp(buf, "remap") == 0) {
+		on_each_cpu(drain_local_pages, NULL, 1, 1);
+		kernel_thread(remapd, zone_table[idx], CLONE_KERNEL);
+	} else if (strcmp(buf, "active") == 0) {
+		if (zone_table[idx] == NULL)
+			goto out;
+		spin_lock_irq(&zone_table[idx]->lru_lock);
+		i = 0;
+		list_for_each(l, &zone_table[idx]->active_list) {
+			printk(" %lx", (unsigned long)list_entry(l, struct page, lru));
+			i++;
+			if (i == 10)
+				break;
+		}
+		spin_unlock_irq(&zone_table[idx]->lru_lock);
+		printk("\n");
+	} else if (strcmp(buf, "inuse") == 0) {
+		if (zone_table[idx] == NULL)
+			goto out;
+		for(i = 0; i < zone_table[idx]->spanned_pages; i++)
+			if (page_count(&zone_table[idx]->zone_mem_map[i]))
+				printk(" %lx", (unsigned long)&zone_table[idx]->zone_mem_map[i]);
+		printk("\n");
+	}
+out:
+	return count;
+}
+
+static int __init procmhtest_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry("memhotplug", 0, NULL);
+	if (entry == NULL)
+		return -1;
+
+	entry->read_proc = &mhtest_read;
+	entry->write_proc = &mhtest_write;
+	return 0;
+}
+__initcall(procmhtest_init);
+#endif
diff -dpur linux-2.6.0-test11/mm/shmem.c linux-2.6.0-test11-mh/mm/shmem.c
--- linux-2.6.0-test11/mm/shmem.c	Thu Nov 27 05:43:41 2003
+++ linux-2.6.0-test11-mh/mm/shmem.c	Fri Nov 28 17:45:42 2003
@@ -80,7 +80,15 @@ static inline struct page *shmem_dir_all
 	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
 	 * might be reconsidered if it ever diverges from PAGE_SIZE.
 	 */
-	return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#ifdef CONFIG_MEMHOTPLUGTEST
+	struct page* p = alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+	    PAGE_CACHE_SHIFT-PAGE_SHIFT);
+	printk("shmem_dir_alloc: %lx\n", (unsigned long)p);
+	return p;
+#else
+	return alloc_pages(gfp_mask & ~__GFP_HIGHMEM,
+	    PAGE_CACHE_SHIFT-PAGE_SHIFT);
+#endif
 }
 
 static inline void shmem_dir_free(struct page *page)
diff -dpur linux-2.6.0-test11/mm/truncate.c linux-2.6.0-test11-mh/mm/truncate.c
--- linux-2.6.0-test11/mm/truncate.c	Thu Nov 27 05:45:39 2003
+++ linux-2.6.0-test11-mh/mm/truncate.c	Fri Nov 28 17:45:42 2003
@@ -132,6 +132,10 @@ void truncate_inode_pages(struct address
 			next++;
 			if (TestSetPageLocked(page))
 				continue;
+			if (PageAgain(page)) {
+				unlock_page(page);
+				continue;
+			}
 			if (PageWriteback(page)) {
 				unlock_page(page);
 				continue;
@@ -165,6 +169,14 @@ void truncate_inode_pages(struct address
 			struct page *page = pvec.pages[i];
 
 			lock_page(page);
+			if (PageAgain(page)) {
+				unsigned long index = page->index;
+
+				unlock_page(page);
+				put_page(page);
+				page = find_lock_page(mapping, index);
+				pvec.pages[i] = page;
+			}
 			wait_on_page_writeback(page);
 			if (page->index > next)
 				next = page->index;
@@ -255,6 +267,14 @@ void invalidate_inode_pages2(struct addr
 			struct page *page = pvec.pages[i];
 
 			lock_page(page);
+			if (PageAgain(page)) {
+				unsigned long index = page->index;
+
+				unlock_page(page);
+				put_page(page);
+				page = find_lock_page(mapping, index);
+				pvec.pages[i] = page;
+			}
 			if (page->mapping == mapping) {	/* truncate race? */
 				wait_on_page_writeback(page);
 				next = page->index + 1;
diff -dpur linux-2.6.0-test11/mm/vmalloc.c linux-2.6.0-test11-mh/mm/vmalloc.c
--- linux-2.6.0-test11/mm/vmalloc.c	Thu Nov 27 05:44:23 2003
+++ linux-2.6.0-test11-mh/mm/vmalloc.c	Fri Nov 28 17:45:42 2003
@@ -447,7 +447,11 @@ EXPORT_SYMBOL(__vmalloc);
  */
 void *vmalloc(unsigned long size)
 {
+#ifdef CONFIG_MEMHOTPLUGTEST
+       return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+#else
        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+#endif
 }
 
 EXPORT_SYMBOL(vmalloc);
diff -dpur linux-2.6.0-test11/mm/vmscan.c linux-2.6.0-test11-mh/mm/vmscan.c
--- linux-2.6.0-test11/mm/vmscan.c	Thu Nov 27 05:43:06 2003
+++ linux-2.6.0-test11-mh/mm/vmscan.c	Fri Nov 28 17:55:35 2003
@@ -36,6 +36,9 @@
 #include <asm/div64.h>
 
 #include <linux/swapops.h>
+#ifdef CONFIG_KDB
+#include <linux/kdb.h>
+#endif
 
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
@@ -285,6 +288,11 @@ shrink_list(struct list_head *page_list,
 			goto keep_locked;
 
 		pte_chain_lock(page);
+#ifdef CONFIG_MEMHOTPLUGTEST
+		/* clear referenced bits so pages of a disabled zone age out */
+		if (!zone_activep(page_zone(page)) && page_mapped(page))
+			page_referenced(page);
+#endif
 		referenced = page_referenced(page);
 		if (referenced && page_mapping_inuse(page)) {
 			/* In active use or really unfreeable.  Activate it. */
@@ -589,7 +594,7 @@ done:
  * But we had to alter page->flags anyway.
  */
 static void
-refill_inactive_zone(struct zone *zone, const int nr_pages_in,
+refill_inactive_zone(struct zone *zone, int nr_pages_in,
 			struct page_state *ps, int priority)
 {
 	int pgmoved;
@@ -607,6 +612,12 @@ refill_inactive_zone(struct zone *zone, 
 
 	lru_add_drain();
 	pgmoved = 0;
+#ifdef CONFIG_MEMHOTPLUGTEST
+	if (! zone_activep(zone)) {
+		nr_pages = nr_pages_in = zone->present_pages - zone->free_pages;
+		printk("Purging active list of disabled zone\n");
+	}
+#endif
 	spin_lock_irq(&zone->lru_lock);
 	while (nr_pages && !list_empty(&zone->active_list)) {
 		page = list_entry(zone->active_list.prev, struct page, lru);
@@ -658,12 +669,20 @@ refill_inactive_zone(struct zone *zone, 
 	 */
 	if (swap_tendency >= 100)
 		reclaim_mapped = 1;
+#ifdef CONFIG_MEMHOTPLUGTEST
+	if (! zone_activep(zone))
+		reclaim_mapped = 1;
+#endif
 
 	while (!list_empty(&l_hold)) {
 		page = list_entry(l_hold.prev, struct page, lru);
 		list_del(&page->lru);
 		if (page_mapped(page)) {
 			pte_chain_lock(page);
+#ifdef CONFIG_MEMHOTPLUGTEST
+			if (! zone_activep(zone))
+				page_referenced(page);	/* XXX */
+#endif
 			if (page_mapped(page) && page_referenced(page)) {
 				pte_chain_unlock(page);
 				list_add(&page->lru, &l_active);
@@ -767,6 +786,11 @@ shrink_zone(struct zone *zone, int max_s
 	ratio = (unsigned long)nr_pages * zone->nr_active /
 				((zone->nr_inactive | 1) * 2);
 	atomic_add(ratio+1, &zone->refill_counter);
+#ifdef CONFIG_MEMHOTPLUGTEST
+	if (! zone_activep(zone))
+		/* XXX */
+		atomic_add(SWAP_CLUSTER_MAX, &zone->refill_counter);
+#endif
 	if (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) {
 		int count;
 
@@ -1048,6 +1072,439 @@ int kswapd(void *p)
 		balance_pgdat(pgdat, 0, &ps);
 	}
 }
+
+#ifdef CONFIG_MEMHOTPLUGTEST
+static void
+print_buffer(struct page* page)
+{
+	struct address_space* mapping = page->mapping;
+	struct buffer_head *bh, *head;
+
+	spin_lock(&mapping->private_lock);
+	bh = head = page_buffers(page);
+	printk("buffers:");
+	do {
+		printk(" %lx %d\n", bh->b_state, atomic_read(&bh->b_count));
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+	printk("\n");
+	spin_unlock(&mapping->private_lock);
+}
+/* try to remap a page. returns non-zero on failure */
+int remap_onepage(struct page *page)
+{
+	struct page *newpage;
+	struct zone *zone;
+	struct address_space *mapping = page->mapping;
+	char *np, *op;
+	void *p;
+	int waitcnt, error = -1;
+
+	newpage = alloc_page(GFP_HIGHUSER);
+	if (newpage == NULL)
+		return -ENOMEM;
+	if (TestSetPageLocked(newpage))
+		BUG();
+	lock_page(page);
+
+	if (! PagePrivate(page) && PageWriteback(page))
+#ifdef CONFIG_KDB
+		KDB_ENTER();
+#else
+		BUG();
+#endif
+	if (PagePrivate(page)) {
+		waitcnt = 100;
+		while (PageWriteback(page)) {
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(10);
+			__set_current_state(TASK_RUNNING);
+			if (! --waitcnt)
+				goto radixfail;
+		}
+
+		/* XXX copied from shrink_list() */
+		if (PageDirty(page) &&
+		    is_page_cache_freeable(page) &&
+		    mapping != NULL &&
+		    mapping->a_ops->writepage != NULL) {
+			spin_lock(&mapping->page_lock);
+			if (test_clear_page_dirty(page)) {
+				int res;
+				struct writeback_control wbc = {
+					.sync_mode = WB_SYNC_NONE,
+					.nr_to_write = SWAP_CLUSTER_MAX,
+					.nonblocking = 1,
+					.for_reclaim = 1,
+				};
+
+				list_move(&page->list, &mapping->locked_pages);
+				spin_unlock(&mapping->page_lock);
+
+				SetPageReclaim(page);
+				res = mapping->a_ops->writepage(page, &wbc);
+
+				if (res == WRITEPAGE_ACTIVATE) {
+					ClearPageReclaim(page);
+					goto radixfail;
+				}
+				if (!PageWriteback(page)) {
+					/* synchronous write or broken a_ops? */
+					ClearPageReclaim(page);
+				}
+				lock_page(page);
+				if (! PagePrivate(page))
+					goto bufferdone;
+			} else
+				spin_unlock(&mapping->page_lock);
+		}
+
+		waitcnt = 100;
+		while (1) {
+			if (try_to_release_page(page, GFP_KERNEL))
+				break;
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(10);
+			__set_current_state(TASK_RUNNING);
+			if (! --waitcnt) {
+				print_buffer(page);
+				goto radixfail;
+			}
+		}
+	}
+bufferdone:
+	if (mapping == NULL) {
+		/* The page is an anon page. Allocate swap entry. */
+		if (!add_to_swap(page))
+			goto radixfail;
+		mapping = page->mapping;
+	}
+	error = radix_tree_preload(GFP_KERNEL);
+	if (error)
+		goto radixfail;
+	if (PagePrivate(page)) /* XXX */
+		BUG();
+
+	/* should {__add_to,__remove_from}_page_cache be used instead? */
+	spin_lock(&mapping->page_lock);
+	if (mapping != page->mapping)
+		printk("mapping changed %p -> %p, page %p\n",
+		    mapping, page->mapping, page);
+	if (radix_tree_delete(&mapping->page_tree, page->index) == NULL) {
+		/* Page truncated. */
+		spin_unlock(&mapping->page_lock);
+		radix_tree_preload_end();
+		goto radixfail;
+	}
+	/* don't __put_page(page) here. truncate may be in progress */
+	newpage->flags |= page->flags & ~(1 << PG_uptodate) &
+	    ~(1 << PG_highmem) & ~(1 << PG_chainlock) &
+	    ~(1 << PG_direct) & ~(~0UL << ZONE_SHIFT);
+
+	/* list_del(&page->list); XXX */
+	radix_tree_insert(&mapping->page_tree, page->index, newpage);
+	page_cache_get(newpage);
+	newpage->mapping = mapping;
+	newpage->index = page->index;
+	if (PageDirty(page))
+		list_add(&newpage->list, &mapping->dirty_pages);
+	else
+		list_add(&newpage->list, &mapping->clean_pages);
+	spin_unlock(&mapping->page_lock);
+	radix_tree_preload_end();
+
+	pte_chain_lock(page);
+	if (page_mapped(page)) {
+		while ((error = try_to_unmap(page)) == SWAP_AGAIN) {
+			pte_chain_unlock(page);
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(1);
+			__set_current_state(TASK_RUNNING);
+			pte_chain_lock(page);
+		}
+		if (error == SWAP_FAIL) {
+			pte_chain_unlock(page); /* XXX */
+			/* either during mremap or mlocked */
+			goto unmapfail;
+		}
+	}
+	pte_chain_unlock(page);
+	if (PagePrivate(page))
+		printk("buffer reappeared\n");
+
+	unlock_page(page);	/* no lock needed while waiting page count */
+
+	waitcnt = 1;
+wait_again:
+	while (page_count(page) > 2) {
+		waitcnt++;
+		current->state = TASK_INTERRUPTIBLE;
+		schedule_timeout(1);
+		if ((waitcnt % 5000) == 0) {
+			printk("remap_onepage: still waiting on %p %d\n", page, waitcnt);
+			break;
+		}
+		if (PagePrivate(page))
+			break;		/* see below */
+	}
+
+	lock_page(page);
+	BUG_ON(page_count(page) == 0);
+	if (PagePrivate(page))
+		try_to_release_page(page, GFP_KERNEL);
+	if (page_count(page) > 2) {
+		if (waitcnt > 50000)
+			goto unmapfail;
+		unlock_page(page);
+		goto wait_again;
+	}
+	if (PageReclaim(page) || PageWriteback(page) || PagePrivate(page))
+#ifdef CONFIG_KDB
+		KDB_ENTER();
+#else
+		BUG();
+#endif
+	if (page_count(page) == 1) {
+		/* page has been truncated.  free both pages. */
+		spin_lock(&mapping->page_lock);
+		p = radix_tree_lookup(&mapping->page_tree, newpage->index);
+		if (p != NULL) {
+			/* new cache page appeared after truncation */
+			printk("page %p newpage %p radix %p\n",
+			    page, newpage, p);
+			BUG_ON(p == newpage);
+		}
+		list_del(&newpage->list);
+		put_page(newpage);
+		if (page_count(newpage) != 1) {
+			printk("newpage count %d != 1, %p\n",
+			    page_count(newpage), newpage);
+			BUG();
+		}
+		/* No need to do page->list. remove_from_page_cache did. */
+		newpage->mapping = page->mapping = NULL;
+		spin_unlock(&mapping->page_lock);
+		ClearPageActive(page);
+		ClearPageActive(newpage);
+		unlock_page(page);
+		unlock_page(newpage);
+		put_page(page);
+		put_page(newpage);
+		return 0;
+	}
+
+	spin_lock(&mapping->page_lock);
+	list_del(&page->list); /* XXX */
+	page->mapping = NULL;
+	spin_unlock(&mapping->page_lock);
+	unlock_page(page);
+
+	np = kmap_atomic(newpage, KM_REMAP0);
+	op = kmap_atomic(page, KM_REMAP1);
+	if (np == NULL || op == NULL) {	/* XXX */
+		printk("%p %p %p %p\n", np, op, newpage, page);
+		BUG();
+	}
+	memcpy(np, op, PAGE_SIZE);
+	kunmap_atomic(page, KM_REMAP1);
+	kunmap_atomic(newpage, KM_REMAP0);
+	ClearPageActive(page);
+	__put_page(page);
+	put_page(page);
+
+	/* We are done. Finish and let the waiters run. */
+	SetPageUptodate(newpage);
+	/* XXX locking order correct? */
+	zone = page_zone(newpage);
+	spin_lock_irq(&zone->lru_lock);
+	if (PageActive(newpage)) {
+		list_add(&newpage->lru, &zone->active_list);
+		zone->nr_active++;
+	} else {
+		list_add(&newpage->lru, &zone->inactive_list);
+		zone->nr_inactive++;
+	}
+	SetPageLRU(newpage);
+	spin_unlock_irq(&zone->lru_lock);
+	unlock_page(newpage);
+	page_cache_release(newpage);
+	return 0;
+
+unmapfail:
+	/*
+	 * Try to unwind by notifying waiters.  If someone misbehaves,
+	 * we die.
+	 */
+	error = radix_tree_preload(GFP_KERNEL);
+	if (error)
+		BUG();
+	/* should {__add_to,__remove_from}_page_cache be used instead? */
+	spin_lock(&mapping->page_lock);
+	/* list_del(&newpage->list); */
+	if (radix_tree_delete(&mapping->page_tree, page->index) == NULL)
+		/* Hold extra count to handle truncate */
+		page_cache_get(newpage);
+	radix_tree_insert(&mapping->page_tree, page->index, page);
+	/* no page_cache_get(page); needed */
+	radix_tree_preload_end();
+	spin_unlock(&mapping->page_lock);
+
+	SetPageAgain(newpage);
+	/* XXX unmap needed?  No, it shouldn't.  Handled by fault handlers. */
+	unlock_page(newpage);
+
+	waitcnt = 1;
+	for(; page_count(newpage) > 2; waitcnt++) {
+		current->state = TASK_INTERRUPTIBLE;
+		schedule_timeout(1);
+		if ((waitcnt % 10000) == 0) {
+			printk("You are hosed.\n");
+			printk("newpage %p\n", newpage);
+			BUG();
+		}
+	}
+	BUG_ON(PageUptodate(newpage));
+	ClearPageDirty(newpage);
+	ClearPageActive(newpage);
+	spin_lock(&mapping->page_lock);
+	newpage->mapping = NULL;
+	if (page_count(newpage) == 1) {
+		printk("newpage %p truncated. page %p\n", newpage, page);
+		BUG();
+	}
+	list_del(&newpage->list);
+	spin_unlock(&mapping->page_lock);
+	unlock_page(page);
+	__put_page(newpage);
+	__free_page(newpage);
+	return 1;
+	
+radixfail:
+	unlock_page(page);
+	unlock_page(newpage);
+	__free_page(newpage);
+	return 1;
+}
+
+static struct work_struct lru_drain_wq[NR_CPUS];
+static void
+lru_drain_schedule(void *p)
+{
+	int cpu = get_cpu();
+
+	schedule_work(&lru_drain_wq[cpu]);
+	put_cpu();
+}
+
+atomic_t remapd_count;
+int remapd(void *p)
+{
+	struct zone *zone = p;
+	struct page *page, *page1;
+	struct list_head *l;
+	int active, i, nr_failed = 0;
+	int fastmode = 100;
+	LIST_HEAD(failedp);
+
+	daemonize("remap%lu", zone->zone_start_pfn);
+	if (atomic_read(&remapd_count) > 0) {
+		printk("remapd already running\n");
+		return 0;
+	}
+	atomic_inc(&remapd_count);
+	on_each_cpu(lru_drain_schedule, NULL, 1, 1);
+	while(nr_failed < 100) {
+		spin_lock_irq(&zone->lru_lock);
+		for(active = 0; active < 2; active++) {
+			l = active ? &zone->active_list :
+			    &zone->inactive_list;
+			for(i = 0; ! list_empty(l) && i < 10; i++) {
+				page = list_entry(l->prev, struct page, lru);
+				if (fastmode && PageLocked(page)) {
+					page1 = page;
+					while (fastmode && PageLocked(page)) {
+						page =
+						    list_entry(page->lru.prev,
+						    struct page, lru);
+						fastmode--;
+						if (&page->lru == l) {
+							/* scanned the whole
+							   list */
+							page = page1;
+							break;
+						}
+						if (page == page1)
+							BUG();
+					}
+					if (! fastmode) {
+						printk("used up fastmode\n");
+						page = page1;
+					}
+				}
+				if (! TestClearPageLRU(page))
+					BUG();
+				list_del(&page->lru);
+				if (page_count(page) == 0) {
+					/* the page is in pagevec_release();
+					   shrink_cache says so. */
+					SetPageLRU(page);
+					list_add(&page->lru, l);
+					continue;
+				}
+				if (active)
+					zone->nr_active--;
+				else
+					zone->nr_inactive--;
+				page_cache_get(page);
+				spin_unlock_irq(&zone->lru_lock);
+				goto got_page;
+			}
+		}
+		spin_unlock_irq(&zone->lru_lock);
+		break;
+
+	got_page:
+		if (remap_onepage(page)) {
+			nr_failed++;
+			list_add(&page->lru, &failedp);
+		}
+	}
+	if (list_empty(&failedp))
+		goto out;
+
+	while (! list_empty(&failedp)) {
+		spin_lock_irq(&zone->lru_lock);
+		page = list_entry(failedp.prev, struct page, lru);
+		list_del(&page->lru);
+		if (PageActive(page)) {
+			list_add(&page->lru, &zone->active_list);
+			zone->nr_active++;
+		} else {
+			list_add(&page->lru, &zone->inactive_list);
+			zone->nr_inactive++;
+		}
+		if (TestSetPageLRU(page))
+			BUG();
+		spin_unlock_irq(&zone->lru_lock);
+		page_cache_release(page);
+	}
+out:
+	atomic_dec(&remapd_count);
+	return 0;
+}
+			
+static int __init remapd_init(void)
+{
+	int i;
+
+	for(i = 0; i < NR_CPUS; i++)
+		INIT_WORK(&lru_drain_wq[i], lru_add_drain, NULL);
+	return 0;
+}
+
+module_init(remapd_init);
+#endif
 
 /*
  * A zone is low on free memory, so wake its kswapd task to service it.

* Re: memory hotremove prototype, take 3
  2003-12-01  3:41 memory hotremove prototype, take 3 IWAMOTO Toshihiro
@ 2003-12-01 19:56 ` Pavel Machek
  2003-12-03 19:41 ` Martin J. Bligh
  1 sibling, 0 replies; 10+ messages in thread
From: Pavel Machek @ 2003-12-01 19:56 UTC (permalink / raw)
  To: IWAMOTO Toshihiro; +Cc: linux-kernel, linux-mm

Hi!

> this is a new version of my memory hotplug prototype patch, against
> linux-2.6.0-test11.
> 
> Freeing 100% of a specified memory zone is non-trivial and necessary
> for memory hot removal.  This patch splits memory into 1GB zones, and
> implements complete zone memory freeing using kswapd or "remapping".
> 
> A bit more detailed explanation and some test scripts are at:
> 	http://people.valinux.co.jp/~iwamoto/mh.html

I scanned it...

Hot unplug seems cool... How do you deal with kernel data structures in
memory "to be removed"?  Or do you simply not allow kmalloc() to
allocate there?

During hot unplug, you copy pages to a new location.  Would it simplify
the code if you forced them to be swapped out instead?  [Yep, it would
be slower...]
								Pavel
-- 
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]

* Re: memory hotremove prototype, take 3
  2003-12-01  3:41 memory hotremove prototype, take 3 IWAMOTO Toshihiro
  2003-12-01 19:56 ` Pavel Machek
@ 2003-12-03 19:41 ` Martin J. Bligh
  2003-12-04  3:58   ` IWAMOTO Toshihiro
  1 sibling, 1 reply; 10+ messages in thread
From: Martin J. Bligh @ 2003-12-03 19:41 UTC (permalink / raw)
  To: IWAMOTO Toshihiro, linux-kernel, linux-mm

> this is a new version of my memory hotplug prototype patch, against
> linux-2.6.0-test11.
> 
> Freeing 100% of a specified memory zone is non-trivial and necessary
> for memory hot removal.  This patch splits memory into 1GB zones, and
> implements complete zone memory freeing using kswapd or "remapping".
> 
> A bit more detailed explanation and some test scripts are at:
> 	http://people.valinux.co.jp/~iwamoto/mh.html
> 
> Main changes against previous versions are:
> - Stability is greatly improved.  Kernel crashes (probably related
>   to kswapd) still happen, but they are now rare enough that I'm
>   having difficulty reproducing them.
>   Page remapping under a simultaneous tar + rm -rf works.
> - Implemented a solution to a deadlock caused by ext2_rename, which
>   increments the refcount of a directory page twice.
> 
> Questions and comments are welcome.

I really think that doing this over zones and pgdats isn't the best approach.
You're going to make memory allocation and reclaim vastly less efficient,
and you're exposing a bunch of very specialised code inside the main
memory paths. 

Have you looked at Daniel's CONFIG_NONLINEAR stuff? That provides a much
cleaner abstraction for getting rid of discontiguous memory in the non
truly-NUMA case, and should work really well for doing mem hot add / remove
as well.

M.

PS. What's this bit of the patch for?

 void *vmalloc(unsigned long size)
 {
+#ifdef CONFIG_MEMHOTPLUGTEST
+       return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+#else
        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+#endif
 }

* Re: memory hotremove prototype, take 3
  2003-12-03 19:41 ` Martin J. Bligh
@ 2003-12-04  3:58   ` IWAMOTO Toshihiro
  2003-12-04  5:38     ` Martin J. Bligh
  0 siblings, 1 reply; 10+ messages in thread
From: IWAMOTO Toshihiro @ 2003-12-04  3:58 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: IWAMOTO Toshihiro, linux-kernel, linux-mm

At Wed, 03 Dec 2003 11:41:01 -0800,
Martin J. Bligh <mbligh@aracnet.com> wrote:
> 
> > this is a new version of my memory hotplug prototype patch, against
> > linux-2.6.0-test11.
> > 
> > Freeing 100% of a specified memory zone is non-trivial and necessary
> > for memory hot removal.  This patch splits memory into 1GB zones, and
> > implements complete zone memory freeing using kswapd or "remapping".
> > 
> > A bit more detailed explanation and some test scripts are at:
> > 	http://people.valinux.co.jp/~iwamoto/mh.html
> > 
> > Main changes against previous versions are:
> > - Stability is greatly improved.  Kernel crashes (probably related
> >   to kswapd) still happen, but they are now rare enough that I'm
> >   having difficulty reproducing them.
> >   Page remapping under a simultaneous tar + rm -rf works.
> > - Implemented a solution to a deadlock caused by ext2_rename, which
> >   increments the refcount of a directory page twice.
> > 
> > Questions and comments are welcome.
> 
> I really think that doing this over zones and pgdats isn't the best approach.
> You're going to make memory allocation and reclaim vastly less efficient,
> and you're exposing a bunch of very specialised code inside the main
> memory paths. 

I used the discontigmem code because this is what we have now.
My hacks such as zone_active[] will go away when the memory hot add
code (on which Goto-san is working) is ready.

> Have you looked at Daniel's CONFIG_NONLINEAR stuff? That provides a much
> cleaner abstraction for getting rid of discontiguous memory in the non
> truly-NUMA case, and should work really well for doing mem hot add / remove
> as well.

Thanks for pointing that out.  I looked at the patch.
It should be doable to make my patch work with the CONFIG_NONLINEAR
code.  For my code to work, basically the following functionalities
are necessary:
1. disabling alloc_page from the area being hot-removed, and
2. enumerating the pages in use in that area (see the sketch below).
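
A minimal sketch of both hooks, for illustration only (zone_activep()
and zone_mem_map are the names used in the patch above):

	/* 1. allocation side, as in the __alloc_pages() hunks above:
	 *    skip any zone that has been disabled for removal. */
	static inline int zone_allocatable(const struct zone *z)
	{
		return zone_activep(z);
	}

	/* 2. enumeration side: count pages still in use in a zone,
	 *    much like the "inuse" command in mhtest_write(). */
	static unsigned long zone_pages_in_use(struct zone *z)
	{
		unsigned long pfn, busy = 0;

		for (pfn = 0; pfn < z->spanned_pages; pfn++)
			if (page_count(&z->zone_mem_map[pfn]))
				busy++;
		return busy;
	}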

My target is somewhat NUMA-ish and fairly large.  So I'm not sure if
CONFIG_NONLINEAR fits, but CONFIG_NUMA isn't perfect either.


> PS. What's this bit of the patch for?
> 
>  void *vmalloc(unsigned long size)
>  {
> +#ifdef CONFIG_MEMHOTPLUGTEST
> +       return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
> +#else
>         return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
> +#endif
>  }

This is necessary because kernel memory cannot be swapped out.
Only highmem can be hot removed, though it doesn't need to be highmem.
We can define another zone attribute such as GFP_HOTPLUGGABLE.
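
A hypothetical sketch of that idea (made-up flag name and value; the
point is only that kernel allocations never carry it):

	/* made-up bit; real __GFP_* values are assigned in gfp.h */
	#define __GFP_HOTREMOVABLE	0x800u

	/* user/pagecache allocations may land in removable zones... */
	page = alloc_page(GFP_HIGHUSER | __GFP_HOTREMOVABLE);

	/* ...kernel mappings never do, hence the vmalloc change above. */
	addr = __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);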

--
IWAMOTO Toshihiro

* Re: memory hotremove prototype, take 3
  2003-12-04  3:58   ` IWAMOTO Toshihiro
@ 2003-12-04  5:38     ` Martin J. Bligh
  2003-12-04 15:44       ` IWAMOTO Toshihiro
  0 siblings, 1 reply; 10+ messages in thread
From: Martin J. Bligh @ 2003-12-04  5:38 UTC (permalink / raw)
  To: IWAMOTO Toshihiro; +Cc: linux-kernel, linux-mm

> I used the discontigmem code because this is what we have now.
> My hacks such as zone_active[] will go away when the memory hot add
> code (on which Goto-san is working) is ready.

I understand that, but it'd be much cleaner (and more likely to get
accepted) to do it the other way.
 
>> Have you looked at Daniel's CONFIG_NONLINEAR stuff? That provides a much
>> cleaner abstraction for getting rid of discontiguous memory in the non
>> truly-NUMA case, and should work really well for doing mem hot add / remove
>> as well.
> 
> Thanks for pointing that out.  I looked at the patch.
> It should be doable to make my patch work with the CONFIG_NONLINEAR
> code.  For my code to work, basically the following functionalities
> are necessary:
> 1. disabling alloc_page from the area being hot-removed, and
> 2. enumerating the pages in use in that area.
> 
> My target is somewhat NUMA-ish and fairly large.  So I'm not sure if
> CONFIG_NONLINEAR fits, but CONFIG_NUMA isn't perfect either.

If your target is NUMA, then you really, really need CONFIG_NONLINEAR.
We don't support multiple pgdats per node, nor do I wish to, as it'll
make an unholy mess ;-). With CONFIG_NONLINEAR, the discontiguities
within a node are buried down further, so we have much less complexity
to deal with from the main VM. The abstraction also keeps the poor
VM engineers trying to read / write the code saner via simplicity ;-)

WRT generic discontigmem support (not NUMA), doing that via pgdats
should really go away, as there's no real difference between the 
chunks of physical memory as far as the page allocator is concerned.
The plan is to use Daniel's nonlinear stuff to replace that, and keep
the pgdats strictly for NUMA. Same would apply to hotpluggable zones - 
I'd hate to end up with 512 pgdats of stuff that are really all the
same memory types underneath.

The real issue you have is the mapping of the struct pages - if we can
achieve a non-contig mapping of the mem_map / lmem_map array, we should
be able to take memory on and offline reasonably easily. If you're willing
for a first implementation to pre-allocate the struct page array for 
every possible virtual address, it makes life a lot easier.

Adding the other layer of indirection for accessing the struct page array
should fix up most of that, and is very easily abstracted out via the
pfn_to_page macros and friends. I ripped out all the direct references
to mem_map indexing already in 2.6, so it should all be nicely 
abstracted out.
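
Roughly the indirection I have in mind (hypothetical names and section
size, purely to show the shape):

	/* one mem_map fragment per fixed-size section of physical memory */
	#define SECTION_SHIFT		27	/* hypothetical 128MB sections */
	#define PFN_SECTION_SHIFT	(SECTION_SHIFT - PAGE_SHIFT)
	#define NR_SECTIONS		(1UL << (32 - SECTION_SHIFT))

	extern struct page *section_mem_map[NR_SECTIONS];

	#define pfn_to_page(pfn)					\
		(section_mem_map[(pfn) >> PFN_SECTION_SHIFT] +		\
		 ((pfn) & ((1UL << PFN_SECTION_SHIFT) - 1)))

Onlining or offlining a section then means populating or tearing down
one section_mem_map slot, without carving up pgdats.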

>> PS. What's this bit of the patch for?
>> 
>>  void *vmalloc(unsigned long size)
>>  {
>> +#ifdef CONFIG_MEMHOTPLUGTEST
>> +       return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
>> +#else
>>         return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
>> +#endif
>>  }
> 
> This is necessary because kernel memory cannot be swapped out.
> Only highmem can be hot removed, though it doesn't need to be highmem.
> We can define another zone attribute such as GFP_HOTPLUGGABLE.

You could just lock the pages, I'd think? I don't see at a glance
exactly what you were using this for, but would that work?

M.


* Re: memory hotremove prototype, take 3
  2003-12-04  5:38     ` Martin J. Bligh
@ 2003-12-04 15:44       ` IWAMOTO Toshihiro
  2003-12-04 17:12         ` Martin J. Bligh
  2003-12-04 18:27         ` Jesse Barnes
  0 siblings, 2 replies; 10+ messages in thread
From: IWAMOTO Toshihiro @ 2003-12-04 15:44 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: IWAMOTO Toshihiro, linux-kernel, linux-mm

At Wed, 03 Dec 2003 21:38:54 -0800,
Martin J. Bligh <mbligh@aracnet.com> wrote:
> > My target is somewhat NUMA-ish and fairly large.  So I'm not sure if
> > CONFIG_NONLINEAR fits, but CONFIG_NUMA isn't perfect either.
> 
> If your target is NUMA, then you really, really need CONFIG_NONLINEAR.
> We don't support multiple pgdats per node, nor do I wish to, as it'll
> make an unholy mess ;-). With CONFIG_NONLINEAR, the discontiguities
> within a node are buried down further, so we have much less complexity
> to deal with from the main VM. The abstraction also keeps the poor
> VM engineers trying to read / write the code saner via simplicity ;-)

IIRC, memory is contiguous within a NUMA node.  I think Goto-san will
clarify this issue when his code gets ready. :-)

> WRT generic discontigmem support (not NUMA), doing that via pgdats
> should really go away, as there's no real difference between the 
> chunks of physical memory as far as the page allocator is concerned.
> The plan is to use Daniel's nonlinear stuff to replace that, and keep
> the pgdats strictly for NUMA. Same would apply to hotpluggable zones - 
> I'd hate to end up with 512 pgdats of stuff that are really all the
> same memory types underneath.

Yes. Unnecessary zone rebalancing would suck.

> The real issue you have is the mapping of the struct pages - if we can
> acheive a non-contig mapping of the mem_map / lmem_map array, we should
> be able to take memory on and offline reasonably easy. If you're willing
> for a first implementation to pre-allocate the struct page array for 
> every possible virtual address, it makes life a lot easier.

Preallocating the struct page array isn't feasible for the target
system because the max memory / min memory ratio is large.
Our plan is to use the beginning (or the end) of the memory block being
hotplugged.  If a 2GB memory block is added, the first ~20MB is used
for the struct page array covering the rest of the block.
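
(Back of the envelope, assuming 4KB pages and a struct page of roughly
40 bytes on this kernel:

	2GB block:  (2 << 30) / 4096 = 524288 pages
	mem_map:    524288 * ~40 bytes = ~20MB

which is where the ~20MB figure comes from.)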


> >> PS. What's this bit of the patch for?
> >> 
> >>  void *vmalloc(unsigned long size)
> >>  {
> >> +#ifdef CONFIG_MEMHOTPLUGTEST
> >> +       return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
> >> +#else
> >>         return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
> >> +#endif
> >>  }
> > 
> > This is necessary because kernel memory cannot be swapped out.
> > Only highmem can be hot removed, though it doesn't need to be highmem.
> > We can define another zone attribute such as GFP_HOTPLUGGABLE.
> 
> You could just lock the pages, I'd think? I don't see at a glance
> exactly what you were using this for, but would that work?

I haven't seriously considered implementing it for vmalloc'd memory,
but I guess that would be too complicated, if not impossible.
Making kernel threads or interrupt handlers block on memory access
sounds very difficult to me.

--
IWAMOTO Toshihiro

* Re: memory hotremove prototype, take 3
  2003-12-04 15:44       ` IWAMOTO Toshihiro
@ 2003-12-04 17:12         ` Martin J. Bligh
  2003-12-04 18:27         ` Jesse Barnes
  1 sibling, 0 replies; 10+ messages in thread
From: Martin J. Bligh @ 2003-12-04 17:12 UTC (permalink / raw)
  To: IWAMOTO Toshihiro; +Cc: linux-kernel, linux-mm

>> > My target is somewhat NUMA-ish and fairly large.  So I'm not sure if
>> > CONFIG_NONLINEAR fits, but CONFIG_NUMA isn't perfect either.
>> 
>> If your target is NUMA, then you really, really need CONFIG_NONLINEAR.
>> We don't support multiple pgdats per node, nor do I wish to, as it'll
>> make an unholy mess ;-). With CONFIG_NONLINEAR, the discontiguities
>> within a node are buried down further, so we have much less complexity
>> to deal with from the main VM. The abstraction also keeps the poor
>> VM engineers trying to read / write the code saner via simplicity ;-)
> 
> IIRC, memory is contiguous within a NUMA node.  I think Goto-san will
> clarify this issue when his code gets ready. :-)

Right - but then you can't use discontigmem's multiple pgdats inside
a node to implement hotplug mem for NUMA systems.
 
> Preallocating the struct page array isn't feasible for the target
> system because the max memory / min memory ratio is large.
> Our plan is to use the beginning (or the end) of the memory block being
> hotplugged.  If a 2GB memory block is added, the first ~20MB is used
> for the struct page array covering the rest of the block.

Right - that makes perfect sense, it just has 2 problems:

1) You end up with a discontiguous mem_map array (fixable by adding a layer
of indirection in the wrapped macros).
2) On 32-bit, it's going to make a mess, as you need to map mem_map
inside the permanently mapped kernel area (aka ZONE_NORMAL+vmalloc space
except in a kind of weird corner case I created with remap_numa_kva,
which creates a no-man's land of permanently mapped kernel memory 
between ZONE_NORMAL and VMALLOC_RESERVE area for the remapped 
lmem_maps from the other nodes).

>> You could just lock the pages, I'd think? I don't see at a glance
>> exactly what you were using this for, but would that work?
> 
> I haven't seriously considered implementing it for vmalloc'd memory,
> but I guess that would be too complicated, if not impossible.
> Making kernel threads or interrupt handlers block on memory access
> sounds very difficult to me.

Aahh, maybe I understand now. You're saying you don't support hotplugging
ZONE_NORMAL, so you want to restrict vmalloc allocations to the non-hotplugged
areas? In which case things like HIGHPTE will be a nightmare as well ... ;-)
You also need to be very wary of where memlocked pages are allocated from.

M.


* Re: memory hotremove prototype, take 3
  2003-12-04 15:44       ` IWAMOTO Toshihiro
  2003-12-04 17:12         ` Martin J. Bligh
@ 2003-12-04 18:27         ` Jesse Barnes
  2003-12-04 18:29           ` Martin J. Bligh
  1 sibling, 1 reply; 10+ messages in thread
From: Jesse Barnes @ 2003-12-04 18:27 UTC (permalink / raw)
  To: IWAMOTO Toshihiro; +Cc: Martin J. Bligh, linux-kernel, linux-mm

On Fri, Dec 05, 2003 at 12:44:06AM +0900, IWAMOTO Toshihiro wrote:
> IIRC, memory is contiguous within a NUMA node.  I think Goto-san will
> clarify this issue when his code gets ready. :-)

Not on all systems.  On sn2 we use ia64's virtual memmap to make memory
within a node appear contiguous, even though it may not be.
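
Schematically: pfn_to_page stays simple arithmetic into one huge
virtually contiguous array, and only the slices of that array covering
real memory are backed by physical pages.  A rough sketch:

	/* ia64 CONFIG_VIRTUAL_MEM_MAP, roughly: holes in vmem_map cost
	 * only virtual address space, as they are never faulted in. */
	extern struct page *vmem_map;

	#define pfn_to_page(pfn)	(vmem_map + (pfn))
	#define page_to_pfn(page)	((unsigned long)((page) - vmem_map))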

Jesse

* Re: memory hotremove prototype, take 3
  2003-12-04 18:27         ` Jesse Barnes
@ 2003-12-04 18:29           ` Martin J. Bligh
  2003-12-04 18:59             ` Jesse Barnes
  0 siblings, 1 reply; 10+ messages in thread
From: Martin J. Bligh @ 2003-12-04 18:29 UTC (permalink / raw)
  To: Jesse Barnes, IWAMOTO Toshihiro; +Cc: linux-kernel, linux-mm

>> IIRC, memory is contiguous within a NUMA node.  I think Goto-san will
>> clarify this issue when his code gets ready. :-)
> 
> Not on all systems.  On sn2 we use ia64's virtual memmap to make memory
> within a node appear contiguous, even though it may not be.

Wasn't there a plan to get rid of that though? I forget what it was,
probably using CONFIG_NONLINEAR too ... ?

M.


* Re: memory hotremove prototype, take 3
  2003-12-04 18:29           ` Martin J. Bligh
@ 2003-12-04 18:59             ` Jesse Barnes
  0 siblings, 0 replies; 10+ messages in thread
From: Jesse Barnes @ 2003-12-04 18:59 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: IWAMOTO Toshihiro, linux-kernel, linux-mm

On Thu, Dec 04, 2003 at 10:29:53AM -0800, Martin J. Bligh wrote:
> >> IIRC, memory is contiguous within a NUMA node.  I think Goto-san will
> >> clarify this issue when his code gets ready. :-)
> > 
> > Not on all systems.  On sn2 we use ia64's virtual memmap to make memory
> > within a node appear contiguous, even though it may not be.
> 
> Wasn't there a plan to get rid of that though? I forget what it was,
> probably using CONFIG_NONLINEAR too ... ?

I think CONFIG_NONLINEAR would do the trick, but no one's done the
work yet :)

Jesse
