From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Tue, 04 Nov 2003 19:29:43 +0900
From: IWAMOTO Toshihiro
Subject: memory hotremove prototype, take 2
MIME-Version: 1.0 (generated by SEMI 1.14.3 - "Ushinoya")
Content-Type: text/plain; charset=US-ASCII
Message-Id: <20031104102943.898A67007B@sv1.valinux.co.jp>
Sender: owner-linux-mm@kvack.org
Return-Path:
To: linux-mm@kvack.org, lhms-devel@lists.sourceforge.net
List-ID:

Hi,

As you may know, I'm working on memory hotplug.
(See http://marc.theaimsgroup.com/?l=linux-kernel&m=106637967926960
for my original patch.)

I fixed several fatal bugs in the original patch and it works much
better.  The updated version is included in this mail.
I confirmed a successful "make -j4" cross-build of the NetBSD libc
while rotating active and inactive zones and remapping pages of
inactive zones.

However, I discovered that my page remapping approach has a fatal
flaw: ext2_rename caused a deadlock.  Let me explain the situation.

To put it simply, what my patch does is:

    for each page (called "oldpage" hereafter) do
	1. allocate "newpage" as a replacement
	2. increment oldpage's page count
	3. rewrite oldpage's radix tree entry with that of newpage, so
	   that find_get_page and its friends return newpage
	4. wait until page_count(oldpage) drops to 1
	5. copy oldpage's content to newpage, SetPageUptodate(newpage),
	   unlock_page(newpage)
	6. oldpage can be freed

ext2_rename does:

	old_de = ext2_find_entry (old_dir, old_dentry, &old_page);
		:
		:
	err = ext2_add_link(new_dentry, old_inode);

ext2_find_entry increments old_page's page count, and ext2_add_link
finds and locks a directory page.  These two pages can be the same if
a file is renamed within a directory.  If the radix tree gets
rewritten in the meantime, a deadlock happens.

To solve this problem, I think I must implement one of the following.

1. Make step 4 time out and retry the page remapping.  A timeout
   means the "newpage" returned by find_get_page is obsolete and the
   page should be looked up again.  To achieve this, a page flag, say
   PG_again, could be used to notify callers.  Every portion of the
   kernel which calls find_get_page needs modification.

2. Record which processes incremented the page count and return
   oldpage when asked by such a process, to avoid the deadlock.

I'm going to try solution 1 (a rough sketch of what a caller would
have to do is appended just before the patch), but I hope there is a
better solution.  If you have any idea or comments, let me know.
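For illustration only, here is a minimal sketch of the caller-side
retry that solution 1 implies.  PG_again and the PageAgain() test are
assumptions of the sketch; they do not exist in the patch below or in
the stock kernel.  The point is only that a page reference obtained
from the radix tree may have been obsoleted by remapping and must be
dropped and looked up again.

#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Hypothetical wrapper for solution 1.  PageAgain() is assumed to test
 * a new PG_again flag which the remap code would set on a page whose
 * radix tree slot is no longer valid.
 */
static struct page *
find_get_page_retry(struct address_space *mapping, unsigned long index)
{
	struct page *page;

	for (;;) {
		page = find_get_page(mapping, index);
		if (page == NULL)
			return NULL;
		if (!PageAgain(page))
			return page;
		/*
		 * The radix tree entry was rewritten while we slept;
		 * this reference is stale.  Drop it and repeat the
		 * lookup to get the replacement page.
		 */
		page_cache_release(page);
	}
}

Every find_get_page()/find_lock_page() caller (or those functions
themselves) would need equivalent handling, which is why I'd prefer a
less intrusive idea if anyone has one.
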
$Id: memoryhotplug.patch,v 1.10 2003/10/31 09:54:35 iwamoto Exp $ diff -dpur linux-2.6.0-test9-kdb/arch/i386/Kconfig linux-2.6.0-test9-kdb-mh/arch/i386/Kconfig --- linux-2.6.0-test9-kdb/arch/i386/Kconfig Thu Oct 30 11:14:47 2003 +++ linux-2.6.0-test9-kdb-mh/arch/i386/Kconfig Thu Oct 30 12:11:36 2003 @@ -706,14 +706,18 @@ comment "NUMA (NUMA-Q) requires SMP, 64G comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI" depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY) +config MEMHOTPLUGTEST + bool "Memory hotplug test" + default n + config DISCONTIGMEM bool - depends on NUMA + depends on NUMA || MEMHOTPLUGTEST default y config HAVE_ARCH_BOOTMEM_NODE bool - depends on NUMA + depends on NUMA || MEMHOTPLUGTEST default y config HIGHPTE diff -dpur linux-2.6.0-test9-kdb/arch/i386/mm/discontig.c linux-2.6.0-test9-kdb-mh/arch/i386/mm/discontig.c --- linux-2.6.0-test9-kdb/arch/i386/mm/discontig.c Sun Oct 26 03:43:49 2003 +++ linux-2.6.0-test9-kdb-mh/arch/i386/mm/discontig.c Thu Oct 30 14:58:10 2003 @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -111,6 +112,49 @@ int __init get_memcfg_numa_flat(void) return 1; } +int __init get_memcfg_numa_blks(void) +{ + int i, pfn; + + printk("NUMA - single node, flat memory mode, but broken in several blocks\n"); + + /* Run the memory configuration and find the top of memory. */ + find_max_pfn(); + if (max_pfn & (PTRS_PER_PTE - 1)) { + pfn = max_pfn & ~(PTRS_PER_PTE - 1); + printk("Rounding down maxpfn %d -> %d\n", max_pfn, pfn); + max_pfn = pfn; + } + for(i = 0; i < MAX_NUMNODES; i++) { + pfn = PFN_DOWN(1 << 30) * i; + node_start_pfn[i] = pfn; + pfn += PFN_DOWN(1 << 30); + if (pfn < max_pfn) + node_end_pfn[i] = pfn; + else { + node_end_pfn[i] = max_pfn; + i++; + printk("total %d blocks, max %d\n", i, max_pfn); + break; + } + } + + /* Fill in the physnode_map with our simplistic memory model, + * all memory is in node 0. + */ + for (pfn = node_start_pfn[0]; pfn <= max_pfn; + pfn += PAGES_PER_ELEMENT) + { + physnode_map[pfn / PAGES_PER_ELEMENT] = pfn / PFN_DOWN(1 << 30); + } + + /* Indicate there is one node available. 
*/ + node_set_online(0); + numnodes = i; + + return 1; +} + /* * Find the highest page frame number we have available for the node */ @@ -183,6 +227,8 @@ static void __init register_bootmem_low_ } } +static struct kcore_list numa_kc; + void __init remap_numa_kva(void) { void *vaddr; @@ -196,7 +242,11 @@ void __init remap_numa_kva(void) node_remap_start_pfn[node] + pfn, PAGE_KERNEL_LARGE); } + memset(node_remap_start_vaddr[node], 0, + node_remap_size[node] * PAGE_SIZE); } + kclist_add(&numa_kc, node_remap_start_vaddr[numnodes - 1], + node_remap_offset[numnodes - 1] << PAGE_SHIFT); } static unsigned long calculate_numa_remap_pages(void) diff -dpur linux-2.6.0-test9-kdb/include/asm-i386/kmap_types.h linux-2.6.0-test9-kdb-mh/include/asm-i386/kmap_types.h --- linux-2.6.0-test9-kdb/include/asm-i386/kmap_types.h Thu Oct 30 11:14:47 2003 +++ linux-2.6.0-test9-kdb-mh/include/asm-i386/kmap_types.h Thu Oct 30 12:22:27 2003 @@ -25,7 +25,13 @@ D(11) KM_IRQ1, D(12) KM_SOFTIRQ0, D(13) KM_SOFTIRQ1, D(14) KM_KDB, +#ifdef CONFIG_MEMHOTPLUGTEST +D(15) KM_REMAP0, +D(16) KM_REMAP1, +D(17) KM_TYPE_NR, +#else D(15) KM_TYPE_NR +#endif }; #undef D diff -dpur linux-2.6.0-test9-kdb/include/asm-i386/mmzone.h linux-2.6.0-test9-kdb-mh/include/asm-i386/mmzone.h --- linux-2.6.0-test9-kdb/include/asm-i386/mmzone.h Sun Oct 26 03:43:39 2003 +++ linux-2.6.0-test9-kdb-mh/include/asm-i386/mmzone.h Thu Oct 30 12:42:06 2003 @@ -128,6 +128,10 @@ static inline struct pglist_data *pfn_to #endif /* CONFIG_X86_NUMAQ */ extern int get_memcfg_numa_flat(void ); +#ifdef CONFIG_MEMHOTPLUGTEST +extern int get_memcfg_numa_blks(void); +#endif + /* * This allows any one NUMA architecture to be compiled * for, and still fall back to the flat function if it @@ -143,6 +147,10 @@ static inline void get_memcfg_numa(void) return; #endif +#ifdef CONFIG_MEMHOTPLUGTEST + get_memcfg_numa_blks(); + return; +#endif get_memcfg_numa_flat(); } diff -dpur linux-2.6.0-test9-kdb/include/asm-i386/numnodes.h linux-2.6.0-test9-kdb-mh/include/asm-i386/numnodes.h --- linux-2.6.0-test9-kdb/include/asm-i386/numnodes.h Sun Oct 26 03:43:02 2003 +++ linux-2.6.0-test9-kdb-mh/include/asm-i386/numnodes.h Thu Oct 30 12:32:27 2003 @@ -13,6 +13,10 @@ /* Max 8 Nodes */ #define NODES_SHIFT 3 +#elif defined(CONFIG_MEMHOTPLUGTEST) + +#define NODES_SHIFT 3 + #endif /* CONFIG_X86_NUMAQ */ #endif /* _ASM_MAX_NUMNODES_H */ diff -dpur linux-2.6.0-test9-kdb/include/linux/mm.h linux-2.6.0-test9-kdb-mh/include/linux/mm.h --- linux-2.6.0-test9-kdb/include/linux/mm.h Sun Oct 26 03:42:50 2003 +++ linux-2.6.0-test9-kdb-mh/include/linux/mm.h Thu Oct 30 12:11:37 2003 @@ -219,7 +219,14 @@ struct page { */ #define put_page_testzero(p) \ ({ \ - BUG_ON(page_count(p) == 0); \ + if (page_count(p) == 0) { \ + int i; \ + printk("Page: %lx ", (long)p); \ + for(i = 0; i < sizeof(struct page); i++) \ + printk(" %02x", ((unsigned char *)p)[i]); \ + printk("\n"); \ + BUG(); \ + } \ atomic_dec_and_test(&(p)->count); \ }) @@ -620,6 +627,12 @@ static inline void kernel_map_pages(struct page *page, int numpages, int enable) { } +#endif +#ifdef CONFIG_MEMHOTPLUGTEST +#define page_trace(p) page_trace_func(p, __FUNCTION__, __LINE__) +extern void page_trace_func(const struct page *, const char *, int); +#else +#define page_trace(p) do { } while(0) #endif #endif /* __KERNEL__ */ diff -dpur linux-2.6.0-test9-kdb/include/linux/mmzone.h linux-2.6.0-test9-kdb-mh/include/linux/mmzone.h --- linux-2.6.0-test9-kdb/include/linux/mmzone.h Sun Oct 26 03:43:49 2003 +++ linux-2.6.0-test9-kdb-mh/include/linux/mmzone.h Thu Oct 30 
12:11:37 2003 @@ -360,6 +360,10 @@ static inline unsigned int num_online_me return num; } +#ifdef CONFIG_MEMHOTPLUGTEST +int zone_activep(const struct zone *); +int remapd(void *p); +#endif #else /* !CONFIG_DISCONTIGMEM && !CONFIG_NUMA */ #define node_online(node) \ diff -dpur linux-2.6.0-test9-kdb/ipc/util.c linux-2.6.0-test9-kdb-mh/ipc/util.c --- linux-2.6.0-test9-kdb/ipc/util.c Sun Oct 26 03:43:27 2003 +++ linux-2.6.0-test9-kdb-mh/ipc/util.c Thu Oct 30 12:11:37 2003 @@ -324,6 +324,9 @@ void* ipc_rcu_alloc(int size) if (out) out += sizeof(struct ipc_rcu_kmalloc); } +#ifdef CONFIG_MEMHOTPLUGTEST + printk("ipc_rcu_alloc: %lx\n", (unsigned long)out); +#endif return out; } diff -dpur linux-2.6.0-test9-kdb/mm/memory.c linux-2.6.0-test9-kdb-mh/mm/memory.c --- linux-2.6.0-test9-kdb/mm/memory.c Thu Oct 30 11:27:52 2003 +++ linux-2.6.0-test9-kdb-mh/mm/memory.c Thu Oct 30 12:11:37 2003 @@ -420,6 +420,17 @@ zap_pte_range(struct mmu_gather *tlb, pm mark_page_accessed(page); tlb->freed++; page_remove_rmap(page, ptep); +#if 1 // debug + /* Validate page */ + { + struct zone *z = page_zone(page); + int idx = page - z->zone_mem_map; + if (idx < 0 || idx >= z->spanned_pages) { + printk("zap_pte_range: %d %d\n", page->flags >> ZONE_SHIFT, idx); + BUG(); + } + } +#endif tlb_remove_page(tlb, page); } } diff -dpur linux-2.6.0-test9-kdb/mm/page_alloc.c linux-2.6.0-test9-kdb-mh/mm/page_alloc.c --- linux-2.6.0-test9-kdb/mm/page_alloc.c Sun Oct 26 03:42:53 2003 +++ linux-2.6.0-test9-kdb-mh/mm/page_alloc.c Thu Oct 30 12:45:21 2003 @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -52,6 +53,10 @@ EXPORT_SYMBOL(nr_swap_pages); */ struct zone *zone_table[MAX_NR_ZONES*MAX_NUMNODES]; EXPORT_SYMBOL(zone_table); +#ifdef CONFIG_MEMHOTPLUGTEST +static char zone_active[MAX_NR_ZONES*MAX_NUMNODES]; +static const struct page *page_trace_list[10]; +#endif static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; int min_free_kbytes = 1024; @@ -257,6 +262,7 @@ free_pages_bulk(struct zone *zone, int c page = list_entry(list->prev, struct page, list); /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->list); + page_trace(page); __free_pages_bulk(page, base, zone, area, mask, order); ret++; } @@ -411,7 +417,9 @@ int is_head_of_free_region(struct page * spin_unlock_irqrestore(&zone->lock, flags); return 0; } +#endif +#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_MEMHOTPLUGTEST) /* * Spill all of this CPU's per-cpu pages back into the buddy allocator. */ @@ -512,9 +520,28 @@ static struct page *buffered_rmqueue(str mod_page_state(pgalloc, 1 << order); prep_new_page(page, order); } +#ifdef CONFIG_MEMHOTPLUGTEST + if (! zone_active[page->flags >> ZONE_SHIFT]) + printk("alloc_page from disabled zone: %p\n", page); +#endif return page; } +#ifdef CONFIG_MEMHOTPLUGTEST +int +zone_activep(const struct zone *z) +{ + int i; + + for(i = 0; ; i++) { + if (zone_table[i] == z) + return zone_active[i]; + if (zone_table[i] == NULL) + BUG(); + } +} +#endif + /* * This is the 'heart' of the zoned buddy allocator. * @@ -562,6 +589,10 @@ __alloc_pages(unsigned int gfp_mask, uns struct zone *z = zones[i]; unsigned long local_low; +#ifdef CONFIG_MEMHOTPLUGTEST + if (! zone_activep(z)) + continue; +#endif /* * This is the fabled 'incremental min'. We let real-time tasks * dip their real-time paws a little deeper into reserves. @@ -590,6 +621,10 @@ __alloc_pages(unsigned int gfp_mask, uns unsigned long local_min; struct zone *z = zones[i]; +#ifdef CONFIG_MEMHOTPLUGTEST + if (! 
zone_activep(z)) + continue; +#endif local_min = z->pages_min; if (gfp_mask & __GFP_HIGH) local_min >>= 2; @@ -613,6 +648,10 @@ rebalance: for (i = 0; zones[i] != NULL; i++) { struct zone *z = zones[i]; +#ifdef CONFIG_MEMHOTPLUGTEST + if (! zone_activep(z)) + continue; +#endif page = buffered_rmqueue(z, order, cold); if (page) goto got_pg; @@ -638,6 +677,10 @@ rebalance: for (i = 0; zones[i] != NULL; i++) { struct zone *z = zones[i]; +#ifdef CONFIG_MEMHOTPLUGTEST + if (! zone_activep(z)) + continue; +#endif min += z->pages_min; if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { @@ -676,6 +719,21 @@ nopage: return NULL; got_pg: kernel_map_pages(page, 1 << order, 1); +#if 1 // debug + /* Validate page */ + { + struct zone *z = page_zone(page); + int idx = page - z->zone_mem_map; + if (idx < 0 || idx >= z->spanned_pages) { + printk("%d %d\n", page->flags >> ZONE_SHIFT, idx); + BUG(); + } + } +#endif +#ifdef CONFIG_MEMHOTPLUGTEST + if (! zone_active[page->flags >> ZONE_SHIFT]) + BUG(); +#endif return page; } @@ -1076,6 +1134,9 @@ static int __init build_zonelists_node(p static void __init build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; +#ifdef CONFIG_MEMHOTPLUGTEST + struct zone *zone; +#endif local_node = pgdat->node_id; printk("Building zonelist for node : %d\n", local_node); @@ -1091,7 +1152,7 @@ static void __init build_zonelists(pg_da k = ZONE_HIGHMEM; if (i & __GFP_DMA) k = ZONE_DMA; - +#ifndef CONFIG_MEMHOTPLUGTEST j = build_zonelists_node(pgdat, zonelist, j, k); /* * Now we build the zonelist so that it contains the zones @@ -1107,6 +1168,23 @@ static void __init build_zonelists(pg_da j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); zonelist->zones[j++] = NULL; +#else + for(; k >= 0; k--) { + zone = pgdat->node_zones + k; + if (zone->present_pages) + zonelist->zones[j++] = zone; + for (node = local_node + 1; node < numnodes; node++) { + zone = NODE_DATA(node)->node_zones + k; + if (zone->present_pages) + zonelist->zones[j++] = zone; + } + for (node = 0; node < local_node; node++) { + zone = NODE_DATA(node)->node_zones + k; + if (zone->present_pages) + zonelist->zones[j++] = zone; + } + } +#endif } } @@ -1252,6 +1330,9 @@ static void __init free_area_init_core(s unsigned long batch; zone_table[nid * MAX_NR_ZONES + j] = zone; +#ifdef CONFIG_MEMHOTPLUGTEST + zone_active[nid * MAX_NR_ZONES + j] = 1; +#endif realsize = size = zones_size[j]; if (zholes_size) realsize -= zholes_size[j]; @@ -1295,8 +1376,8 @@ static void __init free_area_init_core(s pcp->batch = 1 * batch; INIT_LIST_HEAD(&pcp->list); } - printk(" %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); + printk(" %s zone: %lu pages, LIFO batch:%lu start:%lu\n", + zone_names[j], realsize, batch, zone_start_pfn); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); atomic_set(&zone->refill_counter, 0); @@ -1644,3 +1725,187 @@ int min_free_kbytes_sysctl_handler(ctl_t setup_per_zone_pages_min(); return 0; } + +#ifdef CONFIG_MEMHOTPLUGTEST +static int mhtest_read(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + char *p; + int i, len; + const struct zone *z; + + p = page; + for(i = 0; ; i++) { + z = zone_table[i]; + if (z == NULL) + break; + if (! z->present_pages) + /* skip empty zone */ + continue; + len = sprintf(p, "Zone %d: %sabled free %d, active %d, present %d\n", i, + zone_active[i] ? 
"en" : "dis", z->free_pages, z->nr_active, + z->present_pages); + p += len; + } + len = p - page; + + if (len <= off + count) + *eof = 1; + *start = page + off; + len -= off; + if (len < 0) + len = 0; + if (len > count) + len = count; + + return len; +} + +static int mhtest_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + unsigned long idx; + char buf[64], *p; + int i; + struct list_head *l; + + if (count > sizeof(buf) - 1) + count = sizeof(buf) - 1; + if (copy_from_user(buf, buffer, count)) + return -EFAULT; + + buf[count] = 0; + + p = strchr(buf, ' '); + if (p == NULL) + goto out; + + *p++ = '\0'; + idx = simple_strtoul(p, NULL, 0); + + if (strcmp(buf, "trace") == 0) { + for(i = 0; i < sizeof(page_trace_list) / + sizeof(page_trace_list[0]); i++) + if (page_trace_list[i] == NULL) { + page_trace_list[i] = (struct page *)idx; + printk("add trace %lx\n", (unsigned long)idx); + goto out; + } + printk("page_trace_list is full (not added)\n"); + goto out; + } else if (strcmp(buf, "untrace") == 0) { + for(i = 0; i < sizeof(page_trace_list) / + sizeof(page_trace_list[0]); i++) + if (page_trace_list[i] == (struct page *)idx) + break; + if (i == sizeof(page_trace_list) / sizeof(page_trace_list[0])) { + printk("not registered\n"); + goto out; + } + for(; i < sizeof(page_trace_list) / + sizeof(page_trace_list[0]) - 1; i++) + page_trace_list[i] = page_trace_list[i + 1]; + page_trace_list[i] = NULL; + goto out; + } + if (idx > MAX_NR_ZONES*MAX_NUMNODES) { + printk("Argument out of range\n"); + goto out; + } + if (strcmp(buf, "disable") == 0) { + printk("disable %d\n", idx); + /* XXX */ + for (i = 0; i < NR_CPUS; i++) { + struct per_cpu_pages *pcp; + + pcp = &zone_table[idx]->pageset[i].pcp[0]; /* hot */ + pcp->low = pcp->high = 0; + + pcp = &zone_table[idx]->pageset[i].pcp[1]; /* cold */ + pcp->low = pcp->high = 0; + } + zone_active[idx] = 0; + zone_table[idx]->pages_high = zone_table[idx]->present_pages; + } else if (strcmp(buf, "purge") == 0) { + if (zone_active[idx]) + printk("Zone %d still active (proceeding anyway)\n", + idx); + printk("purge %d\n", idx); + wake_up_interruptible(&zone_table[idx]->zone_pgdat->kswapd_wait); + /* XXX overkill, but who cares? 
*/ + on_each_cpu(drain_local_pages, NULL, 1, 1); + } else if (strcmp(buf, "enable") == 0) { + printk("enable %d\n", idx); + zone_active[idx] = 1; + zone_table[idx]->pages_high = + zone_table[idx]->pages_min * 3; + /* XXX */ + for (i = 0; i < NR_CPUS; i++) { + struct per_cpu_pages *pcp; + + pcp = &zone_table[idx]->pageset[i].pcp[0]; /* hot */ + pcp->low = 2 * pcp->batch; + pcp->high = 6 * pcp->batch; + + pcp = &zone_table[idx]->pageset[i].pcp[1]; /* cold */ + pcp->high = 2 * pcp->batch; + } + } else if (strcmp(buf, "remap") == 0) { + on_each_cpu(drain_local_pages, NULL, 1, 1); + kernel_thread(remapd, zone_table[idx], CLONE_KERNEL); + } else if (strcmp(buf, "active") == 0) { + if (zone_table[idx] == NULL) + goto out; + spin_lock_irq(&zone_table[idx]->lru_lock); + i = 0; + list_for_each(l, &zone_table[idx]->active_list) { + printk(" %lx", (unsigned long)list_entry(l, struct page, lru)); + i++; + if (i == 10) + break; + } + spin_unlock_irq(&zone_table[idx]->lru_lock); + printk("\n"); + } else if (strcmp(buf, "inuse") == 0) { + if (zone_table[idx] == NULL) + goto out; + for(i = 0; i < zone_table[idx]->spanned_pages; i++) + if (page_count(&zone_table[idx]->zone_mem_map[i])) + printk(" %lx", (unsigned long)&zone_table[idx]->zone_mem_map[i]); + printk("\n"); + } +out: + return count; +} + +static int __init procmhtest_init(void) +{ + struct proc_dir_entry *entry; + + entry = create_proc_entry("memhotplug", 0, NULL); + if (entry == NULL) + return -1; + + entry->read_proc = &mhtest_read; + entry->write_proc = &mhtest_write; + return 0; +} +__initcall(procmhtest_init); + +void +page_trace_func(const struct page *p, const char *func, int line) { + int i; + + for(i = 0; i < sizeof(page_trace_list) / + sizeof(page_trace_list[0]); i++) { + if (page_trace_list[i] == NULL) + return; + if (page_trace_list[i] == p) + break; + } + if (i == sizeof(page_trace_list) / sizeof(page_trace_list[0])) + return; + + printk("Page %lx, %s %d\n", (unsigned long)p, func, line); +} +#endif diff -dpur linux-2.6.0-test9-kdb/mm/shmem.c linux-2.6.0-test9-kdb-mh/mm/shmem.c --- linux-2.6.0-test9-kdb/mm/shmem.c Sun Oct 26 03:43:30 2003 +++ linux-2.6.0-test9-kdb-mh/mm/shmem.c Thu Oct 30 12:11:37 2003 @@ -80,7 +80,15 @@ static inline struct page *shmem_dir_all * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: * might be reconsidered if it ever diverges from PAGE_SIZE. 
*/ - return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT); +#ifdef CONFIG_MEMHOTPLUGTEST + struct page* p = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, + PAGE_CACHE_SHIFT-PAGE_SHIFT); + printk("shmem_dir_alloc: %lx\n", (unsigned long)p); + return p; +#else + return alloc_pages(gfp_mask & ~__GFP_HIGHMEM, + PAGE_CACHE_SHIFT-PAGE_SHIFT); +#endif } static inline void shmem_dir_free(struct page *page) diff -dpur linux-2.6.0-test9-kdb/mm/swap.c linux-2.6.0-test9-kdb-mh/mm/swap.c --- linux-2.6.0-test9-kdb/mm/swap.c Sun Oct 26 03:43:26 2003 +++ linux-2.6.0-test9-kdb-mh/mm/swap.c Thu Oct 30 12:11:37 2003 @@ -77,6 +77,7 @@ void activate_page(struct page *page) { struct zone *zone = page_zone(page); + page_trace(page); spin_lock_irq(&zone->lru_lock); if (PageLRU(page) && !PageActive(page)) { del_page_from_inactive_list(zone, page); @@ -189,6 +190,19 @@ void release_pages(struct page **pages, struct page *page = pages[i]; struct zone *pagezone; + if (page_count(page) == 0) { + struct zone **z = zone_table; + int idx; + while (*z) { + idx = page - (*z)->zone_mem_map; + if (idx >= 0 && idx < (*z)->spanned_pages) + break; + z++; + } + if (*z != NULL) + printk("Zone: %lx %d, index: %d\n", + (unsigned long)*z, z - zone_table, idx); + } if (PageReserved(page) || !put_page_testzero(page)) continue; @@ -251,6 +265,10 @@ void __pagevec_release_nonlru(struct pag BUG_ON(PageLRU(page)); if (put_page_testzero(page)) pagevec_add(&pages_to_free, page); +#ifdef CONFIG_MEMHOTPLUGTEST + else + printk("Page %lx disappearing\n", page); +#endif } pagevec_free(&pages_to_free); pagevec_reinit(pvec); diff -dpur linux-2.6.0-test9-kdb/mm/swap_state.c linux-2.6.0-test9-kdb-mh/mm/swap_state.c --- linux-2.6.0-test9-kdb/mm/swap_state.c Sun Oct 26 03:43:31 2003 +++ linux-2.6.0-test9-kdb-mh/mm/swap_state.c Thu Oct 30 12:11:37 2003 @@ -152,6 +152,7 @@ int add_to_swap(struct page * page) ClearPageDirty(page); set_page_dirty(page); INC_CACHE_INFO(add_total); + page_trace(page); return 1; case -EEXIST: /* Raced with "speculative" read_swap_cache_async */ @@ -161,6 +162,7 @@ int add_to_swap(struct page * page) default: /* -ENOMEM radix-tree allocation failure */ swap_free(entry); + page_trace(page); return 0; } } diff -dpur linux-2.6.0-test9-kdb/mm/vmalloc.c linux-2.6.0-test9-kdb-mh/mm/vmalloc.c --- linux-2.6.0-test9-kdb/mm/vmalloc.c Sun Oct 26 03:43:51 2003 +++ linux-2.6.0-test9-kdb-mh/mm/vmalloc.c Thu Oct 30 12:11:37 2003 @@ -447,7 +447,11 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { +#ifdef CONFIG_MEMHOTPLUGTEST + return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); +#else return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); +#endif } EXPORT_SYMBOL(vmalloc); diff -dpur linux-2.6.0-test9-kdb/mm/vmscan.c linux-2.6.0-test9-kdb-mh/mm/vmscan.c --- linux-2.6.0-test9-kdb/mm/vmscan.c Sun Oct 26 03:42:59 2003 +++ linux-2.6.0-test9-kdb-mh/mm/vmscan.c Fri Oct 31 15:34:58 2003 @@ -285,6 +285,8 @@ shrink_list(struct list_head *page_list, goto keep_locked; pte_chain_lock(page); + if ((! zone_activep(page_zone(page))) && page_mapped(page)) + page_referenced(page); referenced = page_referenced(page); if (referenced && page_mapping_inuse(page)) { /* In active use or really unfreeable. Activate it. */ @@ -310,6 +312,7 @@ shrink_list(struct list_head *page_list, } #endif /* CONFIG_SWAP */ + page_trace(page); /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. 
@@ -318,9 +321,11 @@ shrink_list(struct list_head *page_list, switch (try_to_unmap(page)) { case SWAP_FAIL: pte_chain_unlock(page); + page_trace(page); goto activate_locked; case SWAP_AGAIN: pte_chain_unlock(page); + page_trace(page); goto keep_locked; case SWAP_SUCCESS: ; /* try to free the page below */ @@ -367,6 +372,7 @@ shrink_list(struct list_head *page_list, .nonblocking = 1, .for_reclaim = 1, }; + page_trace(page); list_move(&page->list, &mapping->locked_pages); spin_unlock(&mapping->page_lock); @@ -410,12 +416,14 @@ shrink_list(struct list_head *page_list, * Otherwise, leave the page on the LRU so it is swappable. */ if (PagePrivate(page)) { + page_trace(page); if (!try_to_release_page(page, gfp_mask)) goto activate_locked; if (!mapping && page_count(page) == 1) goto free_it; } + page_trace(page); if (!mapping) goto keep_locked; /* truncate got there first */ @@ -431,6 +439,7 @@ shrink_list(struct list_head *page_list, goto keep_locked; } + page_trace(page); #ifdef CONFIG_SWAP if (PageSwapCache(page)) { swp_entry_t swap = { .val = page->index }; @@ -589,7 +598,7 @@ done: * But we had to alter page->flags anyway. */ static void -refill_inactive_zone(struct zone *zone, const int nr_pages_in, +refill_inactive_zone(struct zone *zone, int nr_pages_in, struct page_state *ps, int priority) { int pgmoved; @@ -607,6 +616,12 @@ refill_inactive_zone(struct zone *zone, lru_add_drain(); pgmoved = 0; +#ifdef CONFIG_MEMHOTPLUGTEST + if (! zone_activep(zone)) { + nr_pages = nr_pages_in = zone->present_pages - zone->free_pages; + printk("Purging active list of disabled zone\n"); + } +#endif spin_lock_irq(&zone->lru_lock); while (nr_pages && !list_empty(&zone->active_list)) { page = list_entry(zone->active_list.prev, struct page, lru); @@ -614,6 +629,7 @@ refill_inactive_zone(struct zone *zone, if (!TestClearPageLRU(page)) BUG(); list_del(&page->lru); + page_trace(page); if (page_count(page) == 0) { /* It is currently in pagevec_release() */ SetPageLRU(page); @@ -658,20 +674,30 @@ refill_inactive_zone(struct zone *zone, */ if (swap_tendency >= 100) reclaim_mapped = 1; +#ifdef CONFIG_MEMHOTPLUGTEST + if (! zone_activep(zone)) + reclaim_mapped = 1; +#endif while (!list_empty(&l_hold)) { page = list_entry(l_hold.prev, struct page, lru); list_del(&page->lru); if (page_mapped(page)) { pte_chain_lock(page); +#ifdef CONFIG_MEMHOTPLUGTEST + if (! zone_activep(zone)) + page_referenced(page); /* XXX */ +#endif if (page_mapped(page) && page_referenced(page)) { pte_chain_unlock(page); + page_trace(page); list_add(&page->lru, &l_active); continue; } pte_chain_unlock(page); if (!reclaim_mapped) { list_add(&page->lru, &l_active); + page_trace(page); continue; } } @@ -682,9 +708,11 @@ refill_inactive_zone(struct zone *zone, if (total_swap_pages == 0 && !page->mapping && !PagePrivate(page)) { list_add(&page->lru, &l_active); + page_trace(page); continue; } list_add(&page->lru, &l_inactive); + page_trace(page); } pagevec_init(&pvec, 1); @@ -767,6 +795,11 @@ shrink_zone(struct zone *zone, int max_s ratio = (unsigned long)nr_pages * zone->nr_active / ((zone->nr_inactive | 1) * 2); atomic_add(ratio+1, &zone->refill_counter); +#ifdef CONFIG_MEMHOTPLUGTEST + if (! 
zone_activep(zone)) + /* XXX */ + atomic_add(SWAP_CLUSTER_MAX, &zone->refill_counter); +#endif if (atomic_read(&zone->refill_counter) > SWAP_CLUSTER_MAX) { int count; @@ -1048,6 +1081,326 @@ int kswapd(void *p) balance_pgdat(pgdat, 0, &ps); } } + +#ifdef CONFIG_MEMHOTPLUGTEST +static void +print_buffer(struct page* page) +{ + struct address_space* mapping = page->mapping; + struct buffer_head *bh, *head; + + spin_lock(&mapping->private_lock); + bh = head = page_buffers(page); + printk("buffers:"); + do { + printk(" %x %d\n", bh->b_state, atomic_read(&bh->b_count)); + + bh = bh->b_this_page; + } while (bh != head); + printk("\n"); + spin_unlock(&mapping->private_lock); +} +/* try to remap a page. returns non-zero on failure */ +int remap_onepage(struct page *page) +{ + struct page *newpage; + struct zone *zone; + struct address_space *mapping = page->mapping; + char *np, *op; + int waitcnt, error = -1; + + newpage = alloc_page(GFP_HIGHUSER); + if (newpage == NULL) + return -ENOMEM; + if (TestSetPageLocked(newpage)) + BUG(); + lock_page(page); + + if (PagePrivate(page)) { + waitcnt = 100; + while (PageWriteback(page)) { + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(10); + if (! --waitcnt) + goto radixfail; + } + + /* XXX copied from shrink_list() */ + if (PageDirty(page) && + is_page_cache_freeable(page) && + mapping != NULL && + mapping->a_ops->writepage != NULL) { + spin_lock(&mapping->page_lock); + if (test_clear_page_dirty(page)) { + int res; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = SWAP_CLUSTER_MAX, + .nonblocking = 1, + .for_reclaim = 1, + }; + + list_move(&page->list, &mapping->locked_pages); + spin_unlock(&mapping->page_lock); + + SetPageReclaim(page); + res = mapping->a_ops->writepage(page, &wbc); + + if (res == WRITEPAGE_ACTIVATE) { + ClearPageReclaim(page); + goto radixfail; + } + if (!PageWriteback(page)) { + /* synchronous write or broken a_ops? */ + ClearPageReclaim(page); + } + lock_page(page); + goto waitbuffer; + } + spin_unlock(&mapping->page_lock); + } + + waitbuffer: + waitcnt = 100; + while (1) { + if (try_to_release_page(page, GFP_KERNEL)) + break; + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(10); + if (! --waitcnt) { + print_buffer(page); + goto radixfail; + } + } + } + if (mapping == NULL) { + /* The page is an anon page. Allocate swap entry. */ + /* ...but just bail for now */ + if (!add_to_swap(page)) + goto radixfail; + mapping = page->mapping; + } + error = radix_tree_preload(GFP_KERNEL); + if (error) + goto radixfail; + if (PagePrivate(page)) /* XXX */ + BUG(); + + newpage->flags |= page->flags & ~(1 << PG_uptodate) & + ~(1 << PG_highmem) & ~(~0UL << ZONE_SHIFT); + /* should {__add_to,__remove_from}_page_cache be used instead? 
*/ + spin_lock(&mapping->page_lock); + radix_tree_delete(&mapping->page_tree, page->index); + __put_page(page); + /* list_del(&page->list); XXX */ + radix_tree_insert(&mapping->page_tree, page->index, newpage); + page_cache_get(newpage); + radix_tree_preload_end(); + newpage->mapping = mapping; + newpage->index = page->index; + spin_unlock(&mapping->page_lock); + if (PageDirty(page)) + list_add(&newpage->list, &mapping->dirty_pages); + else + list_add(&newpage->list, &mapping->clean_pages); + + pte_chain_lock(page); + if (page_mapped(page)) { + while ((error = try_to_unmap(page)) == SWAP_AGAIN) { + pte_chain_unlock(page); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + pte_chain_lock(page); + } + if (error == SWAP_FAIL) + /* either during mremap or mlocked */ + goto unmapfail; + } + pte_chain_unlock(page); + if (PagePrivate(page)) + printk("buffer reappeared\n"); + + unlock_page(page); /* no lock needed while waiting page count */ + + waitcnt = 0; +wait_again: + while (page_count(page) != 1) { + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + if (PagePrivate(page)) + break; /* see below */ + if (waitcnt == 10000) { + printk("remap_onepage: still waiting on %p\n", page); + waitcnt++; + } + if (waitcnt < 10000) + waitcnt++; + } + + lock_page(page); + if (PagePrivate(page)) + try_to_release_page(page, GFP_KERNEL); + if (page_count(page) != 1) { + unlock_page(page); + goto wait_again; + } + spin_lock(&mapping->page_lock); + list_del(&page->list); /* XXX */ + page->mapping = NULL; + spin_unlock(&mapping->page_lock); + unlock_page(page); + + np = kmap_atomic(newpage, KM_REMAP0); + op = kmap_atomic(page, KM_REMAP1); + if (np == NULL || op == NULL) { /* XXX */ + printk("%p %p %p %p\n", np, op, newpage, page); + BUG(); + } + memcpy(np, op, PAGE_SIZE); + kunmap_atomic(page, KM_REMAP1); + kunmap_atomic(newpage, KM_REMAP0); + ClearPageActive(page); + put_page(page); + + /* We are done. Finish and let the waiters run. */ + SetPageUptodate(newpage); + /* XXX locking order correct? */ + zone = page_zone(newpage); + spin_lock_irq(&zone->lru_lock); + if (PageActive(newpage)) { + list_add(&newpage->lru, &zone->active_list); + zone->nr_active++; + } else { + list_add(&newpage->lru, &zone->inactive_list); + zone->nr_inactive++; + } + SetPageLRU(newpage); + spin_unlock_irq(&zone->lru_lock); + unlock_page(newpage); + page_cache_release(newpage); + return 0; + +unmapfail: + /* unwind is impossible if some process is waiting on the newpage */ + printk("You are hosed.\n"); + BUG(); + +radixfail: + unlock_page(page); + unlock_page(newpage); + __free_page(newpage); + return 1; +} + +static struct work_struct lru_drain_wq[NR_CPUS]; +static void +lru_drain_schedule(void *p) +{ + int cpu = get_cpu(); + + schedule_work(&lru_drain_wq[cpu]); + put_cpu(); +} + +atomic_t remapd_count; +int remapd(void *p) +{ + struct zone *zone = p; + struct page *page; + int i, nr_failed = 0; + LIST_HEAD(failedp); + + daemonize("remap%d", zone->zone_start_pfn); + if (atomic_read(&remapd_count) > 0) { + printk("remapd already running\n"); + return 0; + } + atomic_inc(&remapd_count); + on_each_cpu(lru_drain_schedule, NULL, 1, 1); + while(nr_failed < 100) { + spin_lock_irq(&zone->lru_lock); + for(i = 0; ! list_empty(&zone->inactive_list) && + i < 10; i++) { + page = list_entry(zone->inactive_list.prev, + struct page, lru); + if (! TestClearPageLRU(page)) + BUG(); + list_del(&page->lru); + zone->nr_inactive--; + if (page_count(page) == 0) { + /* the page is in pagevec_release(); + shrink_cache says so. 
*/ + SetPageLRU(page); + list_add(&page->lru, &zone->inactive_list); + continue; + } + page_cache_get(page); + spin_unlock_irq(&zone->lru_lock); + goto got_page; + } + + for(i = 0; ! list_empty(&zone->active_list) && + i < 10; i++) { + page = list_entry(zone->active_list.prev, + struct page, lru); + if (! TestClearPageLRU(page)) + BUG(); + list_del(&page->lru); + zone->nr_active--; + if (page_count(page) == 0) { + /* the page is in pagevec_release(); + shrink_cache says so. */ + SetPageLRU(page); + list_add(&page->lru, &zone->active_list); + continue; + } + page_cache_get(page); + spin_unlock_irq(&zone->lru_lock); + goto got_page; + } + spin_unlock_irq(&zone->lru_lock); + break; + + got_page: + if (remap_onepage(page)) { + nr_failed++; + list_add(&page->lru, &failedp); + } + } + if (list_empty(&failedp)) + goto out; + + while (! list_empty(&failedp)) { + spin_lock_irq(&zone->lru_lock); + page = list_entry(failedp.prev, struct page, lru); + list_del(&page->lru); + if (PageActive(page)) { + list_add(&page->lru, &zone->active_list); + zone->nr_active++; + } else { + list_add(&page->lru, &zone->inactive_list); + zone->nr_inactive++; + } + SetPageLRU(page); + spin_unlock_irq(&zone->lru_lock); + page_cache_release(page); + } +out: + atomic_dec(&remapd_count); + return 0; +} + +static int __init remapd_init(void) +{ + int i; + + for(i = 0; i < NR_CPUS; i++) + INIT_WORK(&lru_drain_wq[i], lru_add_drain, NULL); + return 0; +} + +module_init(remapd_init); +#endif /* * A zone is low on free memory, so wake its kswapd task to service it. -- IWAMOTO Toshihiro @ VA Linux Systems Japan -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: aart@kvack.org