linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* migration cache, updated
@ 2004-10-25 21:39 Marcelo Tosatti
  2004-10-26  1:17 ` Hiroyuki KAMEZAWA
                   ` (2 more replies)
  0 siblings, 3 replies; 48+ messages in thread
From: Marcelo Tosatti @ 2004-10-25 21:39 UTC (permalink / raw)
  To: linux-mm; +Cc: Hirokazu Takahashi, IWAMOTO Toshihiro, Dave Hansen, Hugh Dickins

Hi,

This is an improved version of the migration cache patch - 
thanks to everyone who contributed - Hirokazu, Iwamoto, Dave,
Hugh.

Identification of migration pages is now done with SwapCache bit 
and special swap type as suggested.

Also fixed a plethora of other bugs present in the last patch.

It now works fine - survives stress testing.

Hugh - I'm not sure about reusing swap allocation code for this 
anymore - it is designed to deal with storage (all the batching
, extent handling, etc). 

the idr code is smaller - it does only what we need here.

Have to update it to last -mhp version.


diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/include/linux/mm.h linux-2.6.9-rc2-mm4.build/include/linux/mm.h
--- linux-2.6.9-rc2-mm4.mhp.orig/include/linux/mm.h	2004-10-05 15:09:38.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/include/linux/mm.h	2004-10-25 18:42:37.000000000 -0200
@@ -251,6 +251,24 @@ extern int capture_page_range(unsigned l
  * files which need it (119 of them)
  */
 #include <linux/page-flags.h>
+#include <linux/swap.h>
+#include <linux/swapops.h> 
+
+static inline int PageMigration(struct page *page)
+{
+        swp_entry_t entry;
+
+        if (!PageSwapCache(page))
+                return 0;
+
+        entry.val = page->private;
+
+        if (swp_type(entry) != MIGRATION_TYPE)
+                return 0;
+
+        return 1;
+}
+
 
 /*
  * Methods to modify the page usage count.
@@ -458,11 +476,14 @@ void page_address_init(void);
 #define PAGE_MAPPING_ANON	1
 
 extern struct address_space swapper_space;
+extern struct address_space migration_space;
 static inline struct address_space *page_mapping(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	if (unlikely(PageSwapCache(page)))
+	if (unlikely(PageMigration(page)))
+		mapping = &migration_space;
+	else if (unlikely(PageSwapCache(page)))
 		mapping = &swapper_space;
 	else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
 		mapping = NULL;
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swap.h linux-2.6.9-rc2-mm4.build/include/linux/swap.h
--- linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swap.h	2004-10-05 15:09:39.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/include/linux/swap.h	2004-10-25 20:42:27.912127704 -0200
@@ -253,6 +253,7 @@ extern sector_t map_swap_page(struct swa
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+extern int migration_remove_entry(swp_entry_t);
 struct backing_dev_info;
 
 extern struct swap_list_t swap_list;
@@ -321,6 +322,21 @@ static inline swp_entry_t get_swap_page(
 #define grab_swap_token()  do { } while(0)
 #define has_swap_token(x) 0
 
+static inline int PageMigration(struct page *page)
+{
+        swp_entry_t entry;
+
+        if (!PageSwapCache(page))
+                return 0;
+
+        entry.val = page->private;
+
+        if (swp_type(entry) != MIGRATION_TYPE)
+                return 0;
+
+        return 1;
+}
+
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swapops.h linux-2.6.9-rc2-mm4.build/include/linux/swapops.h
--- linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swapops.h	2004-10-05 15:09:35.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/include/linux/swapops.h	2004-10-24 12:15:07.000000000 -0200
@@ -10,7 +10,9 @@
  * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
  */
 #define SWP_TYPE_SHIFT(e)	(sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
-#define SWP_OFFSET_MASK(e)	((1UL << SWP_TYPE_SHIFT(e)) - 1)
+#define SWP_OFFSET_MASK(e)	((1UL << (SWP_TYPE_SHIFT(e))) - 1)
+
+#define MIGRATION_TYPE  (MAX_SWAPFILES - 1)
 
 /*
  * Store a type+offset into a swp_entry_t in an arch-independent format
@@ -30,8 +32,7 @@ static inline swp_entry_t swp_entry(unsi
  */
 static inline unsigned swp_type(swp_entry_t entry)
 {
-	return (entry.val >> SWP_TYPE_SHIFT(entry)) &
-			((1 << MAX_SWAPFILES_SHIFT) - 1);
+	return ((entry.val >> SWP_TYPE_SHIFT(entry)));
 }
 
 /*
@@ -68,3 +69,24 @@ static inline pte_t swp_entry_to_pte(swp
 	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
+
+static inline int pte_is_migration(pte_t pte)
+{
+	unsigned long swp_type;
+	swp_entry_t arch_entry;
+
+	arch_entry = __pte_to_swp_entry(pte);
+
+	swp_type = __swp_type(arch_entry);
+
+	return swp_type == MIGRATION_TYPE;
+}
+
+static inline pte_t migration_entry_to_pte(swp_entry_t entry)
+{
+	swp_entry_t arch_entry;
+	
+	arch_entry = __swp_entry(MIGRATION_TYPE, swp_offset(entry));
+	return __swp_entry_to_pte(arch_entry);
+}
+
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/fremap.c linux-2.6.9-rc2-mm4.build/mm/fremap.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/fremap.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/fremap.c	2004-10-25 20:44:05.185339928 -0200
@@ -11,7 +11,6 @@
 #include <linux/file.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
-#include <linux/swapops.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
@@ -43,8 +42,14 @@ static inline void zap_pte(struct mm_str
 			}
 		}
 	} else {
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte)) {
+			swp_entry_t swp_entry = pte_to_swp_entry(pte);
+			if (pte_is_migration(pte)) { 
+				migration_remove_entry(swp_entry);
+			} else {
+				free_swap_and_cache(swp_entry);
+			}
+		}
 		pte_clear(ptep);
 	}
 }
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c linux-2.6.9-rc2-mm4.build/mm/memory.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/memory.c	2004-10-25 19:35:18.000000000 -0200
@@ -53,7 +53,6 @@
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-#include <linux/swapops.h>
 #include <linux/elf.h>
 
 #ifndef CONFIG_DISCONTIGMEM
@@ -456,8 +455,13 @@ static void zap_pte_range(struct mmu_gat
 		 */
 		if (unlikely(details))
 			continue;
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte)) {
+			swp_entry_t swp_entry = pte_to_swp_entry(pte);
+			if (pte_is_migration(pte)) {
+				migration_remove_entry(swp_entry);
+			} else
+				free_swap_and_cache(swp_entry);
+		}
 		pte_clear(ptep);
 	}
 	pte_unmap(ptep-1);
@@ -1408,6 +1412,9 @@ static int do_swap_page(struct mm_struct
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 again:
+	if (pte_is_migration(orig_pte)) {
+		page = lookup_migration_cache(entry.val);
+	} else {
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
@@ -1433,15 +1440,22 @@ again:
 		inc_page_state(pgmajfault);
 		grab_swap_token();
 	}
-
 	mark_page_accessed(page);
 	lock_page(page);
 	if (!PageSwapCache(page)) {
+		/* hiro: add !PageMigration(page) here */
 		/* page-migration has occured */
 		unlock_page(page);
 		page_cache_release(page);
 		goto again;
 	}
+	}
+
+
+	if (pte_is_migration(orig_pte)) {
+		mark_page_accessed(page);
+		lock_page(page);
+	}
 
 	/*
 	 * Back out if somebody else faulted in this pte while we
@@ -1459,10 +1473,14 @@ again:
 	}
 
 	/* The page isn't present yet, go ahead with the fault. */
-		
-	swap_free(entry);
-	if (vm_swap_full())
-		remove_exclusive_swap_page(page);
+
+	if (!pte_is_migration(orig_pte)) {
+		swap_free(entry);
+		if (vm_swap_full())
+			remove_exclusive_swap_page(page);
+	} else {
+		migration_remove_reference(page);
+	}
 
 	mm->rss++;
 	pte = mk_pte(page, vma->vm_page_prot);
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/mmigrate.c linux-2.6.9-rc2-mm4.build/mm/mmigrate.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/mmigrate.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/mmigrate.c	2004-10-25 20:34:35.324971872 -0200
@@ -21,6 +21,8 @@
 #include <linux/rmap.h>
 #include <linux/mmigrate.h>
 #include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/page-flags.h>
 
 /*
  * The concept of memory migration is to replace a target page with
@@ -35,6 +37,159 @@
  * hugetlbpages can be handled in the same way.
  */
 
+struct counter {
+	int i;
+};
+
+struct idr migration_idr;
+
+static struct address_space_operations migration_aops = {
+        .writepage      = NULL,
+        .sync_page      = NULL,
+        .set_page_dirty = __set_page_dirty_nobuffers,
+};
+
+static struct backing_dev_info migration_backing_dev_info = {
+        .memory_backed  = 1,    /* Does not contribute to dirty memory */
+        .unplug_io_fn   = NULL,
+};
+
+struct address_space migration_space = {
+        .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC),
+        .tree_lock      = RW_LOCK_UNLOCKED,
+        .a_ops          = &migration_aops,
+        .flags          = GFP_HIGHUSER,
+        .i_mmap_nonlinear = LIST_HEAD_INIT(migration_space.i_mmap_nonlinear),
+        .backing_dev_info = &migration_backing_dev_info,
+};
+
+int init_migration_cache(void) 
+{
+	idr_init(&migration_idr);
+
+	return 0;
+}
+
+__initcall(init_migration_cache);
+
+struct page *lookup_migration_cache(int id) 
+{ 
+	return find_get_page(&migration_space, id);
+}
+
+void migration_duplicate(swp_entry_t entry)
+{
+	int offset;
+	struct counter *cnt;
+
+	read_lock_irq(&migration_space.tree_lock);
+
+	cnt = idr_find(&migration_idr, swp_offset(entry));
+	cnt->i = cnt->i + 1;
+
+	read_unlock_irq(&migration_space.tree_lock);
+}
+
+void remove_from_migration_cache(struct page *page, int id)
+{
+	write_lock_irq(&migration_space.tree_lock);
+        idr_remove(&migration_idr, id);
+	radix_tree_delete(&migration_space.page_tree, id);
+	ClearPageSwapCache(page);
+	page->private = NULL;
+	write_unlock_irq(&migration_space.tree_lock);
+}
+
+// FIXME: if the page is locked will it be correctly removed from migr cache?
+// check races
+
+int migration_remove_entry(swp_entry_t entry)
+{
+	struct page *page;
+	
+	page = find_get_page(&migration_space, entry.val);
+
+	if (!page)
+		BUG();
+
+	migration_remove_reference(page);
+
+	page_cache_release(page);
+}
+
+int migration_remove_reference(struct page *page)
+{
+	struct counter *c;
+	swp_entry_t entry;
+
+	entry.val = page->private;
+
+	read_lock_irq(&migration_space.tree_lock);
+
+	c = idr_find(&migration_idr, swp_offset(entry));
+
+	read_unlock_irq(&migration_space.tree_lock);
+
+	if (!c->i)
+		BUG();
+
+	c->i--;
+
+	if (!c->i) {
+		lock_page(page);
+		remove_from_migration_cache(page, page->private);
+		unlock_page(page);
+		kfree(c);
+	}
+		
+}
+
+int add_to_migration_cache(struct page *page, int gfp_mask) 
+{
+	int error, offset;
+	struct counter *counter;
+	swp_entry_t entry;
+
+	BUG_ON(PageSwapCache(page));
+
+	BUG_ON(PagePrivate(page));
+
+        if (idr_pre_get(&migration_idr, GFP_ATOMIC) == 0)
+                return -ENOMEM;
+
+	counter = kmalloc(sizeof(struct counter), GFP_KERNEL);
+
+	if (!counter)
+		return -ENOMEM;
+
+	error = radix_tree_preload(gfp_mask);
+
+	counter->i = 0;
+
+	if (!error) {
+		write_lock_irq(&migration_space.tree_lock);
+	        error = idr_get_new_above(&migration_idr, counter, 1, &offset);
+
+		if (error < 0)
+			BUG();
+
+		entry = swp_entry(MIGRATION_TYPE, offset);
+
+		error = radix_tree_insert(&migration_space.page_tree, entry.val,
+							page);
+		if (!error) {
+			page_cache_get(page);
+			SetPageLocked(page);
+			page->private = entry.val;
+			SetPageSwapCache(page);
+		}
+		write_unlock_irq(&migration_space.tree_lock);
+                radix_tree_preload_end();
+
+	}
+
+	return error;
+}
 
 /*
  * Try to writeback a dirty page to free its buffers.
@@ -119,9 +274,11 @@ page_migratable(struct page *page, struc
 	if (PageWriteback(page))
 		return -EAGAIN;
 	/* The page might have been truncated */
-	truncated = !PageSwapCache(newpage) && page_mapping(page) == NULL;
-	if (page_count(page) + truncated <= freeable_page_count)
+	truncated = !PageSwapCache(newpage) &&
+		page_mapping(page) == NULL;
+	if (page_count(page) + truncated <= freeable_page_count) 
 		return truncated ? -ENOENT : 0;
+
 	return -EAGAIN;
 }
 
@@ -400,10 +558,14 @@ migrate_onepage(struct page *page)
 	 */
 #ifdef CONFIG_SWAP
 	if (PageAnon(page) && !PageSwapCache(page))
-		if (!add_to_swap(page, GFP_KERNEL)) {
+		if (add_to_migration_cache(page, GFP_KERNEL)) {
 			unlock_page(page);
 			return ERR_PTR(-ENOSPC);
 		}
+/*		if (!add_to_swap(page, GFP_KERNEL)) {
+			unlock_page(page);
+			return ERR_PTR(-ENOSPC);
+		} */
 #endif /* CONFIG_SWAP */
 	if ((mapping = page_mapping(page)) == NULL) {
 		/* truncation is in progress */
@@ -420,8 +582,9 @@ migrate_onepage(struct page *page)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (mapping->a_ops->migrate_page)
+	if (mapping->a_ops && mapping->a_ops->migrate_page) {
 		ret = mapping->a_ops->migrate_page(page, newpage);
+	}
 	else
 		ret = generic_migrate_page(page, newpage, migrate_page_common);
 	if (ret) {
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/page_io.c linux-2.6.9-rc2-mm4.build/mm/page_io.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/page_io.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/page_io.c	2004-10-24 12:23:55.000000000 -0200
@@ -15,7 +15,6 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/bio.h>
-#include <linux/swapops.h>
 #include <linux/writeback.h>
 #include <asm/pgtable.h>
 
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/rmap.c linux-2.6.9-rc2-mm4.build/mm/rmap.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/rmap.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/rmap.c	2004-10-25 17:31:43.000000000 -0200
@@ -49,7 +49,7 @@
 #include <linux/sched.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+//#include <linux/swapops.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/rmap.h>
@@ -641,22 +646,36 @@ static int try_to_unmap_one(struct page 
 	if (pte_dirty(pteval))
 		set_page_dirty(page);
 
-	if (PageAnon(page)) {
-		swp_entry_t entry = { .val = page->private };
-		/*
-		 * Store the swap location in the pte.
-		 * See handle_pte_fault() ...
-		 */
-		BUG_ON(!PageSwapCache(page));
-		swap_duplicate(entry);
-		if (list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			list_add(&mm->mmlist, &init_mm.mmlist);
-			spin_unlock(&mmlist_lock);
+		if (PageAnon(page)) {
+			swp_entry_t entry = { .val = page->private };
+			/*
+			 * Store the swap location in the pte.
+			 * See handle_pte_fault() ...
+			 */
+	//		BUG_ON(!PageSwapCache(page));
+			if (PageSwapCache(page) && !PageMigration(page)) {
+				swap_duplicate(entry);
+				if (list_empty(&mm->mmlist)) {
+					spin_lock(&mmlist_lock);
+					list_add(&mm->mmlist, &init_mm.mmlist);
+					spin_unlock(&mmlist_lock);
+				}
+				set_pte(pte, swp_entry_to_pte(entry));
+				BUG_ON(pte_file(*pte));
+			} else if (PageMigration(page)) {
+				// page cache get to reference pte,
+				// remove from migration cache
+				// on zero-users at fault path
+				migration_duplicate(entry);
+				if (list_empty(&mm->mmlist)) {
+					spin_lock(&mmlist_lock);
+					list_add(&mm->mmlist, &init_mm.mmlist);
+					spin_unlock(&mmlist_lock);
+				}
+				set_pte(pte, migration_entry_to_pte(entry));
+				BUG_ON(pte_file(*pte));
+			}
 		}
-		set_pte(pte, swp_entry_to_pte(entry));
-		BUG_ON(pte_file(*pte));
-	}
 
 	mm->rss--;
 	page_remove_rmap(page);
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/shmem.c linux-2.6.9-rc2-mm4.build/mm/shmem.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/shmem.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/shmem.c	2004-10-24 12:24:20.000000000 -0200
@@ -42,7 +42,6 @@
 #include <linux/vfs.h>
 #include <linux/blkdev.h>
 #include <linux/security.h>
-#include <linux/swapops.h>
 #include <linux/mempolicy.h>
 #include <linux/namei.h>
 #include <linux/xattr.h>
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/swapfile.c linux-2.6.9-rc2-mm4.build/mm/swapfile.c
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c linux-2.6.9-rc2-mm4.build/mm/vmscan.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/vmscan.c	2004-10-25 19:15:56.000000000 -0200
@@ -38,8 +38,6 @@
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 
-#include <linux/swapops.h>
-
 /*
  * The list of shrinker callbacks used by to apply pressure to
  * ageable caches.
@@ -459,7 +457,9 @@ int shrink_list(struct list_head *page_l
 		}
 
 #ifdef CONFIG_SWAP
-		if (PageSwapCache(page)) {
+		// FIXME: allow relocation of migrate cache pages 
+		// into real swap pages for swapout.
+		if (PageSwapCache(page) && !PageMigration(page)) {
 			swp_entry_t swap = { .val = page->private };
 			__delete_from_swap_cache(page);
 			write_unlock_irq(&mapping->tree_lock);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-25 21:39 migration cache, updated Marcelo Tosatti
@ 2004-10-26  1:17 ` Hiroyuki KAMEZAWA
  2004-10-26 12:01   ` Marcelo Tosatti
  2004-10-26  6:37 ` Hirokazu Takahashi
  2004-10-26  9:15 ` Hirokazu Takahashi
  2 siblings, 1 reply; 48+ messages in thread
From: Hiroyuki KAMEZAWA @ 2004-10-26  1:17 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: linux-mm, Hirokazu Takahashi, IWAMOTO Toshihiro, Dave Hansen,
	Hugh Dickins

Hi, Marcelo

Marcelo Tosatti wrote:
> Hi,
>  #define SWP_TYPE_SHIFT(e)	(sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
> -#define SWP_OFFSET_MASK(e)	((1UL << SWP_TYPE_SHIFT(e)) - 1)
> +#define SWP_OFFSET_MASK(e)	((1UL << (SWP_TYPE_SHIFT(e))) - 1)
> +
> +#define MIGRATION_TYPE  (MAX_SWAPFILES - 1)
>  
At first glance, I think MIGRATION_TYPE=0 is better.
#define MIGRATION_TYPE  (0)

In swapfile.c::sys_swapon()
This code determines new swap_type for commanded swapon().
=============
p = swap_info;
for (type = 0 ; type < nr_swapfiles ; type++,p++)
          if (!(p->flags & SWP_USED))
                break;
error = -EPERM;
==============

set nr_swapfiles=1, swap_info[0].flags = SWP_USED
at boot time seems good. or fix swapon().

Thanks.
Kame <kamezawa.hiroyu@jp.fujitsu.com>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-25 21:39 migration cache, updated Marcelo Tosatti
  2004-10-26  1:17 ` Hiroyuki KAMEZAWA
@ 2004-10-26  6:37 ` Hirokazu Takahashi
  2004-10-26  9:20   ` Marcelo Tosatti
  2004-10-26  9:15 ` Hirokazu Takahashi
  2 siblings, 1 reply; 48+ messages in thread
From: Hirokazu Takahashi @ 2004-10-26  6:37 UTC (permalink / raw)
  To: marcelo.tosatti; +Cc: linux-mm, iwamoto, haveblue, hugh

Hi,

I tested your patch and a deadlock occurred in
do_swap_page().

> This is an improved version of the migration cache patch - 
> thanks to everyone who contributed - Hirokazu, Iwamoto, Dave,
> Hugh.

> diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c linux-2.6.9-rc2-mm4.build/mm/memory.c
> --- linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c	2004-10-05 15:08:23.000000000 -0300
> +++ linux-2.6.9-rc2-mm4.build/mm/memory.c	2004-10-25 19:35:18.000000000 -0200
> @@ -1433,15 +1440,22 @@ again:
>  		inc_page_state(pgmajfault);
>  		grab_swap_token();
>  	}
> -
>  	mark_page_accessed(page);
>  	lock_page(page);
>  	if (!PageSwapCache(page)) {
> +		/* hiro: add !PageMigration(page) here */
>  		/* page-migration has occured */
>  		unlock_page(page);
>  		page_cache_release(page);
>  		goto again;
>  	}
> +	}
> +
> +
> +	if (pte_is_migration(orig_pte)) {
> +		mark_page_accessed(page);
> +		lock_page(page);


The previous code will cause a deadlock, as the page is already locked.

> +	}
>  
>  	/*
>  	 * Back out if somebody else faulted in this pte while we
> @@ -1459,10 +1473,14 @@ again:
>  	}
>  
>  	/* The page isn't present yet, go ahead with the fault. */
> -		
> -	swap_free(entry);
> -	if (vm_swap_full())
> -		remove_exclusive_swap_page(page);
> +
> +	if (!pte_is_migration(orig_pte)) {
> +		swap_free(entry);
> +		if (vm_swap_full())
> +			remove_exclusive_swap_page(page);
> +	} else {
> +		migration_remove_reference(page);

migration_remove_reference() also tries to lock the page that is
already locked.

> +	}
>  
>  	mm->rss++;
>  	pte = mk_pte(page, vma->vm_page_prot);
> diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/mmigrate.c linux-2.6.9-rc2-mm4.build/mm/mmigrate.c
> --- linux-2.6.9-rc2-mm4.mhp.orig/mm/mmigrate.c	2004-10-05 15:08:23.000000000 -0300
> +++ linux-2.6.9-rc2-mm4.build/mm/mmigrate.c	2004-10-25 20:34:35.324971872 -0200

> +int migration_remove_reference(struct page *page)
> +{
> +	struct counter *c;
> +	swp_entry_t entry;
> +
> +	entry.val = page->private;
> +
> +	read_lock_irq(&migration_space.tree_lock);
> +
> +	c = idr_find(&migration_idr, swp_offset(entry));
> +
> +	read_unlock_irq(&migration_space.tree_lock);
> +
> +	if (!c->i)
> +		BUG();
> +
> +	c->i--;
> +
> +	if (!c->i) {
> +		lock_page(page);

It will deadlock when this function is called from do_swap_page().

> +		remove_from_migration_cache(page, page->private);
> +		unlock_page(page);
> +		kfree(c);
> +	}
> +		
> +}

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-25 21:39 migration cache, updated Marcelo Tosatti
  2004-10-26  1:17 ` Hiroyuki KAMEZAWA
  2004-10-26  6:37 ` Hirokazu Takahashi
@ 2004-10-26  9:15 ` Hirokazu Takahashi
  2004-10-26  9:25   ` Marcelo Tosatti
  2 siblings, 1 reply; 48+ messages in thread
From: Hirokazu Takahashi @ 2004-10-26  9:15 UTC (permalink / raw)
  To: marcelo.tosatti; +Cc: linux-mm, iwamoto, haveblue, hugh

Hi, Marcelo,

> Hi,
> 
> This is an improved version of the migration cache patch - 
> thanks to everyone who contributed - Hirokazu, Iwamoto, Dave,
> Hugh.

Some comments.

> diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c linux-2.6.9-rc2-mm4.build/mm/memory.c
> --- linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c	2004-10-05 15:08:23.000000000 -0300
> +++ linux-2.6.9-rc2-mm4.build/mm/memory.c	2004-10-25 19:35:18.000000000 -0200
> @@ -1408,6 +1412,9 @@ static int do_swap_page(struct mm_struct
>  	pte_unmap(page_table);
>  	spin_unlock(&mm->page_table_lock);
>  again:
> +	if (pte_is_migration(orig_pte)) {
> +		page = lookup_migration_cache(entry.val);
> +	} else {
>  	page = lookup_swap_cache(entry);
>  	if (!page) {
>   		swapin_readahead(entry, address, vma);
> @@ -1433,15 +1440,22 @@ again:
>  		inc_page_state(pgmajfault);
>  		grab_swap_token();
>  	}
> -
>  	mark_page_accessed(page);
>  	lock_page(page);
>  	if (!PageSwapCache(page)) {
> +		/* hiro: add !PageMigration(page) here */
>  		/* page-migration has occured */

Now, !PageSwapCache(page) means the page is neither in the swap-cache
nor in the migration-cache. The original code is enough.

>  		unlock_page(page);
>  		page_cache_release(page);
>  		goto again;
>  	}
> +	}
> +
> +
> +	if (pte_is_migration(orig_pte)) {
> +		mark_page_accessed(page);
> +		lock_page(page);
> +	}
>  
>  	/*
>  	 * Back out if somebody else faulted in this pte while we

> diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c linux-2.6.9-rc2-mm4.build/mm/vmscan.c
> --- linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c	2004-10-05 15:08:23.000000000 -0300
> +++ linux-2.6.9-rc2-mm4.build/mm/vmscan.c	2004-10-25 19:15:56.000000000 -0200
> @@ -38,8 +38,6 @@
>  #include <asm/tlbflush.h>
>  #include <asm/div64.h>
>  
> -#include <linux/swapops.h>
> -
>  /*
>   * The list of shrinker callbacks used by to apply pressure to
>   * ageable caches.
> @@ -459,7 +457,9 @@ int shrink_list(struct list_head *page_l
>  		}
>  
>  #ifdef CONFIG_SWAP
> -		if (PageSwapCache(page)) {
> +		// FIXME: allow relocation of migrate cache pages 
> +		// into real swap pages for swapout.


In my opinion, it would be better to remove a target page from the
LRU lists prior to migration, so that the swap code will not grab
a page that is in the migration cache.


> +		if (PageSwapCache(page) && !PageMigration(page)) {
>  			swp_entry_t swap = { .val = page->private };
>  			__delete_from_swap_cache(page);
>  			write_unlock_irq(&mapping->tree_lock);
> 


Thanks,
Hirokazu Takahashi.




--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-26  6:37 ` Hirokazu Takahashi
@ 2004-10-26  9:20   ` Marcelo Tosatti
  2004-10-26 13:45     ` Hirokazu Takahashi
  0 siblings, 1 reply; 48+ messages in thread
From: Marcelo Tosatti @ 2004-10-26  9:20 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: linux-mm, iwamoto, haveblue, hugh

On Tue, Oct 26, 2004 at 03:37:31PM +0900, Hirokazu Takahashi wrote:
> Hi,
> 
> I tested your patch and dead-locked has been occured in
> do_swap_page().

Hi,

> > This is an improved version of the migration cache patch - 
> > thanks to everyone who contributed - Hirokazu, Iwamoto, Dave,
> > Hugh.
> 
> > diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c linux-2.6.9-rc2-mm4.build/mm/memory.c
> > --- linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c	2004-10-05 15:08:23.000000000 -0300
> > +++ linux-2.6.9-rc2-mm4.build/mm/memory.c	2004-10-25 19:35:18.000000000 -0200
> > @@ -1433,15 +1440,22 @@ again:
> >  		inc_page_state(pgmajfault);
> >  		grab_swap_token();
> >  	}
> > -
> >  	mark_page_accessed(page);
> >  	lock_page(page);
> >  	if (!PageSwapCache(page)) {
> > +		/* hiro: add !PageMigration(page) here */
> >  		/* page-migration has occured */
> >  		unlock_page(page);
> >  		page_cache_release(page);
> >  		goto again;
> >  	}
> > +	}
> > +
> > +
> > +	if (pte_is_migration(orig_pte)) {
> > +		mark_page_accessed(page);
> > +		lock_page(page);
> 
> 
> The previous code will cause deadlock, as the page is already locked.

Actually this one is fine - the page is not locked (it's locked
by the SwapCache pte path - not the migration path)

if (pte_is_migration(pte)) 
	lookup_migration_cache
else 
	old lookup swap cache
	lock_page

if (pte_is_migration(pte))
	mark_page_accessed
	lock_page

> > +	}
> >  
> >  	/*
> >  	 * Back out if somebody else faulted in this pte while we
> > @@ -1459,10 +1473,14 @@ again:
> >  	}
> >  
> >  	/* The page isn't present yet, go ahead with the fault. */
> > -		
> > -	swap_free(entry);
> > -	if (vm_swap_full())
> > -		remove_exclusive_swap_page(page);
> > +
> > +	if (!pte_is_migration(orig_pte)) {
> > +		swap_free(entry);
> > +		if (vm_swap_full())
> > +			remove_exclusive_swap_page(page);
> > +	} else {
> > +		migration_remove_reference(page);
> 
> migration_remove_reference() also tries to lock the page that is
> already locked.

Oh now that's a mess I've made.

I moved the locking into migration_remove_reference(), but didn't
update the swap path code - I've moved it out again.

> > +	}
> >  
> >  	mm->rss++;
> >  	pte = mk_pte(page, vma->vm_page_prot);
> > diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/mmigrate.c linux-2.6.9-rc2-mm4.build/mm/mmigrate.c
> > --- linux-2.6.9-rc2-mm4.mhp.orig/mm/mmigrate.c	2004-10-05 15:08:23.000000000 -0300
> > +++ linux-2.6.9-rc2-mm4.build/mm/mmigrate.c	2004-10-25 20:34:35.324971872 -0200
> 
> > +int migration_remove_reference(struct page *page)
> > +{
> > +	struct counter *c;
> > +	swp_entry_t entry;
> > +
> > +	entry.val = page->private;
> > +
> > +	read_lock_irq(&migration_space.tree_lock);
> > +
> > +	c = idr_find(&migration_idr, swp_offset(entry));
> > +
> > +	read_unlock_irq(&migration_space.tree_lock);
> > +
> > +	if (!c->i)
> > +		BUG();
> > +
> > +	c->i--;
> > +
> > +	if (!c->i) {
> > +		lock_page(page);
> 
> It will be dead-locked when this function is called from do_swap_page().

Can you please try the tests with the following updated patch

Works for me


diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/include/linux/mm.h linux-2.6.9-rc2-mm4.build/include/linux/mm.h
--- linux-2.6.9-rc2-mm4.mhp.orig/include/linux/mm.h	2004-10-05 15:09:38.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/include/linux/mm.h	2004-10-25 18:42:37.000000000 -0200
@@ -251,6 +251,24 @@ extern int capture_page_range(unsigned l
  * files which need it (119 of them)
  */
 #include <linux/page-flags.h>
+#include <linux/swap.h>
+#include <linux/swapops.h> 
+
+static inline int PageMigration(struct page *page)
+{
+        swp_entry_t entry;
+
+        if (!PageSwapCache(page))
+                return 0;
+
+        entry.val = page->private;
+
+        if (swp_type(entry) != MIGRATION_TYPE)
+                return 0;
+
+        return 1;
+}
+
 
 /*
  * Methods to modify the page usage count.
@@ -458,11 +476,14 @@ void page_address_init(void);
 #define PAGE_MAPPING_ANON	1
 
 extern struct address_space swapper_space;
+extern struct address_space migration_space;
 static inline struct address_space *page_mapping(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	if (unlikely(PageSwapCache(page)))
+	if (unlikely(PageMigration(page)))
+		mapping = &migration_space;
+	else if (unlikely(PageSwapCache(page)))
 		mapping = &swapper_space;
 	else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
 		mapping = NULL;
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swap.h linux-2.6.9-rc2-mm4.build/include/linux/swap.h
--- linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swap.h	2004-10-05 15:09:39.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/include/linux/swap.h	2004-10-25 20:42:27.000000000 -0200
@@ -253,6 +253,7 @@ extern sector_t map_swap_page(struct swa
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+extern int migration_remove_entry(swp_entry_t);
 struct backing_dev_info;
 
 extern struct swap_list_t swap_list;
@@ -321,6 +322,21 @@ static inline swp_entry_t get_swap_page(
 #define grab_swap_token()  do { } while(0)
 #define has_swap_token(x) 0
 
+static inline int PageMigration(struct page *page)
+{
+        swp_entry_t entry;
+
+        if (!PageSwapCache(page))
+                return 0;
+
+        entry.val = page->private;
+
+        if (swp_type(entry) != MIGRATION_TYPE)
+                return 0;
+
+        return 1;
+}
+
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swapops.h linux-2.6.9-rc2-mm4.build/include/linux/swapops.h
--- linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swapops.h	2004-10-05 15:09:35.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/include/linux/swapops.h	2004-10-24 12:15:07.000000000 -0200
@@ -10,7 +10,9 @@
  * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
  */
 #define SWP_TYPE_SHIFT(e)	(sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
-#define SWP_OFFSET_MASK(e)	((1UL << SWP_TYPE_SHIFT(e)) - 1)
+#define SWP_OFFSET_MASK(e)	((1UL << (SWP_TYPE_SHIFT(e))) - 1)
+
+#define MIGRATION_TYPE  (MAX_SWAPFILES - 1)
 
 /*
  * Store a type+offset into a swp_entry_t in an arch-independent format
@@ -30,8 +32,7 @@ static inline swp_entry_t swp_entry(unsi
  */
 static inline unsigned swp_type(swp_entry_t entry)
 {
-	return (entry.val >> SWP_TYPE_SHIFT(entry)) &
-			((1 << MAX_SWAPFILES_SHIFT) - 1);
+	return ((entry.val >> SWP_TYPE_SHIFT(entry)));
 }
 
 /*
@@ -68,3 +69,24 @@ static inline pte_t swp_entry_to_pte(swp
 	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
+
+static inline int pte_is_migration(pte_t pte)
+{
+	unsigned long swp_type;
+	swp_entry_t arch_entry;
+
+	arch_entry = __pte_to_swp_entry(pte);
+
+	swp_type = __swp_type(arch_entry);
+
+	return swp_type == MIGRATION_TYPE;
+}
+
+static inline pte_t migration_entry_to_pte(swp_entry_t entry)
+{
+	swp_entry_t arch_entry;
+	
+	arch_entry = __swp_entry(MIGRATION_TYPE, swp_offset(entry));
+	return __swp_entry_to_pte(arch_entry);
+}
+
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/fremap.c linux-2.6.9-rc2-mm4.build/mm/fremap.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/fremap.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/fremap.c	2004-10-25 20:44:05.000000000 -0200
@@ -11,7 +11,6 @@
 #include <linux/file.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
-#include <linux/swapops.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
@@ -43,8 +42,14 @@ static inline void zap_pte(struct mm_str
 			}
 		}
 	} else {
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte)) {
+			swp_entry_t swp_entry = pte_to_swp_entry(pte);
+			if (pte_is_migration(pte)) { 
+				migration_remove_entry(swp_entry);
+			} else {
+				free_swap_and_cache(swp_entry);
+			}
+		}
 		pte_clear(ptep);
 	}
 }
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c linux-2.6.9-rc2-mm4.build/mm/memory.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/memory.c	2004-10-25 19:35:18.000000000 -0200
@@ -53,7 +53,6 @@
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-#include <linux/swapops.h>
 #include <linux/elf.h>
 
 #ifndef CONFIG_DISCONTIGMEM
@@ -456,8 +455,13 @@ static void zap_pte_range(struct mmu_gat
 		 */
 		if (unlikely(details))
 			continue;
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte)) {
+			swp_entry_t swp_entry = pte_to_swp_entry(pte);
+			if (pte_is_migration(pte)) {
+				migration_remove_entry(swp_entry);
+			} else
+				free_swap_and_cache(swp_entry);
+		}
 		pte_clear(ptep);
 	}
 	pte_unmap(ptep-1);
@@ -1408,6 +1412,9 @@ static int do_swap_page(struct mm_struct
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 again:
+	if (pte_is_migration(orig_pte)) {
+		page = lookup_migration_cache(entry.val);
+	} else {
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
@@ -1433,15 +1440,22 @@ again:
 		inc_page_state(pgmajfault);
 		grab_swap_token();
 	}
-
 	mark_page_accessed(page);
 	lock_page(page);
 	if (!PageSwapCache(page)) {
+		/* hiro: add !PageMigration(page) here */
 		/* page-migration has occured */
 		unlock_page(page);
 		page_cache_release(page);
 		goto again;
 	}
+	}
+
+
+	if (pte_is_migration(orig_pte)) {
+		mark_page_accessed(page);
+		lock_page(page);
+	}
 
 	/*
 	 * Back out if somebody else faulted in this pte while we
@@ -1459,10 +1473,14 @@ again:
 	}
 
 	/* The page isn't present yet, go ahead with the fault. */
-		
-	swap_free(entry);
-	if (vm_swap_full())
-		remove_exclusive_swap_page(page);
+
+	if (!pte_is_migration(orig_pte)) {
+		swap_free(entry);
+		if (vm_swap_full())
+			remove_exclusive_swap_page(page);
+	} else {
+		migration_remove_reference(page);
+	}
 
 	mm->rss++;
 	pte = mk_pte(page, vma->vm_page_prot);
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/mmigrate.c linux-2.6.9-rc2-mm4.build/mm/mmigrate.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/mmigrate.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/mmigrate.c	2004-10-26 08:20:01.000000000 -0200
@@ -21,6 +21,8 @@
 #include <linux/rmap.h>
 #include <linux/mmigrate.h>
 #include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/page-flags.h>
 
 /*
  * The concept of memory migration is to replace a target page with
@@ -35,6 +37,161 @@
  * hugetlbpages can be handled in the same way.
  */
 
+struct counter {
+	int i;
+};
+
+struct idr migration_idr;
+
+static struct address_space_operations migration_aops = {
+        .writepage      = NULL,
+        .sync_page      = NULL,
+        .set_page_dirty = __set_page_dirty_nobuffers,
+};
+
+static struct backing_dev_info migration_backing_dev_info = {
+        .memory_backed  = 1,    /* Does not contribute to dirty memory */
+        .unplug_io_fn   = NULL,
+};
+
+struct address_space migration_space = {
+        .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC),
+        .tree_lock      = RW_LOCK_UNLOCKED,
+        .a_ops          = &migration_aops,
+        .flags          = GFP_HIGHUSER,
+        .i_mmap_nonlinear = LIST_HEAD_INIT(migration_space.i_mmap_nonlinear),
+        .backing_dev_info = &migration_backing_dev_info,
+};
+
+int init_migration_cache(void) 
+{
+	idr_init(&migration_idr);
+
+	return 0;
+}
+
+__initcall(init_migration_cache);
+
+struct page *lookup_migration_cache(int id) 
+{ 
+	return find_get_page(&migration_space, id);
+}
+
+void migration_duplicate(swp_entry_t entry)
+{
+	int offset;
+	struct counter *cnt;
+
+	read_lock_irq(&migration_space.tree_lock);
+
+	cnt = idr_find(&migration_idr, swp_offset(entry));
+	cnt->i = cnt->i + 1;
+
+	read_unlock_irq(&migration_space.tree_lock);
+}
+
+void remove_from_migration_cache(struct page *page, int id)
+{
+	write_lock_irq(&migration_space.tree_lock);
+        idr_remove(&migration_idr, id);
+	radix_tree_delete(&migration_space.page_tree, id);
+	ClearPageSwapCache(page);
+	page->private = NULL;
+	write_unlock_irq(&migration_space.tree_lock);
+}
+
+// FIXME: if the page is locked will it be correctly removed from migr cache?
+// check races
+
+int migration_remove_entry(swp_entry_t entry)
+{
+	struct page *page;
+	
+	page = find_get_page(&migration_space, entry.val);
+
+	if (!page)
+		BUG();
+
+	lock_page(page);	
+
+	migration_remove_reference(page);
+
+	unlock_page(page);
+
+	page_cache_release(page);
+}
+
+int migration_remove_reference(struct page *page)
+{
+	struct counter *c;
+	swp_entry_t entry;
+
+	entry.val = page->private;
+
+	read_lock_irq(&migration_space.tree_lock);
+
+	c = idr_find(&migration_idr, swp_offset(entry));
+
+	read_unlock_irq(&migration_space.tree_lock);
+
+	if (!c->i)
+		BUG();
+
+	c->i--;
+
+	if (!c->i) {
+		remove_from_migration_cache(page, page->private);
+		kfree(c);
+	}
+		
+}
+
+int add_to_migration_cache(struct page *page, int gfp_mask) 
+{
+	int error, offset;
+	struct counter *counter;
+	swp_entry_t entry;
+
+	BUG_ON(PageSwapCache(page));
+
+	BUG_ON(PagePrivate(page));
+
+        if (idr_pre_get(&migration_idr, GFP_ATOMIC) == 0)
+                return -ENOMEM;
+
+	counter = kmalloc(sizeof(struct counter), GFP_KERNEL);
+
+	if (!counter)
+		return -ENOMEM;
+
+	error = radix_tree_preload(gfp_mask);
+
+	counter->i = 0;
+
+	if (!error) {
+		write_lock_irq(&migration_space.tree_lock);
+	        error = idr_get_new_above(&migration_idr, counter, 1, &offset);
+
+		if (error < 0)
+			BUG();
+
+		entry = swp_entry(MIGRATION_TYPE, offset);
+
+		error = radix_tree_insert(&migration_space.page_tree, entry.val,
+							page);
+		if (!error) {
+			page_cache_get(page);
+			SetPageLocked(page);
+			page->private = entry.val;
+			SetPageSwapCache(page);
+		}
+		write_unlock_irq(&migration_space.tree_lock);
+                radix_tree_preload_end();
+
+	}
+
+	return error;
+}
 
 /*
  * Try to writeback a dirty page to free its buffers.
@@ -119,9 +276,11 @@ page_migratable(struct page *page, struc
 	if (PageWriteback(page))
 		return -EAGAIN;
 	/* The page might have been truncated */
-	truncated = !PageSwapCache(newpage) && page_mapping(page) == NULL;
-	if (page_count(page) + truncated <= freeable_page_count)
+	truncated = !PageSwapCache(newpage) &&
+		page_mapping(page) == NULL;
+	if (page_count(page) + truncated <= freeable_page_count) 
 		return truncated ? -ENOENT : 0;
+
 	return -EAGAIN;
 }
 
@@ -400,10 +560,14 @@ migrate_onepage(struct page *page)
 	 */
 #ifdef CONFIG_SWAP
 	if (PageAnon(page) && !PageSwapCache(page))
-		if (!add_to_swap(page, GFP_KERNEL)) {
+		if (add_to_migration_cache(page, GFP_KERNEL)) {
 			unlock_page(page);
 			return ERR_PTR(-ENOSPC);
 		}
+/*		if (!add_to_swap(page, GFP_KERNEL)) {
+			unlock_page(page);
+			return ERR_PTR(-ENOSPC);
+		} */
 #endif /* CONFIG_SWAP */
 	if ((mapping = page_mapping(page)) == NULL) {
 		/* truncation is in progress */
@@ -420,8 +584,9 @@ migrate_onepage(struct page *page)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (mapping->a_ops->migrate_page)
+	if (mapping->a_ops && mapping->a_ops->migrate_page) {
 		ret = mapping->a_ops->migrate_page(page, newpage);
+	}
 	else
 		ret = generic_migrate_page(page, newpage, migrate_page_common);
 	if (ret) {
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/page_io.c linux-2.6.9-rc2-mm4.build/mm/page_io.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/page_io.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/page_io.c	2004-10-24 12:23:55.000000000 -0200
@@ -15,7 +15,6 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/bio.h>
-#include <linux/swapops.h>
 #include <linux/writeback.h>
 #include <asm/pgtable.h>
 
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/rmap.c linux-2.6.9-rc2-mm4.build/mm/rmap.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/rmap.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/rmap.c	2004-10-25 17:31:43.000000000 -0200
@@ -49,7 +49,7 @@
 #include <linux/sched.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+//#include <linux/swapops.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/rmap.h>
@@ -641,22 +646,36 @@ static int try_to_unmap_one(struct page 
 	if (pte_dirty(pteval))
 		set_page_dirty(page);
 
-	if (PageAnon(page)) {
-		swp_entry_t entry = { .val = page->private };
-		/*
-		 * Store the swap location in the pte.
-		 * See handle_pte_fault() ...
-		 */
-		BUG_ON(!PageSwapCache(page));
-		swap_duplicate(entry);
-		if (list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			list_add(&mm->mmlist, &init_mm.mmlist);
-			spin_unlock(&mmlist_lock);
+		if (PageAnon(page)) {
+			swp_entry_t entry = { .val = page->private };
+			/*
+			 * Store the swap location in the pte.
+			 * See handle_pte_fault() ...
+			 */
+	//		BUG_ON(!PageSwapCache(page));
+			if (PageSwapCache(page) && !PageMigration(page)) {
+				swap_duplicate(entry);
+				if (list_empty(&mm->mmlist)) {
+					spin_lock(&mmlist_lock);
+					list_add(&mm->mmlist, &init_mm.mmlist);
+					spin_unlock(&mmlist_lock);
+				}
+				set_pte(pte, swp_entry_to_pte(entry));
+				BUG_ON(pte_file(*pte));
+			} else if (PageMigration(page)) {
+				// page cache get to reference pte,
+				// remove from migration cache
+				// on zero-users at fault path
+				migration_duplicate(entry);
+				if (list_empty(&mm->mmlist)) {
+					spin_lock(&mmlist_lock);
+					list_add(&mm->mmlist, &init_mm.mmlist);
+					spin_unlock(&mmlist_lock);
+				}
+				set_pte(pte, migration_entry_to_pte(entry));
+				BUG_ON(pte_file(*pte));
+			}
 		}
-		set_pte(pte, swp_entry_to_pte(entry));
-		BUG_ON(pte_file(*pte));
-	}
 
 	mm->rss--;
 	page_remove_rmap(page);
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/shmem.c linux-2.6.9-rc2-mm4.build/mm/shmem.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/shmem.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/shmem.c	2004-10-24 12:24:20.000000000 -0200
@@ -42,7 +42,6 @@
 #include <linux/vfs.h>
 #include <linux/blkdev.h>
 #include <linux/security.h>
-#include <linux/swapops.h>
 #include <linux/mempolicy.h>
 #include <linux/namei.h>
 #include <linux/xattr.h>
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/swapfile.c linux-2.6.9-rc2-mm4.build/mm/swapfile.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/swapfile.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/swapfile.c	2004-10-25 19:03:43.000000000 -0200
@@ -29,7 +29,6 @@
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
-#include <linux/swapops.h>
 
 spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
 unsigned int nr_swapfiles;
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c linux-2.6.9-rc2-mm4.build/mm/vmscan.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/vmscan.c	2004-10-25 19:15:56.000000000 -0200
@@ -38,8 +38,6 @@
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 
-#include <linux/swapops.h>
-
 /*
  * The list of shrinker callbacks used by to apply pressure to
  * ageable caches.
@@ -459,7 +457,9 @@ int shrink_list(struct list_head *page_l
 		}
 
 #ifdef CONFIG_SWAP
-		if (PageSwapCache(page)) {
+		// FIXME: allow relocation of migrate cache pages 
+		// into real swap pages for swapout.
+		if (PageSwapCache(page) && !PageMigration(page)) {
 			swp_entry_t swap = { .val = page->private };
 			__delete_from_swap_cache(page);
 			write_unlock_irq(&mapping->tree_lock);


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-26  9:15 ` Hirokazu Takahashi
@ 2004-10-26  9:25   ` Marcelo Tosatti
  2004-10-26 14:01     ` Hirokazu Takahashi
  0 siblings, 1 reply; 48+ messages in thread
From: Marcelo Tosatti @ 2004-10-26  9:25 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: linux-mm, iwamoto, haveblue, hugh

On Tue, Oct 26, 2004 at 06:15:04PM +0900, Hirokazu Takahashi wrote:
> Hi, Marcelo,
> 
> > Hi,
> > 
> > This is an improved version of the migration cache patch - 
> > thanks to everyone who contributed - Hirokazu, Iwamoto, Dave,
> > Hugh.
> 
> Some comments.
> 
> > diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c linux-2.6.9-rc2-mm4.build/mm/memory.c
> > --- linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c	2004-10-05 15:08:23.000000000 -0300
> > +++ linux-2.6.9-rc2-mm4.build/mm/memory.c	2004-10-25 19:35:18.000000000 -0200
> > @@ -1408,6 +1412,9 @@ static int do_swap_page(struct mm_struct
> >  	pte_unmap(page_table);
> >  	spin_unlock(&mm->page_table_lock);
> >  again:
> > +	if (pte_is_migration(orig_pte)) {
> > +		page = lookup_migration_cache(entry.val);
> > +	} else {
> >  	page = lookup_swap_cache(entry);
> >  	if (!page) {
> >   		swapin_readahead(entry, address, vma);
> > @@ -1433,15 +1440,22 @@ again:
> >  		inc_page_state(pgmajfault);
> >  		grab_swap_token();
> >  	}
> > -
> >  	mark_page_accessed(page);
> >  	lock_page(page);
> >  	if (!PageSwapCache(page)) {
> > +		/* hiro: add !PageMigration(page) here */
> >  		/* page-migration has occured */
> 
> Now, !PageSwapCache(page) means the page isn't neither in the swap-cache
> nor in the migration-cache. The original code is enough.

OK!

> >  		unlock_page(page);
> >  		page_cache_release(page);
> >  		goto again;
> >  	}
> > +	}
> > +
> > +
> > +	if (pte_is_migration(orig_pte)) {
> > +		mark_page_accessed(page);
> > +		lock_page(page);
> > +	}
> >  
> >  	/*
> >  	 * Back out if somebody else faulted in this pte while we
> 
> > diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c linux-2.6.9-rc2-mm4.build/mm/vmscan.c
> > --- linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c	2004-10-05 15:08:23.000000000 -0300
> > +++ linux-2.6.9-rc2-mm4.build/mm/vmscan.c	2004-10-25 19:15:56.000000000 -0200
> > @@ -38,8 +38,6 @@
> >  #include <asm/tlbflush.h>
> >  #include <asm/div64.h>
> >  
> > -#include <linux/swapops.h>
> > -
> >  /*
> >   * The list of shrinker callbacks used by to apply pressure to
> >   * ageable caches.
> > @@ -459,7 +457,9 @@ int shrink_list(struct list_head *page_l
> >  		}
> >  
> >  #ifdef CONFIG_SWAP
> > -		if (PageSwapCache(page)) {
> > +		// FIXME: allow relocation of migrate cache pages 
> > +		// into real swap pages for swapout.
> 
> 
> In my thought, it would be better to remove a target page from the
> LRU lists prior to migration. So that it makes the swap code not to
> grab the page, which is in the migration cache.

I don't see a problem with having the pages on LRU - the reclaiming 
code sees it, but it's unfreeable, so it doesn't touch it. 

The reclaiming path should see it's a migration page, unmap the pte's
to it, remap them to swapcache pages (and ptes), so they can be
swapped out on pressure.

Can you please expand your thoughts?

> > +		if (PageSwapCache(page) && !PageMigration(page)) {
> >  			swp_entry_t swap = { .val = page->private };
> >  			__delete_from_swap_cache(page);
> >  			write_unlock_irq(&mapping->tree_lock);
> > 
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-26 13:45     ` Hirokazu Takahashi
@ 2004-10-26 11:41       ` Marcelo Tosatti
  2004-10-27 13:40       ` Hirokazu Takahashi
  1 sibling, 0 replies; 48+ messages in thread
From: Marcelo Tosatti @ 2004-10-26 11:41 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: linux-mm, iwamoto, haveblue, hugh

On Tue, Oct 26, 2004 at 10:45:50PM +0900, Hirokazu Takahashi wrote:
> Hi,
> 
> > > The previous code will cause deadlock, as the page is already locked.
> > 
> > Actually this one is fine - the page is not locked (its locked
> > by the SwapCache pte path - not migration path)
> > 
> > if (pte_is_migration(pte)) 
> > 	lookup_migration_cache
> > else 
> > 	old lookup swap cache
> > 	lock_page
> > 
> > if (pte_is_migration(pte))
> > 	mark_page_accessed
> > 	lock_page
> 
> Oh, I understand.
> 
> > Can you please try the tests with the following updated patch
> > 
> > Works for me
> 
> It didn't work without one fix.
> 
> +void remove_from_migration_cache(struct page *page, int id)
> +{
> +	write_lock_irq(&migration_space.tree_lock);
> +        idr_remove(&migration_idr, id);
> +	radix_tree_delete(&migration_space.page_tree, id);
> +	ClearPageSwapCache(page);
> +	page->private = NULL;
> +	write_unlock_irq(&migration_space.tree_lock);
> +}
> 
> +int migration_remove_reference(struct page *page)
> +{
> +	struct counter *c;
> +	swp_entry_t entry;
> +
> +	entry.val = page->private;
> +
> +	read_lock_irq(&migration_space.tree_lock);
> +
> +	c = idr_find(&migration_idr, swp_offset(entry));
> +
> +	read_unlock_irq(&migration_space.tree_lock);
> +
> +	if (!c->i)
> +		BUG();
> +
> +	c->i--;
> +
> +	if (!c->i) {
> +		remove_from_migration_cache(page, page->private);
> +		kfree(c);
> 
> page_cache_release(page) should be invoked here, as the count for
> the migration cache must be decreased.
> With this fix, your migration cache started to work very fine!

Oh yes, I removed that by accident.

> +	}
> +		
> +}
> 
> 
> 
> The attached patch is what I ported your patch to the latest version
> and I fixed the bug.

It seems a hunk from your own tree leaked into this patch?

See above

> @@ -367,11 +527,6 @@ generic_migrate_page(struct page *page, 
>  
>  	/* map the newpage where the old page have been mapped. */
>  	touch_unmapped_address(&vlist);
> -	if (PageSwapCache(newpage)) {
> -		lock_page(newpage);
> -		__remove_exclusive_swap_page(newpage, 1);
> -		unlock_page(newpage);
> -	}
>  
>  	page->mapping = NULL;
>  	unlock_page(page);
> @@ -383,11 +538,6 @@ out_busy:
>  	/* Roll back all operations. */
>  	rewind_page(page, newpage);
>  	touch_unmapped_address(&vlist);
> -	if (PageSwapCache(page)) {
> -		lock_page(page);
> -		__remove_exclusive_swap_page(page, 1);
> -		unlock_page(page);
> -	}
>  	return ret;

These two hunks?

OK fine I'll update the patch with all fixes to 
the newer version of -mhp, and start working 
on the nonblocking version of the migration 
functions.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-26  1:17 ` Hiroyuki KAMEZAWA
@ 2004-10-26 12:01   ` Marcelo Tosatti
  2004-10-26 23:47     ` Hiroyuki KAMEZAWA
  0 siblings, 1 reply; 48+ messages in thread
From: Marcelo Tosatti @ 2004-10-26 12:01 UTC (permalink / raw)
  To: Hiroyuki KAMEZAWA
  Cc: linux-mm, Hirokazu Takahashi, IWAMOTO Toshihiro, Dave Hansen,
	Hugh Dickins

On Tue, Oct 26, 2004 at 10:17:44AM +0900, Hiroyuki KAMEZAWA wrote:
> Hi, Marcelo
> 
> Marcelo Tosatti wrote:
> >Hi,
> > #define SWP_TYPE_SHIFT(e)	(sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
> >-#define SWP_OFFSET_MASK(e)	((1UL << SWP_TYPE_SHIFT(e)) - 1)
> >+#define SWP_OFFSET_MASK(e)	((1UL << (SWP_TYPE_SHIFT(e))) - 1)
> >+
> >+#define MIGRATION_TYPE  (MAX_SWAPFILES - 1)
> > 
> At the first glance, I think MIGRATION_TYPE=0 is better.
> #define MIGRATION_TYPE  (0)
> 
> In swapfile.c::sys_swapon()
> This code determines new swap_type for commanded swapon().
> =============
> p = swap_info;
> for (type = 0 ; type < nr_swapfiles ; type++,p++)
>          if (!(p->flags & SWP_USED))
>                break;
> error = -EPERM;
> ==============
> 
> set nr_swapfiles=1, swap_info[0].flags = SWP_USED
> at boot time seems good. or fix swapon().

Hi Hiroyuki,

Indeed.

This should do it?

--- swapfile.c.orig     2004-10-26 11:33:56.734551048 -0200
+++ swapfile.c  2004-10-26 11:34:03.284555296 -0200
@@ -1370,6 +1370,13 @@ asmlinkage long sys_swapon(const char __
                swap_list_unlock();
                goto out;
        }
+
+       /* MAX_SWAPFILES-1 is reserved for migration pages */
+       if (type > MAX_SWAPFILES-1) {
+               swap_list_unlock();
+               goto out;
+       }
+
        if (type >= nr_swapfiles)
                nr_swapfiles = type+1;
        INIT_LIST_HEAD(&p->extent_list);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-26 14:01     ` Hirokazu Takahashi
@ 2004-10-26 12:24       ` Marcelo Tosatti
  2004-10-27  7:25         ` IWAMOTO Toshihiro
  2004-10-27 13:48         ` Hirokazu Takahashi
  0 siblings, 2 replies; 48+ messages in thread
From: Marcelo Tosatti @ 2004-10-26 12:24 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: linux-mm, iwamoto, haveblue, hugh

On Tue, Oct 26, 2004 at 11:01:10PM +0900, Hirokazu Takahashi wrote:
> Hi, Marcelo,
> 
> > > > diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c linux-2.6.9-rc2-mm4.build/mm/vmscan.c
> > > > --- linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c	2004-10-05 15:08:23.000000000 -0300
> > > > +++ linux-2.6.9-rc2-mm4.build/mm/vmscan.c	2004-10-25 19:15:56.000000000 -0200
> > > > @@ -459,7 +457,9 @@ int shrink_list(struct list_head *page_l
> > > >  		}
> > > >  
> > > >  #ifdef CONFIG_SWAP
> > > > -		if (PageSwapCache(page)) {
> > > > +		// FIXME: allow relocation of migrate cache pages 
> > > > +		// into real swap pages for swapout.
> > > 
> > > 
> > > In my thought, it would be better to remove a target page from the
> > > LRU lists prior to migration. So that it makes the swap code not to
> > > grab the page, which is in the migration cache.
> > 
> > I dont see a problem with having the pages on LRU - the reclaiming 
> > code sees it, but its unfreeable, so it doesnt touch it. 
> > 
> > The reclaiming path should see its a migration page, unmap the pte's
> > to it, remap them to swapcache pages (and ptes), so they can be
> > swapped out on pressure.
> > 
> > Can you please expand your thoughts?
> 
> I thought the easiest way to avoid the race condition was
> removing the page from LRU during memory migration.
> But there may be no problem about the page, which is unfreeable
> as you mentioned.
> 
> BTW, I wonder how the migration code avoid to choose some pages
> on LRU, which may have count == 0. This may happen the pages
> are going to be removed. We have to care about it.

AFAICS its already done by __steal_page_from_lru(), which is used
by grab_capturing_pages():

static int
grab_capturing_pages(struct list_head *page_list, unsigned long start_pfn,
                                                        unsigned long nr_pages)
{
        struct page *page;
        struct zone *zone;
        int rest = 0;
        int i;
                                                                                    
        for (i = 0; i < nr_pages; i++) {
                page = pfn_to_page(start_pfn + i);
                zone = page_zone(page);
                spin_lock_irq(&zone->lru_lock);
                if (page_under_capture(page)) {
                        if (PageLRU(page) && __steal_page_from_lru(zone, page))
                                list_add(&page->lru, page_list);
                        else
                                rest++;
                }
                spin_unlock_irq(&zone->lru_lock);
        }
        return rest;
}


Pages with reference count zero will not be moved to the page
list, and truncated pages seem to be handled nicely later on the
migration codepath.

A quick search on Iwamoto's test utils shows no sign of truncate(). 

It would be nice to add more testcases (such as truncate() 
intensive application) to his testsuite.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-26  9:20   ` Marcelo Tosatti
@ 2004-10-26 13:45     ` Hirokazu Takahashi
  2004-10-26 11:41       ` Marcelo Tosatti
  2004-10-27 13:40       ` Hirokazu Takahashi
  0 siblings, 2 replies; 48+ messages in thread
From: Hirokazu Takahashi @ 2004-10-26 13:45 UTC (permalink / raw)
  To: marcelo.tosatti; +Cc: linux-mm, iwamoto, haveblue, hugh

Hi,

> > The previous code will cause deadlock, as the page is already locked.
> 
> Actually this one is fine - the page is not locked (its locked
> by the SwapCache pte path - not migration path)
> 
> if (pte_is_migration(pte)) 
> 	lookup_migration_cache
> else 
> 	old lookup swap cache
> 	lock_page
> 
> if (pte_is_migration(pte))
> 	mark_page_accessed
> 	lock_page

Oh, I understand.

> Can you please try the tests with the following updated patch
> 
> Works for me

It didn't work without one fix.

+void remove_from_migration_cache(struct page *page, int id)
+{
+	write_lock_irq(&migration_space.tree_lock);
+        idr_remove(&migration_idr, id);
+	radix_tree_delete(&migration_space.page_tree, id);
+	ClearPageSwapCache(page);
+	page->private = NULL;
+	write_unlock_irq(&migration_space.tree_lock);
+}

+int migration_remove_reference(struct page *page)
+{
+	struct counter *c;
+	swp_entry_t entry;
+
+	entry.val = page->private;
+
+	read_lock_irq(&migration_space.tree_lock);
+
+	c = idr_find(&migration_idr, swp_offset(entry));
+
+	read_unlock_irq(&migration_space.tree_lock);
+
+	if (!c->i)
+		BUG();
+
+	c->i--;
+
+	if (!c->i) {
+		remove_from_migration_cache(page, page->private);
+		kfree(c);

page_cache_release(page) should be invoked here, as the count for
the migration cache must be decreased.
With this fix, your migration cache started to work very fine!

+	}
+		
+}



The attached patch is what I ported your patch to the latest version
and I fixed the bug.


diff -puN include/linux/mm.h~migration_cache_marcelo include/linux/mm.h
--- linux-2.6.9-rc4/include/linux/mm.h~migration_cache_marcelo	Tue Oct 26 21:08:31 2004
+++ linux-2.6.9-rc4-taka/include/linux/mm.h	Tue Oct 26 21:08:31 2004
@@ -250,6 +250,24 @@ struct page {
  * files which need it (119 of them)
  */
 #include <linux/page-flags.h>
+#include <linux/swap.h>
+#include <linux/swapops.h> 
+
+static inline int PageMigration(struct page *page)
+{
+        swp_entry_t entry;
+
+        if (!PageSwapCache(page))
+                return 0;
+
+        entry.val = page->private;
+
+        if (swp_type(entry) != MIGRATION_TYPE)
+                return 0;
+
+        return 1;
+}
+
 
 /*
  * Methods to modify the page usage count.
@@ -457,11 +475,14 @@ void page_address_init(void);
 #define PAGE_MAPPING_ANON	1
 
 extern struct address_space swapper_space;
+extern struct address_space migration_space;
 static inline struct address_space *page_mapping(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	if (unlikely(PageSwapCache(page)))
+	if (unlikely(PageMigration(page)))
+		mapping = &migration_space;
+	else if (unlikely(PageSwapCache(page)))
 		mapping = &swapper_space;
 	else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
 		mapping = NULL;
diff -puN include/linux/swap.h~migration_cache_marcelo include/linux/swap.h
--- linux-2.6.9-rc4/include/linux/swap.h~migration_cache_marcelo	Tue Oct 26 21:08:31 2004
+++ linux-2.6.9-rc4-taka/include/linux/swap.h	Tue Oct 26 21:09:47 2004
@@ -257,6 +257,7 @@ static inline int remove_exclusive_swap_
 {
 	return __remove_exclusive_swap_page(p, 0);
 }
+extern int migration_remove_entry(swp_entry_t);
 struct backing_dev_info;
 
 extern struct swap_list_t swap_list;
@@ -330,6 +331,21 @@ static inline swp_entry_t get_swap_page(
 #define put_swap_token(x) do { } while(0)
 #define grab_swap_token()  do { } while(0)
 #define has_swap_token(x) 0
+
+static inline int PageMigration(struct page *page)
+{
+        swp_entry_t entry;
+
+        if (!PageSwapCache(page))
+                return 0;
+
+        entry.val = page->private;
+
+        if (swp_type(entry) != MIGRATION_TYPE)
+                return 0;
+
+        return 1;
+}
 
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
diff -puN include/linux/swapops.h~migration_cache_marcelo include/linux/swapops.h
--- linux-2.6.9-rc4/include/linux/swapops.h~migration_cache_marcelo	Tue Oct 26 21:08:31 2004
+++ linux-2.6.9-rc4-taka/include/linux/swapops.h	Tue Oct 26 21:08:31 2004
@@ -10,7 +10,9 @@
  * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
  */
 #define SWP_TYPE_SHIFT(e)	(sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
-#define SWP_OFFSET_MASK(e)	((1UL << SWP_TYPE_SHIFT(e)) - 1)
+#define SWP_OFFSET_MASK(e)	((1UL << (SWP_TYPE_SHIFT(e))) - 1)
+
+#define MIGRATION_TYPE  (MAX_SWAPFILES - 1)
 
 /*
  * Store a type+offset into a swp_entry_t in an arch-independent format
@@ -67,3 +69,24 @@ static inline pte_t swp_entry_to_pte(swp
 	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
+
+static inline int pte_is_migration(pte_t pte)
+{
+	unsigned long swp_type;
+	swp_entry_t arch_entry;
+
+	arch_entry = __pte_to_swp_entry(pte);
+
+	swp_type = __swp_type(arch_entry);
+
+	return swp_type == MIGRATION_TYPE;
+}
+
+static inline pte_t migration_entry_to_pte(swp_entry_t entry)
+{
+	swp_entry_t arch_entry;
+	
+	arch_entry = __swp_entry(MIGRATION_TYPE, swp_offset(entry));
+	return __swp_entry_to_pte(arch_entry);
+}
+
diff -puN mm/fremap.c~migration_cache_marcelo mm/fremap.c
--- linux-2.6.9-rc4/mm/fremap.c~migration_cache_marcelo	Tue Oct 26 21:08:31 2004
+++ linux-2.6.9-rc4-taka/mm/fremap.c	Tue Oct 26 21:08:31 2004
@@ -11,7 +11,6 @@
 #include <linux/file.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
-#include <linux/swapops.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
@@ -43,8 +42,14 @@ static inline void zap_pte(struct mm_str
 			}
 		}
 	} else {
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte)) {
+			swp_entry_t swp_entry = pte_to_swp_entry(pte);
+			if (pte_is_migration(pte)) { 
+				migration_remove_entry(swp_entry);
+			} else {
+				free_swap_and_cache(swp_entry);
+			}
+		}
 		pte_clear(ptep);
 	}
 }
diff -puN mm/memory.c~migration_cache_marcelo mm/memory.c
--- linux-2.6.9-rc4/mm/memory.c~migration_cache_marcelo	Tue Oct 26 21:08:31 2004
+++ linux-2.6.9-rc4-taka/mm/memory.c	Tue Oct 26 21:08:31 2004
@@ -53,7 +53,6 @@
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-#include <linux/swapops.h>
 #include <linux/elf.h>
 
 #ifndef CONFIG_DISCONTIGMEM
@@ -456,8 +455,13 @@ static void zap_pte_range(struct mmu_gat
 		 */
 		if (unlikely(details))
 			continue;
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte)) {
+			swp_entry_t swp_entry = pte_to_swp_entry(pte);
+			if (pte_is_migration(pte)) {
+				migration_remove_entry(swp_entry);
+			} else
+				free_swap_and_cache(swp_entry);
+		}
 		pte_clear(ptep);
 	}
 	pte_unmap(ptep-1);
@@ -1533,6 +1537,9 @@ static int do_swap_page(struct mm_struct
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 again:
+	if (pte_is_migration(orig_pte)) {
+		page = lookup_migration_cache(entry.val);
+	} else {
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
@@ -1558,15 +1565,22 @@ again:
 		inc_page_state(pgmajfault);
 		grab_swap_token();
 	}
-
 	mark_page_accessed(page);
 	lock_page(page);
 	if (!PageSwapCache(page)) {
+		/* hiro: add !PageMigration(page) here */
 		/* page-migration has occured */
 		unlock_page(page);
 		page_cache_release(page);
 		goto again;
 	}
+	}
+
+
+	if (pte_is_migration(orig_pte)) {
+		mark_page_accessed(page);
+		lock_page(page);
+	}
 
 	/*
 	 * Back out if somebody else faulted in this pte while we
@@ -1584,10 +1598,14 @@ again:
 	}
 
 	/* The page isn't present yet, go ahead with the fault. */
-		
-	swap_free(entry);
-	if (vm_swap_full())
-		remove_exclusive_swap_page(page);
+
+	if (!pte_is_migration(orig_pte)) {
+		swap_free(entry);
+		if (vm_swap_full())
+			remove_exclusive_swap_page(page);
+	} else {
+		migration_remove_reference(page);
+	}
 
 	mm->rss++;
 	pte = mk_pte(page, vma->vm_page_prot);
diff -puN mm/mmigrate.c~migration_cache_marcelo mm/mmigrate.c
--- linux-2.6.9-rc4/mm/mmigrate.c~migration_cache_marcelo	Tue Oct 26 21:08:31 2004
+++ linux-2.6.9-rc4-taka/mm/mmigrate.c	Tue Oct 26 22:19:59 2004
@@ -21,6 +21,8 @@
 #include <linux/rmap.h>
 #include <linux/mmigrate.h>
 #include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/page-flags.h>
 
 /*
  * The concept of memory migration is to replace a target page with
@@ -35,6 +37,162 @@
  * hugetlbpages can be handled in the same way.
  */
 
+struct counter {
+	int i;
+};
+
+struct idr migration_idr;
+
+static struct address_space_operations migration_aops = {
+        .writepage      = NULL,
+        .sync_page      = NULL,
+        .set_page_dirty = __set_page_dirty_nobuffers,
+};
+
+static struct backing_dev_info migration_backing_dev_info = {
+        .memory_backed  = 1,    /* Does not contribute to dirty memory */
+        .unplug_io_fn   = NULL,
+};
+
+struct address_space migration_space = {
+        .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC),
+        .tree_lock      = RW_LOCK_UNLOCKED,
+        .a_ops          = &migration_aops,
+        .flags          = GFP_HIGHUSER,
+        .i_mmap_nonlinear = LIST_HEAD_INIT(migration_space.i_mmap_nonlinear),
+        .backing_dev_info = &migration_backing_dev_info,
+};
+
+int init_migration_cache(void) 
+{
+	idr_init(&migration_idr);
+
+	return 0;
+}
+
+__initcall(init_migration_cache);
+
+struct page *lookup_migration_cache(int id) 
+{ 
+	return find_get_page(&migration_space, id);
+}
+
+void migration_duplicate(swp_entry_t entry)
+{
+	int offset;
+	struct counter *cnt;
+
+	read_lock_irq(&migration_space.tree_lock);
+
+	cnt = idr_find(&migration_idr, swp_offset(entry));
+	cnt->i = cnt->i + 1;
+
+	read_unlock_irq(&migration_space.tree_lock);
+}
+
+void remove_from_migration_cache(struct page *page, int id)
+{
+	write_lock_irq(&migration_space.tree_lock);
+        idr_remove(&migration_idr, id);
+	radix_tree_delete(&migration_space.page_tree, id);
+	ClearPageSwapCache(page);
+	page->private = NULL;
+	write_unlock_irq(&migration_space.tree_lock);
+}
+
+// FIXME: if the page is locked will it be correctly removed from migr cache?
+// check races
+
+int migration_remove_entry(swp_entry_t entry)
+{
+	struct page *page;
+	
+	page = find_get_page(&migration_space, entry.val);
+
+	if (!page)
+		BUG();
+
+	lock_page(page);	
+
+	migration_remove_reference(page);
+
+	unlock_page(page);
+
+	page_cache_release(page);
+}
+
+int migration_remove_reference(struct page *page)
+{
+	struct counter *c;
+	swp_entry_t entry;
+
+	entry.val = page->private;
+
+	read_lock_irq(&migration_space.tree_lock);
+
+	c = idr_find(&migration_idr, swp_offset(entry));
+
+	read_unlock_irq(&migration_space.tree_lock);
+
+	if (!c->i)
+		BUG();
+
+	c->i--;
+
+	if (!c->i) {
+		remove_from_migration_cache(page, page->private);
+		kfree(c);
+		page_cache_release(page);
+	}
+		
+}
+
+int add_to_migration_cache(struct page *page, int gfp_mask) 
+{
+	int error, offset;
+	struct counter *counter;
+	swp_entry_t entry;
+
+	BUG_ON(PageSwapCache(page));
+
+	BUG_ON(PagePrivate(page));
+
+        if (idr_pre_get(&migration_idr, GFP_ATOMIC) == 0)
+                return -ENOMEM;
+
+	counter = kmalloc(sizeof(struct counter), GFP_KERNEL);
+
+	if (!counter)
+		return -ENOMEM;
+
+	error = radix_tree_preload(gfp_mask);
+
+	counter->i = 0;
+
+	if (!error) {
+		write_lock_irq(&migration_space.tree_lock);
+	        error = idr_get_new_above(&migration_idr, counter, 1, &offset);
+
+		if (error < 0)
+			BUG();
+
+		entry = swp_entry(MIGRATION_TYPE, offset);
+
+		error = radix_tree_insert(&migration_space.page_tree, entry.val,
+							page);
+		if (!error) {
+			page_cache_get(page);
+			SetPageLocked(page);
+			page->private = entry.val;
+			SetPageSwapCache(page);
+		}
+		write_unlock_irq(&migration_space.tree_lock);
+                radix_tree_preload_end();
+
+	}
+
+	return error;
+}
 
 /*
  * Try to writeback a dirty page to free its buffers.
@@ -121,9 +279,11 @@ page_migratable(struct page *page, struc
 	if (PageWriteback(page))
 		return -EAGAIN;
 	/* The page might have been truncated */
-	truncated = !PageSwapCache(newpage) && page_mapping(page) == NULL;
-	if (page_count(page) + truncated <= freeable_page_count)
+	truncated = !PageSwapCache(newpage) &&
+		page_mapping(page) == NULL;
+	if (page_count(page) + truncated <= freeable_page_count) 
 		return truncated ? -ENOENT : 0;
+
 	return -EAGAIN;
 }
 
@@ -367,11 +527,6 @@ generic_migrate_page(struct page *page, 
 
 	/* map the newpage where the old page have been mapped. */
 	touch_unmapped_address(&vlist);
-	if (PageSwapCache(newpage)) {
-		lock_page(newpage);
-		__remove_exclusive_swap_page(newpage, 1);
-		unlock_page(newpage);
-	}
 
 	page->mapping = NULL;
 	unlock_page(page);
@@ -383,11 +538,6 @@ out_busy:
 	/* Roll back all operations. */
 	rewind_page(page, newpage);
 	touch_unmapped_address(&vlist);
-	if (PageSwapCache(page)) {
-		lock_page(page);
-		__remove_exclusive_swap_page(page, 1);
-		unlock_page(page);
-	}
 	return ret;
 
 out_removing:
@@ -416,10 +566,14 @@ migrate_onepage(struct page *page)
 	 */
 #ifdef CONFIG_SWAP
 	if (PageAnon(page) && !PageSwapCache(page))
-		if (!add_to_swap(page, GFP_KERNEL)) {
+		if (add_to_migration_cache(page, GFP_KERNEL)) {
 			unlock_page(page);
 			return ERR_PTR(-ENOSPC);
 		}
+/*		if (!add_to_swap(page, GFP_KERNEL)) {
+			unlock_page(page);
+			return ERR_PTR(-ENOSPC);
+		} */
 #endif /* CONFIG_SWAP */
 	if ((mapping = page_mapping(page)) == NULL) {
 		/* truncation is in progress */
@@ -438,8 +592,9 @@ migrate_onepage(struct page *page)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (mapping->a_ops->migrate_page)
+	if (mapping->a_ops && mapping->a_ops->migrate_page) {
 		ret = mapping->a_ops->migrate_page(page, newpage);
+	}
 	else
 		ret = generic_migrate_page(page, newpage, migrate_page_common);
 	if (ret) {
diff -puN mm/page_io.c~migration_cache_marcelo mm/page_io.c
--- linux-2.6.9-rc4/mm/page_io.c~migration_cache_marcelo	Tue Oct 26 21:08:31 2004
+++ linux-2.6.9-rc4-taka/mm/page_io.c	Tue Oct 26 21:08:31 2004
@@ -15,7 +15,6 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/bio.h>
-#include <linux/swapops.h>
 #include <linux/writeback.h>
 #include <asm/pgtable.h>
 
diff -puN mm/rmap.c~migration_cache_marcelo mm/rmap.c
--- linux-2.6.9-rc4/mm/rmap.c~migration_cache_marcelo	Tue Oct 26 21:08:31 2004
+++ linux-2.6.9-rc4-taka/mm/rmap.c	Tue Oct 26 21:08:31 2004
@@ -50,7 +50,7 @@
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+//#include <linux/swapops.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/rmap.h>
@@ -644,22 +644,36 @@ static int try_to_unmap_one(struct page 
 	if (pte_dirty(pteval))
 		set_page_dirty(page);
 
-	if (PageAnon(page)) {
-		swp_entry_t entry = { .val = page->private };
-		/*
-		 * Store the swap location in the pte.
-		 * See handle_pte_fault() ...
-		 */
-		BUG_ON(!PageSwapCache(page));
-		swap_duplicate(entry);
-		if (list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			list_add(&mm->mmlist, &init_mm.mmlist);
-			spin_unlock(&mmlist_lock);
+		if (PageAnon(page)) {
+			swp_entry_t entry = { .val = page->private };
+			/*
+			 * Store the swap location in the pte.
+			 * See handle_pte_fault() ...
+			 */
+	//		BUG_ON(!PageSwapCache(page));
+			if (PageSwapCache(page) && !PageMigration(page)) {
+				swap_duplicate(entry);
+				if (list_empty(&mm->mmlist)) {
+					spin_lock(&mmlist_lock);
+					list_add(&mm->mmlist, &init_mm.mmlist);
+					spin_unlock(&mmlist_lock);
+				}
+				set_pte(pte, swp_entry_to_pte(entry));
+				BUG_ON(pte_file(*pte));
+			} else if (PageMigration(page)) {
+				// page cache get to reference pte,
+				// remove from migration cache
+				// on zero-users at fault path
+				migration_duplicate(entry);
+				if (list_empty(&mm->mmlist)) {
+					spin_lock(&mmlist_lock);
+					list_add(&mm->mmlist, &init_mm.mmlist);
+					spin_unlock(&mmlist_lock);
+				}
+				set_pte(pte, migration_entry_to_pte(entry));
+				BUG_ON(pte_file(*pte));
+			}
 		}
-		set_pte(pte, swp_entry_to_pte(entry));
-		BUG_ON(pte_file(*pte));
-	}
 
 	mm->rss--;
 	page_remove_rmap(page);
diff -puN mm/shmem.c~migration_cache_marcelo mm/shmem.c
--- linux-2.6.9-rc4/mm/shmem.c~migration_cache_marcelo	Tue Oct 26 21:08:31 2004
+++ linux-2.6.9-rc4-taka/mm/shmem.c	Tue Oct 26 21:08:31 2004
@@ -42,7 +42,6 @@
 #include <linux/vfs.h>
 #include <linux/blkdev.h>
 #include <linux/security.h>
-#include <linux/swapops.h>
 #include <linux/mempolicy.h>
 #include <linux/namei.h>
 #include <linux/xattr.h>
diff -puN mm/swapfile.c~migration_cache_marcelo mm/swapfile.c
--- linux-2.6.9-rc4/mm/swapfile.c~migration_cache_marcelo	Tue Oct 26 21:08:31 2004
+++ linux-2.6.9-rc4-taka/mm/swapfile.c	Tue Oct 26 21:08:31 2004
@@ -33,7 +33,6 @@
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
-#include <linux/swapops.h>
 
 spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
 unsigned int nr_swapfiles;
diff -puN mm/vmscan.c~migration_cache_marcelo mm/vmscan.c
--- linux-2.6.9-rc4/mm/vmscan.c~migration_cache_marcelo	Tue Oct 26 21:08:31 2004
+++ linux-2.6.9-rc4-taka/mm/vmscan.c	Tue Oct 26 21:08:31 2004
@@ -39,8 +39,6 @@
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 
-#include <linux/swapops.h>
-
 /*
  * The list of shrinker callbacks used by to apply pressure to
  * ageable caches.
@@ -460,7 +458,9 @@ int shrink_list(struct list_head *page_l
 		}
 
 #ifdef CONFIG_SWAP
-		if (PageSwapCache(page)) {
+		// FIXME: allow relocation of migrate cache pages 
+		// into real swap pages for swapout.
+		if (PageSwapCache(page) && !PageMigration(page)) {
 			swp_entry_t swap = { .val = page->private };
 			__delete_from_swap_cache(page);
 			write_unlock_irq(&mapping->tree_lock);
_
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-26  9:25   ` Marcelo Tosatti
@ 2004-10-26 14:01     ` Hirokazu Takahashi
  2004-10-26 12:24       ` Marcelo Tosatti
  0 siblings, 1 reply; 48+ messages in thread
From: Hirokazu Takahashi @ 2004-10-26 14:01 UTC (permalink / raw)
  To: marcelo.tosatti; +Cc: linux-mm, iwamoto, haveblue, hugh

Hi, Marcelo,

> > > diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c linux-2.6.9-rc2-mm4.build/mm/vmscan.c
> > > --- linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c	2004-10-05 15:08:23.000000000 -0300
> > > +++ linux-2.6.9-rc2-mm4.build/mm/vmscan.c	2004-10-25 19:15:56.000000000 -0200
> > > @@ -459,7 +457,9 @@ int shrink_list(struct list_head *page_l
> > >  		}
> > >  
> > >  #ifdef CONFIG_SWAP
> > > -		if (PageSwapCache(page)) {
> > > +		// FIXME: allow relocation of migrate cache pages 
> > > +		// into real swap pages for swapout.
> > 
> > 
> > In my thought, it would be better to remove a target page from the
> > LRU lists prior to migration. So that it makes the swap code not to
> > grab the page, which is in the migration cache.
> 
> I don't see a problem with having the pages on LRU - the reclaiming 
> code sees it, but it's unfreeable, so it doesn't touch it. 
> 
> The reclaiming path should see its a migration page, unmap the pte's
> to it, remap them to swapcache pages (and ptes), so they can be
> swapped out on pressure.
> 
> Can you please expand your thoughts?

I thought the easiest way to avoid the race condition was
removing the page from LRU during memory migration.
But there may be no problem about the page, which is unfreeable
as you mentioned.

BTW, I wonder how the migration code avoids choosing some pages
on the LRU which may have count == 0. This may happen when the pages
are about to be removed. We have to take care about it.


> > > +		if (PageSwapCache(page) && !PageMigration(page)) {
> > >  			swp_entry_t swap = { .val = page->private };
> > >  			__delete_from_swap_cache(page);
> > >  			write_unlock_irq(&mapping->tree_lock);
> > > 

Thanks,
Hirokazu Takahashi.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-26 12:01   ` Marcelo Tosatti
@ 2004-10-26 23:47     ` Hiroyuki KAMEZAWA
  0 siblings, 0 replies; 48+ messages in thread
From: Hiroyuki KAMEZAWA @ 2004-10-26 23:47 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: linux-mm, Hirokazu Takahashi, IWAMOTO Toshihiro, Dave Hansen,
	Hugh Dickins

Marcelo Tosatti wrote:

> 
> This should do it?
> 
> --- swapfile.c.orig     2004-10-26 11:33:56.734551048 -0200
> +++ swapfile.c  2004-10-26 11:34:03.284555296 -0200
> @@ -1370,6 +1370,13 @@ asmlinkage long sys_swapon(const char __
>                 swap_list_unlock();
>                 goto out;
>         }
> +
> +       /* MAX_SWAPFILES-1 is reserved for migration pages */
> +       if (type > MAX_SWAPFILES-1) {
> +               swap_list_unlock();
> +               goto out;
> +       }
> +
>         if (type >= nr_swapfiles)
>                 nr_swapfiles = type+1;
>         INIT_LIST_HEAD(&p->extent_list);
> 

This looks easier to read than my suggestion :).
But..
=========
if (type >=  MIGRATION_TYPE) { /* MIGRATION_TYPE is set to maximum available swp_type. */
	goto out;
}
=========
Is maybe correct .

Thanks.

Kame <kamezawa.hiroyu@jp.fujitsu.com>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-26 12:24       ` Marcelo Tosatti
@ 2004-10-27  7:25         ` IWAMOTO Toshihiro
  2004-10-27 16:27           ` Marcelo Tosatti
  2004-10-27 13:48         ` Hirokazu Takahashi
  1 sibling, 1 reply; 48+ messages in thread
From: IWAMOTO Toshihiro @ 2004-10-27  7:25 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Hirokazu Takahashi, linux-mm, iwamoto, haveblue, hugh

At Tue, 26 Oct 2004 10:24:19 -0200,
Marcelo Tosatti wrote:

> Pages with reference count zero will be not be moved to the page
> list, and truncated pages seem to be handled nicely later on the
> migration codepath.
> 
> A quick search on Iwamoto's test utils shows no sign of truncate(). 

IIRC, the easiest test method is file overwrite, such as

	while true; do
		tar zxvf ../some.tar.gz
	done


> It would be nice to add more testcases (such as truncate() 
> intensive application) to his testsuite.

And it would be great to have an automated regression test suite.
I wonder if OSDL's test harness(http://stp.sf.net/) could be used, but
I had no chance to investigate any further.

--
IWAMOTO Toshihiro
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-26 13:45     ` Hirokazu Takahashi
  2004-10-26 11:41       ` Marcelo Tosatti
@ 2004-10-27 13:40       ` Hirokazu Takahashi
  1 sibling, 0 replies; 48+ messages in thread
From: Hirokazu Takahashi @ 2004-10-27 13:40 UTC (permalink / raw)
  To: marcelo.tosatti; +Cc: linux-mm, iwamoto, haveblue, hugh

Hi, Marcelo,

An oops has occurred on my box with the migration cache patch
after the long run. I tested it with Iwamoto's zone hotplug
emulation patch.

There may be some migration cache related bugs there. 
Some pages seem to remain in the migration cache
after page migration.

> +	if (!c->i) {
> +		remove_from_migration_cache(page, page->private);
> +		kfree(c);
> 
> page_cache_release(page) should be invoked here, as the count for
> the migration cache must be decreased.
> With this fix, your migration cache started to work very fine!
> 
> +	}
> +		
> +}

Please take a look at the attached logs.

Some pages might have been put in LRU, and they seems to
have been chosen as target pages to migrate again.
The pages are handled as swap-cache pages accidentally.
And both of the swp_offset in the logs seem to be
very big. This looks like some pages in the migration cache
haven't been released yet.

"swap file entry f8000223" means:
    - swp_type is 0x1f, which is MIGRATION_TYPE.
    - swp_offset is 0x223.

"swap file entry f8005ff4" means:
    - swp_type is 0x1f, which is MIGRATION_TYPE.
    - swp_offset is 0x5ff4.

I have no idea why this has happened yet.


Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f8000223
Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f8000224
Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f8000225
Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f8000226
Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f8000227
Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f8000228
Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f8000229
Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f800022a
Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f800022b
Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f800022c
Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f800022d
Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f800022e
Oct 26 21:28:06 target1 kernel: swap_free: Bad swap file entry f800022f
Oct 26 21:28:08 target1 kernel: swap_free: Bad swap file entry f8000230
Oct 26 21:28:08 target1 kernel: swap_free: Bad swap file entry f8000231
Oct 26 21:28:08 target1 kernel: swap_free: Bad swap file entry f8000232
Oct 26 21:28:08 target1 kernel: swap_free: Bad swap file entry f8000233
Oct 26 21:28:08 target1 kernel: swap_free: Bad swap file entry f8000234
Oct 26 21:28:08 target1 kernel: swap_free: Bad swap file entry f8000235
Oct 26 21:28:08 target1 kernel: swap_free: Bad swap file entry f8000236
Oct 26 21:28:08 target1 kernel: Unable to handle kernel NULL pointer dereference at virtual address 00000028
Oct 26 21:28:08 target1 kernel:  printing eip:
Oct 26 21:28:08 target1 kernel: c028ef60
Oct 26 21:28:08 target1 kernel: *pde = 00000000
Oct 26 21:28:08 target1 kernel: Oops: 0002 [#1]
Oct 26 21:28:08 target1 kernel: SMP 
Oct 26 21:28:08 target1 kernel: Modules linked in:
Oct 26 21:28:08 target1 kernel: CPU:    0
Oct 26 21:28:08 target1 kernel: EIP:    0060:[_spin_lock+0/16]    Not tainted VLI
Oct 26 21:28:08 target1 kernel: EIP:    0060:[<c028ef60>]    Not tainted VLI
Oct 26 21:28:08 target1 kernel: EFLAGS: 00010282   (2.6.9-rc4-mm1) 
Oct 26 21:28:08 target1 kernel: EIP is at _spin_lock+0x0/0x10
Oct 26 21:28:09 target1 kernel: eax: 00000028   ebx: 00008097   ecx: ce0e7ed0   edx: ce0e7f24
Oct 26 21:28:09 target1 kernel: esi: 00000000   edi: f71fa8a0   ebp: 00000000   esp: ce0e7ec4
Oct 26 21:28:09 target1 kernel: ds: 007b   es: 007b   ss: 0068
Oct 26 21:28:09 target1 kernel: Process migrate131072 (pid: 1700, threadinfo=ce0e6000 task=cf01a100)
Oct 26 21:28:09 target1 kernel: Stack: c014ca72 00000000 00000000 00000001 cc9c7414 00000022 00000246 c013a5f1 
Oct 26 21:28:09 target1 kernel:        c1187420 f71fa8a0 c1187420 c03036e0 00000000 f71fa8a0 ce0e7f24 f71fa8a0 
Oct 26 21:28:09 target1 kernel:        00000000 c014ccc7 f71fa8a0 ce0e7f24 ce0e7f24 c0154b1f f71fa8a0 ce0e7f24 
Oct 26 21:28:09 target1 kernel: Call Trace:
Oct 26 21:28:09 target1 kernel:  [try_to_unmap_file+50/576] try_to_unmap_file+0x32/0x240
Oct 26 21:28:09 target1 kernel:  [<c014ca72>] try_to_unmap_file+0x32/0x240
Oct 26 21:28:09 target1 kernel:  [buffered_rmqueue+433/480] buffered_rmqueue+0x1b1/0x1e0
Oct 26 21:28:09 target1 kernel:  [<c013a5f1>] buffered_rmqueue+0x1b1/0x1e0
Oct 26 21:28:09 target1 kernel:  [try_to_unmap+71/89] try_to_unmap+0x47/0x59
Oct 26 21:28:09 target1 kernel:  [<c014ccc7>] try_to_unmap+0x47/0x59
Oct 26 21:28:09 target1 kernel:  [generic_migrate_page+95/688] generic_migrate_page+0x5f/0x2b0
Oct 26 21:28:09 target1 kernel:  [<c0154b1f>] generic_migrate_page+0x5f/0x2b0
Oct 26 21:28:09 target1 kernel:  [migrate_onepage+308/416] migrate_onepage+0x134/0x1a0
Oct 26 21:28:09 target1 kernel:  [<c0154ea4>] migrate_onepage+0x134/0x1a0
Oct 26 21:28:09 target1 kernel:  [migrate_page_common+0/256] migrate_page_common+0x0/0x100
Oct 26 21:28:09 target1 kernel:  [<c01545e0>] migrate_page_common+0x0/0x100
Oct 26 21:28:09 target1 kernel:  [try_to_migrate_pages+1144/1616] try_to_migrate_pages+0x478/0x650
Oct 26 21:28:09 target1 kernel:  [<c0155388>] try_to_migrate_pages+0x478/0x650
Oct 26 21:28:09 target1 kernel:  [mmigrated+146/201] mmigrated+0x92/0xc9
Oct 26 21:28:09 target1 kernel:  [<c01559b2>] mmigrated+0x92/0xc9
Oct 26 21:28:09 target1 kernel:  [mmigrated+0/201] mmigrated+0x0/0xc9
Oct 26 21:28:09 target1 kernel:  [<c0155920>] mmigrated+0x0/0xc9
Oct 26 21:28:09 target1 kernel:  [kernel_thread_helper+5/24] kernel_thread_helper+0x5/0x18
Oct 26 21:28:09 target1 kernel:  [<c010408d>] kernel_thread_helper+0x5/0x18
Oct 26 21:28:09 target1 kernel: Code: 00 00 01 74 05 e8 79 ec ff ff c3 ba 00 e0 ff ff 21 e2 81 42 14 00 01 00 00 f0 81 28 00 00 00 01 74 05 e8 5c ec ff ff c3 8d 76 00 <f0> fe 08 79 09 f3 90 80 38 00 7e f9 eb f2 c3 90 f0 81 28 00 00 





Oct 27 05:50:48 target1 kernel: swap_dup: Bad swap file entry f8005ff4
Oct 27 05:50:49 target1 kernel: ------------[ cut here ]------------
Oct 27 05:50:49 target1 kernel: kernel BUG at mm/mmigrate.c:115!
Oct 27 05:50:49 target1 kernel: invalid operand: 0000 [#1]
Oct 27 05:50:49 target1 kernel: SMP 
Oct 27 05:50:49 target1 kernel: Modules linked in:
Oct 27 05:50:49 target1 kernel: CPU:    0
Oct 27 05:50:49 target1 kernel: EIP:    0060:[migration_remove_entry+24/80]    Not tainted VLI
Oct 27 05:50:49 target1 kernel: EIP:    0060:[<c0154158>]    Not tainted VLI
Oct 27 05:50:49 target1 kernel: EFLAGS: 00010246   (2.6.9-rc4-mm1) 
Oct 27 05:50:49 target1 kernel: EIP is at migration_remove_entry+0x18/0x50
Oct 27 05:50:49 target1 kernel: eax: c0303700   ebx: 00000000   ecx: c0303704   edx: f8005ff4
Oct 27 05:50:49 target1 kernel: esi: 00000000   edi: 00000000   ebp: c12033a0   esp: cb681e08
Oct 27 05:50:49 target1 kernel: ds: 007b   es: 007b   ss: 0068
Oct 27 05:50:49 target1 kernel: Process migrate65536 (pid: 22493, threadinfo=cb680000 task=cee90a80)
Oct 27 05:50:49 target1 kernel: Stack: 00005ff4 c0145c61 f8005ff4 c68e9f64 00004000 bffd5000 c0000000 c40b5c00 
Oct 27 05:50:49 target1 kernel:        00000000 c0145ce5 c12033a0 c40b5bfc bffd5000 0002b000 00000000 bffd5000 
Oct 27 05:50:49 target1 kernel:        c40b5c00 c0000000 00000000 c0145d55 c12033a0 c40b5bfc bffd5000 0002b000 
Oct 27 05:50:49 target1 kernel: Call Trace:
Oct 27 05:50:49 target1 kernel:  [zap_pte_range+705/768] zap_pte_range+0x2c1/0x300
Oct 27 05:50:49 target1 kernel:  [<c0145c61>] zap_pte_range+0x2c1/0x300
Oct 27 05:50:49 target1 kernel:  [zap_pmd_range+69/112] zap_pmd_range+0x45/0x70
Oct 27 05:50:49 target1 kernel:  [<c0145ce5>] zap_pmd_range+0x45/0x70
Oct 27 05:50:49 target1 kernel:  [unmap_page_range+69/112] unmap_page_range+0x45/0x70
Oct 27 05:50:49 target1 kernel:  [<c0145d55>] unmap_page_range+0x45/0x70
Oct 27 05:50:49 target1 kernel:  [unmap_vmas+376/640] unmap_vmas+0x178/0x280
Oct 27 05:50:49 target1 kernel:  [<c0145ef8>] unmap_vmas+0x178/0x280
Oct 27 05:50:49 target1 kernel:  [exit_mmap+123/336] exit_mmap+0x7b/0x150
Oct 27 05:50:49 target1 kernel:  [<c014a64b>] exit_mmap+0x7b/0x150
Oct 27 05:50:49 target1 kernel:  [mmput+33/160] mmput+0x21/0xa0
Oct 27 05:50:49 target1 kernel:  [<c011a611>] mmput+0x21/0xa0
Oct 27 05:50:49 target1 kernel:  [touch_unmapped_address+208/256] touch_unmapped_address+0xd0/0x100
Oct 27 05:50:50 target1 kernel:  [<c014c480>] touch_unmapped_address+0xd0/0x100
Oct 27 05:50:50 target1 kernel:  [generic_migrate_page+482/688] generic_migrate_page+0x1e2/0x2b0
Oct 27 05:50:50 target1 kernel:  [<c0154cc2>] generic_migrate_page+0x1e2/0x2b0
Oct 27 05:50:50 target1 kernel:  [migrate_onepage+308/416] migrate_onepage+0x134/0x1a0
Oct 27 05:50:50 target1 kernel:  [<c0154ec4>] migrate_onepage+0x134/0x1a0
Oct 27 05:50:50 target1 kernel:  [migrate_page_common+0/256] migrate_page_common+0x0/0x100
Oct 27 05:50:50 target1 kernel:  [<c0154600>] migrate_page_common+0x0/0x100
Oct 27 05:50:50 target1 kernel:  [try_to_migrate_pages+648/1616] try_to_migrate_pages+0x288/0x650
Oct 27 05:50:50 target1 kernel:  [<c01551b8>] try_to_migrate_pages+0x288/0x650
Oct 27 05:50:50 target1 kernel:  [mmigrated+146/201] mmigrated+0x92/0xc9
Oct 27 05:50:50 target1 kernel:  [<c01559d2>] mmigrated+0x92/0xc9
Oct 27 05:50:50 target1 kernel:  [mmigrated+0/201] mmigrated+0x0/0xc9
Oct 27 05:50:50 target1 kernel:  [<c0155940>] mmigrated+0x0/0xc9
Oct 27 05:50:50 target1 kernel:  [kernel_thread_helper+5/24] kernel_thread_helper+0x5/0x18
Oct 27 05:50:50 target1 kernel:  [<c010408d>] kernel_thread_helper+0x5/0x18
Oct 27 05:50:50 target1 kernel: Code: 0c 00 00 00 00 ff 0d 80 36 30 c0 e8 03 af 13 00 5b 5e c3 53 8b 4c 24 08 51 68 00 37 30 c0 e8 40 23 fe ff 89 c3 58 85 db 5a 75 08 <0f> 0b 73 00 ee 43 2a c0 31 c0 f0 0f ab 03 19 c0 85 c0 74 07 89 
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-26 12:24       ` Marcelo Tosatti
  2004-10-27  7:25         ` IWAMOTO Toshihiro
@ 2004-10-27 13:48         ` Hirokazu Takahashi
  2004-10-28 15:19           ` Marcelo Tosatti
  1 sibling, 1 reply; 48+ messages in thread
From: Hirokazu Takahashi @ 2004-10-27 13:48 UTC (permalink / raw)
  To: marcelo.tosatti; +Cc: linux-mm, iwamoto, haveblue, hugh

Hi,

> > BTW, I wonder how the migration code avoid to choose some pages
> > on LRU, which may have count == 0. This may happen the pages
> > are going to be removed. We have to care about it.
> 
> AFAICS its already done by __steal_page_from_lru(), which is used
> by grab_capturing_pages():
	:
> Pages with reference count zero will not be moved to the page
> list, and truncated pages seem to be handled nicely later on the
> migration codepath.

Ok, I see no problem about this with the current implementation.


BTW, now I'm just wondering whether migration_duplicate() should be
called from copy_page_range(), since page-migration and fork()
may work at the same time.

What do you think about this?


Thanks,
Hirokazu Takahashi.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-27  7:25         ` IWAMOTO Toshihiro
@ 2004-10-27 16:27           ` Marcelo Tosatti
  0 siblings, 0 replies; 48+ messages in thread
From: Marcelo Tosatti @ 2004-10-27 16:27 UTC (permalink / raw)
  To: IWAMOTO Toshihiro
  Cc: Hirokazu Takahashi, linux-mm, haveblue, hugh, cliffw, judith

On Wed, Oct 27, 2004 at 04:25:24PM +0900, IWAMOTO Toshihiro wrote:
> At Tue, 26 Oct 2004 10:24:19 -0200,
> Marcelo Tosatti wrote:
> 
> > Pages with reference count zero will not be moved to the page
> > list, and truncated pages seem to be handled nicely later on the
> > migration codepath.
> > 
> > A quick search on Iwamoto's test utils shows no sign of truncate(). 
> 
> IIRC, the easiest test method is file overwrite, such as
> 
> 	while true; do
> 		tar zxvf ../some.tar.gz
> 	done
> 
> 
> > It would be nice to add more testcases (such as truncate() 
> > intensive application) to his testsuite.
> 
> And it would be great to have an automated regression test suite.
> I wonder if OSDL's test harness(http://stp.sf.net/) could be used, but
> I had no chance to investigate any further.

I don't think it is usable as it is because the benchmarks are fixed
and you can't have scripts (your own commands) running as far as I 
remember - so its not possible to remove memory regions.

Other than that it should be fine - make a script to add/remove
memory zones and let the benchmarks run.

Cliff, Judith, is that right?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-27 13:48         ` Hirokazu Takahashi
@ 2004-10-28 15:19           ` Marcelo Tosatti
  2004-10-28 16:05             ` Marcelo Tosatti
  0 siblings, 1 reply; 48+ messages in thread
From: Marcelo Tosatti @ 2004-10-28 15:19 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: linux-mm, iwamoto, haveblue, hugh

On Wed, Oct 27, 2004 at 10:48:37PM +0900, Hirokazu Takahashi wrote:
> Hi,
> 
> > > BTW, I wonder how the migration code avoid to choose some pages
> > > on LRU, which may have count == 0. This may happen the pages
> > > are going to be removed. We have to care about it.
> > 
> > AFAICS its already done by __steal_page_from_lru(), which is used
> > by grab_capturing_pages():
> 	:
> > Pages with reference count zero will be not be moved to the page
> > list, and truncated pages seem to be handled nicely later on the
> > migration codepath.
> 
> Ok, I see no problem about this with the current implementation.
> 
> 
> BTW, now I'm just wondering migration_duplicate() should be
> called from copy_page_range(), since page-migration and fork()
> may work at the same time.
> 
> What do you think about this?

Yep, that's probably what caused your failures.

I'll prepare a new patch.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-28 15:19           ` Marcelo Tosatti
@ 2004-10-28 16:05             ` Marcelo Tosatti
  2004-10-28 18:51               ` Dave Hansen
  2004-11-05 13:49               ` Hirokazu Takahashi
  0 siblings, 2 replies; 48+ messages in thread
From: Marcelo Tosatti @ 2004-10-28 16:05 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: linux-mm, iwamoto, haveblue, hugh

On Thu, Oct 28, 2004 at 01:19:28PM -0200, Marcelo Tosatti wrote:
> On Wed, Oct 27, 2004 at 10:48:37PM +0900, Hirokazu Takahashi wrote:
> > Hi,
> > 
> > > > BTW, I wonder how the migration code avoid to choose some pages
> > > > on LRU, which may have count == 0. This may happen the pages
> > > > are going to be removed. We have to care about it.
> > > 
> > > AFAICS its already done by __steal_page_from_lru(), which is used
> > > by grab_capturing_pages():
> > 	:
> > > Pages with reference count zero will be not be moved to the page
> > > list, and truncated pages seem to be handled nicely later on the
> > > migration codepath.
> > 
> > Ok, I see no problem about this with the current implementation.
> > 
> > 
> > BTW, now I'm just wondering migration_duplicate() should be
> > called from copy_page_range(), since page-migration and fork()
> > may work at the same time.
> > 
> > What do you think about this?
> 
> Yep thats probably what caused your failures.
> 
> I'll prepare a new patch.

Here it is - with the copy_page_range() fix as you pointed out,
plus sys_swapon() fix as suggested by Hiroyuki.

I've also added a BUG() in case of swap_free() failure, so we 
get a backtrace.

Can you please test this - thanks.

diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/include/linux/mm.h linux-2.6.9-rc2-mm4.build/include/linux/mm.h
--- linux-2.6.9-rc2-mm4.mhp.orig/include/linux/mm.h	2004-10-05 15:09:38.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/include/linux/mm.h	2004-10-25 18:42:37.000000000 -0200
@@ -251,6 +251,24 @@ extern int capture_page_range(unsigned l
  * files which need it (119 of them)
  */
 #include <linux/page-flags.h>
+#include <linux/swap.h>
+#include <linux/swapops.h> 
+
+static inline int PageMigration(struct page *page)
+{
+        swp_entry_t entry;
+
+        if (!PageSwapCache(page))
+                return 0;
+
+        entry.val = page->private;
+
+        if (swp_type(entry) != MIGRATION_TYPE)
+                return 0;
+
+        return 1;
+}
+
 
 /*
  * Methods to modify the page usage count.
@@ -458,11 +476,14 @@ void page_address_init(void);
 #define PAGE_MAPPING_ANON	1
 
 extern struct address_space swapper_space;
+extern struct address_space migration_space;
 static inline struct address_space *page_mapping(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	if (unlikely(PageSwapCache(page)))
+	if (unlikely(PageMigration(page)))
+		mapping = &migration_space;
+	else if (unlikely(PageSwapCache(page)))
 		mapping = &swapper_space;
 	else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
 		mapping = NULL;
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swap.h linux-2.6.9-rc2-mm4.build/include/linux/swap.h
--- linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swap.h	2004-10-05 15:09:39.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/include/linux/swap.h	2004-10-25 20:42:27.000000000 -0200
@@ -253,6 +253,7 @@ extern sector_t map_swap_page(struct swa
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+extern int migration_remove_entry(swp_entry_t);
 struct backing_dev_info;
 
 extern struct swap_list_t swap_list;
@@ -321,6 +322,21 @@ static inline swp_entry_t get_swap_page(
 #define grab_swap_token()  do { } while(0)
 #define has_swap_token(x) 0
 
+static inline int PageMigration(struct page *page)
+{
+        swp_entry_t entry;
+
+        if (!PageSwapCache(page))
+                return 0;
+
+        entry.val = page->private;
+
+        if (swp_type(entry) != MIGRATION_TYPE)
+                return 0;
+
+        return 1;
+}
+
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swapops.h linux-2.6.9-rc2-mm4.build/include/linux/swapops.h
--- linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swapops.h	2004-10-05 15:09:35.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/include/linux/swapops.h	2004-10-24 12:15:07.000000000 -0200
@@ -10,7 +10,9 @@
  * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
  */
 #define SWP_TYPE_SHIFT(e)	(sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
-#define SWP_OFFSET_MASK(e)	((1UL << SWP_TYPE_SHIFT(e)) - 1)
+#define SWP_OFFSET_MASK(e)	((1UL << (SWP_TYPE_SHIFT(e))) - 1)
+
+#define MIGRATION_TYPE  (MAX_SWAPFILES - 1)
 
 /*
  * Store a type+offset into a swp_entry_t in an arch-independent format
@@ -30,8 +32,7 @@ static inline swp_entry_t swp_entry(unsi
  */
 static inline unsigned swp_type(swp_entry_t entry)
 {
-	return (entry.val >> SWP_TYPE_SHIFT(entry)) &
-			((1 << MAX_SWAPFILES_SHIFT) - 1);
+	return ((entry.val >> SWP_TYPE_SHIFT(entry)));
 }
 
 /*
@@ -68,3 +69,24 @@ static inline pte_t swp_entry_to_pte(swp
 	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
+
+static inline int pte_is_migration(pte_t pte)
+{
+	unsigned long swp_type;
+	swp_entry_t arch_entry;
+
+	arch_entry = __pte_to_swp_entry(pte);
+
+	swp_type = __swp_type(arch_entry);
+
+	return swp_type == MIGRATION_TYPE;
+}
+
+static inline pte_t migration_entry_to_pte(swp_entry_t entry)
+{
+	swp_entry_t arch_entry;
+	
+	arch_entry = __swp_entry(MIGRATION_TYPE, swp_offset(entry));
+	return __swp_entry_to_pte(arch_entry);
+}
+
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/fremap.c linux-2.6.9-rc2-mm4.build/mm/fremap.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/fremap.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/fremap.c	2004-10-25 20:44:05.000000000 -0200
@@ -11,7 +11,6 @@
 #include <linux/file.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
-#include <linux/swapops.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
@@ -43,8 +42,14 @@ static inline void zap_pte(struct mm_str
 			}
 		}
 	} else {
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte)) {
+			swp_entry_t swp_entry = pte_to_swp_entry(pte);
+			if (pte_is_migration(pte)) { 
+				migration_remove_entry(swp_entry);
+			} else {
+				free_swap_and_cache(swp_entry);
+			}
+		}
 		pte_clear(ptep);
 	}
 }
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c linux-2.6.9-rc2-mm4.build/mm/memory.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/memory.c	2004-10-28 15:06:59.000000000 -0200
@@ -53,7 +53,6 @@
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-#include <linux/swapops.h>
 #include <linux/elf.h>
 
 #ifndef CONFIG_DISCONTIGMEM
@@ -290,7 +289,13 @@ skip_copy_pte_range:
 				/* pte contains position in swap, so copy. */
 				if (!pte_present(pte)) {
 					if (!pte_file(pte)) {
-						swap_duplicate(pte_to_swp_entry(pte));
+						swp_entry_t entry;
+						entry = pte_to_swp_entry(pte);
+						if (pte_is_migration(pte)) 
+							migration_duplicate(entry);
+						else
+							swap_duplicate(entry);
+						
 						if (list_empty(&dst->mmlist)) {
 							spin_lock(&mmlist_lock);
 							list_add(&dst->mmlist,
@@ -456,8 +461,13 @@ static void zap_pte_range(struct mmu_gat
 		 */
 		if (unlikely(details))
 			continue;
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte)) {
+			swp_entry_t swp_entry = pte_to_swp_entry(pte);
+			if (pte_is_migration(pte)) {
+				migration_remove_entry(swp_entry);
+			} else
+				free_swap_and_cache(swp_entry);
+		}
 		pte_clear(ptep);
 	}
 	pte_unmap(ptep-1);
@@ -1408,6 +1418,9 @@ static int do_swap_page(struct mm_struct
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 again:
+	if (pte_is_migration(orig_pte)) {
+		page = lookup_migration_cache(entry.val);
+	} else {
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
@@ -1433,15 +1446,22 @@ again:
 		inc_page_state(pgmajfault);
 		grab_swap_token();
 	}
-
 	mark_page_accessed(page);
 	lock_page(page);
 	if (!PageSwapCache(page)) {
+		/* hiro: add !PageMigration(page) here */
 		/* page-migration has occured */
 		unlock_page(page);
 		page_cache_release(page);
 		goto again;
 	}
+	}
+
+
+	if (pte_is_migration(orig_pte)) {
+		mark_page_accessed(page);
+		lock_page(page);
+	}
 
 	/*
 	 * Back out if somebody else faulted in this pte while we
@@ -1459,10 +1479,14 @@ again:
 	}
 
 	/* The page isn't present yet, go ahead with the fault. */
-		
-	swap_free(entry);
-	if (vm_swap_full())
-		remove_exclusive_swap_page(page);
+
+	if (!pte_is_migration(orig_pte)) {
+		swap_free(entry);
+		if (vm_swap_full())
+			remove_exclusive_swap_page(page);
+	} else {
+		migration_remove_reference(page);
+	}
 
 	mm->rss++;
 	pte = mk_pte(page, vma->vm_page_prot);
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/mmigrate.c linux-2.6.9-rc2-mm4.build/mm/mmigrate.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/mmigrate.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/mmigrate.c	2004-10-28 15:03:44.000000000 -0200
@@ -1,4 +1,4 @@
-/*
+ /*
  *  linux/mm/mmigrate.c
  *
  *  Support of memory hotplug
@@ -21,6 +21,8 @@
 #include <linux/rmap.h>
 #include <linux/mmigrate.h>
 #include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/page-flags.h>
 
 /*
  * The concept of memory migration is to replace a target page with
@@ -35,6 +37,161 @@
  * hugetlbpages can be handled in the same way.
  */
 
+struct counter {
+	int i;
+};
+
+struct idr migration_idr;
+
+static struct address_space_operations migration_aops = {
+        .writepage      = NULL,
+        .sync_page      = NULL,
+        .set_page_dirty = __set_page_dirty_nobuffers,
+};
+
+static struct backing_dev_info migration_backing_dev_info = {
+        .memory_backed  = 1,    /* Does not contribute to dirty memory */
+        .unplug_io_fn   = NULL,
+};
+
+struct address_space migration_space = {
+        .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC),
+        .tree_lock      = RW_LOCK_UNLOCKED,
+        .a_ops          = &migration_aops,
+        .flags          = GFP_HIGHUSER,
+        .i_mmap_nonlinear = LIST_HEAD_INIT(migration_space.i_mmap_nonlinear),
+        .backing_dev_info = &migration_backing_dev_info,
+};
+
+int init_migration_cache(void) 
+{
+	idr_init(&migration_idr);
+
+	return 0;
+}
+
+__initcall(init_migration_cache);
+
+struct page *lookup_migration_cache(int id) 
+{ 
+	return find_get_page(&migration_space, id);
+}
+
+void migration_duplicate(swp_entry_t entry)
+{
+	int offset;
+	struct counter *cnt;
+
+	read_lock_irq(&migration_space.tree_lock);
+
+	cnt = idr_find(&migration_idr, swp_offset(entry));
+	cnt->i = cnt->i + 1;
+
+	read_unlock_irq(&migration_space.tree_lock);
+}
+
+void remove_from_migration_cache(struct page *page, int id)
+{
+	write_lock_irq(&migration_space.tree_lock);
+        idr_remove(&migration_idr, id);
+	radix_tree_delete(&migration_space.page_tree, id);
+	ClearPageSwapCache(page);
+	page->private = NULL;
+	write_unlock_irq(&migration_space.tree_lock);
+}
+
+// FIXME: if the page is locked will it be correctly removed from migr cache?
+// check races
+
+int migration_remove_entry(swp_entry_t entry)
+{
+	struct page *page;
+	
+	page = find_get_page(&migration_space, entry.val);
+
+	if (!page)
+		BUG();
+
+	lock_page(page);	
+
+	migration_remove_reference(page);
+
+	unlock_page(page);
+
+	page_cache_release(page);
+}
+
+int migration_remove_reference(struct page *page)
+{
+	struct counter *c;
+	swp_entry_t entry;
+
+	entry.val = page->private;
+
+	read_lock_irq(&migration_space.tree_lock);
+
+	c = idr_find(&migration_idr, swp_offset(entry));
+
+	read_unlock_irq(&migration_space.tree_lock);
+
+	if (!c->i)
+		BUG();
+
+	c->i--;
+
+	if (!c->i) {
+		remove_from_migration_cache(page, page->private);
+		kfree(c);
+		page_cache_release(page);
+	}
+}
+
+int add_to_migration_cache(struct page *page, int gfp_mask) 
+{
+	int error, offset;
+	struct counter *counter;
+	swp_entry_t entry;
+
+	BUG_ON(PageSwapCache(page));
+
+	BUG_ON(PagePrivate(page));
+
+        if (idr_pre_get(&migration_idr, GFP_ATOMIC) == 0)
+                return -ENOMEM;
+
+	counter = kmalloc(sizeof(struct counter), GFP_KERNEL);
+
+	if (!counter)
+		return -ENOMEM;
+
+	error = radix_tree_preload(gfp_mask);
+
+	counter->i = 0;
+
+	if (!error) {
+		write_lock_irq(&migration_space.tree_lock);
+	        error = idr_get_new_above(&migration_idr, counter, 1, &offset);
+
+		if (error < 0)
+			BUG();
+
+		entry = swp_entry(MIGRATION_TYPE, offset);
+
+		error = radix_tree_insert(&migration_space.page_tree, entry.val,
+							page);
+		if (!error) {
+			page_cache_get(page);
+			SetPageLocked(page);
+			page->private = entry.val;
+			SetPageSwapCache(page);
+		}
+		write_unlock_irq(&migration_space.tree_lock);
+                radix_tree_preload_end();
+
+	}
+
+	return error;
+}
 
 /*
  * Try to writeback a dirty page to free its buffers.
@@ -119,9 +276,11 @@ page_migratable(struct page *page, struc
 	if (PageWriteback(page))
 		return -EAGAIN;
 	/* The page might have been truncated */
-	truncated = !PageSwapCache(newpage) && page_mapping(page) == NULL;
-	if (page_count(page) + truncated <= freeable_page_count)
+	truncated = !PageSwapCache(newpage) &&
+		page_mapping(page) == NULL;
+	if (page_count(page) + truncated <= freeable_page_count) 
 		return truncated ? -ENOENT : 0;
+
 	return -EAGAIN;
 }
 
@@ -144,7 +303,7 @@ migrate_page_common(struct page *page, s
 		case -ENOENT:
 			copy_highpage(newpage, page);
 			return ret;
-		case -EBUSY:
+		case -EBUSY: 
 			return ret;
 		case -EAGAIN:
 			writeback_and_free_buffers(page);
@@ -317,6 +476,7 @@ generic_migrate_page(struct page *page, 
 	switch (ret) {
 	default:
 		/* The page is busy. Try it later. */
+		BUG();
 		goto out_busy;
 	case -ENOENT:
 		/* The file the page belongs to has been truncated. */
@@ -400,10 +560,14 @@ migrate_onepage(struct page *page)
 	 */
 #ifdef CONFIG_SWAP
 	if (PageAnon(page) && !PageSwapCache(page))
-		if (!add_to_swap(page, GFP_KERNEL)) {
+		if (add_to_migration_cache(page, GFP_KERNEL)) {
 			unlock_page(page);
 			return ERR_PTR(-ENOSPC);
 		}
+/*		if (!add_to_swap(page, GFP_KERNEL)) {
+			unlock_page(page);
+			return ERR_PTR(-ENOSPC);
+		} */
 #endif /* CONFIG_SWAP */
 	if ((mapping = page_mapping(page)) == NULL) {
 		/* truncation is in progress */
@@ -420,8 +584,9 @@ migrate_onepage(struct page *page)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (mapping->a_ops->migrate_page)
+	if (mapping->a_ops && mapping->a_ops->migrate_page) {
 		ret = mapping->a_ops->migrate_page(page, newpage);
+	}
 	else
 		ret = generic_migrate_page(page, newpage, migrate_page_common);
 	if (ret) {
@@ -454,6 +619,8 @@ int try_to_migrate_pages(struct list_hea
 		.may_writepage	= 0,
 	};
 
+	printk(KERN_ERR "try to migrate pages!\n");
+
 	current->flags |= PF_KSWAPD;    /*  It's fake */
 	list_for_each_entry_safe(page, page2, page_list, lru) {
 		/*
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/page_io.c linux-2.6.9-rc2-mm4.build/mm/page_io.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/page_io.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/page_io.c	2004-10-24 12:23:55.000000000 -0200
@@ -15,7 +15,6 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/bio.h>
-#include <linux/swapops.h>
 #include <linux/writeback.h>
 #include <asm/pgtable.h>
 
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/rmap.c linux-2.6.9-rc2-mm4.build/mm/rmap.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/rmap.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/rmap.c	2004-10-25 17:31:43.000000000 -0200
@@ -49,7 +49,7 @@
 #include <linux/sched.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+//#include <linux/swapops.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/rmap.h>
@@ -641,22 +646,36 @@ static int try_to_unmap_one(struct page 
 	if (pte_dirty(pteval))
 		set_page_dirty(page);
 
-	if (PageAnon(page)) {
-		swp_entry_t entry = { .val = page->private };
-		/*
-		 * Store the swap location in the pte.
-		 * See handle_pte_fault() ...
-		 */
-		BUG_ON(!PageSwapCache(page));
-		swap_duplicate(entry);
-		if (list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			list_add(&mm->mmlist, &init_mm.mmlist);
-			spin_unlock(&mmlist_lock);
+		if (PageAnon(page)) {
+			swp_entry_t entry = { .val = page->private };
+			/*
+			 * Store the swap location in the pte.
+			 * See handle_pte_fault() ...
+			 */
+	//		BUG_ON(!PageSwapCache(page));
+			if (PageSwapCache(page) && !PageMigration(page)) {
+				swap_duplicate(entry);
+				if (list_empty(&mm->mmlist)) {
+					spin_lock(&mmlist_lock);
+					list_add(&mm->mmlist, &init_mm.mmlist);
+					spin_unlock(&mmlist_lock);
+				}
+				set_pte(pte, swp_entry_to_pte(entry));
+				BUG_ON(pte_file(*pte));
+			} else if (PageMigration(page)) {
+				// page cache get to reference pte,
+				// remove from migration cache
+				// on zero-users at fault path
+				migration_duplicate(entry);
+				if (list_empty(&mm->mmlist)) {
+					spin_lock(&mmlist_lock);
+					list_add(&mm->mmlist, &init_mm.mmlist);
+					spin_unlock(&mmlist_lock);
+				}
+				set_pte(pte, migration_entry_to_pte(entry));
+				BUG_ON(pte_file(*pte));
+			}
 		}
-		set_pte(pte, swp_entry_to_pte(entry));
-		BUG_ON(pte_file(*pte));
-	}
 
 	mm->rss--;
 	page_remove_rmap(page);
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/shmem.c linux-2.6.9-rc2-mm4.build/mm/shmem.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/shmem.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/shmem.c	2004-10-24 12:24:20.000000000 -0200
@@ -42,7 +42,6 @@
 #include <linux/vfs.h>
 #include <linux/blkdev.h>
 #include <linux/security.h>
-#include <linux/swapops.h>
 #include <linux/mempolicy.h>
 #include <linux/namei.h>
 #include <linux/xattr.h>
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/swapfile.c linux-2.6.9-rc2-mm4.build/mm/swapfile.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/swapfile.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/swapfile.c	2004-10-28 15:09:49.000000000 -0200
@@ -29,7 +29,6 @@
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
-#include <linux/swapops.h>
 
 spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
 unsigned int nr_swapfiles;
@@ -230,6 +229,7 @@ bad_device:
 	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
 	goto out;
 bad_nofile:
+	BUG();
 	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
 out:
 	return NULL;
@@ -1369,6 +1370,13 @@ asmlinkage long sys_swapon(const char __
 		swap_list_unlock();
 		goto out;
 	}
+
+	/* MIGRATION_TYPE is reserved for migration pages */
+	if (type >= MIGRATION_TYPE) {
+		swap_list_unlock();
+		goto out;
+	}
+
 	if (type >= nr_swapfiles)
 		nr_swapfiles = type+1;
 	INIT_LIST_HEAD(&p->extent_list);
diff -Nur --show-c-function linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c linux-2.6.9-rc2-mm4.build/mm/vmscan.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/vmscan.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/vmscan.c	2004-10-25 19:15:56.000000000 -0200
@@ -38,8 +38,6 @@
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 
-#include <linux/swapops.h>
-
 /*
  * The list of shrinker callbacks used by to apply pressure to
  * ageable caches.
@@ -459,7 +457,9 @@ int shrink_list(struct list_head *page_l
 		}
 
 #ifdef CONFIG_SWAP
-		if (PageSwapCache(page)) {
+		// FIXME: allow relocation of migrate cache pages 
+		// into real swap pages for swapout.
+		if (PageSwapCache(page) && !PageMigration(page)) {
 			swp_entry_t swap = { .val = page->private };
 			__delete_from_swap_cache(page);
 			write_unlock_irq(&mapping->tree_lock);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-28 18:51               ` Dave Hansen
@ 2004-10-28 16:26                 ` Marcelo Tosatti
  2004-10-28 20:24                   ` Dave Hansen
  2004-11-03 15:21                   ` Marcelo Tosatti
  0 siblings, 2 replies; 48+ messages in thread
From: Marcelo Tosatti @ 2004-10-28 16:26 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Hirokazu Takahashi, linux-mm, iwamoto, hugh

On Thu, Oct 28, 2004 at 11:51:57AM -0700, Dave Hansen wrote:
> Marcelo Tosatti wrote:
> >+static inline int PageMigration(struct page *page)
> >+{
> >+        swp_entry_t entry;
> >+
> >+        if (!PageSwapCache(page))
> >+                return 0;
> >+
> >+        entry.val = page->private;
> >+
> >+        if (swp_type(entry) != MIGRATION_TYPE)
> >+                return 0;
> >+
> >+        return 1;
> >+}
> 
> Don't we usually try to keep the Page*() operations to be strict 
> page->flags checks?  Should this be page_migration() or something 
> similar instead?

Yeah I think page_migration() will be more conformant to the current
macros.

Will do it, and upgrade to the latest -mhp. What is it again? 
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-28 16:05             ` Marcelo Tosatti
@ 2004-10-28 18:51               ` Dave Hansen
  2004-10-28 16:26                 ` Marcelo Tosatti
  2004-11-05 13:49               ` Hirokazu Takahashi
  1 sibling, 1 reply; 48+ messages in thread
From: Dave Hansen @ 2004-10-28 18:51 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Hirokazu Takahashi, linux-mm, iwamoto, hugh

Marcelo Tosatti wrote:
> +static inline int PageMigration(struct page *page)
> +{
> +        swp_entry_t entry;
> +
> +        if (!PageSwapCache(page))
> +                return 0;
> +
> +        entry.val = page->private;
> +
> +        if (swp_type(entry) != MIGRATION_TYPE)
> +                return 0;
> +
> +        return 1;
> +}

Don't we usually try to keep the Page*() operations to be strict 
page->flags checks?  Should this be page_migration() or something 
similar instead?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-28 16:26                 ` Marcelo Tosatti
@ 2004-10-28 20:24                   ` Dave Hansen
  2004-11-03 15:21                   ` Marcelo Tosatti
  1 sibling, 0 replies; 48+ messages in thread
From: Dave Hansen @ 2004-10-28 20:24 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Hirokazu Takahashi, linux-mm, iwamoto, hugh

Marcelo Tosatti wrote:
> Yeah I think page_migration() will be more conformant to the current
> macros.
> 
> Will do it, and upgrade to the latest -mhp. What is it again? 

http://sprucegoose.sr71.net/patches/2.6.9-mm1-mhp1/
http://sprucegoose.sr71.net/patches/patch-2.6.9-mm1-mhp1.gz

I have one against 2.6.10-rc1-mm1, but it's not quite ready yet.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-28 16:26                 ` Marcelo Tosatti
  2004-10-28 20:24                   ` Dave Hansen
@ 2004-11-03 15:21                   ` Marcelo Tosatti
  2004-11-04  8:01                     ` Hirokazu Takahashi
  1 sibling, 1 reply; 48+ messages in thread
From: Marcelo Tosatti @ 2004-11-03 15:21 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Hirokazu Takahashi, linux-mm, iwamoto, hugh

On Thu, Oct 28, 2004 at 02:26:52PM -0200, Marcelo Tosatti wrote:
> On Thu, Oct 28, 2004 at 11:51:57AM -0700, Dave Hansen wrote:
> > Marcelo Tosatti wrote:
> > >+static inline int PageMigration(struct page *page)
> > >+{
> > >+        swp_entry_t entry;
> > >+
> > >+        if (!PageSwapCache(page))
> > >+                return 0;
> > >+
> > >+        entry.val = page->private;
> > >+
> > >+        if (swp_type(entry) != MIGRATION_TYPE)
> > >+                return 0;
> > >+
> > >+        return 1;
> > >+}
> > 
> > Don't we usually try to keep the Page*() operations to be strict 
> > page->flags checks?  Should this be page_migration() or something 
> > similar instead?
> 
> Yeah I think page_migration() will be more conformant to the current
> macros.
> 
> Will do it, and upgrade to the latest -mhp. What is it again? 
 
Can't boot 2.6.9-mm1-mhp on my dual P4 - reverting -mhp 
makes it happy again (with same .config file). Freezes
after "OK, now booting the kernel".

Will stick to -rc4-mm1 for now.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-11-03 15:21                   ` Marcelo Tosatti
@ 2004-11-04  8:01                     ` Hirokazu Takahashi
  0 siblings, 0 replies; 48+ messages in thread
From: Hirokazu Takahashi @ 2004-11-04  8:01 UTC (permalink / raw)
  To: marcelo.tosatti; +Cc: haveblue, linux-mm, iwamoto, hugh, kamezawa.hiroyu

Hi,

> Can't boot 2.6.9-mm1-mhp on my dual P4 - reverting -mhp 
> makes it happy again (with same .config file). Freezes
> after "OK, now booting the kernel".
> 
> Will stick to -rc4-mm1 for now.


I guess the problem may be solved with the attached patch Kame-san made.
 


From: Hiroyuki KAMEZAWA <kamezawa.hiroyu@jp.fujitsu.com>
To: Dave Hansen <haveblue@us.ibm.com>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>,
	lhms-devel@lists.sourceforge.net
Date: Mon, 01 Nov 2004 19:23:15 +0900
User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.3) Gecko/20040910

Dave Hansen wrote:

> Hirokazu Takahashi wrote:
> 
>> BTW, linux-2.6.9-mm1-mhp1 often causes startup freeze on my box.
>> I don't know why.
> 
> 
> It would be very helpful to diagnose it.  Where does the freeze occur? 
> Can you dump the console?  Did you boot with "debug" on the kernel 
> command line?  Have you tried sysrq?
> 
This is a patch for this i386 problem.
alloc_memmap() is called twice and this hits new BUG_ON() :)

Thanks.
Kame <kamezawa.hiroyu@jp.fujitsu.com>

---

  linux-2.6.9-mm1-mhp-kamezawa/arch/i386/kernel/setup.c |   13 -------------
  1 files changed, 13 deletions(-)

diff -puN arch/i386/kernel/setup.c~cleanup arch/i386/kernel/setup.c
--- linux-2.6.9-mm1-mhp/arch/i386/kernel/setup.c~cleanup	2004-11-01 18:33:45.179201168 +0900
+++ linux-2.6.9-mm1-mhp-kamezawa/arch/i386/kernel/setup.c	2004-11-01 18:34:01.648697424 +0900
@@ -1393,19 +1393,6 @@ void __init setup_arch(char **cmdline_p)
  #endif
  	paging_init();

-	/*
-	 * NOTE: at this point the bootmem allocator is fully available.
-	 */
-
-#ifdef CONFIG_NONLINEAR
-	{
-		/* use alloc_node_mem_map() instead - daveh */
-		struct page *lmem_map;
-		lmem_map = alloc_bootmem(max_pfn * sizeof(struct page));
-		alloc_memmap(lmem_map, 0, max_pfn);
-	}
-#endif
-
  	zone_sizes_init();

  #ifdef CONFIG_EARLY_PRINTK

_


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-10-28 16:05             ` Marcelo Tosatti
  2004-10-28 18:51               ` Dave Hansen
@ 2004-11-05 13:49               ` Hirokazu Takahashi
  2004-11-05 15:16                 ` Marcelo Tosatti
  1 sibling, 1 reply; 48+ messages in thread
From: Hirokazu Takahashi @ 2004-11-05 13:49 UTC (permalink / raw)
  To: marcelo.tosatti; +Cc: linux-mm, iwamoto, haveblue, hugh

Hi, Marcelo,

I happened to meet a bug.

> > Yep thats probably what caused your failures.
> > 
> > I'll prepare a new patch.
> 
> Here it is - with the copy_page_range() fix as you pointed out,
> plus sys_swapon() fix as suggested by Hiroyuki.
> 
> I've also added a BUG() in case of swap_free() failure, so we 
> get a backtrace.
> 
> Can you please test this - thanks.

From the attached message, lookup_migration_cache() returned NULL
in do_swap_page(). There might be a race condition related to the
migration cache.


Nov  5 22:18:22 target1 kernel: Unable to handle kernel NULL pointer dereference at virtual address 00000000
Nov  5 22:18:22 target1 kernel:  printing eip:
Nov  5 22:18:22 target1 kernel: c0141364
Nov  5 22:18:22 target1 kernel: *pde = 00000000
Nov  5 22:18:22 target1 kernel: Oops: 0000 [#1]
Nov  5 22:18:22 target1 kernel: SMP 
Nov  5 22:18:22 target1 kernel: Modules linked in:
Nov  5 22:18:22 target1 kernel: CPU:    0
Nov  5 22:18:22 target1 kernel: EIP:    0060:[mark_page_accessed+4/80]    Not tainted VLI
Nov  5 22:18:22 target1 kernel: EIP:    0060:[<c0141364>]    Not tainted VLI
Nov  5 22:18:22 target1 kernel: EFLAGS: 00010246   (2.6.9-rc4-mm1) 
Nov  5 22:18:22 target1 kernel: EIP is at mark_page_accessed+0x4/0x50
Nov  5 22:18:22 target1 kernel: eax: 00000000   ebx: 00000000   ecx: c0304700   edx: f8000005
Nov  5 22:18:22 target1 kernel: esi: 00000000   edi: 0000053e   ebp: b72eafe0   esp: ce12be90
Nov  5 22:18:22 target1 kernel: ds: 007b   es: 007b   ss: 0068
Nov  5 22:18:22 target1 kernel: Process grep (pid: 2441, threadinfo=ce12a000 task=cf66d550)
Nov  5 22:18:22 target1 kernel: Stack: f8000005 00000000 c01474d4 c01065c2 00000001 ef720000 cf2655f4 ccc13b70 
Nov  5 22:18:22 target1 kernel:        b72eafe0 cf76cdc0 c0147cf4 cf76cdc0 cf2655f4 b72eafe0 c84f8ba8 ccc13b70 
Nov  5 22:18:22 target1 kernel:        0000053e 00000000 b72eafe0 cf76cdc0 cf2655f4 cf66d550 c0114835 cf76cdc0 
Nov  5 22:18:22 target1 kernel: Call Trace:
Nov  5 22:18:22 target1 kernel:  [do_swap_page+372/784] do_swap_page+0x174/0x310
Nov  5 22:18:22 target1 kernel:  [<c01474d4>] do_swap_page+0x174/0x310
Nov  5 22:18:22 target1 kernel:  [apic_timer_interrupt+26/32] apic_timer_interrupt+0x1a/0x20
Nov  5 22:18:22 target1 kernel:  [<c01065c2>] apic_timer_interrupt+0x1a/0x20
Nov  5 22:18:22 target1 kernel:  [handle_mm_fault+228/352] handle_mm_fault+0xe4/0x160
Nov  5 22:18:22 target1 kernel:  [<c0147cf4>] handle_mm_fault+0xe4/0x160
Nov  5 22:18:22 target1 kernel:  [do_page_fault+469/1487] do_page_fault+0x1d5/0x5cf
Nov  5 22:18:22 target1 kernel:  [<c0114835>] do_page_fault+0x1d5/0x5cf
Nov  5 22:18:22 target1 kernel:  [run_timer_softirq+481/496] run_timer_softirq+0x1e1/0x1f0
Nov  5 22:18:22 target1 kernel:  [<c0124821>] run_timer_softirq+0x1e1/0x1f0
Nov  5 22:18:22 target1 kernel:  [update_wall_time+21/64] update_wall_time+0x15/0x40
Nov  5 22:18:22 target1 kernel:  [<c01244d5>] update_wall_time+0x15/0x40
Nov  5 22:18:22 target1 kernel:  [do_timer+46/192] do_timer+0x2e/0xc0
Nov  5 22:18:22 target1 kernel:  [<c012486e>] do_timer+0x2e/0xc0
Nov  5 22:18:22 target1 kernel:  [timer_interrupt+72/240] timer_interrupt+0x48/0xf0
Nov  5 22:18:22 target1 kernel:  [<c010b148>] timer_interrupt+0x48/0xf0
Nov  5 22:18:22 target1 kernel:  [timer_interrupt+229/240] timer_interrupt+0xe5/0xf0
Nov  5 22:18:22 target1 kernel:  [<c010b1e5>] timer_interrupt+0xe5/0xf0
Nov  5 22:18:22 target1 kernel:  [handle_IRQ_event+44/96] handle_IRQ_event+0x2c/0x60
Nov  5 22:18:22 target1 kernel:  [<c013530c>] handle_IRQ_event+0x2c/0x60
Nov  5 22:18:22 target1 kernel:  [__do_IRQ+280/336] __do_IRQ+0x118/0x150
Nov  5 22:18:22 target1 kernel:  [<c0135458>] __do_IRQ+0x118/0x150
Nov  5 22:18:22 target1 kernel:  [__do_IRQ+318/336] __do_IRQ+0x13e/0x150
Nov  5 22:18:22 target1 kernel:  [<c013547e>] __do_IRQ+0x13e/0x150
Nov  5 22:18:22 target1 kernel:  [do_page_fault+0/1487] do_page_fault+0x0/0x5cf
Nov  5 22:18:22 target1 kernel:  [<c0114660>] do_page_fault+0x0/0x5cf
Nov  5 22:18:22 target1 kernel:  [error_code+45/56] error_code+0x2d/0x38
Nov  5 22:18:22 target1 kernel:  [<c010663d>] error_code+0x2d/0x38
Nov  5 22:18:22 target1 kernel: Code: 1c 85 20 80 3f c0 01 da ff 42 38 51 9d 8d 86 00 01 00 00 e8 ef e9 14 00 5b 5e 5f c3 8d 74 26 00 8d bc 27 00 00 00 00 56 53 89 c3 <8b> 03 83 e0 40 75 25 8b 03 be 02 00 00 00 83 e0 04 74 19 8b 03 
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-11-05 13:49               ` Hirokazu Takahashi
@ 2004-11-05 15:16                 ` Marcelo Tosatti
  2004-11-16  4:07                   ` Hirokazu Takahashi
  0 siblings, 1 reply; 48+ messages in thread
From: Marcelo Tosatti @ 2004-11-05 15:16 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: linux-mm, iwamoto, haveblue, hugh

[-- Attachment #1: Type: text/plain, Size: 1671 bytes --]

On Fri, Nov 05, 2004 at 10:49:58PM +0900, Hirokazu Takahashi wrote:
> Hi, Marcelo,
> 
> I happened to meet a bug.
> 
> > > Yep thats probably what caused your failures.
> > > 
> > > I'll prepare a new patch.
> > 
> > Here it is - with the copy_page_range() fix as you pointed out,
> > plus sys_swapon() fix as suggested by Hiroyuki.
> > 
> > I've also added a BUG() in case of swap_free() failure, so we 
> > get a backtrace.
> > 
> > Can you please test this - thanks.
> 
> From the attached message, lookup_migration_cache() returned NULL
> in do_swap_page(). There might be a race condition related the
> migration cache.

Hi Hirokazu!

The problem is that another thread can fault in the pte 
(removing the radix tree entry) while the current thread dropped the 
page_table_lock - which explains the NULL lookup_migration_cache. 
The swap code handles this situation, but I've completely missed it. 

Updated patch attached.

Extreme thanks for your testing, it's been crucial! 

We're getting there.

do_swap_page now does:

 again:
+       if (pte_is_migration(orig_pte)) {
+               page = lookup_migration_cache(entry.val);
+               if (!page) {
+                       spin_lock(&mm->page_table_lock);
+                       page_table = pte_offset_map(pmd, address);
+                       if (likely(pte_same(*page_table, orig_pte)))
+                               ret = VM_FAULT_OOM;
+                       else
+                               ret = VM_FAULT_MINOR;
+                       pte_unmap(page_table);
+                       spin_unlock(&mm->page_table_lock);
+                       goto out;
+               }
+       } else {


[-- Attachment #2: migr5 --]
[-- Type: text/plain, Size: 16162 bytes --]

diff -Nur linux-2.6.9-rc2-mm4.mhp.orig/include/linux/mm.h linux-2.6.9-rc2-mm4.build/include/linux/mm.h
--- linux-2.6.9-rc2-mm4.mhp.orig/include/linux/mm.h	2004-10-05 15:09:38.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/include/linux/mm.h	2004-10-25 18:42:37.000000000 -0200
@@ -251,6 +251,24 @@
  * files which need it (119 of them)
  */
 #include <linux/page-flags.h>
+#include <linux/swap.h>
+#include <linux/swapops.h> 
+
+/*
+ * A page belongs to the migration cache when it is a swap-cache page
+ * whose page->private swap entry carries the reserved MIGRATION_TYPE.
+ */
+static inline int PageMigration(struct page *page)
+{
+	swp_entry_t entry;
+
+	if (!PageSwapCache(page))
+		return 0;
+
+	entry.val = page->private;
+	return swp_type(entry) == MIGRATION_TYPE;
+}
+
 
 /*
  * Methods to modify the page usage count.
@@ -458,11 +476,14 @@
 #define PAGE_MAPPING_ANON	1
 
 extern struct address_space swapper_space;
+extern struct address_space migration_space;
 static inline struct address_space *page_mapping(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	if (unlikely(PageSwapCache(page)))
+	if (unlikely(PageMigration(page)))
+		mapping = &migration_space;
+	else if (unlikely(PageSwapCache(page)))
 		mapping = &swapper_space;
 	else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
 		mapping = NULL;
diff -Nur linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swap.h linux-2.6.9-rc2-mm4.build/include/linux/swap.h
--- linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swap.h	2004-10-05 15:09:39.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/include/linux/swap.h	2004-10-25 20:42:27.000000000 -0200
@@ -253,6 +253,7 @@
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+extern int migration_remove_entry(swp_entry_t);
 struct backing_dev_info;
 
 extern struct swap_list_t swap_list;
@@ -321,6 +322,21 @@
 #define grab_swap_token()  do { } while(0)
 #define has_swap_token(x) 0
 
+/*
+ * Test whether @page sits in the migration cache: a swap-cache page
+ * whose page->private entry uses the reserved MIGRATION_TYPE.
+ *
+ * NOTE(review): an identical static inline is also added to
+ * linux/mm.h, which now includes this header -- confirm only one
+ * definition is visible per configuration, otherwise the compiler
+ * will report a redefinition.
+ */
+static inline int PageMigration(struct page *page)
+{
+        swp_entry_t entry;
+
+        if (!PageSwapCache(page))
+                return 0;
+
+        entry.val = page->private;
+
+        if (swp_type(entry) != MIGRATION_TYPE)
+                return 0;
+
+        return 1;
+}
+
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
diff -Nur linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swapops.h linux-2.6.9-rc2-mm4.build/include/linux/swapops.h
--- linux-2.6.9-rc2-mm4.mhp.orig/include/linux/swapops.h	2004-10-05 15:09:35.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/include/linux/swapops.h	2004-10-24 12:15:07.000000000 -0200
@@ -10,7 +10,9 @@
  * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
  */
 #define SWP_TYPE_SHIFT(e)	(sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
-#define SWP_OFFSET_MASK(e)	((1UL << SWP_TYPE_SHIFT(e)) - 1)
+#define SWP_OFFSET_MASK(e)	((1UL << (SWP_TYPE_SHIFT(e))) - 1)
+
+#define MIGRATION_TYPE  (MAX_SWAPFILES - 1)
 
 /*
  * Store a type+offset into a swp_entry_t in an arch-independent format
@@ -68,3 +69,24 @@
 	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
+
+/*
+ * Does this (non-present) pte encode a migration-cache entry,
+ * i.e. a swap pte whose type is the reserved MIGRATION_TYPE?
+ */
+static inline int pte_is_migration(pte_t pte)
+{
+	swp_entry_t arch_entry = __pte_to_swp_entry(pte);
+
+	return __swp_type(arch_entry) == MIGRATION_TYPE;
+}
+
+/*
+ * Build a swap pte encoding a migration-cache entry: same offset as
+ * @entry, with the reserved MIGRATION_TYPE as the swap type.  The
+ * arch-dependent __swp_entry()/__swp_entry_to_pte() pair performs the
+ * final pte encoding.
+ */
+static inline pte_t migration_entry_to_pte(swp_entry_t entry)
+{
+	swp_entry_t arch_entry;
+	
+	arch_entry = __swp_entry(MIGRATION_TYPE, swp_offset(entry));
+	return __swp_entry_to_pte(arch_entry);
+}
+
diff -Nur linux-2.6.9-rc2-mm4.mhp.orig/mm/fremap.c linux-2.6.9-rc2-mm4.build/mm/fremap.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/fremap.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/fremap.c	2004-10-25 20:44:05.000000000 -0200
@@ -11,7 +11,6 @@
 #include <linux/file.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
-#include <linux/swapops.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
@@ -43,8 +42,14 @@
 			}
 		}
 	} else {
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte)) {
+			swp_entry_t swp_entry = pte_to_swp_entry(pte);
+			if (pte_is_migration(pte)) { 
+				migration_remove_entry(swp_entry);
+			} else {
+				free_swap_and_cache(swp_entry);
+			}
+		}
 		pte_clear(ptep);
 	}
 }
diff -Nur linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c linux-2.6.9-rc2-mm4.build/mm/memory.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/memory.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/memory.c	2004-11-05 14:42:15.000000000 -0200
@@ -53,7 +53,6 @@
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-#include <linux/swapops.h>
 #include <linux/elf.h>
 
 #ifndef CONFIG_DISCONTIGMEM
@@ -290,7 +289,13 @@
 				/* pte contains position in swap, so copy. */
 				if (!pte_present(pte)) {
 					if (!pte_file(pte)) {
-						swap_duplicate(pte_to_swp_entry(pte));
+						swp_entry_t entry;
+						entry = pte_to_swp_entry(pte);
+						if (pte_is_migration(pte)) 
+							migration_duplicate(entry);
+						else
+							swap_duplicate(entry);
+						
 						if (list_empty(&dst->mmlist)) {
 							spin_lock(&mmlist_lock);
 							list_add(&dst->mmlist,
@@ -456,8 +461,13 @@
 		 */
 		if (unlikely(details))
 			continue;
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte)) {
+			swp_entry_t swp_entry = pte_to_swp_entry(pte);
+			if (pte_is_migration(pte)) {
+				migration_remove_entry(swp_entry);
+			} else
+				free_swap_and_cache(swp_entry);
+		}
 		pte_clear(ptep);
 	}
 	pte_unmap(ptep-1);
@@ -1408,6 +1418,20 @@
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 again:
+	if (pte_is_migration(orig_pte)) {
+		page = lookup_migration_cache(entry.val);
+		if (!page) { 
+			spin_lock(&mm->page_table_lock);
+			page_table = pte_offset_map(pmd, address);
+			if (likely(pte_same(*page_table, orig_pte)))
+				ret = VM_FAULT_OOM;
+			else
+				ret = VM_FAULT_MINOR;
+			pte_unmap(page_table);
+			spin_unlock(&mm->page_table_lock);
+			goto out;
+		}
+	} else {
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
@@ -1433,15 +1457,22 @@
 		inc_page_state(pgmajfault);
 		grab_swap_token();
 	}
-
 	mark_page_accessed(page);
 	lock_page(page);
 	if (!PageSwapCache(page)) {
+		/* hiro: add !PageMigration(page) here */
 		/* page-migration has occured */
 		unlock_page(page);
 		page_cache_release(page);
 		goto again;
 	}
+	}
+
+
+	if (pte_is_migration(orig_pte)) {
+		mark_page_accessed(page);
+		lock_page(page);
+	}
 
 	/*
 	 * Back out if somebody else faulted in this pte while we
@@ -1459,10 +1490,14 @@
 	}
 
 	/* The page isn't present yet, go ahead with the fault. */
-		
-	swap_free(entry);
-	if (vm_swap_full())
-		remove_exclusive_swap_page(page);
+
+	if (!pte_is_migration(orig_pte)) {
+		swap_free(entry);
+		if (vm_swap_full())
+			remove_exclusive_swap_page(page);
+	} else {
+		migration_remove_reference(page);
+	}
 
 	mm->rss++;
 	pte = mk_pte(page, vma->vm_page_prot);
diff -Nur linux-2.6.9-rc2-mm4.mhp.orig/mm/mmigrate.c linux-2.6.9-rc2-mm4.build/mm/mmigrate.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/mmigrate.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/mmigrate.c	2004-10-28 15:03:44.000000000 -0200
@@ -1,4 +1,4 @@
-/*
+ /*
  *  linux/mm/mmigrate.c
  *
  *  Support of memory hotplug
@@ -21,6 +21,8 @@
 #include <linux/rmap.h>
 #include <linux/mmigrate.h>
 #include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/page-flags.h>
 
 /*
  * The concept of memory migration is to replace a target page with
@@ -35,6 +37,161 @@
  * hugetlbpages can be handled in the same way.
  */
 
+/* Per-entry reference count: one reference per pte mapping the page. */
+struct counter {
+	int i;
+};
+
+/* Allocates small integer ids (the "offset" of a migration entry). */
+struct idr migration_idr;
+
+/* Migration-cache pages are never written back; dirtying is in-memory only. */
+static struct address_space_operations migration_aops = {
+        .writepage      = NULL,
+        .sync_page      = NULL,
+        .set_page_dirty = __set_page_dirty_nobuffers,
+};
+
+static struct backing_dev_info migration_backing_dev_info = {
+        .memory_backed  = 1,    /* Does not contribute to dirty memory */
+        .unplug_io_fn   = NULL,
+};
+
+/*
+ * The migration cache proper: a radix tree keyed by the full swap
+ * entry value (MIGRATION_TYPE + idr offset), analogous to
+ * swapper_space for ordinary swap-cache pages.
+ */
+struct address_space migration_space = {
+        .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC),
+        .tree_lock      = RW_LOCK_UNLOCKED,
+        .a_ops          = &migration_aops,
+        .flags          = GFP_HIGHUSER,
+        .i_mmap_nonlinear = LIST_HEAD_INIT(migration_space.i_mmap_nonlinear),
+        .backing_dev_info = &migration_backing_dev_info,
+};
+
+/* Boot-time setup: initialize the migration-entry id allocator. */
+int init_migration_cache(void) 
+{
+	idr_init(&migration_idr);
+
+	return 0;
+}
+
+__initcall(init_migration_cache);
+
+/*
+ * Find the migration-cache page for entry value @id, taking a page
+ * reference.  Returns NULL if the entry is gone (e.g. another thread
+ * faulted it in first).
+ */
+struct page *lookup_migration_cache(int id) 
+{ 
+	return find_get_page(&migration_space, id);
+}
+
+/*
+ * Take an extra reference on the migration-cache entry @entry --
+ * one per pte that maps it (fork/copy_page_range, try_to_unmap).
+ * Counterpart of migration_remove_reference().
+ */
+void migration_duplicate(swp_entry_t entry)
+{
+	struct counter *cnt;
+
+	/*
+	 * The counter is modified here, so the tree lock must be taken
+	 * exclusively: two CPUs incrementing under the shared read lock
+	 * could lose an update.
+	 */
+	write_lock_irq(&migration_space.tree_lock);
+
+	cnt = idr_find(&migration_idr, swp_offset(entry));
+	BUG_ON(!cnt);	/* a migration pte implies a live entry */
+	cnt->i++;
+
+	write_unlock_irq(&migration_space.tree_lock);
+}
+
+/*
+ * Tear the migration-cache entry down: drop the idr slot, delete the
+ * radix tree slot and clear the page's swap-cache state.  @id is the
+ * full swap entry value (page->private).
+ */
+void remove_from_migration_cache(struct page *page, int id)
+{
+	swp_entry_t entry = { .val = id };
+
+	write_lock_irq(&migration_space.tree_lock);
+	/*
+	 * The idr is keyed by the swap *offset* (see
+	 * add_to_migration_cache), while the radix tree is keyed by the
+	 * full entry value -- removing by entry.val left the idr slot
+	 * behind.
+	 */
+	idr_remove(&migration_idr, swp_offset(entry));
+	radix_tree_delete(&migration_space.page_tree, id);
+	ClearPageSwapCache(page);
+	page->private = 0;	/* unsigned long, not a pointer: use 0 */
+	write_unlock_irq(&migration_space.tree_lock);
+}
+
+// FIXME: if the page is locked will it be correctly removed from migr cache?
+// check races
+
+/*
+ * Drop the pte reference on a migration-cache entry, looked up by
+ * entry value.  Called when a migration pte is torn down
+ * (zap_pte_range, fremap).  Returns 0 on success.
+ */
+int migration_remove_entry(swp_entry_t entry)
+{
+	struct page *page;
+	
+	page = find_get_page(&migration_space, entry.val);
+
+	/* A migration pte must always have a cache page behind it. */
+	if (!page)
+		BUG();
+
+	lock_page(page);	
+
+	migration_remove_reference(page);
+
+	unlock_page(page);
+
+	page_cache_release(page);
+
+	/* was missing: the function is declared int but returned nothing */
+	return 0;
+}
+
+/*
+ * Drop one reference from the migration-cache counter of @page and
+ * tear the entry down when it reaches zero.  Returns 0.
+ *
+ * NOTE(review): the decrement runs after tree_lock is dropped, so two
+ * concurrent callers could race on c->i.  Taking the lock exclusively
+ * here would deadlock with remove_from_migration_cache(), which takes
+ * it again -- TODO confirm callers serialize via the page lock.
+ */
+int migration_remove_reference(struct page *page)
+{
+	struct counter *c;
+	swp_entry_t entry;
+
+	entry.val = page->private;
+
+	read_lock_irq(&migration_space.tree_lock);
+
+	c = idr_find(&migration_idr, swp_offset(entry));
+
+	read_unlock_irq(&migration_space.tree_lock);
+
+	BUG_ON(!c);	/* entry must still exist while the pte references it */
+
+	if (!c->i)
+		BUG();
+
+	c->i--;
+
+	if (!c->i) {
+		remove_from_migration_cache(page, page->private);
+		kfree(c);
+		page_cache_release(page);
+	}
+
+	/* was missing: declared int but returned nothing */
+	return 0;
+}
+
+/*
+ * Enter @page into the migration cache: allocate an idr offset,
+ * build a MIGRATION_TYPE swap entry from it, and insert the page
+ * into the radix tree keyed by the full entry value.  On success the
+ * page is referenced, locked and marked PageSwapCache.  Returns 0 or
+ * a negative errno.
+ */
+int add_to_migration_cache(struct page *page, int gfp_mask) 
+{
+	int error, offset;
+	struct counter *counter;
+	swp_entry_t entry;
+
+	BUG_ON(PageSwapCache(page));
+
+	BUG_ON(PagePrivate(page));
+
+        if (idr_pre_get(&migration_idr, GFP_ATOMIC) == 0)
+                return -ENOMEM;
+
+	counter = kmalloc(sizeof(struct counter), GFP_KERNEL);
+
+	if (!counter)
+		return -ENOMEM;
+
+	/* reference count starts at zero; each mapped pte adds one */
+	counter->i = 0;
+
+	error = radix_tree_preload(gfp_mask);
+	if (error) {
+		kfree(counter);		/* was leaked on this path */
+		return error;
+	}
+
+	write_lock_irq(&migration_space.tree_lock);
+	error = idr_get_new_above(&migration_idr, counter, 1, &offset);
+
+	if (error < 0)
+		BUG();
+
+	entry = swp_entry(MIGRATION_TYPE, offset);
+
+	error = radix_tree_insert(&migration_space.page_tree, entry.val,
+						page);
+	if (!error) {
+		page_cache_get(page);
+		SetPageLocked(page);
+		page->private = entry.val;
+		SetPageSwapCache(page);
+	} else {
+		/* were leaked on this path: back out the idr slot + counter */
+		idr_remove(&migration_idr, offset);
+		kfree(counter);
+	}
+	write_unlock_irq(&migration_space.tree_lock);
+	radix_tree_preload_end();
+
+	return error;
+}
 
 /*
  * Try to writeback a dirty page to free its buffers.
@@ -119,9 +276,11 @@
 	if (PageWriteback(page))
 		return -EAGAIN;
 	/* The page might have been truncated */
-	truncated = !PageSwapCache(newpage) && page_mapping(page) == NULL;
-	if (page_count(page) + truncated <= freeable_page_count)
+	truncated = !PageSwapCache(newpage) &&
+		page_mapping(page) == NULL;
+	if (page_count(page) + truncated <= freeable_page_count) 
 		return truncated ? -ENOENT : 0;
+
 	return -EAGAIN;
 }
 
@@ -400,10 +560,14 @@
 	 */
 #ifdef CONFIG_SWAP
 	if (PageAnon(page) && !PageSwapCache(page))
-		if (!add_to_swap(page, GFP_KERNEL)) {
+		if (add_to_migration_cache(page, GFP_KERNEL)) {
 			unlock_page(page);
 			return ERR_PTR(-ENOSPC);
 		}
+/*		if (!add_to_swap(page, GFP_KERNEL)) {
+			unlock_page(page);
+			return ERR_PTR(-ENOSPC);
+		} */
 #endif /* CONFIG_SWAP */
 	if ((mapping = page_mapping(page)) == NULL) {
 		/* truncation is in progress */
@@ -420,8 +584,9 @@
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (mapping->a_ops->migrate_page)
+	if (mapping->a_ops && mapping->a_ops->migrate_page) {
 		ret = mapping->a_ops->migrate_page(page, newpage);
+	}
 	else
 		ret = generic_migrate_page(page, newpage, migrate_page_common);
 	if (ret) {
 	return 0;
diff -Nur linux-2.6.9-rc2-mm4.mhp.orig/mm/page_io.c linux-2.6.9-rc2-mm4.build/mm/page_io.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/page_io.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/page_io.c	2004-10-24 12:23:55.000000000 -0200
@@ -15,7 +15,6 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/bio.h>
-#include <linux/swapops.h>
 #include <linux/writeback.h>
 #include <asm/pgtable.h>
 
diff -Nur linux-2.6.9-rc2-mm4.mhp.orig/mm/rmap.c linux-2.6.9-rc2-mm4.build/mm/rmap.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/rmap.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/rmap.c	2004-10-25 17:31:43.000000000 -0200
@@ -49,7 +49,7 @@
 #include <linux/sched.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+//#include <linux/swapops.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/rmap.h>
@@ -641,22 +646,36 @@
 	if (pte_dirty(pteval))
 		set_page_dirty(page);
 
-	if (PageAnon(page)) {
-		swp_entry_t entry = { .val = page->private };
-		/*
-		 * Store the swap location in the pte.
-		 * See handle_pte_fault() ...
-		 */
-		BUG_ON(!PageSwapCache(page));
-		swap_duplicate(entry);
-		if (list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			list_add(&mm->mmlist, &init_mm.mmlist);
-			spin_unlock(&mmlist_lock);
+		if (PageAnon(page)) {
+			swp_entry_t entry = { .val = page->private };
+			/*
+			 * Store the swap location in the pte.
+			 * See handle_pte_fault() ...
+			 */
+	//		BUG_ON(!PageSwapCache(page));
+			if (PageSwapCache(page) && !PageMigration(page)) {
+				swap_duplicate(entry);
+				if (list_empty(&mm->mmlist)) {
+					spin_lock(&mmlist_lock);
+					list_add(&mm->mmlist, &init_mm.mmlist);
+					spin_unlock(&mmlist_lock);
+				}
+				set_pte(pte, swp_entry_to_pte(entry));
+				BUG_ON(pte_file(*pte));
+			} else if (PageMigration(page)) {
+				// page cache get to reference pte,
+				// remove from migration cache
+				// on zero-users at fault path
+				migration_duplicate(entry);
+				if (list_empty(&mm->mmlist)) {
+					spin_lock(&mmlist_lock);
+					list_add(&mm->mmlist, &init_mm.mmlist);
+					spin_unlock(&mmlist_lock);
+				}
+				set_pte(pte, migration_entry_to_pte(entry));
+				BUG_ON(pte_file(*pte));
+			}
 		}
-		set_pte(pte, swp_entry_to_pte(entry));
-		BUG_ON(pte_file(*pte));
-	}
 
 	mm->rss--;
 	page_remove_rmap(page);
diff -Nur linux-2.6.9-rc2-mm4.mhp.orig/mm/shmem.c linux-2.6.9-rc2-mm4.build/mm/shmem.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/shmem.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/shmem.c	2004-10-24 12:24:20.000000000 -0200
@@ -42,7 +42,6 @@
 #include <linux/vfs.h>
 #include <linux/blkdev.h>
 #include <linux/security.h>
-#include <linux/swapops.h>
 #include <linux/mempolicy.h>
 #include <linux/namei.h>
 #include <linux/xattr.h>
diff -Nur linux-2.6.9-rc2-mm4.mhp.orig/mm/swapfile.c linux-2.6.9-rc2-mm4.build/mm/swapfile.c
--- linux-2.6.9-rc2-mm4.mhp.orig/mm/swapfile.c	2004-10-05 15:08:23.000000000 -0300
+++ linux-2.6.9-rc2-mm4.build/mm/swapfile.c	2004-10-28 15:09:49.000000000 -0200
@@ -29,7 +29,6 @@
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
-#include <linux/swapops.h>
 
 spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
 unsigned int nr_swapfiles;
@@ -230,6 +229,7 @@
 	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
 	goto out;
 bad_nofile:
+	BUG();
 	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
 out:
 	return NULL;
@@ -1369,6 +1370,13 @@
 		swap_list_unlock();
 		goto out;
 	}
+
+	/* MIGRATION_TYPE is reserved for migration pages */
+	if (type >= MIGRATION_TYPE) {
+		swap_list_unlock();
+		goto out;
+	}
+
 	if (type >= nr_swapfiles)
 		nr_swapfiles = type+1;
 	INIT_LIST_HEAD(&p->extent_list);

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-11-05 15:16                 ` Marcelo Tosatti
@ 2004-11-16  4:07                   ` Hirokazu Takahashi
  2004-11-23 12:14                     ` Marcelo Tosatti
  0 siblings, 1 reply; 48+ messages in thread
From: Hirokazu Takahashi @ 2004-11-16  4:07 UTC (permalink / raw)
  To: marcelo.tosatti; +Cc: linux-mm, iwamoto, haveblue, hugh

Hi Marcelo,

I've been testing the memory migration code with your patch.
I found problems and I think the attached patch would
fix some of them.

One of the problems is a race condition between add_to_migration_cache()
and try_to_unmap(). Some pages in the migration cache cannot
be removed with the current implementation. Please suppose
a process space might be removed between them. In this case
no one can remove pages the process had from the migration cache,
because they can be removed only when the pagetables pointed
the pages.

Therefore, I made pages removed from the migration cache
at the end of generic_migrate_page() if they remain in the cache.

Another problem is fork()-related. If fork() has occurred
during page migration, the previous work may not go well.
pages may not be removed from the migration cache.

So I made the swapcode ignore pages in the migration cache.
However, as you know this is just a workaround and not a correct
way to fix it.

> Hi Hirokazu!
> 
> The problem is that another thread can fault in the pte 
> (removing the radix tree entry) while the current thread dropped the 
> page_table_lock - which explains the NULL lookup_migration_cache. 
> The swap code handles this situation, but I've completly missed it. 
> 
> Updated patch attached.
> 
> Extreme thanks for your testing, its being crucial! 
> 
> We're getting there.
> 
> do_swap_page now does:
> 
>  again:
> +       if (pte_is_migration(orig_pte)) {
> +               page = lookup_migration_cache(entry.val);
> +               if (!page) {
> +                       spin_lock(&mm->page_table_lock);
> +                       page_table = pte_offset_map(pmd, address);
> +                       if (likely(pte_same(*page_table, orig_pte)))
> +                               ret = VM_FAULT_OOM;
> +                       else
> +                               ret = VM_FAULT_MINOR;
> +                       pte_unmap(page_table);
> +                       spin_unlock(&mm->page_table_lock);
> +                       goto out;
> +               }
> +       } else {
> 



Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>
---

 linux-2.6.9-rc4-taka/mm/memory.c   |    2 +-
 linux-2.6.9-rc4-taka/mm/mmigrate.c |   28 +++++++++++++++++++++-------
 linux-2.6.9-rc4-taka/mm/vmscan.c   |    4 ++++
 3 files changed, 26 insertions, 8 deletions

diff -puN mm/mmigrate.c~marcelo-FIX1 mm/mmigrate.c
--- linux-2.6.9-rc4/mm/mmigrate.c~marcelo-FIX1	Tue Nov 16 10:43:56 2004
+++ linux-2.6.9-rc4-taka/mm/mmigrate.c	Tue Nov 16 11:07:10 2004
@@ -114,14 +114,14 @@ int migration_remove_entry(swp_entry_t e
 
 	lock_page(page);	
 
-	migration_remove_reference(page);
+	migration_remove_reference(page, 1);
 
 	unlock_page(page);
 
 	page_cache_release(page);
 }
 
-int migration_remove_reference(struct page *page)
+int migration_remove_reference(struct page *page, int dec)
 {
 	struct counter *c;
 	swp_entry_t entry;
@@ -134,10 +134,9 @@ int migration_remove_reference(struct pa
 
 	read_unlock_irq(&migration_space.tree_lock);
 
-	if (!c->i)
-		BUG();
+	BUG_ON(c->i < dec);
 
-	c->i--;
+	c->i -= dec;
 
 	if (!c->i) {
 		remove_from_migration_cache(page, page->private);
@@ -146,6 +145,15 @@ int migration_remove_reference(struct pa
 	}
 }
 
+/*
+ * Tear a migration-cache entry down if its pte reference count has
+ * already dropped to zero (dec == 0 only re-checks and cleans up).
+ * Called at the end of generic_migrate_page() so entries orphaned by
+ * exit/munmap during migration do not linger in the cache.
+ */
+int detach_from_migration_cache(struct page *page)
+{
+	lock_page(page);	
+	migration_remove_reference(page, 0);
+	unlock_page(page);
+
+	return 0;
+}
+
 int add_to_migration_cache(struct page *page, int gfp_mask) 
 {
 	int error, offset;
@@ -522,7 +530,9 @@ generic_migrate_page(struct page *page, 
 
 	/* map the newpage where the old page have been mapped. */
 	touch_unmapped_address(&vlist);
-	if (PageSwapCache(newpage)) {
+	if (PageMigration(newpage))
+		detach_from_migration_cache(newpage);
+	else if (PageSwapCache(newpage)) {
 		lock_page(newpage);
 		__remove_exclusive_swap_page(newpage, 1);
 		unlock_page(newpage);
@@ -538,7 +548,9 @@ out_busy:
 	/* Roll back all operations. */
 	unwind_page(page, newpage);
 	touch_unmapped_address(&vlist);
-	if (PageSwapCache(page)) {
+	if (PageMigration(page))
+		detach_from_migration_cache(page);
+	else if (PageSwapCache(page)) {
 		lock_page(page);
 		__remove_exclusive_swap_page(page, 1);
 		unlock_page(page);
@@ -550,6 +562,8 @@ out_removing:
 		BUG();
 	unlock_page(page);
 	unlock_page(newpage);
+	if (PageMigration(page))
+		detach_from_migration_cache(page);
 	return ret;
 }
 
diff -puN mm/vmscan.c~marcelo-FIX1 mm/vmscan.c
--- linux-2.6.9-rc4/mm/vmscan.c~marcelo-FIX1	Mon Nov 15 12:20:35 2004
+++ linux-2.6.9-rc4-taka/mm/vmscan.c	Tue Nov 16 11:06:06 2004
@@ -459,6 +459,10 @@ int shrink_list(struct list_head *page_l
 			goto keep_locked;
 		}
 
+		if (PageMigration(page)) {
+			write_unlock_irq(&mapping->tree_lock);
+			goto keep_locked;
+		}
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
 			swp_entry_t swap = { .val = page->private };
diff -puN mm/memory.c~marcelo-FIX1 mm/memory.c
--- linux-2.6.9-rc4/mm/memory.c~marcelo-FIX1	Tue Nov 16 11:06:31 2004
+++ linux-2.6.9-rc4-taka/mm/memory.c	Tue Nov 16 11:06:57 2004
@@ -1621,7 +1621,7 @@ again:
 		if (vm_swap_full())
 			remove_exclusive_swap_page(page);
 	} else {
-		migration_remove_reference(page);
+		migration_remove_reference(page, 1);
 	}
 
 	mm->rss++;
_
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-11-16  4:07                   ` Hirokazu Takahashi
@ 2004-11-23 12:14                     ` Marcelo Tosatti
  2004-11-24 10:21                       ` Hirokazu Takahashi
  0 siblings, 1 reply; 48+ messages in thread
From: Marcelo Tosatti @ 2004-11-23 12:14 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: linux-mm, iwamoto, haveblue, hugh

On Tue, Nov 16, 2004 at 01:07:18PM +0900, Hirokazu Takahashi wrote:
> Hi Marcelo,
> 
> I've been testing the memory migration code with your patch.
> I found problems and I think the attached patch would
> fix some of them.
> 
> One of the problems is a race condition between add_to_migration_cache()
> and try_to_unmap(). Some pages in the migration cache cannot
> be removed with the current implementation. Please suppose
> a process space might be removed between them. In this case
> no one can remove pages the process had from the migration cache,
> because they can be removed only when the pagetables pointed
> the pages.

I guess I dont fully understand you Hirokazu.

unmap_vmas function (called by exit_mmap) calls zap_pte_range, 
and that does:

                        if (pte_is_migration(pte)) {
                                migration_remove_entry(swp_entry);
                        } else
                                free_swap_and_cache(swp_entry);

migration_remove_entry should decrease the IDR counter, and 
remove the migration cache page on zero reference.

Am I missing something?

I assume you are seeing this problems in practice?

Sorry for the delay, been busy with other things.

> Therefore, I made pages removed from the migration cache
> at the end of generic_migrate_page() if they remain in the cache.
> 
> The another is a fork() related problem. If fork() has occurred
> during page migration, the previous work may not go well.
> pages may not be removed from the migration cache.
> 
> So I made the swapcode ignore pages in the migration cache.
> However, as you know this is just a workaround and not a correct
> way to fix it.
> 
> > Hi Hirokazu!
> > 
> > The problem is that another thread can fault in the pte 
> > (removing the radix tree entry) while the current thread dropped the 
> > page_table_lock - which explains the NULL lookup_migration_cache. 
> > The swap code handles this situation, but I've completly missed it. 
> > 
> > Updated patch attached.
> > 
> > Extreme thanks for your testing, its being crucial! 
> > 
> > We're getting there.
> > 
> > do_swap_page now does:
> > 
> >  again:
> > +       if (pte_is_migration(orig_pte)) {
> > +               page = lookup_migration_cache(entry.val);
> > +               if (!page) {
> > +                       spin_lock(&mm->page_table_lock);
> > +                       page_table = pte_offset_map(pmd, address);
> > +                       if (likely(pte_same(*page_table, orig_pte)))
> > +                               ret = VM_FAULT_OOM;
> > +                       else
> > +                               ret = VM_FAULT_MINOR;
> > +                       pte_unmap(page_table);
> > +                       spin_unlock(&mm->page_table_lock);
> > +                       goto out;
> > +               }
> > +       } else {
> > 
> 
> 
> 
> Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>
> ---
> 
>  linux-2.6.9-rc4-taka/mm/memory.c   |    2 +-
>  linux-2.6.9-rc4-taka/mm/mmigrate.c |   28 +++++++++++++++++++++-------
>  linux-2.6.9-rc4-taka/mm/vmscan.c   |    4 ++++
>  3 files changed, 26 insertions, 8 deletions
> 
> diff -puN mm/mmigrate.c~marcelo-FIX1 mm/mmigrate.c
> --- linux-2.6.9-rc4/mm/mmigrate.c~marcelo-FIX1	Tue Nov 16 10:43:56 2004
> +++ linux-2.6.9-rc4-taka/mm/mmigrate.c	Tue Nov 16 11:07:10 2004
> @@ -114,14 +114,14 @@ int migration_remove_entry(swp_entry_t e
>  
>  	lock_page(page);	
>  
> -	migration_remove_reference(page);
> +	migration_remove_reference(page, 1);
>  
>  	unlock_page(page);
>  
>  	page_cache_release(page);
>  }
>  
> -int migration_remove_reference(struct page *page)
> +int migration_remove_reference(struct page *page, int dec)
>  {
>  	struct counter *c;
>  	swp_entry_t entry;
> @@ -134,10 +134,9 @@ int migration_remove_reference(struct pa
>  
>  	read_unlock_irq(&migration_space.tree_lock);
>  
> -	if (!c->i)
> -		BUG();
> +	BUG_ON(c->i < dec);
>  
> -	c->i--;
> +	c->i -= dec;
>  
>  	if (!c->i) {
>  		remove_from_migration_cache(page, page->private);
> @@ -146,6 +145,15 @@ int migration_remove_reference(struct pa
>  	}
>  }
>  
> +int detach_from_migration_cache(struct page *page)
> +{
> +	lock_page(page);	
> +	migration_remove_reference(page, 0);
> +	unlock_page(page);
> +
> +	return 0;
> +}
> +
>  int add_to_migration_cache(struct page *page, int gfp_mask) 
>  {
>  	int error, offset;
> @@ -522,7 +530,9 @@ generic_migrate_page(struct page *page, 
>  
>  	/* map the newpage where the old page have been mapped. */
>  	touch_unmapped_address(&vlist);
> -	if (PageSwapCache(newpage)) {
> +	if (PageMigration(newpage))
> +		detach_from_migration_cache(newpage);
> +	else if (PageSwapCache(newpage)) {
>  		lock_page(newpage);
>  		__remove_exclusive_swap_page(newpage, 1);
>  		unlock_page(newpage);

I dont see this code on 2.6.9-rc2-mm4-mhp, I should upgrade.

> @@ -538,7 +548,9 @@ out_busy:
>  	/* Roll back all operations. */
>  	unwind_page(page, newpage);
>  	touch_unmapped_address(&vlist);
> -	if (PageSwapCache(page)) {
> +	if (PageMigration(page))
> +		detach_from_migration_cache(page);
> +	else if (PageSwapCache(page)) {
>  		lock_page(page);
>  		__remove_exclusive_swap_page(page, 1);
>  		unlock_page(page);
> @@ -550,6 +562,8 @@ out_removing:
>  		BUG();
>  	unlock_page(page);
>  	unlock_page(newpage);
> +	if (PageMigration(page))
> +		detach_from_migration_cache(page);
>  	return ret;
>  }
>  
> diff -puN mm/vmscan.c~marcelo-FIX1 mm/vmscan.c
> --- linux-2.6.9-rc4/mm/vmscan.c~marcelo-FIX1	Mon Nov 15 12:20:35 2004
> +++ linux-2.6.9-rc4-taka/mm/vmscan.c	Tue Nov 16 11:06:06 2004
> @@ -459,6 +459,10 @@ int shrink_list(struct list_head *page_l
>  			goto keep_locked;
>  		}
>  
> +		if (PageMigration(page)) {
> +			write_unlock_irq(&mapping->tree_lock);
> +			goto keep_locked;
> +		}
>  #ifdef CONFIG_SWAP
>  		if (PageSwapCache(page)) {
>  			swp_entry_t swap = { .val = page->private };
> diff -puN mm/memory.c~marcelo-FIX1 mm/memory.c
> --- linux-2.6.9-rc4/mm/memory.c~marcelo-FIX1	Tue Nov 16 11:06:31 2004
> +++ linux-2.6.9-rc4-taka/mm/memory.c	Tue Nov 16 11:06:57 2004
> @@ -1621,7 +1621,7 @@ again:
>  		if (vm_swap_full())
>  			remove_exclusive_swap_page(page);
>  	} else {
> -		migration_remove_reference(page);
> +		migration_remove_reference(page, 1);
>  	}
>  
>  	mm->rss++;
> _
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-11-23 12:14                     ` Marcelo Tosatti
@ 2004-11-24 10:21                       ` Hirokazu Takahashi
  2004-12-01 20:21                         ` Marcelo Tosatti
  0 siblings, 1 reply; 48+ messages in thread
From: Hirokazu Takahashi @ 2004-11-24 10:21 UTC (permalink / raw)
  To: marcelo.tosatti; +Cc: linux-mm, iwamoto, haveblue, hugh

Hi Marcelo,

> > Hi Marcelo,
> > 
> > I've been testing the memory migration code with your patch.
> > I found problems and I think the attached patch would
> > fix some of them.
> > 
> > One of the problems is a race condition between add_to_migration_cache()
> > and try_to_unmap(). Some pages in the migration cache cannot
> > be removed with the current implementation. Please suppose
> > a process space might be removed between them. In this case
> > no one can remove pages the process had from the migration cache,
> > because they can be removed only when the pagetables pointed
> > the pages.
> 
> I guess I dont fully understand you Hirokazu.
> 
> unmap_vmas function (called by exit_mmap) calls zap_pte_range, 
> and that does:
> 
>                         if (pte_is_migration(pte)) {
>                                 migration_remove_entry(swp_entry);
>                         } else
>                                 free_swap_and_cache(swp_entry);
> 
> migration_remove_entry should decrease the IDR counter, and 
> remove the migration cache page on zero reference.
> 
> Am I missing something?

That's true only if the pte points a migration entry.
However, the pte may not point it when zap_pte_range() is called
in some case.

Please suppose the following flow.
Any process may exit or munmap during memory migration
before calling set_pte(migration entry). This will
keep some unreferenced pages in the migration cache.
No one can remove these pages.

  <start page migration>                  <Process A>
        |                                      |
        |                                      |
        |                                      |
 add_to_migration_cache()                      |
    insert a page of Process A  ----------->   |
    in the migration cache.                    |
        |                                      |
        |                               zap_pte_range()
        |                   X <------------ migration_remove_entry()
        |                      the pte associated with the page doesn't
        |                      point any migration entries.
        |
        |
 try_to_unmap() -----------------------> X
     migration_duplicate()       no pte mapping the page can be found.
     set_pte(migration entry)
        |
        |
 migrate_fn()
        |
        |
    <finish>
         the page still remains in the migration cache.
	 the page may be referred by no process.


> I assume you are seeing this problems in practice?

Yes, it often happens without the patch.

> Sorry for the delay, been busy with other things.

No problem. Everyone knows you're doing hard work!

> > Therefore, I made pages removed from the migration cache
> > at the end of generic_migrate_page() if they remain in the cache.
> > 
> > The another is a fork() related problem. If fork() has occurred
> > during page migration, the previous work may not go well.
> > pages may not be removed from the migration cache.
> > 
> > So I made the swapcode ignore pages in the migration cache.
> > However, as you know this is just a workaround and not a correct
> > way to fix it.
> > 
> > > Hi Hirokazu!
> > > 
> > > The problem is that another thread can fault in the pte 
> > > (removing the radix tree entry) while the current thread dropped the 
> > > page_table_lock - which explains the NULL lookup_migration_cache. 
> > > The swap code handles this situation, but I've completly missed it. 
> > > 
> > > Updated patch attached.
> > > 
> > > Extreme thanks for your testing, its being crucial! 
> > > 
> > > We're getting there.
> > > 
> > > do_swap_page now does:
> > > 
> > >  again:
> > > +       if (pte_is_migration(orig_pte)) {
> > > +               page = lookup_migration_cache(entry.val);
> > > +               if (!page) {
> > > +                       spin_lock(&mm->page_table_lock);
> > > +                       page_table = pte_offset_map(pmd, address);
> > > +                       if (likely(pte_same(*page_table, orig_pte)))
> > > +                               ret = VM_FAULT_OOM;
> > > +                       else
> > > +                               ret = VM_FAULT_MINOR;
> > > +                       pte_unmap(page_table);
> > > +                       spin_unlock(&mm->page_table_lock);
> > > +                       goto out;
> > > +               }
> > > +       } else {
> > > 
> > 
> > 
> > 
> > Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>
> > ---
> > 
> >  linux-2.6.9-rc4-taka/mm/memory.c   |    2 +-
> >  linux-2.6.9-rc4-taka/mm/mmigrate.c |   28 +++++++++++++++++++++-------
> >  linux-2.6.9-rc4-taka/mm/vmscan.c   |    4 ++++
> >  3 files changed, 26 insertions(+), 8 deletions(-)
> > 
> > diff -puN mm/mmigrate.c~marcelo-FIX1 mm/mmigrate.c
> > --- linux-2.6.9-rc4/mm/mmigrate.c~marcelo-FIX1	Tue Nov 16 10:43:56 2004
> > +++ linux-2.6.9-rc4-taka/mm/mmigrate.c	Tue Nov 16 11:07:10 2004
> > @@ -114,14 +114,14 @@ int migration_remove_entry(swp_entry_t e
> >  
> >  	lock_page(page);	
> >  
> > -	migration_remove_reference(page);
> > +	migration_remove_reference(page, 1);
> >  
> >  	unlock_page(page);
> >  
> >  	page_cache_release(page);
> >  }
> >  
> > -int migration_remove_reference(struct page *page)
> > +int migration_remove_reference(struct page *page, int dec)
> >  {
> >  	struct counter *c;
> >  	swp_entry_t entry;
> > @@ -134,10 +134,9 @@ int migration_remove_reference(struct pa
> >  
> >  	read_unlock_irq(&migration_space.tree_lock);
> >  
> > -	if (!c->i)
> > -		BUG();
> > +	BUG_ON(c->i < dec);
> >  
> > -	c->i--;
> > +	c->i -= dec;
> >  
> >  	if (!c->i) {
> >  		remove_from_migration_cache(page, page->private);
> > @@ -146,6 +145,15 @@ int migration_remove_reference(struct pa
> >  	}
> >  }
> >  
> > +int detach_from_migration_cache(struct page *page)
> > +{
> > +	lock_page(page);	
> > +	migration_remove_reference(page, 0);
> > +	unlock_page(page);
> > +
> > +	return 0;
> > +}
> > +
> >  int add_to_migration_cache(struct page *page, int gfp_mask) 
> >  {
> >  	int error, offset;
> > @@ -522,7 +530,9 @@ generic_migrate_page(struct page *page, 
> >  
> >  	/* map the newpage where the old page have been mapped. */
> >  	touch_unmapped_address(&vlist);
> > -	if (PageSwapCache(newpage)) {
> > +	if (PageMigration(newpage))
> > +		detach_from_migration_cache(newpage);
> > +	else if (PageSwapCache(newpage)) {
> >  		lock_page(newpage);
> >  		__remove_exclusive_swap_page(newpage, 1);
> >  		unlock_page(newpage);
> 
> I dont see this code on 2.6.9-rc2-mm4-mhp, I should upgrade.
> 
> > @@ -538,7 +548,9 @@ out_busy:
> >  	/* Roll back all operations. */
> >  	unwind_page(page, newpage);
> >  	touch_unmapped_address(&vlist);
> > -	if (PageSwapCache(page)) {
> > +	if (PageMigration(page))
> > +		detach_from_migration_cache(page);
> > +	else if (PageSwapCache(page)) {
> >  		lock_page(page);
> >  		__remove_exclusive_swap_page(page, 1);
> >  		unlock_page(page);
> > @@ -550,6 +562,8 @@ out_removing:
> >  		BUG();
> >  	unlock_page(page);
> >  	unlock_page(newpage);
> > +	if (PageMigration(page))
> > +		detach_from_migration_cache(page);
> >  	return ret;
> >  }
> >  
> > diff -puN mm/vmscan.c~marcelo-FIX1 mm/vmscan.c
> > --- linux-2.6.9-rc4/mm/vmscan.c~marcelo-FIX1	Mon Nov 15 12:20:35 2004
> > +++ linux-2.6.9-rc4-taka/mm/vmscan.c	Tue Nov 16 11:06:06 2004
> > @@ -459,6 +459,10 @@ int shrink_list(struct list_head *page_l
> >  			goto keep_locked;
> >  		}
> >  
> > +		if (PageMigration(page)) {
> > +			write_unlock_irq(&mapping->tree_lock);
> > +			goto keep_locked;
> > +		}
> >  #ifdef CONFIG_SWAP
> >  		if (PageSwapCache(page)) {
> >  			swp_entry_t swap = { .val = page->private };
> > diff -puN mm/memory.c~marcelo-FIX1 mm/memory.c
> > --- linux-2.6.9-rc4/mm/memory.c~marcelo-FIX1	Tue Nov 16 11:06:31 2004
> > +++ linux-2.6.9-rc4-taka/mm/memory.c	Tue Nov 16 11:06:57 2004
> > @@ -1621,7 +1621,7 @@ again:
> >  		if (vm_swap_full())
> >  			remove_exclusive_swap_page(page);
> >  	} else {
> > -		migration_remove_reference(page);
> > +		migration_remove_reference(page, 1);
> >  	}
> >  
> >  	mm->rss++;
> > _
> 
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-11-24 10:21                       ` Hirokazu Takahashi
@ 2004-12-01 20:21                         ` Marcelo Tosatti
  2004-12-08 13:23                           ` Hirokazu Takahashi
  0 siblings, 1 reply; 48+ messages in thread
From: Marcelo Tosatti @ 2004-12-01 20:21 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: linux-mm, iwamoto, haveblue, hugh

On Wed, Nov 24, 2004 at 07:21:56PM +0900, Hirokazu Takahashi wrote:
> Hi Marcelo,

Hi again Hirokazu, finally found sometime to think about this.

> > > Hi Marcelo,
> > > 
> > > I've been testing the memory migration code with your patch.
> > > I found problems and I think the attached patch would
> > > fix some of them.
> > > 
> > > One of the problems is a race condition between add_to_migration_cache()
> > > and try_to_unmap(). Some pages in the migration cache cannot
> > > be removed with the current implementation. Please suppose
> > > a process space might be removed between them. In this case
> > > no one can remove pages the process had from the migration cache,
> > > because they can be removed only when the pagetables pointed
> > > the pages.
> > 
> > I guess I dont fully understand you Hirokazu.
> > 
> > unmap_vmas function (called by exit_mmap) calls zap_pte_range, 
> > and that does:
> > 
> >                         if (pte_is_migration(pte)) {
> >                                 migration_remove_entry(swp_entry);
> >                         } else
> >                                 free_swap_and_cache(swp_entry);
> > 
> > migration_remove_entry should decrease the IDR counter, and 
> > remove the migration cache page on zero reference.
> > 
> > Am I missing something?
> 
> That's true only if the pte points a migration entry.
> However, the pte may not point it when zap_pte_range() is called
> in some case.
> 
> Please suppose the following flow.
> Any process may exit or munmap during memory migration
> before calling set_pte(migration entry). This will
> keep some unreferenced pages in the migration cache.
> No one can remove these pages.
> 
>   <start page migration>                  <Process A>
>         |                                      |
>         |                                      |
>         |                                      |
>  add_to_migration_cache()                      |
>     insert a page of Process A  ----------->   |
>     in the migration cache.                    |
>         |                                      |
>         |                               zap_pte_range()
>         |                   X <------------ migration_remove_entry()
>         |                      the pte associated with the page doesn't
>         |                      point any migration entries.

OK, I see it, it's the "normal" anonymous pte which will be removed at
this point.

>         |
>         |
>  try_to_unmap() -----------------------> X
>      migration_duplicate()       no pte mapping the page can be found.
>      set_pte(migration entry)
>         |
>         |
>  migrate_fn()
>         |
>         |
>     <finish>
>          the page still remains in the migration cache.
> 	 the page may be referred by no process.
> 
> 
> > I assume you are seeing this problems in practice?
> 
> Yes, it often happens without the patch.
> 
> > Sorry for the delay, been busy with other things.
> 
> No problem. Everyone knows you're doing hard work!
> 
> > > Therefore, I made pages removed from the migration cache
> > > at the end of generic_migrate_page() if they remain in the cache.

OK, removing migration pages at end of generic_migrate_page() should 
avoid the leak - that part of your patch is fine to me!

> > > The another is a fork() related problem. If fork() has occurred
> > > during page migration, the previous work may not go well.
> > > pages may not be removed from the migration cache.

Can you please expand on that one? I assume it works fine because 
copy_page_range() duplicates the migration page reference (and the 
migration pte), meaning that on exit (zap_pte_range) the migration
pages should be removed through migration_remove_entry(). 

I dont see the problem - please correct me.

> > > So I made the swapcode ignore pages in the migration cache.
> > > However, as you know this is just a workaround and not a correct
> > > way to fix it.

What does this have to do with fork()? I can't understand.

Your patch is correct here also - we can't reclaim migration cache 
pages.

+	if (PageMigration(page)) {
+		write_unlock_irq(&mapping->tree_lock);
+		goto keep_locked;
+	}

An enhancement would be to force pagefault of all pte's
mapping to a migration cache page on shrink_list.  

similar to rmap.c's try_to_unmap_anon() but intended to create the pte 
instead of unmapping it

        anon_vma = page_lock_anon_vma(page);

        list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
		ret = try_to_faultin(page, vma);

And try_to_faultin() calling handle_mm_fault()...

Is that what you mean?

Anyways, does the migration cache survive your stress testing now 
with these changes ? 

I've coded the beginning of a skeleton for the nonblocking version of migrate_onepage().

Can you generate a new migration cache patch on top of linux-2.6.10-rc1-mm2-mhp2 
with your fixes ?

Thanks!
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-12-01 20:21                         ` Marcelo Tosatti
@ 2004-12-08 13:23                           ` Hirokazu Takahashi
  2005-01-17  9:59                             ` Marcelo Tosatti
  0 siblings, 1 reply; 48+ messages in thread
From: Hirokazu Takahashi @ 2004-12-08 13:23 UTC (permalink / raw)
  To: marcelo.tosatti; +Cc: linux-mm, iwamoto, haveblue, hugh

Hi Marcelo,

Sorry for the delayed reply.

> > > > I've been testing the memory migration code with your patch.
> > > > I found problems and I think the attached patch would
> > > > fix some of them.
> > > > 
> > > > One of the problems is a race condition between add_to_migration_cache()
> > > > and try_to_unmap(). Some pages in the migration cache cannot
> > > > be removed with the current implementation. Please suppose
> > > > a process space might be removed between them. In this case
> > > > no one can remove pages the process had from the migration cache,
> > > > because they can be removed only when the pagetables pointed
> > > > the pages.
> > > 
> > > I guess I dont fully understand you Hirokazu.
> > > 
> > > unmap_vmas function (called by exit_mmap) calls zap_pte_range, 
> > > and that does:
> > > 
> > >                         if (pte_is_migration(pte)) {
> > >                                 migration_remove_entry(swp_entry);
> > >                         } else
> > >                                 free_swap_and_cache(swp_entry);
> > > 
> > > migration_remove_entry should decrease the IDR counter, and 
> > > remove the migration cache page on zero reference.
> > > 
> > > Am I missing something?
> > 
> > That's true only if the pte points a migration entry.
> > However, the pte may not point it when zap_pte_range() is called
> > in some case.
> > 
> > Please suppose the following flow.
> > Any process may exit or munmap during memory migration
> > before calling set_pte(migration entry). This will
> > keep some unreferenced pages in the migration cache.
> > No one can remove these pages.
> > 
> >   <start page migration>                  <Process A>
> >         |                                      |
> >         |                                      |
> >         |                                      |
> >  add_to_migration_cache()                      |
> >     insert a page of Process A  ----------->   |
> >     in the migration cache.                    |
> >         |                                      |
> >         |                               zap_pte_range()
> >         |                   X <------------ migration_remove_entry()
> >         |                      the pte associated with the page doesn't
> >         |                      point any migration entries.
> 
> OK, I see it, it's the "normal" anonymous pte which will be removed at
> this point.
> 
> >         |
> >         |
> >  try_to_unmap() -----------------------> X
> >      migration_duplicate()       no pte mapping the page can be found.
> >      set_pte(migration entry)
> >         |
> >         |
> >  migrate_fn()
> >         |
> >         |
> >     <finish>
> >          the page still remains in the migration cache.
> > 	 the page may be referred by no process.
> > 
> > 
> > > I assume you are seeing this problems in practice?
> > 
> > Yes, it often happens without the patch.
> > 
> > > Sorry for the delay, been busy with other things.
> > 
> > No problem. Everyone knows you're doing hard work!

> > > > Therefore, I made pages removed from the migration cache
> > > > at the end of generic_migrate_page() if they remain in the cache.
> 
> OK, removing migration pages at end of generic_migrate_page() should 
> avoid the leak - that part of your patch is fine to me!
> 
> > > > The another is a fork() related problem. If fork() has occurred
> > > > during page migration, the previous work may not go well.
> > > > pages may not be removed from the migration cache.
> 
> Can you please expand on that one? I assume it works fine because 
> copy_page_range() duplicates the migration page reference (and the 
> migration pte), meaning that on exit (zap_pte_range) the migration
> pages should be removed through migration_remove_entry(). 

Yes, that's true.

> I dont see the problem - please correct me.

However, once the page is moved into the migration cache,
no one can make it swapped out. This problem may be solved
by your approach described below.

> > > > So I made the swapcode ignore pages in the migration cache.
> > > > However, as you know this is just a workaround and not a correct
> > > > way to fix it.
> 
> What does this have to do with fork()? I can't understand.

fork() may leave some pages in the migration cache with my
latest implementation, though the memory migration code
tries to remove them from the migration cache by forcible
pagefault in touch_unmapped_address().

However, touch_unmapped_address() doesn't know that the
migration page has been duplicated.

> Your patch is correct here also - we can't reclaim migration cache 
> pages.
> 
> +	if (PageMigration(page)) {
> +		write_unlock_irq(&mapping->tree_lock);
> +		goto keep_locked;
> +	}
> 
> An enhancement would be to force pagefault of all pte's
> mapping to a migration cache page on shrink_list.  
>
> similar to rmap.c's try_to_unmap_anon() but intended to create the pte 
> instead of unmapping it

If it works as we expect, this code can be called at the end of
generic_migrate_page() I guess.

>         anon_vma = page_lock_anon_vma(page);
> 
>         list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
> 		ret = try_to_faultin(page, vma);
> 
> And try_to_faultin() calling handle_mm_fault()...
> 
> Is that what you mean?
> 
> Anyways, does the migration cache survive your stress testing now 
> with these changes ? 

Sure.

> I've coded the beginning of a skeleton for the nonblocking version of migrate_onepage().
> 
> Can you generate a new migration cache patch on top of linux-2.6.10-rc1-mm2-mhp2 
> with your fixes ?

I ported your patch and my fixes on the top of linux-2.6.10-rc1-mm5-mhp1.


Thanks,
Hirokazu Takahashi.


---

Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>
---

 linux-2.6.10-rc1-mm5-taka/include/linux/mm.h      |   23 ++
 linux-2.6.10-rc1-mm5-taka/include/linux/swap.h    |   16 +
 linux-2.6.10-rc1-mm5-taka/include/linux/swapops.h |   25 ++
 linux-2.6.10-rc1-mm5-taka/mm/fremap.c             |   11 -
 linux-2.6.10-rc1-mm5-taka/mm/memory.c             |   50 ++++-
 linux-2.6.10-rc1-mm5-taka/mm/mmigrate.c           |  192 +++++++++++++++++++++-
 linux-2.6.10-rc1-mm5-taka/mm/page_io.c            |    1 
 linux-2.6.10-rc1-mm5-taka/mm/rmap.c               |   32 ++-
 linux-2.6.10-rc1-mm5-taka/mm/shmem.c              |    1 
 linux-2.6.10-rc1-mm5-taka/mm/swapfile.c           |    9 -
 linux-2.6.10-rc1-mm5-taka/mm/vmscan.c             |    6 
 11 files changed, 331 insertions(+), 35 deletions(-)

diff -puN include/linux/mm.h~migration_cache_marcelo5 include/linux/mm.h
--- linux-2.6.10-rc1-mm5/include/linux/mm.h~migration_cache_marcelo5	Wed Dec  8 08:26:10 2004
+++ linux-2.6.10-rc1-mm5-taka/include/linux/mm.h	Wed Dec  8 08:26:10 2004
@@ -286,6 +286,24 @@ extern int capture_page_range(unsigned l
  * files which need it (119 of them)
  */
 #include <linux/page-flags.h>
+#include <linux/swap.h>
+#include <linux/swapops.h> 
+
+static inline int PageMigration(struct page *page)
+{
+        swp_entry_t entry;
+
+        if (!PageSwapCache(page))
+                return 0;
+
+        entry.val = page->private;
+
+        if (swp_type(entry) != MIGRATION_TYPE)
+                return 0;
+
+        return 1;
+}
+
 
 /*
  * Methods to modify the page usage count.
@@ -493,11 +511,14 @@ void page_address_init(void);
 #define PAGE_MAPPING_ANON	1
 
 extern struct address_space swapper_space;
+extern struct address_space migration_space;
 static inline struct address_space *page_mapping(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	if (unlikely(PageSwapCache(page)))
+	if (unlikely(PageMigration(page)))
+		mapping = &migration_space;
+	else if (unlikely(PageSwapCache(page)))
 		mapping = &swapper_space;
 	else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
 		mapping = NULL;
diff -puN include/linux/swap.h~migration_cache_marcelo5 include/linux/swap.h
--- linux-2.6.10-rc1-mm5/include/linux/swap.h~migration_cache_marcelo5	Wed Dec  8 08:26:10 2004
+++ linux-2.6.10-rc1-mm5-taka/include/linux/swap.h	Wed Dec  8 08:26:10 2004
@@ -258,6 +258,7 @@ static inline int remove_exclusive_swap_
 {
 	return __remove_exclusive_swap_page(p, 0);
 }
+extern int migration_remove_entry(swp_entry_t);
 struct backing_dev_info;
 
 extern struct swap_list_t swap_list;
@@ -331,6 +332,21 @@ static inline swp_entry_t get_swap_page(
 #define put_swap_token(x) do { } while(0)
 #define grab_swap_token()  do { } while(0)
 #define has_swap_token(x) 0
+
+static inline int PageMigration(struct page *page)
+{
+        swp_entry_t entry;
+
+        if (!PageSwapCache(page))
+                return 0;
+
+        entry.val = page->private;
+
+        if (swp_type(entry) != MIGRATION_TYPE)
+                return 0;
+
+        return 1;
+}
 
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
diff -puN include/linux/swapops.h~migration_cache_marcelo5 include/linux/swapops.h
--- linux-2.6.10-rc1-mm5/include/linux/swapops.h~migration_cache_marcelo5	Wed Dec  8 08:26:10 2004
+++ linux-2.6.10-rc1-mm5-taka/include/linux/swapops.h	Wed Dec  8 08:26:10 2004
@@ -10,7 +10,9 @@
  * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
  */
 #define SWP_TYPE_SHIFT(e)	(sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
-#define SWP_OFFSET_MASK(e)	((1UL << SWP_TYPE_SHIFT(e)) - 1)
+#define SWP_OFFSET_MASK(e)	((1UL << (SWP_TYPE_SHIFT(e))) - 1)
+
+#define MIGRATION_TYPE  (MAX_SWAPFILES - 1)
 
 /*
  * Store a type+offset into a swp_entry_t in an arch-independent format
@@ -67,3 +69,24 @@ static inline pte_t swp_entry_to_pte(swp
 	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
+
+static inline int pte_is_migration(pte_t pte)
+{
+	unsigned long swp_type;
+	swp_entry_t arch_entry;
+
+	arch_entry = __pte_to_swp_entry(pte);
+
+	swp_type = __swp_type(arch_entry);
+
+	return swp_type == MIGRATION_TYPE;
+}
+
+static inline pte_t migration_entry_to_pte(swp_entry_t entry)
+{
+	swp_entry_t arch_entry;
+	
+	arch_entry = __swp_entry(MIGRATION_TYPE, swp_offset(entry));
+	return __swp_entry_to_pte(arch_entry);
+}
+
diff -puN mm/fremap.c~migration_cache_marcelo5 mm/fremap.c
--- linux-2.6.10-rc1-mm5/mm/fremap.c~migration_cache_marcelo5	Wed Dec  8 08:26:10 2004
+++ linux-2.6.10-rc1-mm5-taka/mm/fremap.c	Wed Dec  8 08:26:10 2004
@@ -11,7 +11,6 @@
 #include <linux/file.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
-#include <linux/swapops.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
@@ -43,8 +42,14 @@ static inline void zap_pte(struct mm_str
 			}
 		}
 	} else {
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte)) {
+			swp_entry_t swp_entry = pte_to_swp_entry(pte);
+			if (pte_is_migration(pte)) { 
+				migration_remove_entry(swp_entry);
+			} else {
+				free_swap_and_cache(swp_entry);
+			}
+		}
 		pte_clear(ptep);
 	}
 }
diff -puN mm/memory.c~migration_cache_marcelo5 mm/memory.c
--- linux-2.6.10-rc1-mm5/mm/memory.c~migration_cache_marcelo5	Wed Dec  8 08:26:10 2004
+++ linux-2.6.10-rc1-mm5-taka/mm/memory.c	Wed Dec  8 08:36:41 2004
@@ -56,7 +56,6 @@
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-#include <linux/swapops.h>
 #include <linux/elf.h>
 
 #ifndef CONFIG_DISCONTIGMEM
@@ -264,7 +263,10 @@ copy_swap_pte(struct mm_struct *dst_mm, 
 {
 	if (pte_file(pte))
 		return;
-	swap_duplicate(pte_to_swp_entry(pte));
+	if (pte_is_migration(pte)) 
+		migration_duplicate(pte_to_swp_entry(pte));
+	else
+		swap_duplicate(pte_to_swp_entry(pte));
 	if (list_empty(&dst_mm->mmlist)) {
 		spin_lock(&mmlist_lock);
 		list_add(&dst_mm->mmlist, &src_mm->mmlist);
@@ -537,8 +539,13 @@ static void zap_pte_range(struct mmu_gat
 		 */
 		if (unlikely(details))
 			continue;
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
+		if (!pte_file(pte)) {
+			swp_entry_t swp_entry = pte_to_swp_entry(pte);
+			if (pte_is_migration(pte)) {
+				migration_remove_entry(swp_entry);
+			} else
+				free_swap_and_cache(swp_entry);
+		}
 		pte_clear(ptep);
 	}
 	pte_unmap(ptep-1);
@@ -1739,6 +1746,20 @@ static int do_swap_page(struct mm_struct
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 again:
+	if (pte_is_migration(orig_pte)) {
+		page = lookup_migration_cache(entry.val);
+		if (!page) { 
+			spin_lock(&mm->page_table_lock);
+			page_table = pte_offset_map(pmd, address);
+			if (likely(pte_same(*page_table, orig_pte)))
+				ret = VM_FAULT_OOM;
+			else
+				ret = VM_FAULT_MINOR;
+			pte_unmap(page_table);
+			spin_unlock(&mm->page_table_lock);
+			goto out;
+		}
+	} else {
 	page = lookup_swap_cache(entry);
 	if (!page) {
  		swapin_readahead(entry, address, vma);
@@ -1764,15 +1785,22 @@ again:
 		inc_page_state(pgmajfault);
 		grab_swap_token();
 	}
-
 	mark_page_accessed(page);
 	lock_page(page);
 	if (!PageSwapCache(page)) {
+		/* hiro: add !PageMigration(page) here */
 		/* page-migration has occured */
 		unlock_page(page);
 		page_cache_release(page);
 		goto again;
 	}
+	}
+
+
+	if (pte_is_migration(orig_pte)) {
+		mark_page_accessed(page);
+		lock_page(page);
+	}
 
 	/*
 	 * Back out if somebody else faulted in this pte while we
@@ -1790,10 +1818,14 @@ again:
 	}
 
 	/* The page isn't present yet, go ahead with the fault. */
-		
-	swap_free(entry);
-	if (vm_swap_full())
-		remove_exclusive_swap_page(page);
+
+	if (!pte_is_migration(orig_pte)) {
+		swap_free(entry);
+		if (vm_swap_full())
+			remove_exclusive_swap_page(page);
+	} else {
+		migration_remove_reference(page, 1);
+	}
 
 	mm->rss++;
 	acct_update_integrals();
diff -puN mm/mmigrate.c~migration_cache_marcelo5 mm/mmigrate.c
--- linux-2.6.10-rc1-mm5/mm/mmigrate.c~migration_cache_marcelo5	Wed Dec  8 08:26:10 2004
+++ linux-2.6.10-rc1-mm5-taka/mm/mmigrate.c	Wed Dec  8 08:36:41 2004
@@ -1,4 +1,4 @@
-/*
+ /*
  *  linux/mm/mmigrate.c
  *
  *  Support of memory hotplug
@@ -21,6 +21,8 @@
 #include <linux/rmap.h>
 #include <linux/mmigrate.h>
 #include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/page-flags.h>
 
 /*
  * The concept of memory migration is to replace a target page with
@@ -35,6 +37,169 @@
  * hugetlbpages can be handled in the same way.
  */
 
+struct counter {
+	int i;
+};
+
+struct idr migration_idr;
+
+static struct address_space_operations migration_aops = {
+        .writepage      = NULL,
+        .sync_page      = NULL,
+        .set_page_dirty = __set_page_dirty_nobuffers,
+};
+
+static struct backing_dev_info migration_backing_dev_info = {
+        .memory_backed  = 1,    /* Does not contribute to dirty memory */
+        .unplug_io_fn   = NULL,
+};
+
+struct address_space migration_space = {
+        .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC),
+        .tree_lock      = RW_LOCK_UNLOCKED,
+        .a_ops          = &migration_aops,
+        .flags          = GFP_HIGHUSER,
+        .i_mmap_nonlinear = LIST_HEAD_INIT(migration_space.i_mmap_nonlinear),
+        .backing_dev_info = &migration_backing_dev_info,
+};
+
+int init_migration_cache(void) 
+{
+	idr_init(&migration_idr);
+
+	return 0;
+}
+
+__initcall(init_migration_cache);
+
+struct page *lookup_migration_cache(int id) 
+{ 
+	return find_get_page(&migration_space, id);
+}
+
+void migration_duplicate(swp_entry_t entry)
+{
+	int offset;
+	struct counter *cnt;
+
+	read_lock_irq(&migration_space.tree_lock);
+
+	cnt = idr_find(&migration_idr, swp_offset(entry));
+	cnt->i = cnt->i + 1;
+
+	read_unlock_irq(&migration_space.tree_lock);
+}
+
+void remove_from_migration_cache(struct page *page, int id)
+{
+	write_lock_irq(&migration_space.tree_lock);
+        idr_remove(&migration_idr, id);
+	radix_tree_delete(&migration_space.page_tree, id);
+	ClearPageSwapCache(page);
+	page->private = NULL;
+	write_unlock_irq(&migration_space.tree_lock);
+}
+
+// FIXME: if the page is locked will it be correctly removed from migr cache?
+// check races
+
+int migration_remove_entry(swp_entry_t entry)
+{
+	struct page *page;
+	
+	page = find_get_page(&migration_space, entry.val);
+
+	if (!page)
+		BUG();
+
+	lock_page(page);	
+
+	migration_remove_reference(page, 1);
+
+	unlock_page(page);
+
+	page_cache_release(page);
+}
+
+int migration_remove_reference(struct page *page, int dec)
+{
+	struct counter *c;
+	swp_entry_t entry;
+
+	entry.val = page->private;
+
+	read_lock_irq(&migration_space.tree_lock);
+
+	c = idr_find(&migration_idr, swp_offset(entry));
+
+	read_unlock_irq(&migration_space.tree_lock);
+
+	BUG_ON(c->i < dec);
+
+	c->i -= dec;
+
+	if (!c->i) {
+		remove_from_migration_cache(page, page->private);
+		kfree(c);
+		page_cache_release(page);
+	}
+}
+
+int detach_from_migration_cache(struct page *page)
+{
+	lock_page(page);	
+	migration_remove_reference(page, 0);
+	unlock_page(page);
+
+	return 0;
+}
+
+int add_to_migration_cache(struct page *page, int gfp_mask) 
+{
+	int error, offset;
+	struct counter *counter;
+	swp_entry_t entry;
+
+	BUG_ON(PageSwapCache(page));
+
+	BUG_ON(PagePrivate(page));
+
+        if (idr_pre_get(&migration_idr, GFP_ATOMIC) == 0)
+                return -ENOMEM;
+
+	counter = kmalloc(sizeof(struct counter), GFP_KERNEL);
+
+	if (!counter)
+		return -ENOMEM;
+
+	error = radix_tree_preload(gfp_mask);
+
+	counter->i = 0;
+
+	if (!error) {
+		write_lock_irq(&migration_space.tree_lock);
+	        error = idr_get_new_above(&migration_idr, counter, 1, &offset);
+
+		if (error < 0)
+			BUG();
+
+		entry = swp_entry(MIGRATION_TYPE, offset);
+
+		error = radix_tree_insert(&migration_space.page_tree, entry.val,
+							page);
+		if (!error) {
+			page_cache_get(page);
+			SetPageLocked(page);
+			page->private = entry.val;
+			SetPageSwapCache(page);
+		}
+		write_unlock_irq(&migration_space.tree_lock);
+                radix_tree_preload_end();
+
+	}
+
+	return error;
+}
 
 /*
  * Try to writeback a dirty page to free its buffers.
@@ -121,9 +286,11 @@ page_migratable(struct page *page, struc
 	if (PageWriteback(page))
 		return -EAGAIN;
 	/* The page might have been truncated */
-	truncated = !PageSwapCache(newpage) && page_mapping(page) == NULL;
-	if (page_count(page) + truncated <= freeable_page_count)
+	truncated = !PageSwapCache(newpage) &&
+		page_mapping(page) == NULL;
+	if (page_count(page) + truncated <= freeable_page_count) 
 		return truncated ? -ENOENT : 0;
+
 	return -EAGAIN;
 }
 
@@ -365,7 +532,9 @@ generic_migrate_page(struct page *page, 
 
 	/* map the newpage where the old page have been mapped. */
 	touch_unmapped_address(&vlist);
-	if (PageSwapCache(newpage)) {
+	if (PageMigration(newpage))
+		detach_from_migration_cache(newpage);
+	else if (PageSwapCache(newpage)) {
 		lock_page(newpage);
 		__remove_exclusive_swap_page(newpage, 1);
 		unlock_page(newpage);
@@ -381,7 +550,9 @@ out_busy:
 	/* Roll back all operations. */
 	unwind_page(page, newpage);
 	touch_unmapped_address(&vlist);
-	if (PageSwapCache(page)) {
+	if (PageMigration(page))
+		detach_from_migration_cache(page);
+	else if (PageSwapCache(page)) {
 		lock_page(page);
 		__remove_exclusive_swap_page(page, 1);
 		unlock_page(page);
@@ -394,6 +565,8 @@ out_removing:
 		BUG();
 	unlock_page(page);
 	unlock_page(newpage);
+	if (PageMigration(page))
+		detach_from_migration_cache(page);
 	return ret;
 }
 
@@ -415,10 +588,14 @@ migrate_onepage(struct page *page)
 	 */
 #ifdef CONFIG_SWAP
 	if (PageAnon(page) && !PageSwapCache(page))
-		if (!add_to_swap(page, GFP_KERNEL)) {
+		if (add_to_migration_cache(page, GFP_KERNEL)) {
 			unlock_page(page);
 			return ERR_PTR(-ENOSPC);
 		}
+/*		if (!add_to_swap(page, GFP_KERNEL)) {
+			unlock_page(page);
+			return ERR_PTR(-ENOSPC);
+		} */
 #endif /* CONFIG_SWAP */
 	if ((mapping = page_mapping(page)) == NULL) {
 		/* truncation is in progress */
@@ -437,8 +614,9 @@ migrate_onepage(struct page *page)
 		return ERR_PTR(-ENOMEM);
 	}
 
-	if (mapping->a_ops->migrate_page)
+	if (mapping->a_ops && mapping->a_ops->migrate_page) {
 		ret = mapping->a_ops->migrate_page(page, newpage);
+	}
 	else
 		ret = generic_migrate_page(page, newpage, migrate_page_common);
 	if (ret) {
diff -puN mm/page_io.c~migration_cache_marcelo5 mm/page_io.c
--- linux-2.6.10-rc1-mm5/mm/page_io.c~migration_cache_marcelo5	Wed Dec  8 08:26:10 2004
+++ linux-2.6.10-rc1-mm5-taka/mm/page_io.c	Wed Dec  8 08:26:10 2004
@@ -15,7 +15,6 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/bio.h>
-#include <linux/swapops.h>
 #include <linux/writeback.h>
 #include <asm/pgtable.h>
 
diff -puN mm/rmap.c~migration_cache_marcelo5 mm/rmap.c
--- linux-2.6.10-rc1-mm5/mm/rmap.c~migration_cache_marcelo5	Wed Dec  8 08:26:10 2004
+++ linux-2.6.10-rc1-mm5-taka/mm/rmap.c	Wed Dec  8 08:26:10 2004
@@ -49,7 +49,7 @@
 #include <linux/sched.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+//#include <linux/swapops.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/acct.h>
@@ -655,15 +655,29 @@ static int try_to_unmap_one(struct page 
 		 * Store the swap location in the pte.
 		 * See handle_pte_fault() ...
 		 */
-		BUG_ON(!PageSwapCache(page));
-		swap_duplicate(entry);
-		if (list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			list_add(&mm->mmlist, &init_mm.mmlist);
-			spin_unlock(&mmlist_lock);
+		//BUG_ON(!PageSwapCache(page));
+		if (PageSwapCache(page) && !PageMigration(page)) {
+			swap_duplicate(entry);
+			if (list_empty(&mm->mmlist)) {
+				spin_lock(&mmlist_lock);
+				list_add(&mm->mmlist, &init_mm.mmlist);
+				spin_unlock(&mmlist_lock);
+			}
+			set_pte(pte, swp_entry_to_pte(entry));
+			BUG_ON(pte_file(*pte));
+		} else if (PageMigration(page)) {
+			// page cache get to reference pte,
+			// remove from migration cache
+			// on zero-users at fault path
+			migration_duplicate(entry);
+			if (list_empty(&mm->mmlist)) {
+				spin_lock(&mmlist_lock);
+				list_add(&mm->mmlist, &init_mm.mmlist);
+				spin_unlock(&mmlist_lock);
+			}
+			set_pte(pte, migration_entry_to_pte(entry));
+			BUG_ON(pte_file(*pte));
 		}
-		set_pte(pte, swp_entry_to_pte(entry));
-		BUG_ON(pte_file(*pte));
 		mm->anon_rss--;
 	}
 
diff -puN mm/shmem.c~migration_cache_marcelo5 mm/shmem.c
--- linux-2.6.10-rc1-mm5/mm/shmem.c~migration_cache_marcelo5	Wed Dec  8 08:26:10 2004
+++ linux-2.6.10-rc1-mm5-taka/mm/shmem.c	Wed Dec  8 08:26:10 2004
@@ -42,7 +42,6 @@
 #include <linux/vfs.h>
 #include <linux/blkdev.h>
 #include <linux/security.h>
-#include <linux/swapops.h>
 #include <linux/mempolicy.h>
 #include <linux/namei.h>
 #include <linux/xattr.h>
diff -puN mm/swapfile.c~migration_cache_marcelo5 mm/swapfile.c
--- linux-2.6.10-rc1-mm5/mm/swapfile.c~migration_cache_marcelo5	Wed Dec  8 08:26:10 2004
+++ linux-2.6.10-rc1-mm5-taka/mm/swapfile.c	Wed Dec  8 08:26:10 2004
@@ -34,7 +34,6 @@
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
-#include <linux/swapops.h>
 
 spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
 unsigned int nr_swapfiles;
@@ -235,6 +234,7 @@ bad_device:
 	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
 	goto out;
 bad_nofile:
+	BUG();
 	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
 out:
 	return NULL;
@@ -1409,6 +1409,13 @@ asmlinkage long sys_swapon(const char __
 		swap_list_unlock();
 		goto out;
 	}
+
+	/* MIGRATION_TYPE is reserved for migration pages */
+	if (type >= MIGRATION_TYPE) {
+		swap_list_unlock();
+		goto out;
+	}
+
 	if (type >= nr_swapfiles)
 		nr_swapfiles = type+1;
 	INIT_LIST_HEAD(&p->extent_list);
diff -puN mm/vmscan.c~migration_cache_marcelo5 mm/vmscan.c
--- linux-2.6.10-rc1-mm5/mm/vmscan.c~migration_cache_marcelo5	Wed Dec  8 08:26:10 2004
+++ linux-2.6.10-rc1-mm5-taka/mm/vmscan.c	Wed Dec  8 08:36:41 2004
@@ -38,8 +38,6 @@
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 
-#include <linux/swapops.h>
-
 /*
  * The list of shrinker callbacks used by to apply pressure to
  * ageable caches.
@@ -458,6 +456,10 @@ int shrink_list(struct list_head *page_l
 			goto keep_locked;
 		}
 
+		if (PageMigration(page)) {
+			write_unlock_irq(&mapping->tree_lock);
+			goto keep_locked;
+		}
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
 			swp_entry_t swap = { .val = page->private };
_
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2004-12-08 13:23                           ` Hirokazu Takahashi
@ 2005-01-17  9:59                             ` Marcelo Tosatti
  2005-01-31 18:33                               ` Ray Bryant
  2005-02-03  2:49                               ` Hirokazu Takahashi
  0 siblings, 2 replies; 48+ messages in thread
From: Marcelo Tosatti @ 2005-01-17  9:59 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: linux-mm, iwamoto, haveblue, hugh

Hi Hirokazu,

On Wed, Dec 08, 2004 at 10:23:07PM +0900, Hirokazu Takahashi wrote:
> Hi Marcelo,
> > > > Sorry for the delay, been busy with other things.
> > > 
> > > No problem. Everyone knows you're doing hard work!
> 
> > > > > Therefore, I made pages removed from the migration cache
> > > > > at the end of generic_migrate_page() if they remain in the cache.
> > 
> > OK, removing migration pages at end of generic_migrate_page() should 
> > avoid the leak - that part of your patch is fine to me!
> > 
> > > > > The another is a fork() related problem. If fork() has occurred
> > > > > during page migrationa, the previous work may not go well.
> > > > > pages may not be removed from the migration cache.
> > 
> > Can you please expand on that one? I assume it works fine because 
> > copy_page_range() duplicates the migration page reference (and the 
> > migration pte), meaning that on exit (zap_pte_range) the migration
> > pages should be removed through migration_remove_entry(). 
> 
> Yes, that's true.
> 
> > I dont see the problem - please correct me.
> 
> However, once the page is moved into the migration cache,
> no one can make it swapped out. This problem may be solved
> by your approach described below.
> 
> > > > > So I made the swapcode ignore pages in the migration cache.
> > > > > However, as you know this is just a workaround and not a correct
> > > > > way to fix it.
> > 
> > What this has to do with fork()? I can't understand.
> 
> fork() may leave some pages in the migration cache with my
> latest implementation, though the memory migration code
> tries to remove them from the migration cache by forcible
> pagefault in touch_unmapped_address().

Why are record_unmapped_address/touch_unmapped_address needed ? 

I started investigating the issue which migration pages couldnt 
be swapped out, but found out that migration pages are never left 
in the cache because touch_unmapped_address recreates the ptes removing
the pages from the migration cache.

That means we have no problem with migration cache pages left pinned 
(unswappable) in memory, which means it is fully functional AFAICT.

However, I thought the intent was to fault the pages on demand? 

I even wrote this to be called at vmscan() time but touch_unmapped_address 
already has similar functionality at migration time.

/*
 * try_to_faultin - force migration-cache pages back into the ptes of
 * every vma that maps them, by taking a fault on each mapping address.
 *
 * Returns 1 if the walk completed (page faulted in everywhere),
 * 0 if the anon_vma was gone or a fault failed.
 */
int try_to_faultin(struct page *page)
{
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
        unsigned long address;
        int ret = 0;

restart:
        anon_vma = page_lock_anon_vma(page);
        if (!anon_vma)
                return ret;

        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
                address = vma_address(page, vma);
                /* page falls outside this vma: nothing to fault in
                 * (original left this as an open TODO, with '=' for '==') */
                if (address == -EFAULT)
                        continue;
                if (!follow_and_check_present(vma->vm_mm, address))
                        continue;

                spin_unlock(&anon_vma->lock);
                switch (handle_mm_fault(vma->vm_mm, vma, address, 0)) {
                case VM_FAULT_MINOR:
                        goto restart;
                case VM_FAULT_MAJOR:
                        BUG();
                case VM_FAULT_SIGBUS:
                case VM_FAULT_OOM:
                        /* lock already dropped above: the original
                         * 'goto out_unlock' double-unlocked anon_vma->lock */
                        return ret;
                }
        }
        ret = 1;
        printk(KERN_ERR "faulted migration page in!\n");

        spin_unlock(&anon_vma->lock);
        return ret;
}


> 
> However, touch_unmapped_address() doesn't know that the
> migration page has been duplicated.
> 
> > Your patch is correct here also - we can't reclaim migration cache 
> > pages.
> > 
> > +	if (PageMigration(page)) {
> > +		write_unlock_irq(&mapping->tree_lock);
> > +		goto keep_locked;
> > +	}
> > 
> > An enhancement would be to force pagefault of all pte's
> > mapping to a migration cache page on shrink_list.  
> >
> > similar to rmap.c's try_to_unmap_anon() but intented to create the pte 
> > instead of unmapping it
> 
> If it works as we expect, this code can be called at the end of
> generic_migrate_page() I guess.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-01-17  9:59                             ` Marcelo Tosatti
@ 2005-01-31 18:33                               ` Ray Bryant
  2005-01-31 18:44                                 ` Marcelo Tosatti
  2005-02-03  2:49                               ` Hirokazu Takahashi
  1 sibling, 1 reply; 48+ messages in thread
From: Ray Bryant @ 2005-01-31 18:33 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Hirokazu Takahashi, linux-mm, iwamoto, haveblue, hugh

Marcelo and Hirokazu,

I've finally gotten around to working on my user controlled page migration 
project.  (What I'm trying to implement is a way for a suitably authorized
user program to request that some or all of the pages or a particular address
space be migrated from one NUMA node to another.)

The first such thing to try to migrate is anonymous, private pages so that
is what I am working on.  To keep things simple, the user program is trying
to migrate part of its own address space.

What I have found out is that this works correctly with the page-migration
patch (extracted from the memory hotplug patch, e. g.:

http://sr71.net/patches/2.6.10/2.6.10-mm1-mhp-test7/page_migration/

) but when I apply the migration cache patch on top what happens is that
the migration appears to occur, but then as soon as I return from the
system call, I get:

VM: killing process test
Killed

(This message comes from ia64_do_page_fault() and appears to be because
handle_mm_fault() returned FAULT_OOM....)

I haven't looked into this further, but was wondering if perhaps one of
you would understand why the migrate cache patch would fail in this way?

Thanks,
-- 
Best Regards,
Ray
-----------------------------------------------
                   Ray Bryant
512-453-9679 (work)         512-507-7807 (cell)
raybry@sgi.com             raybry@austin.rr.com
The box said: "Requires Windows 98 or better",
            so I installed Linux.
-----------------------------------------------
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-01-31 18:33                               ` Ray Bryant
@ 2005-01-31 18:44                                 ` Marcelo Tosatti
  2005-02-02 21:28                                   ` Ray Bryant
  0 siblings, 1 reply; 48+ messages in thread
From: Marcelo Tosatti @ 2005-01-31 18:44 UTC (permalink / raw)
  To: Ray Bryant; +Cc: Hirokazu Takahashi, linux-mm, iwamoto, haveblue, hugh

On Mon, Jan 31, 2005 at 12:33:19PM -0600, Ray Bryant wrote:
> Marcello and Hirokazu,
> 
> I've finally gotten around to working on my user controlled page migration 
> project.  (What I'm trying to implement is a way for a suitably authorized
> user program to request that some or all of the pages or a particular 
> address
> space be migrated from one NUMA node to another.)
> 
> The first such thing to try to migrate is anonymous, private pages so that
> is what I am working on.  To keep things simple, the user program is trying
> to migrate part of its own address space.
> 
> What I have found out is that this works correctly with the page-migration
> patch (extracted from the memory hotplug patch, e. g.:
> 
> http://sr71.net/patches/2.6.10/2.6.10-mm1-mhp-test7/page_migration/
> 
> ) but when I apply the migration cache patch on top what happens is that
> the migration appears to occur, but then as soon as I return from the
> system call, I get:
> 
> VM: killing process test
> Killed
> 
> (This message comes from ia64_do_page_fault() and appears to because
> handle_mm_fault() returned FAULT_OOM....)
> 
> I haven't looked into this further, but was wondering if perhaps one of
> you would understand why the migrate cache patch would fail in this way?

I can't think of anything right now - probably do_wp_page() is returning FAULT_OOM,
can you confirm that?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-01-31 18:44                                 ` Marcelo Tosatti
@ 2005-02-02 21:28                                   ` Ray Bryant
  2005-02-03  2:59                                     ` Hirokazu Takahashi
  2005-02-07 13:16                                     ` Hirokazu Takahashi
  0 siblings, 2 replies; 48+ messages in thread
From: Ray Bryant @ 2005-02-02 21:28 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Hirokazu Takahashi, linux-mm, iwamoto, haveblue, hugh

Marcelo Tosatti wrote:
> 
>>
>>(This message comes from ia64_do_page_fault() and appears to because
>>handle_mm_fault() returned FAULT_OOM....)
>>
>>I haven't looked into this further, but was wondering if perhaps one of
>>you would understand why the migrate cache patch would fail in this way?
> 
> 
> I can't think of anything right now - probably do_wp_page() is returning FAULT_OOM,
> can you confirm that?
> 
No, it doesn't appear to be do_wp_page().  It looks like get_swap_page() 
returns FAULT_OOM followed by get_user_pages() returning FAULT_OOM.
For the page that causes the VM to kill the process, there is no return
from get_user_pages() that returns FAULT_OOM.  Not sure yet what is going
on here.

-- 
-----------------------------------------------
Ray Bryant
512-453-9679 (work)         512-507-7807 (cell)
raybry@sgi.com             raybry@austin.rr.com
The box said: "Requires Windows 98 or better",
	 so I installed Linux.
-----------------------------------------------
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-01-17  9:59                             ` Marcelo Tosatti
  2005-01-31 18:33                               ` Ray Bryant
@ 2005-02-03  2:49                               ` Hirokazu Takahashi
  1 sibling, 0 replies; 48+ messages in thread
From: Hirokazu Takahashi @ 2005-02-03  2:49 UTC (permalink / raw)
  To: marcelo.tosatti; +Cc: linux-mm, iwamoto, haveblue, hugh

Hi Marcelo,

Very sorry for the delayed response.

> Hi Hirokazu,
> 
> On Wed, Dec 08, 2004 at 10:23:07PM +0900, Hirokazu Takahashi wrote:
> > Hi Marcelo,
> > > > > Sorry for the delay, been busy with other things.
> > > > 
> > > > No problem. Everyone knows you're doing hard work!
> > 
> > > > > > Therefore, I made pages removed from the migration cache
> > > > > > at the end of generic_migrate_page() if they remain in the cache.
> > > 
> > > OK, removing migration pages at end of generic_migrate_page() should 
> > > avoid the leak - that part of your patch is fine to me!
> > > 
> > > > > > The another is a fork() related problem. If fork() has occurred
> > > > > > during page migration, the previous work may not go well.
> > > > > > pages may not be removed from the migration cache.
> > > 
> > > Can you please expand on that one? I assume it works fine because 
> > > copy_page_range() duplicates the migration page reference (and the 
> > > migration pte), meaning that on exit (zap_pte_range) the migration
> > > pages should be removed through migration_remove_entry(). 
> > 
> > Yes, that's true.
> > 
> > > I dont see the problem - please correct me.
> > 
> > However, once the page is moved into the migration cache,
> > no one can make it swapped out. This problem may be solved
> > by your approach described below.
> > 
> > > > > > So I made the swapcode ignore pages in the migration cache.
> > > > > > However, as you know this is just a workaround and not a correct
> > > > > > way to fix it.
> > > 
> > > What this has to do with fork()? I can't understand.
> > 
> > fork() may leave some pages in the migration cache with my
> > latest implementation, though the memory migration code
> > tries to remove them from the migration cache by forcible
> > pagefault in touch_unmapped_address().
> 
> Why are record_unmapped_address/touch_unmapped_address needed ? 

There are two reasons.
  1. Migrated pages should be mapped to the process space soon
     if they're mlocked. Or the pages might be swapped out as
     the swap code doesn't care about them if they aren't mapped.

  2. Without this, migrated pages will consume entries of
     the migration cache until the process which the pages
     belong to has died.
     And if they're kept in the migration cache, the pages cannot
     be swapped out even if free pages might become few, as you
     mentioned below.

     Previously this is designed to not consume swap entries
     on real devices.

> I started investigating the issue which migration pages couldnt 
> be swapped out, but found out that migration pages are never left 
> in the cache because touch_unmapped_address recreates the ptes removing
> the pages from the migration cache.

Yes. This is the one of the purpose.

> That means we have no problem with migration cache pages left pinned 
> (unswappable) in memory, which means it is fully functional AFAICT.
> 
> However, I thought the intent was to fault the pages on demand? 

Yes, you're absolutely right.
If there is another better way, I'm pleased to replace it.

> I even wrote this to be called at vmscan() time but touch_unmapped_address 
> already has similar functionality at migration time.

Interesting.
It would work pretty good if fork() is invoked during memory migration.
My touch_unmapped_address approach can't handle this case.

But I'm worried about two things.

 - I wonder if mlocked pages can be handled correctly.
   What would happen if the page has been mlocked and it also belongs
   to the swap cache even though this case is very very rare?

 - I'm not sure deriving anon_vma from the page is always correct
   while it isn't mapped to anywhere.

> int try_to_faultin(struct page *page)
> {
>         struct anon_vma *anon_vma;
>         struct vm_area_struct *vma;
>         unsigned long address;
>         int ret = 0;
> 
> restart:
>         anon_vma = page_lock_anon_vma(page);
>         if (!anon_vma)
>                 return ret;
> 
>         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
>                 address = vma_address(page, vma);
>                 // handle if (address = -EFAULT) ? 
>                 if (!follow_and_check_present(vma->vm_mm, address))
>                         continue;
> 
>                 spin_unlock(&anon_vma->lock);
>                 switch (handle_mm_fault(vma->vm_mm, vma, address, 0)) {
>                 case VM_FAULT_MINOR:
>                         goto restart;
>                 case VM_FAULT_MAJOR:
>                         BUG();
>                 case VM_FAULT_SIGBUS:
>                 case VM_FAULT_OOM:
>                         goto out_unlock;
>                 }
>         }
>         ret = 1;
>         printk(KERN_ERR "faulted migration page in!\n");
> 
> out_unlock:
>         spin_unlock(&anon_vma->lock);
>         return ret;
> 
> }
> 
> 
> > 
> > However, touch_unmapped_address() doesn't know that the
> > migration page has been duplicated.
> > 
> > > Your patch is correct here also - we can't reclaim migration cache 
> > > pages.
> > > 
> > > +	if (PageMigration(page)) {
> > > +		write_unlock_irq(&mapping->tree_lock);
> > > +		goto keep_locked;
> > > +	}
> > > 
> > > An enhancement would be to force pagefault of all pte's
> > > mapping to a migration cache page on shrink_list.  
> > >
> > > similar to rmap.c's try_to_unmap_anon() but intented to create the pte 
> > > instead of unmapping it
> > 
> > If it works as we expect, this code can be called at the end of
> > generic_migrate_page() I guess.
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
> 
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-02-02 21:28                                   ` Ray Bryant
@ 2005-02-03  2:59                                     ` Hirokazu Takahashi
  2005-02-03 15:19                                       ` Ray Bryant
  2005-02-07 13:16                                     ` Hirokazu Takahashi
  1 sibling, 1 reply; 48+ messages in thread
From: Hirokazu Takahashi @ 2005-02-03  2:59 UTC (permalink / raw)
  To: raybry; +Cc: marcelo.tosatti, linux-mm, iwamoto, haveblue, hugh

Hi Ray,

> >>(This message comes from ia64_do_page_fault() and appears to because
> >>handle_mm_fault() returned FAULT_OOM....)
> >>
> >>I haven't looked into this further, but was wondering if perhaps one of
> >>you would understand why the migrate cache patch would fail in this way?
> > 
> > 
> > I can't think of anything right now - probably do_wp_page() is returning FAULT_OOM,
> > can you confirm that?
> > 
> No, it doesn't appear to be do_wp_page().  It looks like get_swap_page() 
> returns FAULT_OOM followed by get_user_pages() returning FAULT_OOM.
> For the page that causes the VM to kill the process, there is no return
> from get_user_pages() that returns FAULT_OOM.  Not sure yet what is going
> on here.

The current implementation requires swap devices to migrate pages.
Have you added any swap devices?

This restriction will be solved with the migration cache Marcelo
is working on.

Thanks,
Hirokazu Takahashi.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-02-03  2:59                                     ` Hirokazu Takahashi
@ 2005-02-03 15:19                                       ` Ray Bryant
  2005-02-04  7:32                                         ` Hirokazu Takahashi
  0 siblings, 1 reply; 48+ messages in thread
From: Ray Bryant @ 2005-02-03 15:19 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: marcelo.tosatti, linux-mm, iwamoto, haveblue, hugh

Hirokazu Takahashi wrote:
> Hi Ray,
> 
> 
>>>>(This message comes from ia64_do_page_fault() and appears to because
>>>>handle_mm_fault() returned FAULT_OOM....)
>>>>
>>>>I haven't looked into this further, but was wondering if perhaps one of
>>>>you would understand why the migrate cache patch would fail in this way?
>>>
>>>
>>>I can't think of anything right now - probably do_wp_page() is returning FAULT_OOM,
>>>can you confirm that?
>>>
>>
>>No, it doesn't appear to be do_wp_page().  It looks like get_swap_page() 
>>returns FAULT_OOM followed by get_user_pages() returning FAULT_OOM.
>>For the page that causes the VM to kill the process, there is no return
>>from get_user_pages() that returns FAULT_OOM.  Not sure yet what is going
>>on here.
> 
> 
> The current implementation requires swap devices to migrate pages.
> Have you added any swap devices?
> 
> This restriction will be solved with the migration cache Marcelo
> is working on.
> 
> Thanks,
> Hirokazu Takahashi.
> 
> 
I'm running with the migration cache patch applied as well.  This is a
requirement for the project I am working on as the customer doesn't want
to swap out pages just to migrate them.

If I take out the migration cache patch, this "VM: killing ..." problem
goes away.   So it has something to do specifically with the migration
cache code.

-- 
-----------------------------------------------
Ray Bryant
512-453-9679 (work)         512-507-7807 (cell)
raybry@sgi.com             raybry@austin.rr.com
The box said: "Requires Windows 98 or better",
	 so I installed Linux.
-----------------------------------------------
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-02-03 15:19                                       ` Ray Bryant
@ 2005-02-04  7:32                                         ` Hirokazu Takahashi
  2005-02-04 16:08                                           ` Ray Bryant
  0 siblings, 1 reply; 48+ messages in thread
From: Hirokazu Takahashi @ 2005-02-04  7:32 UTC (permalink / raw)
  To: raybry; +Cc: marcelo.tosatti, linux-mm, iwamoto, haveblue, hugh

Hi Ray,

I realized the situation.

> >>>>(This message comes from ia64_do_page_fault() and appears to because
> >>>>handle_mm_fault() returned FAULT_OOM....)
> >>>>
> >>>>I haven't looked into this further, but was wondering if perhaps one of
> >>>>you would understand why the migrate cache patch would fail in this way?
> >>>
> >>>
> >>>I can't think of anything right now - probably do_wp_page() is returning FAULT_OOM,
> >>>can you confirm that?
> >>>
> >>
> >>No, it doesn't appear to be do_wp_page().  It looks like get_swap_page() 
> >>returns FAULT_OOM followed by get_user_pages() returning FAULT_OOM.
> >>For the page that causes the VM to kill the process, there is no return
> >>from get_user_pages() that returns FAULT_OOM.  Not sure yet what is going
> >>on here.
> > 
> > 
> > The current implementation requires swap devices to migrate pages.
> > Have you added any swap devices?
> > 
> > This restriction will be solved with the migration cache Marcelo
> > is working on.
> > 
> > Thanks,
> > Hirokazu Takahashi.
> > 
> > 
> I'm running with the migration cache patch applied as well.  This is a
> requirement for the project I am working on as the customer doesn't want
> to swap out pages just to migrated them.

I see.

> If I take out the migration cache patch, this "VM: killing ..." problem
> goes away.   So it has something to do specifically with the migration
> cache code.

I've never seen the message though the migration cache code may have
some bugs. May I ask you some questions about it?

 - Which version of kernel did you use for it?
 - Which migration cache code did you choose?
 - How many nodes, CPUs and memory does your box have?
 - What kind of applications were running on your box?
 - How often did this happened?
 - Did this message appear right after starting the migration?
   Or it appeared some short while later?
 - How the target pages to be migrated were selected?
 - How did you kick memory migration started?
 - Please show me /proc/meminfo when the problem happened.
 - Is it possible to make the same problem on my machine?

And, would you please make your project proceed without the
migration cache code for a while?

Thanks,
Hirokazu Takahashi.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-02-04  7:32                                         ` Hirokazu Takahashi
@ 2005-02-04 16:08                                           ` Ray Bryant
  2005-02-07 12:46                                             ` Hirokazu Takahashi
  0 siblings, 1 reply; 48+ messages in thread
From: Ray Bryant @ 2005-02-04 16:08 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: marcelo.tosatti, linux-mm, iwamoto, haveblue, hugh

Hirokazu Takahashi wrote:
> 
> 
>>If I take out the migration cache patch, this "VM: killing ..." problem
>>goes away.   So it has something to do specifically with the migration
>>cache code.
> 
> 
> I've never seen the message though the migration cache code may have
> some bugs. May I ask you some questions about it?
> 
>  - Which version of kernel did you use for it?

2.6.10.  I pulled enough of the mm fixes (2 patches) so that the base
migration patch from the hotplug tree would work on top of 2.6.10.  AFAIK
the same problem occurs on 2.6.11-mm2 which is where I started with the
migration cache patch.  But I admit I haven't tested it there recently.

>  - Which migration cache code did you choose?

I'm using a version from Dec 8 I grabbed from an email from you to Marcello
titled:  Subject: Re: migration cache, updated

>  - How many nodes, CPUs and memory does your box have?

8 CPU, 4 Node Altix, but I really don't think that is significant.

>  - What kind of applications were running on your box?

Machine was running single user.  The only thing that was running was the
test program that calls the page migration system call I wrote.

>  - How often did this happened?

Every time.

>  - Did this message appear right after starting the migration?

The pages all get migrated and then when the system call initiating all
of this returns, the calling process gets killed. There is a printf following
the system call that doesn't happen; the VM kill occurs first.

>    Or it appeared some short while later?

Immediately on return.

>  - How the target pages to be migrated were selected?

The system call interface specifies a virtual address range and pid.  We
scan through all pages in the vma specified by the address range (the range
is required to be within one vma).  All resident pages in the range are
pulled off of the lru list and added to the list to be passed in to
try_to_migrate_pages().

>  - How did you kick memory migration started?

Via the system call mentioned above.

>  - Please show me /proc/meminfo when the problem happened.

Unfortunately, I don't have that data.  There was lots of memory free,
since I was running single user.

>  - Is it possible to make the same problem on my machine?

I think so.  I'd have to send you my system call code and test programs.
It's not a lot of code on top of the existing page migration patch.

> 
> And, would you please make your project proceed without the
> migration cache code for a while?

I've already done that.  :-)

> 
> Thanks,
> Hirokazu Takahashi.
> 


-- 
-----------------------------------------------
Ray Bryant
512-453-9679 (work)         512-507-7807 (cell)
raybry@sgi.com             raybry@austin.rr.com
The box said: "Requires Windows 98 or better",
	 so I installed Linux.
-----------------------------------------------
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-02-04 16:08                                           ` Ray Bryant
@ 2005-02-07 12:46                                             ` Hirokazu Takahashi
  2005-02-07 20:54                                               ` Ray Bryant
  0 siblings, 1 reply; 48+ messages in thread
From: Hirokazu Takahashi @ 2005-02-07 12:46 UTC (permalink / raw)
  To: raybry; +Cc: marcelo.tosatti, linux-mm, iwamoto, haveblue, hugh

Hi Ray,

> >>If I take out the migration cache patch, this "VM: killing ..." problem
> >>goes away.   So it has something to do specifically with the migration
> >>cache code.
> > 
> > 
> > I've never seen the message though the migration cache code may have
> > some bugs. May I ask you some questions about it?
> > 
> >  - Which version of kernel did you use for it?
> 
> 2.6.10.  I pulled enough of the mm fixes (2 patches) so that the base
> migration patch from the hotplug tree would work on top of 2.6.10.  AFAIK
> the same problem occurs on 2.6.11-mm2 which is where I started with the
> migration cache patch.  But I admit I haven't tested it there recently.

(snip)

> >  - Is it possible to make the same problem on my machine?
> 
> I think so.  I'd have to send you my system call code and test programs.
> Its not a lot of code on top of the existing page migration patch.

Ok, would you post the code on the list?
I'll take a look at it and run on my box.

> > And, would you please make your project proceed without the
> > migration cache code for a while?
> 
> I've already done that.  :-)

Thanks,
Hirokazu Takahashi.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-02-02 21:28                                   ` Ray Bryant
  2005-02-03  2:59                                     ` Hirokazu Takahashi
@ 2005-02-07 13:16                                     ` Hirokazu Takahashi
  1 sibling, 0 replies; 48+ messages in thread
From: Hirokazu Takahashi @ 2005-02-07 13:16 UTC (permalink / raw)
  To: raybry; +Cc: marcelo.tosatti, linux-mm, iwamoto, haveblue, hugh

Hi Ray,

> >>(This message comes from ia64_do_page_fault() and appears to because
> >>handle_mm_fault() returned FAULT_OOM....)
> >>
> >>I haven't looked into this further, but was wondering if perhaps one of
> >>you would understand why the migrate cache patch would fail in this way?
> > 
> > 
> > I can't think of anything right now - probably do_wp_page() is returning FAULT_OOM,
> > can you confirm that?
> > 
> No, it doesn't appear to be do_wp_page().  It looks like get_swap_page() 
> returns FAULT_OOM followed by get_user_pages() returning FAULT_OOM.
> For the page that causes the VM to kill the process, there is no return
> from get_user_pages() that returns FAULT_OOM.  Not sure yet what is going
> on here.

Is get_swap_page() typo of do_swap_page()?

How did you make sure do_swap_page() returned VM_FAULT_OOM?
I feel do_wp_page() might have returned VM_FAULT_OOM.

Would you please insert printks into the code everywhere VM_FAULT_OOM
is handled and let's see what's going on?

Thanks,
Hirokazu Takahashi.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-02-07 12:46                                             ` Hirokazu Takahashi
@ 2005-02-07 20:54                                               ` Ray Bryant
  2005-02-08  2:17                                                 ` Hirokazu Takahashi
  0 siblings, 1 reply; 48+ messages in thread
From: Ray Bryant @ 2005-02-07 20:54 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: marcelo.tosatti, linux-mm, iwamoto, haveblue, hugh

Hirokazu and Marcello,

I'll post my patches ASAP, but that may still be a day or so yet.
There is a lot of development code here that the list may not be
interested in.  I wonder if it would be better just to send the
two of you the code to test?

-- 
Best Regards,
Ray
-----------------------------------------------
                   Ray Bryant
512-453-9679 (work)         512-507-7807 (cell)
raybry@sgi.com             raybry@austin.rr.com
The box said: "Requires Windows 98 or better",
            so I installed Linux.
-----------------------------------------------
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-02-07 20:54                                               ` Ray Bryant
@ 2005-02-08  2:17                                                 ` Hirokazu Takahashi
       [not found]                                                   ` <42083913.9050306@sgi.com>
  0 siblings, 1 reply; 48+ messages in thread
From: Hirokazu Takahashi @ 2005-02-08  2:17 UTC (permalink / raw)
  To: raybry; +Cc: marcelo.tosatti, linux-mm, iwamoto, haveblue, hugh

Hi Ray,

> Hirokazu and Marcello,
> 
> I'll post my patches ASAP, but that may still be a day or so yet.
> There is a lot of development code here that the list may not be
> interested in.  I wonder if it would be better just to send the
> two of you the code to test?

Ok, if you don't want to open it yet :)

Thanks,
Hirokazu Takahashi.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
       [not found]                                                     ` <20050209.151938.63052333.taka@valinux.co.jp>
@ 2005-02-09 20:48                                                       ` Ray Bryant
  0 siblings, 0 replies; 48+ messages in thread
From: Ray Bryant @ 2005-02-09 20:48 UTC (permalink / raw)
  To: Hirokazu Takahashi, linux-mm; +Cc: marcelo.tosatti

Hirokazu and Marcello,

Here's some more information on this problem I am having with the
migration cache.

(The problem is that the test application is failing after it returns
from the system call that migrated some of its address space from node
0 to node 3 on my test box.  When the program touches the first page
in the range that was migrated, the process gets killed because
do_swap_page() returns VM_FAULT_OOM.  The test works fine if I remove
the migration cache patch.)

It looks like the page is flagged as being a migration pte, the page
is found in the migration cache, but then the test 	

            "likely(pte_same(*page_table, orig_pte))"

succeeds.  It's not obvious to me, at the moment, what this is supposed
to be doing.

Here is the code segment from do_swap_page(), with the debug printout
that was triggered:

again:
         if (pte_is_migration(orig_pte)) {
                 page = lookup_migration_cache(entry.val);
                 if (!page) {
                         spin_lock(&mm->page_table_lock);
                         page_table = pte_offset_map(pmd, address);
                         if (likely(pte_same(*page_table, orig_pte))) {
==========================>     DEBUG_VM_KILL(address);
                                 ret = VM_FAULT_OOM;
                         }
                         else
                                 ret = VM_FAULT_MINOR;
                         pte_unmap(page_table);
                         spin_unlock(&mm->page_table_lock);
                         goto out;
                 }



-- 
Best Regards,
Ray
-----------------------------------------------
                   Ray Bryant
512-453-9679 (work)         512-507-7807 (cell)
raybry@sgi.com             raybry@austin.rr.com
The box said: "Requires Windows 98 or better",
            so I installed Linux.
-----------------------------------------------
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
@ 2005-02-06  2:02 Marcelo Tosatti
  0 siblings, 0 replies; 48+ messages in thread
From: Marcelo Tosatti @ 2005-02-06  2:02 UTC (permalink / raw)
  To: raybry, taka, linux-mm, iwamoto, haveblue, hugh

On Fri, Feb 04, 2005 at 10:08:57AM -0600, Ray Bryant wrote:
> Hirokazu Takahashi wrote:
> >
> >
> >>If I take out the migration cache patch, this "VM: killing ..." problem
> >>goes away.   So it has something to do specifically with the migration
> >>cache code.
> >
> >
> >I've never seen the message though the migration cache code may have
> >some bugs. May I ask you some questions about it?
> >
> > - Which version of kernel did you use for it?
> 
> 2.6.10.  I pulled enough of the mm fixes (2 patches) so that the base
> migration patch from the hotplug tree would work on top of 2.6.10.  AFAIK
> the same problem occurs on 2.6.11-mm2 which is where I started with the
> migration cache patch.  But I admit I haven't tested it there recently.

Ray,

A possibility is that lookup_migration_cache() returns NULL, but for some
reason (?) pte_same() fails, giving us VM_FAULT_OOM which results in 
do_page_fault() killing the task.

Can you add a printk in here to confirm this?

do_swap_page():
if (pte_is_migration(orig_pte)) {
+               page = lookup_migration_cache(entry.val);
+               if (!page) {
+                       spin_lock(&mm->page_table_lock);
+                       page_table = pte_offset_map(pmd, address);
+                       if (likely(pte_same(*page_table, orig_pte)))
+                               ret = VM_FAULT_OOM;
+                       else
+                               ret = VM_FAULT_MINOR;
+                       pte_unmap(page_table);
+                       spin_unlock(&mm->page_table_lock);
+                       goto out;
+               }


If that happens not to be the case, please find out what exactly is going
on (ie where the VM_FAULT_OOM is coming from) so we can try to help you. 

Do you have any other VM modifications in this kernel? What are they, except
the process migration code?

BTW, can you please post your process migration code? 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-01-07 16:57             ` migration cache, updated Ray Bryant
@ 2005-01-10 10:07               ` Marcelo Tosatti
  0 siblings, 0 replies; 48+ messages in thread
From: Marcelo Tosatti @ 2005-01-10 10:07 UTC (permalink / raw)
  To: Ray Bryant; +Cc: Hirokazu Takahashi, linux-mm

On Fri, Jan 07, 2005 at 10:57:52AM -0600, Ray Bryant wrote:
> Marcello,
> 
> Attached is a patch which fixes some compiler warnings in mmigrate.c
> that I was getting with the migration cache code.  The only substantive
> change was to change:
> 
>         /* Wait for all operations against the page to finish. */
>         ret = migrate_fn(page, newpage, &vlist);
>         switch (ret) {
>         default:
>                 /* The page is busy. Try it later. */
>                 goto out_busy;
>         case -ENOENT:
>                 /* The file the page belongs to has been truncated. */
>                 page_cache_get(page);
>                 page_cache_release(newpage);
>                 newpage->mapping = NULL;
>                 /* fall thru */
>         case 0:
>                 /* fall thru */
>         }
> 
> in generic_migrate_page(), to:
> 
>         /* Wait for all operations against the page to finish. */
>         ret = migrate_fn(page, newpage, &vlist);
>         switch (ret) {
>         case -ENOENT:
>                 /* The file the page belongs to has been truncated. */
>                 page_cache_get(page);
>                 page_cache_release(newpage);
>                 newpage->mapping = NULL;
>                 /* fall thru */
>         case 0:
>                 break;
>         default:
>                 /* The page is busy. Try it later. */
>                 goto out_busy;
>         }
> 
> This change was made to get rid of the warning:
> 
> mm/mmigrate.c:500: warning: deprecated use of label at end of compound 
> statement
> 
> I suppose you used the previous order to eliminate an extra branch or
> some such.  Do you have any other suggestion on how to eliminate that
> warning?

Hi Ray,

No, I think your change is fine. I'll merge it into my tree (which currently
doesn't exist, yet).

> -- 
> Best Regards,
> Ray
> -----------------------------------------------
>                   Ray Bryant
> 512-453-9679 (work)         512-507-7807 (cell)
> raybry@sgi.com             raybry@austin.rr.com
> The box said: "Requires Windows 98 or better",
>            so I installed Linux.
> -----------------------------------------------

> Index: linux-2.6.10-rc2-mm4-page-migration-only/include/linux/swap.h
> ===================================================================
> --- linux-2.6.10-rc2-mm4-page-migration-only.orig/include/linux/swap.h	2005-01-04 07:55:22.000000000 -0800
> +++ linux-2.6.10-rc2-mm4-page-migration-only/include/linux/swap.h	2005-01-04 08:13:16.000000000 -0800
> @@ -258,7 +258,7 @@ static inline int remove_exclusive_swap_
>  {
>  	return __remove_exclusive_swap_page(p, 0);
>  }
> -extern int migration_remove_entry(swp_entry_t);
> +extern void migration_remove_entry(swp_entry_t);
>  struct backing_dev_info;
>  
>  extern struct swap_list_t swap_list;
> Index: linux-2.6.10-rc2-mm4-page-migration-only/mm/mmigrate.c
> ===================================================================
> --- linux-2.6.10-rc2-mm4-page-migration-only.orig/mm/mmigrate.c	2005-01-04 07:55:22.000000000 -0800
> +++ linux-2.6.10-rc2-mm4-page-migration-only/mm/mmigrate.c	2005-01-04 08:11:51.000000000 -0800
> @@ -79,7 +79,6 @@ struct page *lookup_migration_cache(int 
>  
>  void migration_duplicate(swp_entry_t entry)
>  {
> -	int offset;
>  	struct counter *cnt;
>  
>  	read_lock_irq(&migration_space.tree_lock);
> @@ -96,32 +95,11 @@ void remove_from_migration_cache(struct 
>          idr_remove(&migration_idr, id);
>  	radix_tree_delete(&migration_space.page_tree, id);
>  	ClearPageSwapCache(page);
> -	page->private = NULL;
> +	page->private = 0;
>  	write_unlock_irq(&migration_space.tree_lock);
>  }
>  
> -// FIXME: if the page is locked will it be correctly removed from migr cache?
> -// check races
> -
> -int migration_remove_entry(swp_entry_t entry)
> -{
> -	struct page *page;
> -	
> -	page = find_get_page(&migration_space, entry.val);
> -
> -	if (!page)
> -		BUG();
> -
> -	lock_page(page);	
> -
> -	migration_remove_reference(page, 1);
> -
> -	unlock_page(page);
> -
> -	page_cache_release(page);
> -}
> -
> -int migration_remove_reference(struct page *page, int dec)
> +void migration_remove_reference(struct page *page, int dec)
>  {
>  	struct counter *c;
>  	swp_entry_t entry;
> @@ -145,6 +123,28 @@ int migration_remove_reference(struct pa
>  	}
>  }
>  
> +
> +// FIXME: if the page is locked will it be correctly removed from migr cache?
> +// check races
> +
> +void migration_remove_entry(swp_entry_t entry)
> +{
> +	struct page *page;
> +	
> +	page = find_get_page(&migration_space, entry.val);
> +
> +	if (!page)
> +		BUG();
> +
> +	lock_page(page);	
> +
> +	migration_remove_reference(page, 1);
> +
> +	unlock_page(page);
> +
> +	page_cache_release(page);
> +}
> +
>  int detach_from_migration_cache(struct page *page)
>  {
>  	lock_page(page);	
> @@ -486,9 +486,6 @@ generic_migrate_page(struct page *page, 
>  	/* Wait for all operations against the page to finish. */
>  	ret = migrate_fn(page, newpage, &vlist);
>  	switch (ret) {
> -	default:
> -		/* The page is busy. Try it later. */
> -		goto out_busy;
>  	case -ENOENT:
>  		/* The file the page belongs to has been truncated. */
>  		page_cache_get(page);
> @@ -496,7 +493,10 @@ generic_migrate_page(struct page *page, 
>  		newpage->mapping = NULL;
>  		/* fall thru */
>  	case 0:
> -		/* fall thru */
> +		break;
> +	default:
> +		/* The page is busy. Try it later. */
> +		goto out_busy;
>  	}
>  
>  	arch_migrate_page(page, newpage);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
  2005-01-04 18:26           ` Ray Bryant
@ 2005-01-07 16:57             ` Ray Bryant
  2005-01-10 10:07               ` Marcelo Tosatti
  0 siblings, 1 reply; 48+ messages in thread
From: Ray Bryant @ 2005-01-07 16:57 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Hirokazu Takahashi, linux-mm

[-- Attachment #1: Type: text/plain, Size: 1929 bytes --]

Marcello,

Attached is a patch which fixes some compiler warnings in mmigrate.c
that I was getting with the migration cache code.  The only substantive
change was to change:

         /* Wait for all operations against the page to finish. */
         ret = migrate_fn(page, newpage, &vlist);
         switch (ret) {
         default:
                 /* The page is busy. Try it later. */
                 goto out_busy;
         case -ENOENT:
                 /* The file the page belongs to has been truncated. */
                 page_cache_get(page);
                 page_cache_release(newpage);
                 newpage->mapping = NULL;
                 /* fall thru */
         case 0:
                 /* fall thru */
         }

in generic_migrate_page(), to:

         /* Wait for all operations against the page to finish. */
         ret = migrate_fn(page, newpage, &vlist);
         switch (ret) {
         case -ENOENT:
                 /* The file the page belongs to has been truncated. */
                 page_cache_get(page);
                 page_cache_release(newpage);
                 newpage->mapping = NULL;
                 /* fall thru */
         case 0:
                 break;
         default:
                 /* The page is busy. Try it later. */
                 goto out_busy;
         }

This change was made to get rid of the warning:

mm/mmigrate.c:500: warning: deprecated use of label at end of compound statement

I suppose you used the previous order to eliminate an extra branch or
some such.  Do you have any other suggestion on how to eliminate that
warning?
-- 
Best Regards,
Ray
-----------------------------------------------
                   Ray Bryant
512-453-9679 (work)         512-507-7807 (cell)
raybry@sgi.com             raybry@austin.rr.com
The box said: "Requires Windows 98 or better",
            so I installed Linux.
-----------------------------------------------

[-- Attachment #2: fix-migration-cache-warnings.patch --]
[-- Type: text/plain, Size: 3087 bytes --]

Index: linux-2.6.10-rc2-mm4-page-migration-only/include/linux/swap.h
===================================================================
--- linux-2.6.10-rc2-mm4-page-migration-only.orig/include/linux/swap.h	2005-01-04 07:55:22.000000000 -0800
+++ linux-2.6.10-rc2-mm4-page-migration-only/include/linux/swap.h	2005-01-04 08:13:16.000000000 -0800
@@ -258,7 +258,7 @@ static inline int remove_exclusive_swap_
 {
 	return __remove_exclusive_swap_page(p, 0);
 }
-extern int migration_remove_entry(swp_entry_t);
+extern void migration_remove_entry(swp_entry_t);
 struct backing_dev_info;
 
 extern struct swap_list_t swap_list;
Index: linux-2.6.10-rc2-mm4-page-migration-only/mm/mmigrate.c
===================================================================
--- linux-2.6.10-rc2-mm4-page-migration-only.orig/mm/mmigrate.c	2005-01-04 07:55:22.000000000 -0800
+++ linux-2.6.10-rc2-mm4-page-migration-only/mm/mmigrate.c	2005-01-04 08:11:51.000000000 -0800
@@ -79,7 +79,6 @@ struct page *lookup_migration_cache(int 
 
 void migration_duplicate(swp_entry_t entry)
 {
-	int offset;
 	struct counter *cnt;
 
 	read_lock_irq(&migration_space.tree_lock);
@@ -96,32 +95,11 @@ void remove_from_migration_cache(struct 
         idr_remove(&migration_idr, id);
 	radix_tree_delete(&migration_space.page_tree, id);
 	ClearPageSwapCache(page);
-	page->private = NULL;
+	page->private = 0;
 	write_unlock_irq(&migration_space.tree_lock);
 }
 
-// FIXME: if the page is locked will it be correctly removed from migr cache?
-// check races
-
-int migration_remove_entry(swp_entry_t entry)
-{
-	struct page *page;
-	
-	page = find_get_page(&migration_space, entry.val);
-
-	if (!page)
-		BUG();
-
-	lock_page(page);	
-
-	migration_remove_reference(page, 1);
-
-	unlock_page(page);
-
-	page_cache_release(page);
-}
-
-int migration_remove_reference(struct page *page, int dec)
+void migration_remove_reference(struct page *page, int dec)
 {
 	struct counter *c;
 	swp_entry_t entry;
@@ -145,6 +123,28 @@ int migration_remove_reference(struct pa
 	}
 }
 
+
+// FIXME: if the page is locked will it be correctly removed from migr cache?
+// check races
+
+void migration_remove_entry(swp_entry_t entry)
+{
+	struct page *page;
+	
+	page = find_get_page(&migration_space, entry.val);
+
+	if (!page)
+		BUG();
+
+	lock_page(page);	
+
+	migration_remove_reference(page, 1);
+
+	unlock_page(page);
+
+	page_cache_release(page);
+}
+
 int detach_from_migration_cache(struct page *page)
 {
 	lock_page(page);	
@@ -486,9 +486,6 @@ generic_migrate_page(struct page *page, 
 	/* Wait for all operations against the page to finish. */
 	ret = migrate_fn(page, newpage, &vlist);
 	switch (ret) {
-	default:
-		/* The page is busy. Try it later. */
-		goto out_busy;
 	case -ENOENT:
 		/* The file the page belongs to has been truncated. */
 		page_cache_get(page);
@@ -496,7 +493,10 @@ generic_migrate_page(struct page *page, 
 		newpage->mapping = NULL;
 		/* fall thru */
 	case 0:
-		/* fall thru */
+		break;
+	default:
+		/* The page is busy. Try it later. */
+		goto out_busy;
 	}
 
 	arch_migrate_page(page, newpage);

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: migration cache, updated
@ 2005-01-03 19:25 Ray Bryant
  0 siblings, 0 replies; 48+ messages in thread
From: Ray Bryant @ 2005-01-03 19:25 UTC (permalink / raw)
  To: Marcello Tosatti, Hirokazu Takahashi; +Cc: linux-mm

[-- Attachment #1: Type: text/plain, Size: 1031 bytes --]

Marcello and Takahashi-san,

In working with your migration cache patch, I found out that
if CONFIG_MIGRATE_MEMORY is not set, then the kernel with your patch
applied (on top of my "split out" version of the memory migration
code from the hotplug patch) doesn't link.  (It still expects
migration_space, etc to be defined as externals, and these aren't
defined if CONFIG_MIGRATE_MEMORY is not set.)

Now I realize your patch is probably not "final" (there are a couple
of FIXME's still in there....), but I found the attached patch
useful as it lets my patched kernel compile with or without
CONFIG_MEMORY_MIGRATE set.

I hope you find this useful and will incorporate it into your
migration cache patch.

-- 
Best Regards,
Ray
-----------------------------------------------
                   Ray Bryant
512-453-9679 (work)         512-507-7807 (cell)
raybry@sgi.com             raybry@austin.rr.com
The box said: "Requires Windows 98 or better",
            so I installed Linux.
-----------------------------------------------

[-- Attachment #2: migration_cache_update_fix_link.patch --]
[-- Type: text/plain, Size: 2120 bytes --]

Fix the migration cache patch so that it will link even if
CONFIG_MEMORY_MIGRATE is not set.

Signed-off-by:Ray Bryant <raybry@sgi.com>

Index: linux-2.6.10-rc2-mm4-page-migration-only/include/linux/mm.h
===================================================================
--- linux-2.6.10-rc2-mm4-page-migration-only.orig/include/linux/mm.h	2004-12-29 09:30:00.000000000 -0800
+++ linux-2.6.10-rc2-mm4-page-migration-only/include/linux/mm.h	2004-12-29 09:33:46.000000000 -0800
@@ -279,6 +279,7 @@ struct page {
 #include <linux/swap.h>
 #include <linux/swapops.h> 
 
+#ifdef CONFIG_MEMORY_MIGRATE
 static inline int PageMigration(struct page *page)
 {
         swp_entry_t entry;
@@ -293,7 +294,9 @@ static inline int PageMigration(struct p
 
         return 1;
 }
-
+#else
+#define PageMigration(p)  0
+#endif /* CONFIG_MEMORY_MIGRATE */
 
 /*
  * Methods to modify the page usage count.
@@ -506,9 +509,13 @@ static inline struct address_space *page
 {
 	struct address_space *mapping = page->mapping;
 
+#ifdef CONFIG_MEMORY_MIGRATE
 	if (unlikely(PageMigration(page)))
 		mapping = &migration_space;
 	else if (unlikely(PageSwapCache(page)))
+#else
+ 	if (unlikely(PageSwapCache(page)))
+#endif
 		mapping = &swapper_space;
 	else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
 		mapping = NULL;
Index: linux-2.6.10-rc2-mm4-page-migration-only/include/linux/swapops.h
===================================================================
--- linux-2.6.10-rc2-mm4-page-migration-only.orig/include/linux/swapops.h	2004-12-29 09:30:00.000000000 -0800
+++ linux-2.6.10-rc2-mm4-page-migration-only/include/linux/swapops.h	2004-12-29 09:36:30.000000000 -0800
@@ -70,6 +70,7 @@ static inline pte_t swp_entry_to_pte(swp
 	return __swp_entry_to_pte(arch_entry);
 }
 
+#ifdef CONFIG_MEMORY_MIGRATE
 static inline int pte_is_migration(pte_t pte)
 {
 	unsigned long swp_type;
@@ -81,6 +82,9 @@ static inline int pte_is_migration(pte_t
 
 	return swp_type == MIGRATION_TYPE;
 }
+#else
+#define pte_is_migration(x) 0
+#endif /* CONFIG_MEMORY_MIGRATE */
 
 static inline pte_t migration_entry_to_pte(swp_entry_t entry)
 {

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2005-02-09 20:48 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-10-25 21:39 migration cache, updated Marcelo Tosatti
2004-10-26  1:17 ` Hiroyuki KAMEZAWA
2004-10-26 12:01   ` Marcelo Tosatti
2004-10-26 23:47     ` Hiroyuki KAMEZAWA
2004-10-26  6:37 ` Hirokazu Takahashi
2004-10-26  9:20   ` Marcelo Tosatti
2004-10-26 13:45     ` Hirokazu Takahashi
2004-10-26 11:41       ` Marcelo Tosatti
2004-10-27 13:40       ` Hirokazu Takahashi
2004-10-26  9:15 ` Hirokazu Takahashi
2004-10-26  9:25   ` Marcelo Tosatti
2004-10-26 14:01     ` Hirokazu Takahashi
2004-10-26 12:24       ` Marcelo Tosatti
2004-10-27  7:25         ` IWAMOTO Toshihiro
2004-10-27 16:27           ` Marcelo Tosatti
2004-10-27 13:48         ` Hirokazu Takahashi
2004-10-28 15:19           ` Marcelo Tosatti
2004-10-28 16:05             ` Marcelo Tosatti
2004-10-28 18:51               ` Dave Hansen
2004-10-28 16:26                 ` Marcelo Tosatti
2004-10-28 20:24                   ` Dave Hansen
2004-11-03 15:21                   ` Marcelo Tosatti
2004-11-04  8:01                     ` Hirokazu Takahashi
2004-11-05 13:49               ` Hirokazu Takahashi
2004-11-05 15:16                 ` Marcelo Tosatti
2004-11-16  4:07                   ` Hirokazu Takahashi
2004-11-23 12:14                     ` Marcelo Tosatti
2004-11-24 10:21                       ` Hirokazu Takahashi
2004-12-01 20:21                         ` Marcelo Tosatti
2004-12-08 13:23                           ` Hirokazu Takahashi
2005-01-17  9:59                             ` Marcelo Tosatti
2005-01-31 18:33                               ` Ray Bryant
2005-01-31 18:44                                 ` Marcelo Tosatti
2005-02-02 21:28                                   ` Ray Bryant
2005-02-03  2:59                                     ` Hirokazu Takahashi
2005-02-03 15:19                                       ` Ray Bryant
2005-02-04  7:32                                         ` Hirokazu Takahashi
2005-02-04 16:08                                           ` Ray Bryant
2005-02-07 12:46                                             ` Hirokazu Takahashi
2005-02-07 20:54                                               ` Ray Bryant
2005-02-08  2:17                                                 ` Hirokazu Takahashi
     [not found]                                                   ` <42083913.9050306@sgi.com>
     [not found]                                                     ` <20050209.151938.63052333.taka@valinux.co.jp>
2005-02-09 20:48                                                       ` Ray Bryant
2005-02-07 13:16                                     ` Hirokazu Takahashi
2005-02-03  2:49                               ` Hirokazu Takahashi
2005-01-03 19:04 page migration Ray Bryant
2005-01-03 19:37 ` Dave Hansen
2005-01-03 20:15   ` Ray Bryant
2005-01-04 14:42     ` Hirokazu Takahashi
2005-01-04 17:30       ` Ray Bryant
2005-01-04 17:40         ` process " Dave Hansen
2005-01-04 18:26           ` Ray Bryant
2005-01-07 16:57             ` migration cache, updated Ray Bryant
2005-01-10 10:07               ` Marcelo Tosatti
2005-01-03 19:25 Ray Bryant
2005-02-06  2:02 Marcelo Tosatti

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox