* RFC/Patch Make Page Tables Relocatable Part 2/2 Page Table Migration Code
@ 2008-01-04 16:02 Ross Biro
From: Ross Biro @ 2008-01-04 16:02 UTC (permalink / raw)
To: linux-mm
[-- Attachment #1: Type: text/plain, Size: 469 bytes --]
Here is the code to do the actual relocation. I believe there is a
race in this version of the code: if the relocation code sleeps while
allocating memory and the RCU callback frees the old page tables
before the TLB flush, then another CPU still running the process in
user space could corrupt the page tables. Not really a big deal, but
it needs to be fixed, probably by not scheduling the RCU free until
after all the page tables have been relocated.
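To make the proposed fix concrete, here is a rough sketch of the
deferred-free idea (the batch structure and helper names are
hypothetical, not part of the patch below): queue the old page-table
pages instead of handing each one to RCU as it is copied, and only
schedule the frees once everything has been relocated and the TLB
flushed.

/* Hypothetical sketch only -- not in the patch below.  Old page-table
 * pages are queued on a list (reusing page->lru, which the patch
 * unions with page->rcu) and handed to RCU in one batch after the
 * final TLB flush, so no CPU can still be walking a freed table. */
struct pt_free_batch {
        struct list_head pages;
};

static void pt_free_batch_init(struct pt_free_batch *batch)
{
        INIT_LIST_HEAD(&batch->pages);
}

/* Call this where the patch currently does call_rcu(..., rcu_free_pt). */
static void pt_free_batch_add(struct pt_free_batch *batch, struct page *page)
{
        list_add(&page->lru, &batch->pages);
}

/* Call this once, after the maybe_flush_tlb_mm(mm) at the end of
 * migrate_page_tables_mm(). */
static void pt_free_batch_drain(struct pt_free_batch *batch)
{
        struct page *page, *next;

        list_for_each_entry_safe(page, next, &batch->pages, lru) {
                list_del(&page->lru);
                INIT_RCU_HEAD(&page->rcu);  /* lru and rcu share a union */
                call_rcu(&page->rcu, rcu_free_pt);
        }
}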
Ross
[-- Attachment #2: relocate.patch --]
[-- Type: application/octet-stream, Size: 24506 bytes --]
diff -urwNbB 2.6.23/arch/i386/mm/hugetlbpage.c 2.6.23a/arch/i386/mm/hugetlbpage.c
--- 2.6.23/arch/i386/mm/hugetlbpage.c 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/arch/i386/mm/hugetlbpage.c 2007-10-29 09:48:48.000000000 -0700
@@ -87,6 +87,7 @@
goto out;
spin_lock(&mm->page_table_lock);
+ delimbo_pud(&pud, mm, addr);
if (pud_none(*pud))
pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
else
diff -urwNbB 2.6.23/arch/powerpc/mm/fault.c 2.6.23a/arch/powerpc/mm/fault.c
--- 2.6.23/arch/powerpc/mm/fault.c 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/arch/powerpc/mm/fault.c 2007-10-29 09:38:09.000000000 -0700
@@ -301,6 +301,8 @@
if (get_pteptr(mm, address, &ptep, &pmdp)) {
spinlock_t *ptl = pte_lockptr(mm, pmdp);
spin_lock(ptl);
+ delimbo_pte(&ptep, &ptl, &pmdp, mm, address);
+
if (pte_present(*ptep)) {
struct page *page = pte_page(*ptep);
diff -urwNbB 2.6.23/arch/powerpc/mm/hugetlbpage.c 2.6.23a/arch/powerpc/mm/hugetlbpage.c
--- 2.6.23/arch/powerpc/mm/hugetlbpage.c 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/arch/powerpc/mm/hugetlbpage.c 2007-10-29 09:53:36.000000000 -0700
@@ -77,6 +77,7 @@
return -ENOMEM;
spin_lock(&mm->page_table_lock);
+ delimbo_hpd(&hpdp, mm, address);
if (!hugepd_none(*hpdp))
kmem_cache_free(huge_pgtable_cache, new);
else
diff -urwNbB 2.6.23/arch/ppc/mm/fault.c 2.6.23a/arch/ppc/mm/fault.c
--- 2.6.23/arch/ppc/mm/fault.c 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/arch/ppc/mm/fault.c 2007-10-29 09:38:19.000000000 -0700
@@ -219,6 +219,7 @@
if (get_pteptr(mm, address, &ptep, &pmdp)) {
spinlock_t *ptl = pte_lockptr(mm, pmdp);
spin_lock(ptl);
+ delimbo_pte(&ptep, &ptl, &pmdp, mm, address);
if (pte_present(*ptep)) {
struct page *page = pte_page(*ptep);
diff -urwNbB 2.6.23/arch/x86_64/kernel/smp.c 2.6.23a/arch/x86_64/kernel/smp.c
--- 2.6.23/arch/x86_64/kernel/smp.c 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/arch/x86_64/kernel/smp.c 2008-01-02 09:07:04.000000000 -0800
@@ -56,6 +56,7 @@
struct mm_struct *flush_mm;
unsigned long flush_va;
#define FLUSH_ALL -1ULL
+#define RELOAD_ALL -2ULL
spinlock_t tlbstate_lock;
};
char pad[SMP_CACHE_BYTES];
@@ -155,6 +156,8 @@
if (read_pda(mmu_state) == TLBSTATE_OK) {
if (f->flush_va == FLUSH_ALL)
local_flush_tlb();
+ else if (f->flush_va == RELOAD_ALL)
+ local_reload_tlb_mm(f->flush_mm);
else
__flush_tlb_one(f->flush_va);
} else
@@ -225,10 +228,34 @@
}
EXPORT_SYMBOL(flush_tlb_current_task);
+void reload_tlb_mm(struct mm_struct *mm)
+{
+ cpumask_t cpu_mask;
+
+ clear_bit(MMF_NEED_RELOAD, &mm->flags);
+
+ preempt_disable();
+ cpu_mask = mm->cpu_vm_mask;
+ cpu_clear(smp_processor_id(), cpu_mask);
+
+ if (current->active_mm == mm) {
+ if (current->mm)
+ local_reload_tlb_mm(mm);
+ else
+ leave_mm(smp_processor_id());
+ }
+ if (!cpus_empty(cpu_mask))
+ flush_tlb_others(cpu_mask, mm, RELOAD_ALL);
+
+ preempt_enable();
+}
+
void flush_tlb_mm (struct mm_struct * mm)
{
cpumask_t cpu_mask;
+ clear_bit(MMF_NEED_FLUSH, &mm->flags);
+
preempt_disable();
cpu_mask = mm->cpu_vm_mask;
cpu_clear(smp_processor_id(), cpu_mask);
diff -urwNbB 2.6.23/include/asm-generic/pgtable.h 2.6.23a/include/asm-generic/pgtable.h
--- 2.6.23/include/asm-generic/pgtable.h 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/include/asm-generic/pgtable.h 2008-01-04 07:36:41.000000000 -0800
@@ -4,6 +4,8 @@
#ifndef __ASSEMBLY__
#ifdef CONFIG_MMU
+#include <linux/sched.h>
+
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
* Largely same as above, but only sets the access flags (dirty,
@@ -199,6 +201,88 @@
}
return 0;
}
+
+
+/* Used to re-walk the page tables if, after we grab the appropriate lock,
+ we end up with a page that's just waiting to go away. */
+static inline pgd_t *walk_page_table_pgd(struct mm_struct *mm,
+ unsigned long addr)
+{
+ return pgd_offset(mm, addr);
+}
+
+static inline pud_t *walk_page_table_pud(struct mm_struct *mm,
+ unsigned long addr) {
+ pgd_t *pgd;
+ pgd = walk_page_table_pgd(mm, addr);
+ BUG_ON(!pgd);
+ return pud_offset(pgd, addr);
+}
+
+static inline pmd_t *walk_page_table_pmd(struct mm_struct *mm,
+ unsigned long addr)
+{
+ pud_t *pud;
+ pud = walk_page_table_pud(mm, addr);
+ BUG_ON(!pud);
+ return pmd_offset(pud, addr);
+}
+
+static inline pte_t *walk_page_table_pte(struct mm_struct *mm,
+ unsigned long addr)
+{
+ pmd_t *pmd;
+ pmd = walk_page_table_pmd(mm, addr);
+ BUG_ON(!pmd);
+ return pte_offset_map(pmd, addr);
+}
+
+static inline pte_t *walk_page_table_huge_pte(struct mm_struct *mm,
+ unsigned long addr)
+{
+ return (pte_t *)walk_page_table_pmd(mm, addr);
+}
+
+
+#ifdef ARCH_HAS_FORCE_PAGE_TABLE_RELOAD
+static inline void maybe_force_reload_tlb_mm(struct mm_struct *mm)
+{
+ if (test_and_clear_bit(MMF_NEED_RELOAD, &mm->flags))
+ force_reload_tlb_mm(mm);
+}
+
+static inline int maybe_need_force_reload_mm(struct mm_struct *mm)
+{
+ if (!cpus_empty(mm->cpu_vm_mask)) {
+ set_bit(MMF_NEED_RELOAD, &mm->flags);
+ return 1;
+ }
+ return 0;
+}
+
+static inline int migrate_top_level_page_table(struct mm_struct *mm,
+ struct page *dest)
+{
+ unsigned long flags;
+ void *dest_ptr;
+
+ spin_lock_irqsave(&mm->page_table_lock, flags);
+ dest_ptr = page_address(dest);
+ memcpy(dest_ptr, mm->pgd, PAGE_SIZE);
+ INIT_RCU_HEAD(&(virt_to_page(mm->pgd)->rcu));
+ call_rcu(&(virt_to_page(mm->pgd)->rcu), rcu_free_pt);
+ mm->pgd = (pgd_t *)dest_ptr;
+ maybe_need_force_reload_mm(mm);
+ spin_unlock_irqrestore(&mm->page_table_lock, flags);
+ return 0;
+}
+#else
+static inline int migrate_top_level_page_table(struct mm_struct *mm,
+ struct page *dest) {
+ return 1;
+}
+#endif
+
#endif /* CONFIG_MMU */
/*
diff -urwNbB 2.6.23/include/asm-x86_64/tlbflush.h 2.6.23a/include/asm-x86_64/tlbflush.h
--- 2.6.23/include/asm-x86_64/tlbflush.h 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/include/asm-x86_64/tlbflush.h 2008-01-04 06:38:28.000000000 -0800
@@ -6,6 +6,13 @@
#include <asm/processor.h>
#include <asm/system.h>
+#define ARCH_HAS_TLB_RELOAD
+static inline void load_cr3(pgd_t *pgd);
+static inline void __reload_tlb_mm(struct mm_struct *mm)
+{
+ load_cr3(mm->pgd);
+}
+
static inline void __flush_tlb(void)
{
write_cr3(read_cr3());
@@ -44,6 +50,12 @@
#define flush_tlb_all() __flush_tlb_all()
#define local_flush_tlb() __flush_tlb()
+static inline void reload_tlb_mm(struct mm_struct *mm)
+{
+ if (mm == current->active_mm)
+ __reload_tlb_mm(mm);
+}
+
static inline void flush_tlb_mm(struct mm_struct *mm)
{
if (mm == current->active_mm)
@@ -71,6 +83,10 @@
#define local_flush_tlb() \
__flush_tlb()
+#define local_reload_tlb_mm(mm) \
+ __reload_tlb_mm(mm)
+
+extern void reload_tlb_mm(struct mm_struct *mm);
extern void flush_tlb_all(void);
extern void flush_tlb_current_task(void);
extern void flush_tlb_mm(struct mm_struct *);
@@ -106,4 +122,9 @@
by the normal TLB flushing algorithms. */
}
+/* Include this at the end because it needs TLBSTATE_OK et al. defined
+ above; we need it at all because we use load_cr3. */
+#include <asm/mmu_context.h>
+
+
#endif /* _X8664_TLBFLUSH_H */
diff -urwNbB 2.6.23/include/linux/mm.h 2.6.23a/include/linux/mm.h
--- 2.6.23/include/linux/mm.h 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/include/linux/mm.h 2008-01-02 08:07:47.000000000 -0800
@@ -14,6 +14,7 @@
#include <linux/debug_locks.h>
#include <linux/backing-dev.h>
#include <linux/mm_types.h>
+#include <asm/pgtable.h>
struct mempolicy;
struct anon_vma;
@@ -935,6 +936,7 @@
pte_t *__pte = pte_offset_map(pmd, address); \
*(ptlp) = __ptl; \
spin_lock(__ptl); \
+ delimbo_pte(&__pte, ptlp, &pmd, mm, address); \
__pte; \
})
@@ -959,6 +962,62 @@
extern void free_area_init_node(int nid, pg_data_t *pgdat,
unsigned long * zones_size, unsigned long zone_start_pfn,
unsigned long *zholes_size);
+
+
+
+static inline void delimbo_pte(pte_t **pte, spinlock_t **ptl, pmd_t **pmd,
+ struct mm_struct *mm,
+ unsigned long addr)
+{
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+ spin_unlock(*ptl);
+#endif
+ pte_unmap(*pte);
+ *pmd = walk_page_table_pmd(mm, addr);
+ *pte = pte_offset_map(*pmd, addr);
+ *ptl = pte_lockptr(mm, *pmd);
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+ spin_lock(*ptl);
+#endif
+}
+
+static inline void delimbo_pte_nested(pte_t **pte, spinlock_t **ptl,
+ pmd_t **pmd,
+ struct mm_struct *mm,
+ unsigned long addr, int subclass)
+{
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+ spin_unlock(*ptl);
+#endif
+ *pmd = walk_page_table_pmd(mm, addr);
+ *pte = pte_offset_map(*pmd, addr);
+ *ptl = pte_lockptr(mm, *pmd);
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+ spin_lock_nested(*ptl, subclass);
+#endif
+}
+
+static inline void delimbo_pud(pud_t **pud, struct mm_struct *mm,
+ unsigned long addr) {
+ *pud = walk_page_table_pud(mm, addr);
+}
+
+static inline void delimbo_pmd(pmd_t **pmd, struct mm_struct *mm,
+ unsigned long addr) {
+ *pmd = walk_page_table_pmd(mm, addr);
+}
+
+static inline void delimbo_pgd(pgd_t **pgd, struct mm_struct *mm,
+ unsigned long addr) {
+ *pgd = walk_page_table_pgd(mm, addr);
+}
+
+static inline void delimbo_huge_pte(pte_t **pte, struct mm_struct *mm,
+ unsigned long addr) {
+ *pte = walk_page_table_huge_pte(mm, addr);
+}
+
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
/*
* With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its
diff -urwNbB 2.6.23/include/linux/mm_types.h 2.6.23a/include/linux/mm_types.h
--- 2.6.23/include/linux/mm_types.h 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/include/linux/mm_types.h 2008-01-02 08:06:09.000000000 -0800
@@ -5,6 +5,7 @@
#include <linux/threads.h>
#include <linux/list.h>
#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
struct address_space;
@@ -61,9 +62,18 @@
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* SLUB: freelist req. slab lock */
};
+
+ union {
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
*/
+ struct rcu_head rcu; /* Used by page table relocation code
+ * to remember page for later freeing,
+ * after we are sure anyone
+ * poking at the page tables is no
+ * longer looking at this page.
+ */
+ };
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
diff -urwNbB 2.6.23/mm/hugetlb.c 2.6.23a/mm/hugetlb.c
--- 2.6.23/mm/hugetlb.c 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/mm/hugetlb.c 2007-10-30 07:32:50.000000000 -0700
@@ -379,6 +379,8 @@
goto nomem;
spin_lock(&dst->page_table_lock);
spin_lock(&src->page_table_lock);
+ delimbo_huge_pte(&src_pte, src, addr);
+ delimbo_huge_pte(&dst_pte, dst, addr);
if (!pte_none(*src_pte)) {
if (cow)
ptep_set_wrprotect(src, addr, src_pte);
@@ -551,6 +553,7 @@
}
spin_lock(&mm->page_table_lock);
+ delimbo_huge_pte(&ptep, mm, address);
size = i_size_read(mapping->host) >> HPAGE_SHIFT;
if (idx >= size)
goto backout;
@@ -609,6 +612,7 @@
ret = 0;
spin_lock(&mm->page_table_lock);
+ delimbo_huge_pte(&ptep, mm, address);
/* Check for a racing update before calling hugetlb_cow */
if (likely(pte_same(entry, *ptep)))
if (write_access && !pte_write(entry))
diff -urwNbB 2.6.23/mm/memory.c 2.6.23a/mm/memory.c
--- 2.6.23/mm/memory.c 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/mm/memory.c 2007-11-30 08:06:17.000000000 -0800
@@ -306,6 +306,7 @@
pte_lock_init(new);
spin_lock(&mm->page_table_lock);
+ delimbo_pmd(&pmd, mm, address);
if (pmd_present(*pmd)) { /* Another has populated it */
pte_lock_deinit(new);
pte_free(new);
@@ -325,6 +326,7 @@
return -ENOMEM;
spin_lock(&init_mm.page_table_lock);
+ delimbo_pmd(&pmd, &init_mm, address);
if (pmd_present(*pmd)) /* Another has populated it */
pte_free_kernel(new);
else
@@ -504,6 +506,8 @@
src_pte = pte_offset_map_nested(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+ delimbo_pte(&src_pte, &src_ptl, &src_pmd, src_mm, addr);
arch_enter_lazy_mmu_mode();
do {
@@ -1558,13 +1562,15 @@
* and do_anonymous_page and do_no_page can safely check later on).
*/
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
- pte_t *page_table, pte_t orig_pte)
+ pte_t *page_table, pte_t orig_pte,
+ unsigned long address)
{
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
if (sizeof(pte_t) > sizeof(unsigned long)) {
spinlock_t *ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
+ delimbo_pte(&page_table, &ptl, &pmd, mm, address);
same = pte_same(*page_table, orig_pte);
spin_unlock(ptl);
}
@@ -2153,7 +2159,7 @@
pte_t pte;
int ret = 0;
- if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte, address))
goto out;
entry = pte_to_swp_entry(orig_pte);
@@ -2227,6 +2233,10 @@
}
/* No need to invalidate - it was non-present before */
+ /* Unless of course the cpu might be looking at an old
+ copy of the pte. */
+ maybe_flush_tlb_mm(mm);
+
update_mmu_cache(vma, address, pte);
unlock:
pte_unmap_unlock(page_table, ptl);
@@ -2279,6 +2289,7 @@
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
+ delimbo_pte(&page_table, &ptl, &pmd, mm, address);
if (!pte_none(*page_table))
goto release;
inc_mm_counter(mm, file_rss);
@@ -2288,6 +2299,10 @@
set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
+ /* Unless of course the cpu might be looking at an old
+ copy of the pte. */
+ maybe_flush_tlb_mm(mm);
+
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
unlock:
@@ -2441,6 +2456,10 @@
}
/* no need to invalidate: a not-present page won't be cached */
+ /* Unless of course the cpu could be looking at an old page
+ table entry. */
+ maybe_flush_tlb_mm(mm);
+
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
} else {
@@ -2544,7 +2563,7 @@
(write_access ? FAULT_FLAG_WRITE : 0);
pgoff_t pgoff;
- if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte, address))
return 0;
if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
@@ -2603,6 +2622,7 @@
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
+ delimbo_pte(&pte, &ptl, &pmd, mm, address);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (write_access) {
@@ -2625,6 +2645,12 @@
if (write_access)
flush_tlb_page(vma, address);
}
+
+ /* if the cpu could be looking at an old page table, we need to
+ flush out everything. */
+ maybe_flush_tlb_mm(mm);
+
+
unlock:
pte_unmap_unlock(pte, ptl);
return 0;
@@ -2674,6 +2700,7 @@
return -ENOMEM;
spin_lock(&mm->page_table_lock);
+ delimbo_pgd(&pgd, mm, address);
if (pgd_present(*pgd)) /* Another has populated it */
pud_free(new);
else
@@ -2695,6 +2722,7 @@
return -ENOMEM;
spin_lock(&mm->page_table_lock);
+ delimbo_pud(&pud, mm, address);
#ifndef __ARCH_HAS_4LEVEL_HACK
if (pud_present(*pud)) /* Another has populated it */
pmd_free(new);
diff -urwNbB 2.6.23/mm/mempolicy.c 2.6.23a/mm/mempolicy.c
--- 2.6.23/mm/mempolicy.c 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/mm/mempolicy.c 2008-01-02 07:59:05.000000000 -0800
@@ -101,6 +101,11 @@
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;
+
+int migrate_page_tables_mm(struct mm_struct *mm, int source,
+ new_page_t get_new_page, unsigned long private);
+
+
/* Highest zone. An specific allocation for a zone below that is not
policied. */
enum zone_type policy_zone = 0;
@@ -597,6 +602,11 @@
return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
+static struct page *new_node_page_page_tables(struct page *page,
+ unsigned long node, int **x) {
+ return alloc_pages_node(node, GFP_USER, 0);
+}
+
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
@@ -616,6 +626,10 @@
if (!list_empty(&pagelist))
err = migrate_pages(&pagelist, new_node_page, dest);
+ if (!err)
+ err = migrate_page_tables_mm(mm, source,
+ new_node_page_page_tables, dest);
+
return err;
}
diff -urwNbB 2.6.23/mm/migrate.c 2.6.23a/mm/migrate.c
--- 2.6.23/mm/migrate.c 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/mm/migrate.c 2008-01-04 07:37:37.000000000 -0800
@@ -28,9 +28,14 @@
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
-
+#include <linux/mm.h>
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
#include "internal.h"
+int migrate_page_tables_mm(struct mm_struct *mm, int source,
+ new_page_t get_new_page, unsigned long private);
+
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
/*
@@ -158,6 +163,7 @@
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
+ delimbo_pte(&ptep, &ptl, &pmd, mm, addr);
pte = *ptep;
if (!is_swap_pte(pte))
goto out;
@@ -859,9 +865,10 @@
err = migrate_pages(&pagelist, new_page_node,
(unsigned long)pm);
else
- err = -ENOENT;
+ err = 0;
up_read(&mm->mmap_sem);
+
return err;
}
@@ -1039,3 +1046,260 @@
}
return err;
}
+
+static void rcu_free_pt(struct rcu_head *head)
+{
+ /* Need to know that the mm has been flushed before
+ * we get here. Otherwise we need a way to find
+ * the appropriate mm to flush.
+ */
+ struct page *page = container_of(head, struct page, rcu);
+ __free_page(page);
+}
+
+int migrate_pgd(pgd_t *pgd, struct mm_struct *mm,
+ unsigned long addr, struct page *dest)
+{
+ unsigned long flags;
+ void *dest_ptr;
+ pud_t *pud;
+
+ spin_lock_irqsave(&mm->page_table_lock, flags);
+
+ delimbo_pgd(&pgd, mm, addr);
+
+ pud = pud_offset(pgd, addr);
+ dest_ptr = page_address(dest);
+ memcpy(dest_ptr, pud, PAGE_SIZE);
+
+ INIT_RCU_HEAD(&pgd_page(*pgd)->rcu);
+ call_rcu(&(pgd_page(*pgd)->rcu), rcu_free_pt);
+ pgd_populate(mm, pgd, dest_ptr);
+
+ flush_tlb_pgtables(mm, addr,
+ addr + (1 << PMD_SHIFT)
+ - 1);
+
+ maybe_need_flush_mm(mm);
+
+ spin_unlock_irqrestore(&mm->page_table_lock, flags);
+
+ return 0;
+
+}
+
+int migrate_pud(pud_t *pud, struct mm_struct *mm, unsigned long addr,
+ struct page *dest)
+{
+ unsigned long flags;
+ void *dest_ptr;
+ pmd_t *pmd;
+
+ spin_lock_irqsave(&mm->page_table_lock, flags);
+
+ delimbo_pud(&pud, mm, addr);
+ pmd = pmd_offset(pud, addr);
+
+ dest_ptr = page_address(dest);
+ memcpy(dest_ptr, pmd, PAGE_SIZE);
+
+ INIT_RCU_HEAD(&pud_page(*pud)->rcu);
+ call_rcu(&(pud_page(*pud)->rcu), rcu_free_pt);
+ pud_populate(mm, pud, dest_ptr);
+ flush_tlb_pgtables(mm, addr,
+ addr + (1 << PMD_SHIFT)
+ - 1);
+ maybe_need_flush_mm(mm);
+
+ spin_unlock_irqrestore(&mm->page_table_lock, flags);
+
+ return 0;
+}
+
+
+int migrate_pmd(pmd_t *pmd, struct mm_struct *mm, unsigned long addr,
+ struct page *dest)
+{
+ unsigned long flags;
+ void *dest_ptr;
+ spinlock_t *ptl;
+ pte_t *pte;
+
+ spin_lock_irqsave(&mm->page_table_lock, flags);
+
+ delimbo_pmd(&pmd, mm, addr);
+
+ /* this could happen if the page table has been swapped out and we
+ were looking at the old one. */
+ if (unlikely(!pmd_present(*pmd))) {
+ spin_unlock_irqrestore(&mm->page_table_lock, flags);
+ return 1;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+
+ /* We need the page lock as well. */
+ if (ptl != &mm->page_table_lock)
+ spin_lock(ptl);
+
+ pte = pte_offset_map(pmd, addr);
+
+ dest_ptr = kmap_atomic(dest, KM_USER0);
+ memcpy(dest_ptr, pte, PAGE_SIZE);
+ INIT_RCU_HEAD(&pmd_page(*pmd)->rcu);
+ call_rcu(&(pmd_page(*pmd)->rcu), rcu_free_pt);
+ kunmap_atomic(dest, KM_USER0);
+ pte_unmap(pte);
+ pte_lock_init(dest);
+ pmd_populate(NULL, pmd, dest);
+ flush_tlb_pgtables(mm, addr,
+ addr + (1 << PMD_SHIFT)
+ - 1);
+ maybe_need_flush_mm(mm);
+
+ if (ptl != &mm->page_table_lock)
+ spin_unlock(ptl);
+
+ spin_unlock_irqrestore(&mm->page_table_lock, flags);
+
+ return 0;
+}
+
+static int migrate_page_tables_pmd(pmd_t *pmd, struct mm_struct *mm,
+ unsigned long *address, int source,
+ new_page_t get_new_page,
+ unsigned long private)
+{
+ int pages_not_migrated = 0;
+ int *result = NULL;
+ struct page *old_page = virt_to_page(pmd);
+ struct page *new_page;
+
+ if (!pmd_present(*pmd)) {
+ *address += (unsigned long)PTRS_PER_PTE * PAGE_SIZE;
+ return 0;
+ }
+
+ if (page_to_nid(old_page) == source) {
+ new_page = get_new_page(old_page, private, &result);
+ if (!new_page)
+ return -ENOMEM;
+ pages_not_migrated += migrate_pmd(pmd, mm, *address, new_page);
+ }
+ *address += (unsigned long)PTRS_PER_PTE * PAGE_SIZE;
+
+ return pages_not_migrated;
+}
+
+static int migrate_page_tables_pud(pud_t *pud, struct mm_struct *mm,
+ unsigned long *address, int source,
+ new_page_t get_new_page,
+ unsigned long private)
+{
+ int pages_not_migrated = 0;
+ int i;
+ int *result = NULL;
+ struct page *old_page = virt_to_page(pud);
+ struct page *new_page;
+
+ if (!pud_present(*pud)) {
+ *address += (unsigned long)PTRS_PER_PMD *
+ (unsigned long)PTRS_PER_PTE * PAGE_SIZE;
+ return 0;
+ }
+
+ if (page_to_nid(old_page) == source) {
+ new_page = get_new_page(old_page, private, &result);
+ if (!new_page)
+ return -ENOMEM;
+
+ pages_not_migrated += migrate_pud(pud, mm, *address, new_page);
+ }
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ int ret;
+ ret = migrate_page_tables_pmd(pmd_offset(pud, *address), mm,
+ address, source,
+ get_new_page, private);
+ if (ret < 0)
+ return ret;
+ pages_not_migrated += ret;
+ }
+
+ return pages_not_migrated;
+}
+
+static int migrate_page_tables_pgd(pgd_t *pgd, struct mm_struct *mm,
+ unsigned long *address, int source,
+ new_page_t get_new_page,
+ unsigned long private)
+{
+ int pages_not_migrated = 0;
+ int i;
+ int *result = NULL;
+ struct page *old_page = virt_to_page(pgd);
+ struct page *new_page;
+
+ if (!pgd_present(*pgd)) {
+ *address += (unsigned long)PTRS_PER_PUD *
+ (unsigned long)PTRS_PER_PMD *
+ (unsigned long)PTRS_PER_PTE * PAGE_SIZE;
+ return 0;
+ }
+
+ if (page_to_nid(old_page) == source) {
+ new_page = get_new_page(old_page, private, &result);
+ if (!new_page)
+ return -ENOMEM;
+
+ pages_not_migrated += migrate_pgd(pgd, mm, *address, new_page);
+ }
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ int ret;
+ ret = migrate_page_tables_pud(pud_offset(pgd, *address), mm,
+ address, source,
+ get_new_page, private);
+ if (ret < 0)
+ return ret;
+ pages_not_migrated += ret;
+ }
+ return pages_not_migrated;
+}
+
+/* Similar to migrate_pages(), but migrates the page tables. */
+int migrate_page_tables_mm(struct mm_struct *mm, int source,
+ new_page_t get_new_page, unsigned long private)
+{
+ int pages_not_migrated = 0;
+ int i;
+ int *result = NULL;
+ struct page *old_page = virt_to_page(mm->pgd);
+ struct page *new_page;
+ unsigned long address = 0UL;
+
+ if (mm->pgd == NULL)
+ return 0;
+
+ if (page_to_nid(old_page) == source) {
+ new_page = get_new_page(old_page, private, &result);
+ if (!new_page)
+ return -ENOMEM;
+ pages_not_migrated += migrate_top_level_page_table(mm,
+ new_page);
+ }
+
+ for (i = 0; i < PTRS_PER_PGD && address < mm->task_size; i++) {
+ int ret;
+ ret = migrate_page_tables_pgd(pgd_offset(mm, address), mm,
+ &address, source,
+ get_new_page, private);
+ if (ret < 0)
+ return ret;
+
+ pages_not_migrated += ret;
+ }
+
+ /* flush the tlbs if necessary. */
+ maybe_flush_tlb_mm(mm);
+ return pages_not_migrated;
+}
diff -urwNbB 2.6.23/mm/mremap.c 2.6.23a/mm/mremap.c
--- 2.6.23/mm/mremap.c 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/mm/mremap.c 2007-10-30 06:57:49.000000000 -0700
@@ -98,6 +98,7 @@
new_ptl = pte_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+ delimbo_pte(&new_pte, &new_ptl, &new_pmd, mm, new_addr);
arch_enter_lazy_mmu_mode();
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
diff -urwNbB 2.6.23/mm/rmap.c 2.6.23a/mm/rmap.c
--- 2.6.23/mm/rmap.c 2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/mm/rmap.c 2007-10-29 09:46:25.000000000 -0700
@@ -254,6 +254,7 @@
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
+ delimbo_pte(&pte, &ptl, &pmd, mm, address);
if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
*ptlp = ptl;
return pte;
* Re: RFC/Patch Make Page Tables Relocatable Part 2/2 Page Table Migration Code
@ 2008-01-07 19:49 ` Christoph Lameter
From: Christoph Lameter @ 2008-01-07 19:49 UTC (permalink / raw)
To: Ross Biro; +Cc: linux-mm
On Fri, 4 Jan 2008, Ross Biro wrote:
> Here is the code to do the actual relocation. I believe there is a
> race in this version of the code: if the relocation code sleeps while
> allocating memory and the RCU callback frees the old page tables
> before the TLB flush, then another CPU still running the process in
> user space could corrupt the page tables. Not really a big deal, but
> it needs to be fixed, probably by not scheduling the RCU free until
> after all the page tables have been relocated.
Interesting approach. It moves all page table pages even if only a subset
of the address space was migrated?
* Re: RFC/Patch Make Page Tables Relocatable Part 2/2 Page Table Migration Code
@ 2008-01-07 20:17 ` Ross Biro
From: Ross Biro @ 2008-01-07 20:17 UTC (permalink / raw)
To: Christoph Lameter; +Cc: linux-mm
On Jan 7, 2008 2:49 PM, Christoph Lameter <clameter@sgi.com> wrote:
> Interesting approach. It moves all page table pages even if only a subset
> of the address space was migrated?
That's a side effect of not understanding the page migration code. I
couldn't figure out where to hook into the existing code, so I did it
the easy way and migrated everything. Passing around an address range
or other subset representation and only migrating some of the page
tables would be a trivial addition (sketched below), but not one that
makes any sense until the page table migration code is integrated
properly into the rest of the migration code. I'm assuming someone
will point out the proper place to hook into the existing code, and
then migrating only a portion of the page tables will be easy.
Ross