linux-mm.kvack.org archive mirror
* [PATCH 2.5.41-mm1] new snapshot of shared page tables
@ 2002-10-09 21:07 Dave McCracken
  2002-10-09 22:51 ` Andrew Morton
  2002-10-10  3:04 ` Ed Tomlinson
From: Dave McCracken @ 2002-10-09 21:07 UTC
  To: Linux Memory Management

[-- Attachment #1: Type: text/plain, Size: 530 bytes --]


Here's the latest shared page table patch.  Changes are mostly cleanups,
plus one new feature: shared page tables are now a config option
(CONFIG_SHAREPTE), currently wired up only in the i386 config.  Since the
shared-pte code compiles out when the option is off, the patch should now
build on other architectures as well, though I haven't tested that.
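
To see at a glance what the option toggles: with CONFIG_SHAREPTE enabled
('Share 3rd-level pagetables' in the i386 config), a child initially
shares its parent's pte pages at fork time instead of copying them.  A
minimal sketch of that switch, taken from the kernel/fork.c hunk in the
attached diff (illustrative only, not extra patch content):

	#ifdef CONFIG_SHAREPTE
		/* share the parent's pte pages, copy-on-demand later */
		retval = share_page_range(mm, current->mm, tmp, &prev_pmd);
	#else
		/* old behavior: copy the pte entries up front */
		retval = copy_page_range(mm, current->mm, tmp);
	#endif

With the option disabled, fork falls back to the existing
copy_page_range() behavior.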

At Andrew Morton's request, I've moved my development base to the -mm line.

Dave McCracken

======================================================================
Dave McCracken          IBM Linux Base Kernel Team      1-512-838-3059
dmccr@us.ibm.com                                        T/L   678-3059

[-- Attachment #2: shpte-2.5.41-mm1-2.diff --]
[-- Type: text/plain, Size: 38920 bytes --]

--- 2.5.41-mm1/./include/linux/mm.h	2002-10-09 11:02:57.000000000 -0500
+++ 2.5.41-mm1-shpte/./include/linux/mm.h	2002-10-09 13:53:27.000000000 -0500
@@ -163,6 +163,8 @@
 		struct pte_chain *chain;/* Reverse pte mapping pointer.
 					 * protected by PG_chainlock */
 		pte_addr_t direct;
+		struct mm_chain *mmchain;/* Reverse mm_struct mapping pointer */
+		struct mm_struct *mmdirect;
 	} pte;
 	unsigned long private;		/* mapping-private opaque data */
 
@@ -358,6 +360,7 @@
 
 extern void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size);
 extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma);
+extern int share_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma, pmd_t **prev_pmd);
 extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot);
 extern int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long size, pgprot_t prot);
 
--- 2.5.41-mm1/./include/linux/rmap-locking.h	2002-10-07 13:23:25.000000000 -0500
+++ 2.5.41-mm1-shpte/./include/linux/rmap-locking.h	2002-10-09 10:25:52.000000000 -0500
@@ -31,3 +31,6 @@
 #endif
 	preempt_enable();
 }
+
+#define	pte_page_lock	pte_chain_lock
+#define	pte_page_unlock	pte_chain_unlock
--- 2.5.41-mm1/./include/asm-generic/rmap.h	2002-10-07 13:24:40.000000000 -0500
+++ 2.5.41-mm1-shpte/./include/asm-generic/rmap.h	2002-10-09 10:25:52.000000000 -0500
@@ -26,33 +26,6 @@
  */
 #include <linux/mm.h>
 
-static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address)
-{
-#ifdef BROKEN_PPC_PTE_ALLOC_ONE
-	/* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */
-	extern int mem_init_done;
-
-	if (!mem_init_done)
-		return;
-#endif
-	page->mapping = (void *)mm;
-	page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
-	inc_page_state(nr_page_table_pages);
-}
-
-static inline void pgtable_remove_rmap(struct page * page)
-{
-	page->mapping = NULL;
-	page->index = 0;
-	dec_page_state(nr_page_table_pages);
-}
-
-static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
-{
-	struct page * page = kmap_atomic_to_page(ptep);
-	return (struct mm_struct *) page->mapping;
-}
-
 static inline unsigned long ptep_to_address(pte_t * ptep)
 {
 	struct page * page = kmap_atomic_to_page(ptep);
@@ -87,4 +60,10 @@
 }
 #endif
 
+extern void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address);
+extern void pgtable_add_rmap_locked(struct page * page, struct mm_struct * mm, unsigned long address);
+extern void pgtable_remove_rmap(struct page * page, struct mm_struct *mm);
+extern void pgtable_remove_rmap_locked(struct page * page, struct mm_struct *mm);
+extern void increment_rss(struct page *ptepage);
+
 #endif /* _GENERIC_RMAP_H */
--- 2.5.41-mm1/./include/asm-i386/pgtable.h	2002-10-07 13:24:48.000000000 -0500
+++ 2.5.41-mm1-shpte/./include/asm-i386/pgtable.h	2002-10-09 10:25:52.000000000 -0500
@@ -124,6 +124,7 @@
 #define _PAGE_PROTNONE	0x080	/* If not present */
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _PAGE_TABLE_RDONLY	(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _PAGE_CHG_MASK	(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
 
@@ -184,8 +185,8 @@
 #define pmd_none(x)	(!pmd_val(x))
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
-#define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
-
+#define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_RW)) != \
+			(_KERNPG_TABLE & ~_PAGE_RW))
 
 #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
 
@@ -209,6 +210,8 @@
 static inline pte_t pte_mkdirty(pte_t pte)	{ (pte).pte_low |= _PAGE_DIRTY; return pte; }
 static inline pte_t pte_mkyoung(pte_t pte)	{ (pte).pte_low |= _PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkwrite(pte_t pte)	{ (pte).pte_low |= _PAGE_RW; return pte; }
+static inline int pmd_write(pmd_t pmd)		{ return (pmd).pmd & _PAGE_RW; }
+static inline pmd_t pmd_wrprotect(pmd_t pmd)	{ (pmd).pmd &= ~_PAGE_RW; return pmd; }
 
 static inline  int ptep_test_and_clear_dirty(pte_t *ptep)	{ return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); }
 static inline  int ptep_test_and_clear_young(pte_t *ptep)	{ return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); }
@@ -263,6 +266,10 @@
 	((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + __pte_offset(address))
 #define pte_offset_map_nested(dir, address) \
 	((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + __pte_offset(address))
+#define pte_page_map(__page, address) \
+	((pte_t *)kmap_atomic(__page,KM_PTE0) + __pte_offset(address))
+#define pte_page_map_nested(__page, address) \
+	((pte_t *)kmap_atomic(__page,KM_PTE1) + __pte_offset(address))
 #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
 #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
 
--- 2.5.41-mm1/./arch/i386/Config.help	2002-10-07 13:23:25.000000000 -0500
+++ 2.5.41-mm1-shpte/./arch/i386/Config.help	2002-10-09 15:49:31.000000000 -0500
@@ -143,6 +143,13 @@
   low memory.  Setting this option will put user-space page table
   entries in high memory.
 
+CONFIG_SHAREPTE
+  Normally each address space has its own complete page table for all
+  its mappings.  This can mean many mappings of a set of shared data
+  pages.  With this option, the VM will attempt to share the bottom
+  level of the page table between address spaces that are sharing data
+  pages.
+
 CONFIG_HIGHMEM4G
   Select this if you have a 32-bit processor and between 1 and 4
   gigabytes of physical RAM.
--- 2.5.41-mm1/./arch/i386/config.in	2002-10-07 13:24:02.000000000 -0500
+++ 2.5.41-mm1-shpte/./arch/i386/config.in	2002-10-09 15:57:18.000000000 -0500
@@ -232,6 +232,7 @@
 if [ "$CONFIG_HIGHMEM4G" = "y" -o "$CONFIG_HIGHMEM64G" = "y" ]; then
    bool 'Allocate 3rd-level pagetables from highmem' CONFIG_HIGHPTE
 fi
+bool 'Share 3rd-level pagetables' CONFIG_SHAREPTE y
 
 bool 'Math emulation' CONFIG_MATH_EMULATION
 bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
--- 2.5.41-mm1/./fs/exec.c	2002-10-09 11:02:54.000000000 -0500
+++ 2.5.41-mm1-shpte/./fs/exec.c	2002-10-09 10:30:54.000000000 -0500
@@ -46,6 +46,7 @@
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
+#include <asm/rmap.h>
 
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
@@ -308,8 +309,8 @@
 	flush_page_to_ram(page);
 	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
 	page_add_rmap(page, pte);
+	increment_rss(kmap_atomic_to_page(pte));
 	pte_unmap(pte);
-	tsk->mm->rss++;
 	spin_unlock(&tsk->mm->page_table_lock);
 
 	/* no need for flush_tlb */
--- 2.5.41-mm1/./kernel/fork.c	2002-10-09 11:02:58.000000000 -0500
+++ 2.5.41-mm1-shpte/./kernel/fork.c	2002-10-09 15:19:16.000000000 -0500
@@ -208,6 +208,9 @@
 	struct vm_area_struct * mpnt, *tmp, **pprev;
 	int retval;
 	unsigned long charge = 0;
+#ifdef CONFIG_SHAREPTE
+	pmd_t *prev_pmd = 0;
+#endif
 
 	flush_cache_mm(current->mm);
 	mm->locked_vm = 0;
@@ -270,7 +273,11 @@
 		*pprev = tmp;
 		pprev = &tmp->vm_next;
 		mm->map_count++;
+#ifdef CONFIG_SHAREPTE
+		retval = share_page_range(mm, current->mm, tmp, &prev_pmd);
+#else
 		retval = copy_page_range(mm, current->mm, tmp);
+#endif
 		spin_unlock(&mm->page_table_lock);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
--- 2.5.41-mm1/./mm/mmap.c	2002-10-09 11:02:58.000000000 -0500
+++ 2.5.41-mm1-shpte/./mm/mmap.c	2002-10-09 10:40:09.000000000 -0500
@@ -23,6 +23,7 @@
 #include <asm/tlb.h>
 
 extern void unmap_page_range(mmu_gather_t *,struct vm_area_struct *vma, unsigned long address, unsigned long size);
+extern void unmap_all_pages(mmu_gather_t *tlb, struct mm_struct *mm, unsigned long address, unsigned long end);
 extern void clear_page_tables(mmu_gather_t *tlb, unsigned long first, int nr);
 
 /*
@@ -1248,11 +1249,22 @@
 	}
 }
 
+/*
+ * For small tasks, it's most efficient to unmap the pages for each
+ * vma.  For larger tasks, it's better to just walk the entire address
+ * space in one pass, particularly with shared pte pages.  This
+ * threshold determines the size where we switch from one method to
+ * the other.
+ */
+
+#define	UNMAP_THRESHOLD		500
+
 /* Release all mmaps. */
 void exit_mmap(struct mm_struct * mm)
 {
 	mmu_gather_t *tlb;
 	struct vm_area_struct * mpnt;
+	int unmap_vma = mm->total_vm < UNMAP_THRESHOLD;
 
 	release_segments(mm);
 	spin_lock(&mm->page_table_lock);
@@ -1273,10 +1285,10 @@
 			vm_unacct_memory((end - start) >> PAGE_SHIFT);
 
 		mm->map_count--;
-		if (!(is_vm_hugetlb_page(mpnt)))
-			unmap_page_range(tlb, mpnt, start, end);
-		else
+		if (is_vm_hugetlb_page(mpnt))
 			mpnt->vm_ops->close(mpnt);
+		else if (unmap_vma)
+			unmap_page_range(tlb, mpnt, start, end);
 		mpnt = mpnt->vm_next;
 	}
 
@@ -1284,6 +1296,9 @@
 	if (mm->map_count)
 		BUG();
 
+	if (!unmap_vma)
+		unmap_all_pages(tlb, mm, 0, TASK_SIZE);
+
 	clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
 	tlb_finish_mmu(tlb, 0, TASK_SIZE);
 
--- 2.5.41-mm1/./mm/swapfile.c	2002-10-07 13:23:34.000000000 -0500
+++ 2.5.41-mm1-shpte/./mm/swapfile.c	2002-10-09 10:35:30.000000000 -0500
@@ -17,6 +17,7 @@
 #include <linux/buffer_head.h>
 
 #include <asm/pgtable.h>
+#include <asm/rmap.h>
 #include <linux/swapops.h>
 
 spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
@@ -371,7 +372,7 @@
  */
 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
 static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
-	pte_t *dir, swp_entry_t entry, struct page* page)
+	pte_t *dir, swp_entry_t entry, struct page* page, pmd_t *pmd)
 {
 	pte_t pte = *dir;
 
@@ -383,7 +384,7 @@
 	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
 	page_add_rmap(page, dir);
 	swap_free(entry);
-	++vma->vm_mm->rss;
+	increment_rss(pmd_page(*pmd));
 }
 
 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
@@ -408,7 +409,7 @@
 	if (end > PMD_SIZE)
 		end = PMD_SIZE;
 	do {
-		unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
+		unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page, dir);
 		address += PAGE_SIZE;
 		pte++;
 	} while (address && (address < end));
--- 2.5.41-mm1/./mm/memory.c	2002-10-09 11:02:58.000000000 -0500
+++ 2.5.41-mm1-shpte/./mm/memory.c	2002-10-09 15:25:18.000000000 -0500
@@ -36,6 +36,18 @@
  *		(Gerhard.Wichert@pdb.siemens.de)
  */
 
+/*
+ * A note on locking of the page table structure:
+ *
+ *  The top level lock that protects the page table is the mm->page_table_lock.
+ *  This lock protects the pgd and pmd layer.  However, with the advent of shared
+ *  pte pages, this lock is not sufficient.  The pte layer is now protected by the
+ *  pte_page_lock, set in the struct page of the pte page.  Note that with this
+ *  locking scheme, once the pgd and pmd layers have been set in the page fault
+ *  path and the pte_page_lock has been taken, the page_table_lock can be released.
+ * 
+ */
+
 #include <linux/kernel_stat.h>
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
@@ -45,6 +57,7 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/vcache.h>
+#include <linux/rmap-locking.h>
 
 #include <asm/pgalloc.h>
 #include <asm/rmap.h>
@@ -84,7 +97,7 @@
  */
 static inline void free_one_pmd(mmu_gather_t *tlb, pmd_t * dir)
 {
-	struct page *page;
+	struct page *ptepage;
 
 	if (pmd_none(*dir))
 		return;
@@ -93,10 +106,13 @@
 		pmd_clear(dir);
 		return;
 	}
-	page = pmd_page(*dir);
+	ptepage = pmd_page(*dir);
 	pmd_clear(dir);
-	pgtable_remove_rmap(page);
-	pte_free_tlb(tlb, page);
+	pgtable_remove_rmap(ptepage, tlb->mm);
+	if (page_count(ptepage) == 1) {
+		dec_page_state(nr_page_table_pages);
+	}
+	pte_free_tlb(tlb, ptepage);
 }
 
 static inline void free_one_pgd(mmu_gather_t *tlb, pgd_t * dir)
@@ -137,6 +153,216 @@
 	} while (--nr);
 }
 
+/*
+ * This function makes the decision whether a pte page needs to be unshared
+ * or not.  Note that page_count() == 1 isn't even tested here.  The assumption
+ * is that if the pmd entry is marked writeable, then the page is either already
+ * unshared or doesn't need to be unshared.  This catches the situation where
+ * task B unshares the pte page, then task A faults and needs to unprotect the
+ * pmd entry.  This is actually done in pte_unshare.
+ *
+ * This function should be called with the page_table_lock held.
+ */
+static inline int pte_needs_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+				    pmd_t *pmd, unsigned long address, int write_access)
+{
+#ifdef CONFIG_SHAREPTE
+	struct page *ptepage;
+
+	/* It's not even there, nothing to unshare. */
+	if (!pmd_present(*pmd))
+		return 0;
+
+	/*
+	 * If it's already writable, then it doesn't need to be unshared.
+	 * It's either already not shared or it's part of a large shared
+	 * region that will never need to be unshared.
+	 */
+	if (pmd_write(*pmd))
+		return 0;
+
+	/* If this isn't a write fault we don't need to unshare. */
+	if (!write_access)
+		return 0;
+
+	/*
+	 * If this page fits entirely inside a shared region, don't unshare it.
+	 */
+	ptepage = pmd_page(*pmd);
+	if ((vma->vm_flags & VM_SHARED) &&
+	    (vma->vm_start <= ptepage->index) &&
+	    (vma->vm_end >= (ptepage->index + PMD_SIZE))) {
+		return 0;
+	}
+	/*
+	 * Ok, we have to unshare.
+	 */
+	return 1;
+#else
+	return 0;
+#endif
+}
+
+/*
+ * Here is where a pte page is actually unshared.  It actually covers a couple of
+ * possible conditions.  If the page_count() is already 1, then that means it just
+ * needs to be set writeable.  Otherwise, a new page needs to be allocated.
+ *
+ * When each pte entry is copied, it is evaluated for COW protection, as well as
+ * checking whether the swap count needs to be incremented.
+ *
+ * This function must be called with the page_table_lock held.  It
+ * will release and reacquire the lock when it allocates a new page.
+ *
+ * The function must also be called with the pte_page_lock held on the
+ * old page.  This lock will also be dropped, then reacquired when we
+ * allocate a new page.  The pte_page_lock will be taken on the new
+ * page.  Whichever pte page is returned will have its pte_page_lock
+ * held.
+ */
+
+static pte_t *pte_unshare(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+#ifdef CONFIG_SHAREPTE
+	pte_t	*src_ptb, *dst_ptb;
+	struct page *oldpage, *newpage, *tmppage;
+	struct vm_area_struct *vma;
+	int	base, addr;
+	int	end, page_end;
+	int	src_unshare;
+
+	oldpage = pmd_page(*pmd);
+
+	/* If it's already unshared, we just need to set it writeable */
+	if (page_count(oldpage) == 1) {
+is_unshared:
+		pmd_populate(mm, pmd, oldpage);
+		flush_tlb_mm(mm);
+		goto out_map;
+	}
+
+	pte_page_unlock(oldpage);
+	spin_unlock(&mm->page_table_lock);
+	newpage = pte_alloc_one(mm, address);
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!newpage))
+		return NULL;
+
+	pte_page_lock(oldpage);
+	/*
+	 * It's possible some other task using our mm_struct did an unshare
+	 * and we're now supposed to be using a different pte page.  If so,
+	 * switch to it.
+	 */
+	tmppage = pmd_page(*pmd);
+	if (oldpage != tmppage) {
+		pte_page_lock(tmppage);
+		pte_page_unlock(oldpage);
+		oldpage = tmppage;
+	}
+	/* See if it got unshared while we dropped the lock */
+	if (page_count(oldpage) == 1) {
+		pte_free(newpage);
+		goto is_unshared;
+	}
+
+	pte_page_lock(newpage);
+
+	base = addr = oldpage->index;
+	page_end = base + PMD_SIZE;
+	vma = find_vma(mm, base);
+	if (!vma || (page_end <= vma->vm_start))
+		BUG(); 		/* No valid pages in this pte page */
+
+	src_unshare = page_count(oldpage) == 2;
+	dst_ptb = pte_page_map(newpage, base);
+	src_ptb = pte_page_map_nested(oldpage, base);
+
+	if (vma->vm_start > addr)
+		addr = vma->vm_start;
+
+	if (vma->vm_end < page_end)
+		end = vma->vm_end;
+	else
+		end = page_end;
+
+	do {
+		unsigned int cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+		pte_t *src_pte = src_ptb + __pte_offset(addr);
+		pte_t *dst_pte = dst_ptb + __pte_offset(addr);
+
+		do {
+			pte_t pte = *src_pte;
+			struct page *page;
+
+			if (pte_none(pte))
+				goto unshare_skip_set;
+
+			if (!pte_present(pte)) {
+				swap_duplicate(pte_to_swp_entry(pte));
+				set_pte(dst_pte, pte);
+				goto unshare_skip_set;
+			}
+			page = pte_page(pte);
+			if (!PageReserved(page)) {
+				/* COW mappings require write protecting both sides */
+				if (cow) {
+					pte = pte_wrprotect(pte);
+					if (src_unshare)
+						set_pte(src_pte, pte);
+				}
+				/* If it's a shared mapping,
+				 *  mark it clean in the new mapping
+				 */
+				if (vma->vm_flags & VM_SHARED)
+					pte = pte_mkclean(pte);
+				pte = pte_mkold(pte);
+				get_page(page);
+			}
+			set_pte(dst_pte, pte);
+			page_add_rmap(page, dst_pte);
+unshare_skip_set:
+			src_pte++;
+			dst_pte++;
+			addr += PAGE_SIZE;
+		} while (addr < end);
+
+		if (addr >= page_end)
+			break;
+
+		vma = vma->vm_next;
+		if (!vma)
+			break;
+
+		if (page_end <= vma->vm_start)
+			break;
+
+		addr = vma->vm_start;
+		if (vma->vm_end < page_end)
+			end = vma->vm_end;
+		else
+			end = page_end;
+	} while (1);
+
+	pte_unmap_nested(src_ptb);
+
+	pgtable_remove_rmap_locked(oldpage, mm);
+	pgtable_add_rmap_locked(newpage, mm, base);
+	pmd_populate(mm, pmd, newpage);
+	inc_page_state(nr_page_table_pages);
+
+	flush_tlb_mm(mm);
+
+	put_page(oldpage);
+	pte_page_unlock(oldpage);
+
+	return dst_ptb + __pte_offset(address);
+
+out_map:
+#endif
+	return pte_offset_map(pmd, address);
+}
+
 pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
 	if (!pmd_present(*pmd)) {
@@ -158,11 +384,10 @@
 		}
 		pgtable_add_rmap(new, mm, address);
 		pmd_populate(mm, pmd, new);
+		inc_page_state(nr_page_table_pages);
 	}
 out:
-	if (pmd_present(*pmd))
-		return pte_offset_map(pmd, address);
-	return NULL;
+	return pte_offset_map(pmd, address);
 }
 
 pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
@@ -184,7 +409,6 @@
 			pte_free_kernel(new);
 			goto out;
 		}
-		pgtable_add_rmap(virt_to_page(new), mm, address);
 		pmd_populate_kernel(mm, pmd, new);
 	}
 out:
@@ -193,6 +417,98 @@
 #define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
 #define PMD_TABLE_MASK	((PTRS_PER_PMD-1) * sizeof(pmd_t))
 
+int share_page_range(struct mm_struct *dst, struct mm_struct *src,
+	struct vm_area_struct *vma, pmd_t **prev_pmd)
+{
+#ifdef CONFIG_SHAREPTE
+	pgd_t *src_pgd, *dst_pgd;
+	unsigned long address = vma->vm_start;
+	unsigned long end = vma->vm_end;
+
+	if (is_vm_hugetlb_page(vma))
+		return copy_hugetlb_page_range(dst, src, vma);
+
+	src_pgd = pgd_offset(src, address)-1;
+	dst_pgd = pgd_offset(dst, address)-1;
+
+	for (;;) {
+		pmd_t * src_pmd, * dst_pmd;
+
+		src_pgd++; dst_pgd++;
+
+		if (pgd_none(*src_pgd))
+			goto skip_share_pmd_range;
+		if (pgd_bad(*src_pgd)) {
+			pgd_ERROR(*src_pgd);
+			pgd_clear(src_pgd);
+skip_share_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;
+			if (!address || (address >= end))
+				goto out;
+			continue;
+		}
+
+		src_pmd = pmd_offset(src_pgd, address);
+		dst_pmd = pmd_alloc(dst, dst_pgd, address);
+		if (!dst_pmd)
+			goto nomem;
+
+		spin_lock(&src->page_table_lock);
+
+		do {
+			pmd_t	pmdval = *src_pmd;
+			struct page *ptepage = pmd_page(pmdval);
+
+			if (pmd_none(pmdval))
+				goto skip_share_pte_range;
+			if (pmd_bad(pmdval)) {
+				pmd_ERROR(*src_pmd);
+				pmd_clear(src_pmd);
+				goto skip_share_pte_range;
+			}
+
+			/*
+			 * We set the pmd read-only in both the parent and the
+			 * child unless it's a writeable shared region that
+			 * spans the entire pte page.
+			 */
+			if ((((vma->vm_flags & (VM_SHARED|VM_MAYWRITE)) !=
+			    (VM_SHARED|VM_MAYWRITE)) ||
+			    (ptepage->index < vma->vm_start) ||
+			    ((ptepage->index + PMD_SIZE) > vma->vm_end)) &&
+			    pmd_write(pmdval)) {
+				pmdval = pmd_wrprotect(pmdval);
+				set_pmd(src_pmd, pmdval);
+			}
+			set_pmd(dst_pmd, pmdval);
+
+			/* Only do this if we haven't seen this pte page before */
+			if (src_pmd != *prev_pmd) {
+				get_page(ptepage);
+				pgtable_add_rmap(ptepage, dst, address);
+				*prev_pmd = src_pmd;
+				dst->rss += ptepage->private;
+			}
+
+skip_share_pte_range:	address = (address + PMD_SIZE) & PMD_MASK;
+			if (address >= end)
+				goto out_unlock;
+
+			src_pmd++;
+			dst_pmd++;
+		} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
+		spin_unlock(&src->page_table_lock);
+	}
+
+out_unlock:
+	spin_unlock(&src->page_table_lock);
+
+out:
+	return 0;
+nomem:
+#endif
+	return -ENOMEM;
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
@@ -327,6 +643,7 @@
 
 static void zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size)
 {
+	struct page *ptepage;
 	unsigned long offset;
 	pte_t *ptep;
 
@@ -337,11 +654,34 @@
 		pmd_clear(pmd);
 		return;
 	}
-	ptep = pte_offset_map(pmd, address);
+
 	offset = address & ~PMD_MASK;
 	if (offset + size > PMD_SIZE)
 		size = PMD_SIZE - offset;
 	size &= PAGE_MASK;
+
+	/*
+	 * Check to see if the pte page is shared.  If it is and we're unmapping
+	 * the entire page, just decrement the reference count and we're done.
+	 * If we're only unmapping part of the page we'll have to unshare it the
+	 * slow way.
+	 */
+	ptepage = pmd_page(*pmd);
+	pte_page_lock(ptepage);
+	if (page_count(ptepage) > 1) {
+		if ((offset == 0) && (size == PMD_SIZE)) {
+			pmd_clear(pmd);
+			pgtable_remove_rmap_locked(ptepage, tlb->mm);
+			tlb->mm->rss -= ptepage->private;
+			put_page(ptepage);
+			pte_page_unlock(ptepage);
+			return;
+		}
+		ptep = pte_unshare(tlb->mm, pmd, address);
+		ptepage = pmd_page(*pmd);
+	} else {
+		ptep = pte_offset_map(pmd, address);
+	}
 	for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
 		pte_t pte = *ptep;
 		if (pte_none(pte))
@@ -369,6 +709,7 @@
 			pte_clear(ptep);
 		}
 	}
+	pte_page_unlock(ptepage);
 	pte_unmap(ptep-1);
 }
 
@@ -464,6 +805,19 @@
 	spin_unlock(&mm->page_table_lock);
 }
 
+void unmap_all_pages(mmu_gather_t *tlb, struct mm_struct *mm, unsigned long address, unsigned long end)
+{
+	pgd_t * dir;
+
+	if (address >= end)
+		BUG();
+	dir = pgd_offset(mm, address);
+	do {
+		zap_pmd_range(tlb, dir, address, end - address);
+		address = (address + PGDIR_SIZE) & PGDIR_MASK;
+		dir++;
+	} while (address && (address < end));
+}
 /*
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
@@ -1009,6 +1363,7 @@
 	unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
 {
 	struct page *old_page, *new_page;
+	struct page *ptepage = pmd_page(*pmd);
 	unsigned long pfn = pte_pfn(pte);
 
 	if (!pfn_valid(pfn))
@@ -1022,7 +1377,7 @@
 			flush_cache_page(vma, address);
 			establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
 			pte_unmap(page_table);
-			spin_unlock(&mm->page_table_lock);
+			pte_page_unlock(ptepage);
 			return VM_FAULT_MINOR;
 		}
 	}
@@ -1032,7 +1387,7 @@
 	 * Ok, we need to copy. Oh, well..
 	 */
 	page_cache_get(old_page);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 
 	new_page = alloc_page(GFP_HIGHUSER);
 	if (!new_page)
@@ -1042,11 +1397,12 @@
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	spin_lock(&mm->page_table_lock);
+	ptepage = pmd_page(*pmd);
+	pte_page_lock(ptepage);
 	page_table = pte_offset_map(pmd, address);
 	if (pte_same(*page_table, pte)) {
 		if (PageReserved(old_page))
-			++mm->rss;
+			increment_rss(ptepage);
 		page_remove_rmap(old_page, page_table);
 		break_cow(vma, new_page, address, page_table);
 		page_add_rmap(new_page, page_table);
@@ -1056,14 +1412,14 @@
 		new_page = old_page;
 	}
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 	return VM_FAULT_MINOR;
 
 bad_wp_page:
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 	printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address);
 	/*
 	 * This should really halt the system so it can be debugged or
@@ -1192,12 +1548,13 @@
 	pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
 {
 	struct page *page;
+	struct page *ptepage = pmd_page(*pmd);
 	swp_entry_t entry = pte_to_swp_entry(orig_pte);
 	pte_t pte;
 	int ret = VM_FAULT_MINOR;
 
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 	page = lookup_swap_cache(entry);
 	if (!page) {
 		swapin_readahead(entry);
@@ -1207,14 +1564,15 @@
 			 * Back out if somebody else faulted in this pte while
 			 * we released the page table lock.
 			 */
-			spin_lock(&mm->page_table_lock);
+			ptepage = pmd_page(*pmd);
+			pte_page_lock(ptepage);
 			page_table = pte_offset_map(pmd, address);
 			if (pte_same(*page_table, orig_pte))
 				ret = VM_FAULT_OOM;
 			else
 				ret = VM_FAULT_MINOR;
 			pte_unmap(page_table);
-			spin_unlock(&mm->page_table_lock);
+			pte_page_unlock(ptepage);
 			return ret;
 		}
 
@@ -1230,11 +1588,12 @@
 	 * Back out if somebody else faulted in this pte while we
 	 * released the page table lock.
 	 */
-	spin_lock(&mm->page_table_lock);
+	ptepage = pmd_page(*pmd);
+	pte_page_lock(ptepage);
 	page_table = pte_offset_map(pmd, address);
 	if (!pte_same(*page_table, orig_pte)) {
 		pte_unmap(page_table);
-		spin_unlock(&mm->page_table_lock);
+		pte_page_unlock(ptepage);
 		unlock_page(page);
 		page_cache_release(page);
 		return VM_FAULT_MINOR;
@@ -1246,7 +1605,7 @@
 	if (vm_swap_full())
 		remove_exclusive_swap_page(page);
 
-	mm->rss++;
+	increment_rss(ptepage);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if (write_access && can_share_swap_page(page))
 		pte = pte_mkdirty(pte_mkwrite(pte));
@@ -1260,7 +1619,7 @@
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, pte);
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 	return ret;
 }
 
@@ -1273,6 +1632,7 @@
 {
 	pte_t entry;
 	struct page * page = ZERO_PAGE(addr);
+	struct page *ptepage = pmd_page(*pmd);
 
 	/* Read-only mapping of ZERO_PAGE. */
 	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
@@ -1281,23 +1641,24 @@
 	if (write_access) {
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
-		spin_unlock(&mm->page_table_lock);
+		pte_page_unlock(ptepage);
 
 		page = alloc_page(GFP_HIGHUSER);
 		if (!page)
 			goto no_mem;
 		clear_user_highpage(page, addr);
 
-		spin_lock(&mm->page_table_lock);
+		ptepage = pmd_page(*pmd);
+		pte_page_lock(ptepage);
 		page_table = pte_offset_map(pmd, addr);
 
 		if (!pte_none(*page_table)) {
 			pte_unmap(page_table);
 			page_cache_release(page);
-			spin_unlock(&mm->page_table_lock);
+			pte_page_unlock(ptepage);
 			return VM_FAULT_MINOR;
 		}
-		mm->rss++;
+		increment_rss(ptepage);
 		flush_page_to_ram(page);
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 		lru_cache_add(page);
@@ -1310,7 +1671,7 @@
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, addr, entry);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 	return VM_FAULT_MINOR;
 
 no_mem:
@@ -1333,12 +1694,13 @@
 	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
 {
 	struct page * new_page;
+	struct page *ptepage = pmd_page(*pmd);
 	pte_t entry;
 
 	if (!vma->vm_ops || !vma->vm_ops->nopage)
 		return do_anonymous_page(mm, vma, page_table, pmd, write_access, address);
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
 
@@ -1363,7 +1725,8 @@
 		new_page = page;
 	}
 
-	spin_lock(&mm->page_table_lock);
+	ptepage = pmd_page(*pmd);
+	pte_page_lock(ptepage);
 	page_table = pte_offset_map(pmd, address);
 
 	/*
@@ -1378,7 +1741,7 @@
 	 */
 	/* Only go through if we didn't race with anybody else... */
 	if (pte_none(*page_table)) {
-		++mm->rss;
+		increment_rss(ptepage);
 		flush_page_to_ram(new_page);
 		flush_icache_page(vma, new_page);
 		entry = mk_pte(new_page, vma->vm_page_prot);
@@ -1391,13 +1754,13 @@
 		/* One of our sibling threads was faster, back out. */
 		pte_unmap(page_table);
 		page_cache_release(new_page);
-		spin_unlock(&mm->page_table_lock);
+		pte_page_unlock(ptepage);
 		return VM_FAULT_MINOR;
 	}
 
 	/* no need to invalidate: a not-present page shouldn't be cached */
 	update_mmu_cache(vma, address, entry);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(ptepage);
 	return VM_FAULT_MAJOR;
 }
 
@@ -1449,7 +1812,7 @@
 	entry = pte_mkyoung(entry);
 	establish_pte(vma, address, pte, entry);
 	pte_unmap(pte);
-	spin_unlock(&mm->page_table_lock);
+	pte_page_unlock(pmd_page(*pmd));
 	return VM_FAULT_MINOR;
 }
 
@@ -1474,9 +1837,20 @@
 	pmd = pmd_alloc(mm, pgd, address);
 
 	if (pmd) {
-		pte_t * pte = pte_alloc_map(mm, pmd, address);
-		if (pte)
+		pte_t * pte;
+
+		if (pte_needs_unshare(mm, vma, pmd, address, write_access)) {
+			pte_page_lock(pmd_page(*pmd));
+			pte = pte_unshare(mm, pmd, address);
+		} else {
+			pte = pte_alloc_map(mm, pmd, address);
+			pte_page_lock(pmd_page(*pmd));
+		}
+
+		if (pte) {
+			spin_unlock(&mm->page_table_lock);
 			return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+		}
 	}
 	spin_unlock(&mm->page_table_lock);
 	return VM_FAULT_OOM;
--- 2.5.41-mm1/./mm/rmap.c	2002-10-07 13:25:15.000000000 -0500
+++ 2.5.41-mm1-shpte/./mm/rmap.c	2002-10-09 10:25:52.000000000 -0500
@@ -45,11 +45,17 @@
  */
 #define NRPTE ((L1_CACHE_BYTES - sizeof(void *))/sizeof(pte_addr_t))
 
+struct mm_chain {
+	struct mm_chain *next;
+	struct mm_struct *mm;
+};
+
 struct pte_chain {
 	struct pte_chain *next;
 	pte_addr_t ptes[NRPTE];
 };
 
+static kmem_cache_t	*mm_chain_cache;
 static kmem_cache_t	*pte_chain_cache;
 
 /*
@@ -102,6 +108,25 @@
 	kmem_cache_free(pte_chain_cache, pte_chain);
 }
 
+static inline struct mm_chain *mm_chain_alloc(void)
+{
+	struct mm_chain *ret;
+
+	ret = kmem_cache_alloc(mm_chain_cache, GFP_ATOMIC);
+	return ret;
+}
+
+static void mm_chain_free(struct mm_chain *mc,
+		struct mm_chain *prev_mc, struct page *page)
+{
+	if (prev_mc)
+		prev_mc->next = mc->next;
+	else if (page)
+		page->pte.mmchain = mc->next;
+
+	kmem_cache_free(mm_chain_cache, mc);
+}
+
 /**
  ** VM stuff below this comment
  **/
@@ -161,6 +186,94 @@
 	return referenced;
 }
 
+void pgtable_add_rmap_locked(struct page * page, struct mm_struct * mm,
+			     unsigned long address)
+{
+	struct mm_chain *mc;
+
+#ifdef BROKEN_PPC_PTE_ALLOC_ONE
+	/* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */
+	extern int mem_init_done;
+
+	if (!mem_init_done)
+		return;
+#endif
+#ifdef RMAP_DEBUG
+	BUG_ON(mm == NULL);
+#endif
+
+	if (PageDirect(page)) {
+		mc = mm_chain_alloc();
+		mc->mm = page->pte.mmdirect;
+		mc->next = NULL;
+		page->pte.mmchain = mc;
+		ClearPageDirect(page);
+	}
+	if (page->pte.mmchain) {
+		/* Hook up the mm_chain to the page. */
+		mc = mm_chain_alloc();
+		mc->mm = mm;
+		mc->next = page->pte.mmchain;
+		page->pte.mmchain = mc;
+	} else {
+		page->pte.mmdirect = mm;
+		SetPageDirect(page);
+		page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
+	}
+}
+
+void pgtable_remove_rmap_locked(struct page * page, struct mm_struct *mm)
+{
+	struct mm_chain * mc, * prev_mc = NULL;
+
+#ifdef DEBUG_RMAP
+	BUG_ON(mm == NULL);
+#endif
+
+	if (PageDirect(page)) {
+		if (page->pte.mmdirect == mm) {
+			page->pte.mmdirect = NULL;
+			ClearPageDirect(page);
+			page->index = 0;
+			goto out;
+		}
+	} else {
+#ifdef DEBUG_RMAP
+		BUG_ON(page->pte.mmchain->next == NULL);
+#endif
+		for (mc = page->pte.mmchain; mc; prev_mc = mc, mc = mc->next) {
+			if (mc->mm == mm) {
+				mm_chain_free(mc, prev_mc, page);
+				/* Check whether we can convert to direct */
+				mc = page->pte.mmchain;
+				if (!mc->next) {
+					page->pte.mmdirect = mc->mm;
+					SetPageDirect(page);
+					mm_chain_free(mc, NULL, NULL);
+				}
+				goto out;
+			}
+		}
+	}
+	BUG();
+out:
+}
+
+void pgtable_add_rmap(struct page * page, struct mm_struct * mm,
+			     unsigned long address)
+{
+	pte_page_lock(page);
+	pgtable_add_rmap_locked(page, mm, address);
+	pte_page_unlock(page);
+}
+
+void pgtable_remove_rmap(struct page * page, struct mm_struct *mm)
+{
+	pte_page_lock(page);
+	pgtable_remove_rmap_locked(page, mm);
+	pte_page_unlock(page);
+}
+
 /**
  * page_add_rmap - add reverse mapping entry to a page
  * @page: the page to add the mapping to
@@ -180,8 +293,6 @@
 		BUG();
 	if (!pte_present(*ptep))
 		BUG();
-	if (!ptep_to_mm(ptep))
-		BUG();
 #endif
 
 	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
@@ -199,12 +310,15 @@
 			if (page->pte.direct == pte_paddr)
 				BUG();
 		} else {
+			int count = 0;
 			for (pc = page->pte.chain; pc; pc = pc->next) {
-				for (i = 0; i < NRPTE; i++) {
+				for (i = 0; i < NRPTE; i++, count++) {
 					pte_addr_t p = pc->ptes[i];
 
-					if (p && p == pte_paddr)
+					if (p && p == pte_paddr) {
+						printk(KERN_ERR "page_add_rmap: page %08lx (count %d), ptep %08lx, rmap count %d\n", page, page_count(page), ptep, count);
 						BUG();
+					}
 				}
 			}
 		}
@@ -342,6 +456,98 @@
 	return;
 }
 
+static inline int pgtable_check_mlocked_mm(struct mm_struct *mm, unsigned long address)
+{
+	struct vm_area_struct *vma;
+	int ret = SWAP_SUCCESS;
+
+	/* During mremap, it's possible pages are not in a VMA. */
+	vma = find_vma(mm, address);
+	if (!vma) {
+		ret = SWAP_FAIL;
+		goto out;
+	}
+
+	/* The page is mlock()d, we cannot swap it out. */
+	if (vma->vm_flags & VM_LOCKED) {
+		ret = SWAP_FAIL;
+	}
+out:
+	return ret;
+}
+
+static inline int pgtable_check_mlocked(struct page *ptepage, unsigned long address)
+{
+	struct mm_chain *mc;
+	int ret = SWAP_SUCCESS;
+
+	if (PageDirect(ptepage)) {
+		ret = pgtable_check_mlocked_mm(ptepage->pte.mmdirect, address);
+		goto out;
+	}
+
+	for (mc = ptepage->pte.mmchain; mc; mc = mc->next) {
+#ifdef DEBUG_RMAP
+		BUG_ON(mc->mm == NULL);
+#endif
+		ret = pgtable_check_mlocked_mm(mc->mm, address);
+		if (ret != SWAP_SUCCESS)
+			goto out;
+	}
+out:
+	return ret;
+}
+
+static inline int pgtable_unmap_one_mm(struct mm_struct *mm, unsigned long address)
+{
+	struct vm_area_struct *vma;
+	int ret = SWAP_SUCCESS;
+
+	/* During mremap, it's possible pages are not in a VMA. */
+	vma = find_vma(mm, address);
+	if (!vma) {
+		ret = SWAP_FAIL;
+		goto out;
+	}
+	flush_tlb_page(vma, address);
+	flush_cache_page(vma, address);
+	mm->rss--;
+
+out:
+	return ret;
+}
+
+static inline int pgtable_unmap_one(struct page *ptepage, unsigned long address)
+{
+	struct mm_chain *mc;
+	int ret = SWAP_SUCCESS;
+
+	if (PageDirect(ptepage)) {
+		ret = pgtable_unmap_one_mm(ptepage->pte.mmdirect, address);
+		if (ret != SWAP_SUCCESS)
+			goto out;
+	} else for (mc = ptepage->pte.mmchain; mc; mc = mc->next) {
+		ret = pgtable_unmap_one_mm(mc->mm, address);
+		if (ret != SWAP_SUCCESS)
+			goto out;
+	}
+	ptepage->private--;
+out:
+	return ret;
+}
+
+void increment_rss(struct page *ptepage)
+{
+	struct mm_chain *mc;
+
+	if (PageDirect(ptepage))
+		ptepage->pte.mmdirect->rss++;
+	else for (mc = ptepage->pte.mmchain; mc; mc = mc->next)
+		mc->mm->rss++;
+
+	ptepage->private++;
+}
+
 /**
  * try_to_unmap_one - worker function for try_to_unmap
  * @page: page to unmap
@@ -360,42 +566,24 @@
 static int try_to_unmap_one(struct page * page, pte_addr_t paddr)
 {
 	pte_t *ptep = rmap_ptep_map(paddr);
-	unsigned long address = ptep_to_address(ptep);
-	struct mm_struct * mm = ptep_to_mm(ptep);
-	struct vm_area_struct * vma;
 	pte_t pte;
+	struct page *ptepage = kmap_atomic_to_page(ptep);
+	unsigned long address = ptep_to_address(ptep);
 	int ret;
 
-	if (!mm)
-		BUG();
-
-	/*
-	 * We need the page_table_lock to protect us from page faults,
-	 * munmap, fork, etc...
-	 */
-	if (!spin_trylock(&mm->page_table_lock)) {
-		rmap_ptep_unmap(ptep);
-		return SWAP_AGAIN;
-	}
-
+	pte_page_lock(ptepage);
 
-	/* During mremap, it's possible pages are not in a VMA. */
-	vma = find_vma(mm, address);
-	if (!vma) {
-		ret = SWAP_FAIL;
+	ret = pgtable_check_mlocked(ptepage, address);
+	if (ret != SWAP_SUCCESS)
 		goto out_unlock;
-	}
+	pte = ptep_get_and_clear(ptep);
 
-	/* The page is mlock()d, we cannot swap it out. */
-	if (vma->vm_flags & VM_LOCKED) {
-		ret = SWAP_FAIL;
+	ret = pgtable_unmap_one(ptepage, address);
+	if (ret != SWAP_SUCCESS) {
+		set_pte(ptep, pte);
 		goto out_unlock;
 	}
-
-	/* Nuke the page table entry. */
-	pte = ptep_get_and_clear(ptep);
-	flush_tlb_page(vma, address);
-	flush_cache_page(vma, address);
+	pte_page_unlock(ptepage);
 
 	/* Store the swap location in the pte. See handle_pte_fault() ... */
 	if (PageSwapCache(page)) {
@@ -408,13 +596,15 @@
 	if (pte_dirty(pte))
 		set_page_dirty(page);
 
-	mm->rss--;
 	page_cache_release(page);
 	ret = SWAP_SUCCESS;
+	goto out;
 
 out_unlock:
+	pte_page_unlock(ptepage);
+
+out:
 	rmap_ptep_unmap(ptep);
-	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
 
@@ -523,6 +713,17 @@
 
 void __init pte_chain_init(void)
 {
+
+	mm_chain_cache = kmem_cache_create(	"mm_chain",
+						sizeof(struct mm_chain),
+						0,
+						0,
+						NULL,
+						NULL);
+
+	if (!mm_chain_cache)
+		panic("failed to create mm_chain cache!\n");
+
 	pte_chain_cache = kmem_cache_create(	"pte_chain",
 						sizeof(struct pte_chain),
 						0,
