linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [rfc][patch 2/2] mm: mlocked pages off LRU
@ 2007-03-05 16:17 Nick Piggin
  2007-03-05 16:40 ` Nick Piggin
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-05 16:17 UTC (permalink / raw)
  To: Linux Memory Management List, Christoph Lameter
  Cc: Andrew Morton, Christoph Hellwig

This is a competing implementation of the mlocked stuff that has been going
on, because I don't really like those patches and I found that it went
beyond just simple coding style issues.

While I think this patchset is functionally more capable (for the reasons
listed below), I don't really care about that aspect so much as the
cleanliness of the implementation, which I think is better in mine.

This is against 2.6.21-rc2 + my nopage vs invalidate patches, because
they actually fix real bugs, some of which affect this mlock patch. I can
repost those if anyone wants.

This is not heavily tested at thist stage. A very slightly older patch
boots and runs here, but I'm posting this now to ask for comments.

Nick
--

Remove mlocked pages from the LRU. Use PageMLocked page flag, and when that
is set, page->lru.next becomes the mlock_count (conservative count of VM_LOCKED
vmas that reference this page). Based on idea from Christoph Hellwig.

This patch has a number of advantages over Christoph Lameter's scheme.
- munlock doesn't put pages back onto LRU lists if they are still mlocked
- PageMLock explicitly elevates the page's refcount, so PageMLock pages
  don't ever get freed (thus requires less awareness in the rest of mm).

Since I change a few things in install_arg_page, I'll finally move it to
mm/memory.c where it belongs, and rename it to install_new_anon_page.

 fs/exec.c                  |   52 ++-----------
 include/linux/mm.h         |    2 
 include/linux/page-flags.h |    5 +
 include/linux/rmap.h       |    1 
 mm/internal.h              |   32 ++++++++
 mm/memory.c                |   63 +++++++++++++++
 mm/mlock.c                 |  180 ++++++++++++++++++++++++++++++++++++++++++---
 mm/mmap.c                  |    6 +
 mm/mremap.c                |    8 +-
 mm/page_alloc.c            |    9 +-
 mm/rmap.c                  |   57 +++++++++-----
 mm/vmscan.c                |    5 +
 12 files changed, 339 insertions(+), 81 deletions(-)

Index: linux-2.6/mm/mlock.c
===================================================================
--- linux-2.6.orig/mm/mlock.c
+++ linux-2.6/mm/mlock.c
@@ -8,16 +8,199 @@
 #include <linux/capability.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 
+#include "internal.h"
+
+#define page_mlock_count(page)		(*(unsigned long *)&(page)->lru.next)
+#define set_page_mlock_count(page, v)	(page_mlock_count(page) = (v))
+#define inc_page_mlock_count(page)	(page_mlock_count(page)++)
+#define dec_page_mlock_count(page)	(page_mlock_count(page)--)
+
+/*
+ * A page's mlock_count is kept in page->lru.next as an unsigned long.
+ * Access to this count is serialised with the page lock (or, in the
+ * case of mlock_page, virtue that there are no other references to
+ * the page).
+ *
+ * mlock counts are incremented at mlock, mmap, mremap, and new anon page
+ * faults, and lazily via vmscan. Decremented at munlock, munmap, and exit.
+ * mlock is not inherited across fork or exec, so we're safe there.
+ *
+ * If PageMLock is set, then the page is removed from the LRU list, and
+ * has its refcount incremented. This increment prevents the page from being
+ * freed until the mlock_count is decremented to zero and PageMLock is cleared.
+ */
+
+/*
+ * Marks a page, belonging to the given mlocked vma, as mlocked.
+ *
+ * The page must be either locked or new, and must not be on the LRU.
+ */
+void set_page_mlock(struct page *page, unsigned long count)
+{
+	BUG_ON(PageLRU(page));
+	BUG_ON(PageMLock(page));
+	/* BUG_ON(!list_empty(&page->lru)); -- if we always did list_del_init */
+
+	SetPageMLock(page);
+	get_page(page);
+	set_page_mlock_count(page, count);
+}
+
+static void __clear_page_mlock(struct page *page)
+{
+	BUG_ON(!PageMLock(page));
+	BUG_ON(PageLRU(page));
+	BUG_ON(page_mlock_count(page));
+
+	ClearPageMLock(page);
+
+	/*
+	 * Could just do a put_page here, however this special
+	 * case prevents anonymous mlocked pages from being put
+	 * on the LRU, then removed from the LRU when we
+	 * put_page.
+	 *
+	 * It is a little bit naughty to open-code this here.
+	 */
+	if (put_page_testzero(page))
+		free_hot_page(page);
+	else
+		lru_cache_add_active(page);
+}
+
+/*
+ * Zero the page's mlock_count. This can be useful in a situation where
+ * we want to unconditionally remove a page from the pagecache.
+ *
+ * It is not illegal to call this function for any page, mlocked or not.
+ * If called for a page that is still mapped by mlocked vmas, all we do
+ * is revert to lazy LRU behaviour -- semantics are not broken.
+ */
+void clear_page_mlock(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (likely(!PageMLock(page)))
+		return;
+	BUG_ON(!page_mlock_count(page));
+	set_page_mlock_count(page, 0);
+	__clear_page_mlock(page);
+}
+
+static void inc_page_mlock(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (!PageMLock(page)) {
+		if (!isolate_lru_page(page)) {
+			SetPageMLock(page);
+			get_page(page);
+			set_page_mlock_count(page, 1);
+		}
+	} else if (PageMLock(page)) {
+		BUG_ON(!page_mlock_count(page));
+		inc_page_mlock_count(page);
+	}
+}
+
+static void dec_page_mlock(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (PageMLock(page)) {
+		BUG_ON(!page_mlock_count(page));
+		dec_page_mlock_count(page);
+		if (page_mlock_count(page) == 0)
+			__clear_page_mlock(page);
+	} /* else page was not able to be removed from the lru when mlocked */
+}
+
+/*
+ * Increment or decrement the mlock count for a range of pages in the vma
+ * depending on whether lock is 1 or 0, respectively.
+ *
+ * This takes care of making the pages present too.
+ *
+ * vma->vm_mm->mmap_sem must be held for read or write.
+ */
+void __mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end, int lock)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr = start;
+	struct page *pages[16]; /* 16 gives a reasonable batch */
+	int write = !!(vma->vm_flags & VM_WRITE);
+	int nr_pages;
+
+	BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK);
+
+	if (vma->vm_flags & VM_IO)
+		return;
+
+	nr_pages = (end - start) >> PAGE_SHIFT;
+
+	while (nr_pages > 0) {
+		int ret, i;
+
+		cond_resched();
+
+		/*
+		 * get_user_pages makes pages present if we are
+		 * setting mlock.
+		 */
+		ret = get_user_pages(current, mm, addr, nr_pages,
+				write, 0, pages, NULL);
+		if (ret < 0)
+			break;
+		if (ret == 0) {
+			/*
+			 * We know the vma is there, so the only time
+			 * we cannot get a single page should be an
+			 * error (ret < 0) case.
+			 */
+			WARN_ON(1);
+			ret = -EFAULT;
+			break;
+		}
+
+		for (i = 0; i < ret; i++) {
+			struct page *page = pages[i];
+			lock_page(page);
+			if (lock) {
+				/*
+				 * Anonymous pages may have already been
+				 * mlocked by get_user_pages->handle_mm_fault.
+				 * Be conservative and don't count these:
+				 * We can underestimate the mlock_count because
+				 * that will just cause the page to be added
+				 * to the lru then lazily removed again.
+				 * However if we overestimate the count, the
+				 * page will become unfreeable.
+				 */
+				if (vma->vm_file || !PageMLock(page))
+					inc_page_mlock(page);
+			} else
+				dec_page_mlock(page);
+			unlock_page(page);
+			put_page(page);
+
+			addr += PAGE_SIZE;
+			nr_pages--;
+		}
+	}
+}
 
 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	unsigned long start, unsigned long end, unsigned int newflags)
 {
-	struct mm_struct * mm = vma->vm_mm;
+	struct mm_struct *mm = vma->vm_mm;
 	pgoff_t pgoff;
-	int pages;
+	int nr_pages;
 	int ret = 0;
 
 	if (newflags == vma->vm_flags) {
@@ -51,21 +234,21 @@ success:
 	/*
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 * It's okay if try_to_unmap_one unmaps a page just after we
-	 * set VM_LOCKED, make_pages_present below will bring it back.
+	 * set VM_LOCKED, get_user_pages below will bring it back.
 	 */
 	vma->vm_flags = newflags;
 
 	/*
 	 * Keep track of amount of locked VM.
 	 */
-	pages = (end - start) >> PAGE_SHIFT;
-	if (newflags & VM_LOCKED) {
-		pages = -pages;
-		if (!(newflags & VM_IO))
-			ret = make_pages_present(start, end);
-	}
+	nr_pages = (end - start) >> PAGE_SHIFT;
+	if (newflags & VM_LOCKED)
+		nr_pages = -nr_pages;
+	mm->locked_vm -= nr_pages;
+
+	/* Mlock or unlock all pages in range, including anon pages */
+	__mlock_vma_pages_range(vma, start, end, !!(newflags & VM_LOCKED));
 
-	mm->locked_vm -= pages;
 out:
 	if (ret == -ENOMEM)
 		ret = -EAGAIN;
Index: linux-2.6/mm/internal.h
===================================================================
--- linux-2.6.orig/mm/internal.h
+++ linux-2.6/mm/internal.h
@@ -36,6 +36,36 @@ static inline void __put_page(struct pag
 
 extern int isolate_lru_page(struct page *page);
 
+extern void set_page_mlock(struct page *page, unsigned long count);
+extern void clear_page_mlock(struct page *page);
+
+static inline void mlock_new_page(struct page *page)
+{
+	set_page_mlock(page, 1);
+}
+
+extern void __mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end, int lock);
+
+/*
+ * mlock all pages in this vma range.
+ */
+static inline void mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	__mlock_vma_pages_range(vma, start, end, 1);
+}
+
+/*
+ * munlock pages.
+ */
+static inline void munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	__mlock_vma_pages_range(vma, start, end, 0);
+}
+
+
 extern void fastcall __init __free_pages_bootmem(struct page *page,
 						unsigned int order);
 
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -51,6 +51,8 @@
 
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 struct kmem_cache *anon_vma_cachep;
 
 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
@@ -301,6 +303,13 @@ static int page_referenced_one(struct pa
 	if (!pte)
 		goto out;
 
+	/*
+	 * Don't want to elevate mlocked refcount, in order that it
+	 * progresses to try_to_unmap and is removed from the LRU
+	 */
+	if (vma->vm_flags & VM_LOCKED)
+		goto out_unmap;
+
 	if (ptep_clear_flush_young(vma, address, pte))
 		referenced++;
 
@@ -310,6 +319,7 @@ static int page_referenced_one(struct pa
 			rwsem_is_locked(&mm->mmap_sem))
 		referenced++;
 
+out_unmap:
 	(*mapcount)--;
 	pte_unmap_unlock(pte, ptl);
 out:
@@ -381,11 +391,6 @@ static int page_referenced_file(struct p
 	mapcount = page_mapcount(page);
 
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
-		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
-				  == (VM_LOCKED|VM_MAYSHARE)) {
-			referenced++;
-			break;
-		}
 		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
 			break;
@@ -631,10 +636,15 @@ static int try_to_unmap_one(struct page 
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young(vma, address, pte)))) {
-		ret = SWAP_FAIL;
-		goto out_unmap;
+	if (!migration) {
+		if (vma->vm_flags & VM_LOCKED) {
+			ret = SWAP_MLOCK;
+			goto out_unmap;
+		}
+		if (ptep_clear_flush_young(vma, address, pte)) {
+			ret = SWAP_FAIL;
+			goto out_unmap;
+		}
 	}
 
 	/* Nuke the page table entry. */
@@ -716,6 +726,9 @@ out:
  * For very sparsely populated VMAs this is a little inefficient - chances are
  * there there won't be many ptes located within the scan cluster.  In this case
  * maybe we could scan further - to the end of the pte page, perhaps.
+ *
+ * Mlocked pages also aren't handled very well at the moment: they aren't
+ * moved off the LRU like they are for linear pages.
  */
 #define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
 #define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
@@ -787,7 +800,8 @@ static void try_to_unmap_cluster(unsigne
 	pte_unmap_unlock(pte - 1, ptl);
 }
 
-static int try_to_unmap_anon(struct page *page, int migration)
+static int try_to_unmap_anon(struct page *page, int migration,
+						unsigned int *mlocked)
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
@@ -801,6 +815,8 @@ static int try_to_unmap_anon(struct page
 		ret = try_to_unmap_one(page, vma, migration);
 		if (ret == SWAP_FAIL || !page_mapped(page))
 			break;
+		if (ret == SWAP_MLOCK)
+			(*mlocked)++;
 	}
 	spin_unlock(&anon_vma->lock);
 	return ret;
@@ -815,7 +831,8 @@ static int try_to_unmap_anon(struct page
  *
  * This function is only called from try_to_unmap for object-based pages.
  */
-static int try_to_unmap_file(struct page *page, int migration)
+static int try_to_unmap_file(struct page *page, int migration,
+						unsigned int *mlocked)
 {
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -832,6 +849,8 @@ static int try_to_unmap_file(struct page
 		ret = try_to_unmap_one(page, vma, migration);
 		if (ret == SWAP_FAIL || !page_mapped(page))
 			goto out;
+		if (ret == SWAP_MLOCK)
+			(*mlocked)++;
 	}
 
 	if (list_empty(&mapping->i_mmap_nonlinear))
@@ -839,7 +858,7 @@ static int try_to_unmap_file(struct page
 
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-		if ((vma->vm_flags & VM_LOCKED) && !migration)
+		if (!migration && (vma->vm_flags & VM_LOCKED))
 			continue;
 		cursor = (unsigned long) vma->vm_private_data;
 		if (cursor > max_nl_cursor)
@@ -873,8 +892,6 @@ static int try_to_unmap_file(struct page
 	do {
 		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-			if ((vma->vm_flags & VM_LOCKED) && !migration)
-				continue;
 			cursor = (unsigned long) vma->vm_private_data;
 			while ( cursor < max_nl_cursor &&
 				cursor < vma->vm_end - vma->vm_start) {
@@ -917,15 +934,19 @@ out:
 int try_to_unmap(struct page *page, int migration)
 {
 	int ret;
+	unsigned int mlocked = 0;
 
 	BUG_ON(!PageLocked(page));
 
 	if (PageAnon(page))
-		ret = try_to_unmap_anon(page, migration);
+		ret = try_to_unmap_anon(page, migration, &mlocked);
 	else
-		ret = try_to_unmap_file(page, migration);
-
-	if (!page_mapped(page))
+		ret = try_to_unmap_file(page, migration, &mlocked);
+	if (mlocked) {
+		/* Lazily move the page off the LRU list */
+		set_page_mlock(page, mlocked);
+		ret = SWAP_MLOCK;
+	} else if (!page_mapped(page))
 		ret = SWAP_SUCCESS;
 	return ret;
 }
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -30,6 +30,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
 
+#include "internal.h"
+
 #ifndef arch_mmap_check
 #define arch_mmap_check(addr, len, flags)	(0)
 #endif
@@ -1145,7 +1147,7 @@ out:	
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += len >> PAGE_SHIFT;
-		make_pages_present(addr, addr + len);
+		mlock_vma_pages_range(vma, addr, addr + len);
 	}
 	if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
 		make_pages_present(addr, addr + len);
@@ -1958,7 +1960,7 @@ out:
 	mm->total_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED) {
 		mm->locked_vm += len >> PAGE_SHIFT;
-		make_pages_present(addr, addr + len);
+		mlock_vma_pages_range(vma, addr, addr + len);
 	}
 	return addr;
 }
Index: linux-2.6/mm/mremap.c
===================================================================
--- linux-2.6.orig/mm/mremap.c
+++ linux-2.6/mm/mremap.c
@@ -23,6 +23,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -232,8 +234,8 @@ static unsigned long move_vma(struct vm_
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += new_len >> PAGE_SHIFT;
 		if (new_len > old_len)
-			make_pages_present(new_addr + old_len,
-					   new_addr + new_len);
+			mlock_vma_pages_range(vma, new_addr + old_len,
+						   new_addr + new_len);
 	}
 
 	return new_addr;
@@ -369,7 +371,7 @@ unsigned long do_mremap(unsigned long ad
 			vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
 			if (vma->vm_flags & VM_LOCKED) {
 				mm->locked_vm += pages;
-				make_pages_present(addr + old_len,
+				mlock_vma_pages_range(vma, addr + old_len,
 						   addr + new_len);
 			}
 			ret = addr;
Index: linux-2.6/include/linux/rmap.h
===================================================================
--- linux-2.6.orig/include/linux/rmap.h
+++ linux-2.6/include/linux/rmap.h
@@ -134,5 +134,6 @@ static inline int page_mkclean(struct pa
 #define SWAP_SUCCESS	0
 #define SWAP_AGAIN	1
 #define SWAP_FAIL	2
+#define SWAP_MLOCK	3
 
 #endif	/* _LINUX_RMAP_H */
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -60,6 +60,8 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
+#include "internal.h"
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -855,6 +857,9 @@ unsigned long unmap_vmas(struct mmu_gath
 
 			tlb_finish_mmu(*tlbp, tlb_start, start);
 
+			if (vma->vm_flags & VM_LOCKED)
+				munlock_vma_pages_range(vma, start, end);
+
 			if (need_resched() ||
 				(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
 				if (i_mmap_lock) {
@@ -1655,7 +1660,10 @@ gotten:
 		ptep_clear_flush(vma, address, page_table);
 		set_pte_at(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
-		lru_cache_add_active(new_page);
+		if (!(vma->vm_flags & VM_LOCKED))
+			lru_cache_add_active(new_page);
+		else
+			mlock_new_page(new_page);
 		page_add_new_anon_rmap(new_page, vma, address);
 
 		/* Free the old page.. */
@@ -2119,6 +2127,49 @@ out_nomap:
 }
 
 /*
+ * This routine is used to map in an anonymous page into an address space:
+ * needed by execve() for the initial stack and environment pages.
+ *
+ * vma->vm_mm->mmap_sem must be held.
+ *
+ * Returns 0 on success, otherwise the failure code.
+ *
+ * The routine consumes the reference on the page if it is successful,
+ * otherwise the caller must free it.
+ */
+int install_new_anon_page(struct vm_area_struct *vma,
+			struct page *page, unsigned long address)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t * pte;
+	spinlock_t *ptl;
+
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
+
+	flush_dcache_page(page);
+	pte = get_locked_pte(mm, address, &ptl);
+	if (!pte)
+		return -ENOMEM;
+	if (!pte_none(*pte)) {
+		pte_unmap_unlock(pte, ptl);
+		return -EEXIST;
+	}
+	inc_mm_counter(mm, anon_rss);
+	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
+					page, vma->vm_page_prot))));
+	if (!(vma->vm_flags & VM_LOCKED))
+		lru_cache_add_active(page);
+	else
+		mlock_new_page(page);
+	page_add_new_anon_rmap(page, vma, address);
+	pte_unmap_unlock(pte, ptl);
+
+	/* no need for flush_tlb */
+	return 0;
+}
+
+/*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2148,7 +2199,10 @@ static int do_anonymous_page(struct mm_s
 		if (!pte_none(*page_table))
 			goto release;
 		inc_mm_counter(mm, anon_rss);
-		lru_cache_add_active(page);
+		if (!(vma->vm_flags & VM_LOCKED))
+			lru_cache_add_active(page);
+		else
+			mlock_new_page(page);
 		page_add_new_anon_rmap(page, vma, address);
 	} else {
 		/* Map the ZERO_PAGE - vm_page_prot is readonly */
@@ -2291,7 +2345,10 @@ static int __do_fault(struct mm_struct *
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
                         inc_mm_counter(mm, anon_rss);
-                        lru_cache_add_active(page);
+			if (!(vma->vm_flags & VM_LOCKED))
+				lru_cache_add_active(page);
+			else
+				mlock_new_page(page);
                         page_add_new_anon_rmap(page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c
+++ linux-2.6/mm/vmscan.c
@@ -512,6 +512,8 @@ static unsigned long shrink_page_list(st
 				goto activate_locked;
 			case SWAP_AGAIN:
 				goto keep_locked;
+			case SWAP_MLOCK:
+				goto mlocked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
@@ -594,6 +596,9 @@ keep_locked:
 keep:
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON(PageLRU(page));
+		continue;
+mlocked:
+		unlock_page(page);
 	}
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -91,6 +91,7 @@
 #define PG_nosave_free		18	/* Used for system suspend/resume */
 #define PG_buddy		19	/* Page is free, on buddy lists */
 
+#define PG_mlock		20	/* Page has mlocked vmas */
 
 #if (BITS_PER_LONG > 32)
 /*
@@ -247,6 +248,10 @@ static inline void SetPageUptodate(struc
 #define PageSwapCache(page)	0
 #endif
 
+#define PageMLock(page)		test_bit(PG_mlock, &(page)->flags)
+#define SetPageMLock(page)	set_bit(PG_mlock, &(page)->flags)
+#define ClearPageMLock(page)	clear_bit(PG_mlock, &(page)->flags)
+
 #define PageUncached(page)	test_bit(PG_uncached, &(page)->flags)
 #define SetPageUncached(page)	set_bit(PG_uncached, &(page)->flags)
 #define ClearPageUncached(page)	clear_bit(PG_uncached, &(page)->flags)
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -203,7 +203,8 @@ static void bad_page(struct page *page)
 			1 << PG_slab    |
 			1 << PG_swapcache |
 			1 << PG_writeback |
-			1 << PG_buddy );
+			1 << PG_buddy |
+			1 << PG_mlock );
 	set_page_count(page, 0);
 	reset_page_mapcount(page);
 	page->mapping = NULL;
@@ -438,7 +439,8 @@ static inline int free_pages_check(struc
 			1 << PG_swapcache |
 			1 << PG_writeback |
 			1 << PG_reserved |
-			1 << PG_buddy ))))
+			1 << PG_buddy |
+			1 << PG_mlock ))))
 		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
@@ -588,7 +590,8 @@ static int prep_new_page(struct page *pa
 			1 << PG_swapcache |
 			1 << PG_writeback |
 			1 << PG_reserved |
-			1 << PG_buddy ))))
+			1 << PG_buddy |
+			1 << PG_mlock ))))
 		bad_page(page);
 
 	/*
Index: linux-2.6/fs/exec.c
===================================================================
--- linux-2.6.orig/fs/exec.c
+++ linux-2.6/fs/exec.c
@@ -297,44 +297,6 @@ int copy_strings_kernel(int argc,char **
 EXPORT_SYMBOL(copy_strings_kernel);
 
 #ifdef CONFIG_MMU
-/*
- * This routine is used to map in a page into an address space: needed by
- * execve() for the initial stack and environment pages.
- *
- * vma->vm_mm->mmap_sem is held for writing.
- */
-void install_arg_page(struct vm_area_struct *vma,
-			struct page *page, unsigned long address)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	pte_t * pte;
-	spinlock_t *ptl;
-
-	if (unlikely(anon_vma_prepare(vma)))
-		goto out;
-
-	flush_dcache_page(page);
-	pte = get_locked_pte(mm, address, &ptl);
-	if (!pte)
-		goto out;
-	if (!pte_none(*pte)) {
-		pte_unmap_unlock(pte, ptl);
-		goto out;
-	}
-	inc_mm_counter(mm, anon_rss);
-	lru_cache_add_active(page);
-	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
-					page, vma->vm_page_prot))));
-	page_add_new_anon_rmap(page, vma, address);
-	pte_unmap_unlock(pte, ptl);
-
-	/* no need for flush_tlb */
-	return;
-out:
-	__free_page(page);
-	force_sig(SIGKILL, current);
-}
-
 #define EXTRA_STACK_VM_PAGES	20	/* random */
 
 int setup_arg_pages(struct linux_binprm *bprm,
@@ -438,17 +400,25 @@ int setup_arg_pages(struct linux_binprm 
 		mm->stack_vm = mm->total_vm = vma_pages(mpnt);
 	}
 
+	ret = 0;
 	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
 		struct page *page = bprm->page[i];
 		if (page) {
 			bprm->page[i] = NULL;
-			install_arg_page(mpnt, page, stack_base);
+			if (!ret)
+				ret = install_new_anon_page(mpnt, page,
+								stack_base);
+			if (ret)
+				put_page(page);
 		}
 		stack_base += PAGE_SIZE;
 	}
 	up_write(&mm->mmap_sem);
-	
-	return 0;
+
+	if (ret)
+		do_munmap(mm, mpnt->vm_start, mpnt->vm_start - mpnt->vm_end);
+
+	return ret;
 }
 
 EXPORT_SYMBOL(setup_arg_pages);
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -791,7 +791,7 @@ static inline int handle_mm_fault(struct
 
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
-void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
+int install_new_anon_page(struct vm_area_struct *, struct page *, unsigned long);
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -2179,8 +2179,16 @@ generic_file_direct_IO(int rw, struct ki
 	 */
 	if (rw == WRITE) {
 		write_len = iov_length(iov, nr_segs);
-	       	if (mapping_mapped(mapping))
+	       	if (mapping_mapped(mapping)) {
+			/*
+			 * Calling unmap_mapping_range like this is wrong,
+			 * because it can lead to mlocked pages being
+			 * discarded (this is true even before the PageMLock
+			 * work). direct-IO vs pagecache is a load of junk
+			 * anyway, so who cares.
+			 */
 			unmap_mapping_range(mapping, offset, write_len, 0);
+		}
 	}
 
 	retval = filemap_write_and_wait(mapping);
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c
+++ linux-2.6/mm/truncate.c
@@ -16,6 +16,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
 				   do_invalidatepage */
+#include "internal.h"
 
 
 /**
@@ -99,6 +100,7 @@ truncate_complete_page(struct address_sp
 	if (PagePrivate(page))
 		do_invalidatepage(page, 0);
 
+	clear_page_mlock(page);
 	ClearPageUptodate(page);
 	ClearPageMappedToDisk(page);
 	remove_from_page_cache(page);
@@ -124,6 +126,7 @@ invalidate_complete_page(struct address_
 	if (PagePrivate(page) && !try_to_release_page(page, 0))
 		return 0;
 
+	clear_page_mlock(page);
 	ret = remove_mapping(mapping, page);
 
 	return ret;
@@ -342,6 +345,7 @@ invalidate_complete_page2(struct address
 	if (PageDirty(page))
 		goto failed;
 
+	clear_page_mlock(page);
 	BUG_ON(PagePrivate(page));
 	__remove_from_page_cache(page);
 	write_unlock_irq(&mapping->tree_lock);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-05 16:17 [rfc][patch 2/2] mm: mlocked pages off LRU Nick Piggin
@ 2007-03-05 16:40 ` Nick Piggin
  2007-03-05 17:12 ` Christoph Hellwig
  2007-03-05 18:14 ` Christoph Lameter
  2 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-05 16:40 UTC (permalink / raw)
  To: Linux Memory Management List, Christoph Lameter
  Cc: Andrew Morton, Christoph Hellwig

On Mon, Mar 05, 2007 at 05:17:46PM +0100, Nick Piggin wrote:
> Index: linux-2.6/mm/memory.c
> ===================================================================
> --- linux-2.6.orig/mm/memory.c
> +++ linux-2.6/mm/memory.c
> @@ -60,6 +60,8 @@
>  #include <linux/swapops.h>
>  #include <linux/elf.h>
>  
> +#include "internal.h"
> +
>  #ifndef CONFIG_NEED_MULTIPLE_NODES
>  /* use the per-pgdat data instead for discontigmem - mbligh */
>  unsigned long max_mapnr;
> @@ -855,6 +857,9 @@ unsigned long unmap_vmas(struct mmu_gath
>  
>  			tlb_finish_mmu(*tlbp, tlb_start, start);
>  
> +			if (vma->vm_flags & VM_LOCKED)
> +				munlock_vma_pages_range(vma, start, end);
> +
>  			if (need_resched() ||
>  				(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
>  				if (i_mmap_lock) {


Argh, I missed fixing this. It is only supposed to munlock if i_mmap_lock
is not set (because munlock requires taking the page lock).

Those paths which do take i_mmap_lock here (unmap_mapping...) already do
their own handling of mlocked.

There are probably other bugs in my patchset, but this was the obvious
one.

BTW. anything that invalidates pagecache breaks mlock I think (not with
my patch but in general). I'll have to fix this...

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-05 16:17 [rfc][patch 2/2] mm: mlocked pages off LRU Nick Piggin
  2007-03-05 16:40 ` Nick Piggin
@ 2007-03-05 17:12 ` Christoph Hellwig
  2007-03-05 18:17   ` Christoph Lameter
  2007-03-05 18:14 ` Christoph Lameter
  2 siblings, 1 reply; 19+ messages in thread
From: Christoph Hellwig @ 2007-03-05 17:12 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Linux Memory Management List, Christoph Lameter, Andrew Morton,
	Christoph Hellwig

On Mon, Mar 05, 2007 at 05:17:46PM +0100, Nick Piggin wrote:
> +#include "internal.h"
> +
> +#define page_mlock_count(page)		(*(unsigned long *)&(page)->lru.next)
> +#define set_page_mlock_count(page, v)	(page_mlock_count(page) = (v))
> +#define inc_page_mlock_count(page)	(page_mlock_count(page)++)
> +#define dec_page_mlock_count(page)	(page_mlock_count(page)--)

Now that we've dropped support for old gccs this would be a lot using
anonymous unions.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-05 16:17 [rfc][patch 2/2] mm: mlocked pages off LRU Nick Piggin
  2007-03-05 16:40 ` Nick Piggin
  2007-03-05 17:12 ` Christoph Hellwig
@ 2007-03-05 18:14 ` Christoph Lameter
  2007-03-05 19:26   ` Rik van Riel
  2007-03-06  1:05   ` Nick Piggin
  2 siblings, 2 replies; 19+ messages in thread
From: Christoph Lameter @ 2007-03-05 18:14 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Linux Memory Management List, Andrew Morton, Christoph Hellwig

On Mon, 5 Mar 2007, Nick Piggin wrote:

> - PageMLock explicitly elevates the page's refcount, so PageMLock pages
>   don't ever get freed (thus requires less awareness in the rest of mm).

Which breaks page migration for mlocked pages.

I think there is still some thinking going on about also removing 
anonymous pages off the LRU if we are out of swap or have no swap. In 
that case we may need page->lru to track these pages so that they can be 
fed back to the LRU when swap is added later.

I was a bit hesitant to use an additional ref counter because we are here 
overloading a refcounter on a LRU field? I have a bad feeling here. There 
are possible race conditions and it seems that earlier approaches failed 
to address those.

> +static void inc_page_mlock(struct page *page)
> +{
> +	BUG_ON(!PageLocked(page));
> +
> +	if (!PageMLock(page)) {
> +		if (!isolate_lru_page(page)) {
> +			SetPageMLock(page);
> +			get_page(page);
> +			set_page_mlock_count(page, 1);
> +		}
> +	} else if (PageMLock(page)) {

You already checked for !PageMlock so PageMlock is true.

> -	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
> -			(ptep_clear_flush_young(vma, address, pte)))) {
> -		ret = SWAP_FAIL;
> -		goto out_unmap;
> +	if (!migration) {
> +		if (vma->vm_flags & VM_LOCKED) {
> +			ret = SWAP_MLOCK;
> +			goto out_unmap;
> +		}
> +		if (ptep_clear_flush_young(vma, address, pte)) {
> +			ret = SWAP_FAIL;
> +			goto out_unmap;
> +		}

Ok you basically keep the first patch of my set. Maybe include that 
explicitly ?

>  /*
> + * This routine is used to map in an anonymous page into an address space:
> + * needed by execve() for the initial stack and environment pages.

Could we have some common code that also covers do_anonymous page etc?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-05 17:12 ` Christoph Hellwig
@ 2007-03-05 18:17   ` Christoph Lameter
  0 siblings, 0 replies; 19+ messages in thread
From: Christoph Lameter @ 2007-03-05 18:17 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Nick Piggin, Linux Memory Management List, Andrew Morton

On Mon, 5 Mar 2007, Christoph Hellwig wrote:

> Now that we've dropped support for old gccs this would be a lot using
> anonymous unions.

Yup that would be a nice cleanup. SLUB also heavily overloads the 
page_struct. Maybe we need to have some guidelines first on how to avoid 
utter chaos in mm_types.h?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-05 18:14 ` Christoph Lameter
@ 2007-03-05 19:26   ` Rik van Riel
  2007-03-06  1:05   ` Nick Piggin
  1 sibling, 0 replies; 19+ messages in thread
From: Rik van Riel @ 2007-03-05 19:26 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Linux Memory Management List, Andrew Morton,
	Christoph Hellwig

Christoph Lameter wrote:

>>  /*
>> + * This routine is used to map in an anonymous page into an address space:
>> + * needed by execve() for the initial stack and environment pages.
> 
> Could we have some common code that also covers do_anonymous page etc?

It would be good to cover ramfs, too.

(unless it already does some magic that I overlooked)

-- 
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-05 18:14 ` Christoph Lameter
  2007-03-05 19:26   ` Rik van Riel
@ 2007-03-06  1:05   ` Nick Piggin
  2007-03-06  1:27     ` Christoph Lameter
  1 sibling, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2007-03-06  1:05 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Linux Memory Management List, Andrew Morton, Christoph Hellwig

On Mon, Mar 05, 2007 at 10:14:58AM -0800, Christoph Lameter wrote:
> On Mon, 5 Mar 2007, Nick Piggin wrote:
> 
> > - PageMLock explicitly elevates the page's refcount, so PageMLock pages
> >   don't ever get freed (thus requires less awareness in the rest of mm).
> 
> Which breaks page migration for mlocked pages.

Yeah, the simple way to fix migration is to just clear_page_mlock those
pages so they'll lazily be mlocked again. However we could probably do
something fancier like transferring the PG_mlock bit and the mlock_count.

> I think there is still some thinking going on about also removing 
> anonymous pages off the LRU if we are out of swap or have no swap. In 
> that case we may need page->lru to track these pages so that they can be 
> fed back to the LRU when swap is added later.

That's OK: they won't get mlocked if they are not on the LRU (and won't
get taken off the LRU if they are mlocked).

> I was a bit hesitant to use an additional ref counter because we are here 
> overloading a refcounter on a LRU field? I have a bad feeling here. There 

If we ensure !PageLRU then we can use the lru field. I don't see
a problem.

> are possible race conditions and it seems that earlier approaches failed 
> to address those.

What are they?

> 
> > +static void inc_page_mlock(struct page *page)
> > +{
> > +	BUG_ON(!PageLocked(page));
> > +
> > +	if (!PageMLock(page)) {
> > +		if (!isolate_lru_page(page)) {
> > +			SetPageMLock(page);
> > +			get_page(page);
> > +			set_page_mlock_count(page, 1);
> > +		}
> > +	} else if (PageMLock(page)) {
> 
> You already checked for !PageMlock so PageMlock is true.

Thanks.

> > -	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
> > -			(ptep_clear_flush_young(vma, address, pte)))) {
> > -		ret = SWAP_FAIL;
> > -		goto out_unmap;
> > +	if (!migration) {
> > +		if (vma->vm_flags & VM_LOCKED) {
> > +			ret = SWAP_MLOCK;
> > +			goto out_unmap;
> > +		}
> > +		if (ptep_clear_flush_young(vma, address, pte)) {
> > +			ret = SWAP_FAIL;
> > +			goto out_unmap;
> > +		}
> 
> Ok you basically keep the first patch of my set. Maybe include that 
> explicitly ?

It is a bit different. I don't want to break out as soon as it hits
an mlocked vma, in order to be able to count up all mlocked vmas and
set the correct mlock_count.

Actually there is a race here, because a subsequent munlock could
cause the mlock state to be incorrect. I'll have to fix that.

It looks like your patches suffer from the same race?

> >  /*
> > + * This routine is used to map in an anonymous page into an address space:
> > + * needed by execve() for the initial stack and environment pages.
> 
> Could we have some common code that also covers do_anonymous page etc?

That could be possible, yes. I'd like Hugh to ack that sort of thing.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-06  1:05   ` Nick Piggin
@ 2007-03-06  1:27     ` Christoph Lameter
  2007-03-06  1:44       ` Nick Piggin
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Lameter @ 2007-03-06  1:27 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Lameter, Linux Memory Management List, Andrew Morton,
	Christoph Hellwig

On Tue, 6 Mar 2007, Nick Piggin wrote:

> > Which breaks page migration for mlocked pages.
> 
> Yeah, the simple way to fix migration is to just clear_page_mlock those
> pages so they'll lazily be mlocked again. However we could probably do
> something fancier like transferring the PG_mlock bit and the mlock_count.

That will also drop the page count.

> > I think there is still some thinking going on about also removing 
> > anonymous pages off the LRU if we are out of swap or have no swap. In 
> > that case we may need page->lru to track these pages so that they can be 
> > fed back to the LRU when swap is added later.
> 
> That's OK: they won't get mlocked if they are not on the LRU (and won't
> get taken off the LRU if they are mlocked).

But we may want to keep them off the LRU.

> > I was a bit hesitant to use an additional ref counter because we are here 
> > overloading a refcounter on a LRU field? I have a bad feeling here. There 
> 
> If we ensure !PageLRU then we can use the lru field. I don't see
> a problem.

Wrong. !PageLRU means that the page may be on some other list. Like the 
vmscan pagelist and the page migration list. You can only be sure that it
is not on those lists if a function took the page off the LRU. If you then 
mark it PageMlocked then you may be sure that the LRU field is free for 
use.

> > Ok you basically keep the first patch of my set. Maybe include that 
> > explicitly ?
> 
> It is a bit different. I don't want to break out as soon as it hits
> an mlocked vma, in order to be able to count up all mlocked vmas and
> set the correct mlock_count.

?? The first patch just adds a new exist code to try_to_unmap.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-06  1:27     ` Christoph Lameter
@ 2007-03-06  1:44       ` Nick Piggin
  2007-03-06  1:55         ` Christoph Lameter
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2007-03-06  1:44 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Christoph Lameter, Linux Memory Management List, Andrew Morton,
	Christoph Hellwig

On Mon, Mar 05, 2007 at 05:27:37PM -0800, Christoph Lameter wrote:
> On Tue, 6 Mar 2007, Nick Piggin wrote:
> 
> > > Which breaks page migration for mlocked pages.
> > 
> > Yeah, the simple way to fix migration is to just clear_page_mlock those
> > pages so they'll lazily be mlocked again. However we could probably do
> > something fancier like transferring the PG_mlock bit and the mlock_count.
> 
> That will also drop the page count.

?? You _want_ to drop the page count so that migration will work.

> > > I think there is still some thinking going on about also removing 
> > > anonymous pages off the LRU if we are out of swap or have no swap. In 
> > > that case we may need page->lru to track these pages so that they can be 
> > > fed back to the LRU when swap is added later.
> > 
> > That's OK: they won't get mlocked if they are not on the LRU (and won't
> > get taken off the LRU if they are mlocked).
> 
> But we may want to keep them off the LRU.

They will be. Either by mlock or by the !swap condition.

> > > I was a bit hesitant to use an additional ref counter because we are here 
> > > overloading a refcounter on a LRU field? I have a bad feeling here. There 
> > 
> > If we ensure !PageLRU then we can use the lru field. I don't see
> > a problem.
> 
> Wrong. !PageLRU means that the page may be on some other list. Like the 
> vmscan pagelist and the page migration list. You can only be sure that it
> is not on those lists if a function took the page off the LRU. If you then 
> mark it PageMlocked then you may be sure that the LRU field is free for 
> use.

Bad wording: by "if we ensure !PageLRU" I meant "if we take the page off
the LRU ourselves". Why do you have a bad feeling about this? As you
say, vmscan and page migration do exactly the same thing and it is a
fundamental way that the lru mechanism works.

> > > Ok you basically keep the first patch of my set. Maybe include that 
> > > explicitly ?
> > 
> > It is a bit different. I don't want to break out as soon as it hits
> > an mlocked vma, in order to be able to count up all mlocked vmas and
> > set the correct mlock_count.
> 
> ?? The first patch just adds a new exist code to try_to_unmap.

Well I will probably break my patch out into several bits if/when it
is ready to merge. Not such a big deal at present though.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-06  1:44       ` Nick Piggin
@ 2007-03-06  1:55         ` Christoph Lameter
  2007-03-06  2:13           ` Nick Piggin
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Lameter @ 2007-03-06  1:55 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Lameter, Linux Memory Management List, Andrew Morton,
	Christoph Hellwig

On Tue, 6 Mar 2007, Nick Piggin wrote:

> > > > I think there is still some thinking going on about also removing 
> > > > anonymous pages off the LRU if we are out of swap or have no swap. In 
> > > > that case we may need page->lru to track these pages so that they can be 
> > > > fed back to the LRU when swap is added later.
> > > 
> > > That's OK: they won't get mlocked if they are not on the LRU (and won't
> > > get taken off the LRU if they are mlocked).
> > 
> > But we may want to keep them off the LRU.
> 
> They will be. Either by mlock or by the !swap condition.

The above is a bit contradictory. Assuming they are taken off the LRU:
How will they be returned to the LRU?

> > Wrong. !PageLRU means that the page may be on some other list. Like the 
> > vmscan pagelist and the page migration list. You can only be sure that it
> > is not on those lists if a function took the page off the LRU. If you then 
> > mark it PageMlocked then you may be sure that the LRU field is free for 
> > use.
> 
> Bad wording: by "if we ensure !PageLRU" I meant "if we take the page off
> the LRU ourselves". Why do you have a bad feeling about this? As you
> say, vmscan and page migration do exactly the same thing and it is a
> fundamental way that the lru mechanism works.

Refcounts are generally there to be updated in a racy way and it seems 
here that the refcount variable itself can only exist under certain 
conditions. If you can handle that cleanly then we are okay.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-06  1:55         ` Christoph Lameter
@ 2007-03-06  2:13           ` Nick Piggin
  2007-03-06  2:46             ` Christoph Lameter
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2007-03-06  2:13 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Christoph Lameter, Linux Memory Management List, Andrew Morton,
	Christoph Hellwig

On Mon, Mar 05, 2007 at 05:55:57PM -0800, Christoph Lameter wrote:
> On Tue, 6 Mar 2007, Nick Piggin wrote:
> 
> > > > > I think there is still some thinking going on about also removing 
> > > > > anonymous pages off the LRU if we are out of swap or have no swap. In 
> > > > > that case we may need page->lru to track these pages so that they can be 
> > > > > fed back to the LRU when swap is added later.
> > > > 
> > > > That's OK: they won't get mlocked if they are not on the LRU (and won't
> > > > get taken off the LRU if they are mlocked).
> > > 
> > > But we may want to keep them off the LRU.
> > 
> > They will be. Either by mlock or by the !swap condition.
> 
> The above is a bit contradictory. Assuming they are taken off the LRU:
> How will they be returned to the LRU?

In what way is it contradictory? If they are mlocked, we put them on the
LRU when they get munlocked. If they are off the LRU due to a !swap condition,
then we put them back on the LRU by whatever mechanism that uses (eg. a
3rd LRU list that we go through much more slowly...).

If they get munlocked and put back on the LRU when there is no swap, then
presumably the !swap condition handling will lazily take care of them.


> > > Wrong. !PageLRU means that the page may be on some other list. Like the 
> > > vmscan pagelist and the page migration list. You can only be sure that it
> > > is not on those lists if a function took the page off the LRU. If you then 
> > > mark it PageMlocked then you may be sure that the LRU field is free for 
> > > use.
> > 
> > Bad wording: by "if we ensure !PageLRU" I meant "if we take the page off
> > the LRU ourselves". Why do you have a bad feeling about this? As you
> > say, vmscan and page migration do exactly the same thing and it is a
> > fundamental way that the lru mechanism works.
> 
> Refcounts are generally there to be updated in a racy way and it seems 
> here that the refcount variable itself can only exist under certain 
> conditions. If you can handle that cleanly then we are okay.

Well it's there in the code. I don't know if you consider my way of
handling it clean or not... It isn't a racy refcount, just a conservative
count of mlocked vmas, which is protected by PG_locked. PG_mlock is also
protected by PG_locked, so it is easy to get the mlock_count when the page
is locked.

I did put a little bit of thought into this ;)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-06  2:13           ` Nick Piggin
@ 2007-03-06  2:46             ` Christoph Lameter
  2007-03-06  2:50               ` Nick Piggin
  2007-03-06 15:59               ` Rik van Riel
  0 siblings, 2 replies; 19+ messages in thread
From: Christoph Lameter @ 2007-03-06  2:46 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Lameter, Linux Memory Management List, Andrew Morton,
	Christoph Hellwig

On Tue, 6 Mar 2007, Nick Piggin wrote:

> > The above is a bit contradictory. Assuming they are taken off the LRU:
> > How will they be returned to the LRU?
> 
> In what way is it contradictory? If they are mlocked, we put them on the
> LRU when they get munlocked. If they are off the LRU due to a !swap condition,
> then we put them back on the LRU by whatever mechanism that uses (eg. a
> 3rd LRU list that we go through much more slowly...).

Ok how are we going to implement the 3rd LRU for non mlocked anonymous 
pages if you use the lru for the refcounter field? Another page flags bit? 
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-06  2:46             ` Christoph Lameter
@ 2007-03-06  2:50               ` Nick Piggin
  2007-03-06 14:30                 ` Nick Piggin
  2007-03-06 15:59               ` Rik van Riel
  1 sibling, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2007-03-06  2:50 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Christoph Lameter, Linux Memory Management List, Andrew Morton,
	Christoph Hellwig

On Mon, Mar 05, 2007 at 06:46:31PM -0800, Christoph Lameter wrote:
> On Tue, 6 Mar 2007, Nick Piggin wrote:
> 
> > > The above is a bit contradictory. Assuming they are taken off the LRU:
> > > How will they be returned to the LRU?
> > 
> > In what way is it contradictory? If they are mlocked, we put them on the
> > LRU when they get munlocked. If they are off the LRU due to a !swap condition,
> > then we put them back on the LRU by whatever mechanism that uses (eg. a
> > 3rd LRU list that we go through much more slowly...).
> 
> Ok how are we going to implement the 3rd LRU for non mlocked anonymous 
> pages if you use the lru for the refcounter field? Another page flags bit? 

I'll implement it exactly the same way as I would if *not* using the lru
fields privately for !PageLRU && PageMLock pages.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-06  2:50               ` Nick Piggin
@ 2007-03-06 14:30                 ` Nick Piggin
  2007-03-06 18:30                   ` Christoph Lameter
  2007-03-06 22:23                   ` Lee Schermerhorn
  0 siblings, 2 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-06 14:30 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Christoph Lameter, Linux Memory Management List, Andrew Morton,
	Christoph Hellwig

New core patch. This one is actually tested and works, and you can see
the mlocked pages being accounted.

Same basic idea. Too many fixes and changes to list. Haven't taken up
Christoph's idea to do a union in struct page, but it could be a followup.

Most importantly (aside from crashes and obvious bugs), it should correctly
synchronise munlock vs vmscan lazy mlock now. Before this, it was possible
to have pages leak. This took me a bit of thinking to get right, but was
rather simple in the end.

Memory migration should work now, too, but not tested.

What do people think? Yes? No?

--

Index: linux-2.6/mm/mlock.c
===================================================================
--- linux-2.6.orig/mm/mlock.c
+++ linux-2.6/mm/mlock.c
@@ -8,17 +8,204 @@
 #include <linux/capability.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 
+#include "internal.h"
+
+#define page_mlock_count(page)		(*(unsigned long *)&(page)->lru.next)
+#define set_page_mlock_count(page, v)	(page_mlock_count(page) = (v))
+#define inc_page_mlock_count(page)	(page_mlock_count(page)++)
+#define dec_page_mlock_count(page)	(page_mlock_count(page)--)
+
+/*
+ * A page's mlock_count is kept in page->lru.next as an unsigned long.
+ * Access to this count is serialised with the page lock (or, in the
+ * case of mlock_page, virtue that there are no other references to
+ * the page).
+ *
+ * mlock counts are incremented at mlock, mmap, mremap, and new anon page
+ * faults, and lazily via vmscan. Decremented at munlock, munmap, and exit.
+ * mlock is not inherited across fork or exec, so we're safe there.
+ *
+ * If PageMLock is set, then the page is removed from the LRU list, and
+ * has its refcount incremented. This increment prevents the page from being
+ * freed until the mlock_count is decremented to zero and PageMLock is cleared.
+ *
+ * When lazy incrementing via vmscan, it is important to ensure that the
+ * vma's VM_LOCKED status is not concurrently being modified, otherwise we
+ * may have elevated mlock_count of a page that is being munlocked. So lazy
+ * mlocked must take the mmap_sem for read, and verify that the vma really
+ * is locked (see mm/rmap.c).
+ */
+
+/*
+ * Marks a page, belonging to the given mlocked vma, as mlocked.
+ *
+ * The page must be either locked or new, and must not be on the LRU.
+ */
+static void __set_page_mlock(struct page *page)
+{
+	BUG_ON(PageLRU(page));
+	BUG_ON(PageMLock(page));
+	/* BUG_ON(!list_empty(&page->lru)); -- if we always did list_del_init */
+
+	SetPageMLock(page);
+	get_page(page);
+	inc_zone_page_state(page, NR_MLOCK);
+	set_page_mlock_count(page, 1);
+}
+
+static void __clear_page_mlock(struct page *page)
+{
+	BUG_ON(!PageMLock(page));
+	BUG_ON(PageLRU(page));
+	BUG_ON(page_mlock_count(page));
+
+	dec_zone_page_state(page, NR_MLOCK);
+	ClearPageMLock(page);
+	lru_cache_add_active(page);
+	put_page(page);
+}
+
+/*
+ * Zero the page's mlock_count. This can be useful in a situation where
+ * we want to unconditionally remove a page from the pagecache.
+ *
+ * It is not illegal to call this function for any page, mlocked or not.
+ * If called for a page that is still mapped by mlocked vmas, all we do
+ * is revert to lazy LRU behaviour -- semantics are not broken.
+ */
+void clear_page_mlock(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (likely(!PageMLock(page)))
+		return;
+	BUG_ON(!page_mlock_count(page));
+	set_page_mlock_count(page, 0);
+	__clear_page_mlock(page);
+}
+
+void mlock_vma_page(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (!PageMLock(page)) {
+		if (!isolate_lru_page(page)) {
+			if (PageActive(page))
+				ClearPageActive(page);
+			__set_page_mlock(page);
+		}
+	} else {
+		BUG_ON(!page_mlock_count(page));
+		inc_page_mlock_count(page);
+	}
+}
+
+void mlock_new_vma_page(struct page *page)
+{
+	__set_page_mlock(page);
+}
+
+static void munlock_vma_page(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (PageMLock(page)) {
+		BUG_ON(!page_mlock_count(page));
+		dec_page_mlock_count(page);
+		if (page_mlock_count(page) == 0)
+			__clear_page_mlock(page);
+	} /* else page was not able to be removed from the lru when mlocked */
+}
+
+/*
+ * Increment or decrement the mlock count for a range of pages in the vma
+ * depending on whether lock is 1 or 0, respectively.
+ *
+ * This takes care of making the pages present too.
+ *
+ * vma->vm_mm->mmap_sem must be held for write.
+ */
+void __mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end, int lock)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr = start;
+	struct page *pages[16]; /* 16 gives a reasonable batch */
+	int write = !!(vma->vm_flags & VM_WRITE);
+	int nr_pages;
+
+	BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK);
+
+	if (vma->vm_flags & VM_IO)
+		return;
+
+	nr_pages = (end - start) / PAGE_SIZE;
+
+	while (nr_pages > 0) {
+		int ret, i;
+
+		cond_resched();
+
+		/*
+		 * get_user_pages makes pages present if we are
+		 * setting mlock.
+		 */
+		ret = get_user_pages(current, mm, addr,
+				min_t(int, nr_pages, ARRAY_SIZE(pages)),
+				write, 0, pages, NULL);
+		if (ret < 0)
+			break;
+		if (ret == 0) {
+			/*
+			 * We know the vma is there, so the only time
+			 * we cannot get a single page should be an
+			 * error (ret < 0) case.
+			 */
+			WARN_ON(1);
+			ret = -EFAULT;
+			break;
+		}
+
+		for (i = 0; i < ret; i++) {
+			struct page *page = pages[i];
+			lock_page(page);
+			if (lock) {
+				/*
+				 * Anonymous pages may have already been
+				 * mlocked by get_user_pages->handle_mm_fault.
+				 * Be conservative and don't count these:
+				 * We can underestimate the mlock_count because
+				 * that will just cause the page to be added
+				 * to the lru then lazily removed again.
+				 * However if we overestimate the count, the
+				 * page will become unfreeable.
+				 */
+				if (vma->vm_file || !PageMLock(page))
+					mlock_vma_page(page);
+			} else
+				munlock_vma_page(page);
+			unlock_page(page);
+			put_page(page);
+
+			addr += PAGE_SIZE;
+			nr_pages--;
+		}
+	}
+}
 
 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	unsigned long start, unsigned long end, unsigned int newflags)
 {
-	struct mm_struct * mm = vma->vm_mm;
+	struct mm_struct *mm = vma->vm_mm;
 	pgoff_t pgoff;
-	int pages;
+	int nr_pages;
 	int ret = 0;
+	int lock;
 
 	if (newflags == vma->vm_flags) {
 		*prev = vma;
@@ -48,24 +235,25 @@ static int mlock_fixup(struct vm_area_st
 	}
 
 success:
+	lock = !!(newflags & VM_LOCKED);
+
+	/*
+	 * Keep track of amount of locked VM.
+	 */
+	nr_pages = (end - start) >> PAGE_SHIFT;
+	if (!lock)
+		nr_pages = -nr_pages;
+	mm->locked_vm += nr_pages;
+
 	/*
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 * It's okay if try_to_unmap_one unmaps a page just after we
-	 * set VM_LOCKED, make_pages_present below will bring it back.
+	 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
 	 */
 	vma->vm_flags = newflags;
 
-	/*
-	 * Keep track of amount of locked VM.
-	 */
-	pages = (end - start) >> PAGE_SHIFT;
-	if (newflags & VM_LOCKED) {
-		pages = -pages;
-		if (!(newflags & VM_IO))
-			ret = make_pages_present(start, end);
-	}
+	__mlock_vma_pages_range(vma, start, end, lock);
 
-	mm->locked_vm -= pages;
 out:
 	if (ret == -ENOMEM)
 		ret = -EAGAIN;
Index: linux-2.6/mm/internal.h
===================================================================
--- linux-2.6.orig/mm/internal.h
+++ linux-2.6/mm/internal.h
@@ -36,6 +36,40 @@ static inline void __put_page(struct pag
 
 extern int isolate_lru_page(struct page *page);
 
+/*
+ * must be called with vma's mmap_sem held for read, and page locked.
+ */
+extern void mlock_vma_page(struct page *page);
+
+/*
+ * must be called with a new page (before being inserted into locked vma).
+ */
+extern void mlock_new_vma_page(struct page *page);
+
+extern void __mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end, int lock);
+
+/*
+ * mlock all pages in this vma range.
+ */
+static inline void mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	__mlock_vma_pages_range(vma, start, end, 1);
+}
+
+/*
+ * munlock pages.
+ */
+static inline void munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	__mlock_vma_pages_range(vma, start, end, 0);
+}
+
+extern void clear_page_mlock(struct page *page);
+
+
 extern void fastcall __init __free_pages_bootmem(struct page *page,
 						unsigned int order);
 
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -51,6 +51,8 @@
 
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 struct kmem_cache *anon_vma_cachep;
 
 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
@@ -301,6 +303,13 @@ static int page_referenced_one(struct pa
 	if (!pte)
 		goto out;
 
+	/*
+	 * Don't want to elevate referenced for mlocked, in order that it
+	 * progresses to try_to_unmap and is removed from the LRU
+	 */
+	if (vma->vm_flags & VM_LOCKED)
+		goto out_unmap;
+
 	if (ptep_clear_flush_young(vma, address, pte))
 		referenced++;
 
@@ -310,6 +319,7 @@ static int page_referenced_one(struct pa
 			rwsem_is_locked(&mm->mmap_sem))
 		referenced++;
 
+out_unmap:
 	(*mapcount)--;
 	pte_unmap_unlock(pte, ptl);
 out:
@@ -381,11 +391,6 @@ static int page_referenced_file(struct p
 	mapcount = page_mapcount(page);
 
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
-		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
-				  == (VM_LOCKED|VM_MAYSHARE)) {
-			referenced++;
-			break;
-		}
 		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
 			break;
@@ -631,10 +636,15 @@ static int try_to_unmap_one(struct page 
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young(vma, address, pte)))) {
-		ret = SWAP_FAIL;
-		goto out_unmap;
+	if (!migration) {
+		if (vma->vm_flags & VM_LOCKED) {
+			ret = SWAP_MLOCK;
+			goto out_unmap;
+		}
+		if (ptep_clear_flush_young(vma, address, pte)) {
+			ret = SWAP_FAIL;
+			goto out_unmap;
+		}
 	}
 
 	/* Nuke the page table entry. */
@@ -716,6 +726,9 @@ out:
  * For very sparsely populated VMAs this is a little inefficient - chances are
  * there there won't be many ptes located within the scan cluster.  In this case
  * maybe we could scan further - to the end of the pte page, perhaps.
+ *
+ * Mlocked pages also aren't handled very well at the moment: they aren't
+ * moved off the LRU like they are for linear pages.
  */
 #define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
 #define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
@@ -791,6 +804,7 @@ static int try_to_unmap_anon(struct page
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
+	unsigned int mlocked = 0;
 	int ret = SWAP_AGAIN;
 
 	anon_vma = page_lock_anon_vma(page);
@@ -801,8 +815,21 @@ static int try_to_unmap_anon(struct page
 		ret = try_to_unmap_one(page, vma, migration);
 		if (ret == SWAP_FAIL || !page_mapped(page))
 			break;
+		if (ret == SWAP_MLOCK) {
+			if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
+				if (vma->vm_flags & VM_LOCKED) {
+					mlock_vma_page(page);
+					mlocked++;
+				}
+				up_read(&vma->vm_mm->mmap_sem);
+			}
+		}
 	}
 	spin_unlock(&anon_vma->lock);
+	if (mlocked)
+		ret = SWAP_MLOCK;
+	else if (ret == SWAP_MLOCK)
+		ret = SWAP_AGAIN;
 	return ret;
 }
 
@@ -825,21 +852,33 @@ static int try_to_unmap_file(struct page
 	unsigned long cursor;
 	unsigned long max_nl_cursor = 0;
 	unsigned long max_nl_size = 0;
-	unsigned int mapcount;
+	unsigned int mapcount, mlocked = 0;
 
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		ret = try_to_unmap_one(page, vma, migration);
 		if (ret == SWAP_FAIL || !page_mapped(page))
 			goto out;
+		if (ret == SWAP_MLOCK) {
+			if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
+				if (vma->vm_flags & VM_LOCKED) {
+					mlock_vma_page(page);
+					mlocked++;
+				}
+				up_read(&vma->vm_mm->mmap_sem);
+			}
+		}
 	}
 
+	if (mlocked)
+		goto out;
+
 	if (list_empty(&mapping->i_mmap_nonlinear))
 		goto out;
 
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-		if ((vma->vm_flags & VM_LOCKED) && !migration)
+		if (!migration && (vma->vm_flags & VM_LOCKED))
 			continue;
 		cursor = (unsigned long) vma->vm_private_data;
 		if (cursor > max_nl_cursor)
@@ -873,8 +912,6 @@ static int try_to_unmap_file(struct page
 	do {
 		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-			if ((vma->vm_flags & VM_LOCKED) && !migration)
-				continue;
 			cursor = (unsigned long) vma->vm_private_data;
 			while ( cursor < max_nl_cursor &&
 				cursor < vma->vm_end - vma->vm_start) {
@@ -899,6 +936,10 @@ static int try_to_unmap_file(struct page
 		vma->vm_private_data = NULL;
 out:
 	spin_unlock(&mapping->i_mmap_lock);
+	if (mlocked)
+		ret = SWAP_MLOCK;
+	else if (ret == SWAP_MLOCK)
+		ret = SWAP_AGAIN;
 	return ret;
 }
 
@@ -924,8 +965,7 @@ int try_to_unmap(struct page *page, int 
 		ret = try_to_unmap_anon(page, migration);
 	else
 		ret = try_to_unmap_file(page, migration);
-
-	if (!page_mapped(page))
+	if (ret != SWAP_MLOCK && !page_mapped(page))
 		ret = SWAP_SUCCESS;
 	return ret;
 }
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -30,6 +30,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
 
+#include "internal.h"
+
 #ifndef arch_mmap_check
 #define arch_mmap_check(addr, len, flags)	(0)
 #endif
@@ -1145,7 +1147,7 @@ out:	
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += len >> PAGE_SHIFT;
-		make_pages_present(addr, addr + len);
+		mlock_vma_pages_range(vma, addr, addr + len);
 	}
 	if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
 		make_pages_present(addr, addr + len);
@@ -1684,6 +1686,9 @@ static void unmap_region(struct mm_struc
 	struct mmu_gather *tlb;
 	unsigned long nr_accounted = 0;
 
+	if (vma->vm_flags & VM_LOCKED)
+		munlock_vma_pages_range(vma, start, end);
+
 	lru_add_drain();
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
@@ -1958,7 +1963,7 @@ out:
 	mm->total_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED) {
 		mm->locked_vm += len >> PAGE_SHIFT;
-		make_pages_present(addr, addr + len);
+		mlock_vma_pages_range(vma, addr, addr + len);
 	}
 	return addr;
 }
@@ -1969,10 +1974,21 @@ EXPORT_SYMBOL(do_brk);
 void exit_mmap(struct mm_struct *mm)
 {
 	struct mmu_gather *tlb;
-	struct vm_area_struct *vma = mm->mmap;
+	struct vm_area_struct *vma;
 	unsigned long nr_accounted = 0;
 	unsigned long end;
 
+	if (mm->locked_vm) {
+		vma = mm->mmap;
+		while (vma) {
+			if (vma->vm_flags & VM_LOCKED)
+				munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
+			vma = vma->vm_next;
+		}
+	}
+
+	vma = mm->mmap;
+
 	lru_add_drain();
 	flush_cache_mm(mm);
 	tlb = tlb_gather_mmu(mm, 1);
Index: linux-2.6/mm/mremap.c
===================================================================
--- linux-2.6.orig/mm/mremap.c
+++ linux-2.6/mm/mremap.c
@@ -23,6 +23,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -232,8 +234,8 @@ static unsigned long move_vma(struct vm_
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += new_len >> PAGE_SHIFT;
 		if (new_len > old_len)
-			make_pages_present(new_addr + old_len,
-					   new_addr + new_len);
+			mlock_vma_pages_range(vma, new_addr + old_len,
+						   new_addr + new_len);
 	}
 
 	return new_addr;
@@ -369,7 +371,7 @@ unsigned long do_mremap(unsigned long ad
 			vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
 			if (vma->vm_flags & VM_LOCKED) {
 				mm->locked_vm += pages;
-				make_pages_present(addr + old_len,
+				mlock_vma_pages_range(vma, addr + old_len,
 						   addr + new_len);
 			}
 			ret = addr;
Index: linux-2.6/include/linux/rmap.h
===================================================================
--- linux-2.6.orig/include/linux/rmap.h
+++ linux-2.6/include/linux/rmap.h
@@ -134,5 +134,6 @@ static inline int page_mkclean(struct pa
 #define SWAP_SUCCESS	0
 #define SWAP_AGAIN	1
 #define SWAP_FAIL	2
+#define SWAP_MLOCK	3
 
 #endif	/* _LINUX_RMAP_H */
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -60,6 +60,8 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
+#include "internal.h"
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -1655,7 +1657,10 @@ gotten:
 		ptep_clear_flush(vma, address, page_table);
 		set_pte_at(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
-		lru_cache_add_active(new_page);
+		if (!(vma->vm_flags & VM_LOCKED))
+			lru_cache_add_active(new_page);
+		else
+			mlock_new_vma_page(new_page);
 		page_add_new_anon_rmap(new_page, vma, address);
 
 		/* Free the old page.. */
@@ -2119,6 +2124,49 @@ out_nomap:
 }
 
 /*
+ * This routine is used to map in an anonymous page into an address space:
+ * needed by execve() for the initial stack and environment pages.
+ *
+ * vma->vm_mm->mmap_sem must be held.
+ *
+ * Returns 0 on success, otherwise the failure code.
+ *
+ * The routine consumes the reference on the page if it is successful,
+ * otherwise the caller must free it.
+ */
+int install_new_anon_page(struct vm_area_struct *vma,
+			struct page *page, unsigned long address)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t * pte;
+	spinlock_t *ptl;
+
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
+
+	flush_dcache_page(page);
+	pte = get_locked_pte(mm, address, &ptl);
+	if (!pte)
+		return -ENOMEM;
+	if (!pte_none(*pte)) {
+		pte_unmap_unlock(pte, ptl);
+		return -EEXIST;
+	}
+	inc_mm_counter(mm, anon_rss);
+	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
+					page, vma->vm_page_prot))));
+	if (!(vma->vm_flags & VM_LOCKED))
+		lru_cache_add_active(page);
+	else
+		mlock_new_vma_page(page);
+	page_add_new_anon_rmap(page, vma, address);
+	pte_unmap_unlock(pte, ptl);
+
+	/* no need for flush_tlb */
+	return 0;
+}
+
+/*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2148,7 +2196,10 @@ static int do_anonymous_page(struct mm_s
 		if (!pte_none(*page_table))
 			goto release;
 		inc_mm_counter(mm, anon_rss);
-		lru_cache_add_active(page);
+		if (!(vma->vm_flags & VM_LOCKED))
+			lru_cache_add_active(page);
+		else
+			mlock_new_vma_page(page);
 		page_add_new_anon_rmap(page, vma, address);
 	} else {
 		/* Map the ZERO_PAGE - vm_page_prot is readonly */
@@ -2291,7 +2342,10 @@ static int __do_fault(struct mm_struct *
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
                         inc_mm_counter(mm, anon_rss);
-                        lru_cache_add_active(page);
+			if (!(vma->vm_flags & VM_LOCKED))
+				lru_cache_add_active(page);
+			else
+				mlock_new_vma_page(page);
                         page_add_new_anon_rmap(page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c
+++ linux-2.6/mm/vmscan.c
@@ -512,6 +512,8 @@ static unsigned long shrink_page_list(st
 				goto activate_locked;
 			case SWAP_AGAIN:
 				goto keep_locked;
+			case SWAP_MLOCK:
+				goto mlocked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
@@ -594,6 +596,9 @@ keep_locked:
 keep:
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON(PageLRU(page));
+		continue;
+mlocked:
+		unlock_page(page);
 	}
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -91,6 +91,7 @@
 #define PG_nosave_free		18	/* Used for system suspend/resume */
 #define PG_buddy		19	/* Page is free, on buddy lists */
 
+#define PG_mlock		20	/* Page has mlocked vmas */
 
 #if (BITS_PER_LONG > 32)
 /*
@@ -247,6 +248,10 @@ static inline void SetPageUptodate(struc
 #define PageSwapCache(page)	0
 #endif
 
+#define PageMLock(page)		test_bit(PG_mlock, &(page)->flags)
+#define SetPageMLock(page)	set_bit(PG_mlock, &(page)->flags)
+#define ClearPageMLock(page)	clear_bit(PG_mlock, &(page)->flags)
+
 #define PageUncached(page)	test_bit(PG_uncached, &(page)->flags)
 #define SetPageUncached(page)	set_bit(PG_uncached, &(page)->flags)
 #define ClearPageUncached(page)	clear_bit(PG_uncached, &(page)->flags)
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -203,7 +203,8 @@ static void bad_page(struct page *page)
 			1 << PG_slab    |
 			1 << PG_swapcache |
 			1 << PG_writeback |
-			1 << PG_buddy );
+			1 << PG_buddy |
+			1 << PG_mlock );
 	set_page_count(page, 0);
 	reset_page_mapcount(page);
 	page->mapping = NULL;
@@ -438,7 +439,8 @@ static inline int free_pages_check(struc
 			1 << PG_swapcache |
 			1 << PG_writeback |
 			1 << PG_reserved |
-			1 << PG_buddy ))))
+			1 << PG_buddy |
+			1 << PG_mlock ))))
 		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
@@ -588,7 +590,8 @@ static int prep_new_page(struct page *pa
 			1 << PG_swapcache |
 			1 << PG_writeback |
 			1 << PG_reserved |
-			1 << PG_buddy ))))
+			1 << PG_buddy |
+			1 << PG_mlock ))))
 		bad_page(page);
 
 	/*
Index: linux-2.6/fs/exec.c
===================================================================
--- linux-2.6.orig/fs/exec.c
+++ linux-2.6/fs/exec.c
@@ -297,44 +297,6 @@ int copy_strings_kernel(int argc,char **
 EXPORT_SYMBOL(copy_strings_kernel);
 
 #ifdef CONFIG_MMU
-/*
- * This routine is used to map in a page into an address space: needed by
- * execve() for the initial stack and environment pages.
- *
- * vma->vm_mm->mmap_sem is held for writing.
- */
-void install_arg_page(struct vm_area_struct *vma,
-			struct page *page, unsigned long address)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	pte_t * pte;
-	spinlock_t *ptl;
-
-	if (unlikely(anon_vma_prepare(vma)))
-		goto out;
-
-	flush_dcache_page(page);
-	pte = get_locked_pte(mm, address, &ptl);
-	if (!pte)
-		goto out;
-	if (!pte_none(*pte)) {
-		pte_unmap_unlock(pte, ptl);
-		goto out;
-	}
-	inc_mm_counter(mm, anon_rss);
-	lru_cache_add_active(page);
-	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
-					page, vma->vm_page_prot))));
-	page_add_new_anon_rmap(page, vma, address);
-	pte_unmap_unlock(pte, ptl);
-
-	/* no need for flush_tlb */
-	return;
-out:
-	__free_page(page);
-	force_sig(SIGKILL, current);
-}
-
 #define EXTRA_STACK_VM_PAGES	20	/* random */
 
 int setup_arg_pages(struct linux_binprm *bprm,
@@ -438,17 +400,25 @@ int setup_arg_pages(struct linux_binprm 
 		mm->stack_vm = mm->total_vm = vma_pages(mpnt);
 	}
 
+	ret = 0;
 	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
 		struct page *page = bprm->page[i];
 		if (page) {
 			bprm->page[i] = NULL;
-			install_arg_page(mpnt, page, stack_base);
+			if (!ret)
+				ret = install_new_anon_page(mpnt, page,
+								stack_base);
+			if (ret)
+				put_page(page);
 		}
 		stack_base += PAGE_SIZE;
 	}
 	up_write(&mm->mmap_sem);
-	
-	return 0;
+
+	if (ret)
+		do_munmap(mm, mpnt->vm_start, mpnt->vm_start - mpnt->vm_end);
+
+	return ret;
 }
 
 EXPORT_SYMBOL(setup_arg_pages);
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -791,7 +791,7 @@ static inline int handle_mm_fault(struct
 
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
-void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
+int install_new_anon_page(struct vm_area_struct *, struct page *, unsigned long);
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -2179,8 +2179,16 @@ generic_file_direct_IO(int rw, struct ki
 	 */
 	if (rw == WRITE) {
 		write_len = iov_length(iov, nr_segs);
-	       	if (mapping_mapped(mapping))
+	       	if (mapping_mapped(mapping)) {
+			/*
+			 * Calling unmap_mapping_range like this is wrong,
+			 * because it can lead to mlocked pages being
+			 * discarded (this is true even before the PageMLock
+			 * work). direct-IO vs pagecache is a load of junk
+			 * anyway, so who cares.
+			 */
 			unmap_mapping_range(mapping, offset, write_len, 0);
+		}
 	}
 
 	retval = filemap_write_and_wait(mapping);
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c
+++ linux-2.6/mm/truncate.c
@@ -16,6 +16,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
 				   do_invalidatepage */
+#include "internal.h"
 
 
 /**
@@ -99,6 +100,7 @@ truncate_complete_page(struct address_sp
 	if (PagePrivate(page))
 		do_invalidatepage(page, 0);
 
+	clear_page_mlock(page);
 	ClearPageUptodate(page);
 	ClearPageMappedToDisk(page);
 	remove_from_page_cache(page);
@@ -124,6 +126,7 @@ invalidate_complete_page(struct address_
 	if (PagePrivate(page) && !try_to_release_page(page, 0))
 		return 0;
 
+	clear_page_mlock(page);
 	ret = remove_mapping(mapping, page);
 
 	return ret;
@@ -342,6 +345,7 @@ invalidate_complete_page2(struct address
 	if (PageDirty(page))
 		goto failed;
 
+	clear_page_mlock(page);
 	BUG_ON(PagePrivate(page));
 	__remove_from_page_cache(page);
 	write_unlock_irq(&mapping->tree_lock);
Index: linux-2.6/mm/migrate.c
===================================================================
--- linux-2.6.orig/mm/migrate.c
+++ linux-2.6/mm/migrate.c
@@ -272,6 +272,8 @@ static int migrate_page_move_mapping(str
 		return 0;
 	}
 
+	clear_page_mlock(page);
+
 	write_lock_irq(&mapping->tree_lock);
 
 	pslot = radix_tree_lookup_slot(&mapping->page_tree,
@@ -775,6 +777,17 @@ static int do_move_pages(struct mm_struc
 				!migrate_all)
 			goto put_and_set;
 
+		/*
+		 * Just do the simple thing and put back mlocked pages onto
+		 * the LRU list so they can be taken off again (inefficient
+		 * but not a big deal).
+		 */
+		if (PageMLock(page)) {
+			lock_page(page);
+			clear_page_mlock(page);
+			unlock_page(page);
+		}
+
 		err = isolate_lru_page(page);
 		if (err) {
 put_and_set:
Index: linux-2.6/mm/mempolicy.c
===================================================================
--- linux-2.6.orig/mm/mempolicy.c
+++ linux-2.6/mm/mempolicy.c
@@ -89,6 +89,7 @@
 #include <linux/migrate.h>
 #include <linux/rmap.h>
 #include <linux/security.h>
+#include <linux/pagemap.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -224,7 +225,10 @@ static int check_pte_range(struct vm_are
 	pte_t *orig_pte;
 	pte_t *pte;
 	spinlock_t *ptl;
+	struct page *mlocked;
 
+resume:
+	mlocked = NULL;
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		struct page *page;
@@ -254,12 +258,24 @@ static int check_pte_range(struct vm_are
 
 		if (flags & MPOL_MF_STATS)
 			gather_stats(page, private, pte_dirty(*pte));
-		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+			if (PageMLock(page) && !mlocked) {
+				mlocked = page;
+				break;
+			}
 			migrate_page_add(page, private, flags);
-		else
+		} else
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
+
+	if (mlocked) {
+		lock_page(mlocked);
+		clear_page_mlock(mlocked);
+		unlock_page(mlocked);
+		goto resume;
+	}
+
 	return addr != end;
 }
 
@@ -372,6 +388,7 @@ check_range(struct mm_struct *mm, unsign
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
+
 			err = check_pgd_range(vma, start, endvma, nodes,
 						flags, private);
 			if (err) {
Index: linux-2.6/drivers/base/node.c
===================================================================
--- linux-2.6.orig/drivers/base/node.c
+++ linux-2.6/drivers/base/node.c
@@ -60,6 +60,7 @@ static ssize_t node_read_meminfo(struct 
 		       "Node %d FilePages:    %8lu kB\n"
 		       "Node %d Mapped:       %8lu kB\n"
 		       "Node %d AnonPages:    %8lu kB\n"
+		       "Node %d MLock:        %8lu kB\n"
 		       "Node %d PageTables:   %8lu kB\n"
 		       "Node %d NFS_Unstable: %8lu kB\n"
 		       "Node %d Bounce:       %8lu kB\n"
@@ -82,6 +83,7 @@ static ssize_t node_read_meminfo(struct 
 		       nid, K(node_page_state(nid, NR_FILE_PAGES)),
 		       nid, K(node_page_state(nid, NR_FILE_MAPPED)),
 		       nid, K(node_page_state(nid, NR_ANON_PAGES)),
+		       nid, K(node_page_state(nid, NR_MLOCK)),
 		       nid, K(node_page_state(nid, NR_PAGETABLE)),
 		       nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
 		       nid, K(node_page_state(nid, NR_BOUNCE)),
Index: linux-2.6/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.orig/fs/proc/proc_misc.c
+++ linux-2.6/fs/proc/proc_misc.c
@@ -166,6 +166,7 @@ static int meminfo_read_proc(char *page,
 		"Writeback:    %8lu kB\n"
 		"AnonPages:    %8lu kB\n"
 		"Mapped:       %8lu kB\n"
+		"MLock:        %8lu kB\n"
 		"Slab:         %8lu kB\n"
 		"SReclaimable: %8lu kB\n"
 		"SUnreclaim:   %8lu kB\n"
@@ -196,6 +197,7 @@ static int meminfo_read_proc(char *page,
 		K(global_page_state(NR_WRITEBACK)),
 		K(global_page_state(NR_ANON_PAGES)),
 		K(global_page_state(NR_FILE_MAPPED)),
+		K(global_page_state(NR_MLOCK)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE) +
 				global_page_state(NR_SLAB_UNRECLAIMABLE)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE)),
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h
+++ linux-2.6/include/linux/mmzone.h
@@ -54,6 +54,7 @@ enum zone_stat_item {
 	NR_ANON_PAGES,	/* Mapped anonymous pages */
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
 			   only modified from process context */
+	NR_MLOCK,	/* MLocked pages (conservative guess) */
 	NR_FILE_PAGES,
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-06  2:46             ` Christoph Lameter
  2007-03-06  2:50               ` Nick Piggin
@ 2007-03-06 15:59               ` Rik van Riel
  1 sibling, 0 replies; 19+ messages in thread
From: Rik van Riel @ 2007-03-06 15:59 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Christoph Lameter, Linux Memory Management List,
	Andrew Morton, Christoph Hellwig

Christoph Lameter wrote:
> On Tue, 6 Mar 2007, Nick Piggin wrote:
> 
>>> The above is a bit contradictory. Assuming they are taken off the LRU:
>>> How will they be returned to the LRU?
>> In what way is it contradictory? If they are mlocked, we put them on the
>> LRU when they get munlocked. If they are off the LRU due to a !swap condition,
>> then we put them back on the LRU by whatever mechanism that uses (eg. a
>> 3rd LRU list that we go through much more slowly...).
> 
> Ok how are we going to implement the 3rd LRU for non mlocked anonymous 
> pages if you use the lru for the refcounter field? Another page flags bit? 

I'm working on it, in-between my other duties.  A separate set
of LRU lists for file backed pages vs swap backed/anon pages.

I think I'm about halfway done with the patch now - I'm amazed
how much stuff changed in the VM since I got abducted to work
on Xen...

http://linux-mm.org/PageReplacementDesign

-- 
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-06 14:30                 ` Nick Piggin
@ 2007-03-06 18:30                   ` Christoph Lameter
  2007-03-07  3:07                     ` Nick Piggin
  2007-03-06 22:23                   ` Lee Schermerhorn
  1 sibling, 1 reply; 19+ messages in thread
From: Christoph Lameter @ 2007-03-06 18:30 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Lameter, Linux Memory Management List, Andrew Morton,
	Christoph Hellwig

On Tue, 6 Mar 2007, Nick Piggin wrote:

> +	struct mm_struct *mm = vma->vm_mm;
> +	unsigned long addr = start;
> +	struct page *pages[16]; /* 16 gives a reasonable batch */

Use a pagevec instead?


> +		/*
> +		 * get_user_pages makes pages present if we are
> +		 * setting mlock.
> +		 */
> +		ret = get_user_pages(current, mm, addr,
> +				min_t(int, nr_pages, ARRAY_SIZE(pages)),
> +				write, 0, pages, NULL);
> +		if (ret < 0)
> +			break;
> +		if (ret == 0) {
> +			/*
> +			 * We know the vma is there, so the only time
> +			 * we cannot get a single page should be an
> +			 * error (ret < 0) case.
> +			 */
> +			WARN_ON(1);
> +			ret = -EFAULT;
> +			break;
> +		}

... pages could be evicted here by reclaim?

> +
> +		for (i = 0; i < ret; i++) {
> +			struct page *page = pages[i];
> +			lock_page(page);
> +			if (lock) {
> +				/*
> +				 * Anonymous pages may have already been
> +				 * mlocked by get_user_pages->handle_mm_fault.
> +				 * Be conservative and don't count these:


> @@ -801,8 +815,21 @@ static int try_to_unmap_anon(struct page
>  		ret = try_to_unmap_one(page, vma, migration);
>  		if (ret == SWAP_FAIL || !page_mapped(page))
>  			break;
> +		if (ret == SWAP_MLOCK) {
> +			if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
> +				if (vma->vm_flags & VM_LOCKED) {
> +					mlock_vma_page(page);
> +					mlocked++;
> +				}
> +				up_read(&vma->vm_mm->mmap_sem);
> +			}
> +		}

Taking mmap_sem in try_to_unmap_one? It may already have been taken by 
page migration. Ok, trylock but still.

>  			goto out;
> +		if (ret == SWAP_MLOCK) {
> +			if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
> +				if (vma->vm_flags & VM_LOCKED) {
> +					mlock_vma_page(page);
> +					mlocked++;
> +				}
> +				up_read(&vma->vm_mm->mmap_sem);
> +			}


Well this piece of code seem to repeat itself. New function?

> @@ -2148,7 +2196,10 @@ static int do_anonymous_page(struct mm_s
>  		if (!pte_none(*page_table))
>  			goto release;
>  		inc_mm_counter(mm, anon_rss);
> -		lru_cache_add_active(page);
> +		if (!(vma->vm_flags & VM_LOCKED))
> +			lru_cache_add_active(page);
> +		else
> +			mlock_new_vma_page(page);
>  		page_add_new_anon_rmap(page, vma, address);
>  	} else {
>  		/* Map the ZERO_PAGE - vm_page_prot is readonly */
> @@ -2291,7 +2342,10 @@ static int __do_fault(struct mm_struct *
>  		set_pte_at(mm, address, page_table, entry);
>  		if (anon) {
>                          inc_mm_counter(mm, anon_rss);
> -                        lru_cache_add_active(page);
> +			if (!(vma->vm_flags & VM_LOCKED))
> +				lru_cache_add_active(page);
> +			else
> +				mlock_new_vma_page(page);
>                          page_add_new_anon_rmap(page, vma, address);
>  		} else {

Another repeating chunk of code?

> Index: linux-2.6/drivers/base/node.c
> ===================================================================
> --- linux-2.6.orig/drivers/base/node.c
> +++ linux-2.6/drivers/base/node.c
> @@ -60,6 +60,7 @@ static ssize_t node_read_meminfo(struct 
>  		       "Node %d FilePages:    %8lu kB\n"
>  		       "Node %d Mapped:       %8lu kB\n"
>  		       "Node %d AnonPages:    %8lu kB\n"
> +		       "Node %d MLock:        %8lu kB\n"

Upper case L in MLock? Should it not be Mlock from mlock with first letter 
capitalized?

> Index: linux-2.6/include/linux/mmzone.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mmzone.h
> +++ linux-2.6/include/linux/mmzone.h
> @@ -54,6 +54,7 @@ enum zone_stat_item {
>  	NR_ANON_PAGES,	/* Mapped anonymous pages */
>  	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
>  			   only modified from process context */
> +	NR_MLOCK,	/* MLocked pages (conservative guess) */

Discovered mlocked pages?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-06 14:30                 ` Nick Piggin
  2007-03-06 18:30                   ` Christoph Lameter
@ 2007-03-06 22:23                   ` Lee Schermerhorn
  2007-03-07  3:52                     ` Nick Piggin
  1 sibling, 1 reply; 19+ messages in thread
From: Lee Schermerhorn @ 2007-03-06 22:23 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Lameter, Christoph Lameter,
	Linux Memory Management List, Andrew Morton, Christoph Hellwig

On Tue, 2007-03-06 at 15:30 +0100, Nick Piggin wrote: 
> New core patch. This one is actually tested and works, and you can see
> the mlocked pages being accounted.
> 
> Same basic idea. Too many fixes and changes to list. Haven't taken up
> Christoph's idea to do a union in struct page, but it could be a followup.
> 
> Most importantly (aside from crashes and obvious bugs), it should correctly
> synchronise munlock vs vmscan lazy mlock now. Before this, it was possible
> to have pages leak. This took me a bit of thinking to get right, but was
> rather simple in the end.
> 
> Memory migration should work now, too, but not tested.
> 
> What do people think? Yes? No?

Nick:  I've grabbed your 2 patches in this series and rebased them to
21-rc2-mm2 so I can test them and compare with Christoph's [which I've
also rebased to -mm2].  I had to fix up the ia32_setup_arg_pages() for
ia64 to track the change you made to install_new_arg_page.  Patch
included below.  Some comments in-line below, as well.

Now builds, boots, and successfully builds a kernel with Christoph's
series.  Some basic testing with memtoy [see link below] shows pages
being locked according to the /proc/meminfo stats, but the counts don't
decrease when I unmap the segment nor when I exit the task.  I'll
investigate why and let you know how further testing goes.  After that,
I plan to merge both series with my page migration series and your page
cache replication patch to test the effects there.  Should be
"interesting".

If you're interested, I have a little tool/toy for testing mm stuff at:
http://free.linux.hp.com/~lts/Tools/memtoy-latest.tar.gz
I recently added a lock()/unlock() command for testing locking of memory
regions.  It could use more [a lot more] documentation, but there it
does have a README and internal help.

Lee


> 
> --
> 
> Index: linux-2.6/mm/mlock.c
> ===================================================================
> --- linux-2.6.orig/mm/mlock.c
> +++ linux-2.6/mm/mlock.c
> @@ -8,17 +8,204 @@
>  #include <linux/capability.h>
>  #include <linux/mman.h>
>  #include <linux/mm.h>
> +#include <linux/swap.h>
> +#include <linux/pagemap.h>
>  #include <linux/mempolicy.h>
>  #include <linux/syscalls.h>
>  
> +#include "internal.h"
> +
> +#define page_mlock_count(page)		(*(unsigned long *)&(page)->lru.next)
> +#define set_page_mlock_count(page, v)	(page_mlock_count(page) = (v))
> +#define inc_page_mlock_count(page)	(page_mlock_count(page)++)
> +#define dec_page_mlock_count(page)	(page_mlock_count(page)--)
> +
> +/*
> + * A page's mlock_count is kept in page->lru.next as an unsigned long.
> + * Access to this count is serialised with the page lock (or, in the
> + * case of mlock_page, virtue that there are no other references to
> + * the page).
> + *
> + * mlock counts are incremented at mlock, mmap, mremap, and new anon page
> + * faults, and lazily via vmscan. Decremented at munlock, munmap, and exit.
> + * mlock is not inherited across fork or exec, so we're safe there.
> + *
> + * If PageMLock is set, then the page is removed from the LRU list, and
> + * has its refcount incremented. This increment prevents the page from being
> + * freed until the mlock_count is decremented to zero and PageMLock is cleared.
> + *
> + * When lazy incrementing via vmscan, it is important to ensure that the
> + * vma's VM_LOCKED status is not concurrently being modified, otherwise we
> + * may have elevated mlock_count of a page that is being munlocked. So lazy
> + * mlocked must take the mmap_sem for read, and verify that the vma really
> + * is locked (see mm/rmap.c).
> + */
> +
> +/*
> + * Marks a page, belonging to the given mlocked vma, as mlocked.
> + *
> + * The page must be either locked or new, and must not be on the LRU.
> + */
> +static void __set_page_mlock(struct page *page)
> +{
> +	BUG_ON(PageLRU(page));
> +	BUG_ON(PageMLock(page));
> +	/* BUG_ON(!list_empty(&page->lru)); -- if we always did list_del_init */
> +
> +	SetPageMLock(page);
> +	get_page(page);
> +	inc_zone_page_state(page, NR_MLOCK);
> +	set_page_mlock_count(page, 1);
> +}
> +
> +static void __clear_page_mlock(struct page *page)
> +{
> +	BUG_ON(!PageMLock(page));
> +	BUG_ON(PageLRU(page));
> +	BUG_ON(page_mlock_count(page));
> +
> +	dec_zone_page_state(page, NR_MLOCK);
> +	ClearPageMLock(page);
> +	lru_cache_add_active(page);
> +	put_page(page);
> +}
> +
> +/*
> + * Zero the page's mlock_count. This can be useful in a situation where
> + * we want to unconditionally remove a page from the pagecache.
> + *
> + * It is not illegal to call this function for any page, mlocked or not.
Maybe "It is legal ..."  ???

> + * If called for a page that is still mapped by mlocked vmas, all we do
> + * is revert to lazy LRU behaviour -- semantics are not broken.
> + */
> +void clear_page_mlock(struct page *page)
> +{
> +	BUG_ON(!PageLocked(page));
> +
> +	if (likely(!PageMLock(page)))
> +		return;
> +	BUG_ON(!page_mlock_count(page));
> +	set_page_mlock_count(page, 0);
> +	__clear_page_mlock(page);
> +}
> +
<snip>

> Index: linux-2.6/include/linux/page-flags.h
> ===================================================================
> --- linux-2.6.orig/include/linux/page-flags.h
> +++ linux-2.6/include/linux/page-flags.h
> @@ -91,6 +91,7 @@
>  #define PG_nosave_free		18	/* Used for system suspend/resume */
>  #define PG_buddy		19	/* Page is free, on buddy lists */
>  
> +#define PG_mlock		20	/* Page has mlocked vmas */

Conflicts with PG_readahead in 21-rc2-mm2.  I temporarily used bit
30--valid only for 64-bit systems.  [Same in Christoph's series.]

>  
>  #if (BITS_PER_LONG > 32)
>  /*
> @@ -247,6 +248,10 @@ static inline void SetPageUptodate(struc
>  #define PageSwapCache(page)	0
>  #endif
>  
> +#define PageMLock(page)		test_bit(PG_mlock, &(page)->flags)
> +#define SetPageMLock(page)	set_bit(PG_mlock, &(page)->flags)
> +#define ClearPageMLock(page)	clear_bit(PG_mlock, &(page)->flags)
> +
>  #define PageUncached(page)	test_bit(PG_uncached, &(page)->flags)
>  #define SetPageUncached(page)	set_bit(PG_uncached, &(page)->flags)
>  #define ClearPageUncached(page)	clear_bit(PG_uncached, &(page)->flags)
> Index: linux-2.6/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.orig/mm/page_alloc.c
> +++ linux-2.6/mm/page_alloc.c
> @@ -203,7 +203,8 @@ static void bad_page(struct page *page)
>  			1 << PG_slab    |
>  			1 << PG_swapcache |
>  			1 << PG_writeback |
> -			1 << PG_buddy );
> +			1 << PG_buddy |
> +			1 << PG_mlock );
>  	set_page_count(page, 0);
>  	reset_page_mapcount(page);
>  	page->mapping = NULL;
> @@ -438,7 +439,8 @@ static inline int free_pages_check(struc
>  			1 << PG_swapcache |
>  			1 << PG_writeback |
>  			1 << PG_reserved |
> -			1 << PG_buddy ))))
> +			1 << PG_buddy |
> +			1 << PG_mlock ))))
>  		bad_page(page);
>  	if (PageDirty(page))
>  		__ClearPageDirty(page);
> @@ -588,7 +590,8 @@ static int prep_new_page(struct page *pa
>  			1 << PG_swapcache |
>  			1 << PG_writeback |
>  			1 << PG_reserved |
> -			1 << PG_buddy ))))
> +			1 << PG_buddy |
> +			1 << PG_mlock ))))
>  		bad_page(page);
>  
>  	/*
> Index: linux-2.6/fs/exec.c
> ===================================================================
> --- linux-2.6.orig/fs/exec.c
> +++ linux-2.6/fs/exec.c
> @@ -297,44 +297,6 @@ int copy_strings_kernel(int argc,char **
>  EXPORT_SYMBOL(copy_strings_kernel);
>  
>  #ifdef CONFIG_MMU
> -/*
> - * This routine is used to map in a page into an address space: needed by
> - * execve() for the initial stack and environment pages.
> - *
> - * vma->vm_mm->mmap_sem is held for writing.
> - */
> -void install_arg_page(struct vm_area_struct *vma,
> -			struct page *page, unsigned long address)
> -{
> -	struct mm_struct *mm = vma->vm_mm;
> -	pte_t * pte;
> -	spinlock_t *ptl;
> -
> -	if (unlikely(anon_vma_prepare(vma)))
> -		goto out;
> -
> -	flush_dcache_page(page);
> -	pte = get_locked_pte(mm, address, &ptl);
> -	if (!pte)
> -		goto out;
> -	if (!pte_none(*pte)) {
> -		pte_unmap_unlock(pte, ptl);
> -		goto out;
> -	}
> -	inc_mm_counter(mm, anon_rss);
> -	lru_cache_add_active(page);
> -	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
> -					page, vma->vm_page_prot))));
> -	page_add_new_anon_rmap(page, vma, address);
> -	pte_unmap_unlock(pte, ptl);
> -
> -	/* no need for flush_tlb */
> -	return;
> -out:
> -	__free_page(page);
> -	force_sig(SIGKILL, current);
> -}
> -
>  #define EXTRA_STACK_VM_PAGES	20	/* random */
>  
>  int setup_arg_pages(struct linux_binprm *bprm,
> @@ -438,17 +400,25 @@ int setup_arg_pages(struct linux_binprm 
>  		mm->stack_vm = mm->total_vm = vma_pages(mpnt);
>  	}
>  
> +	ret = 0;
>  	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
>  		struct page *page = bprm->page[i];
>  		if (page) {
>  			bprm->page[i] = NULL;
> -			install_arg_page(mpnt, page, stack_base);
> +			if (!ret)
> +				ret = install_new_anon_page(mpnt, page,
> +								stack_base);
> +			if (ret)
> +				put_page(page);

Need similar mod in arch/ia64/ia32/binfmt_elf32.c:ia32_setup_arg_pages()
Patch included below.

>  		}
>  		stack_base += PAGE_SIZE;
>  	}
>  	up_write(&mm->mmap_sem);
> -	
> -	return 0;
> +
> +	if (ret)
> +		do_munmap(mm, mpnt->vm_start, mpnt->vm_start - mpnt->vm_end);
> +
> +	return ret;
>  }
>  
>  EXPORT_SYMBOL(setup_arg_pages);

> Index: linux-2.6/mm/migrate.c
> ===================================================================
> --- linux-2.6.orig/mm/migrate.c
> +++ linux-2.6/mm/migrate.c
> @@ -272,6 +272,8 @@ static int migrate_page_move_mapping(str
>  		return 0;
>  	}
>  
> +	clear_page_mlock(page);
> +
>  	write_lock_irq(&mapping->tree_lock);
>  
>  	pslot = radix_tree_lookup_slot(&mapping->page_tree,
> @@ -775,6 +777,17 @@ static int do_move_pages(struct mm_struc
>  				!migrate_all)
>  			goto put_and_set;
>  
> +		/*
> +		 * Just do the simple thing and put back mlocked pages onto
> +		 * the LRU list so they can be taken off again (inefficient
> +		 * but not a big deal).
> +		 */
> +		if (PageMLock(page)) {
> +			lock_page(page);
> +			clear_page_mlock(page);
Note that this will put the page into the lru pagevec cache
[__clear_page_mlock() above] where isolate_lru_page(), called from
migrate_page_add(), is unlikely to find it.  do_move_pages() has already
called migrate_prep() to drain the lru caches so that it is more likely
to find the pages, as does check_range() when called to collect pages
for migration.  Yes, this is already racy--the target task or other
threads therein can fault additional pages into the lru cache after call
to migrate_prep().  But this almost guarantees we'll miss ~ the last
PAGEVEC_SIZE pages.

> +			unlock_page(page);
> +		}
> +
>  		err = isolate_lru_page(page);
>  		if (err) {
>  put_and_set:
> Index: linux-2.6/mm/mempolicy.c
> ===================================================================
> --- linux-2.6.orig/mm/mempolicy.c
> +++ linux-2.6/mm/mempolicy.c
> @@ -89,6 +89,7 @@
>  #include <linux/migrate.h>
>  #include <linux/rmap.h>
>  #include <linux/security.h>
> +#include <linux/pagemap.h>
>  
>  #include <asm/tlbflush.h>
>  #include <asm/uaccess.h>
> @@ -224,7 +225,10 @@ static int check_pte_range(struct vm_are
>  	pte_t *orig_pte;
>  	pte_t *pte;
>  	spinlock_t *ptl;
> +	struct page *mlocked;
>  
> +resume:
> +	mlocked = NULL;
>  	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>  	do {
>  		struct page *page;
> @@ -254,12 +258,24 @@ static int check_pte_range(struct vm_are
>  
>  		if (flags & MPOL_MF_STATS)
>  			gather_stats(page, private, pte_dirty(*pte));
> -		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
> +		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
> +			if (PageMLock(page) && !mlocked) {
> +				mlocked = page;
> +				break;
> +			}
>  			migrate_page_add(page, private, flags);
> -		else
> +		} else
>  			break;
>  	} while (pte++, addr += PAGE_SIZE, addr != end);
>  	pte_unmap_unlock(orig_pte, ptl);
> +
> +	if (mlocked) {
> +		lock_page(mlocked);
> +		clear_page_mlock(mlocked);

Same comment as for do_move_pages() above.

> +		unlock_page(mlocked);
> +		goto resume;
> +	}
> +
>  	return addr != end;
>  }
>  
> @@ -372,6 +388,7 @@ check_range(struct mm_struct *mm, unsign
>  				endvma = end;
>  			if (vma->vm_start > start)
>  				start = vma->vm_start;
> +
>  			err = check_pgd_range(vma, start, endvma, nodes,
>  						flags, private);
>  			if (err) {

Here's the patch mentioned above:

Need to replace call to install_arg_page() in ia64's
ia32 version of setup_arg_pages() to build 21-rc2-mm2
with Nick's "mlocked pages off LRU" patch on ia64. 

Signed-off-by:  Lee Schermerhorn <lee.schermerhorn@hp.com>

 arch/ia64/ia32/binfmt_elf32.c |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

Index: Linux/arch/ia64/ia32/binfmt_elf32.c
===================================================================
--- Linux.orig/arch/ia64/ia32/binfmt_elf32.c	2007-03-06 12:16:33.000000000 -0500
+++ Linux/arch/ia64/ia32/binfmt_elf32.c	2007-03-06 15:19:02.000000000 -0500
@@ -240,7 +240,11 @@ ia32_setup_arg_pages (struct linux_binpr
 		struct page *page = bprm->page[i];
 		if (page) {
 			bprm->page[i] = NULL;
-			install_arg_page(mpnt, page, stack_base);
+			if (!ret)
+				ret = install_new_anon_page(mpnt, page,
+								stack_base);
+			if (ret)
+				put_page(page);
 		}
 		stack_base += PAGE_SIZE;
 	}


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-06 18:30                   ` Christoph Lameter
@ 2007-03-07  3:07                     ` Nick Piggin
  0 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-07  3:07 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Christoph Lameter, Linux Memory Management List, Andrew Morton,
	Christoph Hellwig

On Tue, Mar 06, 2007 at 10:30:43AM -0800, Christoph Lameter wrote:
> On Tue, 6 Mar 2007, Nick Piggin wrote:
> 
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	unsigned long addr = start;
> > +	struct page *pages[16]; /* 16 gives a reasonable batch */
> 
> Use a pagevec instead?

That's annoying because get_user_pages doesn't do pagevec_add, so
you have to do everything the same, except manually fish out the pages
array from the pagevec every use.

> > +		/*
> > +		 * get_user_pages makes pages present if we are
> > +		 * setting mlock.
> > +		 */
> > +		ret = get_user_pages(current, mm, addr,
> > +				min_t(int, nr_pages, ARRAY_SIZE(pages)),
> > +				write, 0, pages, NULL);
> > +		if (ret < 0)
> > +			break;
> > +		if (ret == 0) {
> > +			/*
> > +			 * We know the vma is there, so the only time
> > +			 * we cannot get a single page should be an
> > +			 * error (ret < 0) case.
> > +			 */
> > +			WARN_ON(1);
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> 
> ... pages could be evicted here by reclaim?

No, get_user_pages elevates refcount.

> > +
> > +		for (i = 0; i < ret; i++) {
> > +			struct page *page = pages[i];
> > +			lock_page(page);
> > +			if (lock) {
> > +				/*
> > +				 * Anonymous pages may have already been
> > +				 * mlocked by get_user_pages->handle_mm_fault.
> > +				 * Be conservative and don't count these:
> 
> 
> > @@ -801,8 +815,21 @@ static int try_to_unmap_anon(struct page
> >  		ret = try_to_unmap_one(page, vma, migration);
> >  		if (ret == SWAP_FAIL || !page_mapped(page))
> >  			break;
> > +		if (ret == SWAP_MLOCK) {
> > +			if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
> > +				if (vma->vm_flags & VM_LOCKED) {
> > +					mlock_vma_page(page);
> > +					mlocked++;
> > +				}
> > +				up_read(&vma->vm_mm->mmap_sem);
> > +			}
> > +		}
> 
> Taking mmap_sem in try_to_unmap_one? It may already have been taken by 
> page migration. Ok, trylock but still.

Migration path won't get SWAP_MLOCK, though.
 
We still have to trylock because we're inside i_mmap_lock or anon_vma_lock.
It's not pretty, but it is very fortunate that this should be an uncommon
path and it is no problem if it fails sometimes.

I believe your patches have this race (ie. vmscan lazily mlocking the
page after the last vma has just been munlocked). But if not,I would like
to know how you dealt with it.

> >  			goto out;
> > +		if (ret == SWAP_MLOCK) {
> > +			if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
> > +				if (vma->vm_flags & VM_LOCKED) {
> > +					mlock_vma_page(page);
> > +					mlocked++;
> > +				}
> > +				up_read(&vma->vm_mm->mmap_sem);
> > +			}
> 
> 
> Well this piece of code seem to repeat itself. New function?

I was thinking about putting it in mlock.c...

> > @@ -2148,7 +2196,10 @@ static int do_anonymous_page(struct mm_s
> >  		if (!pte_none(*page_table))
> >  			goto release;
> >  		inc_mm_counter(mm, anon_rss);
> > -		lru_cache_add_active(page);
> > +		if (!(vma->vm_flags & VM_LOCKED))
> > +			lru_cache_add_active(page);
> > +		else
> > +			mlock_new_vma_page(page);
> >  		page_add_new_anon_rmap(page, vma, address);
> >  	} else {
> >  		/* Map the ZERO_PAGE - vm_page_prot is readonly */
> > @@ -2291,7 +2342,10 @@ static int __do_fault(struct mm_struct *
> >  		set_pte_at(mm, address, page_table, entry);
> >  		if (anon) {
> >                          inc_mm_counter(mm, anon_rss);
> > -                        lru_cache_add_active(page);
> > +			if (!(vma->vm_flags & VM_LOCKED))
> > +				lru_cache_add_active(page);
> > +			else
> > +				mlock_new_vma_page(page);
> >                          page_add_new_anon_rmap(page, vma, address);
> >  		} else {
> 
> Another repeating chunk of code?

Well there are a few of those in mm/memory.c already. Consolidation
doesn't belong in this patch.

> 
> > Index: linux-2.6/drivers/base/node.c
> > ===================================================================
> > --- linux-2.6.orig/drivers/base/node.c
> > +++ linux-2.6/drivers/base/node.c
> > @@ -60,6 +60,7 @@ static ssize_t node_read_meminfo(struct 
> >  		       "Node %d FilePages:    %8lu kB\n"
> >  		       "Node %d Mapped:       %8lu kB\n"
> >  		       "Node %d AnonPages:    %8lu kB\n"
> > +		       "Node %d MLock:        %8lu kB\n"
> 
> Upper case L in MLock? Should it not be Mlock from mlock with first letter 
> capitalized?

I didn't really think about it much. We use upper case for contractions
as well (eg. MemFree, Committed_AS). I was thinking M stands for memory,
but you could argue that it is just a single word, named after the syscall
... but then what about mlockall? ;)

Shal I just rename to Locked?

> > Index: linux-2.6/include/linux/mmzone.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/mmzone.h
> > +++ linux-2.6/include/linux/mmzone.h
> > @@ -54,6 +54,7 @@ enum zone_stat_item {
> >  	NR_ANON_PAGES,	/* Mapped anonymous pages */
> >  	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
> >  			   only modified from process context */
> > +	NR_MLOCK,	/* MLocked pages (conservative guess) */
> 
> Discovered mlocked pages?

Yeah.

Thanks!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [rfc][patch 2/2] mm: mlocked pages off LRU
  2007-03-06 22:23                   ` Lee Schermerhorn
@ 2007-03-07  3:52                     ` Nick Piggin
  0 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-07  3:52 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: Christoph Lameter, Christoph Lameter,
	Linux Memory Management List, Andrew Morton, Christoph Hellwig

On Tue, Mar 06, 2007 at 05:23:55PM -0500, Lee Schermerhorn wrote:
> On Tue, 2007-03-06 at 15:30 +0100, Nick Piggin wrote: 
> > New core patch. This one is actually tested and works, and you can see
> > the mlocked pages being accounted.
> > 
> > Same basic idea. Too many fixes and changes to list. Haven't taken up
> > Christoph's idea to do a union in struct page, but it could be a followup.
> > 
> > Most importantly (aside from crashes and obvious bugs), it should correctly
> > synchronise munlock vs vmscan lazy mlock now. Before this, it was possible
> > to have pages leak. This took me a bit of thinking to get right, but was
> > rather simple in the end.
> > 
> > Memory migration should work now, too, but not tested.
> > 
> > What do people think? Yes? No?
> 
> Nick:  I've grabbed your 2 patches in this series and rebased them to
> 21-rc2-mm2 so I can test them and compare with Christoph's [which I've
> also rebased to -mm2].  I had to fix up the ia32_setup_arg_pages() for
> ia64 to track the change you made to install_new_arg_page.  Patch
> included below.  Some comments in-line below, as well.

Thanks Lee!

> Now builds, boots, and successfully builds a kernel with Christoph's
> series.  Some basic testing with memtoy [see link below] shows pages
> being locked according to the /proc/meminfo stats, but the counts don't
> decrease when I unmap the segment nor when I exit the task.  I'll
> investigate why and let you know how further testing goes.  After that,

OK, It works here (not memtoy, but a simple mlock/munlock program), so
that's interesting if you can work it out.

> > +/*
> > + * Zero the page's mlock_count. This can be useful in a situation where
> > + * we want to unconditionally remove a page from the pagecache.
> > + *
> > + * It is not illegal to call this function for any page, mlocked or not.
> Maybe "It is legal ..."  ???

Yeah ;)

> > Index: linux-2.6/include/linux/page-flags.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/page-flags.h
> > +++ linux-2.6/include/linux/page-flags.h
> > @@ -91,6 +91,7 @@
> >  #define PG_nosave_free		18	/* Used for system suspend/resume */
> >  #define PG_buddy		19	/* Page is free, on buddy lists */
> >  
> > +#define PG_mlock		20	/* Page has mlocked vmas */
> 
> Conflicts with PG_readahead in 21-rc2-mm2.  I temporarily used bit
> 30--valid only for 64-bit systems.  [Same in Christoph's series.]

OK, I'll sort that out when it gets more merge worthy.

> > @@ -438,17 +400,25 @@ int setup_arg_pages(struct linux_binprm 
> >  		mm->stack_vm = mm->total_vm = vma_pages(mpnt);
> >  	}
> >  
> > +	ret = 0;
> >  	for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
> >  		struct page *page = bprm->page[i];
> >  		if (page) {
> >  			bprm->page[i] = NULL;
> > -			install_arg_page(mpnt, page, stack_base);
> > +			if (!ret)
> > +				ret = install_new_anon_page(mpnt, page,
> > +								stack_base);
> > +			if (ret)
> > +				put_page(page);
> 
> Need similar mod in arch/ia64/ia32/binfmt_elf32.c:ia32_setup_arg_pages()
> Patch included below.

Thanks. I need to split out the install_arg_page change too.

> > @@ -272,6 +272,8 @@ static int migrate_page_move_mapping(str
> >  		return 0;
> >  	}
> >  
> > +	clear_page_mlock(page);
> > +
> >  	write_lock_irq(&mapping->tree_lock);
> >  
> >  	pslot = radix_tree_lookup_slot(&mapping->page_tree,
> > @@ -775,6 +777,17 @@ static int do_move_pages(struct mm_struc
> >  				!migrate_all)
> >  			goto put_and_set;
> >  
> > +		/*
> > +		 * Just do the simple thing and put back mlocked pages onto
> > +		 * the LRU list so they can be taken off again (inefficient
> > +		 * but not a big deal).
> > +		 */
> > +		if (PageMLock(page)) {
> > +			lock_page(page);
> > +			clear_page_mlock(page);
> Note that this will put the page into the lru pagevec cache
> [__clear_page_mlock() above] where isolate_lru_page(), called from
> migrate_page_add(), is unlikely to find it.  do_move_pages() has already
> called migrate_prep() to drain the lru caches so that it is more likely
> to find the pages, as does check_range() when called to collect pages
> for migration.  Yes, this is already racy--the target task or other
> threads therein can fault additional pages into the lru cache after call
> to migrate_prep().  But this almost guarantees we'll miss ~ the last
> PAGEVEC_SIZE pages.

Yeah I realised this :P I guess we could do another flush if the page
was mlocked?

> > @@ -254,12 +258,24 @@ static int check_pte_range(struct vm_are
> >  
> >  		if (flags & MPOL_MF_STATS)
> >  			gather_stats(page, private, pte_dirty(*pte));
> > -		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
> > +		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
> > +			if (PageMLock(page) && !mlocked) {
> > +				mlocked = page;
> > +				break;
> > +			}
> >  			migrate_page_add(page, private, flags);
> > -		else
> > +		} else
> >  			break;
> >  	} while (pte++, addr += PAGE_SIZE, addr != end);
> >  	pte_unmap_unlock(orig_pte, ptl);
> > +
> > +	if (mlocked) {
> > +		lock_page(mlocked);
> > +		clear_page_mlock(mlocked);
> 
> Same comment as for do_move_pages() above.

Yeah, thanks. I should really also be using a pagevec for these guys,
so that we don't have to break out of the loop so frequently. Not that
I was optimising for mlocked pages, but this loop sucks, as is ;)

> Here's the patch mentioned above:
> 
> Need to replace call to install_arg_page() in ia64's
> ia32 version of setup_arg_pages() to build 21-rc2-mm2
> with Nick's "mlocked pages off LRU" patch on ia64. 
> 
> Signed-off-by:  Lee Schermerhorn <lee.schermerhorn@hp.com>

Cheers, thanks.

> 
>  arch/ia64/ia32/binfmt_elf32.c |    6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> Index: Linux/arch/ia64/ia32/binfmt_elf32.c
> ===================================================================
> --- Linux.orig/arch/ia64/ia32/binfmt_elf32.c	2007-03-06 12:16:33.000000000 -0500
> +++ Linux/arch/ia64/ia32/binfmt_elf32.c	2007-03-06 15:19:02.000000000 -0500
> @@ -240,7 +240,11 @@ ia32_setup_arg_pages (struct linux_binpr
>  		struct page *page = bprm->page[i];
>  		if (page) {
>  			bprm->page[i] = NULL;
> -			install_arg_page(mpnt, page, stack_base);
> +			if (!ret)
> +				ret = install_new_anon_page(mpnt, page,
> +								stack_base);
> +			if (ret)
> +				put_page(page);
>  		}
>  		stack_base += PAGE_SIZE;
>  	}
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2007-03-07  3:52 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-05 16:17 [rfc][patch 2/2] mm: mlocked pages off LRU Nick Piggin
2007-03-05 16:40 ` Nick Piggin
2007-03-05 17:12 ` Christoph Hellwig
2007-03-05 18:17   ` Christoph Lameter
2007-03-05 18:14 ` Christoph Lameter
2007-03-05 19:26   ` Rik van Riel
2007-03-06  1:05   ` Nick Piggin
2007-03-06  1:27     ` Christoph Lameter
2007-03-06  1:44       ` Nick Piggin
2007-03-06  1:55         ` Christoph Lameter
2007-03-06  2:13           ` Nick Piggin
2007-03-06  2:46             ` Christoph Lameter
2007-03-06  2:50               ` Nick Piggin
2007-03-06 14:30                 ` Nick Piggin
2007-03-06 18:30                   ` Christoph Lameter
2007-03-07  3:07                     ` Nick Piggin
2007-03-06 22:23                   ` Lee Schermerhorn
2007-03-07  3:52                     ` Nick Piggin
2007-03-06 15:59               ` Rik van Riel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox