From: Nick Piggin <npiggin@suse.de>
To: Christoph Lameter <clameter@engr.sgi.com>
Cc: Nick Piggin <npiggin@suse.de>, Andrew Morton <akpm@osdl.org>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	Linux Memory Management List <linux-mm@kvack.org>
Subject: Re: Race in new page migration code?
Date: Sun, 15 Jan 2006 07:58:23 +0100
Message-ID: <20060115065822.GA11441@wotan.suse.de>
In-Reply-To: <Pine.LNX.4.62.0601141040400.11601@schroedinger.engr.sgi.com>

On Sat, Jan 14, 2006 at 10:58:25AM -0800, Christoph Lameter wrote:
> On Sat, 14 Jan 2006, Nick Piggin wrote:
> 
> > > We take that reference count on the page:
> > Yes, after you have dropped all your claims to pin this page
> > (i.e. pte lock). You really can't take a refcount on a page that
> 
> Oh. Now I see. I screwed that up with a fix I added... We cannot drop the
> ptl here. So back to the way it was before: remove the draining from
> isolate_lru_page and do it before scanning for pages, so that we do not
> have to drop the ptl.
> 

OK, if you prefer doing it with the pte lock held, how's this patch?

--
Migration code currently does not properly take a reference to the target
page: between unlocking the pte and trying to take a new reference to the
page with isolate_lru_page, anything could happen to it.

Fix this by holding the pte lock until we get a chance to elevate the
refcount.

Other small cleanups while we're here.
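
To illustrate (a rough sketch, not the exact code), the racy window in
check_pte_range() was:

	spin_unlock(ptl);			/* page no longer pinned */
	migrate_page_add(vma, page, ...);	/* -> isolate_lru_page(): the page
						 * may be freed or reused before we
						 * can elevate its refcount */
	spin_lock(ptl);

With this patch, migrate_page_add() runs with the ptl still held, and
isolate_lru_page() takes the reference under zone->lru_lock before putting
the page on the private list.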

Signed-off-by: Nick Piggin <npiggin@suse.de>

Index: linux-2.6/mm/mempolicy.c
===================================================================
--- linux-2.6.orig/mm/mempolicy.c
+++ linux-2.6/mm/mempolicy.c
@@ -216,11 +216,8 @@ static int check_pte_range(struct vm_are
 
 		if (flags & MPOL_MF_STATS)
 			gather_stats(page, private);
-		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-			spin_unlock(ptl);
+		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 			migrate_page_add(vma, page, private, flags);
-			spin_lock(ptl);
-		}
 		else
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -309,6 +306,10 @@ check_range(struct mm_struct *mm, unsign
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
+	/* Clear the LRU lists so pages can be isolated */
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_add_drain_all();
+
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
@@ -555,15 +556,8 @@ static void migrate_page_add(struct vm_a
 	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
 	    mapping_writably_mapped(page->mapping) ||
 	    single_mm_mapping(vma->vm_mm, page->mapping)) {
-		int rc = isolate_lru_page(page);
-
-		if (rc == 1)
+		if (isolate_lru_page(page))
 			list_add(&page->lru, pagelist);
-		/*
-		 * If the isolate attempt was not successful then we just
-		 * encountered an unswappable page. Something must be wrong.
-	 	 */
-		WARN_ON(rc == 0);
 	}
 }
 
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c
+++ linux-2.6/mm/vmscan.c
@@ -586,7 +586,7 @@ static inline void move_to_lru(struct pa
 }
 
 /*
- * Add isolated pages on the list back to the LRU
+ * Add isolated pages on the list back to the LRU.
  *
  * returns the number of pages put back.
  */
@@ -760,46 +760,33 @@ next:
 	return nr_failed + retry;
 }
 
-static void lru_add_drain_per_cpu(void *dummy)
-{
-	lru_add_drain();
-}
-
 /*
  * Isolate one page from the LRU lists and put it on the
- * indicated list. Do necessary cache draining if the
- * page is not on the LRU lists yet.
+ * indicated list with elevated refcount.
  *
  * Result:
  *  0 = page not on LRU list
  *  1 = page removed from LRU list and added to the specified list.
- * -ENOENT = page is being freed elsewhere.
  */
 int isolate_lru_page(struct page *page)
 {
-	int rc = 0;
-	struct zone *zone = page_zone(page);
+	int ret = 0;
 
-redo:
-	spin_lock_irq(&zone->lru_lock);
-	rc = __isolate_lru_page(page);
-	if (rc == 1) {
-		if (PageActive(page))
-			del_page_from_active_list(zone, page);
-		else
-			del_page_from_inactive_list(zone, page);
-	}
-	spin_unlock_irq(&zone->lru_lock);
-	if (rc == 0) {
-		/*
-		 * Maybe this page is still waiting for a cpu to drain it
-		 * from one of the lru lists?
-		 */
-		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
-		if (rc == 0 && PageLRU(page))
-			goto redo;
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+		spin_lock_irq(&zone->lru_lock);
+		if (TestClearPageLRU(page)) {
+			ret = 1;
+			get_page(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
 	}
-	return rc;
+
+	return ret;
 }
 #endif
 
@@ -831,18 +818,20 @@ static int isolate_lru_pages(int nr_to_s
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		switch (__isolate_lru_page(page)) {
-		case 1:
-			/* Succeeded to isolate page */
-			list_move(&page->lru, dst);
-			nr_taken++;
-			break;
-		case -ENOENT:
-			/* Not possible to isolate */
-			list_move(&page->lru, src);
-			break;
-		default:
+		if (!TestClearPageLRU(page))
 			BUG();
+		list_del(&page->lru);
+		if (get_page_testone(page)) {
+			/*
+			 * It is being freed elsewhere
+			 */
+			__put_page(page);
+			SetPageLRU(page);
+			list_add(&page->lru, src);
+			continue;
+		} else {
+			list_add(&page->lru, dst);
+			nr_taken++;
 		}
 	}
 
Index: linux-2.6/include/linux/mm_inline.h
===================================================================
--- linux-2.6.orig/include/linux/mm_inline.h
+++ linux-2.6/include/linux/mm_inline.h
@@ -39,24 +39,3 @@ del_page_from_lru(struct zone *zone, str
 	}
 }
 
-/*
- * Isolate one page from the LRU lists.
- *
- * - zone->lru_lock must be held
- */
-static inline int __isolate_lru_page(struct page *page)
-{
-	if (unlikely(!TestClearPageLRU(page)))
-		return 0;
-
-	if (get_page_testone(page)) {
-		/*
-		 * It is being freed elsewhere
-		 */
-		__put_page(page);
-		SetPageLRU(page);
-		return -ENOENT;
-	}
-
-	return 1;
-}
Index: linux-2.6/include/linux/swap.h
===================================================================
--- linux-2.6.orig/include/linux/swap.h
+++ linux-2.6/include/linux/swap.h
@@ -167,6 +167,7 @@ extern void FASTCALL(lru_cache_add_activ
 extern void FASTCALL(activate_page(struct page *));
 extern void FASTCALL(mark_page_accessed(struct page *));
 extern void lru_add_drain(void);
+extern int lru_add_drain_all(void);
 extern int rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c
+++ linux-2.6/mm/swap.c
@@ -174,6 +174,24 @@ void lru_add_drain(void)
 	put_cpu();
 }
 
+static void lru_add_drain_per_cpu(void *dummy)
+{
+	lru_add_drain();
+}
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+#ifdef CONFIG_SMP
+	return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+#else
+	lru_add_drain();
+	return 0;
+#endif
+}
+
 /*
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs.  But it gets used by networking.
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -94,6 +94,7 @@ generic_file_direct_IO(int rw, struct ki
  *    ->private_lock		(try_to_unmap_one)
  *    ->tree_lock		(try_to_unmap_one)
  *    ->zone.lru_lock		(follow_page->mark_page_accessed)
+ *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
  *    ->private_lock		(page_remove_rmap->set_page_dirty)
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode_lock		(page_remove_rmap->set_page_dirty)
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -33,7 +33,7 @@
  *     mapping->i_mmap_lock
  *       anon_vma->lock
  *         mm->page_table_lock or pte_lock
- *           zone->lru_lock (in mark_page_accessed)
+ *           zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  *           swap_lock (in swap_duplicate, swap_info_get)
  *             mmlist_lock (in mmput, drain_mmlist and others)
  *             mapping->private_lock (in __set_page_dirty_buffers)
