linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Nick Piggin <nickpiggin@yahoo.com.au>
To: Robin Holt <holt@sgi.com>, Andrew Morton <akpm@osdl.org>,
	Linus Torvalds <torvalds@osdl.org>
Cc: Roland McGrath <roland@redhat.com>,
	Hugh Dickins <hugh@veritas.com>,
	linux-mm@kvack.org, linux-kernel <linux-kernel@vger.kernel.org>
Subject: [patch 2.6.13-rc4] fix get_user_pages bug
Date: Mon, 01 Aug 2005 18:21:22 +1000	[thread overview]
Message-ID: <42EDDB82.1040900@yahoo.com.au> (raw)
In-Reply-To: <20050801032258.A465C180EC0@magilla.sf.frob.com>

[-- Attachment #1: Type: text/plain, Size: 500 bytes --]

Hi,

Not sure if this should be fixed for 2.6.13. It can result in
pagecache corruption: so I guess that answers my own question.

This was tested by Robin and appears to solve the problem. Roland
had a quick look and thought the basic idea was sound. I'd like to
get a couple more acks before going forward, and in particular
Robin was contemplating possible efficiency improvements (although
efficiency can wait on correctness).

Feedback please, anyone.

Thanks,
Nick

-- 
SUSE Labs, Novell Inc.


[-- Attachment #2: mm-gup-fix.patch --]
[-- Type: text/plain, Size: 8794 bytes --]

When get_user_pages for write access races with another process in the page
fault handler that has already established the pte for read access,
handle_mm_fault in get_user_pages will return VM_FAULT_MINOR even though it
has not made the page writeable (for example, it has not broken COW).

Thus the assumption that get_user_pages has a writeable page at the mapping
after handle_mm_fault returns is incorrect. Fix this by reporting a raced
(uncompleted) fault with a new VM_FAULT_RACE return code, and by retrying the
lookup and fault in get_user_pages before making the assumption that we have
a writeable page.

Great work by Robin Holt <holt@sgi.com> to debug the problem.

Signed-off-by: Nick Piggin <npiggin@suse.de>

Index: linux-2.6/arch/i386/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/i386/mm/fault.c
+++ linux-2.6/arch/i386/mm/fault.c
@@ -351,6 +351,8 @@ good_area:
 			goto do_sigbus;
 		case VM_FAULT_OOM:
 			goto out_of_memory;
+		case VM_FAULT_RACE:
+			break;
 		default:
 			BUG();
 	}
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -625,6 +625,7 @@ static inline int page_mapped(struct pag
  * Used to decide whether a process gets delivered SIGBUS or
  * just gets major/minor fault counters bumped up.
  */
+#define VM_FAULT_RACE	(-2)
 #define VM_FAULT_OOM	(-1)
 #define VM_FAULT_SIGBUS	0
 #define VM_FAULT_MINOR	1
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -969,6 +969,16 @@ int get_user_pages(struct task_struct *t
 					return i ? i : -EFAULT;
 				case VM_FAULT_OOM:
 					return i ? i : -ENOMEM;
+				case VM_FAULT_RACE:
+					/*
+					 * Someone else got there first.
+					 * Must retry before we can assume
+					 * that we have actually performed
+					 * the write fault (below).
+					 */
+					if (write)
+						continue;
+					break;
 				default:
 					BUG();
 				}
@@ -1229,6 +1239,7 @@ static int do_wp_page(struct mm_struct *
 	struct page *old_page, *new_page;
 	unsigned long pfn = pte_pfn(pte);
 	pte_t entry;
+	int ret;
 
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
@@ -1285,7 +1296,9 @@ static int do_wp_page(struct mm_struct *
 	 */
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
+	ret = VM_FAULT_RACE;
 	if (likely(pte_same(*page_table, pte))) {
+		ret = VM_FAULT_MINOR;
 		if (PageAnon(old_page))
 			dec_mm_counter(mm, anon_rss);
 		if (PageReserved(old_page))
@@ -1304,7 +1317,7 @@ static int do_wp_page(struct mm_struct *
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 	spin_unlock(&mm->page_table_lock);
-	return VM_FAULT_MINOR;
+	return ret;
 
 no_new_page:
 	page_cache_release(old_page);
@@ -1659,7 +1672,7 @@ static int do_swap_page(struct mm_struct
 			if (likely(pte_same(*page_table, orig_pte)))
 				ret = VM_FAULT_OOM;
 			else
-				ret = VM_FAULT_MINOR;
+				ret = VM_FAULT_RACE;
 			pte_unmap(page_table);
 			spin_unlock(&mm->page_table_lock);
 			goto out;
@@ -1681,7 +1694,7 @@ static int do_swap_page(struct mm_struct
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (unlikely(!pte_same(*page_table, orig_pte))) {
-		ret = VM_FAULT_MINOR;
+		ret = VM_FAULT_RACE;
 		goto out_nomap;
 	}
 
@@ -1742,6 +1755,7 @@ do_anonymous_page(struct mm_struct *mm, 
 {
 	pte_t entry;
 	struct page * page = ZERO_PAGE(addr);
+	int ret = VM_FAULT_MINOR;
 
 	/* Read-only mapping of ZERO_PAGE. */
 	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
@@ -1765,6 +1779,7 @@ do_anonymous_page(struct mm_struct *mm, 
 			pte_unmap(page_table);
 			page_cache_release(page);
 			spin_unlock(&mm->page_table_lock);
+			ret = VM_FAULT_RACE;
 			goto out;
 		}
 		inc_mm_counter(mm, rss);
@@ -1784,7 +1799,7 @@ do_anonymous_page(struct mm_struct *mm, 
 	lazy_mmu_prot_update(entry);
 	spin_unlock(&mm->page_table_lock);
 out:
-	return VM_FAULT_MINOR;
+	return ret;
 no_mem:
 	return VM_FAULT_OOM;
 }
@@ -1902,6 +1917,7 @@ retry:
 		pte_unmap(page_table);
 		page_cache_release(new_page);
 		spin_unlock(&mm->page_table_lock);
+		ret = VM_FAULT_RACE;
 		goto out;
 	}
 
Index: linux-2.6/arch/alpha/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/alpha/mm/fault.c
+++ linux-2.6/arch/alpha/mm/fault.c
@@ -162,6 +162,8 @@ do_page_fault(unsigned long address, uns
 		goto do_sigbus;
 	      case VM_FAULT_OOM:
 		goto out_of_memory;
+	      case VM_FAULT_RACE:
+		break;
 	      default:
 		BUG();
 	}
Index: linux-2.6/arch/arm/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/arm/mm/fault.c
+++ linux-2.6/arch/arm/mm/fault.c
@@ -195,6 +195,7 @@ survive:
 	case VM_FAULT_MINOR:
 		tsk->min_flt++;
 	case VM_FAULT_SIGBUS:
+	case VM_FAULT_RACE:
 		return fault;
 	}
 
Index: linux-2.6/arch/ia64/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/ia64/mm/fault.c
+++ linux-2.6/arch/ia64/mm/fault.c
@@ -164,6 +164,8 @@ ia64_do_page_fault (unsigned long addres
 		goto bad_area;
 	      case VM_FAULT_OOM:
 		goto out_of_memory;
+	      case VM_FAULT_RACE:
+		break;
 	      default:
 		BUG();
 	}
Index: linux-2.6/arch/m32r/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/m32r/mm/fault.c
+++ linux-2.6/arch/m32r/mm/fault.c
@@ -234,6 +234,8 @@ survive:
 			goto do_sigbus;
 		case VM_FAULT_OOM:
 			goto out_of_memory;
+		case VM_FAULT_RACE:
+			break;
 		default:
 			BUG();
 	}
Index: linux-2.6/arch/mips/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/mips/mm/fault.c
+++ linux-2.6/arch/mips/mm/fault.c
@@ -109,6 +109,8 @@ survive:
 		goto do_sigbus;
 	case VM_FAULT_OOM:
 		goto out_of_memory;
+	case VM_FAULT_RACE:
+		break;
 	default:
 		BUG();
 	}
Index: linux-2.6/arch/ppc/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/ppc/mm/fault.c
+++ linux-2.6/arch/ppc/mm/fault.c
@@ -259,6 +259,8 @@ good_area:
                 goto do_sigbus;
         case VM_FAULT_OOM:
                 goto out_of_memory;
+	case VM_FAULT_RACE:
+		break;
 	default:
 		BUG();
 	}
Index: linux-2.6/arch/ppc64/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/ppc64/mm/fault.c
+++ linux-2.6/arch/ppc64/mm/fault.c
@@ -234,6 +234,8 @@ good_area:
 		goto do_sigbus;
 	case VM_FAULT_OOM:
 		goto out_of_memory;
+	case VM_FAULT_RACE:
+		break;
 	default:
 		BUG();
 	}
Index: linux-2.6/arch/s390/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/s390/mm/fault.c
+++ linux-2.6/arch/s390/mm/fault.c
@@ -260,6 +260,8 @@ survive:
 		goto do_sigbus;
 	case VM_FAULT_OOM:
 		goto out_of_memory;
+	case VM_FAULT_RACE:
+		break;
 	default:
 		BUG();
 	}
Index: linux-2.6/arch/sh/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/sh/mm/fault.c
+++ linux-2.6/arch/sh/mm/fault.c
@@ -101,6 +101,8 @@ survive:
 			goto do_sigbus;
 		case VM_FAULT_OOM:
 			goto out_of_memory;
+		case VM_FAULT_RACE:
+			break;
 		default:
 			BUG();
 	}
Index: linux-2.6/arch/sparc/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/sparc/mm/fault.c
+++ linux-2.6/arch/sparc/mm/fault.c
@@ -302,8 +302,8 @@ good_area:
 		current->maj_flt++;
 		break;
 	case VM_FAULT_MINOR:
-	default:
 		current->min_flt++;
+	case VM_FAULT_RACE:
 		break;
 	}
 	up_read(&mm->mmap_sem);
Index: linux-2.6/arch/sparc64/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/sparc64/mm/fault.c
+++ linux-2.6/arch/sparc64/mm/fault.c
@@ -454,6 +454,8 @@ good_area:
 		goto do_sigbus;
 	case VM_FAULT_OOM:
 		goto out_of_memory;
+	case VM_FAULT_RACE:
+		break;
 	default:
 		BUG();
 	}
Index: linux-2.6/arch/um/kernel/trap_kern.c
===================================================================
--- linux-2.6.orig/arch/um/kernel/trap_kern.c
+++ linux-2.6/arch/um/kernel/trap_kern.c
@@ -76,6 +76,8 @@ int handle_page_fault(unsigned long addr
 		case VM_FAULT_OOM:
 			err = -ENOMEM;
 			goto out_of_memory;
+		case VM_FAULT_RACE:
+			break;
 		default:
 			BUG();
 		}
Index: linux-2.6/arch/xtensa/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/xtensa/mm/fault.c
+++ linux-2.6/arch/xtensa/mm/fault.c
@@ -113,6 +113,8 @@ survive:
 		goto do_sigbus;
 	case VM_FAULT_OOM:
 		goto out_of_memory;
+	case VM_FAULT_RACE:
+		break;
 	default:
 		BUG();
 	}

  reply	other threads:[~2005-08-01  8:21 UTC|newest]

Thread overview: 73+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-07-30 20:53 get_user_pages() with write=1 and force=1 gets read-only pages Robin Holt
2005-07-30 22:13 ` Hugh Dickins
2005-07-31  1:52   ` Nick Piggin
2005-07-31 10:52     ` Robin Holt
2005-07-31 11:07       ` Nick Piggin
2005-07-31 11:30         ` Robin Holt
2005-07-31 11:39           ` Robin Holt
2005-07-31 12:09           ` Robin Holt
2005-07-31 22:27             ` Nick Piggin
2005-08-01  3:22               ` Roland McGrath
2005-08-01  8:21                 ` Nick Piggin [this message]
2005-08-01  9:19                   ` [patch 2.6.13-rc4] fix get_user_pages bug Ingo Molnar
2005-08-01  9:27                     ` Nick Piggin
2005-08-01 10:15                       ` Ingo Molnar
2005-08-01 10:57                         ` Nick Piggin
2005-08-01 19:43                           ` Hugh Dickins
2005-08-01 20:08                             ` Linus Torvalds
2005-08-01 21:06                               ` Hugh Dickins
2005-08-01 21:51                                 ` Linus Torvalds
2005-08-01 22:01                                   ` Linus Torvalds
2005-08-02 12:01                                     ` Martin Schwidefsky
2005-08-02 12:26                                       ` Hugh Dickins
2005-08-02 12:28                                         ` Nick Piggin
2005-08-02 15:19                                         ` Martin Schwidefsky
2005-08-02 15:30                                       ` Linus Torvalds
2005-08-02 16:03                                         ` Hugh Dickins
2005-08-02 16:25                                           ` Linus Torvalds
2005-08-02 17:02                                             ` Linus Torvalds
2005-08-02 17:27                                               ` Hugh Dickins
2005-08-02 17:21                                             ` Hugh Dickins
2005-08-02 18:47                                               ` Linus Torvalds
2005-08-02 19:20                                                 ` Hugh Dickins
2005-08-02 19:54                                                   ` Linus Torvalds
2005-08-02 20:55                                                     ` Hugh Dickins
2005-08-03 10:24                                                       ` Nick Piggin
2005-08-03 11:47                                                         ` Hugh Dickins
2005-08-03 12:13                                                           ` Nick Piggin
2005-08-03 16:12                                                         ` Linus Torvalds
2005-08-03 16:39                                                           ` Linus Torvalds
2005-08-03 16:42                                                             ` Linus Torvalds
2005-08-03 17:12                                                           ` Hugh Dickins
2005-08-03 23:03                                                           ` Nick Piggin
2005-08-04 14:14                                                           ` Alexander Nyberg
2005-08-04 14:30                                                             ` Nick Piggin
2005-08-04 15:00                                                               ` Alexander Nyberg
2005-08-04 15:35                                                                 ` Hugh Dickins
2005-08-04 16:32                                                                   ` Russell King
2005-08-04 15:36                                                                 ` Linus Torvalds
2005-08-04 16:29                                                               ` Russell King
2005-08-03 10:24                                                       ` Martin Schwidefsky
2005-08-03 11:57                                                         ` Hugh Dickins
2005-08-02 16:44                                         ` Martin Schwidefsky
2005-08-01 15:42                   ` Linus Torvalds
2005-08-01 18:18                     ` Linus Torvalds
2005-08-03  8:24                       ` Robin Holt
2005-08-03 11:31                         ` Hugh Dickins
2005-08-04 11:48                           ` Robin Holt
2005-08-04 13:04                             ` Hugh Dickins
2005-08-01 19:29                     ` Hugh Dickins
2005-08-01 19:48                       ` Linus Torvalds
2005-08-02  8:07                         ` Martin Schwidefsky
2005-08-01 19:57                       ` Andrew Morton
2005-08-01 20:16                         ` Linus Torvalds
2005-08-02  0:14                     ` Nick Piggin
2005-08-02  1:27                     ` Nick Piggin
2005-08-02  3:45                       ` Linus Torvalds
2005-08-02  4:25                         ` Nick Piggin
2005-08-02  4:35                           ` Linus Torvalds
2005-08-01 20:03                   ` Hugh Dickins
2005-08-01 20:12                     ` Andrew Morton
2005-08-01 20:26                       ` Linus Torvalds
2005-08-01 20:51                       ` Hugh Dickins
2005-08-02 14:02 Dan Higgins

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=42EDDB82.1040900@yahoo.com.au \
    --to=nickpiggin@yahoo.com.au \
    --cc=akpm@osdl.org \
    --cc=holt@sgi.com \
    --cc=hugh@veritas.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=roland@redhat.com \
    --cc=torvalds@osdl.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox