From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: cl@linux-foundation.org, "linux-mm@kvack.org" <linux-mm@kvack.org>
Subject: [RFC MM 4/4] speculative page fault
Date: Fri, 13 Nov 2009 16:41:34 +0900
Message-ID: <20091113164134.79805c13.kamezawa.hiroyu@jp.fujitsu.com>
In-Reply-To: <20091113163544.d92561c7.kamezawa.hiroyu@jp.fujitsu.com>
Speculative page fault.

This patch implements a speculative page fault path: the fault is handled
without taking mm->mmap_sem, and the tag mm->generation is re-checked after
the page table lock has been taken. If the generation has changed, someone
took the write lock on mm->mmap_sem in the meantime, so we fall back and
retry under the read lock.

For now, hugetlb pages are not handled. Stack VMAs are not handled either,
because stack expansion can change [vm_start, vm_end).
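To illustrate the idea, here is a minimal user-space sketch of the
generation-check pattern (illustrative only: a pthread-based analogy, not the
kernel API; all names below are made up and memory-ordering details are
glossed over). A reader snapshots the generation as a key, does its work
without the big lock, and re-checks the key under the fine-grained lock; if a
writer bumped the generation, the reader retries on the locked slow path.

	/* Sketch of the speculative-read / generation-check pattern. */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;
	static pthread_mutex_t pte_lock = PTHREAD_MUTEX_INITIALIZER;
	static unsigned int generation;	/* bumped by writers under map_lock */
	static int mapping;		/* stand-in for page table state */

	/* Writer: takes the write lock and invalidates speculative readers. */
	static void writer_update(int value)
	{
		pthread_rwlock_wrlock(&map_lock);
		generation++;		/* readers holding an old key must retry */
		mapping = value;
		pthread_rwlock_unlock(&map_lock);
	}

	/* Reader: speculative fast path, slow path under the read lock. */
	static int reader_fault(void)
	{
		unsigned int key = generation;	/* snapshot before the work */
		int result;

		/* ...speculative work without map_lock would go here... */

		pthread_mutex_lock(&pte_lock);	/* like the page table lock */
		if (key != generation) {
			/* a writer ran; fall back to the locked slow path */
			pthread_mutex_unlock(&pte_lock);
			pthread_rwlock_rdlock(&map_lock);
			pthread_mutex_lock(&pte_lock);
			result = mapping;
			pthread_mutex_unlock(&pte_lock);
			pthread_rwlock_unlock(&map_lock);
			return result;
		}
		result = mapping;
		pthread_mutex_unlock(&pte_lock);
		return result;
	}

	int main(void)
	{
		writer_update(42);
		printf("read %d\n", reader_fault());
		return 0;
	}

In the patch below the same role is played by mm->generation (the key),
mm_reader_trylock()/mm_reader_unlock() (the slow path), and the pte lock.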
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
arch/x86/mm/fault.c | 54 ++++++++++++++++++++++++++--------------
include/linux/mm.h | 2 -
mm/memory.c | 70 ++++++++++++++++++++++++++++++++--------------------
3 files changed, 81 insertions(+), 45 deletions(-)
Index: mmotm-2.6.32-Nov2/arch/x86/mm/fault.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/arch/x86/mm/fault.c
+++ mmotm-2.6.32-Nov2/arch/x86/mm/fault.c
@@ -11,6 +11,7 @@
#include <linux/kprobes.h> /* __kprobes, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
+#include <linux/hugetlb.h> /* is_vm_hugetlb_page()... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -952,7 +953,8 @@ do_page_fault(struct pt_regs *regs, unsi
struct mm_struct *mm;
int write;
int fault;
- int cachehit = 0;
+ int cachehit;
+ unsigned int key;
tsk = current;
mm = tsk->mm;
@@ -1057,6 +1059,18 @@ do_page_fault(struct pt_regs *regs, unsi
* validate the source. If this is invalid we can skip the address
* space check, thus avoiding the deadlock:
*/
+ if ((error_code & PF_USER) &&
+ (mm->generation == current->mm_generation) && current->vma_cache) {
+ vma = current->vma_cache;
+ if ((vma->vm_start <= address) && (address < vma->vm_end)) {
+ key = mm->generation;
+ cachehit = 1;
+ goto got_vma;
+ }
+ }
+speculative_fault_retry:
+ cachehit = 0;
+ vma = NULL;
if (unlikely(!mm_reader_trylock(mm))) {
if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->ip)) {
@@ -1072,13 +1086,9 @@ do_page_fault(struct pt_regs *regs, unsi
*/
might_sleep();
}
- if ((mm->generation == current->mm_generation) && current->vma_cache) {
- vma = current->vma_cache;
- if ((vma->vm_start <= address) && (address < vma->vm_end))
- cachehit = 1;
- }
- if (!cachehit)
- vma = find_vma(mm, address);
+ key = mm->generation;
+ vma = find_vma(mm, address);
+got_vma:
if (unlikely(!vma)) {
bad_area(regs, error_code, address);
return;
@@ -1123,13 +1133,17 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault:
*/
- fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
+ fault = handle_mm_fault(mm, vma, address,
+ write ? FAULT_FLAG_WRITE : 0, key);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
return;
}
+ if (mm->generation != key)
+ goto speculative_fault_retry;
+
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
@@ -1139,16 +1153,20 @@ good_area:
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
regs, address);
}
- /* cache information */
+ check_v8086_mode(regs, address, tsk);
+
if (!cachehit) {
- if (current->vma_cache)
+ /* cache information if not hit. */
+ if (current->vma_cache) {
vma_put(current->vma_cache);
- current->vma_cache = vma;
- current->mm_generation = mm->generation;
- vma_get(vma);
+ current->vma_cache = NULL;
+ }
+ if (!is_vm_hugetlb_page(vma) &&
+ !((vma->vm_flags & VM_STACK_FLAGS) == VM_STACK_FLAGS)){
+ current->vma_cache = vma;
+ current->mm_generation = mm->generation;
+ vma_get(vma);
+ }
+ mm_reader_unlock(mm);
}
-
- check_v8086_mode(regs, address, tsk);
-
- mm_reader_unlock(mm);
}
Index: mmotm-2.6.32-Nov2/include/linux/mm.h
===================================================================
--- mmotm-2.6.32-Nov2.orig/include/linux/mm.h
+++ mmotm-2.6.32-Nov2/include/linux/mm.h
@@ -803,7 +803,7 @@ int invalidate_inode_page(struct page *p
#ifdef CONFIG_MMU
extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, unsigned int flags);
+ unsigned long address, unsigned int flags, unsigned int key);
#else
static inline int handle_mm_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
Index: mmotm-2.6.32-Nov2/mm/memory.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/memory.c
+++ mmotm-2.6.32-Nov2/mm/memory.c
@@ -145,6 +145,13 @@ void pmd_clear_bad(pmd_t *pmd)
pmd_clear(pmd);
}
+static int match_key(struct mm_struct *mm, unsigned int key)
+{
+ if (likely(key == mm->generation))
+ return 1;
+ return 0;
+}
+
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -1339,7 +1346,7 @@ int __get_user_pages(struct task_struct
ret = handle_mm_fault(mm, vma, start,
(foll_flags & FOLL_WRITE) ?
- FAULT_FLAG_WRITE : 0);
+ FAULT_FLAG_WRITE : 0, mm->generation);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
@@ -2002,7 +2009,7 @@ static inline void cow_user_page(struct
*/
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- spinlock_t *ptl, pte_t orig_pte)
+ spinlock_t *ptl, pte_t orig_pte, unsigned int key)
{
struct page *old_page, *new_page;
pte_t entry;
@@ -2036,7 +2043,8 @@ static int do_wp_page(struct mm_struct *
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
- if (!pte_same(*page_table, orig_pte)) {
+ if (!match_key(mm, key) ||
+ !pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
goto unlock;
@@ -2097,7 +2105,8 @@ static int do_wp_page(struct mm_struct *
*/
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
- if (!pte_same(*page_table, orig_pte)) {
+ if (!match_key(mm, key) ||
+ !pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
goto unlock;
@@ -2160,7 +2169,8 @@ gotten:
* Re-check the pte - we dropped the lock
*/
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (likely(pte_same(*page_table, orig_pte))) {
+ if (likely(match_key(mm, key) &&
+ pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter(mm, file_rss);
@@ -2509,7 +2519,7 @@ int vmtruncate_range(struct inode *inode
*/
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte, unsigned int key)
{
spinlock_t *ptl;
struct page *page;
@@ -2572,6 +2582,8 @@ static int do_swap_page(struct mm_struct
* Back out if somebody else already faulted in this pte.
*/
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!match_key(mm, key))
+ goto out_nomap;
if (unlikely(!pte_same(*page_table, orig_pte)))
goto out_nomap;
@@ -2612,7 +2624,8 @@ static int do_swap_page(struct mm_struct
unlock_page(page);
if (flags & FAULT_FLAG_WRITE) {
- ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+ ret |= do_wp_page(mm, vma, address, page_table,
+ pmd, ptl, pte, key);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
@@ -2641,7 +2654,7 @@ out_release:
*/
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags)
+ unsigned int flags, unsigned int key)
{
struct page *page;
spinlock_t *ptl;
@@ -2652,7 +2665,7 @@ static int do_anonymous_page(struct mm_s
vma->vm_page_prot));
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
- if (!pte_none(*page_table))
+ if (!match_key(mm, key) || !pte_none(*page_table))
goto unlock;
goto setpte;
}
@@ -2675,7 +2688,7 @@ static int do_anonymous_page(struct mm_s
entry = pte_mkwrite(pte_mkdirty(entry));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (!pte_none(*page_table))
+ if (!match_key(mm, key) || !pte_none(*page_table))
goto release;
inc_mm_counter(mm, anon_rss);
@@ -2712,8 +2725,8 @@ oom:
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+ unsigned long address, pmd_t *pmd, pgoff_t pgoff,
+ unsigned int flags, pte_t orig_pte, unsigned int key)
{
pte_t *page_table;
spinlock_t *ptl;
@@ -2826,7 +2839,8 @@ static int __do_fault(struct mm_struct *
* handle that later.
*/
/* Only go through if we didn't race with anybody else... */
- if (likely(pte_same(*page_table, orig_pte))) {
+ if (likely(match_key(mm, key) &&
+ pte_same(*page_table, orig_pte))) {
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
if (flags & FAULT_FLAG_WRITE)
@@ -2891,13 +2905,14 @@ unwritable_page:
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte, unsigned int key)
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
- return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ return __do_fault(mm, vma, address, pmd, pgoff, flags,
+ orig_pte, key);
}
/*
@@ -2911,7 +2926,7 @@ static int do_linear_fault(struct mm_str
*/
static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte, unsigned int key)
{
pgoff_t pgoff;
@@ -2929,7 +2944,8 @@ static int do_nonlinear_fault(struct mm_
}
pgoff = pte_to_pgoff(orig_pte);
- return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ return __do_fault(mm, vma, address, pmd, pgoff,
+ flags, orig_pte, key);
}
/*
@@ -2946,8 +2962,8 @@ static int do_nonlinear_fault(struct mm_
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static inline int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, unsigned int flags)
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pmd_t *pmd, unsigned int flags, unsigned int key)
{
pte_t entry;
spinlock_t *ptl;
@@ -2958,26 +2974,28 @@ static inline int handle_pte_fault(struc
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
return do_linear_fault(mm, vma, address,
- pte, pmd, flags, entry);
+ pte, pmd, flags, entry, key);
}
return do_anonymous_page(mm, vma, address,
- pte, pmd, flags);
+ pte, pmd, flags, key);
}
if (pte_file(entry))
return do_nonlinear_fault(mm, vma, address,
- pte, pmd, flags, entry);
+ pte, pmd, flags, entry, key);
return do_swap_page(mm, vma, address,
- pte, pmd, flags, entry);
+ pte, pmd, flags, entry, key);
}
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
+ if (!match_key(mm, key))
+ goto unlock;
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(mm, vma, address,
- pte, pmd, ptl, entry);
+ pte, pmd, ptl, entry, key);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
@@ -3002,7 +3020,7 @@ unlock:
* By the time we get here, we already hold the mm semaphore
*/
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
+ unsigned long address, unsigned int flags, unsigned int key)
{
pgd_t *pgd;
pud_t *pud;
@@ -3027,7 +3045,7 @@ int handle_mm_fault(struct mm_struct *mm
if (!pte)
return VM_FAULT_OOM;
- return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags, key);
}
#ifndef __PAGETABLE_PUD_FOLDED
--