From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: cl@linux-foundation.org, "linux-mm@kvack.org" <linux-mm@kvack.org>
Subject: [RFC MM 4/4] speculative page fault
Date: Fri, 13 Nov 2009 16:41:34 +0900
Message-ID: <20091113164134.79805c13.kamezawa.hiroyu@jp.fujitsu.com>
In-Reply-To: <20091113163544.d92561c7.kamezawa.hiroyu@jp.fujitsu.com>
Speculative page fault.

This patch implements a speculative page fault path: the fault is handled
without taking mm->mmap_sem, and the tag mm->generation is re-checked after
the page table lock has been taken. If the generation has changed, someone
took the write lock on mm->mmap_sem in the meantime, so we fall back and
retry under the read lock.

For now, hugetlb pages are not handled. Stack VMAs are not handled either,
because stack expansion can change [vm_start, vm_end).
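To illustrate the idea, here is a minimal user-space sketch of the
generation-check pattern (illustrative only: a pthread-based analogy, not the
kernel API; all names below are made up and memory-ordering details are
glossed over). A reader snapshots the generation as a key, does its work
without the big lock, and re-checks the key under the fine-grained lock; if a
writer bumped the generation, the reader retries on the locked slow path.

	/* Sketch of the speculative-read / generation-check pattern. */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;
	static pthread_mutex_t pte_lock = PTHREAD_MUTEX_INITIALIZER;
	static unsigned int generation;	/* bumped by writers under map_lock */
	static int mapping;		/* stand-in for page table state */

	/* Writer: takes the write lock and invalidates speculative readers. */
	static void writer_update(int value)
	{
		pthread_rwlock_wrlock(&map_lock);
		generation++;		/* readers holding an old key must retry */
		mapping = value;
		pthread_rwlock_unlock(&map_lock);
	}

	/* Reader: speculative fast path, slow path under the read lock. */
	static int reader_fault(void)
	{
		unsigned int key = generation;	/* snapshot before the work */
		int result;

		/* ...speculative work without map_lock would go here... */

		pthread_mutex_lock(&pte_lock);	/* like the page table lock */
		if (key != generation) {
			/* a writer ran; fall back to the locked slow path */
			pthread_mutex_unlock(&pte_lock);
			pthread_rwlock_rdlock(&map_lock);
			pthread_mutex_lock(&pte_lock);
			result = mapping;
			pthread_mutex_unlock(&pte_lock);
			pthread_rwlock_unlock(&map_lock);
			return result;
		}
		result = mapping;
		pthread_mutex_unlock(&pte_lock);
		return result;
	}

	int main(void)
	{
		writer_update(42);
		printf("read %d\n", reader_fault());
		return 0;
	}

In the patch below the same role is played by mm->generation (the key),
mm_reader_trylock()/mm_reader_unlock() (the slow path), and the pte lock.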
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
arch/x86/mm/fault.c | 54 ++++++++++++++++++++++++++--------------
include/linux/mm.h | 2 -
mm/memory.c | 70 ++++++++++++++++++++++++++++++++--------------------
3 files changed, 81 insertions(+), 45 deletions(-)
Index: mmotm-2.6.32-Nov2/arch/x86/mm/fault.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/arch/x86/mm/fault.c
+++ mmotm-2.6.32-Nov2/arch/x86/mm/fault.c
@@ -11,6 +11,7 @@
#include <linux/kprobes.h> /* __kprobes, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
+#include <linux/hugetlb.h> /* is_vm_hugetlb_page()... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -952,7 +953,8 @@ do_page_fault(struct pt_regs *regs, unsi
struct mm_struct *mm;
int write;
int fault;
- int cachehit = 0;
+ int cachehit;
+ unsigned int key;
tsk = current;
mm = tsk->mm;
@@ -1057,6 +1059,18 @@ do_page_fault(struct pt_regs *regs, unsi
* validate the source. If this is invalid we can skip the address
* space check, thus avoiding the deadlock:
*/
+ if ((error_code & PF_USER) &&
+ (mm->generation == current->mm_generation) && current->vma_cache) {
+ vma = current->vma_cache;
+ if ((vma->vm_start <= address) && (address < vma->vm_end)) {
+ key = mm->generation;
+ cachehit = 1;
+ goto got_vma;
+ }
+ }
+speculative_fault_retry:
+ cachehit = 0;
+ vma = NULL;
if (unlikely(!mm_reader_trylock(mm))) {
if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->ip)) {
@@ -1072,13 +1086,9 @@ do_page_fault(struct pt_regs *regs, unsi
*/
might_sleep();
}
- if ((mm->generation == current->mm_generation) && current->vma_cache) {
- vma = current->vma_cache;
- if ((vma->vm_start <= address) && (address < vma->vm_end))
- cachehit = 1;
- }
- if (!cachehit)
- vma = find_vma(mm, address);
+ key = mm->generation;
+ vma = find_vma(mm, address);
+got_vma:
if (unlikely(!vma)) {
bad_area(regs, error_code, address);
return;
@@ -1123,13 +1133,17 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault:
*/
- fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
+ fault = handle_mm_fault(mm, vma, address,
+ write ? FAULT_FLAG_WRITE : 0, key);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
return;
}
+ if (mm->generation != key)
+ goto speculative_fault_retry;
+
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
@@ -1139,16 +1153,20 @@ good_area:
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
regs, address);
}
- /* cache information */
+ check_v8086_mode(regs, address, tsk);
+
if (!cachehit) {
- if (current->vma_cache)
+ /* cache information if not hit. */
+ if (current->vma_cache) {
vma_put(current->vma_cache);
- current->vma_cache = vma;
- current->mm_generation = mm->generation;
- vma_get(vma);
+ current->vma_cache = NULL;
+ }
+ if (!is_vm_hugetlb_page(vma) &&
+ !((vma->vm_flags & VM_STACK_FLAGS) == VM_STACK_FLAGS)){
+ current->vma_cache = vma;
+ current->mm_generation = mm->generation;
+ vma_get(vma);
+ }
+ mm_reader_unlock(mm);
}
-
- check_v8086_mode(regs, address, tsk);
-
- mm_reader_unlock(mm);
}
Index: mmotm-2.6.32-Nov2/include/linux/mm.h
===================================================================
--- mmotm-2.6.32-Nov2.orig/include/linux/mm.h
+++ mmotm-2.6.32-Nov2/include/linux/mm.h
@@ -803,7 +803,7 @@ int invalidate_inode_page(struct page *p
#ifdef CONFIG_MMU
extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, unsigned int flags);
+ unsigned long address, unsigned int flags, unsigned int key);
#else
static inline int handle_mm_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
Index: mmotm-2.6.32-Nov2/mm/memory.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/memory.c
+++ mmotm-2.6.32-Nov2/mm/memory.c
@@ -145,6 +145,13 @@ void pmd_clear_bad(pmd_t *pmd)
pmd_clear(pmd);
}
+static int match_key(struct mm_struct *mm, unsigned int key)
+{
+ if (likely(key == mm->generation))
+ return 1;
+ return 0;
+}
+
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -1339,7 +1346,7 @@ int __get_user_pages(struct task_struct
ret = handle_mm_fault(mm, vma, start,
(foll_flags & FOLL_WRITE) ?
- FAULT_FLAG_WRITE : 0);
+ FAULT_FLAG_WRITE : 0, mm->generation);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
@@ -2002,7 +2009,7 @@ static inline void cow_user_page(struct
*/
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- spinlock_t *ptl, pte_t orig_pte)
+ spinlock_t *ptl, pte_t orig_pte, unsigned int key)
{
struct page *old_page, *new_page;
pte_t entry;
@@ -2036,7 +2043,8 @@ static int do_wp_page(struct mm_struct *
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
- if (!pte_same(*page_table, orig_pte)) {
+ if (!match_key(mm, key) ||
+ !pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
goto unlock;
@@ -2097,7 +2105,8 @@ static int do_wp_page(struct mm_struct *
*/
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
- if (!pte_same(*page_table, orig_pte)) {
+ if (!match_key(mm, key) ||
+ !pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_cache_release(old_page);
goto unlock;
@@ -2160,7 +2169,8 @@ gotten:
* Re-check the pte - we dropped the lock
*/
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (likely(pte_same(*page_table, orig_pte))) {
+ if (likely(match_key(mm, key) &&
+ pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter(mm, file_rss);
@@ -2509,7 +2519,7 @@ int vmtruncate_range(struct inode *inode
*/
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte, unsigned int key)
{
spinlock_t *ptl;
struct page *page;
@@ -2572,6 +2582,8 @@ static int do_swap_page(struct mm_struct
* Back out if somebody else already faulted in this pte.
*/
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!match_key(mm, key))
+ goto out_nomap;
if (unlikely(!pte_same(*page_table, orig_pte)))
goto out_nomap;
@@ -2612,7 +2624,8 @@ static int do_swap_page(struct mm_struct
unlock_page(page);
if (flags & FAULT_FLAG_WRITE) {
- ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+ ret |= do_wp_page(mm, vma, address, page_table,
+ pmd, ptl, pte, key);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
@@ -2641,7 +2654,7 @@ out_release:
*/
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags)
+ unsigned int flags, unsigned int key)
{
struct page *page;
spinlock_t *ptl;
@@ -2652,7 +2665,7 @@ static int do_anonymous_page(struct mm_s
vma->vm_page_prot));
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
- if (!pte_none(*page_table))
+ if (!match_key(mm, key) || !pte_none(*page_table))
goto unlock;
goto setpte;
}
@@ -2675,7 +2688,7 @@ static int do_anonymous_page(struct mm_s
entry = pte_mkwrite(pte_mkdirty(entry));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (!pte_none(*page_table))
+ if (!match_key(mm, key) || !pte_none(*page_table))
goto release;
inc_mm_counter(mm, anon_rss);
@@ -2712,8 +2725,8 @@ oom:
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+ unsigned long address, pmd_t *pmd, pgoff_t pgoff,
+ unsigned int flags, pte_t orig_pte, unsigned int key)
{
pte_t *page_table;
spinlock_t *ptl;
@@ -2826,7 +2839,8 @@ static int __do_fault(struct mm_struct *
* handle that later.
*/
/* Only go through if we didn't race with anybody else... */
- if (likely(pte_same(*page_table, orig_pte))) {
+ if (likely(match_key(mm, key) &&
+ pte_same(*page_table, orig_pte))) {
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
if (flags & FAULT_FLAG_WRITE)
@@ -2891,13 +2905,14 @@ unwritable_page:
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte, unsigned int key)
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
- return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ return __do_fault(mm, vma, address, pmd, pgoff, flags,
+ orig_pte, key);
}
/*
@@ -2911,7 +2926,7 @@ static int do_linear_fault(struct mm_str
*/
static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte, unsigned int key)
{
pgoff_t pgoff;
@@ -2929,7 +2944,8 @@ static int do_nonlinear_fault(struct mm_
}
pgoff = pte_to_pgoff(orig_pte);
- return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ return __do_fault(mm, vma, address, pmd, pgoff,
+ flags, orig_pte, key);
}
/*
@@ -2946,8 +2962,8 @@ static int do_nonlinear_fault(struct mm_
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static inline int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, unsigned int flags)
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pmd_t *pmd, unsigned int flags, unsigned int key)
{
pte_t entry;
spinlock_t *ptl;
@@ -2958,26 +2974,28 @@ static inline int handle_pte_fault(struc
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
return do_linear_fault(mm, vma, address,
- pte, pmd, flags, entry);
+ pte, pmd, flags, entry, key);
}
return do_anonymous_page(mm, vma, address,
- pte, pmd, flags);
+ pte, pmd, flags, key);
}
if (pte_file(entry))
return do_nonlinear_fault(mm, vma, address,
- pte, pmd, flags, entry);
+ pte, pmd, flags, entry, key);
return do_swap_page(mm, vma, address,
- pte, pmd, flags, entry);
+ pte, pmd, flags, entry, key);
}
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
+ if (!match_key(mm, key))
+ goto unlock;
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(mm, vma, address,
- pte, pmd, ptl, entry);
+ pte, pmd, ptl, entry, key);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
@@ -3002,7 +3020,7 @@ unlock:
* By the time we get here, we already hold the mm semaphore
*/
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
+ unsigned long address, unsigned int flags, unsigned int key)
{
pgd_t *pgd;
pud_t *pud;
@@ -3027,7 +3045,7 @@ int handle_mm_fault(struct mm_struct *mm
if (!pte)
return VM_FAULT_OOM;
- return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags, key);
}
#ifndef __PAGETABLE_PUD_FOLDED
--