linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Minchan Kim <minchan.kim@gmail.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>,
	Andi Kleen <andi@firstfloor.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>,
	cl@linux-foundation.org,
	"akpm@linux-foundation.org" <akpm@linux-foundation.org>,
	"mingo@elte.hu" <mingo@elte.hu>
Subject: Re: [RFC 4/4] speculative pag fault
Date: Fri, 18 Dec 2009 14:54:49 +0900	[thread overview]
Message-ID: <20091218145449.d3fb94cd.minchan.kim@barrios-desktop> (raw)
In-Reply-To: <20091218094602.3dcd5a02.kamezawa.hiroyu@jp.fujitsu.com>

Hi, Kame. 

On Fri, 18 Dec 2009 09:46:02 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> 
> Lookup vma in lockless style, do page fault, and check mm's version
> after takine page table lock. If racy, mm's version is invalid .
> Then, retry page fault.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
>  arch/x86/mm/fault.c |   28 +++++++++++++++++++++++++---
>  mm/memory.c         |   21 ++++++++++++++-------
>  2 files changed, 39 insertions(+), 10 deletions(-)
> 
> Index: mmotm-mm-accessor/arch/x86/mm/fault.c
> ===================================================================
> --- mmotm-mm-accessor.orig/arch/x86/mm/fault.c
> +++ mmotm-mm-accessor/arch/x86/mm/fault.c
> @@ -11,6 +11,7 @@
>  #include <linux/kprobes.h>		/* __kprobes, ...		*/
>  #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
>  #include <linux/perf_event.h>		/* perf_sw_event		*/
> +#include <linux/hugetlb.h>		/* is_vm_hugetlb...*/
>  
>  #include <asm/traps.h>			/* dotraplinkage, ...		*/
>  #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
> @@ -952,6 +953,7 @@ do_page_fault(struct pt_regs *regs, unsi
>  	struct mm_struct *mm;
>  	int write;
>  	int fault;
> +	int speculative;
>  
>  	tsk = current;
>  	mm = tsk->mm;
> @@ -1040,6 +1042,17 @@ do_page_fault(struct pt_regs *regs, unsi
>  		return;
>  	}
>  
> +	if ((error_code & PF_USER) && mm_version_check(mm)) {
> +		vma = lookup_vma_cache(mm, address);
> +		if (vma && mm_version_check(mm) &&
> +		   (vma->vm_start <= address) && (address < vma->vm_end)) {
> +			speculative = 1;
> +			goto found_vma;
> +		}
> +		if (vma)
> +			vma_release(vma);
> +	}
> +
>  	/*
>  	 * When running in the kernel we expect faults to occur only to
>  	 * addresses in user space.  All other faults represent errors in
> @@ -1056,6 +1069,8 @@ do_page_fault(struct pt_regs *regs, unsi
>  	 * validate the source. If this is invalid we can skip the address
>  	 * space check, thus avoiding the deadlock:
>  	 */
> +retry_with_lock:
> +	speculative = 0;
>  	if (unlikely(!mm_read_trylock(mm))) {
>  		if ((error_code & PF_USER) == 0 &&
>  		    !search_exception_tables(regs->ip)) {
> @@ -1073,6 +1088,7 @@ do_page_fault(struct pt_regs *regs, unsi
>  	}
>  
>  	vma = find_vma(mm, address);
> +found_vma:
>  	if (unlikely(!vma)) {
>  		bad_area(regs, error_code, address);
>  		return;
> @@ -1119,6 +1135,7 @@ good_area:
>  	 */
>  	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
>  
> +
>  	if (unlikely(fault & VM_FAULT_ERROR)) {
>  		mm_fault_error(regs, error_code, address, fault);
>  		return;
> @@ -1128,13 +1145,18 @@ good_area:
>  		tsk->maj_flt++;
>  		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
>  				     regs, address);
> -	} else {
> +	} else if (!speculative || mm_version_check(mm)) {
>  		tsk->min_flt++;
>  		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
>  				     regs, address);
> +	} else {
> +		vma_release(vma);
> +		goto retry_with_lock;
>  	}
>  
>  	check_v8086_mode(regs, address, tsk);
> -
> -	mm_read_unlock(mm);
> +	if (!speculative)
> +		mm_read_unlock(mm);
> +	else
> +		vma_release(vma);
>  }
> Index: mmotm-mm-accessor/mm/memory.c
> ===================================================================
> --- mmotm-mm-accessor.orig/mm/memory.c
> +++ mmotm-mm-accessor/mm/memory.c
> @@ -121,6 +121,13 @@ static int __init init_zero_pfn(void)
>  }
>  core_initcall(init_zero_pfn);
>  
> +static bool test_valid_pte(struct mm_struct *mm, pte_t pte, pte_t orig)
> +{
> +	if (likely(mm_version_check(mm) && pte_same(pte, orig)))
> +		return true;
> +	return false;
> +}
> +
>  /*
>   * If a p?d_bad entry is found while walking page tables, report
>   * the error, before resetting entry to p?d_none.  Usually (but
> @@ -2044,7 +2051,7 @@ static int do_wp_page(struct mm_struct *
>  			lock_page(old_page);
>  			page_table = pte_offset_map_lock(mm, pmd, address,
>  							 &ptl);
> -			if (!pte_same(*page_table, orig_pte)) {
> +			if (!test_valid_pte(mm, *page_table, orig_pte)) {
>  				unlock_page(old_page);
>  				page_cache_release(old_page);
>  				goto unlock;
> @@ -2105,7 +2112,7 @@ static int do_wp_page(struct mm_struct *
>  			 */
>  			page_table = pte_offset_map_lock(mm, pmd, address,
>  							 &ptl);
> -			if (!pte_same(*page_table, orig_pte)) {
> +			if (!test_valid_pte(mm, *page_table, orig_pte)) {
>  				unlock_page(old_page);
>  				page_cache_release(old_page);
>  				goto unlock;
> @@ -2169,7 +2176,7 @@ gotten:
>  	 * Re-check the pte - we dropped the lock
>  	 */
>  	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
> -	if (likely(pte_same(*page_table, orig_pte))) {
> +	if (test_valid_pte(mm, *page_table, orig_pte)) {
>  		if (old_page) {
>  			if (!PageAnon(old_page)) {
>  				dec_mm_counter(mm, file_rss);
> @@ -2555,7 +2562,7 @@ static int do_swap_page(struct mm_struct
>  			 * while we released the pte lock.
>  			 */
>  			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
> -			if (likely(pte_same(*page_table, orig_pte)))
> +			if (pte_same(*page_table, orig_pte))
>  				ret = VM_FAULT_OOM;
>  			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
>  			goto unlock;
> @@ -2588,7 +2595,7 @@ static int do_swap_page(struct mm_struct
>  	 * Back out if somebody else already faulted in this pte.
>  	 */
>  	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
> -	if (unlikely(!pte_same(*page_table, orig_pte)))
> +	if (unlikely(!test_valid_pte(mm, *page_table, orig_pte)))
>  		goto out_nomap;
>  
>  	if (unlikely(!PageUptodate(page))) {
> @@ -2844,7 +2851,7 @@ static int __do_fault(struct mm_struct *
>  	 * handle that later.
>  	 */
>  	/* Only go through if we didn't race with anybody else... */
> -	if (likely(pte_same(*page_table, orig_pte))) {
> +	if (likely(test_valid_pte(mm, *page_table, orig_pte))) {
>  		flush_icache_page(vma, page);
>  		entry = mk_pte(page, vma->vm_page_prot);
>  		if (flags & FAULT_FLAG_WRITE)
> @@ -2991,7 +2998,7 @@ static inline int handle_pte_fault(struc
>  
>  	ptl = pte_lockptr(mm, pmd);
>  	spin_lock(ptl);
> -	if (unlikely(!pte_same(*pte, entry)))
> +	if (unlikely(!test_valid_pte(mm, *pte, entry)))
>  		goto unlock;
>  	if (flags & FAULT_FLAG_WRITE) {
>  		if (!pte_write(entry))
> 

I looked over the patch series and came up with one scenario.

CPU A				CPU B

"Thread A reads page"
		
do_page_fault
lookup_vma_cache
vma->cache_access++
				"Thread B unmaps the vma"

				mm_write_lock
				down_write(mm->mmap_sem)
				mm->version++
				do_munmap
				wait_vmas_cache_access
				wait_event_interruptible
mm_version_check fail
vma_release
wake_up(vma->cache_wait)
				unmap_region
				mm_write_unlock
mm_read_trylock
find_vma
!vma
bad_area
				
In the above scenario, Thread A is apparently reading a valid page in the vma at that time,
but it would get a segmentation fault because of the speculative page fault.

Sorry that I don't have time to review in more detail.
If I missed something, please correct me.

I will review in more detail sooner or later. :)

-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2009-12-18  6:01 UTC|newest]

Thread overview: 63+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-12-16  3:00 [mm][RFC][PATCH 0/11] mm accessor updates KAMEZAWA Hiroyuki
2009-12-16  3:01 ` [mm][RFC][PATCH 1/11] mm accessor for replacing mmap_sem KAMEZAWA Hiroyuki
2009-12-16  3:02 ` [mm][RFC][PATCH 2/11] mm accessor for kernel core KAMEZAWA Hiroyuki
2009-12-16  3:03 ` [mm][RFC][PATCH 3/11] mm accessor for fs KAMEZAWA Hiroyuki
2009-12-16  3:04 ` [mm][RFC][PATCH 4/11] mm accessor for kvm KAMEZAWA Hiroyuki
2009-12-16  3:05 ` [mm][RFC][PATCH 5/11] mm accessor for tomoyo KAMEZAWA Hiroyuki
2009-12-16  3:06 ` [mm][RFC][PATCH 6/11] mm accessor for driver/gpu KAMEZAWA Hiroyuki
2009-12-16  3:07 ` [mm][RFC][PATCH 7/11] mm accessor for inifiniband KAMEZAWA Hiroyuki
2009-12-16  3:08 ` [mm][RFC][PATCH 8/11] mm accessor for video KAMEZAWA Hiroyuki
2009-12-16  3:09 ` [mm][RFC][PATCH 9/11] mm accessor for sgi gru KAMEZAWA Hiroyuki
2009-12-16  3:10 ` [mm][RFC][PATCH 10/11] mm accessor for misc drivers KAMEZAWA Hiroyuki
2009-12-16  3:11 ` [mm][RFC][PATCH 11/11] mm accessor for x86 KAMEZAWA Hiroyuki
2009-12-16 10:11 ` [mm][RFC][PATCH 0/11] mm accessor updates Andi Kleen
2009-12-16 10:13   ` KAMEZAWA Hiroyuki
2009-12-16 10:28     ` Andi Kleen
2009-12-16 10:31       ` KAMEZAWA Hiroyuki
2009-12-16 10:49         ` Andi Kleen
2009-12-16 11:12           ` KAMEZAWA Hiroyuki
2009-12-16 11:31             ` Andi Kleen
2009-12-16 16:27               ` Christoph Lameter
2009-12-16 23:01                 ` Peter Zijlstra
2009-12-17  4:11                   ` KOSAKI Motohiro
2009-12-17  8:41                   ` Andi Kleen
2009-12-16 22:57         ` Peter Zijlstra
2009-12-17  8:40           ` Andi Kleen
2009-12-17  8:45             ` Peter Zijlstra
2009-12-17  8:54               ` Andi Kleen
2009-12-17 14:45                 ` Paul E. McKenney
2009-12-17 15:02                   ` Peter Zijlstra
2009-12-17 17:53                   ` Andi Kleen
2009-12-17 19:08                     ` Paul E. McKenney
2009-12-17 19:55                       ` Andi Kleen
2009-12-17 19:56                         ` Christoph Lameter
2009-12-17 20:14                           ` Peter Zijlstra
2009-12-17 20:42                             ` Christoph Lameter
2009-12-18  5:17                               ` Ingo Molnar
2009-12-18 17:00                                 ` Avi Kivity
2009-12-18 17:12                                   ` Ingo Molnar
2009-12-18 18:12                                     ` Christoph Lameter
2009-12-18 18:43                                       ` Andi Kleen
2009-12-18 18:45                                       ` Ingo Molnar
2009-12-18 23:18                                         ` KAMEZAWA Hiroyuki
2009-12-17 19:33             ` Christoph Lameter
2009-12-17 20:07               ` Peter Zijlstra
2009-12-17 20:13                 ` Christoph Lameter
2009-12-17 20:19                   ` Peter Zijlstra
2009-12-16 10:31       ` Minchan Kim
2009-12-16 10:33         ` KAMEZAWA Hiroyuki
2009-12-18  0:38           ` [RFC 0/4] speculative page fault (Was " KAMEZAWA Hiroyuki
2009-12-18  0:41             ` [RFC 1/4] uninline mm accessor KAMEZAWA Hiroyuki
2009-12-18  0:43             ` [RFC 2/4] add mm event counter KAMEZAWA Hiroyuki
2009-12-19  3:23               ` Minchan Kim
2009-12-19  6:37                 ` KAMEZAWA Hiroyuki
2009-12-18  0:45             ` [RFC 3/4] lockless vma caching KAMEZAWA Hiroyuki
2009-12-19  3:43               ` Minchan Kim
2009-12-19  6:44                 ` KAMEZAWA Hiroyuki
2009-12-18  0:46             ` [RFC 4/4] speculative pag fault KAMEZAWA Hiroyuki
2009-12-18  5:54               ` Minchan Kim [this message]
2009-12-18  6:06                 ` KAMEZAWA Hiroyuki
2009-12-18  6:33                   ` Minchan Kim
2009-12-19  3:55               ` Minchan Kim
2009-12-19  6:49                 ` KAMEZAWA Hiroyuki
2009-12-16 16:24   ` [mm][RFC][PATCH 0/11] mm accessor updates Christoph Lameter

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20091218145449.d3fb94cd.minchan.kim@barrios-desktop \
    --to=minchan.kim@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=andi@firstfloor.org \
    --cc=cl@linux-foundation.org \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mingo@elte.hu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox