[PATCH][1/2]page_fault retry with NOPAGE

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH][1/2]page_fault retry with NOPAGE_RETRY
@ 2009-04-08 20:06 Ying Han
  0 siblings, 0 replies; 11+ messages in thread
From: Ying Han @ 2009-04-08 20:06 UTC (permalink / raw)
  To: linux-mm, linux-kernel, akpm, torvalds, Ingo Molnar,
	Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Török Edwin, Lee Schermerhorn,
	Nick Piggin, Wu Fengguang

support for FAULT_FLAG_RETRY with no user change:

Signed-off-by: Ying Han <yinghan@google.com>
              Mike Waychison <mikew@google.com>

 include/linux/fs.h  |    2 +-
 include/linux/mm.h  |    2 +
 mm/filemap.c        |   72 ++++++++++++++++++++++++++++++++++++++++++++++++--
 mm/memory.c         |   33 +++++++++++++++++------


diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a853ef..29c2c39 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -793,7 +793,7 @@ struct file_ra_state {
 					   there are only # of pages ahead */

 	unsigned int ra_pages;		/* Maximum readahead window */
-	int mmap_miss;			/* Cache miss stat for mmap accesses */
+	unsigned int mmap_miss;		/* Cache miss stat for mmap accesses */
 	loff_t prev_pos;		/* Cache last read() position */
 };

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ffee2f7..5a134a9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];

 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
+#define FAULT_FLAG_RETRY	0x04	/* Retry major fault */


 /*
@@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)

 #define VM_FAULT_MINOR	0 /* For backwards compat. Remove me quickly. */

+#define VM_FAULT_RETRY	0x0010
 #define VM_FAULT_OOM	0x0001
 #define VM_FAULT_SIGBUS	0x0002
 #define VM_FAULT_MAJOR	0x0004
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f89..6eb7c36 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -714,6 +714,58 @@ repeat:
 EXPORT_SYMBOL(find_lock_page);

 /**
+ * find_lock_page_retry - locate, pin and lock a pagecache page
+ * @mapping: the address_space to search
+ * @offset: the page index
+ * @vma: vma in which the fault was taken
+ * @ppage: zero if page not present, otherwise point to the page in pagecache
+ * @retry: 1 indicate caller tolerate a retry.
+ *
+ * If retry flag is on, and page is already locked by someone else, return
+ * a hint of retry and leave *ppage untouched.
+ *
+ * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
+ * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
+ * hint to caller for retry, or ret=0 which means page is succefully
+ * locked.
+ */
+unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
+				struct vm_area_struct *vma, struct page **ppage,
+				int retry)
+{
+	unsigned int ret = 0;
+	struct page *page;
+
+repeat:
+	page = find_get_page(mapping, offset);
+	if (page) {
+		if (!retry)
+			lock_page(page);
+		else {
+			if (!trylock_page(page)) {
+				struct mm_struct *mm = vma->vm_mm;
+
+				up_read(&mm->mmap_sem);
+				wait_on_page_locked(page);
+				down_read(&mm->mmap_sem);
+
+				page_cache_release(page);
+				return VM_FAULT_RETRY;
+			}
+		}
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
+		}
+		VM_BUG_ON(page->index != offset);
+	}
+	*ppage = page;
+	return ret;
+}
+EXPORT_SYMBOL(find_lock_page_retry);
+
+/**
  * find_or_create_page - locate or add a pagecache page
  * @mapping: the page's address_space
  * @index: the page's index into the mapping
@@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
 	pgoff_t size;
 	int did_readaround = 0;
 	int ret = 0;
+	int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
+	int retry_ret;

 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
@@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
 	 */
 retry_find:
 	page = find_lock_page(mapping, vmf->pgoff);
+
 	/*
 	 * For sequential accesses, we use the generic readahead logic.
 	 */
@@ -1465,7 +1520,13 @@ retry_find:
 		if (!page) {
 			page_cache_sync_readahead(mapping, ra, file,
 							   vmf->pgoff, 1);
-			page = find_lock_page(mapping, vmf->pgoff);
+			retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+						vma, &page, retry_flag);
+			if (retry_ret == VM_FAULT_RETRY) {
+				/* counteract the followed retry hit */
+				ra->mmap_miss++;
+				return retry_ret;
+			}
 			if (!page)
 				goto no_cached_page;
 		}
@@ -1504,7 +1565,14 @@ retry_find:
 				start = vmf->pgoff - ra_pages / 2;
 			do_page_cache_readahead(mapping, file, start, ra_pages);
 		}
-		page = find_lock_page(mapping, vmf->pgoff);
+retry_find_retry:
+		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+				vma, &page, retry_flag);
+		if (retry_ret == VM_FAULT_RETRY) {
+			/* counteract the followed retry hit */
+			ra->mmap_miss++;
+			return retry_ret;
+		}
 		if (!page)
 			goto no_cached_page;
 	}
@@ -1548,7 +1616,7 @@ no_cached_page:
 	 * meantime, we'll just come back here and read it again.
 	 */
 	if (error >= 0)
-		goto retry_find;
+		goto retry_find_retry;

 	/*
 	 * An error return from page_cache_read can result if the
diff --git a/mm/memory.c b/mm/memory.c
index 164951c..5e215c9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
 	vmf.page = NULL;

 	ret = vma->vm_ops->fault(vma, &vmf);
+
+	/* page may be available, but we have to restart the process
+	 * because mmap_sem was dropped during the ->fault
+	 */
+	if (ret & VM_FAULT_RETRY)
+		return ret;
+
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
 		return ret;

@@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
 {
 	pgoff_t pgoff = (((address & PAGE_MASK)
 			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+	int write = write_access & ~FAULT_FLAG_RETRY;
+	unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);

+	flags |= (write_access & FAULT_FLAG_RETRY);
 	pte_unmap(page_table);
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
@@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
+	int ret;

 	__set_current_state(TASK_RUNNING);

-	count_vm_event(PGFAULT);
-
-	if (unlikely(is_vm_hugetlb_page(vma)))
-		return hugetlb_fault(mm, vma, address, write_access);
+	if (unlikely(is_vm_hugetlb_page(vma))) {
+		ret = hugetlb_fault(mm, vma, address, write_access);
+		goto out;
+	}

+	ret = VM_FAULT_OOM;
 	pgd = pgd_offset(mm, address);
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
-		return VM_FAULT_OOM;
+		goto out;
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
-		return VM_FAULT_OOM;
+		goto out;
 	pte = pte_alloc_map(mm, pmd, address);
 	if (!pte)
-		return VM_FAULT_OOM;
+		goto out;

-	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+	ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+out:
+	if (!(ret & VM_FAULT_RETRY))
+		count_vm_event(PGFAULT);
+	return ret;
 }

 #ifndef __PAGETABLE_PUD_FOLDED

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH][1/2]page_fault retry with NOPAGE_RETRY
@ 2009-04-08 20:02 Ying Han
  2009-04-09  7:47 ` Wu Fengguang
  2009-04-10  6:02 ` Andrew Morton
  0 siblings, 2 replies; 11+ messages in thread
From: Ying Han @ 2009-04-08 20:02 UTC (permalink / raw)
  To: linux-mm, linux-kernel, akpm, torvalds, Ingo Molnar,
	Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Török Edwin, Lee Schermerhorn,
	Nick Piggin, Wu Fengguang

support for FAULT_FLAG_RETRY with no user change:

Signed-off-by: Ying Han <yinghan@google.com>
	       Mike Waychison <mikew@google.com>

 include/linux/fs.h  |    2 +-
 include/linux/mm.h  |    2 +
 mm/filemap.c        |   72 ++++++++++++++++++++++++++++++++++++++++++++++++--
 mm/memory.c         |   33 +++++++++++++++++------


diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a853ef..29c2c39 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -793,7 +793,7 @@ struct file_ra_state {
 					   there are only # of pages ahead */

 	unsigned int ra_pages;		/* Maximum readahead window */
-	int mmap_miss;			/* Cache miss stat for mmap accesses */
+	unsigned int mmap_miss;		/* Cache miss stat for mmap accesses */
 	loff_t prev_pos;		/* Cache last read() position */
 };

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ffee2f7..5a134a9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];

 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
+#define FAULT_FLAG_RETRY	0x04	/* Retry major fault */


 /*
@@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)

 #define VM_FAULT_MINOR	0 /* For backwards compat. Remove me quickly. */

+#define VM_FAULT_RETRY	0x0010
 #define VM_FAULT_OOM	0x0001
 #define VM_FAULT_SIGBUS	0x0002
 #define VM_FAULT_MAJOR	0x0004
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f89..6eb7c36 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -714,6 +714,58 @@ repeat:
 EXPORT_SYMBOL(find_lock_page);

 /**
+ * find_lock_page_retry - locate, pin and lock a pagecache page
+ * @mapping: the address_space to search
+ * @offset: the page index
+ * @vma: vma in which the fault was taken
+ * @ppage: zero if page not present, otherwise point to the page in pagecache
+ * @retry: 1 indicate caller tolerate a retry.
+ *
+ * If retry flag is on, and page is already locked by someone else, return
+ * a hint of retry and leave *ppage untouched.
+ *
+ * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
+ * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
+ * hint to caller for retry, or ret=0 which means page is succefully
+ * locked.
+ */
+unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
+				struct vm_area_struct *vma, struct page **ppage,
+				int retry)
+{
+	unsigned int ret = 0;
+	struct page *page;
+
+repeat:
+	page = find_get_page(mapping, offset);
+	if (page) {
+		if (!retry)
+			lock_page(page);
+		else {
+			if (!trylock_page(page)) {
+				struct mm_struct *mm = vma->vm_mm;
+
+				up_read(&mm->mmap_sem);
+				wait_on_page_locked(page);
+				down_read(&mm->mmap_sem);
+
+				page_cache_release(page);
+				return VM_FAULT_RETRY;
+			}
+		}
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
+		}
+		VM_BUG_ON(page->index != offset);
+	}
+	*ppage = page;
+	return ret;
+}
+EXPORT_SYMBOL(find_lock_page_retry);
+
+/**
  * find_or_create_page - locate or add a pagecache page
  * @mapping: the page's address_space
  * @index: the page's index into the mapping
@@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
 	pgoff_t size;
 	int did_readaround = 0;
 	int ret = 0;
+	int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
+	int retry_ret;

 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
@@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
 	 */
 retry_find:
 	page = find_lock_page(mapping, vmf->pgoff);
+
 	/*
 	 * For sequential accesses, we use the generic readahead logic.
 	 */
@@ -1465,7 +1520,13 @@ retry_find:
 		if (!page) {
 			page_cache_sync_readahead(mapping, ra, file,
 							   vmf->pgoff, 1);
-			page = find_lock_page(mapping, vmf->pgoff);
+			retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+						vma, &page, retry_flag);
+			if (retry_ret == VM_FAULT_RETRY) {
+				/* counteract the followed retry hit */
+				ra->mmap_miss++;
+				return retry_ret;
+			}
 			if (!page)
 				goto no_cached_page;
 		}
@@ -1504,7 +1565,14 @@ retry_find:
 				start = vmf->pgoff - ra_pages / 2;
 			do_page_cache_readahead(mapping, file, start, ra_pages);
 		}
-		page = find_lock_page(mapping, vmf->pgoff);
+retry_find_retry:
+		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+				vma, &page, retry_flag);
+		if (retry_ret == VM_FAULT_RETRY) {
+			/* counteract the followed retry hit */
+			ra->mmap_miss++;
+			return retry_ret;
+		}
 		if (!page)
 			goto no_cached_page;
 	}
@@ -1548,7 +1616,7 @@ no_cached_page:
 	 * meantime, we'll just come back here and read it again.
 	 */
 	if (error >= 0)
-		goto retry_find;
+		goto retry_find_retry;

 	/*
 	 * An error return from page_cache_read can result if the
diff --git a/mm/memory.c b/mm/memory.c
index 164951c..5e215c9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
 	vmf.page = NULL;

 	ret = vma->vm_ops->fault(vma, &vmf);
+
+	/* page may be available, but we have to restart the process
+	 * because mmap_sem was dropped during the ->fault
+	 */
+	if (ret & VM_FAULT_RETRY)
+		return ret;
+
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
 		return ret;

@@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
 {
 	pgoff_t pgoff = (((address & PAGE_MASK)
 			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+	int write = write_access & ~FAULT_FLAG_RETRY;
+	unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);

+	flags |= (write_access & FAULT_FLAG_RETRY);
 	pte_unmap(page_table);
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
@@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
+	int ret;

 	__set_current_state(TASK_RUNNING);

-	count_vm_event(PGFAULT);
-
-	if (unlikely(is_vm_hugetlb_page(vma)))
-		return hugetlb_fault(mm, vma, address, write_access);
+	if (unlikely(is_vm_hugetlb_page(vma))) {
+		ret = hugetlb_fault(mm, vma, address, write_access);
+		goto out;
+	}

+	ret = VM_FAULT_OOM;
 	pgd = pgd_offset(mm, address);
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
-		return VM_FAULT_OOM;
+		goto out;
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
-		return VM_FAULT_OOM;
+		goto out;
 	pte = pte_alloc_map(mm, pmd, address);
 	if (!pte)
-		return VM_FAULT_OOM;
+		goto out;

-	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+	ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+out:
+	if (!(ret & VM_FAULT_RETRY))
+		count_vm_event(PGFAULT);
+	return ret;
 }

 #ifndef __PAGETABLE_PUD_FOLDED

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
  2009-04-08 20:02 Ying Han
@ 2009-04-09  7:47 ` Wu Fengguang
  2009-04-09 16:21   ` Ying Han
  2009-04-10  6:02 ` Andrew Morton
  1 sibling, 1 reply; 11+ messages in thread
From: Wu Fengguang @ 2009-04-09  7:47 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, akpm, torvalds, Ingo Molnar,
	Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Török Edwin, Lee Schermerhorn,
	Nick Piggin

On Thu, Apr 09, 2009 at 04:02:35AM +0800, Ying Han wrote:
> support for FAULT_FLAG_RETRY with no user change:

A better changelog is desired, otherwise:

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>

> Signed-off-by: Ying Han <yinghan@google.com>
> 	       Mike Waychison <mikew@google.com>
> 
>  include/linux/fs.h  |    2 +-
>  include/linux/mm.h  |    2 +
>  mm/filemap.c        |   72 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  mm/memory.c         |   33 +++++++++++++++++------
> 
> 
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 4a853ef..29c2c39 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -793,7 +793,7 @@ struct file_ra_state {
>  					   there are only # of pages ahead */
> 
>  	unsigned int ra_pages;		/* Maximum readahead window */
> -	int mmap_miss;			/* Cache miss stat for mmap accesses */
> +	unsigned int mmap_miss;		/* Cache miss stat for mmap accesses */
>  	loff_t prev_pos;		/* Cache last read() position */
>  };
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ffee2f7..5a134a9 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
> 
>  #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
>  #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
> +#define FAULT_FLAG_RETRY	0x04	/* Retry major fault */
> 
> 
>  /*
> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
> 
>  #define VM_FAULT_MINOR	0 /* For backwards compat. Remove me quickly. */
> 
> +#define VM_FAULT_RETRY	0x0010
>  #define VM_FAULT_OOM	0x0001
>  #define VM_FAULT_SIGBUS	0x0002
>  #define VM_FAULT_MAJOR	0x0004

Why not append VM_FAULT_RETRY here, and indent the _OOM line?

> diff --git a/mm/filemap.c b/mm/filemap.c
> index f3e5f89..6eb7c36 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -714,6 +714,58 @@ repeat:
>  EXPORT_SYMBOL(find_lock_page);
> 
>  /**
> + * find_lock_page_retry - locate, pin and lock a pagecache page
> + * @mapping: the address_space to search
> + * @offset: the page index
> + * @vma: vma in which the fault was taken
> + * @ppage: zero if page not present, otherwise point to the page in pagecache
> + * @retry: 1 indicate caller tolerate a retry.
> + *
> + * If retry flag is on, and page is already locked by someone else, return
> + * a hint of retry and leave *ppage untouched.
> + *
> + * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
> + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
> + * hint to caller for retry, or ret=0 which means page is succefully
> + * locked.
> + */
> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
> +				struct vm_area_struct *vma, struct page **ppage,
> +				int retry)
> +{
> +	unsigned int ret = 0;
> +	struct page *page;
> +
> +repeat:
> +	page = find_get_page(mapping, offset);
> +	if (page) {
> +		if (!retry)
> +			lock_page(page);
> +		else {
> +			if (!trylock_page(page)) {
> +				struct mm_struct *mm = vma->vm_mm;
> +
> +				up_read(&mm->mmap_sem);
> +				wait_on_page_locked(page);
> +				down_read(&mm->mmap_sem);
> +
> +				page_cache_release(page);
> +				return VM_FAULT_RETRY;
> +			}
> +		}
> +		if (unlikely(page->mapping != mapping)) {
> +			unlock_page(page);
> +			page_cache_release(page);
> +			goto repeat;
> +		}
> +		VM_BUG_ON(page->index != offset);
> +	}
> +	*ppage = page;
> +	return ret;
> +}
> +EXPORT_SYMBOL(find_lock_page_retry);
> +
> +/**
>   * find_or_create_page - locate or add a pagecache page
>   * @mapping: the page's address_space
>   * @index: the page's index into the mapping
> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>  	pgoff_t size;
>  	int did_readaround = 0;
>  	int ret = 0;
> +	int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
> +	int retry_ret;
> 
>  	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
>  	if (vmf->pgoff >= size)
> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>  	 */
>  retry_find:
>  	page = find_lock_page(mapping, vmf->pgoff);
> +
>  	/*
>  	 * For sequential accesses, we use the generic readahead logic.
>  	 */
> @@ -1465,7 +1520,13 @@ retry_find:
>  		if (!page) {
>  			page_cache_sync_readahead(mapping, ra, file,
>  							   vmf->pgoff, 1);
> -			page = find_lock_page(mapping, vmf->pgoff);
> +			retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> +						vma, &page, retry_flag);
> +			if (retry_ret == VM_FAULT_RETRY) {
> +				/* counteract the followed retry hit */
> +				ra->mmap_miss++;

Please don't relocate the comment...because that will break my
following patches(mainly Linus' filemap cleanups), which will
_heavily_ rework these chunks anyway. And make a hard time for Andrew
to merge them.

> +				return retry_ret;
> +			}

>  			if (!page)
>  				goto no_cached_page;
>  		}
> @@ -1504,7 +1565,14 @@ retry_find:
>  				start = vmf->pgoff - ra_pages / 2;
>  			do_page_cache_readahead(mapping, file, start, ra_pages);
>  		}
> -		page = find_lock_page(mapping, vmf->pgoff);
> +retry_find_retry:
> +		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> +				vma, &page, retry_flag);
> +		if (retry_ret == VM_FAULT_RETRY) {
> +			/* counteract the followed retry hit */
> +			ra->mmap_miss++;

ditto

Thanks,
Fengguang

> +			return retry_ret;
> +		}
>  		if (!page)
>  			goto no_cached_page;
>  	}
> @@ -1548,7 +1616,7 @@ no_cached_page:
>  	 * meantime, we'll just come back here and read it again.
>  	 */
>  	if (error >= 0)
> -		goto retry_find;
> +		goto retry_find_retry;
> 
>  	/*
>  	 * An error return from page_cache_read can result if the
> diff --git a/mm/memory.c b/mm/memory.c
> index 164951c..5e215c9 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
>  	vmf.page = NULL;
> 
>  	ret = vma->vm_ops->fault(vma, &vmf);
> +
> +	/* page may be available, but we have to restart the process
> +	 * because mmap_sem was dropped during the ->fault
> +	 */
> +	if (ret & VM_FAULT_RETRY)
> +		return ret;
> +
>  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
>  		return ret;
> 
> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
>  {
>  	pgoff_t pgoff = (((address & PAGE_MASK)
>  			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> -	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> +	int write = write_access & ~FAULT_FLAG_RETRY;
> +	unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
> 
> +	flags |= (write_access & FAULT_FLAG_RETRY);
>  	pte_unmap(page_table);
>  	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
>  }
> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
>  	pud_t *pud;
>  	pmd_t *pmd;
>  	pte_t *pte;
> +	int ret;
> 
>  	__set_current_state(TASK_RUNNING);
> 
> -	count_vm_event(PGFAULT);
> -
> -	if (unlikely(is_vm_hugetlb_page(vma)))
> -		return hugetlb_fault(mm, vma, address, write_access);
> +	if (unlikely(is_vm_hugetlb_page(vma))) {
> +		ret = hugetlb_fault(mm, vma, address, write_access);
> +		goto out;
> +	}
> 
> +	ret = VM_FAULT_OOM;
>  	pgd = pgd_offset(mm, address);
>  	pud = pud_alloc(mm, pgd, address);
>  	if (!pud)
> -		return VM_FAULT_OOM;
> +		goto out;
>  	pmd = pmd_alloc(mm, pud, address);
>  	if (!pmd)
> -		return VM_FAULT_OOM;
> +		goto out;
>  	pte = pte_alloc_map(mm, pmd, address);
>  	if (!pte)
> -		return VM_FAULT_OOM;
> +		goto out;
> 
> -	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> +	ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> +out:
> +	if (!(ret & VM_FAULT_RETRY))
> +		count_vm_event(PGFAULT);
> +	return ret;
>  }
> 
>  #ifndef __PAGETABLE_PUD_FOLDED

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
  2009-04-09  7:47 ` Wu Fengguang
@ 2009-04-09 16:21   ` Ying Han
  2009-04-10  0:25     ` Wu Fengguang
  0 siblings, 1 reply; 11+ messages in thread
From: Ying Han @ 2009-04-09 16:21 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: linux-mm, linux-kernel, akpm, torvalds, Ingo Molnar,
	Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Török Edwin, Lee Schermerhorn,
	Nick Piggin

On Thu, Apr 9, 2009 at 12:47 AM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> On Thu, Apr 09, 2009 at 04:02:35AM +0800, Ying Han wrote:
>> support for FAULT_FLAG_RETRY with no user change:
>
> A better changelog is desired, otherwise:
>
> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
>
>> Signed-off-by: Ying Han <yinghan@google.com>
>>              Mike Waychison <mikew@google.com>
>>
>>  include/linux/fs.h  |    2 +-
>>  include/linux/mm.h  |    2 +
>>  mm/filemap.c        |   72 ++++++++++++++++++++++++++++++++++++++++++++++++--
>>  mm/memory.c         |   33 +++++++++++++++++------
>>
>>
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 4a853ef..29c2c39 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -793,7 +793,7 @@ struct file_ra_state {
>>                                          there are only # of pages ahead */
>>
>>       unsigned int ra_pages;          /* Maximum readahead window */
>> -     int mmap_miss;                  /* Cache miss stat for mmap accesses */
>> +     unsigned int mmap_miss;         /* Cache miss stat for mmap accesses */
>>       loff_t prev_pos;                /* Cache last read() position */
>>  };
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index ffee2f7..5a134a9 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
>>
>>  #define FAULT_FLAG_WRITE     0x01    /* Fault was a write access */
>>  #define FAULT_FLAG_NONLINEAR 0x02    /* Fault was via a nonlinear mapping */
>> +#define FAULT_FLAG_RETRY     0x04    /* Retry major fault */
>>
>>
>>  /*
>> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
>>
>>  #define VM_FAULT_MINOR       0 /* For backwards compat. Remove me quickly. */
>>
>> +#define VM_FAULT_RETRY       0x0010
>>  #define VM_FAULT_OOM 0x0001
>>  #define VM_FAULT_SIGBUS      0x0002
>>  #define VM_FAULT_MAJOR       0x0004
>
> Why not append VM_FAULT_RETRY here, and indent the _OOM line?
>
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index f3e5f89..6eb7c36 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -714,6 +714,58 @@ repeat:
>>  EXPORT_SYMBOL(find_lock_page);
>>
>>  /**
>> + * find_lock_page_retry - locate, pin and lock a pagecache page
>> + * @mapping: the address_space to search
>> + * @offset: the page index
>> + * @vma: vma in which the fault was taken
>> + * @ppage: zero if page not present, otherwise point to the page in pagecache
>> + * @retry: 1 indicate caller tolerate a retry.
>> + *
>> + * If retry flag is on, and page is already locked by someone else, return
>> + * a hint of retry and leave *ppage untouched.
>> + *
>> + * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
>> + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
>> + * hint to caller for retry, or ret=0 which means page is succefully
>> + * locked.
>> + */
>> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
>> +                             struct vm_area_struct *vma, struct page **ppage,
>> +                             int retry)
>> +{
>> +     unsigned int ret = 0;
>> +     struct page *page;
>> +
>> +repeat:
>> +     page = find_get_page(mapping, offset);
>> +     if (page) {
>> +             if (!retry)
>> +                     lock_page(page);
>> +             else {
>> +                     if (!trylock_page(page)) {
>> +                             struct mm_struct *mm = vma->vm_mm;
>> +
>> +                             up_read(&mm->mmap_sem);
>> +                             wait_on_page_locked(page);
>> +                             down_read(&mm->mmap_sem);
>> +
>> +                             page_cache_release(page);
>> +                             return VM_FAULT_RETRY;
>> +                     }
>> +             }
>> +             if (unlikely(page->mapping != mapping)) {
>> +                     unlock_page(page);
>> +                     page_cache_release(page);
>> +                     goto repeat;
>> +             }
>> +             VM_BUG_ON(page->index != offset);
>> +     }
>> +     *ppage = page;
>> +     return ret;
>> +}
>> +EXPORT_SYMBOL(find_lock_page_retry);
>> +
>> +/**
>>   * find_or_create_page - locate or add a pagecache page
>>   * @mapping: the page's address_space
>>   * @index: the page's index into the mapping
>> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>>       pgoff_t size;
>>       int did_readaround = 0;
>>       int ret = 0;
>> +     int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
>> +     int retry_ret;
>>
>>       size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
>>       if (vmf->pgoff >= size)
>> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>>        */
>>  retry_find:
>>       page = find_lock_page(mapping, vmf->pgoff);
>> +
>>       /*
>>        * For sequential accesses, we use the generic readahead logic.
>>        */
>> @@ -1465,7 +1520,13 @@ retry_find:
>>               if (!page) {
>>                       page_cache_sync_readahead(mapping, ra, file,
>>                                                          vmf->pgoff, 1);
>> -                     page = find_lock_page(mapping, vmf->pgoff);
>> +                     retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>> +                                             vma, &page, retry_flag);
>> +                     if (retry_ret == VM_FAULT_RETRY) {
>> +                             /* counteract the followed retry hit */
>> +                             ra->mmap_miss++;
>
> Please don't relocate the comment...because that will break my
> following patches(mainly Linus' filemap cleanups), which will
> _heavily_ rework these chunks anyway. And make a hard time for Andrew
> to merge them.

then how you force the 80 character lines?
>
>> +                             return retry_ret;
>> +                     }
>
>>                       if (!page)
>>                               goto no_cached_page;
>>               }
>> @@ -1504,7 +1565,14 @@ retry_find:
>>                               start = vmf->pgoff - ra_pages / 2;
>>                       do_page_cache_readahead(mapping, file, start, ra_pages);
>>               }
>> -             page = find_lock_page(mapping, vmf->pgoff);
>> +retry_find_retry:
>> +             retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>> +                             vma, &page, retry_flag);
>> +             if (retry_ret == VM_FAULT_RETRY) {
>> +                     /* counteract the followed retry hit */
>> +                     ra->mmap_miss++;
>
> ditto
>
> Thanks,
> Fengguang
>
>> +                     return retry_ret;
>> +             }
>>               if (!page)
>>                       goto no_cached_page;
>>       }
>> @@ -1548,7 +1616,7 @@ no_cached_page:
>>        * meantime, we'll just come back here and read it again.
>>        */
>>       if (error >= 0)
>> -             goto retry_find;
>> +             goto retry_find_retry;
>>
>>       /*
>>        * An error return from page_cache_read can result if the
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 164951c..5e215c9 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
>>       vmf.page = NULL;
>>
>>       ret = vma->vm_ops->fault(vma, &vmf);
>> +
>> +     /* page may be available, but we have to restart the process
>> +      * because mmap_sem was dropped during the ->fault
>> +      */
>> +     if (ret & VM_FAULT_RETRY)
>> +             return ret;
>> +
>>       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
>>               return ret;
>>
>> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
>>  {
>>       pgoff_t pgoff = (((address & PAGE_MASK)
>>                       - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
>> -     unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
>> +     int write = write_access & ~FAULT_FLAG_RETRY;
>> +     unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>>
>> +     flags |= (write_access & FAULT_FLAG_RETRY);
>>       pte_unmap(page_table);
>>       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
>>  }
>> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
>>       pud_t *pud;
>>       pmd_t *pmd;
>>       pte_t *pte;
>> +     int ret;
>>
>>       __set_current_state(TASK_RUNNING);
>>
>> -     count_vm_event(PGFAULT);
>> -
>> -     if (unlikely(is_vm_hugetlb_page(vma)))
>> -             return hugetlb_fault(mm, vma, address, write_access);
>> +     if (unlikely(is_vm_hugetlb_page(vma))) {
>> +             ret = hugetlb_fault(mm, vma, address, write_access);
>> +             goto out;
>> +     }
>>
>> +     ret = VM_FAULT_OOM;
>>       pgd = pgd_offset(mm, address);
>>       pud = pud_alloc(mm, pgd, address);
>>       if (!pud)
>> -             return VM_FAULT_OOM;
>> +             goto out;
>>       pmd = pmd_alloc(mm, pud, address);
>>       if (!pmd)
>> -             return VM_FAULT_OOM;
>> +             goto out;
>>       pte = pte_alloc_map(mm, pmd, address);
>>       if (!pte)
>> -             return VM_FAULT_OOM;
>> +             goto out;
>>
>> -     return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
>> +     ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
>> +out:
>> +     if (!(ret & VM_FAULT_RETRY))
>> +             count_vm_event(PGFAULT);
>> +     return ret;
>>  }
>>
>>  #ifndef __PAGETABLE_PUD_FOLDED
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
  2009-04-09 16:21   ` Ying Han
@ 2009-04-10  0:25     ` Wu Fengguang
  0 siblings, 0 replies; 11+ messages in thread
From: Wu Fengguang @ 2009-04-10  0:25 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, akpm, torvalds, Ingo Molnar,
	Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Török Edwin, Lee Schermerhorn,
	Nick Piggin

On Fri, Apr 10, 2009 at 12:21:25AM +0800, Ying Han wrote:
> On Thu, Apr 9, 2009 at 12:47 AM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> > On Thu, Apr 09, 2009 at 04:02:35AM +0800, Ying Han wrote:
> >> support for FAULT_FLAG_RETRY with no user change:
> >
> > A better changelog is desired, otherwise:
> >
> > Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> >
> >> Signed-off-by: Ying Han <yinghan@google.com>
> >>              Mike Waychison <mikew@google.com>
> >>
> >>  include/linux/fs.h  |    2 +-
> >>  include/linux/mm.h  |    2 +
> >>  mm/filemap.c        |   72 ++++++++++++++++++++++++++++++++++++++++++++++++--
> >>  mm/memory.c         |   33 +++++++++++++++++------
> >>
> >>
> >> diff --git a/include/linux/fs.h b/include/linux/fs.h
> >> index 4a853ef..29c2c39 100644
> >> --- a/include/linux/fs.h
> >> +++ b/include/linux/fs.h
> >> @@ -793,7 +793,7 @@ struct file_ra_state {
> >>                                          there are only # of pages ahead */
> >>
> >>       unsigned int ra_pages;          /* Maximum readahead window */
> >> -     int mmap_miss;                  /* Cache miss stat for mmap accesses */
> >> +     unsigned int mmap_miss;         /* Cache miss stat for mmap accesses */
> >>       loff_t prev_pos;                /* Cache last read() position */
> >>  };
> >>
> >> diff --git a/include/linux/mm.h b/include/linux/mm.h
> >> index ffee2f7..5a134a9 100644
> >> --- a/include/linux/mm.h
> >> +++ b/include/linux/mm.h
> >> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
> >>
> >>  #define FAULT_FLAG_WRITE     0x01    /* Fault was a write access */
> >>  #define FAULT_FLAG_NONLINEAR 0x02    /* Fault was via a nonlinear mapping */
> >> +#define FAULT_FLAG_RETRY     0x04    /* Retry major fault */
> >>
> >>
> >>  /*
> >> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
> >>
> >>  #define VM_FAULT_MINOR       0 /* For backwards compat. Remove me quickly. */
> >>
> >> +#define VM_FAULT_RETRY       0x0010
> >>  #define VM_FAULT_OOM 0x0001
> >>  #define VM_FAULT_SIGBUS      0x0002
> >>  #define VM_FAULT_MAJOR       0x0004
> >
> > Why not append VM_FAULT_RETRY here, and indent the _OOM line?
> >
> >> diff --git a/mm/filemap.c b/mm/filemap.c
> >> index f3e5f89..6eb7c36 100644
> >> --- a/mm/filemap.c
> >> +++ b/mm/filemap.c
> >> @@ -714,6 +714,58 @@ repeat:
> >>  EXPORT_SYMBOL(find_lock_page);
> >>
> >>  /**
> >> + * find_lock_page_retry - locate, pin and lock a pagecache page
> >> + * @mapping: the address_space to search
> >> + * @offset: the page index
> >> + * @vma: vma in which the fault was taken
> >> + * @ppage: zero if page not present, otherwise point to the page in pagecache
> >> + * @retry: 1 indicate caller tolerate a retry.
> >> + *
> >> + * If retry flag is on, and page is already locked by someone else, return
> >> + * a hint of retry and leave *ppage untouched.
> >> + *
> >> + * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
> >> + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
> >> + * hint to caller for retry, or ret=0 which means page is succefully
> >> + * locked.
> >> + */
> >> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
> >> +                             struct vm_area_struct *vma, struct page **ppage,
> >> +                             int retry)
> >> +{
> >> +     unsigned int ret = 0;
> >> +     struct page *page;
> >> +
> >> +repeat:
> >> +     page = find_get_page(mapping, offset);
> >> +     if (page) {
> >> +             if (!retry)
> >> +                     lock_page(page);
> >> +             else {
> >> +                     if (!trylock_page(page)) {
> >> +                             struct mm_struct *mm = vma->vm_mm;
> >> +
> >> +                             up_read(&mm->mmap_sem);
> >> +                             wait_on_page_locked(page);
> >> +                             down_read(&mm->mmap_sem);
> >> +
> >> +                             page_cache_release(page);
> >> +                             return VM_FAULT_RETRY;
> >> +                     }
> >> +             }
> >> +             if (unlikely(page->mapping != mapping)) {
> >> +                     unlock_page(page);
> >> +                     page_cache_release(page);
> >> +                     goto repeat;
> >> +             }
> >> +             VM_BUG_ON(page->index != offset);
> >> +     }
> >> +     *ppage = page;
> >> +     return ret;
> >> +}
> >> +EXPORT_SYMBOL(find_lock_page_retry);
> >> +
> >> +/**
> >>   * find_or_create_page - locate or add a pagecache page
> >>   * @mapping: the page's address_space
> >>   * @index: the page's index into the mapping
> >> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> >>       pgoff_t size;
> >>       int did_readaround = 0;
> >>       int ret = 0;
> >> +     int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
> >> +     int retry_ret;
> >>
> >>       size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> >>       if (vmf->pgoff >= size)
> >> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> >>        */
> >>  retry_find:
> >>       page = find_lock_page(mapping, vmf->pgoff);
> >> +
> >>       /*
> >>        * For sequential accesses, we use the generic readahead logic.
> >>        */
> >> @@ -1465,7 +1520,13 @@ retry_find:
> >>               if (!page) {
> >>                       page_cache_sync_readahead(mapping, ra, file,
> >>                                                          vmf->pgoff, 1);
> >> -                     page = find_lock_page(mapping, vmf->pgoff);
> >> +                     retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> >> +                                             vma, &page, retry_flag);
> >> +                     if (retry_ret == VM_FAULT_RETRY) {
> >> +                             /* counteract the followed retry hit */
> >> +                             ra->mmap_miss++;
> >
> > Please don't relocate the comment...because that will break my
> > following patches(mainly Linus' filemap cleanups), which will
> > _heavily_ rework these chunks anyway. And make a hard time for Andrew
> > to merge them.
> 
> then how you force the 80 character lines?

The above line will be removed by my following patch, and...

> >
> >> +                             return retry_ret;
> >> +                     }
> >
> >>                       if (!page)
> >>                               goto no_cached_page;
> >>               }
> >> @@ -1504,7 +1565,14 @@ retry_find:
> >>                               start = vmf->pgoff - ra_pages / 2;
> >>                       do_page_cache_readahead(mapping, file, start, ra_pages);
> >>               }
> >> -             page = find_lock_page(mapping, vmf->pgoff);
> >> +retry_find_retry:
> >> +             retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> >> +                             vma, &page, retry_flag);
> >> +             if (retry_ret == VM_FAULT_RETRY) {
> >> +                     /* counteract the followed retry hit */
> >> +                     ra->mmap_miss++;
> >

...this line can have the comment within 80 column, and will be kept
by my patch.

Thanks,
Fengguang


> > ditto
> >
> > Thanks,
> > Fengguang
> >
> >> +                     return retry_ret;
> >> +             }
> >>               if (!page)
> >>                       goto no_cached_page;
> >>       }
> >> @@ -1548,7 +1616,7 @@ no_cached_page:
> >>        * meantime, we'll just come back here and read it again.
> >>        */
> >>       if (error >= 0)
> >> -             goto retry_find;
> >> +             goto retry_find_retry;
> >>
> >>       /*
> >>        * An error return from page_cache_read can result if the
> >> diff --git a/mm/memory.c b/mm/memory.c
> >> index 164951c..5e215c9 100644
> >> --- a/mm/memory.c
> >> +++ b/mm/memory.c
> >> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
> >>       vmf.page = NULL;
> >>
> >>       ret = vma->vm_ops->fault(vma, &vmf);
> >> +
> >> +     /* page may be available, but we have to restart the process
> >> +      * because mmap_sem was dropped during the ->fault
> >> +      */
> >> +     if (ret & VM_FAULT_RETRY)
> >> +             return ret;
> >> +
> >>       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> >>               return ret;
> >>
> >> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
> >>  {
> >>       pgoff_t pgoff = (((address & PAGE_MASK)
> >>                       - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> >> -     unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> >> +     int write = write_access & ~FAULT_FLAG_RETRY;
> >> +     unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
> >>
> >> +     flags |= (write_access & FAULT_FLAG_RETRY);
> >>       pte_unmap(page_table);
> >>       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
> >>  }
> >> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
> >>       pud_t *pud;
> >>       pmd_t *pmd;
> >>       pte_t *pte;
> >> +     int ret;
> >>
> >>       __set_current_state(TASK_RUNNING);
> >>
> >> -     count_vm_event(PGFAULT);
> >> -
> >> -     if (unlikely(is_vm_hugetlb_page(vma)))
> >> -             return hugetlb_fault(mm, vma, address, write_access);
> >> +     if (unlikely(is_vm_hugetlb_page(vma))) {
> >> +             ret = hugetlb_fault(mm, vma, address, write_access);
> >> +             goto out;
> >> +     }
> >>
> >> +     ret = VM_FAULT_OOM;
> >>       pgd = pgd_offset(mm, address);
> >>       pud = pud_alloc(mm, pgd, address);
> >>       if (!pud)
> >> -             return VM_FAULT_OOM;
> >> +             goto out;
> >>       pmd = pmd_alloc(mm, pud, address);
> >>       if (!pmd)
> >> -             return VM_FAULT_OOM;
> >> +             goto out;
> >>       pte = pte_alloc_map(mm, pmd, address);
> >>       if (!pte)
> >> -             return VM_FAULT_OOM;
> >> +             goto out;
> >>
> >> -     return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> >> +     ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> >> +out:
> >> +     if (!(ret & VM_FAULT_RETRY))
> >> +             count_vm_event(PGFAULT);
> >> +     return ret;
> >>  }
> >>
> >>  #ifndef __PAGETABLE_PUD_FOLDED
> >

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
  2009-04-08 20:02 Ying Han
  2009-04-09  7:47 ` Wu Fengguang
@ 2009-04-10  6:02 ` Andrew Morton
  2009-04-10  6:32   ` Ying Han
  2009-04-10  7:30   ` Wu Fengguang
  1 sibling, 2 replies; 11+ messages in thread
From: Andrew Morton @ 2009-04-10  6:02 UTC (permalink / raw)
  To: Ying Han
  Cc: linux-mm, linux-kernel, torvalds, Ingo Molnar, Mike Waychison,
	Rohit Seth, Hugh Dickins, Peter Zijlstra, H. Peter Anvin,
	Török Edwin, Lee Schermerhorn, Nick Piggin,
	Wu Fengguang


> Subject: [PATCH][1/2]page_fault retry with NOPAGE_RETRY

Please give each patch in the series a unique and meaningful title.

On Wed, 8 Apr 2009 13:02:35 -0700 Ying Han <yinghan@google.com> wrote:

> support for FAULT_FLAG_RETRY with no user change:

yup, we'd prefer a complete changelog here please.

> Signed-off-by: Ying Han <yinghan@google.com>
> 	       Mike Waychison <mikew@google.com>

This form:

Signed-off-by: Ying Han <yinghan@google.com>
Signed-off-by: Mike Waychison <mikew@google.com>

is conventional.

> index 4a853ef..29c2c39 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -793,7 +793,7 @@ struct file_ra_state {
>  					   there are only # of pages ahead */
> 
>  	unsigned int ra_pages;		/* Maximum readahead window */
> -	int mmap_miss;			/* Cache miss stat for mmap accesses */
> +	unsigned int mmap_miss;		/* Cache miss stat for mmap accesses */

This change makes sense, but we're not told the reasons for making it? 
Did it fix a bug, or is it an unrelated fixlet, or...?

>  	loff_t prev_pos;		/* Cache last read() position */
>  };
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ffee2f7..5a134a9 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
> 
>  #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
>  #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
> +#define FAULT_FLAG_RETRY	0x04	/* Retry major fault */
> 
> 
>  /*
> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
> 
>  #define VM_FAULT_MINOR	0 /* For backwards compat. Remove me quickly. */
> 
> +#define VM_FAULT_RETRY	0x0010
>  #define VM_FAULT_OOM	0x0001
>  #define VM_FAULT_SIGBUS	0x0002
>  #define VM_FAULT_MAJOR	0x0004
> diff --git a/mm/filemap.c b/mm/filemap.c
> index f3e5f89..6eb7c36 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -714,6 +714,58 @@ repeat:
>  EXPORT_SYMBOL(find_lock_page);
> 
>  /**
> + * find_lock_page_retry - locate, pin and lock a pagecache page
> + * @mapping: the address_space to search
> + * @offset: the page index
> + * @vma: vma in which the fault was taken
> + * @ppage: zero if page not present, otherwise point to the page in pagecache
> + * @retry: 1 indicate caller tolerate a retry.
> + *
> + * If retry flag is on, and page is already locked by someone else, return
> + * a hint of retry and leave *ppage untouched.
> + *
> + * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
> + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
> + * hint to caller for retry, or ret=0 which means page is succefully
> + * locked.
> + */

How about this:

  If the page was not found in pagecache, find_lock_page_retry()
  returns 0 and sets *@ppage to NULL.

  If the page was found in pagecache but is locked and @retry is
  true, find_lock_page_retry() returns VM_FAULT_RETRY and does not
  write to *@ppage.

  If the page was found in pagecache and @retry is false,
  find_lock_page_retry() locks the page, writes its address to *@ppage
  and returns 0.

> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
> +				struct vm_area_struct *vma, struct page **ppage,
> +				int retry)
> +{
> +	unsigned int ret = 0;
> +	struct page *page;
> +
> +repeat:
> +	page = find_get_page(mapping, offset);
> +	if (page) {
> +		if (!retry)
> +			lock_page(page);
> +		else {
> +			if (!trylock_page(page)) {
> +				struct mm_struct *mm = vma->vm_mm;
> +
> +				up_read(&mm->mmap_sem);
> +				wait_on_page_locked(page);

It looks strange that we wait for the page lock and then don't
just lock the page and return it.

Adding a comment explaining _why_ we do this would be nice.

> +				down_read(&mm->mmap_sem);
> +
> +				page_cache_release(page);
> +				return VM_FAULT_RETRY;
> +			}
> +		}
> +		if (unlikely(page->mapping != mapping)) {
> +			unlock_page(page);
> +			page_cache_release(page);
> +			goto repeat;
> +		}
> +		VM_BUG_ON(page->index != offset);
> +	}
> +	*ppage = page;
> +	return ret;
> +}
> +EXPORT_SYMBOL(find_lock_page_retry);
> +
> +/**
>   * find_or_create_page - locate or add a pagecache page
>   * @mapping: the page's address_space
>   * @index: the page's index into the mapping
> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>  	pgoff_t size;
>  	int did_readaround = 0;
>  	int ret = 0;
> +	int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
> +	int retry_ret;
> 
>  	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
>  	if (vmf->pgoff >= size)
> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>  	 */
>  retry_find:
>  	page = find_lock_page(mapping, vmf->pgoff);
> +
>  	/*
>  	 * For sequential accesses, we use the generic readahead logic.
>  	 */
> @@ -1465,7 +1520,13 @@ retry_find:
>  		if (!page) {
>  			page_cache_sync_readahead(mapping, ra, file,
>  							   vmf->pgoff, 1);
> -			page = find_lock_page(mapping, vmf->pgoff);
> +			retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> +						vma, &page, retry_flag);
> +			if (retry_ret == VM_FAULT_RETRY) {
> +				/* counteract the followed retry hit */
> +				ra->mmap_miss++;
> +				return retry_ret;
> +			}
>  			if (!page)
>  				goto no_cached_page;
>  		}
> @@ -1504,7 +1565,14 @@ retry_find:
>  				start = vmf->pgoff - ra_pages / 2;
>  			do_page_cache_readahead(mapping, file, start, ra_pages);
>  		}
> -		page = find_lock_page(mapping, vmf->pgoff);
> +retry_find_retry:
> +		retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> +				vma, &page, retry_flag);
> +		if (retry_ret == VM_FAULT_RETRY) {
> +			/* counteract the followed retry hit */
> +			ra->mmap_miss++;
> +			return retry_ret;
> +		}
>  		if (!page)
>  			goto no_cached_page;
>  	}
> @@ -1548,7 +1616,7 @@ no_cached_page:
>  	 * meantime, we'll just come back here and read it again.
>  	 */
>  	if (error >= 0)
> -		goto retry_find;
> +		goto retry_find_retry;
> 
>  	/*
>  	 * An error return from page_cache_read can result if the
> diff --git a/mm/memory.c b/mm/memory.c
> index 164951c..5e215c9 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
>  	vmf.page = NULL;
> 
>  	ret = vma->vm_ops->fault(vma, &vmf);
> +
> +	/* page may be available, but we have to restart the process
> +	 * because mmap_sem was dropped during the ->fault
> +	 */

The preferred comment layout is:

	/*
	 * The page may be available, but we have to restart the process
	 * because mmap_sem was dropped during the ->fault
	 */

> +	if (ret & VM_FAULT_RETRY)
> +		return ret;

Again, I worry that in some places we treat VM_FAULT_RETRY as an
enumerated type:

	if (foo == VM_FAULT_RETRY)

but in other places we treat it as a bitfield

	if (foo & VM_FAULT_RETRY)

it may not be buggy, but it _looks_ buggy.  And confusing and
inconsistent and dangerous.

Can it be improved?

>  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
>  		return ret;
> 
> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
>  {
>  	pgoff_t pgoff = (((address & PAGE_MASK)
>  			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> -	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> +	int write = write_access & ~FAULT_FLAG_RETRY;
> +	unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
> 
> +	flags |= (write_access & FAULT_FLAG_RETRY);

gee, I'm lost.

Can we please redo this as:


	int write;
	unsigned int flags;

	/*
	 * Big fat comment explaining the next three lines goes here
	 */
	write = write_access & ~FAULT_FLAG_RETRY;
	unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
	flags |= (write_access & FAULT_FLAG_RETRY);

?

>  	pte_unmap(page_table);
>  	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
>  }
> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
>  	pud_t *pud;
>  	pmd_t *pmd;
>  	pte_t *pte;
> +	int ret;
> 
>  	__set_current_state(TASK_RUNNING);
> 
> -	count_vm_event(PGFAULT);
> -
> -	if (unlikely(is_vm_hugetlb_page(vma)))
> -		return hugetlb_fault(mm, vma, address, write_access);
> +	if (unlikely(is_vm_hugetlb_page(vma))) {
> +		ret = hugetlb_fault(mm, vma, address, write_access);
> +		goto out;
> +	}
> 
> +	ret = VM_FAULT_OOM;
>  	pgd = pgd_offset(mm, address);
>  	pud = pud_alloc(mm, pgd, address);
>  	if (!pud)
> -		return VM_FAULT_OOM;
> +		goto out;
>  	pmd = pmd_alloc(mm, pud, address);
>  	if (!pmd)
> -		return VM_FAULT_OOM;
> +		goto out;
>  	pte = pte_alloc_map(mm, pmd, address);
>  	if (!pte)
> -		return VM_FAULT_OOM;
> +		goto out;
> 
> -	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> +	ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> +out:
> +	if (!(ret & VM_FAULT_RETRY))
> +		count_vm_event(PGFAULT);
> +	return ret;
>  }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
  2009-04-10  6:02 ` Andrew Morton
@ 2009-04-10  6:32   ` Ying Han
  2009-04-10  6:48     ` Wu Fengguang
  2009-04-10  7:30   ` Wu Fengguang
  1 sibling, 1 reply; 11+ messages in thread
From: Ying Han @ 2009-04-10  6:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, linux-kernel, torvalds, Ingo Molnar, Mike Waychison,
	Rohit Seth, Hugh Dickins, Peter Zijlstra, H. Peter Anvin,
	Török Edwin, Lee Schermerhorn, Nick Piggin,
	Wu Fengguang

2009/4/9 Andrew Morton <akpm@linux-foundation.org>:
>
>> Subject: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
>
> Please give each patch in the series a unique and meaningful title.
>
> On Wed, 8 Apr 2009 13:02:35 -0700 Ying Han <yinghan@google.com> wrote:
>
>> support for FAULT_FLAG_RETRY with no user change:
>
> yup, we'd prefer a complete changelog here please.
>
>> Signed-off-by: Ying Han <yinghan@google.com>
>>              Mike Waychison <mikew@google.com>
>
> This form:
>
> Signed-off-by: Ying Han <yinghan@google.com>
> Signed-off-by: Mike Waychison <mikew@google.com>

Thanks Andrew,  and i need to add Fengguang to Signed-off-by.

>
> is conventional.
>
>> index 4a853ef..29c2c39 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -793,7 +793,7 @@ struct file_ra_state {
>>                                          there are only # of pages ahead */
>>
>>       unsigned int ra_pages;          /* Maximum readahead window */
>> -     int mmap_miss;                  /* Cache miss stat for mmap accesses */
>> +     unsigned int mmap_miss;         /* Cache miss stat for mmap accesses */
>
> This change makes sense, but we're not told the reasons for making it?
> Did it fix a bug, or is it an unrelated fixlet, or...?

Fengguang: Could you help making comments on this part? and i will
make changes elsewhere
as Andrew pointed. Thanks

>
>>       loff_t prev_pos;                /* Cache last read() position */
>>  };
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index ffee2f7..5a134a9 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
>>
>>  #define FAULT_FLAG_WRITE     0x01    /* Fault was a write access */
>>  #define FAULT_FLAG_NONLINEAR 0x02    /* Fault was via a nonlinear mapping */
>> +#define FAULT_FLAG_RETRY     0x04    /* Retry major fault */
>>
>>
>>  /*
>> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
>>
>>  #define VM_FAULT_MINOR       0 /* For backwards compat. Remove me quickly. */
>>
>> +#define VM_FAULT_RETRY       0x0010
>>  #define VM_FAULT_OOM 0x0001
>>  #define VM_FAULT_SIGBUS      0x0002
>>  #define VM_FAULT_MAJOR       0x0004
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index f3e5f89..6eb7c36 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -714,6 +714,58 @@ repeat:
>>  EXPORT_SYMBOL(find_lock_page);
>>
>>  /**
>> + * find_lock_page_retry - locate, pin and lock a pagecache page
>> + * @mapping: the address_space to search
>> + * @offset: the page index
>> + * @vma: vma in which the fault was taken
>> + * @ppage: zero if page not present, otherwise point to the page in pagecache
>> + * @retry: 1 indicate caller tolerate a retry.
>> + *
>> + * If retry flag is on, and page is already locked by someone else, return
>> + * a hint of retry and leave *ppage untouched.
>> + *
>> + * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
>> + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
>> + * hint to caller for retry, or ret=0 which means page is succefully
>> + * locked.
>> + */
>
> How about this:
>
>  If the page was not found in pagecache, find_lock_page_retry()
>  returns 0 and sets *@ppage to NULL.
>
>  If the page was found in pagecache but is locked and @retry is
>  true, find_lock_page_retry() returns VM_FAULT_RETRY and does not
>  write to *@ppage.
>
>  If the page was found in pagecache and @retry is false,
>  find_lock_page_retry() locks the page, writes its address to *@ppage
>  and returns 0.
>
>> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
>> +                             struct vm_area_struct *vma, struct page **ppage,
>> +                             int retry)
>> +{
>> +     unsigned int ret = 0;
>> +     struct page *page;
>> +
>> +repeat:
>> +     page = find_get_page(mapping, offset);
>> +     if (page) {
>> +             if (!retry)
>> +                     lock_page(page);
>> +             else {
>> +                     if (!trylock_page(page)) {
>> +                             struct mm_struct *mm = vma->vm_mm;
>> +
>> +                             up_read(&mm->mmap_sem);
>> +                             wait_on_page_locked(page);
>
> It looks strange that we wait for the page lock and then don't
> just lock the page and return it.
>
> Adding a comment explaining _why_ we do this would be nice.
>
>> +                             down_read(&mm->mmap_sem);
>> +
>> +                             page_cache_release(page);
>> +                             return VM_FAULT_RETRY;
>> +                     }
>> +             }
>> +             if (unlikely(page->mapping != mapping)) {
>> +                     unlock_page(page);
>> +                     page_cache_release(page);
>> +                     goto repeat;
>> +             }
>> +             VM_BUG_ON(page->index != offset);
>> +     }
>> +     *ppage = page;
>> +     return ret;
>> +}
>> +EXPORT_SYMBOL(find_lock_page_retry);
>> +
>> +/**
>>   * find_or_create_page - locate or add a pagecache page
>>   * @mapping: the page's address_space
>>   * @index: the page's index into the mapping
>> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>>       pgoff_t size;
>>       int did_readaround = 0;
>>       int ret = 0;
>> +     int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
>> +     int retry_ret;
>>
>>       size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
>>       if (vmf->pgoff >= size)
>> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>>        */
>>  retry_find:
>>       page = find_lock_page(mapping, vmf->pgoff);
>> +
>>       /*
>>        * For sequential accesses, we use the generic readahead logic.
>>        */
>> @@ -1465,7 +1520,13 @@ retry_find:
>>               if (!page) {
>>                       page_cache_sync_readahead(mapping, ra, file,
>>                                                          vmf->pgoff, 1);
>> -                     page = find_lock_page(mapping, vmf->pgoff);
>> +                     retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>> +                                             vma, &page, retry_flag);
>> +                     if (retry_ret == VM_FAULT_RETRY) {
>> +                             /* counteract the followed retry hit */
>> +                             ra->mmap_miss++;
>> +                             return retry_ret;
>> +                     }
>>                       if (!page)
>>                               goto no_cached_page;
>>               }
>> @@ -1504,7 +1565,14 @@ retry_find:
>>                               start = vmf->pgoff - ra_pages / 2;
>>                       do_page_cache_readahead(mapping, file, start, ra_pages);
>>               }
>> -             page = find_lock_page(mapping, vmf->pgoff);
>> +retry_find_retry:
>> +             retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>> +                             vma, &page, retry_flag);
>> +             if (retry_ret == VM_FAULT_RETRY) {
>> +                     /* counteract the followed retry hit */
>> +                     ra->mmap_miss++;
>> +                     return retry_ret;
>> +             }
>>               if (!page)
>>                       goto no_cached_page;
>>       }
>> @@ -1548,7 +1616,7 @@ no_cached_page:
>>        * meantime, we'll just come back here and read it again.
>>        */
>>       if (error >= 0)
>> -             goto retry_find;
>> +             goto retry_find_retry;
>>
>>       /*
>>        * An error return from page_cache_read can result if the
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 164951c..5e215c9 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
>>       vmf.page = NULL;
>>
>>       ret = vma->vm_ops->fault(vma, &vmf);
>> +
>> +     /* page may be available, but we have to restart the process
>> +      * because mmap_sem was dropped during the ->fault
>> +      */
>
> The preferred comment layout is:
>
>        /*
>         * The page may be available, but we have to restart the process
>         * because mmap_sem was dropped during the ->fault
>         */
>
>> +     if (ret & VM_FAULT_RETRY)
>> +             return ret;
>
> Again, I worry that in some places we treat VM_FAULT_RETRY as an
> enumerated type:
>
>        if (foo == VM_FAULT_RETRY)
>
> but in other places we treat it as a bitfield
>
>        if (foo & VM_FAULT_RETRY)
>
> it may not be buggy, but it _looks_ buggy.  And confusing and
> inconsistent and dangerous.
>
> Can it be improved?
>
>>       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
>>               return ret;
>>
>> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
>>  {
>>       pgoff_t pgoff = (((address & PAGE_MASK)
>>                       - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
>> -     unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
>> +     int write = write_access & ~FAULT_FLAG_RETRY;
>> +     unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>>
>> +     flags |= (write_access & FAULT_FLAG_RETRY);
>
> gee, I'm lost.
>
> Can we please redo this as:
>
>
>        int write;
>        unsigned int flags;
>
>        /*
>         * Big fat comment explaining the next three lines goes here
>         */
>        write = write_access & ~FAULT_FLAG_RETRY;
>        unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>        flags |= (write_access & FAULT_FLAG_RETRY);
>
> ?
>
>>       pte_unmap(page_table);
>>       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
>>  }
>> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
>>       pud_t *pud;
>>       pmd_t *pmd;
>>       pte_t *pte;
>> +     int ret;
>>
>>       __set_current_state(TASK_RUNNING);
>>
>> -     count_vm_event(PGFAULT);
>> -
>> -     if (unlikely(is_vm_hugetlb_page(vma)))
>> -             return hugetlb_fault(mm, vma, address, write_access);
>> +     if (unlikely(is_vm_hugetlb_page(vma))) {
>> +             ret = hugetlb_fault(mm, vma, address, write_access);
>> +             goto out;
>> +     }
>>
>> +     ret = VM_FAULT_OOM;
>>       pgd = pgd_offset(mm, address);
>>       pud = pud_alloc(mm, pgd, address);
>>       if (!pud)
>> -             return VM_FAULT_OOM;
>> +             goto out;
>>       pmd = pmd_alloc(mm, pud, address);
>>       if (!pmd)
>> -             return VM_FAULT_OOM;
>> +             goto out;
>>       pte = pte_alloc_map(mm, pmd, address);
>>       if (!pte)
>> -             return VM_FAULT_OOM;
>> +             goto out;
>>
>> -     return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
>> +     ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
>> +out:
>> +     if (!(ret & VM_FAULT_RETRY))
>> +             count_vm_event(PGFAULT);
>> +     return ret;
>>  }
>
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
  2009-04-10  6:32   ` Ying Han
@ 2009-04-10  6:48     ` Wu Fengguang
  0 siblings, 0 replies; 11+ messages in thread
From: Wu Fengguang @ 2009-04-10  6:48 UTC (permalink / raw)
  To: Ying Han
  Cc: Andrew Morton, linux-mm, linux-kernel, torvalds, Ingo Molnar,
	Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Török Edwin, Lee Schermerhorn,
	Nick Piggin

On Fri, Apr 10, 2009 at 02:32:01PM +0800, Ying Han wrote:
> 2009/4/9 Andrew Morton <akpm@linux-foundation.org>:
> >
> >> Subject: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
> >
> > Please give each patch in the series a unique and meaningful title.
> >
> > On Wed, 8 Apr 2009 13:02:35 -0700 Ying Han <yinghan@google.com> wrote:
> >
> >> support for FAULT_FLAG_RETRY with no user change:
> >
> > yup, we'd prefer a complete changelog here please.
> >
> >> Signed-off-by: Ying Han <yinghan@google.com>
> >>              Mike Waychison <mikew@google.com>
> >
> > This form:
> >
> > Signed-off-by: Ying Han <yinghan@google.com>
> > Signed-off-by: Mike Waychison <mikew@google.com>
> 
> Thanks Andrew,  and i need to add Fengguang to Signed-off-by.

Thank you.

> >
> > is conventional.
> >
> >> index 4a853ef..29c2c39 100644
> >> --- a/include/linux/fs.h
> >> +++ b/include/linux/fs.h
> >> @@ -793,7 +793,7 @@ struct file_ra_state {
> >>                                          there are only # of pages ahead */
> >>
> >>       unsigned int ra_pages;          /* Maximum readahead window */
> >> -     int mmap_miss;                  /* Cache miss stat for mmap accesses */
> >> +     unsigned int mmap_miss;         /* Cache miss stat for mmap accesses */
> >
> > This change makes sense, but we're not told the reasons for making it?
> > Did it fix a bug, or is it an unrelated fixlet, or...?
> 
> Fengguang: Could you help making comments on this part? and i will
> make changes elsewhere as Andrew pointed. Thanks

Ah this may deserve a standalone patch:
---
readhead: make mmap_miss an unsigned int

This makes the performance impact of possible mmap_miss wrap around to be
temporary and tolerable: i.e. MMAP_LOTSAMISS=100 extra readarounds.

Otherwise if ever mmap_miss wraps around to negative, it takes INT_MAX
cache misses to bring it back to normal state.  During the time mmap
readaround will be _enabled_ for whatever wild random workload. That's
almost permanent performance impact.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/linux/fs.h |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- mm.orig/include/linux/fs.h
+++ mm/include/linux/fs.h
@@ -824,7 +824,7 @@ struct file_ra_state {
 					   there are only # of pages ahead */
 
 	unsigned int ra_pages;		/* Maximum readahead window */
-	int mmap_miss;			/* Cache miss stat for mmap accesses */
+	unsigned int mmap_miss;		/* Cache miss stat for mmap accesses */
 	loff_t prev_pos;		/* Cache last read() position */
 };
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
  2009-04-10  6:02 ` Andrew Morton
  2009-04-10  6:32   ` Ying Han
@ 2009-04-10  7:30   ` Wu Fengguang
  2009-04-10 16:01     ` Linus Torvalds
  1 sibling, 1 reply; 11+ messages in thread
From: Wu Fengguang @ 2009-04-10  7:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ying Han, linux-mm, linux-kernel, torvalds, Ingo Molnar,
	Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Török Edwin, Lee Schermerhorn,
	Nick Piggin

On Fri, Apr 10, 2009 at 02:02:05PM +0800, Andrew Morton wrote:
[snip]
> >  	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> >  		return ret;
> > 
> > @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
> >  {
> >  	pgoff_t pgoff = (((address & PAGE_MASK)
> >  			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> > -	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> > +	int write = write_access & ~FAULT_FLAG_RETRY;
> > +	unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
> > 
> > +	flags |= (write_access & FAULT_FLAG_RETRY);
> 
> gee, I'm lost.

So did me.

> Can we please redo this as:
> 
> 
> 	int write;
> 	unsigned int flags;
> 
> 	/*
> 	 * Big fat comment explaining the next three lines goes here
> 	 */

Basically it's doing a
        (is_write_access  | FAULT_FLAG_RETRY) =>
        (FAULT_FLAG_WRITE | FAULT_FLAG_RETRY)
by extracting the bool part:
> 	write = write_access & ~FAULT_FLAG_RETRY;
convert bool to a bit flag:
> 	unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
and restore the FAULT_FLAG_RETRY:
> 	flags |= (write_access & FAULT_FLAG_RETRY);

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
  2009-04-10  7:30   ` Wu Fengguang
@ 2009-04-10 16:01     ` Linus Torvalds
  2009-04-10 17:57       ` Ying Han
  0 siblings, 1 reply; 11+ messages in thread
From: Linus Torvalds @ 2009-04-10 16:01 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Andrew Morton, Ying Han, linux-mm, linux-kernel, Ingo Molnar,
	Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Török Edwin, Lee Schermerhorn,
	Nick Piggin

On Fri, 10 Apr 2009, Wu Fengguang wrote:

> On Fri, Apr 10, 2009 at 02:02:05PM +0800, Andrew Morton wrote:
> 
> > Can we please redo this as:
> > 
> > 
> > 	int write;
> > 	unsigned int flags;
> > 
> > 	/*
> > 	 * Big fat comment explaining the next three lines goes here
> > 	 */
> 
> Basically it's doing a
>         (is_write_access  | FAULT_FLAG_RETRY) =>
>         (FAULT_FLAG_WRITE | FAULT_FLAG_RETRY)
> by extracting the bool part:
> > 	write = write_access & ~FAULT_FLAG_RETRY;
> convert bool to a bit flag:
> > 	unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);

The point is, we shouldn't do that. 

Your code is confused, because it uses "write_access" as if it had the old 
behaviour (boolean to say "write") _plus_ the new behavior (bitmask to say 
"retry"), and that's just wrong.

Just get rid of "write_access" entirely, and switch it over to something 
that is a pure bitmask.

Yes, it means a couple of new preliminary patches that switch all callers 
of handle_mm_fault() over to using the VM_FLAGS, but that's not a big 
deal.

I'm following up this email with two _example_ patches. They are untested, 
but they look sane. I'd like the series to _start_ with these, and then 
you can pass FAULT_FLAGS_WRITE | FAULT_FLAGS_RETRY down to 
handle_mm_fault() cleanly.

Hmm? Note the _untested_ part on the patches to follow. It was done very 
mechanically, and the patches look sane, but .. !!!

			Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
  2009-04-10 16:01     ` Linus Torvalds
@ 2009-04-10 17:57       ` Ying Han
  0 siblings, 0 replies; 11+ messages in thread
From: Ying Han @ 2009-04-10 17:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Wu Fengguang, Andrew Morton, linux-mm, linux-kernel, Ingo Molnar,
	Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
	H. Peter Anvin, Török Edwin, Lee Schermerhorn,
	Nick Piggin

2009/4/10 Linus Torvalds <torvalds@linux-foundation.org>:
>
>
> On Fri, 10 Apr 2009, Wu Fengguang wrote:
>
>> On Fri, Apr 10, 2009 at 02:02:05PM +0800, Andrew Morton wrote:
>>
>> > Can we please redo this as:
>> >
>> >
>> >     int write;
>> >     unsigned int flags;
>> >
>> >     /*
>> >      * Big fat comment explaining the next three lines goes here
>> >      */
>>
>> Basically it's doing a
>>         (is_write_access  | FAULT_FLAG_RETRY) =>
>>         (FAULT_FLAG_WRITE | FAULT_FLAG_RETRY)
>> by extracting the bool part:
>> >     write = write_access & ~FAULT_FLAG_RETRY;
>> convert bool to a bit flag:
>> >     unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);

Fengguang: thank you for your comments. i agree this is confusing...
>
> The point is, we shouldn't do that.
>
> Your code is confused, because it uses "write_access" as if it had the old
> behaviour (boolean to say "write") _plus_ the new behavior (bitmask to say
> "retry"), and that's just wrong.
>
> Just get rid of "write_access" entirely, and switch it over to something
> that is a pure bitmask.
>
> Yes, it means a couple of new preliminary patches that switch all callers
> of handle_mm_fault() over to using the VM_FLAGS, but that's not a big
> deal.
>
> I'm following up this email with two _example_ patches. They are untested,
> but they look sane. I'd like the series to _start_ with these, and then
> you can pass FAULT_FLAGS_WRITE | FAULT_FLAGS_RETRY down to
> handle_mm_fault() cleanly.
>
> Hmm? Note the _untested_ part on the patches to follow. It was done very
> mechanically, and the patches look sane, but .. !!!

Thanks Linus for your comments. I will take Peter Zijlstra's patches
(the _untested_ part)
which basically replaces the write_access as a flag as you mentioned
and start from there.

My next step is to cleanup the patch with comments in the thread so
far and post the new version.
Anything else i missed, please let me know. thanks

--Ying
>
>                        Linus
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2009-04-10 17:56 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-08 20:06 [PATCH][1/2]page_fault retry with NOPAGE_RETRY Ying Han
  -- strict thread matches above, loose matches on Subject: below --
2009-04-08 20:02 Ying Han
2009-04-09  7:47 ` Wu Fengguang
2009-04-09 16:21   ` Ying Han
2009-04-10  0:25     ` Wu Fengguang
2009-04-10  6:02 ` Andrew Morton
2009-04-10  6:32   ` Ying Han
2009-04-10  6:48     ` Wu Fengguang
2009-04-10  7:30   ` Wu Fengguang
2009-04-10 16:01     ` Linus Torvalds
2009-04-10 17:57       ` Ying Han

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox