* [PATCH][1/2]page_fault retry with NOPAGE_RETRY
@ 2009-04-08 20:06 Ying Han
0 siblings, 0 replies; 11+ messages in thread
From: Ying Han @ 2009-04-08 20:06 UTC (permalink / raw)
To: linux-mm, linux-kernel, akpm, torvalds, Ingo Molnar,
Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
H. Peter Anvin, Török Edwin, Lee Schermerhorn,
Nick Piggin, Wu Fengguang
support for FAULT_FLAG_RETRY with no user change:
Signed-off-by: Ying Han <yinghan@google.com>
Mike Waychison <mikew@google.com>
include/linux/fs.h | 2 +-
include/linux/mm.h | 2 +
mm/filemap.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
mm/memory.c | 33 +++++++++++++++++------
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a853ef..29c2c39 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -793,7 +793,7 @@ struct file_ra_state {
there are only # of pages ahead */
unsigned int ra_pages; /* Maximum readahead window */
- int mmap_miss; /* Cache miss stat for mmap accesses */
+ unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
loff_t prev_pos; /* Cache last read() position */
};
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ffee2f7..5a134a9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
+#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */
/*
@@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
+#define VM_FAULT_RETRY 0x0010
#define VM_FAULT_OOM 0x0001
#define VM_FAULT_SIGBUS 0x0002
#define VM_FAULT_MAJOR 0x0004
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f89..6eb7c36 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -714,6 +714,58 @@ repeat:
EXPORT_SYMBOL(find_lock_page);
/**
+ * find_lock_page_retry - locate, pin and lock a pagecache page
+ * @mapping: the address_space to search
+ * @offset: the page index
+ * @vma: vma in which the fault was taken
+ * @ppage: zero if page not present, otherwise point to the page in pagecache
+ * @retry: 1 indicate caller tolerate a retry.
+ *
+ * If retry flag is on, and page is already locked by someone else, return
+ * a hint of retry and leave *ppage untouched.
+ *
+ * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
+ * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
+ * hint to caller for retry, or ret=0 which means page is succefully
+ * locked.
+ */
+unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
+ struct vm_area_struct *vma, struct page **ppage,
+ int retry)
+{
+ unsigned int ret = 0;
+ struct page *page;
+
+repeat:
+ page = find_get_page(mapping, offset);
+ if (page) {
+ if (!retry)
+ lock_page(page);
+ else {
+ if (!trylock_page(page)) {
+ struct mm_struct *mm = vma->vm_mm;
+
+ up_read(&mm->mmap_sem);
+ wait_on_page_locked(page);
+ down_read(&mm->mmap_sem);
+
+ page_cache_release(page);
+ return VM_FAULT_RETRY;
+ }
+ }
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
+ }
+ VM_BUG_ON(page->index != offset);
+ }
+ *ppage = page;
+ return ret;
+}
+EXPORT_SYMBOL(find_lock_page_retry);
+
+/**
* find_or_create_page - locate or add a pagecache page
* @mapping: the page's address_space
* @index: the page's index into the mapping
@@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
pgoff_t size;
int did_readaround = 0;
int ret = 0;
+ int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
+ int retry_ret;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
@@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
*/
retry_find:
page = find_lock_page(mapping, vmf->pgoff);
+
/*
* For sequential accesses, we use the generic readahead logic.
*/
@@ -1465,7 +1520,13 @@ retry_find:
if (!page) {
page_cache_sync_readahead(mapping, ra, file,
vmf->pgoff, 1);
- page = find_lock_page(mapping, vmf->pgoff);
+ retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+ vma, &page, retry_flag);
+ if (retry_ret == VM_FAULT_RETRY) {
+ /* counteract the followed retry hit */
+ ra->mmap_miss++;
+ return retry_ret;
+ }
if (!page)
goto no_cached_page;
}
@@ -1504,7 +1565,14 @@ retry_find:
start = vmf->pgoff - ra_pages / 2;
do_page_cache_readahead(mapping, file, start, ra_pages);
}
- page = find_lock_page(mapping, vmf->pgoff);
+retry_find_retry:
+ retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+ vma, &page, retry_flag);
+ if (retry_ret == VM_FAULT_RETRY) {
+ /* counteract the followed retry hit */
+ ra->mmap_miss++;
+ return retry_ret;
+ }
if (!page)
goto no_cached_page;
}
@@ -1548,7 +1616,7 @@ no_cached_page:
* meantime, we'll just come back here and read it again.
*/
if (error >= 0)
- goto retry_find;
+ goto retry_find_retry;
/*
* An error return from page_cache_read can result if the
diff --git a/mm/memory.c b/mm/memory.c
index 164951c..5e215c9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
vmf.page = NULL;
ret = vma->vm_ops->fault(vma, &vmf);
+
+ /* page may be available, but we have to restart the process
+ * because mmap_sem was dropped during the ->fault
+ */
+ if (ret & VM_FAULT_RETRY)
+ return ret;
+
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
@@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+ int write = write_access & ~FAULT_FLAG_RETRY;
+ unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
+ flags |= (write_access & FAULT_FLAG_RETRY);
pte_unmap(page_table);
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
@@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ int ret;
__set_current_state(TASK_RUNNING);
- count_vm_event(PGFAULT);
-
- if (unlikely(is_vm_hugetlb_page(vma)))
- return hugetlb_fault(mm, vma, address, write_access);
+ if (unlikely(is_vm_hugetlb_page(vma))) {
+ ret = hugetlb_fault(mm, vma, address, write_access);
+ goto out;
+ }
+ ret = VM_FAULT_OOM;
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
- return VM_FAULT_OOM;
+ goto out;
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
- return VM_FAULT_OOM;
+ goto out;
pte = pte_alloc_map(mm, pmd, address);
if (!pte)
- return VM_FAULT_OOM;
+ goto out;
- return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+ ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+out:
+ if (!(ret & VM_FAULT_RETRY))
+ count_vm_event(PGFAULT);
+ return ret;
}
#ifndef __PAGETABLE_PUD_FOLDED
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread* [PATCH][1/2]page_fault retry with NOPAGE_RETRY
@ 2009-04-08 20:02 Ying Han
2009-04-09 7:47 ` Wu Fengguang
2009-04-10 6:02 ` Andrew Morton
0 siblings, 2 replies; 11+ messages in thread
From: Ying Han @ 2009-04-08 20:02 UTC (permalink / raw)
To: linux-mm, linux-kernel, akpm, torvalds, Ingo Molnar,
Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
H. Peter Anvin, Török Edwin, Lee Schermerhorn,
Nick Piggin, Wu Fengguang
support for FAULT_FLAG_RETRY with no user change:
Signed-off-by: Ying Han <yinghan@google.com>
Mike Waychison <mikew@google.com>
include/linux/fs.h | 2 +-
include/linux/mm.h | 2 +
mm/filemap.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
mm/memory.c | 33 +++++++++++++++++------
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a853ef..29c2c39 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -793,7 +793,7 @@ struct file_ra_state {
there are only # of pages ahead */
unsigned int ra_pages; /* Maximum readahead window */
- int mmap_miss; /* Cache miss stat for mmap accesses */
+ unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
loff_t prev_pos; /* Cache last read() position */
};
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ffee2f7..5a134a9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
+#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */
/*
@@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
+#define VM_FAULT_RETRY 0x0010
#define VM_FAULT_OOM 0x0001
#define VM_FAULT_SIGBUS 0x0002
#define VM_FAULT_MAJOR 0x0004
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f89..6eb7c36 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -714,6 +714,58 @@ repeat:
EXPORT_SYMBOL(find_lock_page);
/**
+ * find_lock_page_retry - locate, pin and lock a pagecache page
+ * @mapping: the address_space to search
+ * @offset: the page index
+ * @vma: vma in which the fault was taken
+ * @ppage: zero if page not present, otherwise point to the page in pagecache
+ * @retry: 1 indicate caller tolerate a retry.
+ *
+ * If retry flag is on, and page is already locked by someone else, return
+ * a hint of retry and leave *ppage untouched.
+ *
+ * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
+ * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
+ * hint to caller for retry, or ret=0 which means page is succefully
+ * locked.
+ */
+unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
+ struct vm_area_struct *vma, struct page **ppage,
+ int retry)
+{
+ unsigned int ret = 0;
+ struct page *page;
+
+repeat:
+ page = find_get_page(mapping, offset);
+ if (page) {
+ if (!retry)
+ lock_page(page);
+ else {
+ if (!trylock_page(page)) {
+ struct mm_struct *mm = vma->vm_mm;
+
+ up_read(&mm->mmap_sem);
+ wait_on_page_locked(page);
+ down_read(&mm->mmap_sem);
+
+ page_cache_release(page);
+ return VM_FAULT_RETRY;
+ }
+ }
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
+ }
+ VM_BUG_ON(page->index != offset);
+ }
+ *ppage = page;
+ return ret;
+}
+EXPORT_SYMBOL(find_lock_page_retry);
+
+/**
* find_or_create_page - locate or add a pagecache page
* @mapping: the page's address_space
* @index: the page's index into the mapping
@@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
pgoff_t size;
int did_readaround = 0;
int ret = 0;
+ int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
+ int retry_ret;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
@@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
*/
retry_find:
page = find_lock_page(mapping, vmf->pgoff);
+
/*
* For sequential accesses, we use the generic readahead logic.
*/
@@ -1465,7 +1520,13 @@ retry_find:
if (!page) {
page_cache_sync_readahead(mapping, ra, file,
vmf->pgoff, 1);
- page = find_lock_page(mapping, vmf->pgoff);
+ retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+ vma, &page, retry_flag);
+ if (retry_ret == VM_FAULT_RETRY) {
+ /* counteract the followed retry hit */
+ ra->mmap_miss++;
+ return retry_ret;
+ }
if (!page)
goto no_cached_page;
}
@@ -1504,7 +1565,14 @@ retry_find:
start = vmf->pgoff - ra_pages / 2;
do_page_cache_readahead(mapping, file, start, ra_pages);
}
- page = find_lock_page(mapping, vmf->pgoff);
+retry_find_retry:
+ retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+ vma, &page, retry_flag);
+ if (retry_ret == VM_FAULT_RETRY) {
+ /* counteract the followed retry hit */
+ ra->mmap_miss++;
+ return retry_ret;
+ }
if (!page)
goto no_cached_page;
}
@@ -1548,7 +1616,7 @@ no_cached_page:
* meantime, we'll just come back here and read it again.
*/
if (error >= 0)
- goto retry_find;
+ goto retry_find_retry;
/*
* An error return from page_cache_read can result if the
diff --git a/mm/memory.c b/mm/memory.c
index 164951c..5e215c9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
vmf.page = NULL;
ret = vma->vm_ops->fault(vma, &vmf);
+
+ /* page may be available, but we have to restart the process
+ * because mmap_sem was dropped during the ->fault
+ */
+ if (ret & VM_FAULT_RETRY)
+ return ret;
+
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
@@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+ int write = write_access & ~FAULT_FLAG_RETRY;
+ unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
+ flags |= (write_access & FAULT_FLAG_RETRY);
pte_unmap(page_table);
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
@@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ int ret;
__set_current_state(TASK_RUNNING);
- count_vm_event(PGFAULT);
-
- if (unlikely(is_vm_hugetlb_page(vma)))
- return hugetlb_fault(mm, vma, address, write_access);
+ if (unlikely(is_vm_hugetlb_page(vma))) {
+ ret = hugetlb_fault(mm, vma, address, write_access);
+ goto out;
+ }
+ ret = VM_FAULT_OOM;
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
- return VM_FAULT_OOM;
+ goto out;
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
- return VM_FAULT_OOM;
+ goto out;
pte = pte_alloc_map(mm, pmd, address);
if (!pte)
- return VM_FAULT_OOM;
+ goto out;
- return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+ ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+out:
+ if (!(ret & VM_FAULT_RETRY))
+ count_vm_event(PGFAULT);
+ return ret;
}
#ifndef __PAGETABLE_PUD_FOLDED
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
2009-04-08 20:02 Ying Han
@ 2009-04-09 7:47 ` Wu Fengguang
2009-04-09 16:21 ` Ying Han
2009-04-10 6:02 ` Andrew Morton
1 sibling, 1 reply; 11+ messages in thread
From: Wu Fengguang @ 2009-04-09 7:47 UTC (permalink / raw)
To: Ying Han
Cc: linux-mm, linux-kernel, akpm, torvalds, Ingo Molnar,
Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
H. Peter Anvin, Török Edwin, Lee Schermerhorn,
Nick Piggin
On Thu, Apr 09, 2009 at 04:02:35AM +0800, Ying Han wrote:
> support for FAULT_FLAG_RETRY with no user change:
A better changelog is desired, otherwise:
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> Signed-off-by: Ying Han <yinghan@google.com>
> Mike Waychison <mikew@google.com>
>
> include/linux/fs.h | 2 +-
> include/linux/mm.h | 2 +
> mm/filemap.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
> mm/memory.c | 33 +++++++++++++++++------
>
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 4a853ef..29c2c39 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -793,7 +793,7 @@ struct file_ra_state {
> there are only # of pages ahead */
>
> unsigned int ra_pages; /* Maximum readahead window */
> - int mmap_miss; /* Cache miss stat for mmap accesses */
> + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
> loff_t prev_pos; /* Cache last read() position */
> };
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ffee2f7..5a134a9 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
>
> #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
> #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
> +#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */
>
>
> /*
> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
>
> #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
>
> +#define VM_FAULT_RETRY 0x0010
> #define VM_FAULT_OOM 0x0001
> #define VM_FAULT_SIGBUS 0x0002
> #define VM_FAULT_MAJOR 0x0004
Why not append VM_FAULT_RETRY here, and indent the _OOM line?
> diff --git a/mm/filemap.c b/mm/filemap.c
> index f3e5f89..6eb7c36 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -714,6 +714,58 @@ repeat:
> EXPORT_SYMBOL(find_lock_page);
>
> /**
> + * find_lock_page_retry - locate, pin and lock a pagecache page
> + * @mapping: the address_space to search
> + * @offset: the page index
> + * @vma: vma in which the fault was taken
> + * @ppage: zero if page not present, otherwise point to the page in pagecache
> + * @retry: 1 indicate caller tolerate a retry.
> + *
> + * If retry flag is on, and page is already locked by someone else, return
> + * a hint of retry and leave *ppage untouched.
> + *
> + * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
> + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
> + * hint to caller for retry, or ret=0 which means page is succefully
> + * locked.
> + */
> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
> + struct vm_area_struct *vma, struct page **ppage,
> + int retry)
> +{
> + unsigned int ret = 0;
> + struct page *page;
> +
> +repeat:
> + page = find_get_page(mapping, offset);
> + if (page) {
> + if (!retry)
> + lock_page(page);
> + else {
> + if (!trylock_page(page)) {
> + struct mm_struct *mm = vma->vm_mm;
> +
> + up_read(&mm->mmap_sem);
> + wait_on_page_locked(page);
> + down_read(&mm->mmap_sem);
> +
> + page_cache_release(page);
> + return VM_FAULT_RETRY;
> + }
> + }
> + if (unlikely(page->mapping != mapping)) {
> + unlock_page(page);
> + page_cache_release(page);
> + goto repeat;
> + }
> + VM_BUG_ON(page->index != offset);
> + }
> + *ppage = page;
> + return ret;
> +}
> +EXPORT_SYMBOL(find_lock_page_retry);
> +
> +/**
> * find_or_create_page - locate or add a pagecache page
> * @mapping: the page's address_space
> * @index: the page's index into the mapping
> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> pgoff_t size;
> int did_readaround = 0;
> int ret = 0;
> + int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
> + int retry_ret;
>
> size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> if (vmf->pgoff >= size)
> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> */
> retry_find:
> page = find_lock_page(mapping, vmf->pgoff);
> +
> /*
> * For sequential accesses, we use the generic readahead logic.
> */
> @@ -1465,7 +1520,13 @@ retry_find:
> if (!page) {
> page_cache_sync_readahead(mapping, ra, file,
> vmf->pgoff, 1);
> - page = find_lock_page(mapping, vmf->pgoff);
> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> + vma, &page, retry_flag);
> + if (retry_ret == VM_FAULT_RETRY) {
> + /* counteract the followed retry hit */
> + ra->mmap_miss++;
Please don't relocate the comment...because that will break my
following patches(mainly Linus' filemap cleanups), which will
_heavily_ rework these chunks anyway. And make a hard time for Andrew
to merge them.
> + return retry_ret;
> + }
> if (!page)
> goto no_cached_page;
> }
> @@ -1504,7 +1565,14 @@ retry_find:
> start = vmf->pgoff - ra_pages / 2;
> do_page_cache_readahead(mapping, file, start, ra_pages);
> }
> - page = find_lock_page(mapping, vmf->pgoff);
> +retry_find_retry:
> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> + vma, &page, retry_flag);
> + if (retry_ret == VM_FAULT_RETRY) {
> + /* counteract the followed retry hit */
> + ra->mmap_miss++;
ditto
Thanks,
Fengguang
> + return retry_ret;
> + }
> if (!page)
> goto no_cached_page;
> }
> @@ -1548,7 +1616,7 @@ no_cached_page:
> * meantime, we'll just come back here and read it again.
> */
> if (error >= 0)
> - goto retry_find;
> + goto retry_find_retry;
>
> /*
> * An error return from page_cache_read can result if the
> diff --git a/mm/memory.c b/mm/memory.c
> index 164951c..5e215c9 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
> vmf.page = NULL;
>
> ret = vma->vm_ops->fault(vma, &vmf);
> +
> + /* page may be available, but we have to restart the process
> + * because mmap_sem was dropped during the ->fault
> + */
> + if (ret & VM_FAULT_RETRY)
> + return ret;
> +
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> return ret;
>
> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
> {
> pgoff_t pgoff = (((address & PAGE_MASK)
> - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> + int write = write_access & ~FAULT_FLAG_RETRY;
> + unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>
> + flags |= (write_access & FAULT_FLAG_RETRY);
> pte_unmap(page_table);
> return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
> }
> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
> pud_t *pud;
> pmd_t *pmd;
> pte_t *pte;
> + int ret;
>
> __set_current_state(TASK_RUNNING);
>
> - count_vm_event(PGFAULT);
> -
> - if (unlikely(is_vm_hugetlb_page(vma)))
> - return hugetlb_fault(mm, vma, address, write_access);
> + if (unlikely(is_vm_hugetlb_page(vma))) {
> + ret = hugetlb_fault(mm, vma, address, write_access);
> + goto out;
> + }
>
> + ret = VM_FAULT_OOM;
> pgd = pgd_offset(mm, address);
> pud = pud_alloc(mm, pgd, address);
> if (!pud)
> - return VM_FAULT_OOM;
> + goto out;
> pmd = pmd_alloc(mm, pud, address);
> if (!pmd)
> - return VM_FAULT_OOM;
> + goto out;
> pte = pte_alloc_map(mm, pmd, address);
> if (!pte)
> - return VM_FAULT_OOM;
> + goto out;
>
> - return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> + ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> +out:
> + if (!(ret & VM_FAULT_RETRY))
> + count_vm_event(PGFAULT);
> + return ret;
> }
>
> #ifndef __PAGETABLE_PUD_FOLDED
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
2009-04-09 7:47 ` Wu Fengguang
@ 2009-04-09 16:21 ` Ying Han
2009-04-10 0:25 ` Wu Fengguang
0 siblings, 1 reply; 11+ messages in thread
From: Ying Han @ 2009-04-09 16:21 UTC (permalink / raw)
To: Wu Fengguang
Cc: linux-mm, linux-kernel, akpm, torvalds, Ingo Molnar,
Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
H. Peter Anvin, Török Edwin, Lee Schermerhorn,
Nick Piggin
On Thu, Apr 9, 2009 at 12:47 AM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> On Thu, Apr 09, 2009 at 04:02:35AM +0800, Ying Han wrote:
>> support for FAULT_FLAG_RETRY with no user change:
>
> A better changelog is desired, otherwise:
>
> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
>
>> Signed-off-by: Ying Han <yinghan@google.com>
>> Mike Waychison <mikew@google.com>
>>
>> include/linux/fs.h | 2 +-
>> include/linux/mm.h | 2 +
>> mm/filemap.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
>> mm/memory.c | 33 +++++++++++++++++------
>>
>>
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 4a853ef..29c2c39 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -793,7 +793,7 @@ struct file_ra_state {
>> there are only # of pages ahead */
>>
>> unsigned int ra_pages; /* Maximum readahead window */
>> - int mmap_miss; /* Cache miss stat for mmap accesses */
>> + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
>> loff_t prev_pos; /* Cache last read() position */
>> };
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index ffee2f7..5a134a9 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
>>
>> #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
>> #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
>> +#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */
>>
>>
>> /*
>> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
>>
>> #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
>>
>> +#define VM_FAULT_RETRY 0x0010
>> #define VM_FAULT_OOM 0x0001
>> #define VM_FAULT_SIGBUS 0x0002
>> #define VM_FAULT_MAJOR 0x0004
>
> Why not append VM_FAULT_RETRY here, and indent the _OOM line?
>
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index f3e5f89..6eb7c36 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -714,6 +714,58 @@ repeat:
>> EXPORT_SYMBOL(find_lock_page);
>>
>> /**
>> + * find_lock_page_retry - locate, pin and lock a pagecache page
>> + * @mapping: the address_space to search
>> + * @offset: the page index
>> + * @vma: vma in which the fault was taken
>> + * @ppage: zero if page not present, otherwise point to the page in pagecache
>> + * @retry: 1 indicate caller tolerate a retry.
>> + *
>> + * If retry flag is on, and page is already locked by someone else, return
>> + * a hint of retry and leave *ppage untouched.
>> + *
>> + * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
>> + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
>> + * hint to caller for retry, or ret=0 which means page is succefully
>> + * locked.
>> + */
>> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
>> + struct vm_area_struct *vma, struct page **ppage,
>> + int retry)
>> +{
>> + unsigned int ret = 0;
>> + struct page *page;
>> +
>> +repeat:
>> + page = find_get_page(mapping, offset);
>> + if (page) {
>> + if (!retry)
>> + lock_page(page);
>> + else {
>> + if (!trylock_page(page)) {
>> + struct mm_struct *mm = vma->vm_mm;
>> +
>> + up_read(&mm->mmap_sem);
>> + wait_on_page_locked(page);
>> + down_read(&mm->mmap_sem);
>> +
>> + page_cache_release(page);
>> + return VM_FAULT_RETRY;
>> + }
>> + }
>> + if (unlikely(page->mapping != mapping)) {
>> + unlock_page(page);
>> + page_cache_release(page);
>> + goto repeat;
>> + }
>> + VM_BUG_ON(page->index != offset);
>> + }
>> + *ppage = page;
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(find_lock_page_retry);
>> +
>> +/**
>> * find_or_create_page - locate or add a pagecache page
>> * @mapping: the page's address_space
>> * @index: the page's index into the mapping
>> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>> pgoff_t size;
>> int did_readaround = 0;
>> int ret = 0;
>> + int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
>> + int retry_ret;
>>
>> size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
>> if (vmf->pgoff >= size)
>> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>> */
>> retry_find:
>> page = find_lock_page(mapping, vmf->pgoff);
>> +
>> /*
>> * For sequential accesses, we use the generic readahead logic.
>> */
>> @@ -1465,7 +1520,13 @@ retry_find:
>> if (!page) {
>> page_cache_sync_readahead(mapping, ra, file,
>> vmf->pgoff, 1);
>> - page = find_lock_page(mapping, vmf->pgoff);
>> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>> + vma, &page, retry_flag);
>> + if (retry_ret == VM_FAULT_RETRY) {
>> + /* counteract the followed retry hit */
>> + ra->mmap_miss++;
>
> Please don't relocate the comment...because that will break my
> following patches(mainly Linus' filemap cleanups), which will
> _heavily_ rework these chunks anyway. And make a hard time for Andrew
> to merge them.
then how you force the 80 character lines?
>
>> + return retry_ret;
>> + }
>
>> if (!page)
>> goto no_cached_page;
>> }
>> @@ -1504,7 +1565,14 @@ retry_find:
>> start = vmf->pgoff - ra_pages / 2;
>> do_page_cache_readahead(mapping, file, start, ra_pages);
>> }
>> - page = find_lock_page(mapping, vmf->pgoff);
>> +retry_find_retry:
>> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>> + vma, &page, retry_flag);
>> + if (retry_ret == VM_FAULT_RETRY) {
>> + /* counteract the followed retry hit */
>> + ra->mmap_miss++;
>
> ditto
>
> Thanks,
> Fengguang
>
>> + return retry_ret;
>> + }
>> if (!page)
>> goto no_cached_page;
>> }
>> @@ -1548,7 +1616,7 @@ no_cached_page:
>> * meantime, we'll just come back here and read it again.
>> */
>> if (error >= 0)
>> - goto retry_find;
>> + goto retry_find_retry;
>>
>> /*
>> * An error return from page_cache_read can result if the
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 164951c..5e215c9 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
>> vmf.page = NULL;
>>
>> ret = vma->vm_ops->fault(vma, &vmf);
>> +
>> + /* page may be available, but we have to restart the process
>> + * because mmap_sem was dropped during the ->fault
>> + */
>> + if (ret & VM_FAULT_RETRY)
>> + return ret;
>> +
>> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
>> return ret;
>>
>> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
>> {
>> pgoff_t pgoff = (((address & PAGE_MASK)
>> - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
>> - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
>> + int write = write_access & ~FAULT_FLAG_RETRY;
>> + unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>>
>> + flags |= (write_access & FAULT_FLAG_RETRY);
>> pte_unmap(page_table);
>> return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
>> }
>> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
>> pud_t *pud;
>> pmd_t *pmd;
>> pte_t *pte;
>> + int ret;
>>
>> __set_current_state(TASK_RUNNING);
>>
>> - count_vm_event(PGFAULT);
>> -
>> - if (unlikely(is_vm_hugetlb_page(vma)))
>> - return hugetlb_fault(mm, vma, address, write_access);
>> + if (unlikely(is_vm_hugetlb_page(vma))) {
>> + ret = hugetlb_fault(mm, vma, address, write_access);
>> + goto out;
>> + }
>>
>> + ret = VM_FAULT_OOM;
>> pgd = pgd_offset(mm, address);
>> pud = pud_alloc(mm, pgd, address);
>> if (!pud)
>> - return VM_FAULT_OOM;
>> + goto out;
>> pmd = pmd_alloc(mm, pud, address);
>> if (!pmd)
>> - return VM_FAULT_OOM;
>> + goto out;
>> pte = pte_alloc_map(mm, pmd, address);
>> if (!pte)
>> - return VM_FAULT_OOM;
>> + goto out;
>>
>> - return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
>> + ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
>> +out:
>> + if (!(ret & VM_FAULT_RETRY))
>> + count_vm_event(PGFAULT);
>> + return ret;
>> }
>>
>> #ifndef __PAGETABLE_PUD_FOLDED
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
2009-04-09 16:21 ` Ying Han
@ 2009-04-10 0:25 ` Wu Fengguang
0 siblings, 0 replies; 11+ messages in thread
From: Wu Fengguang @ 2009-04-10 0:25 UTC (permalink / raw)
To: Ying Han
Cc: linux-mm, linux-kernel, akpm, torvalds, Ingo Molnar,
Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
H. Peter Anvin, Török Edwin, Lee Schermerhorn,
Nick Piggin
On Fri, Apr 10, 2009 at 12:21:25AM +0800, Ying Han wrote:
> On Thu, Apr 9, 2009 at 12:47 AM, Wu Fengguang <fengguang.wu@intel.com> wrote:
> > On Thu, Apr 09, 2009 at 04:02:35AM +0800, Ying Han wrote:
> >> support for FAULT_FLAG_RETRY with no user change:
> >
> > A better changelog is desired, otherwise:
> >
> > Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> >
> >> Signed-off-by: Ying Han <yinghan@google.com>
> >> Mike Waychison <mikew@google.com>
> >>
> >> include/linux/fs.h | 2 +-
> >> include/linux/mm.h | 2 +
> >> mm/filemap.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
> >> mm/memory.c | 33 +++++++++++++++++------
> >>
> >>
> >> diff --git a/include/linux/fs.h b/include/linux/fs.h
> >> index 4a853ef..29c2c39 100644
> >> --- a/include/linux/fs.h
> >> +++ b/include/linux/fs.h
> >> @@ -793,7 +793,7 @@ struct file_ra_state {
> >> there are only # of pages ahead */
> >>
> >> unsigned int ra_pages; /* Maximum readahead window */
> >> - int mmap_miss; /* Cache miss stat for mmap accesses */
> >> + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
> >> loff_t prev_pos; /* Cache last read() position */
> >> };
> >>
> >> diff --git a/include/linux/mm.h b/include/linux/mm.h
> >> index ffee2f7..5a134a9 100644
> >> --- a/include/linux/mm.h
> >> +++ b/include/linux/mm.h
> >> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
> >>
> >> #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
> >> #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
> >> +#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */
> >>
> >>
> >> /*
> >> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
> >>
> >> #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
> >>
> >> +#define VM_FAULT_RETRY 0x0010
> >> #define VM_FAULT_OOM 0x0001
> >> #define VM_FAULT_SIGBUS 0x0002
> >> #define VM_FAULT_MAJOR 0x0004
> >
> > Why not append VM_FAULT_RETRY here, and indent the _OOM line?
> >
> >> diff --git a/mm/filemap.c b/mm/filemap.c
> >> index f3e5f89..6eb7c36 100644
> >> --- a/mm/filemap.c
> >> +++ b/mm/filemap.c
> >> @@ -714,6 +714,58 @@ repeat:
> >> EXPORT_SYMBOL(find_lock_page);
> >>
> >> /**
> >> + * find_lock_page_retry - locate, pin and lock a pagecache page
> >> + * @mapping: the address_space to search
> >> + * @offset: the page index
> >> + * @vma: vma in which the fault was taken
> >> + * @ppage: zero if page not present, otherwise point to the page in pagecache
> >> + * @retry: 1 indicate caller tolerate a retry.
> >> + *
> >> + * If retry flag is on, and page is already locked by someone else, return
> >> + * a hint of retry and leave *ppage untouched.
> >> + *
> >> + * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
> >> + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
> >> + * hint to caller for retry, or ret=0 which means page is succefully
> >> + * locked.
> >> + */
> >> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
> >> + struct vm_area_struct *vma, struct page **ppage,
> >> + int retry)
> >> +{
> >> + unsigned int ret = 0;
> >> + struct page *page;
> >> +
> >> +repeat:
> >> + page = find_get_page(mapping, offset);
> >> + if (page) {
> >> + if (!retry)
> >> + lock_page(page);
> >> + else {
> >> + if (!trylock_page(page)) {
> >> + struct mm_struct *mm = vma->vm_mm;
> >> +
> >> + up_read(&mm->mmap_sem);
> >> + wait_on_page_locked(page);
> >> + down_read(&mm->mmap_sem);
> >> +
> >> + page_cache_release(page);
> >> + return VM_FAULT_RETRY;
> >> + }
> >> + }
> >> + if (unlikely(page->mapping != mapping)) {
> >> + unlock_page(page);
> >> + page_cache_release(page);
> >> + goto repeat;
> >> + }
> >> + VM_BUG_ON(page->index != offset);
> >> + }
> >> + *ppage = page;
> >> + return ret;
> >> +}
> >> +EXPORT_SYMBOL(find_lock_page_retry);
> >> +
> >> +/**
> >> * find_or_create_page - locate or add a pagecache page
> >> * @mapping: the page's address_space
> >> * @index: the page's index into the mapping
> >> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> >> pgoff_t size;
> >> int did_readaround = 0;
> >> int ret = 0;
> >> + int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
> >> + int retry_ret;
> >>
> >> size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> >> if (vmf->pgoff >= size)
> >> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> >> */
> >> retry_find:
> >> page = find_lock_page(mapping, vmf->pgoff);
> >> +
> >> /*
> >> * For sequential accesses, we use the generic readahead logic.
> >> */
> >> @@ -1465,7 +1520,13 @@ retry_find:
> >> if (!page) {
> >> page_cache_sync_readahead(mapping, ra, file,
> >> vmf->pgoff, 1);
> >> - page = find_lock_page(mapping, vmf->pgoff);
> >> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> >> + vma, &page, retry_flag);
> >> + if (retry_ret == VM_FAULT_RETRY) {
> >> + /* counteract the followed retry hit */
> >> + ra->mmap_miss++;
> >
> > Please don't relocate the comment...because that will break my
> > following patches(mainly Linus' filemap cleanups), which will
> > _heavily_ rework these chunks anyway. And make a hard time for Andrew
> > to merge them.
>
> then how you force the 80 character lines?
The above line will be removed by my following patch, and...
> >
> >> + return retry_ret;
> >> + }
> >
> >> if (!page)
> >> goto no_cached_page;
> >> }
> >> @@ -1504,7 +1565,14 @@ retry_find:
> >> start = vmf->pgoff - ra_pages / 2;
> >> do_page_cache_readahead(mapping, file, start, ra_pages);
> >> }
> >> - page = find_lock_page(mapping, vmf->pgoff);
> >> +retry_find_retry:
> >> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> >> + vma, &page, retry_flag);
> >> + if (retry_ret == VM_FAULT_RETRY) {
> >> + /* counteract the followed retry hit */
> >> + ra->mmap_miss++;
> >
...this line can have the comment within 80 column, and will be kept
by my patch.
Thanks,
Fengguang
> > ditto
> >
> > Thanks,
> > Fengguang
> >
> >> + return retry_ret;
> >> + }
> >> if (!page)
> >> goto no_cached_page;
> >> }
> >> @@ -1548,7 +1616,7 @@ no_cached_page:
> >> * meantime, we'll just come back here and read it again.
> >> */
> >> if (error >= 0)
> >> - goto retry_find;
> >> + goto retry_find_retry;
> >>
> >> /*
> >> * An error return from page_cache_read can result if the
> >> diff --git a/mm/memory.c b/mm/memory.c
> >> index 164951c..5e215c9 100644
> >> --- a/mm/memory.c
> >> +++ b/mm/memory.c
> >> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
> >> vmf.page = NULL;
> >>
> >> ret = vma->vm_ops->fault(vma, &vmf);
> >> +
> >> + /* page may be available, but we have to restart the process
> >> + * because mmap_sem was dropped during the ->fault
> >> + */
> >> + if (ret & VM_FAULT_RETRY)
> >> + return ret;
> >> +
> >> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> >> return ret;
> >>
> >> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
> >> {
> >> pgoff_t pgoff = (((address & PAGE_MASK)
> >> - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> >> - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> >> + int write = write_access & ~FAULT_FLAG_RETRY;
> >> + unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
> >>
> >> + flags |= (write_access & FAULT_FLAG_RETRY);
> >> pte_unmap(page_table);
> >> return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
> >> }
> >> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
> >> pud_t *pud;
> >> pmd_t *pmd;
> >> pte_t *pte;
> >> + int ret;
> >>
> >> __set_current_state(TASK_RUNNING);
> >>
> >> - count_vm_event(PGFAULT);
> >> -
> >> - if (unlikely(is_vm_hugetlb_page(vma)))
> >> - return hugetlb_fault(mm, vma, address, write_access);
> >> + if (unlikely(is_vm_hugetlb_page(vma))) {
> >> + ret = hugetlb_fault(mm, vma, address, write_access);
> >> + goto out;
> >> + }
> >>
> >> + ret = VM_FAULT_OOM;
> >> pgd = pgd_offset(mm, address);
> >> pud = pud_alloc(mm, pgd, address);
> >> if (!pud)
> >> - return VM_FAULT_OOM;
> >> + goto out;
> >> pmd = pmd_alloc(mm, pud, address);
> >> if (!pmd)
> >> - return VM_FAULT_OOM;
> >> + goto out;
> >> pte = pte_alloc_map(mm, pmd, address);
> >> if (!pte)
> >> - return VM_FAULT_OOM;
> >> + goto out;
> >>
> >> - return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> >> + ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> >> +out:
> >> + if (!(ret & VM_FAULT_RETRY))
> >> + count_vm_event(PGFAULT);
> >> + return ret;
> >> }
> >>
> >> #ifndef __PAGETABLE_PUD_FOLDED
> >
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
2009-04-08 20:02 Ying Han
2009-04-09 7:47 ` Wu Fengguang
@ 2009-04-10 6:02 ` Andrew Morton
2009-04-10 6:32 ` Ying Han
2009-04-10 7:30 ` Wu Fengguang
1 sibling, 2 replies; 11+ messages in thread
From: Andrew Morton @ 2009-04-10 6:02 UTC (permalink / raw)
To: Ying Han
Cc: linux-mm, linux-kernel, torvalds, Ingo Molnar, Mike Waychison,
Rohit Seth, Hugh Dickins, Peter Zijlstra, H. Peter Anvin,
Török Edwin, Lee Schermerhorn, Nick Piggin,
Wu Fengguang
> Subject: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
Please give each patch in the series a unique and meaningful title.
On Wed, 8 Apr 2009 13:02:35 -0700 Ying Han <yinghan@google.com> wrote:
> support for FAULT_FLAG_RETRY with no user change:
yup, we'd prefer a complete changelog here please.
> Signed-off-by: Ying Han <yinghan@google.com>
> Mike Waychison <mikew@google.com>
This form:
Signed-off-by: Ying Han <yinghan@google.com>
Signed-off-by: Mike Waychison <mikew@google.com>
is conventional.
> index 4a853ef..29c2c39 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -793,7 +793,7 @@ struct file_ra_state {
> there are only # of pages ahead */
>
> unsigned int ra_pages; /* Maximum readahead window */
> - int mmap_miss; /* Cache miss stat for mmap accesses */
> + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
This change makes sense, but we're not told the reasons for making it?
Did it fix a bug, or is it an unrelated fixlet, or...?
> loff_t prev_pos; /* Cache last read() position */
> };
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ffee2f7..5a134a9 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
>
> #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
> #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
> +#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */
>
>
> /*
> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
>
> #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
>
> +#define VM_FAULT_RETRY 0x0010
> #define VM_FAULT_OOM 0x0001
> #define VM_FAULT_SIGBUS 0x0002
> #define VM_FAULT_MAJOR 0x0004
> diff --git a/mm/filemap.c b/mm/filemap.c
> index f3e5f89..6eb7c36 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -714,6 +714,58 @@ repeat:
> EXPORT_SYMBOL(find_lock_page);
>
> /**
> + * find_lock_page_retry - locate, pin and lock a pagecache page
> + * @mapping: the address_space to search
> + * @offset: the page index
> + * @vma: vma in which the fault was taken
> + * @ppage: zero if page not present, otherwise point to the page in pagecache
> + * @retry: 1 indicate caller tolerate a retry.
> + *
> + * If retry flag is on, and page is already locked by someone else, return
> + * a hint of retry and leave *ppage untouched.
> + *
> + * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
> + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
> + * hint to caller for retry, or ret=0 which means page is succefully
> + * locked.
> + */
How about this:
If the page was not found in pagecache, find_lock_page_retry()
returns 0 and sets *@ppage to NULL.
If the page was found in pagecache but is locked and @retry is
true, find_lock_page_retry() returns VM_FAULT_RETRY and does not
write to *@ppage.
If the page was found in pagecache and @retry is false,
find_lock_page_retry() locks the page, writes its address to *@ppage
and returns 0.
> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
> + struct vm_area_struct *vma, struct page **ppage,
> + int retry)
> +{
> + unsigned int ret = 0;
> + struct page *page;
> +
> +repeat:
> + page = find_get_page(mapping, offset);
> + if (page) {
> + if (!retry)
> + lock_page(page);
> + else {
> + if (!trylock_page(page)) {
> + struct mm_struct *mm = vma->vm_mm;
> +
> + up_read(&mm->mmap_sem);
> + wait_on_page_locked(page);
It looks strange that we wait for the page lock and then don't
just lock the page and return it.
Adding a comment explaining _why_ we do this would be nice.
> + down_read(&mm->mmap_sem);
> +
> + page_cache_release(page);
> + return VM_FAULT_RETRY;
> + }
> + }
> + if (unlikely(page->mapping != mapping)) {
> + unlock_page(page);
> + page_cache_release(page);
> + goto repeat;
> + }
> + VM_BUG_ON(page->index != offset);
> + }
> + *ppage = page;
> + return ret;
> +}
> +EXPORT_SYMBOL(find_lock_page_retry);
> +
> +/**
> * find_or_create_page - locate or add a pagecache page
> * @mapping: the page's address_space
> * @index: the page's index into the mapping
> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> pgoff_t size;
> int did_readaround = 0;
> int ret = 0;
> + int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
> + int retry_ret;
>
> size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> if (vmf->pgoff >= size)
> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> */
> retry_find:
> page = find_lock_page(mapping, vmf->pgoff);
> +
> /*
> * For sequential accesses, we use the generic readahead logic.
> */
> @@ -1465,7 +1520,13 @@ retry_find:
> if (!page) {
> page_cache_sync_readahead(mapping, ra, file,
> vmf->pgoff, 1);
> - page = find_lock_page(mapping, vmf->pgoff);
> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> + vma, &page, retry_flag);
> + if (retry_ret == VM_FAULT_RETRY) {
> + /* counteract the followed retry hit */
> + ra->mmap_miss++;
> + return retry_ret;
> + }
> if (!page)
> goto no_cached_page;
> }
> @@ -1504,7 +1565,14 @@ retry_find:
> start = vmf->pgoff - ra_pages / 2;
> do_page_cache_readahead(mapping, file, start, ra_pages);
> }
> - page = find_lock_page(mapping, vmf->pgoff);
> +retry_find_retry:
> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> + vma, &page, retry_flag);
> + if (retry_ret == VM_FAULT_RETRY) {
> + /* counteract the followed retry hit */
> + ra->mmap_miss++;
> + return retry_ret;
> + }
> if (!page)
> goto no_cached_page;
> }
> @@ -1548,7 +1616,7 @@ no_cached_page:
> * meantime, we'll just come back here and read it again.
> */
> if (error >= 0)
> - goto retry_find;
> + goto retry_find_retry;
>
> /*
> * An error return from page_cache_read can result if the
> diff --git a/mm/memory.c b/mm/memory.c
> index 164951c..5e215c9 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
> vmf.page = NULL;
>
> ret = vma->vm_ops->fault(vma, &vmf);
> +
> + /* page may be available, but we have to restart the process
> + * because mmap_sem was dropped during the ->fault
> + */
The preferred comment layout is:
/*
* The page may be available, but we have to restart the process
* because mmap_sem was dropped during the ->fault
*/
> + if (ret & VM_FAULT_RETRY)
> + return ret;
Again, I worry that in some places we treat VM_FAULT_RETRY as an
enumerated type:
if (foo == VM_FAULT_RETRY)
but in other places we treat it as a bitfield
if (foo & VM_FAULT_RETRY)
it may not be buggy, but it _looks_ buggy. And confusing and
inconsistent and dangerous.
Can it be improved?
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> return ret;
>
> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
> {
> pgoff_t pgoff = (((address & PAGE_MASK)
> - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> + int write = write_access & ~FAULT_FLAG_RETRY;
> + unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>
> + flags |= (write_access & FAULT_FLAG_RETRY);
gee, I'm lost.
Can we please redo this as:
int write;
unsigned int flags;
/*
* Big fat comment explaining the next three lines goes here
*/
write = write_access & ~FAULT_FLAG_RETRY;
unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
flags |= (write_access & FAULT_FLAG_RETRY);
?
> pte_unmap(page_table);
> return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
> }
> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
> pud_t *pud;
> pmd_t *pmd;
> pte_t *pte;
> + int ret;
>
> __set_current_state(TASK_RUNNING);
>
> - count_vm_event(PGFAULT);
> -
> - if (unlikely(is_vm_hugetlb_page(vma)))
> - return hugetlb_fault(mm, vma, address, write_access);
> + if (unlikely(is_vm_hugetlb_page(vma))) {
> + ret = hugetlb_fault(mm, vma, address, write_access);
> + goto out;
> + }
>
> + ret = VM_FAULT_OOM;
> pgd = pgd_offset(mm, address);
> pud = pud_alloc(mm, pgd, address);
> if (!pud)
> - return VM_FAULT_OOM;
> + goto out;
> pmd = pmd_alloc(mm, pud, address);
> if (!pmd)
> - return VM_FAULT_OOM;
> + goto out;
> pte = pte_alloc_map(mm, pmd, address);
> if (!pte)
> - return VM_FAULT_OOM;
> + goto out;
>
> - return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> + ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> +out:
> + if (!(ret & VM_FAULT_RETRY))
> + count_vm_event(PGFAULT);
> + return ret;
> }
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
2009-04-10 6:02 ` Andrew Morton
@ 2009-04-10 6:32 ` Ying Han
2009-04-10 6:48 ` Wu Fengguang
2009-04-10 7:30 ` Wu Fengguang
1 sibling, 1 reply; 11+ messages in thread
From: Ying Han @ 2009-04-10 6:32 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, linux-kernel, torvalds, Ingo Molnar, Mike Waychison,
Rohit Seth, Hugh Dickins, Peter Zijlstra, H. Peter Anvin,
Török Edwin, Lee Schermerhorn, Nick Piggin,
Wu Fengguang
2009/4/9 Andrew Morton <akpm@linux-foundation.org>:
>
>> Subject: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
>
> Please give each patch in the series a unique and meaningful title.
>
> On Wed, 8 Apr 2009 13:02:35 -0700 Ying Han <yinghan@google.com> wrote:
>
>> support for FAULT_FLAG_RETRY with no user change:
>
> yup, we'd prefer a complete changelog here please.
>
>> Signed-off-by: Ying Han <yinghan@google.com>
>> Mike Waychison <mikew@google.com>
>
> This form:
>
> Signed-off-by: Ying Han <yinghan@google.com>
> Signed-off-by: Mike Waychison <mikew@google.com>
Thanks Andrew, and i need to add Fengguang to Signed-off-by.
>
> is conventional.
>
>> index 4a853ef..29c2c39 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -793,7 +793,7 @@ struct file_ra_state {
>> there are only # of pages ahead */
>>
>> unsigned int ra_pages; /* Maximum readahead window */
>> - int mmap_miss; /* Cache miss stat for mmap accesses */
>> + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
>
> This change makes sense, but we're not told the reasons for making it?
> Did it fix a bug, or is it an unrelated fixlet, or...?
Fengguang: Could you help making comments on this part? and i will
make changes elsewhere
as Andrew pointed. Thanks
>
>> loff_t prev_pos; /* Cache last read() position */
>> };
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index ffee2f7..5a134a9 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
>>
>> #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
>> #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
>> +#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */
>>
>>
>> /*
>> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
>>
>> #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
>>
>> +#define VM_FAULT_RETRY 0x0010
>> #define VM_FAULT_OOM 0x0001
>> #define VM_FAULT_SIGBUS 0x0002
>> #define VM_FAULT_MAJOR 0x0004
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index f3e5f89..6eb7c36 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -714,6 +714,58 @@ repeat:
>> EXPORT_SYMBOL(find_lock_page);
>>
>> /**
>> + * find_lock_page_retry - locate, pin and lock a pagecache page
>> + * @mapping: the address_space to search
>> + * @offset: the page index
>> + * @vma: vma in which the fault was taken
>> + * @ppage: zero if page not present, otherwise point to the page in pagecache
>> + * @retry: 1 indicate caller tolerate a retry.
>> + *
>> + * If retry flag is on, and page is already locked by someone else, return
>> + * a hint of retry and leave *ppage untouched.
>> + *
>> + * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
>> + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
>> + * hint to caller for retry, or ret=0 which means page is succefully
>> + * locked.
>> + */
>
> How about this:
>
> If the page was not found in pagecache, find_lock_page_retry()
> returns 0 and sets *@ppage to NULL.
>
> If the page was found in pagecache but is locked and @retry is
> true, find_lock_page_retry() returns VM_FAULT_RETRY and does not
> write to *@ppage.
>
> If the page was found in pagecache and @retry is false,
> find_lock_page_retry() locks the page, writes its address to *@ppage
> and returns 0.
>
>> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
>> + struct vm_area_struct *vma, struct page **ppage,
>> + int retry)
>> +{
>> + unsigned int ret = 0;
>> + struct page *page;
>> +
>> +repeat:
>> + page = find_get_page(mapping, offset);
>> + if (page) {
>> + if (!retry)
>> + lock_page(page);
>> + else {
>> + if (!trylock_page(page)) {
>> + struct mm_struct *mm = vma->vm_mm;
>> +
>> + up_read(&mm->mmap_sem);
>> + wait_on_page_locked(page);
>
> It looks strange that we wait for the page lock and then don't
> just lock the page and return it.
>
> Adding a comment explaining _why_ we do this would be nice.
>
>> + down_read(&mm->mmap_sem);
>> +
>> + page_cache_release(page);
>> + return VM_FAULT_RETRY;
>> + }
>> + }
>> + if (unlikely(page->mapping != mapping)) {
>> + unlock_page(page);
>> + page_cache_release(page);
>> + goto repeat;
>> + }
>> + VM_BUG_ON(page->index != offset);
>> + }
>> + *ppage = page;
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(find_lock_page_retry);
>> +
>> +/**
>> * find_or_create_page - locate or add a pagecache page
>> * @mapping: the page's address_space
>> * @index: the page's index into the mapping
>> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>> pgoff_t size;
>> int did_readaround = 0;
>> int ret = 0;
>> + int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
>> + int retry_ret;
>>
>> size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
>> if (vmf->pgoff >= size)
>> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>> */
>> retry_find:
>> page = find_lock_page(mapping, vmf->pgoff);
>> +
>> /*
>> * For sequential accesses, we use the generic readahead logic.
>> */
>> @@ -1465,7 +1520,13 @@ retry_find:
>> if (!page) {
>> page_cache_sync_readahead(mapping, ra, file,
>> vmf->pgoff, 1);
>> - page = find_lock_page(mapping, vmf->pgoff);
>> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>> + vma, &page, retry_flag);
>> + if (retry_ret == VM_FAULT_RETRY) {
>> + /* counteract the followed retry hit */
>> + ra->mmap_miss++;
>> + return retry_ret;
>> + }
>> if (!page)
>> goto no_cached_page;
>> }
>> @@ -1504,7 +1565,14 @@ retry_find:
>> start = vmf->pgoff - ra_pages / 2;
>> do_page_cache_readahead(mapping, file, start, ra_pages);
>> }
>> - page = find_lock_page(mapping, vmf->pgoff);
>> +retry_find_retry:
>> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>> + vma, &page, retry_flag);
>> + if (retry_ret == VM_FAULT_RETRY) {
>> + /* counteract the followed retry hit */
>> + ra->mmap_miss++;
>> + return retry_ret;
>> + }
>> if (!page)
>> goto no_cached_page;
>> }
>> @@ -1548,7 +1616,7 @@ no_cached_page:
>> * meantime, we'll just come back here and read it again.
>> */
>> if (error >= 0)
>> - goto retry_find;
>> + goto retry_find_retry;
>>
>> /*
>> * An error return from page_cache_read can result if the
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 164951c..5e215c9 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
>> vmf.page = NULL;
>>
>> ret = vma->vm_ops->fault(vma, &vmf);
>> +
>> + /* page may be available, but we have to restart the process
>> + * because mmap_sem was dropped during the ->fault
>> + */
>
> The preferred comment layout is:
>
> /*
> * The page may be available, but we have to restart the process
> * because mmap_sem was dropped during the ->fault
> */
>
>> + if (ret & VM_FAULT_RETRY)
>> + return ret;
>
> Again, I worry that in some places we treat VM_FAULT_RETRY as an
> enumerated type:
>
> if (foo == VM_FAULT_RETRY)
>
> but in other places we treat it as a bitfield
>
> if (foo & VM_FAULT_RETRY)
>
> it may not be buggy, but it _looks_ buggy. And confusing and
> inconsistent and dangerous.
>
> Can it be improved?
>
>> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
>> return ret;
>>
>> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
>> {
>> pgoff_t pgoff = (((address & PAGE_MASK)
>> - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
>> - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
>> + int write = write_access & ~FAULT_FLAG_RETRY;
>> + unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>>
>> + flags |= (write_access & FAULT_FLAG_RETRY);
>
> gee, I'm lost.
>
> Can we please redo this as:
>
>
> int write;
> unsigned int flags;
>
> /*
> * Big fat comment explaining the next three lines goes here
> */
> write = write_access & ~FAULT_FLAG_RETRY;
> unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
> flags |= (write_access & FAULT_FLAG_RETRY);
>
> ?
>
>> pte_unmap(page_table);
>> return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
>> }
>> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
>> pud_t *pud;
>> pmd_t *pmd;
>> pte_t *pte;
>> + int ret;
>>
>> __set_current_state(TASK_RUNNING);
>>
>> - count_vm_event(PGFAULT);
>> -
>> - if (unlikely(is_vm_hugetlb_page(vma)))
>> - return hugetlb_fault(mm, vma, address, write_access);
>> + if (unlikely(is_vm_hugetlb_page(vma))) {
>> + ret = hugetlb_fault(mm, vma, address, write_access);
>> + goto out;
>> + }
>>
>> + ret = VM_FAULT_OOM;
>> pgd = pgd_offset(mm, address);
>> pud = pud_alloc(mm, pgd, address);
>> if (!pud)
>> - return VM_FAULT_OOM;
>> + goto out;
>> pmd = pmd_alloc(mm, pud, address);
>> if (!pmd)
>> - return VM_FAULT_OOM;
>> + goto out;
>> pte = pte_alloc_map(mm, pmd, address);
>> if (!pte)
>> - return VM_FAULT_OOM;
>> + goto out;
>>
>> - return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
>> + ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
>> +out:
>> + if (!(ret & VM_FAULT_RETRY))
>> + count_vm_event(PGFAULT);
>> + return ret;
>> }
>
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
2009-04-10 6:32 ` Ying Han
@ 2009-04-10 6:48 ` Wu Fengguang
0 siblings, 0 replies; 11+ messages in thread
From: Wu Fengguang @ 2009-04-10 6:48 UTC (permalink / raw)
To: Ying Han
Cc: Andrew Morton, linux-mm, linux-kernel, torvalds, Ingo Molnar,
Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
H. Peter Anvin, Török Edwin, Lee Schermerhorn,
Nick Piggin
On Fri, Apr 10, 2009 at 02:32:01PM +0800, Ying Han wrote:
> 2009/4/9 Andrew Morton <akpm@linux-foundation.org>:
> >
> >> Subject: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
> >
> > Please give each patch in the series a unique and meaningful title.
> >
> > On Wed, 8 Apr 2009 13:02:35 -0700 Ying Han <yinghan@google.com> wrote:
> >
> >> support for FAULT_FLAG_RETRY with no user change:
> >
> > yup, we'd prefer a complete changelog here please.
> >
> >> Signed-off-by: Ying Han <yinghan@google.com>
> >> Mike Waychison <mikew@google.com>
> >
> > This form:
> >
> > Signed-off-by: Ying Han <yinghan@google.com>
> > Signed-off-by: Mike Waychison <mikew@google.com>
>
> Thanks Andrew, and i need to add Fengguang to Signed-off-by.
Thank you.
> >
> > is conventional.
> >
> >> index 4a853ef..29c2c39 100644
> >> --- a/include/linux/fs.h
> >> +++ b/include/linux/fs.h
> >> @@ -793,7 +793,7 @@ struct file_ra_state {
> >> there are only # of pages ahead */
> >>
> >> unsigned int ra_pages; /* Maximum readahead window */
> >> - int mmap_miss; /* Cache miss stat for mmap accesses */
> >> + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
> >
> > This change makes sense, but we're not told the reasons for making it?
> > Did it fix a bug, or is it an unrelated fixlet, or...?
>
> Fengguang: Could you help making comments on this part? and i will
> make changes elsewhere as Andrew pointed. Thanks
Ah this may deserve a standalone patch:
---
readhead: make mmap_miss an unsigned int
This makes the performance impact of possible mmap_miss wrap around to be
temporary and tolerable: i.e. MMAP_LOTSAMISS=100 extra readarounds.
Otherwise if ever mmap_miss wraps around to negative, it takes INT_MAX
cache misses to bring it back to normal state. During the time mmap
readaround will be _enabled_ for whatever wild random workload. That's
almost permanent performance impact.
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
include/linux/fs.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- mm.orig/include/linux/fs.h
+++ mm/include/linux/fs.h
@@ -824,7 +824,7 @@ struct file_ra_state {
there are only # of pages ahead */
unsigned int ra_pages; /* Maximum readahead window */
- int mmap_miss; /* Cache miss stat for mmap accesses */
+ unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
loff_t prev_pos; /* Cache last read() position */
};
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
2009-04-10 6:02 ` Andrew Morton
2009-04-10 6:32 ` Ying Han
@ 2009-04-10 7:30 ` Wu Fengguang
2009-04-10 16:01 ` Linus Torvalds
1 sibling, 1 reply; 11+ messages in thread
From: Wu Fengguang @ 2009-04-10 7:30 UTC (permalink / raw)
To: Andrew Morton
Cc: Ying Han, linux-mm, linux-kernel, torvalds, Ingo Molnar,
Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
H. Peter Anvin, Török Edwin, Lee Schermerhorn,
Nick Piggin
On Fri, Apr 10, 2009 at 02:02:05PM +0800, Andrew Morton wrote:
[snip]
> > if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> > return ret;
> >
> > @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
> > {
> > pgoff_t pgoff = (((address & PAGE_MASK)
> > - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> > - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> > + int write = write_access & ~FAULT_FLAG_RETRY;
> > + unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
> >
> > + flags |= (write_access & FAULT_FLAG_RETRY);
>
> gee, I'm lost.
So did me.
> Can we please redo this as:
>
>
> int write;
> unsigned int flags;
>
> /*
> * Big fat comment explaining the next three lines goes here
> */
Basically it's doing a
(is_write_access | FAULT_FLAG_RETRY) =>
(FAULT_FLAG_WRITE | FAULT_FLAG_RETRY)
by extracting the bool part:
> write = write_access & ~FAULT_FLAG_RETRY;
convert bool to a bit flag:
> unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
and restore the FAULT_FLAG_RETRY:
> flags |= (write_access & FAULT_FLAG_RETRY);
Thanks,
Fengguang
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
2009-04-10 7:30 ` Wu Fengguang
@ 2009-04-10 16:01 ` Linus Torvalds
2009-04-10 17:57 ` Ying Han
0 siblings, 1 reply; 11+ messages in thread
From: Linus Torvalds @ 2009-04-10 16:01 UTC (permalink / raw)
To: Wu Fengguang
Cc: Andrew Morton, Ying Han, linux-mm, linux-kernel, Ingo Molnar,
Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
H. Peter Anvin, Török Edwin, Lee Schermerhorn,
Nick Piggin
On Fri, 10 Apr 2009, Wu Fengguang wrote:
> On Fri, Apr 10, 2009 at 02:02:05PM +0800, Andrew Morton wrote:
>
> > Can we please redo this as:
> >
> >
> > int write;
> > unsigned int flags;
> >
> > /*
> > * Big fat comment explaining the next three lines goes here
> > */
>
> Basically it's doing a
> (is_write_access | FAULT_FLAG_RETRY) =>
> (FAULT_FLAG_WRITE | FAULT_FLAG_RETRY)
> by extracting the bool part:
> > write = write_access & ~FAULT_FLAG_RETRY;
> convert bool to a bit flag:
> > unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
The point is, we shouldn't do that.
Your code is confused, because it uses "write_access" as if it had the old
behaviour (boolean to say "write") _plus_ the new behavior (bitmask to say
"retry"), and that's just wrong.
Just get rid of "write_access" entirely, and switch it over to something
that is a pure bitmask.
Yes, it means a couple of new preliminary patches that switch all callers
of handle_mm_fault() over to using the VM_FLAGS, but that's not a big
deal.
I'm following up this email with two _example_ patches. They are untested,
but they look sane. I'd like the series to _start_ with these, and then
you can pass FAULT_FLAGS_WRITE | FAULT_FLAGS_RETRY down to
handle_mm_fault() cleanly.
Hmm? Note the _untested_ part on the patches to follow. It was done very
mechanically, and the patches look sane, but .. !!!
Linus
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
2009-04-10 16:01 ` Linus Torvalds
@ 2009-04-10 17:57 ` Ying Han
0 siblings, 0 replies; 11+ messages in thread
From: Ying Han @ 2009-04-10 17:57 UTC (permalink / raw)
To: Linus Torvalds
Cc: Wu Fengguang, Andrew Morton, linux-mm, linux-kernel, Ingo Molnar,
Mike Waychison, Rohit Seth, Hugh Dickins, Peter Zijlstra,
H. Peter Anvin, Török Edwin, Lee Schermerhorn,
Nick Piggin
2009/4/10 Linus Torvalds <torvalds@linux-foundation.org>:
>
>
> On Fri, 10 Apr 2009, Wu Fengguang wrote:
>
>> On Fri, Apr 10, 2009 at 02:02:05PM +0800, Andrew Morton wrote:
>>
>> > Can we please redo this as:
>> >
>> >
>> > int write;
>> > unsigned int flags;
>> >
>> > /*
>> > * Big fat comment explaining the next three lines goes here
>> > */
>>
>> Basically it's doing a
>> (is_write_access | FAULT_FLAG_RETRY) =>
>> (FAULT_FLAG_WRITE | FAULT_FLAG_RETRY)
>> by extracting the bool part:
>> > write = write_access & ~FAULT_FLAG_RETRY;
>> convert bool to a bit flag:
>> > unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
Fengguang: thank you for your comments. i agree this is confusing...
>
> The point is, we shouldn't do that.
>
> Your code is confused, because it uses "write_access" as if it had the old
> behaviour (boolean to say "write") _plus_ the new behavior (bitmask to say
> "retry"), and that's just wrong.
>
> Just get rid of "write_access" entirely, and switch it over to something
> that is a pure bitmask.
>
> Yes, it means a couple of new preliminary patches that switch all callers
> of handle_mm_fault() over to using the VM_FLAGS, but that's not a big
> deal.
>
> I'm following up this email with two _example_ patches. They are untested,
> but they look sane. I'd like the series to _start_ with these, and then
> you can pass FAULT_FLAGS_WRITE | FAULT_FLAGS_RETRY down to
> handle_mm_fault() cleanly.
>
> Hmm? Note the _untested_ part on the patches to follow. It was done very
> mechanically, and the patches look sane, but .. !!!
Thanks Linus for your comments. I will take Peter Zijlstra's patches
(the _untested_ part)
which basically replaces the write_access as a flag as you mentioned
and start from there.
My next step is to cleanup the patch with comments in the thread so
far and post the new version.
Anything else i missed, please let me know. thanks
--Ying
>
> Linus
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2009-04-10 17:56 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-08 20:06 [PATCH][1/2]page_fault retry with NOPAGE_RETRY Ying Han
-- strict thread matches above, loose matches on Subject: below --
2009-04-08 20:02 Ying Han
2009-04-09 7:47 ` Wu Fengguang
2009-04-09 16:21 ` Ying Han
2009-04-10 0:25 ` Wu Fengguang
2009-04-10 6:02 ` Andrew Morton
2009-04-10 6:32 ` Ying Han
2009-04-10 6:48 ` Wu Fengguang
2009-04-10 7:30 ` Wu Fengguang
2009-04-10 16:01 ` Linus Torvalds
2009-04-10 17:57 ` Ying Han
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox