linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [patch 3/8] mm: merge nopfn into fault
@ 2007-05-18  7:37 akpm, Nick Piggin
  2007-05-18 15:23 ` Linus Torvalds
  0 siblings, 1 reply; 19+ messages in thread
From: akpm, Nick Piggin @ 2007-05-18  7:37 UTC (permalink / raw)
  To: torvalds; +Cc: linux-mm, akpm, npiggin

Remove ->nopfn and reimplement the existing handlers with ->fault

[akpm@linux-foundation.org: mpspec.c build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/powerpc/platforms/cell/spufs/file.c |   92 +++++++++++----------
 drivers/char/mspec.c                     |   27 ++++--
 include/linux/mm.h                       |    9 --
 mm/memory.c                              |   58 +------------
 4 files changed, 72 insertions(+), 114 deletions(-)

diff -puN arch/powerpc/platforms/cell/spufs/file.c~mm-merge-nopfn-into-fault arch/powerpc/platforms/cell/spufs/file.c
--- a/arch/powerpc/platforms/cell/spufs/file.c~mm-merge-nopfn-into-fault
+++ a/arch/powerpc/platforms/cell/spufs/file.c
@@ -115,8 +115,8 @@ spufs_mem_write(struct file *file, const
 	return size;
 }
 
-static unsigned long spufs_mem_mmap_nopfn(struct vm_area_struct *vma,
-					  unsigned long address)
+static struct page *spufs_mem_mmap_fault(struct vm_area_struct *vma,
+					  struct fault_data *fdata)
 {
 	struct spu_context *ctx	= vma->vm_file->private_data;
 	unsigned long pfn, offset, addr0 = address;
@@ -137,9 +137,11 @@ static unsigned long spufs_mem_mmap_nopf
 	}
 #endif /* CONFIG_SPU_FS_64K_LS */
 
-	offset = (address - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
-	if (offset >= LS_SIZE)
-		return NOPFN_SIGBUS;
+	offset = fdata->pgoff << PAGE_SHIFT
+	if (offset >= LS_SIZE) {
+		fdata->type = VM_FAULT_SIGBUS;
+		return NULL;
+	}
 
 	pr_debug("spufs_mem_mmap_nopfn address=0x%lx -> 0x%lx, offset=0x%lx\n",
 		 addr0, address, offset);
@@ -155,16 +157,17 @@ static unsigned long spufs_mem_mmap_nopf
 					     | _PAGE_NO_CACHE);
 		pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT;
 	}
-	vm_insert_pfn(vma, address, pfn);
+	vm_insert_pfn(vma, fdata->address, pfn);
 
 	spu_release(ctx);
 
-	return NOPFN_REFAULT;
+	fdata->type = VM_FAULT_MINOR;
+	return NULL;
 }
 
 
 static struct vm_operations_struct spufs_mem_mmap_vmops = {
-	.nopfn = spufs_mem_mmap_nopfn,
+	.fault = spufs_mem_mmap_fault,
 };
 
 static int spufs_mem_mmap(struct file *file, struct vm_area_struct *vma)
@@ -226,42 +229,45 @@ static const struct file_operations spuf
 #endif
 };
 
-static unsigned long spufs_ps_nopfn(struct vm_area_struct *vma,
-				    unsigned long address,
+static struct page *spufs_ps_fault(struct vm_area_struct *vma,
+				    struct fault_data *fdata,
 				    unsigned long ps_offs,
 				    unsigned long ps_size)
 {
 	struct spu_context *ctx = vma->vm_file->private_data;
-	unsigned long area, offset = address - vma->vm_start;
+	unsigned long area, offset = fdata->pgoff << PAGE_SHIFT;
 	int ret;
 
-	offset += vma->vm_pgoff << PAGE_SHIFT;
-	if (offset >= ps_size)
-		return NOPFN_SIGBUS;
+	if (offset >= ps_size) {
+		fdata->type = VM_FAULT_SIGBUS;
+		return NULL;
+	}
+
+	fdata->type = VM_FAULT_MINOR;
 
 	/* error here usually means a signal.. we might want to test
 	 * the error code more precisely though
 	 */
 	ret = spu_acquire_runnable(ctx, 0);
 	if (ret)
-		return NOPFN_REFAULT;
+		return NULL;
 
 	area = ctx->spu->problem_phys + ps_offs;
-	vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT);
+	vm_insert_pfn(vma, fdata->address, (area + offset) >> PAGE_SHIFT);
 	spu_release(ctx);
 
-	return NOPFN_REFAULT;
+	return NULL;
 }
 
 #if SPUFS_MMAP_4K
-static unsigned long spufs_cntl_mmap_nopfn(struct vm_area_struct *vma,
-					   unsigned long address)
+static struct page *spufs_cntl_mmap_fault(struct vm_area_struct *vma,
+					   struct fault_data *fdata)
 {
-	return spufs_ps_nopfn(vma, address, 0x4000, 0x1000);
+	return spufs_ps_fault(vma, fdata, 0x4000, 0x1000);
 }
 
 static struct vm_operations_struct spufs_cntl_mmap_vmops = {
-	.nopfn = spufs_cntl_mmap_nopfn,
+	.fault = spufs_cntl_mmap_fault,
 };
 
 /*
@@ -891,23 +897,23 @@ static ssize_t spufs_signal1_write(struc
 	return 4;
 }
 
-static unsigned long spufs_signal1_mmap_nopfn(struct vm_area_struct *vma,
-					      unsigned long address)
+static struct page *spufs_signal1_mmap_fault(struct vm_area_struct *vma,
+					      struct fault_data *fdata)
 {
 #if PAGE_SIZE == 0x1000
-	return spufs_ps_nopfn(vma, address, 0x14000, 0x1000);
+	return spufs_ps_fault(vma, fdata, 0x14000, 0x1000);
 #elif PAGE_SIZE == 0x10000
 	/* For 64k pages, both signal1 and signal2 can be used to mmap the whole
 	 * signal 1 and 2 area
 	 */
-	return spufs_ps_nopfn(vma, address, 0x10000, 0x10000);
+	return spufs_ps_fault(vma, fdata, 0x10000, 0x10000);
 #else
 #error unsupported page size
 #endif
 }
 
 static struct vm_operations_struct spufs_signal1_mmap_vmops = {
-	.nopfn = spufs_signal1_mmap_nopfn,
+	.fault = spufs_signal1_mmap_fault,
 };
 
 static int spufs_signal1_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1016,23 +1022,23 @@ static ssize_t spufs_signal2_write(struc
 }
 
 #if SPUFS_MMAP_4K
-static unsigned long spufs_signal2_mmap_nopfn(struct vm_area_struct *vma,
-					      unsigned long address)
+static struct page *spufs_signal2_mmap_fault(struct vm_area_struct *vma,
+					      struct fault_data *fdata)
 {
 #if PAGE_SIZE == 0x1000
-	return spufs_ps_nopfn(vma, address, 0x1c000, 0x1000);
+	return spufs_ps_fault(vma, fdata, 0x1c000, 0x1000);
 #elif PAGE_SIZE == 0x10000
 	/* For 64k pages, both signal1 and signal2 can be used to mmap the whole
 	 * signal 1 and 2 area
 	 */
-	return spufs_ps_nopfn(vma, address, 0x10000, 0x10000);
+	return spufs_ps_fault(vma, fdata, 0x10000, 0x10000);
 #else
 #error unsupported page size
 #endif
 }
 
 static struct vm_operations_struct spufs_signal2_mmap_vmops = {
-	.nopfn = spufs_signal2_mmap_nopfn,
+	.fault = spufs_signal2_mmap_fault,
 };
 
 static int spufs_signal2_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1118,14 +1124,14 @@ DEFINE_SIMPLE_ATTRIBUTE(spufs_signal2_ty
 					spufs_signal2_type_set, "%llu");
 
 #if SPUFS_MMAP_4K
-static unsigned long spufs_mss_mmap_nopfn(struct vm_area_struct *vma,
-					  unsigned long address)
+static struct page *spufs_mss_mmap_fault(struct vm_area_struct *vma,
+					  struct fault_data *fdata)
 {
-	return spufs_ps_nopfn(vma, address, 0x0000, 0x1000);
+	return spufs_ps_fault(vma, fdata, 0x0000, 0x1000);
 }
 
 static struct vm_operations_struct spufs_mss_mmap_vmops = {
-	.nopfn = spufs_mss_mmap_nopfn,
+	.fault = spufs_mss_mmap_fault,
 };
 
 /*
@@ -1180,14 +1186,14 @@ static const struct file_operations spuf
 	.mmap	 = spufs_mss_mmap,
 };
 
-static unsigned long spufs_psmap_mmap_nopfn(struct vm_area_struct *vma,
-					    unsigned long address)
+static struct page *spufs_psmap_mmap_fault(struct vm_area_struct *vma,
+					    struct fault_data *fdata)
 {
-	return spufs_ps_nopfn(vma, address, 0x0000, 0x20000);
+	return spufs_ps_fault(vma, fdata, 0x0000, 0x20000);
 }
 
 static struct vm_operations_struct spufs_psmap_mmap_vmops = {
-	.nopfn = spufs_psmap_mmap_nopfn,
+	.fault = spufs_psmap_mmap_fault,
 };
 
 /*
@@ -1240,14 +1246,14 @@ static const struct file_operations spuf
 
 
 #if SPUFS_MMAP_4K
-static unsigned long spufs_mfc_mmap_nopfn(struct vm_area_struct *vma,
-					  unsigned long address)
+static struct page *spufs_mfc_mmap_fault(struct vm_area_struct *vma,
+					  struct fault_data *fdata)
 {
-	return spufs_ps_nopfn(vma, address, 0x3000, 0x1000);
+	return spufs_ps_fault(vma, fdata, 0x3000, 0x1000);
 }
 
 static struct vm_operations_struct spufs_mfc_mmap_vmops = {
-	.nopfn = spufs_mfc_mmap_nopfn,
+	.fault = spufs_mfc_mmap_fault,
 };
 
 /*
diff -puN drivers/char/mspec.c~mm-merge-nopfn-into-fault drivers/char/mspec.c
--- a/drivers/char/mspec.c~mm-merge-nopfn-into-fault
+++ a/drivers/char/mspec.c
@@ -182,24 +182,25 @@ mspec_close(struct vm_area_struct *vma)
 
 
 /*
- * mspec_nopfn
+ * mspec_fault
  *
  * Creates a mspec page and maps it to user space.
  */
-static unsigned long
-mspec_nopfn(struct vm_area_struct *vma, unsigned long address)
+static struct page *
+mspec_fault(struct vm_area_struct *vma, struct fault_data *fdata)
 {
 	unsigned long paddr, maddr;
 	unsigned long pfn;
-	int index;
+	int index = fdata->pgoff;
 	struct vma_data *vdata = vma->vm_private_data;
 
-	index = (address - vma->vm_start) >> PAGE_SHIFT;
 	maddr = (volatile unsigned long) vdata->maddr[index];
 	if (maddr == 0) {
 		maddr = uncached_alloc_page(numa_node_id());
-		if (maddr == 0)
-			return NOPFN_OOM;
+		if (maddr == 0) {
+			fdata->type = VM_FAULT_OOM;
+			return NULL;
+		}
 
 		spin_lock(&vdata->lock);
 		if (vdata->maddr[index] == 0) {
@@ -219,13 +220,21 @@ mspec_nopfn(struct vm_area_struct *vma, 
 
 	pfn = paddr >> PAGE_SHIFT;
 
-	return pfn;
+	fdata->type = VM_FAULT_MINOR;
+	/*
+	 * vm_insert_pfn can fail with -EBUSY, but in that case it will
+	 * be because another thread has installed the pte first, so it
+	 * is no problem.
+	 */
+	vm_insert_pfn(vma, fdata->address, pfn);
+
+	return NULL;
 }
 
 static struct vm_operations_struct mspec_vm_ops = {
 	.open = mspec_open,
 	.close = mspec_close,
-	.nopfn = mspec_nopfn
+	.fault = mspec_fault,
 };
 
 /*
diff -puN include/linux/mm.h~mm-merge-nopfn-into-fault include/linux/mm.h
--- a/include/linux/mm.h~mm-merge-nopfn-into-fault
+++ a/include/linux/mm.h
@@ -231,8 +231,6 @@ struct vm_operations_struct {
 			struct fault_data *fdata);
 	struct page *(*nopage)(struct vm_area_struct *area,
 			unsigned long address, int *type);
-	unsigned long (*nopfn)(struct vm_area_struct *area,
-			unsigned long address);
 	int (*populate)(struct vm_area_struct *area, unsigned long address,
 			unsigned long len, pgprot_t prot, unsigned long pgoff,
 			int nonblock);
@@ -686,13 +684,6 @@ static inline int page_mapped(struct pag
 #define NOPAGE_OOM	((struct page *) (-1))
 
 /*
- * Error return values for the *_nopfn functions
- */
-#define NOPFN_SIGBUS	((unsigned long) -1)
-#define NOPFN_OOM	((unsigned long) -2)
-#define NOPFN_REFAULT	((unsigned long) -3)
-
-/*
  * Different kinds of faults, as returned by handle_mm_fault().
  * Used to decide whether a process gets delivered SIGBUS or
  * just gets major/minor fault counters bumped up.
diff -puN mm/memory.c~mm-merge-nopfn-into-fault mm/memory.c
--- a/mm/memory.c~mm-merge-nopfn-into-fault
+++ a/mm/memory.c
@@ -1289,6 +1289,11 @@ EXPORT_SYMBOL(vm_insert_page);
  *
  * This function should only be called from a vm_ops->fault handler, and
  * in that case the handler should return NULL.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
  */
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 		unsigned long pfn)
@@ -2450,56 +2455,6 @@ static int do_nonlinear_fault(struct mm_
 }
 
 /*
- * do_no_pfn() tries to create a new page mapping for a page without
- * a struct_page backing it
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- *
- * It is expected that the ->nopfn handler always returns the same pfn
- * for a given virtual mapping.
- *
- * Mark this `noinline' to prevent it from bloating the main pagefault code.
- */
-static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long address, pte_t *page_table, pmd_t *pmd,
-		     int write_access)
-{
-	spinlock_t *ptl;
-	pte_t entry;
-	unsigned long pfn;
-	int ret = VM_FAULT_MINOR;
-
-	pte_unmap(page_table);
-	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
-	BUG_ON(is_cow_mapping(vma->vm_flags));
-
-	pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
-	if (unlikely(pfn == NOPFN_OOM))
-		return VM_FAULT_OOM;
-	else if (unlikely(pfn == NOPFN_SIGBUS))
-		return VM_FAULT_SIGBUS;
-	else if (unlikely(pfn == NOPFN_REFAULT))
-		return VM_FAULT_MINOR;
-
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-
-	/* Only go through if we didn't race with anybody else... */
-	if (pte_none(*page_table)) {
-		entry = pfn_pte(pfn, vma->vm_page_prot);
-		if (write_access)
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		set_pte_at(mm, address, page_table, entry);
-	}
-	pte_unmap_unlock(page_table, ptl);
-	return ret;
-}
-
-/*
  * Fault of a previously existing named mapping. Repopulate the pte
  * from the encoded file_pte if possible. This enables swappable
  * nonlinear vmas.
@@ -2570,9 +2525,6 @@ static inline int handle_pte_fault(struc
 				if (vma->vm_ops->fault || vma->vm_ops->nopage)
 					return do_linear_fault(mm, vma, address,
 						pte, pmd, write_access, entry);
-				if (unlikely(vma->vm_ops->nopfn))
-					return do_no_pfn(mm, vma, address, pte,
-							 pmd, write_access);
 			}
 			return do_anonymous_page(mm, vma, address,
 						 pte, pmd, write_access);
_

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-18  7:37 [patch 3/8] mm: merge nopfn into fault akpm, Nick Piggin
@ 2007-05-18 15:23 ` Linus Torvalds
  2007-05-19  1:46   ` Nick Piggin
  2007-05-23 23:40   ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 19+ messages in thread
From: Linus Torvalds @ 2007-05-18 15:23 UTC (permalink / raw)
  To: akpm; +Cc: linux-mm, npiggin


On Fri, 18 May 2007, akpm@linux-foundation.org wrote:
>
> From: Nick Piggin <npiggin@suse.de>
> 
> Remove ->nopfn and reimplement the existing handlers with ->fault

So this is why you kept address.

No no no.

If we are changing the calling semantics of "nopage", then we should also 
remove the horrible, horrible hack of making the "nopfn" function itself 
do the "populate the page tables".

It would be *much* better to just

> +static struct page *spufs_mem_mmap_fault(struct vm_area_struct *vma,
> +					  struct fault_data *fdata)
>  {
>  	struct spu_context *ctx	= vma->vm_file->private_data;
>  	unsigned long pfn, offset, addr0 = address;
> @@ -137,9 +137,11 @@ static unsigned long spufs_mem_mmap_nopf
>  	}
>  #endif /* CONFIG_SPU_FS_64K_LS */
>  
> -	offset = (address - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
> -	if (offset >= LS_SIZE)
> -		return NOPFN_SIGBUS;
> +	offset = fdata->pgoff << PAGE_SHIFT
> +	if (offset >= LS_SIZE) {
> +		fdata->type = VM_FAULT_SIGBUS;
> +		return NULL;
> +	}

	if (offset >= LS_SIZE)
		return -EINVAL; /* or whatever error value */

and *remove* the "vm_insert_pfn":

> -	vm_insert_pfn(vma, address, pfn);
> +	vm_insert_pfn(vma, fdata->address, pfn);
>  
>  	spu_release(ctx);
>  
> -	return NOPFN_REFAULT;
> +	fdata->type = VM_FAULT_MINOR;
> +	return NULL;
>  }

And instead on success do

	fdata->pfn = pfn;
	/* Or: 'fdata->pte = pte' */
	return VM_FAULT_MINOR;

and let the caller always insert the thing into the page tables.

Wouldn't it be nice if we never had drivers etc modifying page tables 
directly? Even with helpers like "vm_insert_pfn()"?

And once you don't return "struct page *", the return values can be a lot 
more descriptive too.

			Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-18 15:23 ` Linus Torvalds
@ 2007-05-19  1:46   ` Nick Piggin
  2007-05-23 23:40   ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-05-19  1:46 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: akpm, linux-mm

On Fri, May 18, 2007 at 08:23:53AM -0700, Linus Torvalds wrote:
> 
> 
> On Fri, 18 May 2007, akpm@linux-foundation.org wrote:
> >
> > From: Nick Piggin <npiggin@suse.de>
> > 
> > Remove ->nopfn and reimplement the existing handlers with ->fault
> 
> So this is why you kept address.

Ah yeah..

 
> No no no.
> 
> If we are changing the calling semantics of "nopage", then we should also 
> remove the horrible, horrible hack of making the "nopfn" function itself 
> do the "populate the page tables".

Hey, just now you wanted me to pass down a bloody pte_t! :)


> It would be *much* better to just
> 
> > +static struct page *spufs_mem_mmap_fault(struct vm_area_struct *vma,
> > +					  struct fault_data *fdata)
> >  {
> >  	struct spu_context *ctx	= vma->vm_file->private_data;
> >  	unsigned long pfn, offset, addr0 = address;
> > @@ -137,9 +137,11 @@ static unsigned long spufs_mem_mmap_nopf
> >  	}
> >  #endif /* CONFIG_SPU_FS_64K_LS */
> >  
> > -	offset = (address - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
> > -	if (offset >= LS_SIZE)
> > -		return NOPFN_SIGBUS;
> > +	offset = fdata->pgoff << PAGE_SHIFT
> > +	if (offset >= LS_SIZE) {
> > +		fdata->type = VM_FAULT_SIGBUS;
> > +		return NULL;
> > +	}
> 
> 	if (offset >= LS_SIZE)
> 		return -EINVAL; /* or whatever error value */
> 
> and *remove* the "vm_insert_pfn":
> 
> > -	vm_insert_pfn(vma, address, pfn);
> > +	vm_insert_pfn(vma, fdata->address, pfn);
> >  
> >  	spu_release(ctx);
> >  
> > -	return NOPFN_REFAULT;
> > +	fdata->type = VM_FAULT_MINOR;
> > +	return NULL;
> >  }
> 
> And instead on success do
> 
> 	fdata->pfn = pfn;
> 	/* Or: 'fdata->pte = pte' */
> 	return VM_FAULT_MINOR;
> 
> and let the caller always insert the thing into the page tables.
> 
> Wouldn't it be nice if we never had drivers etc modifying page tables 
> directly? Even with helpers like "vm_insert_pfn()"?

Yeah it would be logically nicer, but it puts more code and branches
in the ->fault fastpaths, which I was trying to avoid.

However, if you are willing to make that small tradeoff, and we have
handlers signal back to the caller that they are returning a pfn, then
OK.

But I don't think this is nearly so bad a violation than filesystems
doing ->populate or calculating their own pgoff. The reason? If the
driver is messing with pfns itself, then it already knows about some
aspect of memory management internals. At that point, I think it is
clean enough to have it call the vm_insert_pfn helper.


> And once you don't return "struct page *", the return values can be a lot 
> more descriptive too.

That I agree with.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-18 15:23 ` Linus Torvalds
  2007-05-19  1:46   ` Nick Piggin
@ 2007-05-23 23:40   ` Benjamin Herrenschmidt
  2007-05-24  1:42     ` Nick Piggin
  1 sibling, 1 reply; 19+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-23 23:40 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: akpm, linux-mm, npiggin

On Fri, 2007-05-18 at 08:23 -0700, Linus Torvalds wrote:
> 
> If we are changing the calling semantics of "nopage", then we should also 
> remove the horrible, horrible hack of making the "nopfn" function itself 
> do the "populate the page tables".
> 
> It would be *much* better to just

  .../...

> and let the caller always insert the thing into the page tables.
> 
> Wouldn't it be nice if we never had drivers etc modifying page tables 
> directly? Even with helpers like "vm_insert_pfn()"?

The problem is that this is racy vs. concurrent unmap_mapping_range().

As I explained in my previous email, spufs and the DRI are 2 examples
where we need to expose to userland a mapping whose backing PFN's have
to be switched between different physical storage.

The only way I've found to have this be race free is to have the
->nopfn() function do the actual PTE insertion while holding a
lock/mutex that is also taken by whatever calls unmap_mapping_range()
when the switching occurs).

Cheers,
Ben.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-23 23:40   ` Benjamin Herrenschmidt
@ 2007-05-24  1:42     ` Nick Piggin
  2007-05-24  2:04       ` Linus Torvalds
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2007-05-24  1:42 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Linus Torvalds, akpm, linux-mm

On Thu, May 24, 2007 at 09:40:19AM +1000, Benjamin Herrenschmidt wrote:
> On Fri, 2007-05-18 at 08:23 -0700, Linus Torvalds wrote:
> > 
> > If we are changing the calling semantics of "nopage", then we should also 
> > remove the horrible, horrible hack of making the "nopfn" function itself 
> > do the "populate the page tables".
> > 
> > It would be *much* better to just
> 
>   .../...
> 
> > and let the caller always insert the thing into the page tables.
> > 
> > Wouldn't it be nice if we never had drivers etc modifying page tables 
> > directly? Even with helpers like "vm_insert_pfn()"?
> 
> The problem is that this is racy vs. concurrent unmap_mapping_range().

Yeah, I decided against that after replying to Linus. Also, it is just
not a common path that we want to clutter up the core pagefault
handler with. As I said, if the handler already knows about pfns and
specifically doesn't want struct page backings etc etc.  (rather than
just looking up pagecache offsets like a filesystem), then we can assume
it has some idea about memory management and I don't think it is
particularly bad to have it install the pte.

So I do *not* want the normal fault path to do some weird nopfn stuff
based on return values of the handler.

At most, if Linus really doesn't want ->fault to do the nopfn thing, then
I would be happy to leave in ->nopfn... but I don't see much reason not
to just merge them anyway... one fewer branch and less code in the
page fault handler.


> As I explained in my previous email, spufs and the DRI are 2 examples
> where we need to expose to userland a mapping whose backing PFN's have
> to be switched between different physical storage.
> 
> The only way I've found to have this be race free is to have the
> ->nopfn() function do the actual PTE insertion while holding a
> lock/mutex that is also taken by whatever calls unmap_mapping_range()
> when the switching occurs).

Yep, thanks for the input.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-24  1:42     ` Nick Piggin
@ 2007-05-24  2:04       ` Linus Torvalds
  2007-05-24  2:16         ` Nick Piggin
  2007-05-24  3:17         ` Benjamin Herrenschmidt
  0 siblings, 2 replies; 19+ messages in thread
From: Linus Torvalds @ 2007-05-24  2:04 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Benjamin Herrenschmidt, akpm, linux-mm


On Thu, 24 May 2007, Nick Piggin wrote:
> 
> At most, if Linus really doesn't want ->fault to do the nopfn thing, then
> I would be happy to leave in ->nopfn... but I don't see much reason not
> to just merge them anyway... one fewer branch and less code in the
> page fault handler.

I just think that the "->fault" calling convention is _broken_.

If you want to install the PFN in the low-level driver, just

 - pass the whole "struct vm_fault" to the PFN-installing thing (so that 
   the driver at least doesn't have to muck with the address)

 - return a nice return value saying that you aren't returning a page. And 
   since you *also* need to return a value saying whether the page you 
   want to install is locked or not, that just means that the "struct page 
   *" approach (with a few extra error cases) won't cut it.

 - don't add yet another interface. Replace it cleanly. If you don't want 
   to re-use the name, fine, but at least replace it with something 
   better. 

The old "nopage()" return values weren't exactly pretty before either, but 
dang, do we really have to make it even MORE broken by having to have 
_both_ the old and the new ones, with different interfaces and magic 
locking rules depending on flags? I say "no".

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-24  2:04       ` Linus Torvalds
@ 2007-05-24  2:16         ` Nick Piggin
  2007-05-24  3:17         ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-05-24  2:16 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Benjamin Herrenschmidt, akpm, linux-mm

On Wed, May 23, 2007 at 07:04:28PM -0700, Linus Torvalds wrote:
> 
> 
> On Thu, 24 May 2007, Nick Piggin wrote:
> > 
> > At most, if Linus really doesn't want ->fault to do the nopfn thing, then
> > I would be happy to leave in ->nopfn... but I don't see much reason not
> > to just merge them anyway... one fewer branch and less code in the
> > page fault handler.
> 
> I just think that the "->fault" calling convention is _broken_.
> 
> If you want to install the PFN in the low-level driver, just
> 
>  - pass the whole "struct vm_fault" to the PFN-installing thing (so that 
>    the driver at least doesn't have to muck with the address)

I was going to suggest that too, not a bad idea, but I think it is
not a big problem if we allow some special drivers to get access to
the address if they need to (otherwise they'll just go off and try
to invent their own thing).


>  - return a nice return value saying that you aren't returning a page. And 
>    since you *also* need to return a value saying whether the page you 
>    want to install is locked or not, that just means that the "struct page 
>    *" approach (with a few extra error cases) won't cut it.

Yeah we can do that. We could have the VM_FAULT_ code in the first byte,
and return flags in the second, with the upper 2 bytes reserved... or
something like that.


>  - don't add yet another interface. Replace it cleanly. If you don't want 
>    to re-use the name, fine, but at least replace it with something 
>    better. 

I will definitely do that, but it is far easier to first merge it and
*then* clean up the drivers, rather than carry around all the patches
to do it. The nopage compat code is really small, so I don't see a
problem with carrying it around for a few -rcs until people have had time
to test and ack their driver conversions.

Actually I had a tear in my eye when removing the nopage name... it is
more quirky and has more character than the bland "fault"! :)

 
> The old "nopage()" return values weren't exactly pretty before either, but 
> dang, do we really have to make it even MORE broken by having to have 
> _both_ the old and the new ones, with different interfaces and magic 
> locking rules depending on flags? I say "no".

Well thanks a lot for the feedback. I'll try to make some improvements
and hopefully we can try again when 2.6.23 opens.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-24  2:04       ` Linus Torvalds
  2007-05-24  2:16         ` Nick Piggin
@ 2007-05-24  3:17         ` Benjamin Herrenschmidt
  2007-05-24  3:26           ` Benjamin Herrenschmidt
  1 sibling, 1 reply; 19+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-24  3:17 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Nick Piggin, akpm, linux-mm

On Wed, 2007-05-23 at 19:04 -0700, Linus Torvalds wrote:
> 
> If you want to install the PFN in the low-level driver, just
> 
>  - pass the whole "struct vm_fault" to the PFN-installing thing (so that 
>    the driver at least doesn't have to muck with the address)

Fair but in the case of spufs, I -do- have to muck with the address in
the driver/fs since it's the driver that knows it wants to use 64K page
mappings, and thus need to insert the PTE with the special 64K flag in
the first of the 16 entries of that 64K region -and- align the address
down before passing it to vm_insert_pfn().

There no knowledge of that arch magic in the generic vm_insert_pfn() and
I don't think there should be. It's all understanding between the arch
specific spufs driver and the arch low level page table management.

I know I'm sort of a special case here though, but I think it might make
sense to have in the future the DRM do similar special things to use
larger HW page sizes to map things like framebuffers or large in-VRAM or
in-AGP objects, possibly using an arch helper that does that appropriate
address & pgprot flag munging, but in the end, the actual PTE insertion
is still the generic one and that should work just fine.

Anyway, I don't see any big hurry to change ->nopfn() and it's
associated NOPFN_REFAULT special return code and vm_insert_pfn() helper
from what they are right now. They work for the few special case that
need them just fine and can be kept separate from whatever other work
Nick is doing.

Cheers,
Ben.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-24  3:17         ` Benjamin Herrenschmidt
@ 2007-05-24  3:26           ` Benjamin Herrenschmidt
  2007-05-24  3:37             ` Linus Torvalds
  0 siblings, 1 reply; 19+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-24  3:26 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Nick Piggin, akpm, linux-mm

On Thu, 2007-05-24 at 13:17 +1000, Benjamin Herrenschmidt wrote:
> Fair but in the case of spufs, I -do- have to muck with the address in
> the driver/fs since it's the driver that knows it wants to use 64K
> page
> mappings, and thus need to insert the PTE with the special 64K flag in
> the first of the 16 entries of that 64K region -and- align the address
> down before passing it to vm_insert_pfn(). 

Note that I could just modify the address/page index in the struct
vm_fault... doesn't make much difference in this case.

Might even create an arch helper prepare_special_pgsize_fault() or
something like that that takes the VM fault struct, whack it the right
way, and returns it to the driver for passing to vm_insert_pfn() so that
all of the logic is actually hidden from the driver.

Ben.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-24  3:26           ` Benjamin Herrenschmidt
@ 2007-05-24  3:37             ` Linus Torvalds
  2007-05-24  3:45               ` Nick Piggin
                                 ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Linus Torvalds @ 2007-05-24  3:37 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Nick Piggin, akpm, linux-mm


On Thu, 24 May 2007, Benjamin Herrenschmidt wrote:
> 
> Note that I could just modify the address/page index in the struct
> vm_fault... doesn't make much difference in this case.
> 
> Might even create an arch helper prepare_special_pgsize_fault() or
> something like that that takes the VM fault struct, whack it the right
> way, and returns it to the driver for passing to vm_insert_pfn() so that
> all of the logic is actually hidden from the driver.

I don't think we really need that, but what I'd like to avoid is people 
using "address" when they don't actually need to (especially if it's just 
a quick-and-lazy conversion, and they use "address" to do the page index 
calculation with the "pgoff + ((address - vma->start) >> PAGE_SHIFT)" kind 
of thing.

So exactly _because_ the "nopage()" interface takes "address", I'd like to 
avoid it in that form in the "vm_fault" structure, just so that people 
don't do stupid things with it.

(And yes, I'm not proud of the "nopage()" interface, but it evolved from 
historical behaviour which did everything at the low level, so "address" 
_used_ to make sense for the same reason you want it now).

So just about any "hiding" would do it as far as I'm concerned. Ranging 
from the odd (making it a "virtual page number") to just using an 
inconvenient name that just makes it obvious that it shouldn't be used 
lightly ("virtual_page_fault_address"), to making it a type that cannot 
easily be used for that kind of arithmetic ("void __user *" would make 
sense, no?).

We literally have code like

	offset = area->vm_pgoff << PAGE_SHIFT;
	offset += address - area->vm_start;
	vaddr = (char*)((struct usX2Ydev *)area->vm_private_data)->hwdep_pcm_shm + offset;
	page = virt_to_page(vaddr);

and the "easy" way to convert it would be to just continue to do the 
insane thing, without realizing that the "offset" calculation should now 
be just something like

	offset = fault->pgindex << PAGE_SHIFT;

instead.

			Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-24  3:37             ` Linus Torvalds
@ 2007-05-24  3:45               ` Nick Piggin
  2007-05-24 10:07                 ` Christoph Hellwig
  2007-05-24  3:48               ` Benjamin Herrenschmidt
  2007-05-25 11:18               ` Nick Piggin
  2 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2007-05-24  3:45 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Benjamin Herrenschmidt, akpm, linux-mm

On Wed, May 23, 2007 at 08:37:28PM -0700, Linus Torvalds wrote:
> 
> 
> On Thu, 24 May 2007, Benjamin Herrenschmidt wrote:
> > 
> > Note that I could just modify the address/page index in the struct
> > vm_fault... doesn't make much difference in this case.
> > 
> > Might even create an arch helper prepare_special_pgsize_fault() or
> > something like that that takes the VM fault struct, whack it the right
> > way, and returns it to the driver for passing to vm_insert_pfn() so that
> > all of the logic is actually hidden from the driver.
> 
> I don't think we really need that, but what I'd like to avoid is people 
> using "address" when they don't actually need to (especially if it's just 
> a quick-and-lazy conversion, and they use "address" to do the page index 
> calculation with the "pgoff + ((address - vma->start) >> PAGE_SHIFT)" kind 
> of thing.
> 
> So exactly _because_ the "nopage()" interface takes "address", I'd like to 
> avoid it in that form in the "vm_fault" structure, just so that people 
> don't do stupid things with it.
> 
> (And yes, I'm not proud of the "nopage()" interface, but it evolved from 
> historical behaviour which did everything at the low level, so "address" 
> _used_ to make sense for the same reason you want it now).

Yes, the goal was always to use pgoff to locate the page, because that
is the correct abstraction to pass through this interface.

 
> So just about any "hiding" would do it as far as I'm concerned. Ranging 
> from the odd (making it a "virtual page number") to just using an 
> inconvenient name that just makes it obvious that it shouldn't be used 
> lightly ("virtual_page_fault_address"), to making it a type that cannot 
> easily be used for that kind of arithmetic ("void __user *" would make 
> sense, no?).

'void __user *' seems reasonable, I think. Good idea.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-24  3:37             ` Linus Torvalds
  2007-05-24  3:45               ` Nick Piggin
@ 2007-05-24  3:48               ` Benjamin Herrenschmidt
  2007-05-25 11:18               ` Nick Piggin
  2 siblings, 0 replies; 19+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-24  3:48 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Nick Piggin, akpm, linux-mm

On Wed, 2007-05-23 at 20:37 -0700, Linus Torvalds wrote:
> 
> So just about any "hiding" would do it as far as I'm concerned. Ranging 
> from the odd (making it a "virtual page number") to just using an 
> inconvenient name that just makes it obvious that it shouldn't be used 
> lightly ("virtual_page_fault_address"), to making it a type that cannot 
> easily be used for that kind of arithmetic ("void __user *" would make 
> sense, no?).

Yes, I like void __user *. I don't like long names because they make the
struct definition ugly though. What about

	void __user	*_fault_target; /* for internal use only */

Is that scary enough ? :-)

Cheers,
Ben.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-24  3:45               ` Nick Piggin
@ 2007-05-24 10:07                 ` Christoph Hellwig
  2007-05-24 10:15                   ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 19+ messages in thread
From: Christoph Hellwig @ 2007-05-24 10:07 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linus Torvalds, Benjamin Herrenschmidt, akpm, linux-mm

On Thu, May 24, 2007 at 05:45:57AM +0200, Nick Piggin wrote:
> On Wed, May 23, 2007 at 08:37:28PM -0700, Linus Torvalds wrote:
> > 
> > 
> > On Thu, 24 May 2007, Benjamin Herrenschmidt wrote:
> > > 
> > > Note that I could just modify the address/page index in the struct
> > > vm_fault... doesn't make much difference in this case.
> > > 
> > > Might even create an arch helper prepare_special_pgsize_fault() or
> > > something like that that takes the VM fault struct, whack it the right
> > > way, and returns it to the driver for passing to vm_insert_pfn() so that
> > > all of the logic is actually hidden from the driver.
> > 
> > I don't think we really need that, but what I'd like to avoid is people 
> > using "address" when they don't actually need to (especially if it's just 
> > a quick-and-lazy conversion, and they use "address" to do the page index 
> > calculation with the "pgoff + ((address - vma->start) >> PAGE_SHIFT)" kind 
> > of thing.
> > 
> > So exactly _because_ the "nopage()" interface takes "address", I'd like to 
> > avoid it in that form in the "vm_fault" structure, just so that people 
> > don't do stupid things with it.
> > 
> > (And yes, I'm not proud of the "nopage()" interface, but it evolved from 
> > historical behaviour which did everything at the low level, so "address" 
> > _used_ to make sense for the same reason you want it now).
> 
> Yes, the goal was always to use pgoff to locate the page, because that
> is the correct abstraction to pass through this interface.
> 
>  
> > So just about any "hiding" would do it as far as I'm concerned. Ranging 
> > from the odd (making it a "virtual page number") to just using an 
> > inconvenient name that just makes it obvious that it shouldn't be used 
> > lightly ("virtual_page_fault_address"), to making it a type that cannot 
> > easily be used for that kind of arithmetic ("void __user *" would make 
> > sense, no?).
> 
> 'void __user *' seems reasonable, I think. Good idea.

Abusing __user for something entirely different is really dumb,
just use the same __attribute__((noderef, address_space(N)) annotation
that __user and __iomem use. but please use a different address_space

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-24 10:07                 ` Christoph Hellwig
@ 2007-05-24 10:15                   ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 19+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-24 10:15 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Nick Piggin, Linus Torvalds, akpm, linux-mm

On Thu, 2007-05-24 at 11:07 +0100, Christoph Hellwig wrote:
> 
> Abusing __user for something entirely different is really dumb,
> just use the same __attribute__((noderef, address_space(N)) annotation
> that __user and __iomem use. but please use a different address_space 

But it is not an abuse. It _is_ a user pointer.

Ben.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-24  3:37             ` Linus Torvalds
  2007-05-24  3:45               ` Nick Piggin
  2007-05-24  3:48               ` Benjamin Herrenschmidt
@ 2007-05-25 11:18               ` Nick Piggin
  2007-05-25 16:36                 ` Linus Torvalds
  2 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2007-05-25 11:18 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Benjamin Herrenschmidt, akpm, linux-mm, Christoph Hellwig

On Wed, May 23, 2007 at 08:37:28PM -0700, Linus Torvalds wrote:
> 
> So just about any "hiding" would do it as far as I'm concerned. Ranging 
> from the odd (making it a "virtual page number") to just using an 
> inconvenient name that just makes it obvious that it shouldn't be used 
> lightly ("virtual_page_fault_address"), to making it a type that cannot 
> easily be used for that kind of arithmetic ("void __user *" would make 
> sense, no?).

OK, attached is an incremental diff (against patch 2 in the series)
that changes the API around and feels a bit cleaner.

What do you think? Any better?

---

Change ->fault prototype. We now return an int, which contains VM_FAULT_xxx
code in the low byte, and FAULT_RET_xxx code in the next byte. FAULT_RET_ code
tells the VM whether a page was found, whether it has been locked, and
potentially other things.

This means we no longer set VM_CAN_INVALIDATE in the vma in order to say
that a page is locked. This requires filemap_nopage to go away, so let's
just nuke all that stuff now instead of doing a seperate patch for it at the
end.

struct fault_data is renamed to struct vm_fault as Linus asked. address
is now a void __user * that we should firmly encourage drivers not to use.

The page is now returned via a page pointer in the vm_fault struct.

---
 Documentation/feature-removal-schedule.txt |   20 --
 fs/gfs2/ops_file.c                         |    2 
 fs/gfs2/ops_vm.c                           |   47 ++--
 fs/ncpfs/mmap.c                            |   35 +--
 fs/ocfs2/mmap.c                            |   27 +-
 fs/xfs/linux-2.6/xfs_file.c                |   12 -
 include/linux/mm.h                         |   82 ++++----
 ipc/shm.c                                  |    5 
 mm/filemap.c                               |  283 ++++-------------------------
 mm/filemap_xip.c                           |   37 +--
 mm/memory.c                                |   97 ++++-----
 mm/nommu.c                                 |    2 
 mm/shmem.c                                 |   29 +-
 13 files changed, 214 insertions(+), 464 deletions(-)

Index: linux-2.6/fs/gfs2/ops_vm.c
===================================================================
--- linux-2.6.orig/fs/gfs2/ops_vm.c
+++ linux-2.6/fs/gfs2/ops_vm.c
@@ -27,13 +27,12 @@
 #include "trans.h"
 #include "util.h"
 
-static struct page *gfs2_private_fault(struct vm_area_struct *vma,
-					struct fault_data *fdata)
+static int gfs2_private_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct gfs2_inode *ip = GFS2_I(vma->vm_file->f_mapping->host);
 
 	set_bit(GIF_PAGED, &ip->i_flags);
-	return filemap_fault(vma, fdata);
+	return filemap_fault(vma, vmf);
 }
 
 static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
@@ -104,55 +103,55 @@ out:
 	return error;
 }
 
-static struct page *gfs2_sharewrite_fault(struct vm_area_struct *vma,
-						struct fault_data *fdata)
+static int gfs2_sharewrite_fault(struct vm_area_struct *vma,
+						struct vm_fault *vmf)
 {
 	struct file *file = vma->vm_file;
 	struct gfs2_file *gf = file->private_data;
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
 	struct gfs2_holder i_gh;
-	struct page *result = NULL;
 	int alloc_required;
 	int error;
+	int ret = VM_FAULT_MINOR;
 
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
 	if (error)
-		return NULL;
+		goto out;
 
 	set_bit(GIF_PAGED, &ip->i_flags);
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
 
 	error = gfs2_write_alloc_required(ip,
-					(u64)fdata->pgoff << PAGE_CACHE_SHIFT,
+					(u64)vmf->pgoff << PAGE_CACHE_SHIFT,
 					PAGE_CACHE_SIZE, &alloc_required);
 	if (error) {
-		fdata->type = VM_FAULT_OOM; /* XXX: are these right? */
-		goto out;
+		ret = VM_FAULT_OOM; /* XXX: are these right? */
+		goto out_unlock;
 	}
 
 	set_bit(GFF_EXLOCK, &gf->f_flags);
-	result = filemap_fault(vma, fdata);
+	ret = filemap_fault(vma, vmf);
 	clear_bit(GFF_EXLOCK, &gf->f_flags);
-	if (!result)
-		goto out;
+	if (ret & (VM_FAULT_ERROR | FAULT_RET_NOPAGE))
+		goto out_unlock;
 
 	if (alloc_required) {
-		error = alloc_page_backing(ip, result);
+		/* XXX: do we need to drop page lock around alloc_page_backing?*/
+		error = alloc_page_backing(ip, vmf->page);
 		if (error) {
-			if (vma->vm_flags & VM_CAN_INVALIDATE)
-				unlock_page(result);
-			page_cache_release(result);
-			fdata->type = VM_FAULT_OOM;
-			result = NULL;
-			goto out;
+			if (ret & FAULT_RET_LOCKED)
+				unlock_page(vmf->page);
+			page_cache_release(vmf->page);
+			ret = VM_FAULT_OOM;
+			goto out_unlock;
 		}
-		set_page_dirty(result);
+		set_page_dirty(vmf->page);
 	}
 
-out:
+out_unlock:
 	gfs2_glock_dq_uninit(&i_gh);
-
-	return result;
+out:
+	return ret;
 }
 
 struct vm_operations_struct gfs2_vm_ops_private = {
Index: linux-2.6/fs/ncpfs/mmap.c
===================================================================
--- linux-2.6.orig/fs/ncpfs/mmap.c
+++ linux-2.6/fs/ncpfs/mmap.c
@@ -24,33 +24,35 @@
 
 /*
  * Fill in the supplied page for mmap
+ * XXX: how are we excluding truncate/invalidate here? Maybe need to lock
+ * page?
  */
-static struct page* ncp_file_mmap_fault(struct vm_area_struct *area,
-						struct fault_data *fdata)
+static int ncp_file_mmap_fault(struct vm_area_struct *area,
+					struct vm_fault *vmf)
 {
 	struct file *file = area->vm_file;
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
-	struct page* page;
 	char *pg_addr;
 	unsigned int already_read;
 	unsigned int count;
 	int bufsize;
-	int pos;
+	int pos; /* XXX: loff_t ? */
 
-	page = alloc_page(GFP_HIGHUSER); /* ncpfs has nothing against high pages
-	           as long as recvmsg and memset works on it */
-	if (!page) {
-		fdata->type = VM_FAULT_OOM;
-		return NULL;
-	}
-	pg_addr = kmap(page);
-	pos = fdata->pgoff << PAGE_SHIFT;
+	/*
+	 * ncpfs has nothing against high pages as long
+	 * as recvmsg and memset works on it
+	 */
+	vmf->page = alloc_page(GFP_HIGHUSER);
+	if (!vmf->page)
+		return VM_FAULT_OOM;
+	pg_addr = kmap(vmf->page);
+	pos = vmf->pgoff << PAGE_SHIFT;
 
 	count = PAGE_SIZE;
-	if (fdata->address + PAGE_SIZE > area->vm_end) {
+	if ((unsigned long)vmf->virtual_address + PAGE_SIZE > area->vm_end) {
 		WARN_ON(1); /* shouldn't happen? */
-		count = area->vm_end - fdata->address;
+		count = area->vm_end - (unsigned long)vmf->virtual_address;
 	}
 	/* what we can read in one go */
 	bufsize = NCP_SERVER(inode)->buffer_size;
@@ -85,17 +87,16 @@ static struct page* ncp_file_mmap_fault(
 
 	if (already_read < PAGE_SIZE)
 		memset(pg_addr + already_read, 0, PAGE_SIZE - already_read);
-	flush_dcache_page(page);
-	kunmap(page);
+	flush_dcache_page(vmf->page);
+	kunmap(vmf->page);
 
 	/*
 	 * If I understand ncp_read_kernel() properly, the above always
 	 * fetches from the network, here the analogue of disk.
 	 * -- wli
 	 */
-	fdata->type = VM_FAULT_MAJOR;
 	count_vm_event(PGMAJFAULT);
-	return page;
+	return VM_FAULT_MAJOR;
 }
 
 static struct vm_operations_struct ncp_file_mmap =
Index: linux-2.6/fs/ocfs2/mmap.c
===================================================================
--- linux-2.6.orig/fs/ocfs2/mmap.c
+++ linux-2.6/fs/ocfs2/mmap.c
@@ -42,14 +42,13 @@
 #include "inode.h"
 #include "mmap.h"
 
-static struct page *ocfs2_fault(struct vm_area_struct *area,
-						struct fault_data *fdata)
+static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 {
 	struct page *page = NULL;
 	sigset_t blocked, oldset;
-	int ret;
+	int error, ret;
 
-	mlog_entry("(area=%p, page offset=%lu)\n", area, fdata->pgoff);
+	mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
 
 	/* The best way to deal with signals in this path is
 	 * to block them upfront, rather than allowing the
@@ -58,21 +57,21 @@ static struct page *ocfs2_fault(struct v
 
 	/* We should technically never get a bad ret return
 	 * from sigprocmask */
-	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
-	if (ret < 0) {
-		fdata->type = VM_FAULT_SIGBUS;
-		mlog_errno(ret);
+	error = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	if (error < 0) {
+		mlog_errno(error);
+		ret = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
-	page = filemap_fault(area, fdata);
+	ret = filemap_fault(area, vmf);
 
-	ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
-	if (ret < 0)
-		mlog_errno(ret);
+	error = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	if (error < 0)
+		mlog_errno(error);
 out:
-	mlog_exit_ptr(page);
-	return page;
+	mlog_exit_ptr(vmf->page);
+	return ret;
 }
 
 static struct vm_operations_struct ocfs2_file_vm_ops = {
Index: linux-2.6/fs/xfs/linux-2.6/xfs_file.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_file.c
+++ linux-2.6/fs/xfs/linux-2.6/xfs_file.c
@@ -245,20 +245,18 @@ xfs_file_fsync(
 }
 
 #ifdef CONFIG_XFS_DMAPI
-STATIC struct page *
+STATIC int
 xfs_vm_fault(
 	struct vm_area_struct	*vma,
-	struct fault_data	*fdata)
+	struct vm_fault	*vmf)
 {
 	struct inode	*inode = vma->vm_file->f_path.dentry->d_inode;
 	bhv_vnode_t	*vp = vn_from_inode(inode);
 
 	ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI);
-	if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), vma, 0)) {
-		fdata->type = VM_FAULT_SIGBUS;
-		return NULL;
-	}
-	return filemap_fault(vma, fdata);
+	if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), vma, 0))
+		return VM_FAULT_SIGBUS;
+	return filemap_fault(vma, vmf);
 }
 #endif /* CONFIG_XFS_DMAPI */
 
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -170,12 +170,7 @@ extern unsigned int kobjsize(const void 
 #define VM_INSERTPAGE	0x02000000	/* The vma has had "vm_insert_page()" done on it */
 #define VM_ALWAYSDUMP	0x04000000	/* Always include in core dumps */
 
-#define VM_CAN_INVALIDATE 0x08000000	/* The mapping may be invalidated,
-					 * eg. truncate or invalidate_inode_*.
-					 * In this case, do_no_page must
-					 * return with the page locked.
-					 */
-#define VM_CAN_NONLINEAR 0x10000000	/* Has ->fault & does nonlinear pages */
+#define VM_CAN_NONLINEAR 0x08000000	/* Has ->fault & does nonlinear pages */
 
 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@ -199,24 +194,44 @@ extern unsigned int kobjsize(const void 
  */
 extern pgprot_t protection_map[16];
 
-#define FAULT_FLAG_WRITE	0x01
-#define FAULT_FLAG_NONLINEAR	0x02
+#define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
+#define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
+
+
+#define FAULT_RET_NOPAGE	0x0100	/* ->fault did not return a page. This
+					 * can be used if the handler installs
+					 * their own pte.
+					 */
+#define FAULT_RET_LOCKED	0x0200	/* ->fault locked the page, caller must
+					 * unlock after installing the mapping.
+					 * This is used by pagecache in
+					 * particular, where the page lock is
+					 * used to synchronise against truncate
+					 * and invalidate. Mutually exclusive
+					 * with FAULT_RET_NOPAGE.
+					 */
 
 /*
- * fault_data is filled in the the pagefault handler and passed to the
- * vma's ->fault function. That function is responsible for filling in
- * 'type', which is the type of fault if a page is returned, or the type
- * of error if NULL is returned.
- *
- * pgoff should be used in favour of address, if possible. If pgoff is
- * used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get
- * nonlinear mapping support.
- */
-struct fault_data {
-	unsigned long address;
-	pgoff_t pgoff;
-	unsigned int flags;
-	int type;
+ * vm_fault is filled by the the pagefault handler and passed to the vma's
+ * ->fault function. The vma's ->fault is responsible for returning the
+ * VM_FAULT_xxx type which occupies the lowest byte of the return code, ORed
+ * with FAULT_RET_ flags that occupy the next byte and give details about
+ * how the fault was handled.
+ *
+ * pgoff should be used in favour of virtual_address, if possible. If pgoff
+ * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear
+ * mapping support.
+ */
+struct vm_fault {
+	unsigned int flags;		/* FAULT_FLAG_xxx flags */
+	pgoff_t pgoff;			/* Logical page offset based on vma */
+	void __user *virtual_address;	/* Faulting virtual address */
+
+	struct page *page;		/* ->fault handlers should return a
+					 * page here, unless FAULT_RET_NOPAGE
+					 * is set (which is also implied by
+					 * VM_FAULT_OOM or SIGBUS).
+					 */
 };
 
 /*
@@ -227,15 +242,11 @@ struct fault_data {
 struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
-	struct page *(*fault)(struct vm_area_struct *vma,
-			struct fault_data *fdata);
+	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
 	struct page *(*nopage)(struct vm_area_struct *area,
 			unsigned long address, int *type);
 	unsigned long (*nopfn)(struct vm_area_struct *area,
 			unsigned long address);
-	int (*populate)(struct vm_area_struct *area, unsigned long address,
-			unsigned long len, pgprot_t prot, unsigned long pgoff,
-			int nonblock);
 
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
@@ -697,8 +708,14 @@ static inline int page_mapped(struct pag
  * Used to decide whether a process gets delivered SIGBUS or
  * just gets major/minor fault counters bumped up.
  */
-#define VM_FAULT_OOM	0x00
-#define VM_FAULT_SIGBUS	0x01
+
+/*
+ * VM_FAULT_ERROR is set for the error cases, to make some tests simpler.
+ */
+#define VM_FAULT_ERROR	0x20
+
+#define VM_FAULT_OOM	(0x00 | VM_FAULT_ERROR)
+#define VM_FAULT_SIGBUS	(0x01 | VM_FAULT_ERROR)
 #define VM_FAULT_MINOR	0x02
 #define VM_FAULT_MAJOR	0x03
 
@@ -708,6 +725,11 @@ static inline int page_mapped(struct pag
  */
 #define VM_FAULT_WRITE	0x10
 
+/*
+ * Mask of VM_FAULT_ flags
+ */
+#define VM_FAULT_MASK	0xff
+
 #define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
 
 extern void show_free_areas(void);
@@ -790,8 +812,6 @@ static inline void unmap_shared_mapping_
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
 extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
-extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
-extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
 
 #ifdef CONFIG_MMU
 extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma,
@@ -1124,11 +1144,7 @@ extern void truncate_inode_pages_range(s
 				       loff_t lstart, loff_t lend);
 
 /* generic vm_area_ops exported for stackable file systems */
-extern struct page *filemap_fault(struct vm_area_struct *, struct fault_data *);
-extern struct page * __deprecated_for_modules
-filemap_nopage(struct vm_area_struct *, unsigned long, int *);
-extern int __deprecated_for_modules filemap_populate(struct vm_area_struct *,
-		unsigned long, unsigned long, pgprot_t, unsigned long, int);
+extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
 
 /* mm/page-writeback.c */
 int write_one_page(struct page *page, int wait);
Index: linux-2.6/ipc/shm.c
===================================================================
--- linux-2.6.orig/ipc/shm.c
+++ linux-2.6/ipc/shm.c
@@ -226,13 +226,12 @@ static void shm_close(struct vm_area_str
 	mutex_unlock(&shm_ids(ns).mutex);
 }
 
-static struct page *shm_fault(struct vm_area_struct *vma,
-					struct fault_data *fdata)
+static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct file *file = vma->vm_file;
 	struct shm_file_data *sfd = shm_file_data(file);
 
-	return sfd->vm_ops->fault(vma, fdata);
+	return sfd->vm_ops->fault(vma, vmf);
 }
 
 #ifdef CONFIG_NUMA
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -1336,7 +1336,7 @@ static int fastcall page_cache_read(stru
 /**
  * filemap_fault - read in file data for page fault handling
  * @vma:	user vma (not used)
- * @fdata:	the applicable fault_data
+ * @vmf:	the applicable fault_data
  *
  * filemap_fault() is invoked via the vma operations vector for a
  * mapped memory region to read in file data during a page fault.
@@ -1345,7 +1345,7 @@ static int fastcall page_cache_read(stru
  * it in the page cache, and handles the special cases reasonably without
  * having a lot of duplicated code.
  */
-struct page *filemap_fault(struct vm_area_struct *vma, struct fault_data *fdata)
+int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int error;
 	struct file *file = vma->vm_file;
@@ -1355,13 +1355,12 @@ struct page *filemap_fault(struct vm_are
 	struct page *page;
 	unsigned long size;
 	int did_readaround = 0;
+	int ret;
 
-	fdata->type = VM_FAULT_MINOR;
-
-	BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
+	ret = VM_FAULT_MINOR;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (fdata->pgoff >= size)
+	if (vmf->pgoff >= size)
 		goto outside_data_content;
 
 	/* If we don't want any read-ahead, don't bother */
@@ -1375,18 +1374,18 @@ struct page *filemap_fault(struct vm_are
 	 * For sequential accesses, we use the generic readahead logic.
 	 */
 	if (VM_SequentialReadHint(vma))
-		page_cache_readahead(mapping, ra, file, fdata->pgoff, 1);
+		page_cache_readahead(mapping, ra, file, vmf->pgoff, 1);
 
 	/*
 	 * Do we have something in the page cache already?
 	 */
 retry_find:
-	page = find_lock_page(mapping, fdata->pgoff);
+	page = find_lock_page(mapping, vmf->pgoff);
 	if (!page) {
 		unsigned long ra_pages;
 
 		if (VM_SequentialReadHint(vma)) {
-			handle_ra_miss(mapping, ra, fdata->pgoff);
+			handle_ra_miss(mapping, ra, vmf->pgoff);
 			goto no_cached_page;
 		}
 		ra->mmap_miss++;
@@ -1403,7 +1402,7 @@ retry_find:
 		 * check did_readaround, as this is an inner loop.
 		 */
 		if (!did_readaround) {
-			fdata->type = VM_FAULT_MAJOR;
+			ret = VM_FAULT_MAJOR;
 			count_vm_event(PGMAJFAULT);
 		}
 		did_readaround = 1;
@@ -1411,11 +1410,11 @@ retry_find:
 		if (ra_pages) {
 			pgoff_t start = 0;
 
-			if (fdata->pgoff > ra_pages / 2)
-				start = fdata->pgoff - ra_pages / 2;
+			if (vmf->pgoff > ra_pages / 2)
+				start = vmf->pgoff - ra_pages / 2;
 			do_page_cache_readahead(mapping, file, start, ra_pages);
 		}
-		page = find_lock_page(mapping, fdata->pgoff);
+		page = find_lock_page(mapping, vmf->pgoff);
 		if (!page)
 			goto no_cached_page;
 	}
@@ -1432,7 +1431,7 @@ retry_find:
 
 	/* Must recheck i_size under page lock */
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (unlikely(fdata->pgoff >= size)) {
+	if (unlikely(vmf->pgoff >= size)) {
 		unlock_page(page);
 		goto outside_data_content;
 	}
@@ -1441,24 +1440,24 @@ retry_find:
 	 * Found the page and have a reference on it.
 	 */
 	mark_page_accessed(page);
-	return page;
+	vmf->page = page;
+	return ret | FAULT_RET_LOCKED;
 
 outside_data_content:
 	/*
 	 * An external ptracer can access pages that normally aren't
 	 * accessible..
 	 */
-	if (vma->vm_mm == current->mm) {
-		fdata->type = VM_FAULT_SIGBUS;
-		return NULL;
-	}
+	if (vma->vm_mm == current->mm)
+		return VM_FAULT_SIGBUS;
+
 	/* Fall through to the non-read-ahead case */
 no_cached_page:
 	/*
 	 * We're only likely to ever get here if MADV_RANDOM is in
 	 * effect.
 	 */
-	error = page_cache_read(file, fdata->pgoff);
+	error = page_cache_read(file, vmf->pgoff);
 
 	/*
 	 * The page we want has now been added to the page cache.
@@ -1474,15 +1473,13 @@ no_cached_page:
 	 * to schedule I/O.
 	 */
 	if (error == -ENOMEM)
-		fdata->type = VM_FAULT_OOM;
-	else
-		fdata->type = VM_FAULT_SIGBUS;
-	return NULL;
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
 
 page_not_uptodate:
 	/* IO error path */
 	if (!did_readaround) {
-		fdata->type = VM_FAULT_MAJOR;
+		ret = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
 	}
 
@@ -1501,206 +1498,10 @@ page_not_uptodate:
 
 	/* Things didn't work out. Return zero to tell the mm layer so. */
 	shrink_readahead_size_eio(file, ra);
-	fdata->type = VM_FAULT_SIGBUS;
-	return NULL;
+	return VM_FAULT_SIGBUS;
 }
 EXPORT_SYMBOL(filemap_fault);
 
-/*
- * filemap_nopage and filemap_populate are legacy exports that are not used
- * in tree. Scheduled for removal.
- */
-struct page *filemap_nopage(struct vm_area_struct *area,
-				unsigned long address, int *type)
-{
-	struct page *page;
-	struct fault_data fdata;
-	fdata.address = address;
-	fdata.pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
-			+ area->vm_pgoff;
-	fdata.flags = 0;
-
-	page = filemap_fault(area, &fdata);
-	if (type)
-		*type = fdata.type;
-
-	return page;
-}
-EXPORT_SYMBOL(filemap_nopage);
-
-static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
-					int nonblock)
-{
-	struct address_space *mapping = file->f_mapping;
-	struct page *page;
-	int error;
-
-	/*
-	 * Do we have something in the page cache already?
-	 */
-retry_find:
-	page = find_get_page(mapping, pgoff);
-	if (!page) {
-		if (nonblock)
-			return NULL;
-		goto no_cached_page;
-	}
-
-	/*
-	 * Ok, found a page in the page cache, now we need to check
-	 * that it's up-to-date.
-	 */
-	if (!PageUptodate(page)) {
-		if (nonblock) {
-			page_cache_release(page);
-			return NULL;
-		}
-		goto page_not_uptodate;
-	}
-
-success:
-	/*
-	 * Found the page and have a reference on it.
-	 */
-	mark_page_accessed(page);
-	return page;
-
-no_cached_page:
-	error = page_cache_read(file, pgoff);
-
-	/*
-	 * The page we want has now been added to the page cache.
-	 * In the unlikely event that someone removed it in the
-	 * meantime, we'll just come back here and read it again.
-	 */
-	if (error >= 0)
-		goto retry_find;
-
-	/*
-	 * An error return from page_cache_read can result if the
-	 * system is low on memory, or a problem occurs while trying
-	 * to schedule I/O.
-	 */
-	return NULL;
-
-page_not_uptodate:
-	lock_page(page);
-
-	/* Did it get truncated while we waited for it? */
-	if (!page->mapping) {
-		unlock_page(page);
-		goto err;
-	}
-
-	/* Did somebody else get it up-to-date? */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		goto success;
-	}
-
-	error = mapping->a_ops->readpage(file, page);
-	if (!error) {
-		wait_on_page_locked(page);
-		if (PageUptodate(page))
-			goto success;
-	} else if (error == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry_find;
-	}
-
-	/*
-	 * Umm, take care of errors if the page isn't up-to-date.
-	 * Try to re-read it _once_. We do this synchronously,
-	 * because there really aren't any performance issues here
-	 * and we need to check for errors.
-	 */
-	lock_page(page);
-
-	/* Somebody truncated the page on us? */
-	if (!page->mapping) {
-		unlock_page(page);
-		goto err;
-	}
-	/* Somebody else successfully read it in? */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		goto success;
-	}
-
-	ClearPageError(page);
-	error = mapping->a_ops->readpage(file, page);
-	if (!error) {
-		wait_on_page_locked(page);
-		if (PageUptodate(page))
-			goto success;
-	} else if (error == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry_find;
-	}
-
-	/*
-	 * Things didn't work out. Return zero to tell the
-	 * mm layer so, possibly freeing the page cache page first.
-	 */
-err:
-	page_cache_release(page);
-
-	return NULL;
-}
-
-int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long len, pgprot_t prot, unsigned long pgoff,
-		int nonblock)
-{
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	unsigned long size;
-	struct mm_struct *mm = vma->vm_mm;
-	struct page *page;
-	int err;
-
-	if (!nonblock)
-		force_page_cache_readahead(mapping, vma->vm_file,
-					pgoff, len >> PAGE_CACHE_SHIFT);
-
-repeat:
-	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
-		return -EINVAL;
-
-	page = filemap_getpage(file, pgoff, nonblock);
-
-	/* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as
-	 * done in shmem_populate calling shmem_getpage */
-	if (!page && !nonblock)
-		return -ENOMEM;
-
-	if (page) {
-		err = install_page(mm, vma, addr, page, prot);
-		if (err) {
-			page_cache_release(page);
-			return err;
-		}
-	} else if (vma->vm_flags & VM_NONLINEAR) {
-		/* No page was found just because we can't read it in now (being
-		 * here implies nonblock != 0), but the page may exist, so set
-		 * the PTE to fault it in later. */
-		err = install_file_pte(mm, vma, addr, pgoff, prot);
-		if (err)
-			return err;
-	}
-
-	len -= PAGE_SIZE;
-	addr += PAGE_SIZE;
-	pgoff++;
-	if (len)
-		goto repeat;
-
-	return 0;
-}
-EXPORT_SYMBOL(filemap_populate);
-
 struct vm_operations_struct generic_file_vm_ops = {
 	.fault		= filemap_fault,
 };
@@ -1715,7 +1516,7 @@ int generic_file_mmap(struct file * file
 		return -ENOEXEC;
 	file_accessed(file);
 	vma->vm_ops = &generic_file_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 
Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c
+++ linux-2.6/mm/filemap_xip.c
@@ -232,8 +232,7 @@ __xip_unmap (struct address_space * mapp
  *
  * This function is derived from filemap_fault, but used for execute in place
  */
-static struct page *xip_file_fault(struct vm_area_struct *area,
-					struct fault_data *fdata)
+static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 {
 	struct file *file = area->vm_file;
 	struct address_space *mapping = file->f_mapping;
@@ -244,19 +243,15 @@ static struct page *xip_file_fault(struc
 	/* XXX: are VM_FAULT_ codes OK? */
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (fdata->pgoff >= size) {
-		fdata->type = VM_FAULT_SIGBUS;
-		return NULL;
-	}
+	if (vmf->pgoff >= size)
+		return VM_FAULT_SIGBUS;
 
 	page = mapping->a_ops->get_xip_page(mapping,
-					fdata->pgoff*(PAGE_SIZE/512), 0);
+					vmf->pgoff*(PAGE_SIZE/512), 0);
 	if (!IS_ERR(page))
 		goto out;
-	if (PTR_ERR(page) != -ENODATA) {
-		fdata->type = VM_FAULT_OOM;
-		return NULL;
-	}
+	if (PTR_ERR(page) != -ENODATA)
+		return VM_FAULT_OOM;
 
 	/* sparse block */
 	if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
@@ -264,26 +259,22 @@ static struct page *xip_file_fault(struc
 	    (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
 		/* maybe shared writable, allocate new block */
 		page = mapping->a_ops->get_xip_page(mapping,
-					fdata->pgoff*(PAGE_SIZE/512), 1);
-		if (IS_ERR(page)) {
-			fdata->type = VM_FAULT_SIGBUS;
-			return NULL;
-		}
+					vmf->pgoff*(PAGE_SIZE/512), 1);
+		if (IS_ERR(page))
+			return VM_FAULT_SIGBUS;
 		/* unmap page at pgoff from all other vmas */
-		__xip_unmap(mapping, fdata->pgoff);
+		__xip_unmap(mapping, vmf->pgoff);
 	} else {
 		/* not shared and writable, use xip_sparse_page() */
 		page = xip_sparse_page();
-		if (!page) {
-			fdata->type = VM_FAULT_OOM;
-			return NULL;
-		}
+		if (!page)
+			return VM_FAULT_OOM;
 	}
 
 out:
-	fdata->type = VM_FAULT_MINOR;
 	page_cache_get(page);
-	return page;
+	vmf->page = page;
+	return VM_FAULT_MINOR;
 }
 
 static struct vm_operations_struct xip_file_vm_ops = {
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -1827,10 +1827,10 @@ static int unmap_mapping_range_vma(struc
 
 	/*
 	 * files that support invalidating or truncating portions of the
-	 * file from under mmaped areas must set the VM_CAN_INVALIDATE flag, and
-	 * have their .nopage function return the page locked.
+	 * file from under mmaped areas must have their ->fault function
+	 * return a locked page (and FAULT_RET_LOCKED code). This provides
+	 * synchronisation against concurrent unmapping here.
 	 */
-	BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
 
 again:
 	restart_addr = vma->vm_truncate_count;
@@ -2299,63 +2299,62 @@ static int __do_fault(struct mm_struct *
 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
-	struct page *page, *faulted_page;
+	struct page *page;
 	pte_t entry;
 	int anon = 0;
 	struct page *dirty_page = NULL;
-	struct fault_data fdata;
+	struct vm_fault vmf;
+	int ret;
 
-	fdata.address = address & PAGE_MASK;
-	fdata.pgoff = pgoff;
-	fdata.flags = flags;
+	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+	vmf.pgoff = pgoff;
+	vmf.flags = flags;
+	vmf.page = NULL;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
 
 	if (likely(vma->vm_ops->fault)) {
-		fdata.type = -1;
-		faulted_page = vma->vm_ops->fault(vma, &fdata);
-		WARN_ON(fdata.type == -1);
-		if (unlikely(!faulted_page))
-			return fdata.type;
+		ret = vma->vm_ops->fault(vma, &vmf);
+		if (unlikely(ret & (VM_FAULT_ERROR | FAULT_RET_NOPAGE)))
+			return (ret & VM_FAULT_MASK);
 	} else {
 		/* Legacy ->nopage path */
-		fdata.type = VM_FAULT_MINOR;
-		faulted_page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
-								&fdata.type);
+		ret = VM_FAULT_MINOR;
+		vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
 		/* no page was available -- either SIGBUS or OOM */
-		if (unlikely(faulted_page == NOPAGE_SIGBUS))
+		if (unlikely(vmf.page == NOPAGE_SIGBUS))
 			return VM_FAULT_SIGBUS;
-		else if (unlikely(faulted_page == NOPAGE_OOM))
+		else if (unlikely(vmf.page == NOPAGE_OOM))
 			return VM_FAULT_OOM;
 	}
 
 	/*
-	 * For consistency in subsequent calls, make the faulted_page always
+	 * For consistency in subsequent calls, make the faulted page always
 	 * locked.
 	 */
-	if (unlikely(!(vma->vm_flags & VM_CAN_INVALIDATE)))
-		lock_page(faulted_page);
+	if (unlikely(!(ret & FAULT_RET_LOCKED)))
+		lock_page(vmf.page);
 	else
-		BUG_ON(!PageLocked(faulted_page));
+		VM_BUG_ON(!PageLocked(vmf.page));
 
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	page = faulted_page;
+	page = vmf.page;
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!(vma->vm_flags & VM_SHARED)) {
 			anon = 1;
 			if (unlikely(anon_vma_prepare(vma))) {
-				fdata.type = VM_FAULT_OOM;
+				ret = VM_FAULT_OOM;
 				goto out;
 			}
 			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 			if (!page) {
-				fdata.type = VM_FAULT_OOM;
+				ret = VM_FAULT_OOM;
 				goto out;
 			}
-			copy_user_highpage(page, faulted_page, address, vma);
+			copy_user_highpage(page, vmf.page, address, vma);
 		} else {
 			/*
 			 * If the page will be shareable, see if the backing
@@ -2364,8 +2363,8 @@ static int __do_fault(struct mm_struct *
 			 */
 			if (vma->vm_ops->page_mkwrite &&
 			    vma->vm_ops->page_mkwrite(vma, page) < 0) {
-				fdata.type = VM_FAULT_SIGBUS;
-				anon = 1; /* no anon but release faulted_page */
+				ret = VM_FAULT_SIGBUS;
+				anon = 1; /* no anon but release vmf.page */
 				goto out;
 			}
 		}
@@ -2417,15 +2416,15 @@ static int __do_fault(struct mm_struct *
 	pte_unmap_unlock(page_table, ptl);
 
 out:
-	unlock_page(faulted_page);
+	unlock_page(vmf.page);
 	if (anon)
-		page_cache_release(faulted_page);
+		page_cache_release(vmf.page);
 	else if (dirty_page) {
 		set_page_dirty_balance(dirty_page);
 		put_page(dirty_page);
 	}
 
-	return fdata.type;
+	return (ret & VM_FAULT_MASK);
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -2436,18 +2435,10 @@ static int do_linear_fault(struct mm_str
 			- vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
 	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
 
-	return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff,
+							flags, orig_pte);
 }
 
-static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access, pgoff_t pgoff, pte_t orig_pte)
-{
-	unsigned int flags = FAULT_FLAG_NONLINEAR |
-				(write_access ? FAULT_FLAG_WRITE : 0);
-
-	return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
-}
 
 /*
  * do_no_pfn() tries to create a new page mapping for a page without
@@ -2508,17 +2499,19 @@ static noinline int do_no_pfn(struct mm_
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		int write_access, pte_t orig_pte)
 {
+	unsigned int flags = FAULT_FLAG_NONLINEAR |
+				(write_access ? FAULT_FLAG_WRITE : 0);
 	pgoff_t pgoff;
-	int err;
 
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		return VM_FAULT_MINOR;
 
-	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
+	if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
+			!(vma->vm_flags & VM_CAN_NONLINEAR))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
@@ -2528,18 +2521,8 @@ static int do_file_page(struct mm_struct
 
 	pgoff = pte_to_pgoff(orig_pte);
 
-	if (vma->vm_ops && vma->vm_ops->fault)
-		return do_nonlinear_fault(mm, vma, address, page_table, pmd,
-					write_access, pgoff, orig_pte);
-
-	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
-	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
-					vma->vm_page_prot, pgoff, 0);
-	if (err == -ENOMEM)
-		return VM_FAULT_OOM;
-	if (err)
-		return VM_FAULT_SIGBUS;
-	return VM_FAULT_MAJOR;
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff,
+							flags, orig_pte);
 }
 
 /*
@@ -2578,7 +2561,7 @@ static inline int handle_pte_fault(struc
 						 pte, pmd, write_access);
 		}
 		if (pte_file(entry))
-			return do_file_page(mm, vma, address,
+			return do_nonlinear_fault(mm, vma, address,
 					pte, pmd, write_access, entry);
 		return do_swap_page(mm, vma, address,
 					pte, pmd, write_access, entry);
Index: linux-2.6/mm/nommu.c
===================================================================
--- linux-2.6.orig/mm/nommu.c
+++ linux-2.6/mm/nommu.c
@@ -1336,10 +1336,10 @@ int in_gate_area_no_task(unsigned long a
 	return 0;
 }
 
-struct page *filemap_fault(struct vm_area_struct *vma, struct fault_data *fdata)
+int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	BUG();
-	return NULL;
+	return 0;
 }
 
 /*
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c
+++ linux-2.6/mm/shmem.c
@@ -1303,29 +1303,21 @@ failed:
 	return error;
 }
 
-static struct page *shmem_fault(struct vm_area_struct *vma,
-					struct fault_data *fdata)
+static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
-	struct page *page = NULL;
 	int error;
+	int ret;
 
-	BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
+	if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+		return VM_FAULT_SIGBUS;
 
-	if (((loff_t)fdata->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
-		fdata->type = VM_FAULT_SIGBUS;
-		return NULL;
-	}
-
-	error = shmem_getpage(inode, fdata->pgoff, &page,
-						SGP_FAULT, &fdata->type);
-	if (error) {
-		fdata->type = ((error == -ENOMEM)?VM_FAULT_OOM:VM_FAULT_SIGBUS);
-		return NULL;
-	}
+	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret);
+	if (error)
+		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
 
-	mark_page_accessed(page);
-	return page;
+	mark_page_accessed(vmf->page);
+	return ret | FAULT_RET_LOCKED;
 }
 
 #ifdef CONFIG_NUMA
@@ -1372,7 +1364,7 @@ static int shmem_mmap(struct file *file,
 {
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 
@@ -2562,6 +2554,5 @@ int shmem_zero_setup(struct vm_area_stru
 		fput(vma->vm_file);
 	vma->vm_file = file;
 	vma->vm_ops = &shmem_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE;
 	return 0;
 }
Index: linux-2.6/fs/gfs2/ops_file.c
===================================================================
--- linux-2.6.orig/fs/gfs2/ops_file.c
+++ linux-2.6/fs/gfs2/ops_file.c
@@ -364,8 +364,6 @@ static int gfs2_mmap(struct file *file, 
 	else
 		vma->vm_ops = &gfs2_vm_ops_private;
 
-	vma->vm_flags |= VM_CAN_INVALIDATE|VM_CAN_NONLINEAR;
-
 	gfs2_glock_dq_uninit(&i_gh);
 
 	return error;
Index: linux-2.6/Documentation/feature-removal-schedule.txt
===================================================================
--- linux-2.6.orig/Documentation/feature-removal-schedule.txt
+++ linux-2.6/Documentation/feature-removal-schedule.txt
@@ -160,26 +160,8 @@ Who:	Greg Kroah-Hartman <gregkh@suse.de>
 
 ---------------------------
 
-What:	filemap_nopage, filemap_populate
-When:	April 2007
-Why:	These legacy interfaces no longer have any callers in the kernel and
-	any functionality provided can be provided with filemap_fault. The
-	removal schedule is short because they are a big maintainence burden
-	and have some bugs.
-Who:	Nick Piggin <npiggin@suse.de>
-
----------------------------
-
-What:	vm_ops.populate, install_page
-When:	April 2007
-Why:	These legacy interfaces no longer have any callers in the kernel and
-	any functionality provided can be provided with vm_ops.fault.
-Who:	Nick Piggin <npiggin@suse.de>
-
----------------------------
-
 What:	vm_ops.nopage
-When:	February 2008, provided in-kernel callers have been converted
+When:	Soon, provided in-kernel callers have been converted
 Why:	This interface is replaced by vm_ops.fault, but it has been around
 	forever, is used by a lot of drivers, and doesn't cost much to
 	maintain.
Index: linux-2.6/Documentation/filesystems/Locking
===================================================================
--- linux-2.6.orig/Documentation/filesystems/Locking
+++ linux-2.6/Documentation/filesystems/Locking
@@ -510,7 +510,7 @@ More details about quota locking can be 
 prototypes:
 	void (*open)(struct vm_area_struct*);
 	void (*close)(struct vm_area_struct*);
-	struct page *(*fault)(struct vm_area_struct*, struct fault_data *);
+	int (*fault)(struct vm_area_struct*, struct vm_fault *);
 	struct page *(*nopage)(struct vm_area_struct*, unsigned long, int *);
 
 locking rules:

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-25 11:18               ` Nick Piggin
@ 2007-05-25 16:36                 ` Linus Torvalds
  2007-05-26  7:34                   ` Nick Piggin
  0 siblings, 1 reply; 19+ messages in thread
From: Linus Torvalds @ 2007-05-25 16:36 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Benjamin Herrenschmidt, akpm, linux-mm, Christoph Hellwig


On Fri, 25 May 2007, Nick Piggin wrote:
> 
> What do you think? Any better?

Yes, I think this is getting there. It made the error returns generally 
much simpler.

That said, I think it has room for more improvement. Why not make the 
return value just be a bitmask, rather than having two separate "bytes" of 
data.

For example, you now do:

> +
> +/*
> + * VM_FAULT_ERROR is set for the error cases, to make some tests simpler.
> + */
> +#define VM_FAULT_ERROR	0x20
> +
> +#define VM_FAULT_OOM	(0x00 | VM_FAULT_ERROR)
> +#define VM_FAULT_SIGBUS	(0x01 | VM_FAULT_ERROR)
>  #define VM_FAULT_MINOR	0x02
>  #define VM_FAULT_MAJOR	0x03

And it would be so much cleaner (I think) to just realize:

 - successful VM faults are always either major or minor, so having two 
   different values for them is silly (it comes from the fact that we did 
   _not_ have a bitmask). Just make a "MAJOR" bit, and if it's clear, then 
   it's not major, of course!

 - rather than have one bit to say "we had an error", just make each error 
   be a bit of its own. We don't have that many (two, to be exact), so you 
   actually don't even use any more bits, but it means that you can do:

	#define VM_FAULT_OOM		0x0001
	#define VM_FAULT_SIGBUS		0x0002
	#define VM_FAULT_MAJOR		0x0004
	#define VM_FAULT_WRITE		0x0008

	#define VM_FAULT_NONLINEAR	0x0010
	#define VM_FAULT_NOPAGE		0x0020	/* We did our own pfn map */
	#define VM_FAULT_LOCKED		0x0040	/* Returned a locked page */

	/* Helper defines: */
	#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS)

   and you're done. No magic semantics: you just always return a set of 
   result flags.

So now the _user_ would simply do something like

	unsigned int flags;

	flags = vma->vm_ops->fault(...);
	if (flags & VM_FAULT_ERROR)
		return flags;

	if (flags & VM_FAULT_MAJOR)
		increment_major_faults();
	else
		increment_minor_faults();

	/* Did the fault handler do it all already? All done! */
	if (flags & VM_FAULT_NOPAGE)
		return 0;

	page = fault->page;
	.. install page ..

	/*
	 * If the fault handler returned a locked page, we should now 
	 * unlock it
	 */
	if (flags & VM_FAULT_LOCKED)
		unlock_page(page);

	/* All done! */
	return 0;

or something like that. Yeah, the above is simplified, but it's not 
*overly* so. And it never worries about "high bytes" and "low bytes", and 
it never worries about a certain set of bits meaning one thing, and 
another set meaning something else. Isn't that just much simpler for 
everybody?

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-25 16:36                 ` Linus Torvalds
@ 2007-05-26  7:34                   ` Nick Piggin
  2007-05-26  8:03                     ` Nick Piggin
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2007-05-26  7:34 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Benjamin Herrenschmidt, akpm, linux-mm, Christoph Hellwig

On Fri, May 25, 2007 at 09:36:26AM -0700, Linus Torvalds wrote:
> 
> 
> On Fri, 25 May 2007, Nick Piggin wrote:
> > 
> > What do you think? Any better?
> 
> Yes, I think this is getting there. It made the error returns generally 
> much simpler.
> 
> That said, I think it has room for more improvement. Why not make the 
> return value just be a bitmask, rather than having two separate "bytes" of 
> data.

Yeah, that would be really nice, but I guess that now goes out and
touches all arch code too, doesn't it? Or... actually we could just
retain compatibility by masking off the high bits, and defining some
sane definition for VM_FAULT_MINOR (seems like 0x0000 would work).

Then in a subsequent patch I can update all architectures to use bit
masks (I think Ben wanted to make some changes here too, and we could
probably consolidate a bit of code).

Actually if we have a VM_FAULT_ERROR bitmask, we can probably be a
tiny bit more efficient in the arch fault handlers as well by doing an

 if (unlikely(ret & VM_FAULT_ERROR))
     out_of_line_error_handler();


> For example, you now do:
> 
> > +
> > +/*
> > + * VM_FAULT_ERROR is set for the error cases, to make some tests simpler.
> > + */
> > +#define VM_FAULT_ERROR	0x20
> > +
> > +#define VM_FAULT_OOM	(0x00 | VM_FAULT_ERROR)
> > +#define VM_FAULT_SIGBUS	(0x01 | VM_FAULT_ERROR)
> >  #define VM_FAULT_MINOR	0x02
> >  #define VM_FAULT_MAJOR	0x03
> 
> And it would be so much cleaner (I think) to just realize:
> 
>  - successful VM faults are always either major or minor, so having two 
>    different values for them is silly (it comes from the fact that we did 
>    _not_ have a bitmask). Just make a "MAJOR" bit, and if it's clear, then 
>    it's not major, of course!
> 
>  - rather than have one bit to say "we had an error", just make each error 
>    be a bit of its own. We don't have that many (two, to be exact), so you 
>    actually don't even use any more bits, but it means that you can do:
> 
> 	#define VM_FAULT_OOM		0x0001
> 	#define VM_FAULT_SIGBUS		0x0002
> 	#define VM_FAULT_MAJOR		0x0004
> 	#define VM_FAULT_WRITE		0x0008
> 
> 	#define VM_FAULT_NONLINEAR	0x0010
> 	#define VM_FAULT_NOPAGE		0x0020	/* We did our own pfn map */
> 	#define VM_FAULT_LOCKED		0x0040	/* Returned a locked page */
> 
> 	/* Helper defines: */
> 	#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS)
> 
>    and you're done. No magic semantics: you just always return a set of 
>    result flags.
> 
> So now the _user_ would simply do something like
> 
> 	unsigned int flags;
> 
> 	flags = vma->vm_ops->fault(...);
> 	if (flags & VM_FAULT_ERROR)
> 		return flags;
> 
> 	if (flags & VM_FAULT_MAJOR)
> 		increment_major_faults();
> 	else
> 		increment_minor_faults();
> 
> 	/* Did the fault handler do it all already? All done! */
> 	if (flags & VM_FAULT_NOPAGE)
> 		return 0;
> 
> 	page = fault->page;
> 	.. install page ..
> 
> 	/*
> 	 * If the fault handler returned a locked page, we should now 
> 	 * unlock it
> 	 */
> 	if (flags & VM_FAULT_LOCKED)
> 		unlock_page(page);
> 
> 	/* All done! */
> 	return 0;
> 
> or something like that. Yeah, the above is simplified, but it's not 
> *overly* so. And it never worries about "high bytes" and "low bytes", and 
> it never worries about a certain set of bits meaning one thing, and 
> another set meaning something else. Isn't that just much simpler for 
> everybody?

Yes I think that seems nice. I'll do another inc patch.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-26  7:34                   ` Nick Piggin
@ 2007-05-26  8:03                     ` Nick Piggin
  2007-05-26 15:44                       ` Linus Torvalds
  0 siblings, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2007-05-26  8:03 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Benjamin Herrenschmidt, akpm, linux-mm, Christoph Hellwig

On Sat, May 26, 2007 at 09:34:26AM +0200, Nick Piggin wrote:
> On Fri, May 25, 2007 at 09:36:26AM -0700, Linus Torvalds wrote:
> > 
> > 
> > On Fri, 25 May 2007, Nick Piggin wrote:
> > > 
> > > What do you think? Any better?
> > 
> > Yes, I think this is getting there. It made the error returns generally 
> > much simpler.
> > 
> > That said, I think it has room for more improvement. Why not make the 
> > return value just be a bitmask, rather than having two separate "bytes" of 
> > data.
> 
> Yeah, that would be really nice, but I guess that now goes out and
> touches all arch code too, doesn't it? Or... actually we could just
> retain compatibility by masking off the high bits, and defining some
> sane definition for VM_FAULT_MINOR (seems like 0x0000 would work).

Hmm, maybe we just skip that annoying step?

Here is something of an untested mockup (incremental since the last
incremental one). It does look quite a bit cleaner, and lets us 
finally get rid of that stupid __handle_mm_fault thing.

---
Index: linux-2.6/arch/i386/mm/fault.c
===================================================================
--- linux-2.6.orig/arch/i386/mm/fault.c
+++ linux-2.6/arch/i386/mm/fault.c
@@ -303,6 +303,7 @@ fastcall void __kprobes do_page_fault(st
 	struct vm_area_struct * vma;
 	unsigned long address;
 	int write, si_code;
+	int ret;
 
 	/* get the address */
         address = read_cr2();
@@ -422,20 +423,18 @@ good_area:
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	switch (handle_mm_fault(mm, vma, address, write)) {
-		case VM_FAULT_MINOR:
-			tsk->min_flt++;
-			break;
-		case VM_FAULT_MAJOR:
-			tsk->maj_flt++;
-			break;
-		case VM_FAULT_SIGBUS:
-			goto do_sigbus;
-		case VM_FAULT_OOM:
+	ret = handle_mm_fault(mm, vma, address, write);
+	if (unlikely(ret & VM_FAULT_ERROR)) {
+		if (ret & VM_FAULT_OOM)
 			goto out_of_memory;
-		default:
-			BUG();
+		else if (ret & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
 	}
+	if (ret & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
 
 	/*
 	 * Did it hit the DOS screen memory VA from vm86 mode?
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -198,25 +198,10 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
 
 
-#define FAULT_RET_NOPAGE	0x0100	/* ->fault did not return a page. This
-					 * can be used if the handler installs
-					 * their own pte.
-					 */
-#define FAULT_RET_LOCKED	0x0200	/* ->fault locked the page, caller must
-					 * unlock after installing the mapping.
-					 * This is used by pagecache in
-					 * particular, where the page lock is
-					 * used to synchronise against truncate
-					 * and invalidate. Mutually exclusive
-					 * with FAULT_RET_NOPAGE.
-					 */
-
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
- * ->fault function. The vma's ->fault is responsible for returning the
- * VM_FAULT_xxx type which occupies the lowest byte of the return code, ORed
- * with FAULT_RET_ flags that occupy the next byte and give details about
- * how the fault was handled.
+ * ->fault function. The vma's ->fault is responsible for returning a bitmask
+ * of VM_FAULT_xxx flags that give details about how the fault was handled.
  *
  * pgoff should be used in favour of virtual_address, if possible. If pgoff
  * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear
@@ -228,9 +213,9 @@ struct vm_fault {
 	void __user *virtual_address;	/* Faulting virtual address */
 
 	struct page *page;		/* ->fault handlers should return a
-					 * page here, unless FAULT_RET_NOPAGE
+					 * page here, unless VM_FAULT_NOPAGE
 					 * is set (which is also implied by
-					 * VM_FAULT_OOM or SIGBUS).
+					 * VM_FAULT_ERROR).
 					 */
 };
 
@@ -709,26 +694,15 @@ static inline int page_mapped(struct pag
  * just gets major/minor fault counters bumped up.
  */
 
-/*
- * VM_FAULT_ERROR is set for the error cases, to make some tests simpler.
- */
-#define VM_FAULT_ERROR	0x20
+#define VM_FAULT_OOM	0x0001
+#define VM_FAULT_SIGBUS	0x0002
+#define VM_FAULT_MAJOR	0x0004
+#define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */
 
-#define VM_FAULT_OOM	(0x00 | VM_FAULT_ERROR)
-#define VM_FAULT_SIGBUS	(0x01 | VM_FAULT_ERROR)
-#define VM_FAULT_MINOR	0x02
-#define VM_FAULT_MAJOR	0x03
+#define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
+#define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 
-/* 
- * Special case for get_user_pages.
- * Must be in a distinct bit from the above VM_FAULT_ flags.
- */
-#define VM_FAULT_WRITE	0x10
-
-/*
- * Mask of VM_FAULT_ flags
- */
-#define VM_FAULT_MASK	0xff
+#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS)
 
 #define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
 
@@ -814,16 +788,8 @@ extern int vmtruncate(struct inode * ino
 extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
 
 #ifdef CONFIG_MMU
-extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma,
+extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
-
-static inline int handle_mm_fault(struct mm_struct *mm,
-			struct vm_area_struct *vma, unsigned long address,
-			int write_access)
-{
-	return __handle_mm_fault(mm, vma, address, write_access) &
-				(~VM_FAULT_WRITE);
-}
 #else
 static inline int handle_mm_fault(struct mm_struct *mm,
 			struct vm_area_struct *vma, unsigned long address,
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -1062,31 +1062,30 @@ int get_user_pages(struct task_struct *t
 			cond_resched();
 			while (!(page = follow_page(vma, start, foll_flags))) {
 				int ret;
-				ret = __handle_mm_fault(mm, vma, start,
+				ret = handle_mm_fault(mm, vma, start,
 						foll_flags & FOLL_WRITE);
+				if (ret & VM_FAULT_ERROR) {
+					if (ret & VM_FAULT_OOM)
+						return i ? i : -ENOMEM;
+					else if (ret & VM_FAULT_SIGBUS)
+						return i ? i : -EFAULT;
+					BUG();
+				}
+				if (ret & VM_FAULT_MAJOR)
+					tsk->maj_flt++;
+				else
+					tsk->min_flt++;
+
 				/*
-				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
-				 * broken COW when necessary, even if maybe_mkwrite
-				 * decided not to set pte_write. We can thus safely do
-				 * subsequent page lookups as if they were reads.
+				 * The VM_FAULT_WRITE bit tells us that
+				 * do_wp_page has broken COW when necessary,
+				 * even if maybe_mkwrite decided not to set
+				 * pte_write. We can thus safely do subsequent
+				 * page lookups as if they were reads.
 				 */
 				if (ret & VM_FAULT_WRITE)
 					foll_flags &= ~FOLL_WRITE;
-				
-				switch (ret & ~VM_FAULT_WRITE) {
-				case VM_FAULT_MINOR:
-					tsk->min_flt++;
-					break;
-				case VM_FAULT_MAJOR:
-					tsk->maj_flt++;
-					break;
-				case VM_FAULT_SIGBUS:
-					return i ? i : -EFAULT;
-				case VM_FAULT_OOM:
-					return i ? i : -ENOMEM;
-				default:
-					BUG();
-				}
+
 				cond_resched();
 			}
 			if (pages) {
@@ -1633,7 +1632,7 @@ static int do_wp_page(struct mm_struct *
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int reuse = 0, ret = VM_FAULT_MINOR;
+	int reuse = 0, ret = 0;
 	struct page *dirty_page = NULL;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
@@ -1828,8 +1827,8 @@ static int unmap_mapping_range_vma(struc
 	/*
 	 * files that support invalidating or truncating portions of the
 	 * file from under mmaped areas must have their ->fault function
-	 * return a locked page (and FAULT_RET_LOCKED code). This provides
-	 * synchronisation against concurrent unmapping here.
+	 * return a locked page (and set VM_FAULT_LOCKED in the return).
+	 * This provides synchronisation against concurrent unmapping here.
 	 */
 
 again:
@@ -2133,7 +2132,7 @@ static int do_swap_page(struct mm_struct
 	struct page *page;
 	swp_entry_t entry;
 	pte_t pte;
-	int ret = VM_FAULT_MINOR;
+	int ret = 0;
 
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		goto out;
@@ -2201,8 +2200,9 @@ static int do_swap_page(struct mm_struct
 	unlock_page(page);
 
 	if (write_access) {
+		/* XXX: We could OR the do_wp_page code with this one? */
 		if (do_wp_page(mm, vma, address,
-				page_table, pmd, ptl, pte) == VM_FAULT_OOM)
+				page_table, pmd, ptl, pte) & VM_FAULT_OOM)
 			ret = VM_FAULT_OOM;
 		goto out;
 	}
@@ -2273,7 +2273,7 @@ static int do_anonymous_page(struct mm_s
 	lazy_mmu_prot_update(entry);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
-	return VM_FAULT_MINOR;
+	return 0;
 release:
 	page_cache_release(page);
 	goto unlock;
@@ -2316,11 +2316,11 @@ static int __do_fault(struct mm_struct *
 
 	if (likely(vma->vm_ops->fault)) {
 		ret = vma->vm_ops->fault(vma, &vmf);
-		if (unlikely(ret & (VM_FAULT_ERROR | FAULT_RET_NOPAGE)))
-			return (ret & VM_FAULT_MASK);
+		if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+			return ret;
 	} else {
 		/* Legacy ->nopage path */
-		ret = VM_FAULT_MINOR;
+		ret = 0;
 		vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
 		/* no page was available -- either SIGBUS or OOM */
 		if (unlikely(vmf.page == NOPAGE_SIGBUS))
@@ -2333,7 +2333,7 @@ static int __do_fault(struct mm_struct *
 	 * For consistency in subsequent calls, make the faulted page always
 	 * locked.
 	 */
-	if (unlikely(!(ret & FAULT_RET_LOCKED)))
+	if (unlikely(!(ret & VM_FAULT_LOCKED)))
 		lock_page(vmf.page);
 	else
 		VM_BUG_ON(!PageLocked(vmf.page));
@@ -2424,7 +2424,7 @@ out:
 		put_page(dirty_page);
 	}
 
-	return (ret & VM_FAULT_MASK);
+	return ret;
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -2463,7 +2463,6 @@ static noinline int do_no_pfn(struct mm_
 	spinlock_t *ptl;
 	pte_t entry;
 	unsigned long pfn;
-	int ret = VM_FAULT_MINOR;
 
 	pte_unmap(page_table);
 	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
@@ -2475,7 +2474,7 @@ static noinline int do_no_pfn(struct mm_
 	else if (unlikely(pfn == NOPFN_SIGBUS))
 		return VM_FAULT_SIGBUS;
 	else if (unlikely(pfn == NOPFN_REFAULT))
-		return VM_FAULT_MINOR;
+		return 0;
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
@@ -2487,7 +2486,7 @@ static noinline int do_no_pfn(struct mm_
 		set_pte_at(mm, address, page_table, entry);
 	}
 	pte_unmap_unlock(page_table, ptl);
-	return ret;
+	return 0;
 }
 
 /*
@@ -2508,7 +2507,7 @@ static int do_nonlinear_fault(struct mm_
 	pgoff_t pgoff;
 
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		return VM_FAULT_MINOR;
+		return 0;
 
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
 			!(vma->vm_flags & VM_CAN_NONLINEAR))) {
@@ -2594,13 +2593,13 @@ static inline int handle_pte_fault(struc
 	}
 unlock:
 	pte_unmap_unlock(pte, ptl);
-	return VM_FAULT_MINOR;
+	return 0;
 }
 
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, int write_access)
 {
 	pgd_t *pgd;
@@ -2629,7 +2628,7 @@ int __handle_mm_fault(struct mm_struct *
 	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 }
 
-EXPORT_SYMBOL_GPL(__handle_mm_fault);
+EXPORT_SYMBOL_GPL(handle_mm_fault);
 
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -1355,9 +1355,7 @@ int filemap_fault(struct vm_area_struct 
 	struct page *page;
 	unsigned long size;
 	int did_readaround = 0;
-	int ret;
-
-	ret = VM_FAULT_MINOR;
+	int ret = 0;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (vmf->pgoff >= size)
@@ -1441,7 +1439,7 @@ retry_find:
 	 */
 	mark_page_accessed(page);
 	vmf->page = page;
-	return ret | FAULT_RET_LOCKED;
+	return ret | VM_FAULT_LOCKED;
 
 outside_data_content:
 	/*
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c
+++ linux-2.6/mm/shmem.c
@@ -1097,7 +1097,7 @@ static int shmem_getpage(struct inode *i
 		return -EFBIG;
 
 	if (type)
-		*type = VM_FAULT_MINOR;
+		*type = 0;
 
 	/*
 	 * Normally, filepage is NULL on entry, and either found
@@ -1132,9 +1132,9 @@ repeat:
 		if (!swappage) {
 			shmem_swp_unmap(entry);
 			/* here we actually do the io */
-			if (type && *type == VM_FAULT_MINOR) {
+			if (type && !(*type & VM_FAULT_MAJOR)) {
 				__count_vm_event(PGMAJFAULT);
-				*type = VM_FAULT_MAJOR;
+				*type |= VM_FAULT_MAJOR;
 			}
 			spin_unlock(&info->lock);
 			swappage = shmem_swapin(info, swap, idx);
@@ -1317,7 +1317,7 @@ static int shmem_fault(struct vm_area_st
 		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
 
 	mark_page_accessed(vmf->page);
-	return ret | FAULT_RET_LOCKED;
+	return ret | VM_FAULT_LOCKED;
 }
 
 #ifdef CONFIG_NUMA
Index: linux-2.6/mm/hugetlb.c
===================================================================
--- linux-2.6.orig/mm/hugetlb.c
+++ linux-2.6/mm/hugetlb.c
@@ -445,7 +445,7 @@ static int hugetlb_cow(struct mm_struct 
 	avoidcopy = (page_count(old_page) == 1);
 	if (avoidcopy) {
 		set_huge_ptep_writable(vma, address, ptep);
-		return VM_FAULT_MINOR;
+		return 0;
 	}
 
 	page_cache_get(old_page);
@@ -470,7 +470,7 @@ static int hugetlb_cow(struct mm_struct 
 	}
 	page_cache_release(new_page);
 	page_cache_release(old_page);
-	return VM_FAULT_MINOR;
+	return 0;
 }
 
 int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -527,7 +527,7 @@ retry:
 	if (idx >= size)
 		goto backout;
 
-	ret = VM_FAULT_MINOR;
+	ret = 0;
 	if (!pte_none(*ptep))
 		goto backout;
 
@@ -578,7 +578,7 @@ int hugetlb_fault(struct mm_struct *mm, 
 		return ret;
 	}
 
-	ret = VM_FAULT_MINOR;
+	ret = 0;
 
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
@@ -617,7 +617,7 @@ int follow_hugetlb_page(struct mm_struct
 			spin_unlock(&mm->page_table_lock);
 			ret = hugetlb_fault(mm, vma, vaddr, 0);
 			spin_lock(&mm->page_table_lock);
-			if (ret == VM_FAULT_MINOR)
+			if (!(ret & VM_FAULT_MAJOR))
 				continue;
 
 			remainder = 0;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 3/8] mm: merge nopfn into fault
  2007-05-26  8:03                     ` Nick Piggin
@ 2007-05-26 15:44                       ` Linus Torvalds
  0 siblings, 0 replies; 19+ messages in thread
From: Linus Torvalds @ 2007-05-26 15:44 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Benjamin Herrenschmidt, akpm, linux-mm, Christoph Hellwig


On Sat, 26 May 2007, Nick Piggin wrote:
> 
> Here is something of an untested mockup (incremental since the last
> incremental one). It does look quite a bit cleaner, and let's us
> finally get rid of that stupid __handle_mm_fault thing.

Yeah, I think I approve of this one.

Thanks,

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2007-05-26 15:44 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-05-18  7:37 [patch 3/8] mm: merge nopfn into fault akpm, Nick Piggin
2007-05-18 15:23 ` Linus Torvalds
2007-05-19  1:46   ` Nick Piggin
2007-05-23 23:40   ` Benjamin Herrenschmidt
2007-05-24  1:42     ` Nick Piggin
2007-05-24  2:04       ` Linus Torvalds
2007-05-24  2:16         ` Nick Piggin
2007-05-24  3:17         ` Benjamin Herrenschmidt
2007-05-24  3:26           ` Benjamin Herrenschmidt
2007-05-24  3:37             ` Linus Torvalds
2007-05-24  3:45               ` Nick Piggin
2007-05-24 10:07                 ` Christoph Hellwig
2007-05-24 10:15                   ` Benjamin Herrenschmidt
2007-05-24  3:48               ` Benjamin Herrenschmidt
2007-05-25 11:18               ` Nick Piggin
2007-05-25 16:36                 ` Linus Torvalds
2007-05-26  7:34                   ` Nick Piggin
2007-05-26  8:03                     ` Nick Piggin
2007-05-26 15:44                       ` Linus Torvalds

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox