linux-mm.kvack.org archive mirror
* [v4][PATCH 1/2] pass mm into pagewalkers
@ 2008-06-11 18:02 Dave Hansen
  2008-06-11 18:02 ` [v4][PATCH 2/2] fix large pages in pagemap Dave Hansen
  2008-06-11 19:35 ` [v4][PATCH 1/2] pass mm into pagewalkers Andrew Morton
  0 siblings, 2 replies; 15+ messages in thread
From: Dave Hansen @ 2008-06-11 18:02 UTC (permalink / raw)
  To: akpm; +Cc: Hans Rosenfeld, Matt Mackall, linux-mm, Dave Hansen

We need this at least for huge page detection for now.

It might also come in handy for some of the other
users.
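
To illustrate the new calling convention (a sketch, not part of the
diff below - "mm" and "count_walk" are made-up names): callers now
stash the mm and any private data in the mm_walk itself, so a minimal
hypothetical user looks like:

	static int count_pte(pte_t *pte, unsigned long addr,
			     unsigned long end, struct mm_walk *walk)
	{
		/* walk->private replaces the old void *private argument */
		(*(unsigned long *)walk->private)++;
		return 0;
	}

	unsigned long count = 0;
	struct mm_walk count_walk = {
		.pte_entry = count_pte,
		.mm = mm,	/* now required: the walk returns -EINVAL without it */
		.private = &count,
	};
	err = walk_page_range(vma->vm_start, vma->vm_end, &count_walk);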

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Matt Mackall <mpm@selenic.com>
---

 linux-2.6.git-dave/fs/proc/task_mmu.c |   44 +++++++++++++++++++---------------
 linux-2.6.git-dave/include/linux/mm.h |   17 ++++++-------
 linux-2.6.git-dave/mm/pagewalk.c      |   42 +++++++++++++++++---------------
 3 files changed, 56 insertions(+), 47 deletions(-)

diff -puN mm/pagewalk.c~pass-mm-into-pagewalkers mm/pagewalk.c
--- linux-2.6.git/mm/pagewalk.c~pass-mm-into-pagewalkers	2008-06-11 10:55:53.000000000 -0700
+++ linux-2.6.git-dave/mm/pagewalk.c	2008-06-11 10:55:53.000000000 -0700
@@ -3,14 +3,14 @@
 #include <linux/sched.h>
 
 static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-			  const struct mm_walk *walk, void *private)
+			  struct mm_walk *walk)
 {
 	pte_t *pte;
 	int err = 0;
 
 	pte = pte_offset_map(pmd, addr);
 	for (;;) {
-		err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private);
+		err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
 		if (err)
 		       break;
 		addr += PAGE_SIZE;
@@ -24,7 +24,7 @@ static int walk_pte_range(pmd_t *pmd, un
 }
 
 static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
-			  const struct mm_walk *walk, void *private)
+			  struct mm_walk *walk)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -35,15 +35,15 @@ static int walk_pmd_range(pud_t *pud, un
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd)) {
 			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, private);
+				err = walk->pte_hole(addr, next, walk);
 			if (err)
 				break;
 			continue;
 		}
 		if (walk->pmd_entry)
-			err = walk->pmd_entry(pmd, addr, next, private);
+			err = walk->pmd_entry(pmd, addr, next, walk);
 		if (!err && walk->pte_entry)
-			err = walk_pte_range(pmd, addr, next, walk, private);
+			err = walk_pte_range(pmd, addr, next, walk);
 		if (err)
 			break;
 	} while (pmd++, addr = next, addr != end);
@@ -52,7 +52,7 @@ static int walk_pmd_range(pud_t *pud, un
 }
 
 static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
-			  const struct mm_walk *walk, void *private)
+			  struct mm_walk *walk)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -63,15 +63,15 @@ static int walk_pud_range(pgd_t *pgd, un
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud)) {
 			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, private);
+				err = walk->pte_hole(addr, next, walk);
 			if (err)
 				break;
 			continue;
 		}
 		if (walk->pud_entry)
-			err = walk->pud_entry(pud, addr, next, private);
+			err = walk->pud_entry(pud, addr, next, walk);
 		if (!err && (walk->pmd_entry || walk->pte_entry))
-			err = walk_pmd_range(pud, addr, next, walk, private);
+			err = walk_pmd_range(pud, addr, next, walk);
 		if (err)
 			break;
 	} while (pud++, addr = next, addr != end);
@@ -85,15 +85,15 @@ static int walk_pud_range(pgd_t *pgd, un
  * @addr: starting address
  * @end: ending address
  * @walk: set of callbacks to invoke for each level of the tree
- * @private: private data passed to the callback function
  *
  * Recursively walk the page table for the memory area in a VMA,
  * calling supplied callbacks. Callbacks are called in-order (first
  * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
  * etc.). If lower-level callbacks are omitted, walking depth is reduced.
  *
- * Each callback receives an entry pointer, the start and end of the
- * associated range, and a caller-supplied private data pointer.
+ * Each callback receives an entry pointer and the start and end of the
+ * associated range, and a pointer to the original mm_walk for access
+ * to its ->private or ->mm fields.
  *
  * No locks are taken, but the bottom level iterator will map PTE
  * directories from highmem if necessary.
@@ -101,9 +101,8 @@ static int walk_pud_range(pgd_t *pgd, un
  * If any callback returns a non-zero value, the walk is aborted and
  * the return value is propagated back to the caller. Otherwise 0 is returned.
  */
-int walk_page_range(const struct mm_struct *mm,
-		    unsigned long addr, unsigned long end,
-		    const struct mm_walk *walk, void *private)
+int walk_page_range(unsigned long addr, unsigned long end,
+		    struct mm_walk *walk)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -112,21 +111,24 @@ int walk_page_range(const struct mm_stru
 	if (addr >= end)
 		return err;
 
-	pgd = pgd_offset(mm, addr);
+	if (!walk->mm)
+		return -EINVAL;
+
+	pgd = pgd_offset(walk->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd)) {
 			if (walk->pte_hole)
-				err = walk->pte_hole(addr, next, private);
+				err = walk->pte_hole(addr, next, walk);
 			if (err)
 				break;
 			continue;
 		}
 		if (walk->pgd_entry)
-			err = walk->pgd_entry(pgd, addr, next, private);
+			err = walk->pgd_entry(pgd, addr, next, walk);
 		if (!err &&
 		    (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
-			err = walk_pud_range(pgd, addr, next, walk, private);
+			err = walk_pud_range(pgd, addr, next, walk);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
diff -puN include/linux/mm.h~pass-mm-into-pagewalkers include/linux/mm.h
--- linux-2.6.git/include/linux/mm.h~pass-mm-into-pagewalkers	2008-06-11 10:55:53.000000000 -0700
+++ linux-2.6.git-dave/include/linux/mm.h	2008-06-11 10:55:53.000000000 -0700
@@ -760,16 +760,17 @@ unsigned long unmap_vmas(struct mmu_gath
  * (see walk_page_range for more details)
  */
 struct mm_walk {
-	int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *);
-	int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *);
-	int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *);
-	int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *);
-	int (*pte_hole)(unsigned long, unsigned long, void *);
+	int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *);
+	int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *);
+	int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *);
+	int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *);
+	int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *);
+	struct mm_struct *mm;
+	void *private;
 };
 
-int walk_page_range(const struct mm_struct *, unsigned long addr,
-		    unsigned long end, const struct mm_walk *walk,
-		    void *private);
+int walk_page_range(unsigned long addr, unsigned long end,
+		struct mm_walk *walk);
 void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
diff -puN fs/proc/task_mmu.c~pass-mm-into-pagewalkers fs/proc/task_mmu.c
--- linux-2.6.git/fs/proc/task_mmu.c~pass-mm-into-pagewalkers	2008-06-11 10:55:53.000000000 -0700
+++ linux-2.6.git-dave/fs/proc/task_mmu.c	2008-06-11 10:55:53.000000000 -0700
@@ -315,9 +315,9 @@ struct mem_size_stats {
 };
 
 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-			   void *private)
+			   struct mm_walk *walk)
 {
-	struct mem_size_stats *mss = private;
+	struct mem_size_stats *mss = walk->private;
 	struct vm_area_struct *vma = mss->vma;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
@@ -365,19 +365,21 @@ static int smaps_pte_range(pmd_t *pmd, u
 	return 0;
 }
 
-static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range };
-
 static int show_smap(struct seq_file *m, void *v)
 {
 	struct vm_area_struct *vma = v;
 	struct mem_size_stats mss;
 	int ret;
+	struct mm_walk smaps_walk = {
+		.pmd_entry = smaps_pte_range,
+		.mm = vma->vm_mm,
+		.private = &mss,
+	};
 
 	memset(&mss, 0, sizeof mss);
 	mss.vma = vma;
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
-		walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
-				&smaps_walk, &mss);
+		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 
 	ret = show_map(m, v);
 	if (ret)
@@ -426,9 +428,9 @@ const struct file_operations proc_smaps_
 };
 
 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
-				unsigned long end, void *private)
+				unsigned long end, struct mm_walk *walk)
 {
-	struct vm_area_struct *vma = private;
+	struct vm_area_struct *vma = walk->private;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
@@ -452,8 +454,6 @@ static int clear_refs_pte_range(pmd_t *p
 	return 0;
 }
 
-static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
-
 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
@@ -476,11 +476,17 @@ static ssize_t clear_refs_write(struct f
 		return -ESRCH;
 	mm = get_task_mm(task);
 	if (mm) {
+		static struct mm_walk clear_refs_walk;
+		memset(&clear_refs_walk, 0, sizeof(clear_refs_walk));
+		clear_refs_walk.pmd_entry = clear_refs_pte_range;
+		clear_refs_walk.mm = mm;
 		down_read(&mm->mmap_sem);
-		for (vma = mm->mmap; vma; vma = vma->vm_next)
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			clear_refs_walk.private = vma;
 			if (!is_vm_hugetlb_page(vma))
-				walk_page_range(mm, vma->vm_start, vma->vm_end,
-						&clear_refs_walk, vma);
+				walk_page_range(vma->vm_start, vma->vm_end,
+						&clear_refs_walk);
+		}
 		flush_tlb_mm(mm);
 		up_read(&mm->mmap_sem);
 		mmput(mm);
@@ -538,9 +544,9 @@ static int add_to_pagemap(unsigned long 
 }
 
 static int pagemap_pte_hole(unsigned long start, unsigned long end,
-				void *private)
+				struct mm_walk *walk)
 {
-	struct pagemapread *pm = private;
+	struct pagemapread *pm = walk->private;
 	unsigned long addr;
 	int err = 0;
 	for (addr = start; addr < end; addr += PAGE_SIZE) {
@@ -558,9 +564,9 @@ static u64 swap_pte_to_pagemap_entry(pte
 }
 
 static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-			     void *private)
+			     struct mm_walk *walk)
 {
-	struct pagemapread *pm = private;
+	struct pagemapread *pm = walk->private;
 	pte_t *pte;
 	int err = 0;
 
@@ -685,8 +691,8 @@ static ssize_t pagemap_read(struct file 
 		 * user buffer is tracked in "pm", and the walk
 		 * will stop when we hit the end of the buffer.
 		 */
-		ret = walk_page_range(mm, start_vaddr, end_vaddr,
-					&pagemap_walk, &pm);
+		ret = walk_page_range(start_vaddr, end_vaddr,
+					&pagemap_walk);
 		if (ret == PM_END_OF_BUFFER)
 			ret = 0;
 		/* don't need mmap_sem for these, but this looks cleaner */
_


* [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-11 18:02 [v4][PATCH 1/2] pass mm into pagewalkers Dave Hansen
@ 2008-06-11 18:02 ` Dave Hansen
  2008-06-11 19:37   ` Andrew Morton
  2008-06-11 19:35 ` [v4][PATCH 1/2] pass mm into pagewalkers Andrew Morton
  1 sibling, 1 reply; 15+ messages in thread
From: Dave Hansen @ 2008-06-11 18:02 UTC (permalink / raw)
  To: akpm; +Cc: Hans Rosenfeld, Matt Mackall, linux-mm, Dave Hansen

We were walking right into huge page areas in the pagemap
walker, and calling the pmds pmd_bad() and clearing them.

That leaked huge pages.  Bad.

This patch at least works around that for now.  It ignores
huge pages in the pagemap walker for the time being, and
won't leak those pages.
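
For reference, the leak came from the generic walker's sanity check
(walk_pmd_range() in mm/pagewalk.c, shown in patch 1/2):

	do {
		next = pmd_addr_end(addr, end);
		/* a hugetlb pmd is neither none nor a normal page-table
		 * pointer, so it can look "bad" here and get cleared,
		 * orphaning the huge page */
		if (pmd_none_or_clear_bad(pmd)) {
			...
		}
		...
	} while (pmd++, addr = next, addr != end);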

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Matt Mackall <mpm@selenic.com>
---

 linux-2.6.git-dave/fs/proc/task_mmu.c |   39 ++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 9 deletions(-)

diff -puN fs/proc/task_mmu.c~fix-large-pages-in-pagemap fs/proc/task_mmu.c
--- linux-2.6.git/fs/proc/task_mmu.c~fix-large-pages-in-pagemap	2008-06-11 10:59:29.000000000 -0700
+++ linux-2.6.git-dave/fs/proc/task_mmu.c	2008-06-11 10:59:29.000000000 -0700
@@ -563,24 +563,45 @@ static u64 swap_pte_to_pagemap_entry(pte
 	return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
 }
 
+static unsigned long pte_to_pagemap_entry(pte_t pte)
+{
+	unsigned long pme = 0;
+	if (is_swap_pte(pte))
+		pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte))
+			| PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
+	else if (pte_present(pte))
+		pme = PM_PFRAME(pte_pfn(pte))
+			| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
+	return pme;
+}
+
 static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			     struct mm_walk *walk)
 {
+	struct vm_area_struct *vma;
 	struct pagemapread *pm = walk->private;
 	pte_t *pte;
 	int err = 0;
 
+	/* find the first VMA at or above 'addr' */
+	vma = find_vma(walk->mm, addr);
 	for (; addr != end; addr += PAGE_SIZE) {
 		u64 pfn = PM_NOT_PRESENT;
-		pte = pte_offset_map(pmd, addr);
-		if (is_swap_pte(*pte))
-			pfn = PM_PFRAME(swap_pte_to_pagemap_entry(*pte))
-				| PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
-		else if (pte_present(*pte))
-			pfn = PM_PFRAME(pte_pfn(*pte))
-				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
-		/* unmap so we're not in atomic when we copy to userspace */
-		pte_unmap(pte);
+
+		/* check to see if we've left 'vma' behind
+		 * and need a new, higher one */
+		if (vma && (addr >= vma->vm_end))
+			vma = find_vma(walk->mm, addr);
+
+		/* check that 'vma' actually covers this address,
+		 * and that it isn't a huge page vma */
+		if (vma && (vma->vm_start <= addr) &&
+		    !is_vm_hugetlb_page(vma)) {
+			pte = pte_offset_map(pmd, addr);
+			pfn = pte_to_pagemap_entry(*pte);
+			/* unmap before userspace copy */
+			pte_unmap(pte);
+		}
 		err = add_to_pagemap(addr, pfn, pm);
 		if (err)
 			return err;
diff -puN mm/pagewalk.c~fix-large-pages-in-pagemap mm/pagewalk.c
_


* Re: [v4][PATCH 1/2] pass mm into pagewalkers
  2008-06-11 18:02 [v4][PATCH 1/2] pass mm into pagewalkers Dave Hansen
  2008-06-11 18:02 ` [v4][PATCH 2/2] fix large pages in pagemap Dave Hansen
@ 2008-06-11 19:35 ` Andrew Morton
  1 sibling, 0 replies; 15+ messages in thread
From: Andrew Morton @ 2008-06-11 19:35 UTC (permalink / raw)
  To: Dave Hansen; +Cc: akpm, hans.rosenfeld, mpm, linux-mm

On Wed, 11 Jun 2008 11:02:29 -0700
Dave Hansen <dave@linux.vnet.ibm.com> wrote:

> 
> We need this at least for huge page detection for now.
> 
> It might also come in handy for some of the other
> users.
> 

Changelog fails to explain why the `walk' argument was deconstified and
I couldn't immediately work this out from the diff.


> +	struct mm_walk smaps_walk = {
> +		.pmd_entry = smaps_pte_range,
> +		.mm = vma->vm_mm,
> +		.private = &mss,
> +	};

a)

>  	if (mm) {
> +		static struct mm_walk clear_refs_walk;
> +		memset(&clear_refs_walk, 0, sizeof(clear_refs_walk));
> +		clear_refs_walk.pmd_entry = clear_refs_pte_range;
> +		clear_refs_walk.mm = mm;

b)

where a) != b).  a) looks nicer.
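
For example, b) recast in style a) - just a sketch, not what this
patch does - would read:

	struct mm_walk clear_refs_walk = {
		.pmd_entry = clear_refs_pte_range,
		.mm = mm,
	};

with clear_refs_walk.private = vma still set inside the vma loop.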

Please do prefer to put a blank line between end-of-locals and
start-of-code.  It does improve readability.



* Re: [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-11 18:02 ` [v4][PATCH 2/2] fix large pages in pagemap Dave Hansen
@ 2008-06-11 19:37   ` Andrew Morton
  2008-06-11 19:53     ` Matt Mackall
  0 siblings, 1 reply; 15+ messages in thread
From: Andrew Morton @ 2008-06-11 19:37 UTC (permalink / raw)
  To: Dave Hansen; +Cc: hans.rosenfeld, mpm, linux-mm

On Wed, 11 Jun 2008 11:02:31 -0700
Dave Hansen <dave@linux.vnet.ibm.com> wrote:

> 
> We were walking right into huge page areas in the pagemap
> walker, and calling the pmds pmd_bad() and clearing them.
> 
> That leaked huge pages.  Bad.
> 
> This patch at least works around that for now.  It ignores
> huge pages in the pagemap walker for the time being, and
> won't leak those pages.
> 

I don't get it.   Why can't we just stick a

	if (pmd_huge(pmd))
		continue;

into pagemap_pte_range()?  Or something like that.



* Re: [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-11 19:37   ` Andrew Morton
@ 2008-06-11 19:53     ` Matt Mackall
  2008-06-11 20:11       ` Andrew Morton
  2008-06-11 20:15       ` Dave Hansen
  0 siblings, 2 replies; 15+ messages in thread
From: Matt Mackall @ 2008-06-11 19:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Dave Hansen, hans.rosenfeld, linux-mm, Hugh Dickins

[adding Hugh to the cc:]

On Wed, 2008-06-11 at 12:37 -0700, Andrew Morton wrote:
> On Wed, 11 Jun 2008 11:02:31 -0700
> Dave Hansen <dave@linux.vnet.ibm.com> wrote:
> 
> > 
> > We were walking right into huge page areas in the pagemap
> > walker, and calling the pmds pmd_bad() and clearing them.
> > 
> > That leaked huge pages.  Bad.
> > 
> > This patch at least works around that for now.  It ignores
> > huge pages in the pagemap walker for the time being, and
> > won't leak those pages.
> > 
> 
> I don't get it.   Why can't we just stick a
> 
> 	if (pmd_huge(pmd))
> 		continue;
> 
> into pagemap_pte_range()?  Or something like that.

That's certainly what you'd hope to be able to do, yes.

If I recall the earlier discussion, some arches with huge pages can only
identify them via a VMA. Obviously, any arch with hardware that walks
our pagetables directly must be able to identify huge pages directly
from those tables, but I think PPC and a couple others that don't have
hardware TLB fill fail to store such a bit in the tables at all.

-- 
Mathematics is the supreme nostalgia of our time.


* Re: [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-11 19:53     ` Matt Mackall
@ 2008-06-11 20:11       ` Andrew Morton
  2008-06-11 20:21         ` Matt Mackall
  2008-06-11 20:34         ` Dave Hansen
  2008-06-11 20:15       ` Dave Hansen
  1 sibling, 2 replies; 15+ messages in thread
From: Andrew Morton @ 2008-06-11 20:11 UTC (permalink / raw)
  To: Matt Mackall; +Cc: dave, hans.rosenfeld, linux-mm, hugh, Rik van Riel

On Wed, 11 Jun 2008 14:53:00 -0500
Matt Mackall <mpm@selenic.com> wrote:

> [adding Hugh to the cc:]
> 
> On Wed, 2008-06-11 at 12:37 -0700, Andrew Morton wrote:
> > On Wed, 11 Jun 2008 11:02:31 -0700
> > Dave Hansen <dave@linux.vnet.ibm.com> wrote:
> > 
> > > 
> > > We were walking right into huge page areas in the pagemap
> > > walker, and calling the pmds pmd_bad() and clearing them.
> > > 
> > > That leaked huge pages.  Bad.
> > > 
> > > This patch at least works around that for now.  It ignores
> > > huge pages in the pagemap walker for the time being, and
> > > won't leak those pages.
> > > 
> > 
> > I don't get it.   Why can't we just stick a
> > 
> > 	if (pmd_huge(pmd))
> > 		continue;
> > 
> > into pagemap_pte_range()?  Or something like that.
> 
> That's certainly what you'd hope to be able to do, yes.
> 
> If I recall the earlier discussion, some arches with huge pages can only
> identify them via a VMA. Obviously, any arch with hardware that walks
> our pagetables directly must be able to identify huge pages directly
> from those tables, but I think PPC and a couple others that don't have
> hardware TLB fill fail to store such a bit in the tables at all.

Really?  There are already a couple of pmd_huge() tests in mm/memory.c and
Rik's access_process_vm-device-memory-infrastructure.patch adds another
one.


* Re: [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-11 19:53     ` Matt Mackall
  2008-06-11 20:11       ` Andrew Morton
@ 2008-06-11 20:15       ` Dave Hansen
  1 sibling, 0 replies; 15+ messages in thread
From: Dave Hansen @ 2008-06-11 20:15 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Andrew Morton, hans.rosenfeld, linux-mm, Hugh Dickins,
	Adam Litke,
	Nishanth Aravamudan

On Wed, 2008-06-11 at 14:53 -0500, Matt Mackall wrote:
> 
> > I don't get it.   Why can't we just stick a
> > 
> >       if (pmd_huge(pmd))
> >               continue;
> > 
> > into pagemap_pte_range()?  Or something like that.
> 
> That's certainly what you'd hope to be able to do, yes.
> 
> If I recall the earlier discussion, some arches with huge pages can
> only
> identify them via a VMA. Obviously, any arch with hardware that walks
> our pagetables directly must be able to identify huge pages directly
> from those tables, but I think PPC and a couple others that don't have
> hardware TLB fill fail to store such a bit in the tables at all.

Yeah, the ppc (and more) huge pmd entries are just the address of the
huge page.  I would love to get all of them converted so we could use
pmd_huge() on them, eventually.  But, that's a much bigger undertaking.

-- Dave


* Re: [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-11 20:11       ` Andrew Morton
@ 2008-06-11 20:21         ` Matt Mackall
  2008-06-11 20:34         ` Dave Hansen
  1 sibling, 0 replies; 15+ messages in thread
From: Matt Mackall @ 2008-06-11 20:21 UTC (permalink / raw)
  To: Andrew Morton; +Cc: dave, hans.rosenfeld, linux-mm, hugh, Rik van Riel

On Wed, 2008-06-11 at 13:11 -0700, Andrew Morton wrote:
> On Wed, 11 Jun 2008 14:53:00 -0500
> Matt Mackall <mpm@selenic.com> wrote:
> 
> > [adding Hugh to the cc:]
> > 
> > On Wed, 2008-06-11 at 12:37 -0700, Andrew Morton wrote:
> > > On Wed, 11 Jun 2008 11:02:31 -0700
> > > Dave Hansen <dave@linux.vnet.ibm.com> wrote:
> > > 
> > > > 
> > > > We were walking right into huge page areas in the pagemap
> > > > walker, and calling the pmds pmd_bad() and clearing them.
> > > > 
> > > > That leaked huge pages.  Bad.
> > > > 
> > > > This patch at least works around that for now.  It ignores
> > > > huge pages in the pagemap walker for the time being, and
> > > > won't leak those pages.
> > > > 
> > > 
> > > I don't get it.   Why can't we just stick a
> > > 
> > > 	if (pmd_huge(pmd))
> > > 		continue;
> > > 
> > > into pagemap_pte_range()?  Or something like that.
> > 
> > That's certainly what you'd hope to be able to do, yes.
> > 
> > If I recall the earlier discussion, some arches with huge pages can only
> > identify them via a VMA. Obviously, any arch with hardware that walks
> > our pagetables directly must be able to identify huge pages directly
> > from those tables, but I think PPC and a couple others that don't have
> > hardware TLB fill fail to store such a bit in the tables at all.
> 
> Really?  There are already a couple of pmd_huge() tests in mm/memory.c and
> Rik's access_process_vm-device-memory-infrastructure.patch adds another
> one.

Quoting Hugh:

A pmd_huge(*pmd) test is tempting, but it only ever says "yes" on x86:
we've carefully left it undefined what happens to the pgd/pud/pmd/pte
hierarchy in the general arch case, once you're amongst hugepages.

-- 
Mathematics is the supreme nostalgia of our time.


* Re: [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-11 20:11       ` Andrew Morton
  2008-06-11 20:21         ` Matt Mackall
@ 2008-06-11 20:34         ` Dave Hansen
  2008-06-11 20:52           ` Andrew Morton
  1 sibling, 1 reply; 15+ messages in thread
From: Dave Hansen @ 2008-06-11 20:34 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Matt Mackall, hans.rosenfeld, linux-mm, hugh, Rik van Riel

On Wed, 2008-06-11 at 13:11 -0700, Andrew Morton wrote:
> Really?  There are already a couple of pmd_huge() tests in mm/memory.c and
> Rik's access_process_vm-device-memory-infrastructure.patch adds
> another one.

We're not supposed to ever hit the one in follow_page() because there
are:

                if (is_vm_hugetlb_page(vma)) {
                        i = follow_hugetlb_page(mm, vma, pages, vmas,
                                                &start, &len, i, write);
                        continue;
                }

checks before them, as in get_user_pages().

The other mm/memory.c call is under alloc_vm_area(), and that's
supposedly only used on kernel addresses.  I don't think we even have
Linux pagetables for kernel addresses on ppc.

-- Dave


* Re: [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-11 20:34         ` Dave Hansen
@ 2008-06-11 20:52           ` Andrew Morton
  2008-06-11 21:01             ` Rik van Riel
  2008-06-11 21:23             ` Dave Hansen
  0 siblings, 2 replies; 15+ messages in thread
From: Andrew Morton @ 2008-06-11 20:52 UTC (permalink / raw)
  To: Dave Hansen; +Cc: mpm, hans.rosenfeld, linux-mm, hugh, riel

On Wed, 11 Jun 2008 13:34:22 -0700
Dave Hansen <dave@linux.vnet.ibm.com> wrote:

> On Wed, 2008-06-11 at 13:11 -0700, Andrew Morton wrote:
> > Really?  There are already a couple of pmd_huge() tests in mm/memory.c and
> > Rik's access_process_vm-device-memory-infrastructure.patch adds
> > another one.
> 
> We're not supposed to ever hit the one in follow_page() because there
> are:
> 
>                 if (is_vm_hugetlb_page(vma)) {
>                         i = follow_hugetlb_page(mm, vma, pages, vmas,
>                                                 &start, &len, i, write);
>                         continue;
>                 }
> 
> checks before them, as in get_user_pages().
> 
> The other mm/memory.c call is under alloc_vm_area(), and that's
> supposedly only used on kernel addresses.  I don't think we even have
> Linux pagetables for kernel addresses on ppc.
> 

access_process_vm-device-memory-infrastructure.patch is a powerpc
feature, and it uses pmd_huge().

Am I missing something, or is pmd_huge() a whopping big grenade for x86
developers to toss at non-x86 architectures?  It seems quite dangerous.


* Re: [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-11 20:52           ` Andrew Morton
@ 2008-06-11 21:01             ` Rik van Riel
  2008-06-11 21:23             ` Dave Hansen
  1 sibling, 0 replies; 15+ messages in thread
From: Rik van Riel @ 2008-06-11 21:01 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Dave Hansen, mpm, hans.rosenfeld, linux-mm, hugh

On Wed, 11 Jun 2008 13:52:07 -0700
Andrew Morton <akpm@linux-foundation.org> wrote:

> access_process_vm-device-memory-infrastructure.patch is a powerpc
> feature, and it uses pmd_huge().
> 
> Am I missing something, or is pmd_huge() a whopping big grenade for x86
> developers to toss at non-x86 architectures?  It seems quite dangerous.

That function is used on x86 too, to access device memory that's
been mapped through /dev/mem or PCI thingies under /sys.

The X.org people need that patch on x86 to figure out what's in
the GPU's queue before it threw a fit; in short, to better debug
the X server.

-- 
All rights reversed.


* Re: [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-11 20:52           ` Andrew Morton
  2008-06-11 21:01             ` Rik van Riel
@ 2008-06-11 21:23             ` Dave Hansen
  2008-06-11 22:37               ` Matt Mackall
  1 sibling, 1 reply; 15+ messages in thread
From: Dave Hansen @ 2008-06-11 21:23 UTC (permalink / raw)
  To: Andrew Morton; +Cc: mpm, hans.rosenfeld, linux-mm, hugh, riel, nacc, Adam Litke

On Wed, 2008-06-11 at 13:52 -0700, Andrew Morton wrote:
> access_process_vm-device-memory-infrastructure.patch is a powerpc
> feature, and it uses pmd_huge().

I think that's bogus.  It probably needs to check the VMA in
generic_access_phys() if it wants to be safe.  I don't see any way that
pmd_huge() can give anything back other than 0 on ppc:

arch/powerpc/mm/hugetlbpage.c:

	int pmd_huge(pmd_t pmd)
	{
	        return 0;
	}

or in include/linux/hugetlb.h:

	#define pmd_huge(x)     0
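
(For contrast, the x86 version does report huge pmds by testing the
PSE bit - roughly:

	int pmd_huge(pmd_t pmd)
	{
		return !!(pmd_val(pmd) & _PAGE_PSE);
	}

which is why the helper looks generic but only means anything on x86.)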

> Am I missing something, or is pmd_huge() a whopping big grenade for x86
> developers to toss at non-x86 architectures?  It seems quite dangerous.

Yeah, it isn't really usable outside of arch code, although it kinda
looks like it.

-- Dave


* Re: [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-11 21:23             ` Dave Hansen
@ 2008-06-11 22:37               ` Matt Mackall
  2008-06-12 21:36                 ` Hugh Dickins
  0 siblings, 1 reply; 15+ messages in thread
From: Matt Mackall @ 2008-06-11 22:37 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Andrew Morton, hans.rosenfeld, linux-mm, hugh, riel, nacc, Adam Litke

On Wed, 2008-06-11 at 14:23 -0700, Dave Hansen wrote:
> On Wed, 2008-06-11 at 13:52 -0700, Andrew Morton wrote:
> > access_process_vm-device-memory-infrastructure.patch is a powerpc
> > feature, and it uses pmd_huge().
> 
> I think that's bogus.  It probably needs to check the VMA in
> generic_access_phys() if it wants to be safe.  I don't see any way that
> pmd_huge() can give anything back other than 0 on ppc:
> 
> arch/powerpc/mm/hugetlbpage.c:
> 
> 	int pmd_huge(pmd_t pmd)
> 	{
> 	        return 0;
> 	}
> 
> or in include/linux/hugetlb.h:
> 
> 	#define pmd_huge(x)     0
> 
> > Am I missing something, or is pmd_huge() a whopping big grenade for x86
> > developers to toss at non-x86 architectures?  It seems quite dangerous.
> 
> Yeah, it isn't really usable outside of arch code, although it kinda
> looks like it.

That begs the question: if we can't use it reliably outside of arch
code, why do other arches even bother defining it?

And the answer seems to be because of the two uses in mm/memory.c. The
first seems like it could be avoided with an implementation of
follow_huge_addr on x86. The second is either bogus (only works on x86)
or superfluous (not needed at all), no?

-- 
Mathematics is the supreme nostalgia of our time.


* Re: [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-11 22:37               ` Matt Mackall
@ 2008-06-12 21:36                 ` Hugh Dickins
  2008-06-12 22:35                   ` Matt Mackall
  0 siblings, 1 reply; 15+ messages in thread
From: Hugh Dickins @ 2008-06-12 21:36 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Dave Hansen, Andrew Morton, hans.rosenfeld, linux-mm, riel, nacc,
	Adam Litke

On Wed, 11 Jun 2008, Matt Mackall wrote:
> On Wed, 2008-06-11 at 14:23 -0700, Dave Hansen wrote:
> > On Wed, 2008-06-11 at 13:52 -0700, Andrew Morton wrote:
> > 
> > > Am I missing something, or is pmd_huge() a whopping big grenade for x86
> > > developers to toss at non-x86 architectures?  It seems quite dangerous.
> > 
> > Yeah, it isn't really usable outside of arch code, although it kinda
> > looks like it.
> 
> That begs the question: if we can't use it reliably outside of arch
> code, why do other arches even bother defining it?

Good question.

> And the answer seems to be because of the two uses in mm/memory.c. The
> first seems like it could be avoided with an implementation of
> follow_huge_addr on x86.

No, I don't think we need even that: because get_user_pages avoids
follow_page on huge vmas, and little else uses follow_page, I think
we could delete follow_huge_addr from all architectures.

It gives a warm glow to know that follow_page could cope with
huge ranges if necessary, but it doesn't feel so good once you've
been misled by pmd_huge.

Incidentally, x86 turns out to have a pmd_large() too!

> The second is either bogus (only works on x86)
> or superfluous (not needed at all), no?

Not needed (checking for things that should never happen
is nice when it's convenient, but not a reason to uglify).

On Dave's patch: yes, I too would have preferred a pmd_huge-style
vma-less approach, but fear that doesn't work out at present.  But
isn't the patch calling find_vma() much more often than necessary?
Is there any architecture which can mix huge and normal pages
within the lowest pagetable?

Hugh


* Re: [v4][PATCH 2/2] fix large pages in pagemap
  2008-06-12 21:36                 ` Hugh Dickins
@ 2008-06-12 22:35                   ` Matt Mackall
  0 siblings, 0 replies; 15+ messages in thread
From: Matt Mackall @ 2008-06-12 22:35 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Dave Hansen, Andrew Morton, hans.rosenfeld, linux-mm, riel, nacc,
	Adam Litke

On Thu, 2008-06-12 at 22:36 +0100, Hugh Dickins wrote:
> On Wed, 11 Jun 2008, Matt Mackall wrote:
> > On Wed, 2008-06-11 at 14:23 -0700, Dave Hansen wrote:
> > > On Wed, 2008-06-11 at 13:52 -0700, Andrew Morton wrote:
> > > 
> > > > Am I missing something, or is pmd_huge() a whopping big grenade for x86
> > > > developers to toss at non-x86 architectures?  It seems quite dangerous.
> > > 
> > > Yeah, it isn't really usable outside of arch code, although it kinda
> > > looks like it.
> > 
> > That begs the question: if we can't use it reliably outside of arch
> > code, why do other arches even bother defining it?
> 
> Good question.
> 
> > And the answer seems to be because of the two uses in mm/memory.c. The
> > first seems like it could be avoided with an implementation of
> > follow_huge_addr on x86.
> 
> No, I don't think we need even that: because get_user_pages avoids
> follow_page on huge vmas, and little else uses follow_page, I think
> we could delete follow_huge_addr from all architectures.
> 
> It gives a warm glow to know that follow_page could cope with
> huge ranges if necessary, but it doesn't feel so good once you've
> been misled by pmd_huge.

Since follow_page gets a VMA, it could check.

> Incidentally, x86 turns out to have a pmd_large() too!

As does mn10300 - surely that's a mistake?

> > The second is either bogus (only works on x86)
> > or superfluous (not needed at all), no?
> 
> Not needed (checking for things that should never happen
> is nice when it's convenient, but not a reason to uglify).
> 
> On Dave's patch: yes, I too would have preferred a pmd_huge-style
> vma-less approach, but fear that doesn't work out at present.  But
> isn't the patch calling find_vma() much more often than necessary?

Well not at the level of the function in question. But it could probably
cache a current VMA value across the whole walk in the walk structure.
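
A sketch of that idea - purely hypothetical, caching the vma in a new
field (say pm->vma) rather than anything in the current patch:

	/* in pagemap_pte_range(): refresh only when we leave the cached vma */
	if (!pm->vma || addr >= pm->vma->vm_end)
		pm->vma = find_vma(walk->mm, addr);

so successive calls for the same vma skip the rbtree lookup.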

> Is there any architecture which can mix huge and normal pages
> within the lowest pagetable?

Not sure how that would work?

-- 
Mathematics is the supreme nostalgia of our time.

