* [patch] shared page table for hugetlb page - v2
From: Chen, Kenneth W @ 2006-09-21 0:57 UTC
To: 'Hugh Dickins', 'Andrew Morton',
'Dave McCracken'
Cc: linux-mm
Following up on the shared page table work, here is a re-post of shared
page tables for hugetlb memory. Dave's latest patch restricts page table
sharing to the pmd level in order to simplify some of the complexity for
normal pages, but that simplification cuts out all of the performance
benefit for hugetlb on x86-64 and ia32.

The following patch attempts to bring that optimization back for hugetlb
memory and allows pt sharing at the second level. It is nicely
self-contained within the hugetlb subsystem. With no impact on the
generic VM at all, I think this patch is ready for mainline consideration.

Imprecise RSS accounting is an irritating side effect of pt sharing.
After consulting several VM experts, I have tried various methods to
solve that problem: (1) iterate through all mm_structs that share the PT
and increment the count in each; (2) keep the RSS count in the page table
structure and sum the counts up at reporting time. None of the above
methods yields a satisfactory implementation.

Since process RSS accounting is purely informational, I propose we don't
count it at all for hugetlb pages. rlimit has such a field, though there
is absolutely no enforcement limiting that resource. Another option is
to account all RSS at hugetlb mmap time regardless of whether the pages
are faulted or not. I opt for the simplicity of no accounting at all.
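For reference, the workload this targets looks roughly like the sketch
below: many processes map the same hugetlbfs file MAP_SHARED at the same
offset and alignment, so the second-level page table entries for that
range are identical in every mm and can be shared. (Illustrative
userspace only; the hugetlbfs path is made up.)

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define SEG_SIZE (1UL << 30)	/* 1GB worth of huge pages */

int main(void)
{
	/* hypothetical file on a mounted hugetlbfs */
	int fd = open("/mnt/huge/shmseg", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0)
		return 1;

	/* every process performing this identical mapping can end up with
	 * its pud entry pointing at the same pmd page */
	p = mmap(NULL, SEG_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	p[0] = 1;	/* fault it in: huge_pte_alloc() runs here */
	pause();	/* keep the mapping alive */
	return 0;
}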
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
arch/i386/mm/hugetlbpage.c | 79 ++++++++++++++++++++++++++++++++++++++++++++-
mm/hugetlb.c | 14 ++++++-
2 files changed, 89 insertions(+), 4 deletions(-)
--- ./mm/hugetlb.c.orig 2006-09-19 20:42:06.000000000 -0700
+++ ./mm/hugetlb.c 2006-09-20 15:36:28.000000000 -0700
@@ -344,7 +344,6 @@
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
- add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
@@ -356,6 +355,12 @@
return -ENOMEM;
}
+__attribute__((weak))
+int huge_pte_put(struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep)
+{
+ return 0;
+}
+
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
@@ -379,13 +384,15 @@
if (!ptep)
continue;
+ if (huge_pte_put(vma, &address, ptep))
+ continue;
+
pte = huge_ptep_get_and_clear(mm, address, ptep);
if (pte_none(pte))
continue;
page = pte_page(pte);
put_page(page);
- add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
}
spin_unlock(&mm->page_table_lock);
@@ -488,7 +495,6 @@
if (!pte_none(*ptep))
goto backout;
- add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
@@ -631,6 +637,8 @@
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
+ if (huge_pte_put(vma, &address, ptep))
+ continue;
if (!pte_none(*ptep)) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(pte_modify(pte, newprot));
--- ./arch/i386/mm/hugetlbpage.c.orig 2006-09-19 20:42:06.000000000 -0700
+++ ./arch/i386/mm/hugetlbpage.c 2006-09-20 09:38:54.000000000 -0700
@@ -17,16 +17,93 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+int page_table_shareable(struct vm_area_struct *svma,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long size)
+{
+ unsigned long base = addr & ~(size - 1);
+ unsigned long end = base + size;
+
+ if (base < vma->vm_start || vma->vm_end < end)
+ return 0;
+
+ if (svma->vm_flags != vma->vm_flags ||
+ svma->vm_start != vma->vm_start ||
+ svma->vm_end != vma->vm_end)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * search for a shareable pmd page for hugetlb.
+ */
+void pmd_share(struct vm_area_struct *vma, pud_t *pud, unsigned long addr)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct prio_tree_iter iter;
+ struct vm_area_struct *svma;
+ pte_t *spte = NULL;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return;
+
+ spin_lock(&mapping->i_mmap_lock);
+ vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap,
+ vma->vm_pgoff, vma->vm_pgoff) {
+ if (svma == vma ||
+ !page_table_shareable(svma, vma, addr, PUD_SIZE))
+ continue;
+
+ spin_lock(&svma->vm_mm->page_table_lock);
+ spte = huge_pte_offset(svma->vm_mm, addr);
+ if (spte)
+ get_page(virt_to_page(spte));
+ spin_unlock(&svma->vm_mm->page_table_lock);
+ if (spte)
+ break;
+ }
+ spin_unlock(&mapping->i_mmap_lock);
+
+ if (!spte)
+ return;
+
+ spin_lock(&vma->vm_mm->page_table_lock);
+ if (pud_none(*pud))
+ pud_populate(vma->vm_mm, pud, (pmd_t *)((unsigned long) spte & PAGE_MASK));
+ else
+ put_page(virt_to_page(spte));
+ spin_unlock(&vma->vm_mm->page_table_lock);
+}
+
+int huge_pte_put(struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep)
+{
+ pgd_t *pgd = pgd_offset(vma->vm_mm, *addr);
+ pud_t *pud = pud_offset(pgd, *addr);
+
+ if (page_count(virt_to_page(ptep)) <= 1)
+ return 0;
+
+ pud_clear(pud);
+ put_page(virt_to_page(ptep));
+ *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+ return 1;
+}
+
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
+ struct vm_area_struct *vma = find_vma(mm, addr);
pgd_t *pgd;
pud_t *pud;
pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
pud = pud_alloc(mm, pgd, addr);
- if (pud)
+ if (pud) {
+ if (pud_none(*pud))
+ pmd_share(vma, pud, addr);
pte = (pte_t *) pmd_alloc(mm, pud, addr);
+ }
BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
return pte;
* Re: [patch] shared page table for hugetlb page - v2
From: Andrew Morton @ 2006-09-21 1:08 UTC
To: Chen, Kenneth W
Cc: 'Hugh Dickins', 'Dave McCracken', linux-mm
On Wed, 20 Sep 2006 17:57:33 -0700
"Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
> Following up on the shared page table work, here is a re-post of
> shared page tables for hugetlb memory.
Is that actually useful? With one single pagetable page controlling,
say, 4GB of hugepage memory, I'm surprised that there's much point in
trying to optimise it.
* RE: [patch] shared page table for hugetlb page - v2
From: Chen, Kenneth W @ 2006-09-21 1:35 UTC
To: 'Andrew Morton'
Cc: 'Hugh Dickins', 'Dave McCracken', linux-mm
Andrew Morton wrote on Wednesday, September 20, 2006 6:08 PM
> On Wed, 20 Sep 2006 17:57:33 -0700
> "Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
>
> > Following up on the shared page table work, here is a re-post of
> > shared page tables for hugetlb memory.
>
> Is that actually useful? With one single pagetable page controlling,
> say, 4GB of hugepage memory, I'm surprised that there's much point in
> trying to optimise it.
Yes, there is, when a large number of processes use one large shared memory
segment. The optimization is not really targeted at saving memory in this
case; instead, the goal of using shared PT for hugetlb is to allow faster
TLB refill and less cache pollution on a TLB miss.
Since the pte entries are shared among hundreds of processes, the cache
consumed by all the page tables is a lot smaller, and in return we get a
much higher cache hit rate for the user space application. I have
performance counter data to back that claim if people want to see the
details. The other effect is that the cache hit rate seen by the hardware
page walker will be higher too, which helps to reduce TLB miss latency.
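As a back-of-the-envelope illustration of the cache footprint argument
(the process count and segment size are made-up numbers; assuming 2MB
huge pages and 8-byte pmd entries as on PAE or x86-64):

#include <stdio.h>

int main(void)
{
	unsigned long long hpage_size = 2ULL << 20;	/* 2MB huge page */
	unsigned long long entry_size = 8;		/* bytes per pmd entry */
	unsigned long long seg_size   = 4ULL << 30;	/* 4GB shared segment */
	unsigned long long nproc      = 1000;		/* processes attached */

	unsigned long long entries  = seg_size / hpage_size;	/* 2048 */
	unsigned long long per_proc = entries * entry_size;	/* 16KB of entries */

	/* private page tables: ~16MB of pmd entries competing for cache */
	printf("private: %llu KB of pmd entries\n", nproc * per_proc >> 10);
	/* shared page table: every process walks the same 16KB */
	printf("shared:  %llu KB of pmd entries\n", per_proc >> 10);
	return 0;
}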
In Dave's implementation of PT sharing for normal pages, the performance
gain comes predominantly from reducing the memory overhead of managing
PTEs. I think cache miss rate and TLB miss latency are secondary
considerations in that scenario, though sharing should help there as well.
- Ken
* Re: [patch] shared page table for hugetlb page - v2
From: Andrew Morton @ 2006-09-22 21:21 UTC
To: Chen, Kenneth W
Cc: 'Hugh Dickins', 'Dave McCracken', linux-mm
On Wed, 20 Sep 2006 17:57:33 -0700
"Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
> Following up on the shared page table work, here is a re-post of
> shared page tables for hugetlb memory. Dave's latest patch restricts
> page table sharing to the pmd level in order to simplify some of the
> complexity for normal pages, but that simplification cuts out all of
> the performance benefit for hugetlb on x86-64 and ia32.
>
> The following patch attempts to bring that optimization back for hugetlb
> memory and allows pt sharing at the second level. It is nicely
> self-contained within the hugetlb subsystem. With no impact on the
> generic VM at all, I think this patch is ready for mainline consideration.
>
> Imprecise RSS accounting is an irritating side effect of pt sharing.
> After consulting several VM experts, I have tried various methods to
> solve that problem: (1) iterate through all mm_structs that share the PT
> and increment the count in each; (2) keep the RSS count in the page table
> structure and sum the counts up at reporting time. None of the above
> methods yields a satisfactory implementation.
>
> Since process RSS accounting is purely informational, I propose we don't
> count it at all for hugetlb pages. rlimit has such a field, though there
> is absolutely no enforcement limiting that resource. Another option is
> to account all RSS at hugetlb mmap time regardless of whether the pages
> are faulted or not. I opt for the simplicity of no accounting at all.
>
>
> +/*
> + * search for a shareable pmd page for hugetlb.
> + */
> +void pmd_share(struct vm_area_struct *vma, pud_t *pud, unsigned long addr)
> +{
> + struct address_space *mapping = vma->vm_file->f_mapping;
> + struct prio_tree_iter iter;
> + struct vm_area_struct *svma;
> + pte_t *spte = NULL;
> +
> + if (!(vma->vm_flags & VM_SHARED))
> + return;
> +
> + spin_lock(&mapping->i_mmap_lock);
> + vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap,
> + vma->vm_pgoff, vma->vm_pgoff) {
> + if (svma == vma ||
> + !page_table_shareable(svma, vma, addr, PUD_SIZE))
> + continue;
> +
> + spin_lock(&svma->vm_mm->page_table_lock);
> + spte = huge_pte_offset(svma->vm_mm, addr);
> + if (spte)
> + get_page(virt_to_page(spte));
> + spin_unlock(&svma->vm_mm->page_table_lock);
> + if (spte)
> + break;
> + }
> + spin_unlock(&mapping->i_mmap_lock);
> +
> + if (!spte)
> + return;
> +
> + spin_lock(&vma->vm_mm->page_table_lock);
> + if (pud_none(*pud))
> + pud_populate(vma->vm_mm, pud, (pmd_t *)((unsigned long) spte & PAGE_MASK));
> + else
> + put_page(virt_to_page(spte));
> + spin_unlock(&vma->vm_mm->page_table_lock);
> +}
The locking in here makes me a bit queasy. What causes *spte to still be
shareable after we've dropped i_mmap_lock?
(A patch which adds appropriate comments would be the preferred answer,
please...)
> +int huge_pte_put(struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep)
I think this function could do with a comment describing its
responsibilities.
> +{
> + pgd_t *pgd = pgd_offset(vma->vm_mm, *addr);
> + pud_t *pud = pud_offset(pgd, *addr);
> +
> + if (page_count(virt_to_page(ptep)) <= 1)
> + return 0;
And this test. It's testing the refcount of the pte page, yes? Why? What
does it mean when that refcount is zero? Bug? And when it's one? We're
the last user, so the above test is an optimisation, yes?
Please, consider your code from the point of view of someone who is trying
to come up to speed with what it's doing, and be merciful ;)
* RE: [patch] shared page table for hugetlb page - v2
From: Chen, Kenneth W @ 2006-09-22 22:53 UTC
To: 'Andrew Morton'
Cc: 'Hugh Dickins', 'Dave McCracken', linux-mm
Andrew Morton wrote on Friday, September 22, 2006 2:21 PM
> The locking in here makes me a bit queasy. What causes *spte to still be
> shareable after we've dropped i_mmap_lock?
>
> (A patch which adds appropriate comments would be the preferred answer,
> please...)
>
OK, patch attached below.
> > +int huge_pte_put(struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep)
>
> I think this function could do with a comment describing its
> responsibilities.
>
OK, comments added in the patch below.
> > +{
> > + pgd_t *pgd = pgd_offset(vma->vm_mm, *addr);
> > + pud_t *pud = pud_offset(pgd, *addr);
> > +
> > + if (page_count(virt_to_page(ptep)) <= 1)
> > + return 0;
>
> And this test. It's testing the refcount of the pte page, yes? Why? What
> does it mean when that refcount is zero? Bug? And when it's one? We're
> the last user, so the above test is an optimisation, yes?
Yes, it is testing whether the pte page is shared or not. This function
falls out if the pte page is not shared or we are the last user. The caller
of this function then iterates through each pte and unmaps the corresponding
user pages.
I've added comments in the patch as well.
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
--- ./arch/i386/mm/hugetlbpage.c.orig 2006-09-22 12:48:54.000000000 -0700
+++ ./arch/i386/mm/hugetlbpage.c 2006-09-22 13:48:12.000000000 -0700
@@ -57,6 +57,11 @@ void pmd_share(struct vm_area_struct *vm
spin_lock(&svma->vm_mm->page_table_lock);
spte = huge_pte_offset(svma->vm_mm, addr);
+ /*
+ * if a valid hugetlb pte is found, take a reference count
+ * on the pte page. We can then safely populate it into
+ * pud at a later point.
+ */
if (spte)
get_page(virt_to_page(spte));
spin_unlock(&svma->vm_mm->page_table_lock);
@@ -76,6 +81,16 @@ void pmd_share(struct vm_area_struct *vm
spin_unlock(&vma->vm_mm->page_table_lock);
}
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * The hugetlb pte page is ref counted at the time of mapping. If the pte is
+ * shared, as indicated by page_count > 1, unmap is achieved by clearing the
+ * pud and decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ * 0 the underlying pte page is not shared, or it is the last user
+ */
int huge_pte_put(struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(vma->vm_mm, *addr);
* Re: [patch] shared page table for hugetlb page - v2
From: Hugh Dickins @ 2006-09-26 20:03 UTC
To: Chen, Kenneth W
Cc: 'Andrew Morton', 'Dave McCracken', linux-mm
On Wed, 20 Sep 2006, Chen, Kenneth W wrote:
> Following up on the shared page table work, here is a re-post of
> shared page tables for hugetlb memory. Dave's latest patch restricts
> page table sharing to the pmd level in order to simplify some of the
> complexity for normal pages, but that simplification cuts out all of
> the performance benefit for hugetlb on x86-64 and ia32.
>
> The following patch attempts to bring that optimization back for hugetlb
> memory and allows pt sharing at the second level. It is nicely
> self-contained within the hugetlb subsystem. With no impact on the
> generic VM at all, I think this patch is ready for mainline consideration.
I was impressed by how small and unintrusive this patch is, and how
nicely it adheres to CodingStyle throughout. But I've spotted one
easily fixed bug, and quite a lot of raciness (depressingly, often
issues already pointed out and hopefully by now fixed in Dave's;
but one of the racinesses is already there before your patch).
Unfit for mainline until those are dealt with: though I don't think
the fixes are going to expand and complicate it terribly, so it
should remain palatable. My main fear is that the longer I look,
the more raciness I may find: it just seems hard to get shared page
table locking right; I am hoping that once it is right, it won't be
so correspondingly fragile.
>
> Imprecise RSS accounting is an irritating side effect of pt sharing.
> After consulting several VM experts, I have tried various methods to
> solve that problem: (1) iterate through all mm_structs that share the PT
> and increment the count in each; (2) keep the RSS count in the page table
> structure and sum the counts up at reporting time. None of the above
> methods yields a satisfactory implementation.
>
> Since process RSS accounting is purely informational, I propose we don't
> count it at all for hugetlb pages. rlimit has such a field, though there
> is absolutely no enforcement limiting that resource. Another option is
> to account all RSS at hugetlb mmap time regardless of whether the pages
> are faulted or not. I opt for the simplicity of no accounting at all.
I agree with your decision here for the hugetlb case (but we won't be
able to let Dave take the same easy way out). Imagine if we enforced
RSS limiting, and tried to swap pages out to make a process meet its
limit: wouldn't work on the hugepages anyway. Yes, just forget RSS.
But two things on that: next time you send the patch, better to have
a 1/2 which does all that simple RSS removal from the hugetlb code;
and please also remove the call to update_hiwater_rss() - it's doing
no harm, but it's just a waste once that RSS adjustment is gone.
>
> Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
>
>
> arch/i386/mm/hugetlbpage.c | 79 ++++++++++++++++++++++++++++++++++++++++++++-
> mm/hugetlb.c | 14 ++++++-
> 2 files changed, 89 insertions(+), 4 deletions(-)
>
> --- ./mm/hugetlb.c.orig 2006-09-19 20:42:06.000000000 -0700
> +++ ./mm/hugetlb.c 2006-09-20 15:36:28.000000000 -0700
...
> +__attribute__((weak))
> +int huge_pte_put(struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep)
> +{
> + return 0;
> +}
> +
Hmm, __attribute__((weak)) seems to be coming into fashion, I'd better get
used to it. But I think you did it that way, and your call to find_vma
in huge_pte_alloc, just to avoid mods to other arches for now: good way
to get it up and running, but for merging it would be better to update
all the hugetlb arches with the trivial mods required, than have this
weak default huge_pte_put here.
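(For anyone else coming up to speed, the mechanism is just ordinary weak
linkage; a minimal two-file sketch with a made-up hook name, nothing to
do with the patch itself:)

/* generic.c: weak default, used only if no strong definition is linked in */
__attribute__((weak)) int huge_put_hook(void)
{
	return 0;
}

/* arch.c: an architecture that implements the hook just defines it
 * normally; the linker prefers this strong definition over the weak one */
int huge_put_hook(void)
{
	return 1;
}

/* main.c */
#include <stdio.h>
int huge_put_hook(void);
int main(void)
{
	/* prints 1 when arch.c is linked in, 0 when only generic.c is */
	printf("%d\n", huge_put_hook());
	return 0;
}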
> --- ./arch/i386/mm/hugetlbpage.c.orig 2006-09-19 20:42:06.000000000 -0700
> +++ ./arch/i386/mm/hugetlbpage.c 2006-09-20 09:38:54.000000000 -0700
> @@ -17,16 +17,93 @@
> #include <asm/tlb.h>
> #include <asm/tlbflush.h>
>
> +int page_table_shareable(struct vm_area_struct *svma,
> + struct vm_area_struct *vma,
> + unsigned long addr, unsigned long size)
> +{
> + unsigned long base = addr & ~(size - 1);
> + unsigned long end = base + size;
> +
> + if (base < vma->vm_start || vma->vm_end < end)
> + return 0;
> +
> + if (svma->vm_flags != vma->vm_flags ||
> + svma->vm_start != vma->vm_start ||
> + svma->vm_end != vma->vm_end)
> + return 0;
> +
> + return 1;
> +}
Now this isn't an arch-specific function at all, is it? The size
passed in will be arch specific, but not the function itself. I
think you put it here to avoid "bloating" mm/hugetlb.c for those
who don't need it; but hugetlb users are already amongst the bloaty,
and other arches would implement soon, so I think it ought to move
there (or if you disagree, then please at least make it static here).
Later, if Dave's work goes in, then perhaps it'll move again and be
shared with his.
The bug that needs fixing is that it's making no check on vm_pgoff:
your vma_prio_tree search gives you all svmas which overlap the
first page of this vma (not quite what you want, really), but
they can easily match the conditions above without matching up
at all in vm_pgoff.
Rather than just fix that, I'd prefer you or Dave to actually get
the page_table_shareable conditions right at last: it doesn't need
the vmas to match exactly, it just needs the right permissions and
the right alignment for the page table in question. It's just better
doc of what's going on if it checks for what it's really needing.
> +
> +/*
> + * search for a shareable pmd page for hugetlb.
> + */
> +void pmd_share(struct vm_area_struct *vma, pud_t *pud, unsigned long addr)
static
> +{
> + struct address_space *mapping = vma->vm_file->f_mapping;
> + struct prio_tree_iter iter;
> + struct vm_area_struct *svma;
> + pte_t *spte = NULL;
> +
> + if (!(vma->vm_flags & VM_SHARED))
> + return;
Better to check VM_MAYSHARE instead there: the difference is that a
PROT_READ,MAP_SHARED mapping which cannot be converted to PROT_WRITE
(file was opened readonly) comes out as VM_MAYSHARE but !VM_SHARED.
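(Concretely, the case I mean is a mapping like the sketch below;
illustrative userspace only, with a hypothetical hugetlbfs path:)

#include <fcntl.h>
#include <sys/mman.h>

int main(void)
{
	/* file opened read-only: this mapping can never be made writable */
	int fd = open("/mnt/huge/shmseg", O_RDONLY);

	if (fd < 0)
		return 1;

	/* MAP_SHARED on a read-only file: the vma gets VM_MAYSHARE but not
	 * VM_SHARED, so a VM_SHARED test would wrongly refuse to share page
	 * tables for a perfectly shareable mapping */
	if (mmap(NULL, 1UL << 30, PROT_READ, MAP_SHARED, fd, 0) == MAP_FAILED)
		return 1;
	return 0;
}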
> +
> + spin_lock(&mapping->i_mmap_lock);
> + vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap,
> + vma->vm_pgoff, vma->vm_pgoff) {
> + if (svma == vma ||
> + !page_table_shareable(svma, vma, addr, PUD_SIZE))
> + continue;
No. Holding i_mmap_lock is indeed good enough to protect against racing
changes to vm_start, vm_end, vm_pgoff (since vma_adjust has to be careful
not to undermine the prio_tree without it), but it's not enough to protect
against racing changes to vm_flags (e.g. by mprotect), and that's a part
of what page_table_shareable has to check (though you might in the end
want to separate it out, if it's going to be more efficient to check the
safe ones first before getting adequate locking for vm_flags). We went
around this with Dave before; he now does down_read_trylock on mmap_sem
to secure vm_flags.
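(From memory, the shape of what Dave does now is roughly the fragment
below; a sketch of the pattern only, not his actual code:)

	/* i_mmap_lock stabilises vm_start/vm_end/vm_pgoff, but not vm_flags;
	 * take the owning mm's mmap_sem before trusting svma->vm_flags */
	if (!down_read_trylock(&svma->vm_mm->mmap_sem))
		continue;	/* cannot validate this vma safely, skip it */

	if (page_table_shareable(svma, vma, addr, PUD_SIZE))
		spte = huge_pte_offset(svma->vm_mm, addr);

	up_read(&svma->vm_mm->mmap_sem);
	if (spte)
		break;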
> +
> + spin_lock(&svma->vm_mm->page_table_lock);
> + spte = huge_pte_offset(svma->vm_mm, addr);
> + if (spte)
> + get_page(virt_to_page(spte));
> + spin_unlock(&svma->vm_mm->page_table_lock);
> + if (spte)
> + break;
> + }
> + spin_unlock(&mapping->i_mmap_lock);
> +
> + if (!spte)
> + return;
> +
> + spin_lock(&vma->vm_mm->page_table_lock);
> + if (pud_none(*pud))
> + pud_populate(vma->vm_mm, pud, (pmd_t *)((unsigned long) spte & PAGE_MASK));
> + else
> + put_page(virt_to_page(spte));
> + spin_unlock(&vma->vm_mm->page_table_lock);
> +}
No, that's another race Dave had to fix months ago. At that put_page,
all you've got is your own mm->page_table_lock: it's unlikely
but possible that you're now the sole user of that page table page
(the one you shared it from having exited meanwhile, and a racer
on your mm having done the same but picked up a different one to share).
Dave now uses the lock in the page table page; you'll probably want
to, and be able to, keep it simpler. Maybe it'll help to hold i_mmap_lock
until the end, maybe that's irrelevant and you'll just need to
huge_pte_put in case; I can't be at all sure without seeing the
totality, and there's more wrong below.
> +
> +int huge_pte_put(struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep)
> +{
> + pgd_t *pgd = pgd_offset(vma->vm_mm, *addr);
> + pud_t *pud = pud_offset(pgd, *addr);
> +
> + if (page_count(virt_to_page(ptep)) <= 1)
> + return 0;
> +
> + pud_clear(pud);
> + put_page(virt_to_page(ptep));
> + *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
> + return 1;
> +}
Doesn't "if (page_count <= 1) return 0; blah; put_page;" scream race
to you? If i_mmap_lock were held wherever this is called, you'd
be alright; but it isn't, and it'd be messy to arrange - because
unmap_hugepage_range is called with i_mmap_lock held when truncating,
but without it when just munmapping.
You may end up deciding that the easiest thing is to use i_mmap_lock
more widely, and somehow arrange things that way - though I don't
think we want to require it in the common path of mprotect.
More typical would be to manipulate the atomic count properly: you
could use the page table page's mapcount and do the extra work when
you atomic_add_negative(-1, &page->_mapcount) (as in mm/rmap.c), or
you could carry on using page_count, and do the extra work when
atomic_sub_return(1, &page->_count) == 1. Both of those involve
bumping the respective count on every hugetlb page table at that
level when it's first allocated: I don't think you can delay until
it becomes "shared"; and I don't think there's a way to do it when
page_count falls to 0; that rushes off to freeing the page
before you're ready to do so.
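(In other words, something of this shape; only a sketch of the counting
discipline, with a hypothetical teardown helper:)

	/* Option 1: use the pmd page's _mapcount, as mm/rmap.c does. Bump it
	 * when the page table page is allocated and on every share; whoever
	 * takes it back below zero does the teardown. */
	atomic_inc(&page->_mapcount);				/* alloc / share */
	if (atomic_add_negative(-1, &page->_mapcount))		/* unshare */
		teardown_huge_pmd_page(page);			/* hypothetical */

	/* Option 2: keep using page_count. get_page() on allocation and on
	 * every share; the thread that sees the count come back down to 1
	 * is the last sharer and does the teardown. */
	get_page(page);						/* alloc / share */
	if (atomic_sub_return(1, &page->_count) == 1)		/* unshare */
		teardown_huge_pmd_page(page);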
> +
> pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
> {
> + struct vm_area_struct *vma = find_vma(mm, addr);
Fine for getting this working with a minimal patch, but better to
bite the bullet and change huge_pte_alloc args to pass in vma on
all arches (either with or without mm: sometimes we use vma->vm_mm,
sometimes we pass in mm separately, there's no consistency):
the callers know the vma, better to avoid the find_vma call.
Well, no, actually copy_hugetlb_page_range knows the parent vma,
but not this child vma. But that brings me to another point,
though it was just an optimization: silly for the huge_pte_alloc
from copy_hugetlb_page_range to be doing that pmd_share search
for a suitable page table to share, when (if VM_MAYSHARE) the
parent page table is obviously good to be shared.
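(i.e. an interface along these lines; just a sketch of the signature
change meant here, not a finished proposal:)

	/* pass the vma down so the arch code can look at vm_flags, vm_file
	 * and vm_pgoff directly instead of calling find_vma() in the fault
	 * and copy paths */
	pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
			      unsigned long addr);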
> pgd_t *pgd;
> pud_t *pud;
> pte_t *pte = NULL;
>
> pgd = pgd_offset(mm, addr);
> pud = pud_alloc(mm, pgd, addr);
> - if (pud)
> + if (pud) {
> + if (pud_none(*pud))
> + pmd_share(vma, pud, addr);
> pte = (pte_t *) pmd_alloc(mm, pud, addr);
> + }
> BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
>
> return pte;
I thought for some while that there was even more raciness, but
hugetlb_fault's surprisingly global hugetlb_instantiation_mutex
protects from a lot. Maybe some of my beliefs above are
erroneous too; please don't take my word for it.

Finally, on to the raciness that was already there before your patch. You have
a problem which again we went over with Dave (in the non-huge case),
that as soon as unmap_hugepage_range or hugetlb_change_protection
has done its huge_pte_put on a shared page table, it's lost control
of the table, which might get independently freed and reused before
this thread does its flush_tlb_range - leaving other threads free to
abuse or suffer from the inappropriate page table. But even without
your patch, unmap_hugepage_range is freeing hugepages (back to the
pool) before doing any TLB flush. There needs to be more TLB care
there, and it's not your fault it's missing: either sophisticated
mmu_gather-style ordering, or earlier flush_tlb_range of each subrange.
Hugh
* RE: [patch] shared page table for hugetlb page - v2
From: Chen, Kenneth W @ 2006-09-27 8:34 UTC
To: 'Hugh Dickins'
Cc: 'Andrew Morton', 'Dave McCracken', linux-mm
Hugh Dickins wrote on Tuesday, September 26, 2006 1:03 PM
> I was impressed by how small and unintrusive this patch is, and how
> nicely it adheres to CodingStyle throughout. But I've spotted one
> easily fixed bug, and quite a lot of raciness (depressingly, often
> issues already pointed out and hopefully by now fixed in Dave's;
> but one of the racinesses is already there before your patch).
>
> Unfit for mainline until those are dealt with: though I don't think
> the fixes are going to expand and complicate it terribly, so it
> should remain palatable. My main fear is that the longer I look,
> the more raciness I may find: it just seems hard to get shared page
> table locking right; I am hoping that once it is right, it won't be
> so correspondingly fragile.
Yeah, I completely overlooked the locking for the shared page table
page, given the fact that mm_struct->page_table_lock is no longer
appropriate for protecting multiple mms that share the same page table
page. Duh, the locking needs to be done at a higher level.
Below is my new RFC patch for the locking implementation: my first cut
is to use i_mmap_lock throughout to protect these pages. I will implement
the atomic ref count later and see which one is better. Here is a rough
outline of what I did:

Change unmap_hugepage_range() to __unmap_hugepage_range() so it can be
used in the truncate path. In the munmap path, add a new
unmap_hugepage_range() that takes i_mmap_lock and then calls
__unmap_hugepage_range(). In hugetlb_change_protection(), i_mmap_lock is
added around the page table manipulation. Is this acceptable? Or am I
going to get screamed at for adding i_mmap_lock in the mprotect and
munmap paths?
> > +
> > + spin_lock(&mapping->i_mmap_lock);
> > + vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap,
> > + vma->vm_pgoff, vma->vm_pgoff) {
> > + if (svma == vma ||
> > + !page_table_shareable(svma, vma, addr, PUD_SIZE))
> > + continue;
>
> No. Holding i_mmap_lock is indeed good enough to protect against racing
> changes to vm_start, vm_end, vm_pgoff (since vma_adjust has to be careful
> not to undermine the prio_tree without it), but it's not enough to protect
> against racing changes to vm_flags (e.g. by mprotect), and that's a part
> of what page_table_shareable has to check (though you might in the end
> want to separate it out, if it's going to be more efficient to check the
> safe ones first before getting adequate locking for vm_flags). We went
> around this with Dave before; he now does down_read_trylock on mmap_sem
> to secure vm_flags.
I agree with you that vm_flags needs to be secured. But I don't see why
mmap_sem is the only qualifying candidate. Perhaps that was because of the
lack of lock protection in the unshare path in the earlier version? If I
take svma->vm_mm->page_table_lock, check for matching vm_flags and then
increment a ref count on the shared page table page, won't that be enough?
Even if another mm (call it P) changes vm_flags after the check, the ref
count will keep the page around and the pte we got will preserve the
original protection flags. And because of the ref count, P will notice the
sharing state when it unshares the page. Actually the exact timing doesn't
really matter, as P will let go of the mapping unconditionally anyway.
Along with the patch is a fix for a bug in the vma matching. It now matches
the actual backing file offset of the faulting page as well as the virtual
address. (It doesn't have to match all of the virtual address bits; I will
do more in the next rev.)
diff -Nurp linux-2.6.18/arch/i386/mm/hugetlbpage.c linux-2.6.18.ken/arch/i386/mm/hugetlbpage.c
--- linux-2.6.18/arch/i386/mm/hugetlbpage.c 2006-09-19 20:42:06.000000000 -0700
+++ linux-2.6.18.ken/arch/i386/mm/hugetlbpage.c 2006-09-26 23:42:51.000000000 -0700
@@ -17,16 +17,122 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+static int page_table_shareable(struct vm_area_struct *svma,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long idx)
+{
+ unsigned long base = addr & ~(PUD_SIZE - 1);
+ unsigned long end = base + PUD_SIZE;
+
+ unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+ svma->vm_start;
+ unsigned long sbase = saddr & ~(PUD_SIZE - 1);
+ unsigned long s_end = sbase + PUD_SIZE;
+
+ /*
+ * match the virtual addresses, permission and the alignment of the
+ * page table page.
+ */
+ if (addr != saddr || vma->vm_flags != svma->vm_flags ||
+ base < vma->vm_start || vma->vm_end < end ||
+ sbase < svma->vm_start || svma->vm_end < s_end)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * search for a shareable pmd page for hugetlb.
+ */
+static void pmd_share(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr)
+{
+ unsigned long idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct prio_tree_iter iter;
+ struct vm_area_struct *svma;
+ pte_t *spte = NULL;
+
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ spin_lock(&mapping->i_mmap_lock);
+ vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
+ if (svma == vma || !page_table_shareable(svma, vma, addr, idx))
+ continue;
+
+ /*
+ * Now that we found a suitable vma, the next step is to find a
+ * valid hugetlb pte page. Recheck svma->vm_flags with the
+ * page_table_lock held, since the check above was done
+ * without the lock.
+ */
+ spin_lock(&svma->vm_mm->page_table_lock);
+ if (vma->vm_flags == svma->vm_flags) {
+ spte = huge_pte_offset(svma->vm_mm, addr);
+ if (spte)
+ get_page(virt_to_page(spte));
+ }
+ spin_unlock(&svma->vm_mm->page_table_lock);
+ if (spte)
+ break;
+ }
+
+ if (!spte)
+ goto out;
+
+ spin_lock(&vma->vm_mm->page_table_lock);
+ if (pud_none(*pud))
+ pud_populate(vma->vm_mm, pud, (pmd_t *)((unsigned long) spte & PAGE_MASK));
+ else
+ put_page(virt_to_page(spte));
+ spin_unlock(&vma->vm_mm->page_table_lock);
+out:
+ spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * The hugetlb pte page is ref counted at the time of mapping. If the pte is
+ * shared, as indicated by page_count > 1, unmap is achieved by clearing the
+ * pud and decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_file->f_mapping->i_mmap_lock and
+ * vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ * 0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pte_put(struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep)
+{
+ pgd_t *pgd = pgd_offset(vma->vm_mm, *addr);
+ pud_t *pud = pud_offset(pgd, *addr);
+
+ if (page_count(virt_to_page(ptep)) <= 1)
+ return 0;
+
+ pud_clear(pud);
+ put_page(virt_to_page(ptep));
+ *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+ return 1;
+}
+
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
+ struct vm_area_struct *vma = find_vma(mm, addr);
pgd_t *pgd;
pud_t *pud;
pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
pud = pud_alloc(mm, pgd, addr);
- if (pud)
+ if (pud) {
+ if (pud_none(*pud))
+ pmd_share(vma, pud, addr);
pte = (pte_t *) pmd_alloc(mm, pud, addr);
+ }
BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
return pte;
diff -Nurp linux-2.6.18/fs/hugetlbfs/inode.c linux-2.6.18.ken/fs/hugetlbfs/inode.c
--- linux-2.6.18/fs/hugetlbfs/inode.c 2006-09-19 20:42:06.000000000 -0700
+++ linux-2.6.18.ken/fs/hugetlbfs/inode.c 2006-09-26 21:57:24.000000000 -0700
@@ -293,7 +293,7 @@ hugetlb_vmtruncate_list(struct prio_tree
if (h_vm_pgoff >= h_pgoff)
v_offset = 0;
- unmap_hugepage_range(vma,
+ __unmap_hugepage_range(vma,
vma->vm_start + v_offset, vma->vm_end);
}
}
diff -Nurp linux-2.6.18/include/linux/hugetlb.h linux-2.6.18.ken/include/linux/hugetlb.h
--- linux-2.6.18/include/linux/hugetlb.h 2006-09-19 20:42:06.000000000 -0700
+++ linux-2.6.18.ken/include/linux/hugetlb.h 2006-09-26 22:14:55.000000000 -0700
@@ -17,6 +17,7 @@ int hugetlb_sysctl_handler(struct ctl_ta
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
+void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
int hugetlb_report_meminfo(char *);
int hugetlb_report_node_meminfo(int, char *);
diff -Nurp linux-2.6.18/mm/hugetlb.c linux-2.6.18.ken/mm/hugetlb.c
--- linux-2.6.18/mm/hugetlb.c 2006-09-19 20:42:06.000000000 -0700
+++ linux-2.6.18.ken/mm/hugetlb.c 2006-09-26 22:50:02.000000000 -0700
@@ -344,7 +344,6 @@ int copy_hugetlb_page_range(struct mm_st
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
- add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
@@ -356,7 +355,13 @@ nomem:
return -ENOMEM;
}
-void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+__attribute__((weak))
+int huge_pte_put(struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep)
+{
+ return 0;
+}
+
+void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
@@ -370,28 +375,35 @@ void unmap_hugepage_range(struct vm_area
BUG_ON(end & ~HPAGE_MASK);
spin_lock(&mm->page_table_lock);
-
- /* Update high watermark before we lower rss */
- update_hiwater_rss(mm);
-
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
+ if (huge_pte_put(vma, &address, ptep))
+ continue;
+
pte = huge_ptep_get_and_clear(mm, address, ptep);
if (pte_none(pte))
continue;
page = pte_page(pte);
put_page(page);
- add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
}
-
spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
}
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ if (vma->vm_file) {
+ spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+ __unmap_hugepage_range(vma, start, end);
+ spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+ }
+}
+
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte)
{
@@ -488,7 +500,6 @@ retry:
if (!pte_none(*ptep))
goto backout;
- add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
@@ -626,11 +637,14 @@ void hugetlb_change_protection(struct vm
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
+ spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
spin_lock(&mm->page_table_lock);
for (; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
+ if (huge_pte_put(vma, &address, ptep))
+ continue;
if (!pte_none(*ptep)) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(pte_modify(pte, newprot));
@@ -639,6 +653,7 @@ void hugetlb_change_protection(struct vm
}
}
spin_unlock(&mm->page_table_lock);
+ spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
flush_tlb_range(vma, start, end);
}