* munmap extremely slow even with untouched mapping.
@ 2005-10-28 1:37 Robin Holt
2005-10-28 10:45 ` Nick Piggin
0 siblings, 1 reply; 8+ messages in thread
From: Robin Holt @ 2005-10-28 1:37 UTC (permalink / raw)
To: linux-kernel, linux-mm
This is going out without any testing. I got it to compile but never
even booted the kernel.
I noticed that on ia64, the following simple program would take 24
seconds to complete. Profiling revealed I was spending all that time
in unmap_vmas. Looking over the function, I noticed that it would only
attempt 8 pages at a time (CONFIG_PREEMPT). I then thought through this
some more. My particular application had one large mapping which was
never touched after being mmapped.

Without this patch, we would iterate 256 times to get past a region of
memory covered by an empty pmd, 524,288 times to get past an empty pud,
and 1,073,741,824 times to get past an empty pgd. I had a 4-level page
table patch applied at the time.
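For reference, those counts follow directly from ZAP_BLOCK_SIZE (8 pages
with CONFIG_PREEMPT) and, assuming the usual ia64 setup of 16KB pages with
2048 entries per page-table level, work out like this:

/*
 * Assuming 16KB pages, 2048 entries per level, and
 * ZAP_BLOCK_SIZE = 8 * PAGE_SIZE = 128KB per unmap_page_range call:
 *
 *   pmd span = 2048 * 16KB  =  32MB  ->  32MB / 128KB = 256 calls
 *   pud span = 2048 * 32MB  =  64GB  ->  64GB / 128KB = 524,288 calls
 *   pgd span = 2048 * 64GB  = 128TB  -> 128TB / 128KB = 1,073,741,824 calls
 */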
Here is the test program:
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <stdlib.h>
#include <stdio.h>
#define TMPFILE "/tmp/holt-dummy-mmap"
int main(int argc, char **argv)
{
        int fd;
        char *memmap_base;

        fd = open(TMPFILE, O_RDWR | O_CREAT | O_EXCL, 0600);
        if (fd < 0) {
                printf("Error opening " TMPFILE "\n");
                exit(1);
        }
        unlink(TMPFILE);

        memmap_base = (char *) mmap(NULL, 17592182882304,
                                    PROT_READ | PROT_WRITE,
                                    MAP_SHARED, fd,
                                    (off_t) 0);
        if ((void *) memmap_base != MAP_FAILED)
                munmap(memmap_base, 17592182882304);
        return 0;
}
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2005-10-27 20:06:45.671491763 -0500
+++ linux-2.6/mm/memory.c 2005-10-27 20:22:04.287665340 -0500
@@ -596,8 +596,9 @@ static void zap_pte_range(struct mmu_gat
pte_unmap(pte - 1);
}
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
unsigned long addr, unsigned long end,
+ unsigned long stop_at,
struct zap_details *details)
{
pmd_t *pmd;
@@ -608,12 +609,15 @@ static inline void zap_pmd_range(struct
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
+ next = stop_at;
zap_pte_range(tlb, pmd, addr, next, details);
- } while (pmd++, addr = next, addr != end);
+ } while (pmd++, addr = next, addr < stop_at);
+ return addr;
}
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
unsigned long addr, unsigned long end,
+ unsigned long stop_at,
struct zap_details *details)
{
pud_t *pud;
@@ -624,16 +628,20 @@ static inline void zap_pud_range(struct
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- zap_pmd_range(tlb, pud, addr, next, details);
- } while (pud++, addr = next, addr != end);
+ next = zap_pmd_range(tlb, pud, addr, next, stop_at, details);
+ } while (pud++, addr = next, addr < stop_at);
+ return addr;
}
-static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+static unsigned long unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
+ unsigned long stop_at,
struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
+ unsigned long start_addr = addr;
if (details && !details->check_mapping && !details->nonlinear_vma)
details = NULL;
@@ -645,9 +653,10 @@ static void unmap_page_range(struct mmu_
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- zap_pud_range(tlb, pgd, addr, next, details);
- } while (pgd++, addr = next, addr != end);
+ next = zap_pud_range(tlb, pgd, addr, next, stop_at, details);
+ } while (pgd++, addr = next, addr < stop_at);
tlb_end_vma(tlb, vma);
+ return (addr - start_addr);
}
#ifdef CONFIG_PREEMPT
@@ -722,8 +731,8 @@ unsigned long unmap_vmas(struct mmu_gath
unmap_hugepage_range(vma, start, end);
} else {
block = min(zap_bytes, end - start);
- unmap_page_range(*tlbp, vma, start,
- start + block, details);
+ block = unmap_page_range(*tlbp, vma, start,
+ end, start + block, details);
}
start += block;
* Re: munmap extremely slow even with untouched mapping.
2005-10-28 1:37 munmap extremely slow even with untouched mapping Robin Holt
@ 2005-10-28 10:45 ` Nick Piggin
2005-10-28 15:20 ` Hugh Dickins
0 siblings, 1 reply; 8+ messages in thread
From: Nick Piggin @ 2005-10-28 10:45 UTC (permalink / raw)
To: Robin Holt; +Cc: linux-kernel, linux-mm, Hugh Dickins
[-- Attachment #1: Type: text/plain, Size: 1446 bytes --]
Robin Holt wrote:
> This is going out without any testing. I got it to compile but never
> even booted the kernel.
>
> I noticed that on ia64, the following simple program would take 24
> seconds to complete. Profiling revealed I was spending all that time
> in unmap_vmas. Looking over the function, I noticed that it would only
> attempt 8 pages at a time (CONFIG_PREEMPT). I then thought through this
> some more. My particular application had one large mapping which was
> never touched after being mmapped.
>
> Without this patch, we would iterate 256 times to get past a region of
> memory covered by an empty pmd, 524,288 times to get past an empty pud,
> and 1,073,741,824 times to get past an empty pgd. I had a 4-level page
> table patch applied at the time.
>
Ouch. I wonder why nobody's noticed this before. It really is
horribly inefficient on sparse mappings, as you've noticed :(
I guess I prefer the following (compiled, untested) slight
variant of your patch. Measuring work in terms of address range
is fairly vague.... however, it may be the case that some
architectures take a long time to flush a large range of TLBs?
I don't see how the patch can be made too much cleaner... I guess
the rescheduling could be pushed down, like the copy_ case,
however that isn't particularly brilliant either. What's more, it
would probably get uglier due to the mmu_gather tricks...
Nick
--
SUSE Labs, Novell Inc.
[-- Attachment #2: mm-zap_block-redundant-fix.patch --]
[-- Type: text/plain, Size: 5291 bytes --]
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -525,20 +525,25 @@ int copy_page_range(struct mm_struct *ds
return 0;
}
-static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- struct zap_details *details)
+static unsigned long zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long *zap_work, struct zap_details *details)
{
pte_t *pte;
pte = pte_offset_map(pmd, addr);
do {
pte_t ptent = *pte;
- if (pte_none(ptent))
+ if (pte_none(ptent)) {
+ (*zap_work)--;
continue;
+ }
if (pte_present(ptent)) {
struct page *page = NULL;
unsigned long pfn = pte_pfn(ptent);
+
+ (*zap_work) -= PAGE_SIZE;
+
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
if (PageReserved(page))
@@ -592,13 +597,15 @@ static void zap_pte_range(struct mmu_gat
if (!pte_file(ptent))
free_swap_and_cache(pte_to_swp_entry(ptent));
pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, (addr != end && (long)*zap_work > 0));
pte_unmap(pte - 1);
+
+ return addr;
}
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
- unsigned long addr, unsigned long end,
- struct zap_details *details)
+static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned long *zap_work, struct zap_details *details)
{
pmd_t *pmd;
unsigned long next;
@@ -606,15 +613,19 @@ static inline void zap_pmd_range(struct
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none_or_clear_bad(pmd)) {
+ (*zap_work)--;
continue;
- zap_pte_range(tlb, pmd, addr, next, details);
- } while (pmd++, addr = next, addr != end);
+ }
+ next = zap_pte_range(tlb, pmd, addr, next, zap_work, details);
+ } while (pmd++, addr = next, (addr != end && (long)*zap_work > 0));
+
+ return addr;
}
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- struct zap_details *details)
+static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned long *zap_work, struct zap_details *details)
{
pud_t *pud;
unsigned long next;
@@ -622,14 +633,17 @@ static inline void zap_pud_range(struct
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
+ if (pud_none_or_clear_bad(pud)) {
+ (*zap_work)--;
continue;
- zap_pmd_range(tlb, pud, addr, next, details);
- } while (pud++, addr = next, addr != end);
+ }
+ next = zap_pmd_range(tlb, pud, addr, next, zap_work, details);
+ } while (pud++, addr = next, (addr != end && (long)*zap_work > 0));
}
-static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
+static unsigned long unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr,
+ unsigned long end, unsigned long *zap_work,
struct zap_details *details)
{
pgd_t *pgd;
@@ -643,10 +657,12 @@ static void unmap_page_range(struct mmu_
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
+ if (pgd_none_or_clear_bad(pgd)) {
+ (*zap_work)--;
continue;
- zap_pud_range(tlb, pgd, addr, next, details);
- } while (pgd++, addr = next, addr != end);
+ }
+ next = zap_pud_range(tlb, pgd, addr, next, zap_work, details);
+ } while (pgd++, addr = next, (addr != end && (long)*zap_work > 0));
tlb_end_vma(tlb, vma);
}
@@ -689,7 +705,7 @@ unsigned long unmap_vmas(struct mmu_gath
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
- unsigned long zap_bytes = ZAP_BLOCK_SIZE;
+ unsigned long zap_work = ZAP_BLOCK_SIZE;
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
unsigned long start = start_addr;
@@ -710,25 +726,20 @@ unsigned long unmap_vmas(struct mmu_gath
*nr_accounted += (end - start) >> PAGE_SHIFT;
while (start != end) {
- unsigned long block;
-
if (!tlb_start_valid) {
tlb_start = start;
tlb_start_valid = 1;
}
- if (is_vm_hugetlb_page(vma)) {
- block = end - start;
+ if (unlikely(is_vm_hugetlb_page(vma))) {
unmap_hugepage_range(vma, start, end);
- } else {
- block = min(zap_bytes, end - start);
- unmap_page_range(*tlbp, vma, start,
- start + block, details);
- }
+ zap_work -= (end - start) /
+ (HPAGE_SIZE / PAGE_SIZE);
+ } else
+ start += unmap_page_range(*tlbp, vma,
+ start, end, &zap_work, details);
- start += block;
- zap_bytes -= block;
- if ((long)zap_bytes > 0)
+ if ((long)zap_work > 0)
continue;
tlb_finish_mmu(*tlbp, tlb_start, start);
@@ -748,7 +759,7 @@ unsigned long unmap_vmas(struct mmu_gath
*tlbp = tlb_gather_mmu(mm, fullmm);
tlb_start_valid = 0;
- zap_bytes = ZAP_BLOCK_SIZE;
+ zap_work = ZAP_BLOCK_SIZE;
}
}
out:
* Re: munmap extremely slow even with untouched mapping.
2005-10-28 10:45 ` Nick Piggin
@ 2005-10-28 15:20 ` Hugh Dickins
2005-10-30 4:29 ` Nick Piggin
0 siblings, 1 reply; 8+ messages in thread
From: Hugh Dickins @ 2005-10-28 15:20 UTC (permalink / raw)
To: Nick Piggin; +Cc: Robin Holt, linux-kernel, linux-mm
On Fri, 28 Oct 2005, Nick Piggin wrote:
> Robin Holt wrote:
> >
> > I noticed that on ia64, the following simple program would take 24
> > seconds to complete. Profiling revealed I was spending all that time
> > in unmap_vmas. Looking over the function, I noticed that it would only
> > attempt 8 pages at a time (CONFIG_PREEMPT). I then thought through this
> > some more. My particular application had one large mapping which was
> > never touched after being mmapped.
>
> Ouch. I wonder why nobody's noticed this before. It really is
> horribly inefficient on sparse mappings, as you've noticed :(
Yes, it's a good observation from Robin.
It'll have been spoiling the exit speedup we expected from your
2.6.14 copy_page_range "Don't copy [faultable] ptes" fork speedup.
> I guess I prefer the following (compiled, untested) slight
> variant of your patch. Measuring work in terms of address range
> is fairly vague.... however, it may be the case that some
> architectures take a long time to flush a large range of TLBs?
I prefer your patch too. But I'm not very interested in temporary
speedups relative to 2.6.14. Attacking this is a job I'd put off
until after the page fault scalability changes, which make it much
easier to do a proper job.
But I'm still mulling over precisely what that proper job is.
Probably we allocate a buffer (with fallback of course) for the
page pointers, instead of using the per-cpu mmu_gather.
The point being, the reason for ZAP_BLOCK_SIZE, and its low value
when CONFIG_PREEMPT, is that zap_pte_range is liable to build up a
head of work (TLB flushing, swap freeing, page freeing) that had to
be done with preemption still disabled. I'm aiming to do it with
preemption enabled, and proper "do we need to break now?"
tests within zap_pte_range.
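For reference, ZAP_BLOCK_SIZE here is the mm/memory.c definition, which in
the 2.6.14-era code being patched looks roughly like:

#ifdef CONFIG_PREEMPT
# define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
#else
/* No preempt: go for improved straight-line efficiency */
# define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
#endif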
Hugh
* Re: munmap extremely slow even with untouched mapping.
2005-10-28 15:20 ` Hugh Dickins
@ 2005-10-30 4:29 ` Nick Piggin
2005-10-30 16:58 ` Hugh Dickins
0 siblings, 1 reply; 8+ messages in thread
From: Nick Piggin @ 2005-10-30 4:29 UTC (permalink / raw)
To: Hugh Dickins; +Cc: Robin Holt, linux-kernel, linux-mm
Hugh Dickins wrote:
> Yes, it's a good observation from Robin.
>
> It'll have been spoiling the exit speedup we expected from your
> 2.6.14 copy_page_range "Don't copy [faultable] ptes" fork speedup.
>
Yep. Not to mention it is probably responsible for some of the
4 level page table performance slowdowns on x86-64.
>
>
> I prefer your patch too. But I'm not very interested in temporary
> speedups relative to 2.6.14. Attacking this is a job I'd put off
> until after the page fault scalability changes, which make it much
> easier to do a proper job.
>
Yeah definitely.
I wonder if we should go with Robin's fix (+/- my variation)
as a temporary measure for 2.6.15?
--
SUSE Labs, Novell Inc.
* Re: munmap extremely slow even with untouched mapping.
2005-10-30 4:29 ` Nick Piggin
@ 2005-10-30 16:58 ` Hugh Dickins
2005-10-31 9:10 ` Nick Piggin
0 siblings, 1 reply; 8+ messages in thread
From: Hugh Dickins @ 2005-10-30 16:58 UTC (permalink / raw)
To: Nick Piggin; +Cc: Robin Holt, linux-kernel, linux-mm
On Sun, 30 Oct 2005, Nick Piggin wrote:
> Hugh Dickins wrote:
> >
> > I prefer your patch too. But I'm not very interested in temporary
> > speedups relative to 2.6.14. Attacking this is a job I'd put off
> > until after the page fault scalability changes, which make it much
> > easier to do a proper job.
>
> Yeah definitely.
>
> I wonder if we should go with Robin's fix (+/- my variation)
> as a temporary measure for 2.6.15?
You're right, I was too dismissive. I've now spent a day looking into
the larger rework, and it's a bigger job than I'd thought - partly the
architecture variations, partly the fast/slow paths and other "tlb" cruft,
partly the truncation case's i_mmap_lock (and danger of making no progress
whenever we drop it). I'll have to set all that aside for now.
I've taken another look at the two patches. The main reason I preferred
yours was that I misread Robin's! But yes, yours takes it a bit further,
and I think that is worthwhile.
But a built and tested version would be better. Aren't you trying to
return addr from each level (that's what I liked, and what I'll want to
do in the end)? But some levels are returning nothing, and unmap_vmas
does start +=, and the huge case leaves start unchanged, and zap_work
should be a long so it doesn't need casting almost everywhere, and...
given all that, I bet there's more!
As to whether p??_none should count for 1 where !pte_none counts for
PAGE_SIZE, well, they say a picture is worth a thousand words, and I'm
sure that's entered your calculation ;-) I'd probably make the p??_none
count for a little more. Perhaps we should get everyone involved in a
great profiling effort across the architectures to determine it.
Config option. Sys tunable. I'll shut up.
Hugh
* Re: munmap extremely slow even with untouched mapping.
2005-10-30 16:58 ` Hugh Dickins
@ 2005-10-31 9:10 ` Nick Piggin
2005-10-31 9:19 ` Nick Piggin
2005-10-31 12:20 ` Robin Holt
0 siblings, 2 replies; 8+ messages in thread
From: Nick Piggin @ 2005-10-31 9:10 UTC (permalink / raw)
To: Hugh Dickins; +Cc: Robin Holt, linux-kernel, linux-mm
[-- Attachment #1: Type: text/plain, Size: 2659 bytes --]
Hugh Dickins wrote:
> On Sun, 30 Oct 2005, Nick Piggin wrote:
>>I wonder if we should go with Robin's fix (+/- my variation)
>>as a temporary measure for 2.6.15?
>
>
> You're right, I was too dismissive. I've now spent a day looking into
> the larger rework, and it's a bigger job than I'd thought - partly the
> architecture variations, partly the fast/slow paths and other "tlb" cruft,
> partly the truncation case's i_mmap_lock (and danger of making no progress
> whenever we drop it). I'll have to set all that aside for now.
>
Yep, I took a quick look before adding my bugs to Robin's patch, and
basically came to the same conclusion.
> I've taken another look at the two patches. The main reason I preferred
> yours was that I misread Robin's! But yes, yours takes it a bit further,
> and I think that is worthwhile.
>
Sure - the nice method to solve it reasonably neatly is Robin's of
course. I just preferred to change accounting slightly so we wouldn't,
for example, retry the call stack after each empty pgd.
> But a built and tested version would be better. Aren't you trying to
> return addr from each level (that's what I liked, and what I'll want to
> do in the end)? But some levels are returning nothing,
Hmph, seem to have got half a patch there :P
> and unmap_vmas
> does start +=, and the huge case leaves start unchanged, and
Dang, thanks.
> zap_work
> should be a long so it doesn't need casting almost everywhere, and...
> given all that, I bet there's more!
>
Yep, good idea. New tested patch attached. Robin, if you agree with the
changes I made, would you sign this one and send it on to Linus and/or
Andrew, please?
> As to whether p??_none should count for 1 where !pte_none counts for
> PAGE_SIZE, well, they say a picture is worth a thousand words, and I'm
> sure that's entered your calculation ;-) I'd probably make the p??_none
That didn't, but it confirms my theory. I was working on the basis
that a *page* is worth a thousand words - but I think we can assume
any picture worth looking at will be given its own page.
I don't know if we should worry about increasing p??_none case - they
are going to be largely operating on contiguous memory, free of atomic
ops. And the cost for pages must also include the TLB and possible
freeing work too.
Trivial to change once the patch is in though.
> count for a little more. Perhaps we should get everyone involved in a
> great profiling effort across the architectures to determine it.
> Config option. Sys tunable. I'll shut up.
>
I was going to make it a tunable, but then I realised someone already
has - /dev/kmem ;)
--
SUSE Labs, Novell Inc.
[-- Attachment #2: mm-zap_block-redundant-fix.patch --]
[-- Type: text/plain, Size: 6487 bytes --]
The address based work estimate for unmapping (for lockbreak) is and
always was horribly inefficient for sparse mappings. The problem is most
simply explained with an example:
If we find a pgd is clear, we still have to call into unmap_page_range
PGDIR_SIZE / ZAP_BLOCK_SIZE times, each time checking the clear pgd, in
order to progress the working address to the next pgd.
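To put an (assumed) number on that: with x86-64's 4-level tables, 4KB pages
and CONFIG_PREEMPT, stepping over a single clear pgd costs

	PGDIR_SIZE / ZAP_BLOCK_SIZE = 512GB / (8 * 4KB) = 2^24 = 16,777,216

calls into unmap_page_range, each of which re-checks the same clear pgd.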
The fundamental way to solve the problem is to keep track of the end address
we've processed and pass it back to the higher layers.
From: Robin Holt <holt@sgi.com>
Modification to completely get away from address based work estimate and
instead use an abstract count, with a very small cost for empty entries as
opposed to present pages.
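Concretely, the patch below charges the work budget like this:

/*
 * zap_work starts at ZAP_BLOCK_SIZE and is charged:
 *   present pte                  ->  zap_work -= PAGE_SIZE
 *   empty pte / pmd / pud / pgd  ->  zap_work -= 1
 *   hugetlb vma                  ->  zap_work -= (end - start) / (HPAGE_SIZE / PAGE_SIZE)
 * The page table walk returns to unmap_vmas as soon as zap_work
 * drops to zero or below.
 */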
On 2.6.14-git2, ppc64, and CONFIG_PREEMPT=y, mapping and unmapping 1TB of
virtual address space takes 1.69s; with the following patch applied, this
operation can be done 1000 times in less than 0.01s
Signed-off-by: Nick Piggin <npiggin@suse.de>
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -549,10 +549,10 @@ int copy_page_range(struct mm_struct *ds
return 0;
}
-static void zap_pte_range(struct mmu_gather *tlb,
+static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
- struct zap_details *details)
+ long *zap_work, struct zap_details *details)
{
struct mm_struct *mm = tlb->mm;
pte_t *pte;
@@ -563,10 +563,15 @@ static void zap_pte_range(struct mmu_gat
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
do {
pte_t ptent = *pte;
- if (pte_none(ptent))
+ if (pte_none(ptent)) {
+ (*zap_work)--;
continue;
+ }
if (pte_present(ptent)) {
struct page *page = NULL;
+
+ (*zap_work) -= PAGE_SIZE;
+
if (!(vma->vm_flags & VM_RESERVED)) {
unsigned long pfn = pte_pfn(ptent);
if (unlikely(!pfn_valid(pfn)))
@@ -624,16 +629,18 @@ static void zap_pte_range(struct mmu_gat
if (!pte_file(ptent))
free_swap_and_cache(pte_to_swp_entry(ptent));
pte_clear_full(mm, addr, pte, tlb->fullmm);
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
add_mm_rss(mm, file_rss, anon_rss);
pte_unmap_unlock(pte - 1, ptl);
+
+ return addr;
}
-static inline void zap_pmd_range(struct mmu_gather *tlb,
+static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
- struct zap_details *details)
+ long *zap_work, struct zap_details *details)
{
pmd_t *pmd;
unsigned long next;
@@ -641,16 +648,21 @@ static inline void zap_pmd_range(struct
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none_or_clear_bad(pmd)) {
+ (*zap_work)--;
continue;
- zap_pte_range(tlb, vma, pmd, addr, next, details);
- } while (pmd++, addr = next, addr != end);
+ }
+ next = zap_pte_range(tlb, vma, pmd, addr, next,
+ zap_work, details);
+ } while (pmd++, addr = next, (addr != end && *zap_work > 0));
+
+ return addr;
}
-static inline void zap_pud_range(struct mmu_gather *tlb,
+static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
- struct zap_details *details)
+ long *zap_work, struct zap_details *details)
{
pud_t *pud;
unsigned long next;
@@ -658,15 +670,21 @@ static inline void zap_pud_range(struct
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
+ if (pud_none_or_clear_bad(pud)) {
+ (*zap_work)--;
continue;
- zap_pmd_range(tlb, vma, pud, addr, next, details);
- } while (pud++, addr = next, addr != end);
+ }
+ next = zap_pmd_range(tlb, vma, pud, addr, next,
+ zap_work, details);
+ } while (pud++, addr = next, (addr != end && *zap_work > 0));
+
+ return addr;
}
-static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+static unsigned long unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
- struct zap_details *details)
+ long *zap_work, struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
@@ -679,11 +697,16 @@ static void unmap_page_range(struct mmu_
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
+ if (pgd_none_or_clear_bad(pgd)) {
+ (*zap_work)--;
continue;
- zap_pud_range(tlb, vma, pgd, addr, next, details);
- } while (pgd++, addr = next, addr != end);
+ }
+ next = zap_pud_range(tlb, vma, pgd, addr, next,
+ zap_work, details);
+ } while (pgd++, addr = next, (addr != end && *zap_work > 0));
tlb_end_vma(tlb, vma);
+
+ return addr;
}
#ifdef CONFIG_PREEMPT
@@ -724,7 +747,7 @@ unsigned long unmap_vmas(struct mmu_gath
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
- unsigned long zap_bytes = ZAP_BLOCK_SIZE;
+ long zap_work = ZAP_BLOCK_SIZE;
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
unsigned long start = start_addr;
@@ -745,27 +768,25 @@ unsigned long unmap_vmas(struct mmu_gath
*nr_accounted += (end - start) >> PAGE_SHIFT;
while (start != end) {
- unsigned long block;
-
if (!tlb_start_valid) {
tlb_start = start;
tlb_start_valid = 1;
}
- if (is_vm_hugetlb_page(vma)) {
- block = end - start;
+ if (unlikely(is_vm_hugetlb_page(vma))) {
unmap_hugepage_range(vma, start, end);
- } else {
- block = min(zap_bytes, end - start);
- unmap_page_range(*tlbp, vma, start,
- start + block, details);
+ zap_work -= (end - start) /
+ (HPAGE_SIZE / PAGE_SIZE);
+ start = end;
+ } else
+ start = unmap_page_range(*tlbp, vma,
+ start, end, &zap_work, details);
+
+ if (zap_work > 0) {
+ BUG_ON(start != end);
+ break;
}
- start += block;
- zap_bytes -= block;
- if ((long)zap_bytes > 0)
- continue;
-
tlb_finish_mmu(*tlbp, tlb_start, start);
if (need_resched() ||
@@ -779,7 +800,7 @@ unsigned long unmap_vmas(struct mmu_gath
*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
tlb_start_valid = 0;
- zap_bytes = ZAP_BLOCK_SIZE;
+ zap_work = ZAP_BLOCK_SIZE;
}
}
out:
* Re: munmap extremely slow even with untouched mapping.
2005-10-31 9:10 ` Nick Piggin
@ 2005-10-31 9:19 ` Nick Piggin
2005-10-31 12:20 ` Robin Holt
1 sibling, 0 replies; 8+ messages in thread
From: Nick Piggin @ 2005-10-31 9:19 UTC (permalink / raw)
To: Hugh Dickins; +Cc: Robin Holt, linux-kernel, linux-mm
Nick Piggin wrote:
> The address based work estimate for unmapping (for lockbreak) is and
> always was horribly inefficient for sparse mappings. The problem is most
> simply explained with an example:
>
> If we find a pgd is clear, we still have to call into unmap_page_range
> PGDIR_SIZE / ZAP_BLOCK_SIZE times, each time checking the clear pgd, in
> order to progress the working address to the next pgd.
>
> The fundamental way to solve the problem is to keep track of the end address
> we've processed and pass it back to the higher layers.
>
> From: Robin Holt <holt@sgi.com>
>
> Modification to completely get away from address based work estimate and
> instead use an abstract count, with a very small cost for empty entries as
> opposed to present pages.
>
> On 2.6.14-git2, ppc64, and CONFIG_PREEMPT=y, mapping and unmapping 1TB of
> virtual address space takes 1.69s; with the following patch applied, this
> operation can be done 1000 times in less than 0.01s
>
> Signed-off-by: Nick Piggin <npiggin@suse.de>
>
Note that I think this patch will cripple our nice page table folding
in the unmap path, due to no longer using the 'next' from p??_addr_end,
even if the compiler is very smart.
I haven't confirmed this by looking at assembly, however I'd be almost
sure this is the case. Possibly a followup patch would be in order so
as to restore this, but I couldn't think of a really nice way to do it.
Basically we only want to return the return of the next level
zap_p??_range in the case that it returns with zap_work < 0.
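One possible shape for such a followup (untested sketch, and it may not be
enough to fully recover the folding): keep assigning the pmd_addr_end()
value to addr on the common path, and only break out with the partial
address once zap_work has actually run out, e.g. for zap_pmd_range:

static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd)) {
			(*zap_work)--;
			continue;
		}
		addr = zap_pte_range(tlb, vma, pmd, addr, next,
						zap_work, details);
		if (*zap_work <= 0)
			break;	/* out of budget: addr records how far we got */
	} while (pmd++, addr = next, (addr != end && *zap_work > 0));

	return addr;
}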
--
SUSE Labs, Novell Inc.
* Re: munmap extremely slow even with untouched mapping.
2005-10-31 9:10 ` Nick Piggin
2005-10-31 9:19 ` Nick Piggin
@ 2005-10-31 12:20 ` Robin Holt
1 sibling, 0 replies; 8+ messages in thread
From: Robin Holt @ 2005-10-31 12:20 UTC (permalink / raw)
To: Nick Piggin; +Cc: Hugh Dickins, Robin Holt, linux-kernel, linux-mm
On Mon, Oct 31, 2005 at 08:10:50PM +1100, Nick Piggin wrote:
> Hugh Dickins wrote:
> >On Sun, 30 Oct 2005, Nick Piggin wrote:
>
> >>I wonder if we should go with Robin's fix (+/- my variation)
> >>as a temporary measure for 2.6.15?
> >
> >
> >You're right, I was too dismissive. I've now spent a day looking into
> >the larger rework, and it's a bigger job than I'd thought - partly the
> >architecture variations, partly the fast/slow paths and other "tlb" cruft,
> >partly the truncation case's i_mmap_lock (and danger of making no progress
> >whenever we drop it). I'll have to set all that aside for now.
> >
>
> Yep, I took a quick look before adding my bugs to Robin's patch, and
> basically came to the same conclusion.
>
> >I've taken another look at the two patches. The main reason I preferred
> >yours was that I misread Robin's! But yes, yours takes it a bit further,
> >and I think that is worthwhile.
> >
>
> Sure - the nice method to solve it reasonably neatly is Robin's of
> course. I just preferred to change accounting slightly so we wouldn't,
> for example, retry the call stack after each empty pgd.
>
> >But a built and tested version would be better. Aren't you trying to
> >return addr from each level (that's what I liked, and what I'll want to
> >do in the end)? But some levels are returning nothing,
>
> Hmph, seem to have got half a patch there :P
>
> >and unmap_vmas
> >does start +=, and the huge case leaves start unchanged, and
>
> Dang, thanks.
>
> >zap_work
> >should be a long so it doesn't need casting almost everywhere, and...
> >given all that, I bet there's more!
> >
>
> Yep, good idea. New tested patch attached. Robin, if you agree with the
> changes I made, would you sign this one and send it on to Linus and/or
> Andrew, please?
>
> >As to whether p??_none should count for 1 where !pte_none counts for
> >PAGE_SIZE, well, they say a picture is worth a thousand words, and I'm
> >sure that's entered your calculation ;-) I'd probably make the p??_none
>
> That didn't, but it confirms my theory. I was working on the basis
> that a *page* is worth a thousand words - but I think we can assume
> any picture worth looking at will be given its own page.
>
> I don't know if we should worry about increasing p??_none case - they
> are going to be largely operating on contiguous memory, free of atomic
> ops. And the cost for pages must also include the TLB and possible
> freeing work too.
>
> Trivial to change once the patch is in though.
>
> >count for a little more. Perhaps we should get everyone involved in a
> >great profiling effort across the architectures to determine it.
> >Config option. Sys tunable. I'll shut up.
> >
>
> I was going to make it a tunable, but then I realised someone already
> has - /dev/kmem ;)
>
> --
> SUSE Labs, Novell Inc.
> The address based work estimate for unmapping (for lockbreak) is and
> always was horribly inefficient for sparse mappings. The problem is most
> simply explained with an example:
>
> If we find a pgd is clear, we still have to call into unmap_page_range
> PGDIR_SIZE / ZAP_BLOCK_SIZE times, each time checking the clear pgd, in
> order to progress the working address to the next pgd.
>
> The fundamental way to solve the problem is to keep track of the end address
> we've processed and pass it back to the higher layers.
>
> From: Robin Holt <holt@sgi.com>
>
> Modification to completely get away from address based work estimate and
> instead use an abstract count, with a very small cost for empty entries as
> opposed to present pages.
>
> On 2.6.14-git2, ppc64, and CONFIG_PREEMPT=y, mapping and unmapping 1TB of
> virtual address space takes 1.69s; with the following patch applied, this
> operation can be done 1000 times in less than 0.01s
>
> Signed-off-by: Nick Piggin <npiggin@suse.de>
Thanks, Nick, for doing all the work for me. I still haven't tested this
but it is definitely what I had hoped would be done to speed this up.
Signed-off-by: Robin Holt <holt@sgi.com>
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -549,10 +549,10 @@ int copy_page_range(struct mm_struct *ds
return 0;
}
-static void zap_pte_range(struct mmu_gather *tlb,
+static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
- struct zap_details *details)
+ long *zap_work, struct zap_details *details)
{
struct mm_struct *mm = tlb->mm;
pte_t *pte;
@@ -563,10 +563,15 @@ static void zap_pte_range(struct mmu_gat
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
do {
pte_t ptent = *pte;
- if (pte_none(ptent))
+ if (pte_none(ptent)) {
+ (*zap_work)--;
continue;
+ }
if (pte_present(ptent)) {
struct page *page = NULL;
+
+ (*zap_work) -= PAGE_SIZE;
+
if (!(vma->vm_flags & VM_RESERVED)) {
unsigned long pfn = pte_pfn(ptent);
if (unlikely(!pfn_valid(pfn)))
@@ -624,16 +629,18 @@ static void zap_pte_range(struct mmu_gat
if (!pte_file(ptent))
free_swap_and_cache(pte_to_swp_entry(ptent));
pte_clear_full(mm, addr, pte, tlb->fullmm);
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
add_mm_rss(mm, file_rss, anon_rss);
pte_unmap_unlock(pte - 1, ptl);
+
+ return addr;
}
-static inline void zap_pmd_range(struct mmu_gather *tlb,
+static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
- struct zap_details *details)
+ long *zap_work, struct zap_details *details)
{
pmd_t *pmd;
unsigned long next;
@@ -641,16 +648,21 @@ static inline void zap_pmd_range(struct
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none_or_clear_bad(pmd)) {
+ (*zap_work)--;
continue;
- zap_pte_range(tlb, vma, pmd, addr, next, details);
- } while (pmd++, addr = next, addr != end);
+ }
+ next = zap_pte_range(tlb, vma, pmd, addr, next,
+ zap_work, details);
+ } while (pmd++, addr = next, (addr != end && *zap_work > 0));
+
+ return addr;
}
-static inline void zap_pud_range(struct mmu_gather *tlb,
+static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
- struct zap_details *details)
+ long *zap_work, struct zap_details *details)
{
pud_t *pud;
unsigned long next;
@@ -658,15 +670,21 @@ static inline void zap_pud_range(struct
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
+ if (pud_none_or_clear_bad(pud)) {
+ (*zap_work)--;
continue;
- zap_pmd_range(tlb, vma, pud, addr, next, details);
- } while (pud++, addr = next, addr != end);
+ }
+ next = zap_pmd_range(tlb, vma, pud, addr, next,
+ zap_work, details);
+ } while (pud++, addr = next, (addr != end && *zap_work > 0));
+
+ return addr;
}
-static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+static unsigned long unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
- struct zap_details *details)
+ long *zap_work, struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
@@ -679,11 +697,16 @@ static void unmap_page_range(struct mmu_
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
+ if (pgd_none_or_clear_bad(pgd)) {
+ (*zap_work)--;
continue;
- zap_pud_range(tlb, vma, pgd, addr, next, details);
- } while (pgd++, addr = next, addr != end);
+ }
+ next = zap_pud_range(tlb, vma, pgd, addr, next,
+ zap_work, details);
+ } while (pgd++, addr = next, (addr != end && *zap_work > 0));
tlb_end_vma(tlb, vma);
+
+ return addr;
}
#ifdef CONFIG_PREEMPT
@@ -724,7 +747,7 @@ unsigned long unmap_vmas(struct mmu_gath
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
- unsigned long zap_bytes = ZAP_BLOCK_SIZE;
+ long zap_work = ZAP_BLOCK_SIZE;
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
unsigned long start = start_addr;
@@ -745,27 +768,25 @@ unsigned long unmap_vmas(struct mmu_gath
*nr_accounted += (end - start) >> PAGE_SHIFT;
while (start != end) {
- unsigned long block;
-
if (!tlb_start_valid) {
tlb_start = start;
tlb_start_valid = 1;
}
- if (is_vm_hugetlb_page(vma)) {
- block = end - start;
+ if (unlikely(is_vm_hugetlb_page(vma))) {
unmap_hugepage_range(vma, start, end);
- } else {
- block = min(zap_bytes, end - start);
- unmap_page_range(*tlbp, vma, start,
- start + block, details);
+ zap_work -= (end - start) /
+ (HPAGE_SIZE / PAGE_SIZE);
+ start = end;
+ } else
+ start = unmap_page_range(*tlbp, vma,
+ start, end, &zap_work, details);
+
+ if (zap_work > 0) {
+ BUG_ON(start != end);
+ break;
}
- start += block;
- zap_bytes -= block;
- if ((long)zap_bytes > 0)
- continue;
-
tlb_finish_mmu(*tlbp, tlb_start, start);
if (need_resched() ||
@@ -779,7 +800,7 @@ unsigned long unmap_vmas(struct mmu_gath
*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
tlb_start_valid = 0;
- zap_bytes = ZAP_BLOCK_SIZE;
+ zap_work = ZAP_BLOCK_SIZE;
}
}
out: