From: Nick Piggin <nickpiggin@yahoo.com.au>
To: Robin Holt <holt@sgi.com>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
Hugh Dickins <hugh@veritas.com>
Subject: Re: munmap extremely slow even with untouched mapping.
Date: Fri, 28 Oct 2005 20:45:12 +1000 [thread overview]
Message-ID: <43620138.6060707@yahoo.com.au> (raw)
In-Reply-To: <20051028013738.GA19727@attica.americas.sgi.com>
[-- Attachment #1: Type: text/plain, Size: 1446 bytes --]
Robin Holt wrote:
> This is going out without any testing. I got it to compile but never
> even booted the kernel.
>
> I noticed that on ia64, the following simple program would take 24
> seconds to complete. Profiling revealed I was spending all that time
> in unmap_vmas. Looking over the function, I noticed that I would only
> attempt 8 pages at a time (CONFIG_PREEMPT). I then thought through this
> some more. My particular application had one large mapping which was
> never touched after being mmaped.
>
> Without this patch, we would iterate numerous times (256) to get past
> a region of memory that had an empty pmd, 524,288 times to get past an
> empty pud, and 1,073,741,824 times to get past an empty pgd. I had a 4-level
> page table directory patch applied at the time.
>
Ouch. I wonder why nobody's noticed this before. It really is
horribly inefficient on sparse mappings, as you've noticed :(
I guess I prefer the following (compiled, untested) slight
variant of your patch. Measuring work in terms of address range
is fairly vague.... however, it may be the case that some
architectures take a long time to flush a large range of TLBs?
I don't see how the patch can be made too much cleaner... I guess
the rescheduling could be pushed down, like the copy_ case,
however that isn't particularly brilliant either. What's more, it
would probably get uglier due to the mmu_gather tricks...
Nick
--
SUSE Labs, Novell Inc.
[-- Attachment #2: mm-zap_block-redundant-fix.patch --]
[-- Type: text/plain, Size: 5291 bytes --]
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -525,20 +525,25 @@ int copy_page_range(struct mm_struct *ds
return 0;
}
-static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- struct zap_details *details)
+static unsigned long zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long *zap_work, struct zap_details *details)
{
pte_t *pte;
pte = pte_offset_map(pmd, addr);
do {
pte_t ptent = *pte;
- if (pte_none(ptent))
+ if (pte_none(ptent)) {
+ (*zap_work)--;
continue;
+ }
if (pte_present(ptent)) {
struct page *page = NULL;
unsigned long pfn = pte_pfn(ptent);
+
+ (*zap_work) -= PAGE_SIZE;
+
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
if (PageReserved(page))
@@ -592,13 +597,15 @@ static void zap_pte_range(struct mmu_gat
if (!pte_file(ptent))
free_swap_and_cache(pte_to_swp_entry(ptent));
pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, (addr != end && (long)*zap_work >0));
pte_unmap(pte - 1);
+
+ return addr;
}
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
- unsigned long addr, unsigned long end,
- struct zap_details *details)
+static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned long *zap_work, struct zap_details *details)
{
pmd_t *pmd;
unsigned long next;
@@ -606,15 +613,19 @@ static inline void zap_pmd_range(struct
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none_or_clear_bad(pmd)) {
+ (*zap_work)--;
continue;
- zap_pte_range(tlb, pmd, addr, next, details);
- } while (pmd++, addr = next, addr != end);
+ }
+ next = zap_pte_range(tlb, pmd, addr, next, zap_work, details);
+ } while (pmd++, addr = next, (addr != end && (long)*zap_work > 0));
+
+ return addr;
}
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- struct zap_details *details)
+static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned long *zap_work, struct zap_details *details)
{
pud_t *pud;
unsigned long next;
@@ -622,14 +633,17 @@ static inline void zap_pud_range(struct
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
+ if (pud_none_or_clear_bad(pud)) {
+ (*zap_work)--;
continue;
- zap_pmd_range(tlb, pud, addr, next, details);
- } while (pud++, addr = next, addr != end);
+ }
+ next = zap_pmd_range(tlb, pud, addr, next, zap_work, details);
+ } while (pud++, addr = next, (addr != end && (long)*zap_work > 0));
}
-static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
+static unsigned long unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr,
+ unsigned long end, unsigned long *zap_work,
struct zap_details *details)
{
pgd_t *pgd;
@@ -643,10 +657,12 @@ static void unmap_page_range(struct mmu_
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
+ if (pgd_none_or_clear_bad(pgd)) {
+ (*zap_work)--;
continue;
- zap_pud_range(tlb, pgd, addr, next, details);
- } while (pgd++, addr = next, addr != end);
+ }
+ next = zap_pud_range(tlb, pgd, addr, next, zap_work, details);
+ } while (pgd++, addr = next, (addr != end && (long)*zap_work > 0));
tlb_end_vma(tlb, vma);
}
@@ -689,7 +705,7 @@ unsigned long unmap_vmas(struct mmu_gath
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
- unsigned long zap_bytes = ZAP_BLOCK_SIZE;
+ unsigned long zap_work = ZAP_BLOCK_SIZE;
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
unsigned long start = start_addr;
@@ -710,25 +726,20 @@ unsigned long unmap_vmas(struct mmu_gath
*nr_accounted += (end - start) >> PAGE_SHIFT;
while (start != end) {
- unsigned long block;
-
if (!tlb_start_valid) {
tlb_start = start;
tlb_start_valid = 1;
}
- if (is_vm_hugetlb_page(vma)) {
- block = end - start;
+ if (unlikely(is_vm_hugetlb_page(vma))) {
unmap_hugepage_range(vma, start, end);
- } else {
- block = min(zap_bytes, end - start);
- unmap_page_range(*tlbp, vma, start,
- start + block, details);
- }
+ zap_work -= (end - start) /
+ (HPAGE_SIZE / PAGE_SIZE);
+ } else
+ start += unmap_page_range(*tlbp, vma,
+ start, end, &zap_work, details);
- start += block;
- zap_bytes -= block;
- if ((long)zap_bytes > 0)
+ if ((long)zap_work > 0)
continue;
tlb_finish_mmu(*tlbp, tlb_start, start);
@@ -748,7 +759,7 @@ unsigned long unmap_vmas(struct mmu_gath
*tlbp = tlb_gather_mmu(mm, fullmm);
tlb_start_valid = 0;
- zap_bytes = ZAP_BLOCK_SIZE;
+ zap_work = ZAP_BLOCK_SIZE;
}
}
out:
next prev parent reply other threads:[~2005-10-28 10:45 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-10-28 1:37 Robin Holt
2005-10-28 10:45 ` Nick Piggin [this message]
2005-10-28 15:20 ` Hugh Dickins
2005-10-30 4:29 ` Nick Piggin
2005-10-30 16:58 ` Hugh Dickins
2005-10-31 9:10 ` Nick Piggin
2005-10-31 9:19 ` Nick Piggin
2005-10-31 12:20 ` Robin Holt
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=43620138.6060707@yahoo.com.au \
--to=nickpiggin@yahoo.com.au \
--cc=holt@sgi.com \
--cc=hugh@veritas.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox