[patch][rfc] remove ZERO

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [patch][rfc] remove ZERO_PAGE?
@ 2007-07-27  2:19 Nick Piggin
  2007-07-27  5:29 ` Linus Torvalds
                   ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: Nick Piggin @ 2007-07-27  2:19 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, Hugh Dickins, Andrea Arcangeli,
	Linux Memory Management List

Hi,

I'd like to see if we can get the ball rolling on this again, and try to
get it in 2.6.24 maybe. Any comments?

---
Inserting a ZERO_PAGE for anonymous read faults appears to be a false
optimisation: if an application is performance critical, it would not
be doing read faults of new memory or at least it could be expected to
write to that memory soon afterwards. If it is memory use is critical,
it should not be touching addresses that it knows to be zero anyway.

eg. Very sparse matrix code might benefit from the ZERO_PAGE, however it
would only be a naive implementation that isn't tuned for memory usage
anyway.

The motivation for this came from a situation where an Altix system
was essentially livelocked tearing down ZERO_PAGE pagetables when an
HPC app aborted during startup. This is also a case of a silly
userspace access pattern, but it did highlight the potential scalability
problem of the ZERO_PAGE, and corner cases where it can really hurt.

Mesuring on my desktop system, there are never many mappings to the
ZERO_PAGE, thus memory usage should not increase too much if we remove
it. My desktop is by no means representative, but it gives some
indication.

When running a make -j4 kernel compile on my dual core system, there are
about 1,000 mappings to the ZERO_PAGE created per second, and about 1,000
ZERO_PAGE COW faults per second! (Less than 1 ZERO_PAGE mapping per second
is torn down without being COWed). So this patch will save 1,000 page
faults per second, and 2,000 bounces of the ZERO_PAGE struct page
cacheline per second when running kbuild, which might end up being a
significant scalability hit on big systems.

The /dev/zero ZERO_PAGE usage and TLB tricks also get nuked. I don't see
much use to them except complexity and useless benchmarks. All other
users of ZERO_PAGE are converted just to use ZERO_PAGE(0) for simplicity.
We can look at replacing them all and ripping out ZERO_PAGE completely
if/when this patch gets in.

Index: linux-2.6/drivers/char/mem.c
===================================================================
--- linux-2.6.orig/drivers/char/mem.c
+++ linux-2.6/drivers/char/mem.c
@@ -625,65 +625,10 @@ static ssize_t splice_write_null(struct 
 	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_null);
 }
 
-#ifdef CONFIG_MMU
-/*
- * For fun, we are using the MMU for this.
- */
-static inline size_t read_zero_pagealigned(char __user * buf, size_t size)
-{
-	struct mm_struct *mm;
-	struct vm_area_struct * vma;
-	unsigned long addr=(unsigned long)buf;
-
-	mm = current->mm;
-	/* Oops, this was forgotten before. -ben */
-	down_read(&mm->mmap_sem);
-
-	/* For private mappings, just map in zero pages. */
-	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
-		unsigned long count;
-
-		if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
-			goto out_up;
-		if (vma->vm_flags & (VM_SHARED | VM_HUGETLB))
-			break;
-		count = vma->vm_end - addr;
-		if (count > size)
-			count = size;
-
-		zap_page_range(vma, addr, count, NULL);
-        	if (zeromap_page_range(vma, addr, count, PAGE_COPY))
-			break;
-
-		size -= count;
-		buf += count;
-		addr += count;
-		if (size == 0)
-			goto out_up;
-	}
-
-	up_read(&mm->mmap_sem);
-	
-	/* The shared case is hard. Let's do the conventional zeroing. */ 
-	do {
-		unsigned long unwritten = clear_user(buf, PAGE_SIZE);
-		if (unwritten)
-			return size + unwritten - PAGE_SIZE;
-		cond_resched();
-		buf += PAGE_SIZE;
-		size -= PAGE_SIZE;
-	} while (size);
-
-	return size;
-out_up:
-	up_read(&mm->mmap_sem);
-	return size;
-}
-
 static ssize_t read_zero(struct file * file, char __user * buf, 
 			 size_t count, loff_t *ppos)
 {
-	unsigned long left, unwritten, written = 0;
+	size_t written;
 
 	if (!count)
 		return 0;
@@ -691,69 +636,33 @@ static ssize_t read_zero(struct file * f
 	if (!access_ok(VERIFY_WRITE, buf, count))
 		return -EFAULT;
 
-	left = count;
-
-	/* do we want to be clever? Arbitrary cut-off */
-	if (count >= PAGE_SIZE*4) {
-		unsigned long partial;
-
-		/* How much left of the page? */
-		partial = (PAGE_SIZE-1) & -(unsigned long) buf;
-		unwritten = clear_user(buf, partial);
-		written = partial - unwritten;
+	written = 0;
+	while (count) {
+		unsigned long unwritten;
+		size_t chunk = count;
+
+		if (chunk > PAGE_SIZE)
+			chunk = PAGE_SIZE;	/* Just for latency reasons */
+		unwritten = clear_user(buf, chunk);
+		written += chunk - unwritten;
 		if (unwritten)
-			goto out;
-		left -= partial;
-		buf += partial;
-		unwritten = read_zero_pagealigned(buf, left & PAGE_MASK);
-		written += (left & PAGE_MASK) - unwritten;
-		if (unwritten)
-			goto out;
-		buf += left & PAGE_MASK;
-		left &= ~PAGE_MASK;
-	}
-	unwritten = clear_user(buf, left);
-	written += left - unwritten;
-out:
-	return written ? written : -EFAULT;
-}
-
-static int mmap_zero(struct file * file, struct vm_area_struct * vma)
-{
-	int err;
-
-	if (vma->vm_flags & VM_SHARED)
-		return shmem_zero_setup(vma);
-	err = zeromap_page_range(vma, vma->vm_start,
-			vma->vm_end - vma->vm_start, vma->vm_page_prot);
-	BUG_ON(err == -EEXIST);
-	return err;
-}
-#else /* CONFIG_MMU */
-static ssize_t read_zero(struct file * file, char * buf, 
-			 size_t count, loff_t *ppos)
-{
-	size_t todo = count;
-
-	while (todo) {
-		size_t chunk = todo;
-
-		if (chunk > 4096)
-			chunk = 4096;	/* Just for latency reasons */
-		if (clear_user(buf, chunk))
-			return -EFAULT;
+			break;
 		buf += chunk;
-		todo -= chunk;
+		count -= chunk;
 		cond_resched();
 	}
-	return count;
+	return written ? written : -EFAULT;
 }
 
 static int mmap_zero(struct file * file, struct vm_area_struct * vma)
 {
+#ifndef CONFIG_MMU
 	return -ENOSYS;
+#endif
+	if (vma->vm_flags & VM_SHARED)
+		return shmem_zero_setup(vma);
+	return 0;
 }
-#endif /* CONFIG_MMU */
 
 static ssize_t write_full(struct file * file, const char __user * buf,
 			  size_t count, loff_t *ppos)
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -778,8 +778,6 @@ void free_pgtables(struct mmu_gather **t
 		unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma);
-int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
-			unsigned long size, pgprot_t prot);
 void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows);
 
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -966,7 +966,7 @@ no_page_table:
 	 * has touched so far, we don't want to allocate page tables.
 	 */
 	if (flags & FOLL_ANON) {
-		page = ZERO_PAGE(address);
+		page = ZERO_PAGE(0);
 		if (flags & FOLL_GET)
 			get_page(page);
 		BUG_ON(flags & FOLL_WRITE);
@@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *t
 }
 EXPORT_SYMBOL(get_user_pages);
 
-static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
-			unsigned long addr, unsigned long end, pgprot_t prot)
-{
-	pte_t *pte;
-	spinlock_t *ptl;
-	int err = 0;
-
-	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
-	if (!pte)
-		return -EAGAIN;
-	arch_enter_lazy_mmu_mode();
-	do {
-		struct page *page = ZERO_PAGE(addr);
-		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
-
-		if (unlikely(!pte_none(*pte))) {
-			err = -EEXIST;
-			pte++;
-			break;
-		}
-		page_cache_get(page);
-		page_add_file_rmap(page);
-		inc_mm_counter(mm, file_rss);
-		set_pte_at(mm, addr, pte, zero_pte);
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	arch_leave_lazy_mmu_mode();
-	pte_unmap_unlock(pte - 1, ptl);
-	return err;
-}
-
-static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
-			unsigned long addr, unsigned long end, pgprot_t prot)
-{
-	pmd_t *pmd;
-	unsigned long next;
-	int err;
-
-	pmd = pmd_alloc(mm, pud, addr);
-	if (!pmd)
-		return -EAGAIN;
-	do {
-		next = pmd_addr_end(addr, end);
-		err = zeromap_pte_range(mm, pmd, addr, next, prot);
-		if (err)
-			break;
-	} while (pmd++, addr = next, addr != end);
-	return err;
-}
-
-static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
-			unsigned long addr, unsigned long end, pgprot_t prot)
-{
-	pud_t *pud;
-	unsigned long next;
-	int err;
-
-	pud = pud_alloc(mm, pgd, addr);
-	if (!pud)
-		return -EAGAIN;
-	do {
-		next = pud_addr_end(addr, end);
-		err = zeromap_pmd_range(mm, pud, addr, next, prot);
-		if (err)
-			break;
-	} while (pud++, addr = next, addr != end);
-	return err;
-}
-
-int zeromap_page_range(struct vm_area_struct *vma,
-			unsigned long addr, unsigned long size, pgprot_t prot)
-{
-	pgd_t *pgd;
-	unsigned long next;
-	unsigned long end = addr + size;
-	struct mm_struct *mm = vma->vm_mm;
-	int err;
-
-	BUG_ON(addr >= end);
-	pgd = pgd_offset(mm, addr);
-	flush_cache_range(vma, addr, end);
-	do {
-		next = pgd_addr_end(addr, end);
-		err = zeromap_pud_range(mm, pgd, addr, next, prot);
-		if (err)
-			break;
-	} while (pgd++, addr = next, addr != end);
-	return err;
-}
-
 pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
 {
 	pgd_t * pgd = pgd_offset(mm, addr);
@@ -1714,16 +1625,11 @@ gotten:
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
-	if (old_page == ZERO_PAGE(address)) {
-		new_page = alloc_zeroed_user_highpage_movable(vma, address);
-		if (!new_page)
-			goto oom;
-	} else {
-		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-		if (!new_page)
-			goto oom;
-		cow_user_page(new_page, old_page, address, vma);
-	}
+	VM_BUG_ON(old_page == ZERO_PAGE(0));
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (!new_page)
+		goto oom;
+	cow_user_page(new_page, old_page, address, vma);
 
 	/*
 	 * Re-check the pte - we dropped the lock
@@ -2249,39 +2155,24 @@ static int do_anonymous_page(struct mm_s
 	spinlock_t *ptl;
 	pte_t entry;
 
-	if (write_access) {
-		/* Allocate our own private page. */
-		pte_unmap(page_table);
-
-		if (unlikely(anon_vma_prepare(vma)))
-			goto oom;
-		page = alloc_zeroed_user_highpage_movable(vma, address);
-		if (!page)
-			goto oom;
-
-		entry = mk_pte(page, vma->vm_page_prot);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	/* Allocate our own private page. */
+	pte_unmap(page_table);
 
-		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-		if (!pte_none(*page_table))
-			goto release;
-		inc_mm_counter(mm, anon_rss);
-		lru_cache_add_active(page);
-		page_add_new_anon_rmap(page, vma, address);
-	} else {
-		/* Map the ZERO_PAGE - vm_page_prot is readonly */
-		page = ZERO_PAGE(address);
-		page_cache_get(page);
-		entry = mk_pte(page, vma->vm_page_prot);
+	if (unlikely(anon_vma_prepare(vma)))
+		goto oom;
+	page = alloc_zeroed_user_highpage_movable(vma, address);
+	if (!page)
+		goto oom;
 
-		ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
-		if (!pte_none(*page_table))
-			goto release;
-		inc_mm_counter(mm, file_rss);
-		page_add_file_rmap(page);
-	}
+	entry = mk_pte(page, vma->vm_page_prot);
+	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_none(*page_table))
+		goto release;
+	inc_mm_counter(mm, anon_rss);
+	lru_cache_add_active(page);
+	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
 
 	/* No need to invalidate - it was non-present before */
Index: linux-2.6/fs/binfmt_elf.c
===================================================================
--- linux-2.6.orig/fs/binfmt_elf.c
+++ linux-2.6/fs/binfmt_elf.c
@@ -1733,7 +1733,7 @@ static int elf_core_dump(long signr, str
 						&page, &vma) <= 0) {
 				DUMP_SEEK(PAGE_SIZE);
 			} else {
-				if (page == ZERO_PAGE(addr)) {
+				if (page == ZERO_PAGE(0)) {
 					if (!dump_seek(file, PAGE_SIZE)) {
 						page_cache_release(page);
 						goto end_coredump;
Index: linux-2.6/fs/binfmt_elf_fdpic.c
===================================================================
--- linux-2.6.orig/fs/binfmt_elf_fdpic.c
+++ linux-2.6/fs/binfmt_elf_fdpic.c
@@ -1488,7 +1488,7 @@ static int elf_fdpic_dump_segments(struc
 					   &page, &vma) <= 0) {
 				DUMP_SEEK(file->f_pos + PAGE_SIZE);
 			}
-			else if (page == ZERO_PAGE(addr)) {
+			else if (page == ZERO_PAGE(0)) {
 				page_cache_release(page);
 				DUMP_SEEK(file->f_pos + PAGE_SIZE);
 			}
Index: linux-2.6/fs/direct-io.c
===================================================================
--- linux-2.6.orig/fs/direct-io.c
+++ linux-2.6/fs/direct-io.c
@@ -163,7 +163,7 @@ static int dio_refill_pages(struct dio *
 	up_read(&current->mm->mmap_sem);
 
 	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
-		struct page *page = ZERO_PAGE(dio->curr_user_address);
+		struct page *page = ZERO_PAGE(0);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
 		 * mapped blocks.  We need to use those blocks up to avoid
@@ -772,7 +772,7 @@ static void dio_zero_block(struct dio *d
 
 	this_chunk_bytes = this_chunk_blocks << dio->blkbits;
 
-	page = ZERO_PAGE(dio->curr_user_address);
+	page = ZERO_PAGE(0);
 	if (submit_page_section(dio, page, 0, this_chunk_bytes, 
 				dio->next_block_for_io))
 		return;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-27  2:19 [patch][rfc] remove ZERO_PAGE? Nick Piggin
@ 2007-07-27  5:29 ` Linus Torvalds
  2007-07-27  5:54   ` Nick Piggin
  2007-07-27 15:59 ` Hugh Dickins
  2007-07-30 13:52 ` Luiz Fernando N. Capitulino
  2 siblings, 1 reply; 22+ messages in thread
From: Linus Torvalds @ 2007-07-27  5:29 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Hugh Dickins, Andrea Arcangeli,
	Linux Memory Management List

On Fri, 27 Jul 2007, Nick Piggin wrote:
> 
> I'd like to see if we can get the ball rolling on this again, and try to
> get it in 2.6.24 maybe. Any comments?

I'd really want real performance numbers. I don't like the "remove it 
because I don't like it". I want real numbers for real loads before I'm 
really interested.

Last time this came up, the logic behind wanting to remove the zero page 
was all screwed up, and it was all based on totally broken premises. So I 
really want somethign else than just "I don't like it".

Sorry. In the absense of numbers (and not just some made-up branchmark: 
something real - I can _easily_ make benchmarks that show that ZERO_PAGE 
is wonderful), I'm not at all interested.

			Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-27  5:29 ` Linus Torvalds
@ 2007-07-27  5:54   ` Nick Piggin
  2007-07-27 15:21     ` Linus Torvalds
  0 siblings, 1 reply; 22+ messages in thread
From: Nick Piggin @ 2007-07-27  5:54 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Hugh Dickins, Andrea Arcangeli,
	Linux Memory Management List

On Thu, Jul 26, 2007 at 10:29:01PM -0700, Linus Torvalds wrote:
> 
> 
> On Fri, 27 Jul 2007, Nick Piggin wrote:
> > 
> > I'd like to see if we can get the ball rolling on this again, and try to
> > get it in 2.6.24 maybe. Any comments?
> 
> I'd really want real performance numbers. I don't like the "remove it 
> because I don't like it". I want real numbers for real loads before I'm 
> really interested.

What numbers, though? I can make up benchmarks to show that ZERO_PAGE
sucks just as much. The problem I don't think is finding a situatoin that
improves without it (we have an extreme case where the Altix livelocked)
but to get confidence that nothing is going to blow up.

 
> Last time this came up, the logic behind wanting to remove the zero page 
> was all screwed up, and it was all based on totally broken premises. So I 
> really want somethign else than just "I don't like it".

I thought that last time this came up you thought it might be good to
try out in -mm initially.

 
> Sorry. In the absense of numbers (and not just some made-up branchmark: 
> something real - I can _easily_ make benchmarks that show that ZERO_PAGE 
> is wonderful), I'm not at all interested.

OK, well what numbers would you like to see? I can always try a few
things.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-27  5:54   ` Nick Piggin
@ 2007-07-27 15:21     ` Linus Torvalds
  2007-07-30  3:08       ` Nick Piggin
  0 siblings, 1 reply; 22+ messages in thread
From: Linus Torvalds @ 2007-07-27 15:21 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Hugh Dickins, Andrea Arcangeli,
	Linux Memory Management List


On Fri, 27 Jul 2007, Nick Piggin wrote:
> 
> What numbers, though? I can make up benchmarks to show that ZERO_PAGE
> sucks just as much. The problem I don't think is finding a situatoin that
> improves without it (we have an extreme case where the Altix livelocked)
> but to get confidence that nothing is going to blow up.

Well, the Altix livelock, for example, would seem to be simply because 
setting up the ZERO_PAGE is so much *faster* that it makes it easier to 
create humongous processes etc so quickly that you don't have time for 
them to be interrupted at setup time.

Is that the "fault" of ZERO_PAGE? Sure. But still..

> > Last time this came up, the logic behind wanting to remove the zero page 
> > was all screwed up, and it was all based on totally broken premises. So I 
> > really want somethign else than just "I don't like it".
> 
> I thought that last time this came up you thought it might be good to
> try out in -mm initially.

I was more thinking about all the churn that we got due to the reference 
counting stuff. That was pretty painful, and removing ZERO_PAGE wasn't the 
right answer then either.

> OK, well what numbers would you like to see? I can always try a few
> things.

Kernel builds with/without this? If the page faults really are that big a 
deal, this should all be visible.

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-27  2:19 [patch][rfc] remove ZERO_PAGE? Nick Piggin
  2007-07-27  5:29 ` Linus Torvalds
@ 2007-07-27 15:59 ` Hugh Dickins
  2007-07-30 13:52 ` Luiz Fernando N. Capitulino
  2 siblings, 0 replies; 22+ messages in thread
From: Hugh Dickins @ 2007-07-27 15:59 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Linus Torvalds, Andrea Arcangeli, Ingo Molnar,
	Linux Memory Management List

On Fri, 27 Jul 2007, Nick Piggin wrote:
> 
> I'd like to see if we can get the ball rolling on this again, and try to
> get it in 2.6.24 maybe. Any comments?

I'd be glad to see it go in.  You omitted the diffstat:
 6 files changed, 43 insertions(+), 245 deletions(-)

> 
> ---
> Inserting a ZERO_PAGE for anonymous read faults appears to be a false
> optimisation: if an application is performance critical, it would not
> be doing read faults of new memory or at least it could be expected to
> write to that memory soon afterwards. If it is memory use is critical,
> it should not be touching addresses that it knows to be zero anyway.
> 
> eg. Very sparse matrix code might benefit from the ZERO_PAGE, however it
> would only be a naive implementation that isn't tuned for memory usage
> anyway.

Get into -mm soonest, in the hope of revealing anything weird which
does regress (though sadly those tend to get reported only once in
Linus' tree).

> 
> The motivation for this came from a situation where an Altix system
> was essentially livelocked tearing down ZERO_PAGE pagetables when an
> HPC app aborted during startup. This is also a case of a silly
> userspace access pattern, but it did highlight the potential scalability
> problem of the ZERO_PAGE, and corner cases where it can really hurt.
> 
> Mesuring on my desktop system, there are never many mappings to the
> ZERO_PAGE, thus memory usage should not increase too much if we remove
> it. My desktop is by no means representative, but it gives some
> indication.
> 
> When running a make -j4 kernel compile on my dual core system, there are
> about 1,000 mappings to the ZERO_PAGE created per second, and about 1,000
> ZERO_PAGE COW faults per second! (Less than 1 ZERO_PAGE mapping per second
> is torn down without being COWed). So this patch will save 1,000 page
> faults per second, and 2,000 bounces of the ZERO_PAGE struct page
> cacheline per second when running kbuild, which might end up being a
> significant scalability hit on big systems.

I forget my exact numbers, but those are consistent with what
I saw too, back when discouraging you from special casing the
ZERO_PAGE from refcounting: before the Altix issue ever came up
(though we agreed such an issue might emerge, and be fixed this way).

> 
> The /dev/zero ZERO_PAGE usage and TLB tricks also get nuked. I don't see
> much use to them except complexity and useless benchmarks.

If I'm not mistaken, it was Ingo who long ago put in that "For fun,
we are using the MMU for this" code.  Cc'ed, he's older and wiser
now ;) but might still have a compelling case for it.

But I'd be relieved for it to go: it's easily overlooked and causes
nasty surprises, as when we realized that filemap_xip's original
use of ZERO_PAGE conflicted.

> All other
> users of ZERO_PAGE are converted just to use ZERO_PAGE(0) for simplicity.
> We can look at replacing them all and ripping out ZERO_PAGE completely
> if/when this patch gets in.

My guess is that enough scattered uses will always remain for it
to be useful; though not necessarily wrapped up in that macro.

Hugh

> 
> Index: linux-2.6/drivers/char/mem.c
> ===================================================================
> --- linux-2.6.orig/drivers/char/mem.c
> +++ linux-2.6/drivers/char/mem.c
> @@ -625,65 +625,10 @@ static ssize_t splice_write_null(struct 
>  	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_null);
>  }
>  
> -#ifdef CONFIG_MMU
> -/*
> - * For fun, we are using the MMU for this.
> - */
> -static inline size_t read_zero_pagealigned(char __user * buf, size_t size)
> -{
> -	struct mm_struct *mm;
> -	struct vm_area_struct * vma;
> -	unsigned long addr=(unsigned long)buf;
> -
> -	mm = current->mm;
> -	/* Oops, this was forgotten before. -ben */
> -	down_read(&mm->mmap_sem);
> -
> -	/* For private mappings, just map in zero pages. */
> -	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
> -		unsigned long count;
> -
> -		if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
> -			goto out_up;
> -		if (vma->vm_flags & (VM_SHARED | VM_HUGETLB))
> -			break;
> -		count = vma->vm_end - addr;
> -		if (count > size)
> -			count = size;
> -
> -		zap_page_range(vma, addr, count, NULL);
> -        	if (zeromap_page_range(vma, addr, count, PAGE_COPY))
> -			break;
> -
> -		size -= count;
> -		buf += count;
> -		addr += count;
> -		if (size == 0)
> -			goto out_up;
> -	}
> -
> -	up_read(&mm->mmap_sem);
> -	
> -	/* The shared case is hard. Let's do the conventional zeroing. */ 
> -	do {
> -		unsigned long unwritten = clear_user(buf, PAGE_SIZE);
> -		if (unwritten)
> -			return size + unwritten - PAGE_SIZE;
> -		cond_resched();
> -		buf += PAGE_SIZE;
> -		size -= PAGE_SIZE;
> -	} while (size);
> -
> -	return size;
> -out_up:
> -	up_read(&mm->mmap_sem);
> -	return size;
> -}
> -
>  static ssize_t read_zero(struct file * file, char __user * buf, 
>  			 size_t count, loff_t *ppos)
>  {
> -	unsigned long left, unwritten, written = 0;
> +	size_t written;
>  
>  	if (!count)
>  		return 0;
> @@ -691,69 +636,33 @@ static ssize_t read_zero(struct file * f
>  	if (!access_ok(VERIFY_WRITE, buf, count))
>  		return -EFAULT;
>  
> -	left = count;
> -
> -	/* do we want to be clever? Arbitrary cut-off */
> -	if (count >= PAGE_SIZE*4) {
> -		unsigned long partial;
> -
> -		/* How much left of the page? */
> -		partial = (PAGE_SIZE-1) & -(unsigned long) buf;
> -		unwritten = clear_user(buf, partial);
> -		written = partial - unwritten;
> +	written = 0;
> +	while (count) {
> +		unsigned long unwritten;
> +		size_t chunk = count;
> +
> +		if (chunk > PAGE_SIZE)
> +			chunk = PAGE_SIZE;	/* Just for latency reasons */
> +		unwritten = clear_user(buf, chunk);
> +		written += chunk - unwritten;
>  		if (unwritten)
> -			goto out;
> -		left -= partial;
> -		buf += partial;
> -		unwritten = read_zero_pagealigned(buf, left & PAGE_MASK);
> -		written += (left & PAGE_MASK) - unwritten;
> -		if (unwritten)
> -			goto out;
> -		buf += left & PAGE_MASK;
> -		left &= ~PAGE_MASK;
> -	}
> -	unwritten = clear_user(buf, left);
> -	written += left - unwritten;
> -out:
> -	return written ? written : -EFAULT;
> -}
> -
> -static int mmap_zero(struct file * file, struct vm_area_struct * vma)
> -{
> -	int err;
> -
> -	if (vma->vm_flags & VM_SHARED)
> -		return shmem_zero_setup(vma);
> -	err = zeromap_page_range(vma, vma->vm_start,
> -			vma->vm_end - vma->vm_start, vma->vm_page_prot);
> -	BUG_ON(err == -EEXIST);
> -	return err;
> -}
> -#else /* CONFIG_MMU */
> -static ssize_t read_zero(struct file * file, char * buf, 
> -			 size_t count, loff_t *ppos)
> -{
> -	size_t todo = count;
> -
> -	while (todo) {
> -		size_t chunk = todo;
> -
> -		if (chunk > 4096)
> -			chunk = 4096;	/* Just for latency reasons */
> -		if (clear_user(buf, chunk))
> -			return -EFAULT;
> +			break;
>  		buf += chunk;
> -		todo -= chunk;
> +		count -= chunk;
>  		cond_resched();
>  	}
> -	return count;
> +	return written ? written : -EFAULT;
>  }
>  
>  static int mmap_zero(struct file * file, struct vm_area_struct * vma)
>  {
> +#ifndef CONFIG_MMU
>  	return -ENOSYS;
> +#endif
> +	if (vma->vm_flags & VM_SHARED)
> +		return shmem_zero_setup(vma);
> +	return 0;
>  }
> -#endif /* CONFIG_MMU */
>  
>  static ssize_t write_full(struct file * file, const char __user * buf,
>  			  size_t count, loff_t *ppos)
> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h
> +++ linux-2.6/include/linux/mm.h
> @@ -778,8 +778,6 @@ void free_pgtables(struct mmu_gather **t
>  		unsigned long floor, unsigned long ceiling);
>  int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
>  			struct vm_area_struct *vma);
> -int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
> -			unsigned long size, pgprot_t prot);
>  void unmap_mapping_range(struct address_space *mapping,
>  		loff_t const holebegin, loff_t const holelen, int even_cows);
>  
> Index: linux-2.6/mm/memory.c
> ===================================================================
> --- linux-2.6.orig/mm/memory.c
> +++ linux-2.6/mm/memory.c
> @@ -966,7 +966,7 @@ no_page_table:
>  	 * has touched so far, we don't want to allocate page tables.
>  	 */
>  	if (flags & FOLL_ANON) {
> -		page = ZERO_PAGE(address);
> +		page = ZERO_PAGE(0);
>  		if (flags & FOLL_GET)
>  			get_page(page);
>  		BUG_ON(flags & FOLL_WRITE);
> @@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *t
>  }
>  EXPORT_SYMBOL(get_user_pages);
>  
> -static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
> -			unsigned long addr, unsigned long end, pgprot_t prot)
> -{
> -	pte_t *pte;
> -	spinlock_t *ptl;
> -	int err = 0;
> -
> -	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
> -	if (!pte)
> -		return -EAGAIN;
> -	arch_enter_lazy_mmu_mode();
> -	do {
> -		struct page *page = ZERO_PAGE(addr);
> -		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
> -
> -		if (unlikely(!pte_none(*pte))) {
> -			err = -EEXIST;
> -			pte++;
> -			break;
> -		}
> -		page_cache_get(page);
> -		page_add_file_rmap(page);
> -		inc_mm_counter(mm, file_rss);
> -		set_pte_at(mm, addr, pte, zero_pte);
> -	} while (pte++, addr += PAGE_SIZE, addr != end);
> -	arch_leave_lazy_mmu_mode();
> -	pte_unmap_unlock(pte - 1, ptl);
> -	return err;
> -}
> -
> -static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
> -			unsigned long addr, unsigned long end, pgprot_t prot)
> -{
> -	pmd_t *pmd;
> -	unsigned long next;
> -	int err;
> -
> -	pmd = pmd_alloc(mm, pud, addr);
> -	if (!pmd)
> -		return -EAGAIN;
> -	do {
> -		next = pmd_addr_end(addr, end);
> -		err = zeromap_pte_range(mm, pmd, addr, next, prot);
> -		if (err)
> -			break;
> -	} while (pmd++, addr = next, addr != end);
> -	return err;
> -}
> -
> -static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
> -			unsigned long addr, unsigned long end, pgprot_t prot)
> -{
> -	pud_t *pud;
> -	unsigned long next;
> -	int err;
> -
> -	pud = pud_alloc(mm, pgd, addr);
> -	if (!pud)
> -		return -EAGAIN;
> -	do {
> -		next = pud_addr_end(addr, end);
> -		err = zeromap_pmd_range(mm, pud, addr, next, prot);
> -		if (err)
> -			break;
> -	} while (pud++, addr = next, addr != end);
> -	return err;
> -}
> -
> -int zeromap_page_range(struct vm_area_struct *vma,
> -			unsigned long addr, unsigned long size, pgprot_t prot)
> -{
> -	pgd_t *pgd;
> -	unsigned long next;
> -	unsigned long end = addr + size;
> -	struct mm_struct *mm = vma->vm_mm;
> -	int err;
> -
> -	BUG_ON(addr >= end);
> -	pgd = pgd_offset(mm, addr);
> -	flush_cache_range(vma, addr, end);
> -	do {
> -		next = pgd_addr_end(addr, end);
> -		err = zeromap_pud_range(mm, pgd, addr, next, prot);
> -		if (err)
> -			break;
> -	} while (pgd++, addr = next, addr != end);
> -	return err;
> -}
> -
>  pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
>  {
>  	pgd_t * pgd = pgd_offset(mm, addr);
> @@ -1714,16 +1625,11 @@ gotten:
>  
>  	if (unlikely(anon_vma_prepare(vma)))
>  		goto oom;
> -	if (old_page == ZERO_PAGE(address)) {
> -		new_page = alloc_zeroed_user_highpage_movable(vma, address);
> -		if (!new_page)
> -			goto oom;
> -	} else {
> -		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
> -		if (!new_page)
> -			goto oom;
> -		cow_user_page(new_page, old_page, address, vma);
> -	}
> +	VM_BUG_ON(old_page == ZERO_PAGE(0));
> +	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
> +	if (!new_page)
> +		goto oom;
> +	cow_user_page(new_page, old_page, address, vma);
>  
>  	/*
>  	 * Re-check the pte - we dropped the lock
> @@ -2249,39 +2155,24 @@ static int do_anonymous_page(struct mm_s
>  	spinlock_t *ptl;
>  	pte_t entry;
>  
> -	if (write_access) {
> -		/* Allocate our own private page. */
> -		pte_unmap(page_table);
> -
> -		if (unlikely(anon_vma_prepare(vma)))
> -			goto oom;
> -		page = alloc_zeroed_user_highpage_movable(vma, address);
> -		if (!page)
> -			goto oom;
> -
> -		entry = mk_pte(page, vma->vm_page_prot);
> -		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> +	/* Allocate our own private page. */
> +	pte_unmap(page_table);
>  
> -		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
> -		if (!pte_none(*page_table))
> -			goto release;
> -		inc_mm_counter(mm, anon_rss);
> -		lru_cache_add_active(page);
> -		page_add_new_anon_rmap(page, vma, address);
> -	} else {
> -		/* Map the ZERO_PAGE - vm_page_prot is readonly */
> -		page = ZERO_PAGE(address);
> -		page_cache_get(page);
> -		entry = mk_pte(page, vma->vm_page_prot);
> +	if (unlikely(anon_vma_prepare(vma)))
> +		goto oom;
> +	page = alloc_zeroed_user_highpage_movable(vma, address);
> +	if (!page)
> +		goto oom;
>  
> -		ptl = pte_lockptr(mm, pmd);
> -		spin_lock(ptl);
> -		if (!pte_none(*page_table))
> -			goto release;
> -		inc_mm_counter(mm, file_rss);
> -		page_add_file_rmap(page);
> -	}
> +	entry = mk_pte(page, vma->vm_page_prot);
> +	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>  
> +	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
> +	if (!pte_none(*page_table))
> +		goto release;
> +	inc_mm_counter(mm, anon_rss);
> +	lru_cache_add_active(page);
> +	page_add_new_anon_rmap(page, vma, address);
>  	set_pte_at(mm, address, page_table, entry);
>  
>  	/* No need to invalidate - it was non-present before */
> Index: linux-2.6/fs/binfmt_elf.c
> ===================================================================
> --- linux-2.6.orig/fs/binfmt_elf.c
> +++ linux-2.6/fs/binfmt_elf.c
> @@ -1733,7 +1733,7 @@ static int elf_core_dump(long signr, str
>  						&page, &vma) <= 0) {
>  				DUMP_SEEK(PAGE_SIZE);
>  			} else {
> -				if (page == ZERO_PAGE(addr)) {
> +				if (page == ZERO_PAGE(0)) {
>  					if (!dump_seek(file, PAGE_SIZE)) {
>  						page_cache_release(page);
>  						goto end_coredump;
> Index: linux-2.6/fs/binfmt_elf_fdpic.c
> ===================================================================
> --- linux-2.6.orig/fs/binfmt_elf_fdpic.c
> +++ linux-2.6/fs/binfmt_elf_fdpic.c
> @@ -1488,7 +1488,7 @@ static int elf_fdpic_dump_segments(struc
>  					   &page, &vma) <= 0) {
>  				DUMP_SEEK(file->f_pos + PAGE_SIZE);
>  			}
> -			else if (page == ZERO_PAGE(addr)) {
> +			else if (page == ZERO_PAGE(0)) {
>  				page_cache_release(page);
>  				DUMP_SEEK(file->f_pos + PAGE_SIZE);
>  			}
> Index: linux-2.6/fs/direct-io.c
> ===================================================================
> --- linux-2.6.orig/fs/direct-io.c
> +++ linux-2.6/fs/direct-io.c
> @@ -163,7 +163,7 @@ static int dio_refill_pages(struct dio *
>  	up_read(&current->mm->mmap_sem);
>  
>  	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
> -		struct page *page = ZERO_PAGE(dio->curr_user_address);
> +		struct page *page = ZERO_PAGE(0);
>  		/*
>  		 * A memory fault, but the filesystem has some outstanding
>  		 * mapped blocks.  We need to use those blocks up to avoid
> @@ -772,7 +772,7 @@ static void dio_zero_block(struct dio *d
>  
>  	this_chunk_bytes = this_chunk_blocks << dio->blkbits;
>  
> -	page = ZERO_PAGE(dio->curr_user_address);
> +	page = ZERO_PAGE(0);
>  	if (submit_page_section(dio, page, 0, this_chunk_bytes, 
>  				dio->next_block_for_io))
>  		return;
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-27 15:21     ` Linus Torvalds
@ 2007-07-30  3:08       ` Nick Piggin
  2007-07-30  3:45         ` Linus Torvalds
  0 siblings, 1 reply; 22+ messages in thread
From: Nick Piggin @ 2007-07-30  3:08 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Hugh Dickins, Andrea Arcangeli,
	Linux Memory Management List

On Fri, Jul 27, 2007 at 08:21:54AM -0700, Linus Torvalds wrote:
> 
> 
> On Fri, 27 Jul 2007, Nick Piggin wrote:
> > 
> > What numbers, though? I can make up benchmarks to show that ZERO_PAGE
> > sucks just as much. The problem I don't think is finding a situatoin that
> > improves without it (we have an extreme case where the Altix livelocked)
> > but to get confidence that nothing is going to blow up.
> 
> Well, the Altix livelock, for example, would seem to be simply because 
> setting up the ZERO_PAGE is so much *faster* that it makes it easier to 
> create humongous processes etc so quickly that you don't have time for 
> them to be interrupted at setup time.
> 
> Is that the "fault" of ZERO_PAGE? Sure. But still..

Well the issue wasn't exactly that, but the fact that a lot of processes
all exitted at once, while each having a significant number of ZERO_PAGE
mappings. The freeing rate ends up going right down (OK it wasn't quite a
livelock, finishing in > 2 hours, but without ZERO_PAGE bouncing they
exit in 5 seconds).

> > > Last time this came up, the logic behind wanting to remove the zero page 
> > > was all screwed up, and it was all based on totally broken premises. So I 
> > > really want somethign else than just "I don't like it".
> > 
> > I thought that last time this came up you thought it might be good to
> > try out in -mm initially.
> 
> I was more thinking about all the churn that we got due to the reference 
> counting stuff. That was pretty painful, and removing ZERO_PAGE wasn't the 
> right answer then either.

OK the refcounting changes (as you know) were basically unifying 2 forms
of refcount handling in the mm to a single form. ZERO_PAGE was in one of
those classes where refcounting was mostly just skipped. However it is
notable among those because it is potentially most frequently shared.

I pushed the refcounting change because it would make lockless pagecache
work, and in this situation, weird ZERO_PAGE refcounting wasn't a problem
for me so I was happy to add back some of those refcounting exceptions
just for ZERO_PAGE. I guess you and Hugh mainly just saw the cleanliness
aspect so are against adding that back, and that's fair enough.

Well now we have a problem, which is that cacheline bouncing hurts. We
have maybe 2 ways to fix that (add more ZERO_PAGEs or don't refcount
ZERO_PAGE). However what if we take a step back and first ask if the
ZERO_PAGE is even worth keeping? IOW, my reasoning is not simply
"ZERO_PAGE refcounting sucks now so let's just rip it out", but rather
"we could fix the refcounting issue, but it is likely to make the code
more complex, so we could look at getting rid of it as another option".

I realise it is a reasonably significant change in behaviour, but we
have to have some process of shuffling these things off the mortal coil.
Suppose ZERO_PAGE had no downside at all except the extra code; if it
had no good reason for being, then we'd like to be able to remove it
eventually.

> > OK, well what numbers would you like to see? I can always try a few
> > things.
> 
> Kernel builds with/without this? If the page faults really are that big a 
> deal, this should all be visible.

Sorry if it was misleading: the kernel build numbers weren't really about
where ZERO_PAGE hurts us, but just trying to show that it doesn't help too
much (for which I realise anything short of a kernel.org release is sadly
inadequate).

Anyway, I'll see if I can get anything significant...

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-30  3:08       ` Nick Piggin
@ 2007-07-30  3:45         ` Linus Torvalds
  2007-07-30  3:56           ` Linus Torvalds
  2007-07-30  4:30           ` Nick Piggin
  0 siblings, 2 replies; 22+ messages in thread
From: Linus Torvalds @ 2007-07-30  3:45 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Hugh Dickins, Andrea Arcangeli,
	Linux Memory Management List

On Mon, 30 Jul 2007, Nick Piggin wrote:
> 
> Well the issue wasn't exactly that, but the fact that a lot of processes
> all exitted at once, while each having a significant number of ZERO_PAGE
> mappings. The freeing rate ends up going right down (OK it wasn't quite a
> livelock, finishing in > 2 hours, but without ZERO_PAGE bouncing they
> exit in 5 seconds).

Umm. Isn't this because of the new page->mapping counting for reserved 
pages?

The one I was violently against, and told you (and Hugh) was pointless and 
bad?

Or what "bouncing" are you talking about?

In other words, this doesn't really sound like a ZERO_PAGE problem at all, 
but a problem that was engineered by unnecessarily trying to count those 
pages in the first place. No?

> > Kernel builds with/without this? If the page faults really are that big a 
> > deal, this should all be visible.
> 
> Sorry if it was misleading: the kernel build numbers weren't really about
> where ZERO_PAGE hurts us, but just trying to show that it doesn't help too
> much (for which I realise anything short of a kernel.org release is sadly
> inadequate).
> 
> Anyway, I'll see if I can get anything significant...

The thing that really riles me up about this is that the whole damn thing 
seems to be so pointless. This "ZERO_PAGE is bad" thing has been a 
constant background noise, where people are pushing their opinions with no 
real technical reasons.

I want technical reasons, but I get the feeling that this pogrom is abotu 
anything but technical arguments.

So I _really_ don't want to hear you blaming ZERO_PAGE for something that 
you introduced yourself with Hugh, and that had _zero_ to do with 
ZERO_PAGE, and that I spent weeks saying was pointless, to the point where 
I just gave up.

Now, that you apparently have found the perfect load that proved me right, 
you instead of blaming the pointless refcounting, you want to blame the 
poor ZERO_PAGE. Again. 

And THAT is what makes me irritated with this patch. I hate the 
background, and what looks like intellectual dishonesty. I _told_ people 
that refcounting reserved pages was pointless and bad. Did you listen? No. 
And now that it causes problems, rather than blame the refcounting, you 
blame the victim.

The zero page is *cheaper* to set up than normal pages. It always was. You 
just *made* it more expensive, because of the irrational fear of 
PageReserved() that protected us from all those unnecessary bounces in the 
first place.

So a totally equivalent fix would be to just re-instate the PageReserved 
checks. It would likely even be easier these days (one logical place to do 
so would be in "vm_normal_page()", which automatically would catch all 
unmapping cases, but you'd still have to make sure you don't *increment* 
the page count when you map it too, of course).

But if you can actually show that ZERO_PAGE literally slows things down 
(and none of this page count bouncing crud that you were the one that 
introduced in the first place), then _that_ would be a totally different 
issue. At that point, you have an independent reason for removing code 
that has basically been there since day 1, and all my arguments go away.

See?

I'd love to hear "here's a real-life load, and yes, the ZERO_PAGE logic 
really does hurt more than it helps, it's time to remove it". At that 
point I'll happily apply the patch.

But what I *don't* want to hear is "we screwed up the reference-counting 
of ZERO_PAGE, so now it's so expensive that we want to remove the page 
entirely". That just makes me sad. And a bit angry.

			Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-30  3:45         ` Linus Torvalds
@ 2007-07-30  3:56           ` Linus Torvalds
  2007-07-30  4:35             ` Nick Piggin
  2007-07-30  4:30           ` Nick Piggin
  1 sibling, 1 reply; 22+ messages in thread
From: Linus Torvalds @ 2007-07-30  3:56 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Hugh Dickins, Andrea Arcangeli,
	Linux Memory Management List

On Sun, 29 Jul 2007, Linus Torvalds wrote:
> 
> I'd love to hear "here's a real-life load, and yes, the ZERO_PAGE logic 
> really does hurt more than it helps, it's time to remove it". At that 
> point I'll happily apply the patch.

Btw, in the absense of that, I'd at least like to hear an acknowledgement 
that the complexity isn't worth it, and that what used to work fine was 
broken because the reference counting overhead gets us on large-scale 
machines.

IOW, I certainly like removing lines of code. In that sense I _love_ that 
patch. I really just react negatively because I really think you first set 
ZERO_PAGE up to fail.

So even if you cannot find a load where this all matters, at least point 
to commit b5810039a54e5babf428e9a1e89fc1940fabff11 (or exactly whichever 
one it was that started ref-counting ZERO_PAGE) and blame *that* one, 
rather than blaming ZERO_PAGE for the problem.

It has served us well for fifteen years, we shouldn't blame it for 
problems that came from elsewhere.

			Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-30  3:45         ` Linus Torvalds
  2007-07-30  3:56           ` Linus Torvalds
@ 2007-07-30  4:30           ` Nick Piggin
  1 sibling, 0 replies; 22+ messages in thread
From: Nick Piggin @ 2007-07-30  4:30 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Hugh Dickins, Andrea Arcangeli,
	Linux Memory Management List

On Sun, Jul 29, 2007 at 08:45:25PM -0700, Linus Torvalds wrote:
> 
> 
> On Mon, 30 Jul 2007, Nick Piggin wrote:
> > 
> > Well the issue wasn't exactly that, but the fact that a lot of processes
> > all exitted at once, while each having a significant number of ZERO_PAGE
> > mappings. The freeing rate ends up going right down (OK it wasn't quite a
> > livelock, finishing in > 2 hours, but without ZERO_PAGE bouncing they
> > exit in 5 seconds).
> 
> Umm. Isn't this because of the new page->mapping counting for reserved 
> pages?
> 
> The one I was violently against, and told you (and Hugh) was pointless and 
> bad?
> 
> Or what "bouncing" are you talking about?

page->mapcount, yes. I don't quite remember anybody being violently against
it, but that's in the past now anyway.

> In other words, this doesn't really sound like a ZERO_PAGE problem at all, 
> but a problem that was engineered by unnecessarily trying to count those 
> pages in the first place. No?

Yeah, but that's a little unfair: it was engineered by removing the code
to *not* count those pages :) Another option I gave you was to add
back some of that code to avoid this refcounting, but you were violently
against that :).

> > > Kernel builds with/without this? If the page faults really are that big a 
> > > deal, this should all be visible.
> > 
> > Sorry if it was misleading: the kernel build numbers weren't really about
> > where ZERO_PAGE hurts us, but just trying to show that it doesn't help too
> > much (for which I realise anything short of a kernel.org release is sadly
> > inadequate).
> > 
> > Anyway, I'll see if I can get anything significant...
> 
> The thing that really riles me up about this is that the whole damn thing 
> seems to be so pointless. This "ZERO_PAGE is bad" thing has been a 
> constant background noise, where people are pushing their opinions with no 
> real technical reasons.
> 
> I want technical reasons, but I get the feeling that this pogrom is abotu 
> anything but technical arguments.
> 
> So I _really_ don't want to hear you blaming ZERO_PAGE for something that 
> you introduced yourself with Hugh, and that had _zero_ to do with 
> ZERO_PAGE, and that I spent weeks saying was pointless, to the point where 
> I just gave up.
> 
> Now, that you apparently have found the perfect load that proved me right, 
> you instead of blaming the pointless refcounting, you want to blame the 
> poor ZERO_PAGE. Again. 

No, I'm not saying ZERO_PAGE is bad because it has a refcounting scalability
problem. We can avoid or eliminate that by *not refcounting it* or doing
something crazy like per-node ZERO_PAGEs.

My first patch to fix the problem was actually to not refcount it. Remember?

The technical argument for this patch is that I'm trying to reason that
ZERO_PAGE is no good in the first place. In this debate, the refcounting
issue is really moot (IOW, I'm not trying to argue the ZERO_PAGE is bad
*because* of the refcounting problem, but because it is not good).

> And THAT is what makes me irritated with this patch. I hate the 
> background, and what looks like intellectual dishonesty. I _told_ people 
> that refcounting reserved pages was pointless and bad. Did you listen? No. 
> And now that it causes problems, rather than blame the refcounting, you 
> blame the victim.
> 
> The zero page is *cheaper* to set up than normal pages. It always was. You 
> just *made* it more expensive, because of the irrational fear of 
> PageReserved() that protected us from all those unnecessary bounces in the 
> first place.
> 
> So a totally equivalent fix would be to just re-instate the PageReserved 
> checks. It would likely even be easier these days (one logical place to do 
> so would be in "vm_normal_page()", which automatically would catch all 
> unmapping cases, but you'd still have to make sure you don't *increment* 
> the page count when you map it too, of course).

I didn't like PageReserved for a number of reasons including that it allowed
people to be sloppy with refcounting. However I wouldn't mind adding back
some checks to skip counting (although if we do that, let's add a new page
flag for it, and PageReserved can eventually disappear).

But we should do it on the right level. That is, the struct page itself is
a refcounted object. If we skip refcounting for userspace mappings, it
should be done there, rather than an ugly hack in put_page.

The fear of PageReserved was not irrational -- as I said, I needed to get
rid of the put_page special casing for the lockless pagecache. I'm not
against properly special casing special pages.

> But if you can actually show that ZERO_PAGE literally slows things down 
> (and none of this page count bouncing crud that you were the one that 
> introduced in the first place), then _that_ would be a totally different 
> issue. At that point, you have an independent reason for removing code 
> that has basically been there since day 1, and all my arguments go away.
> 
> See?
> 
> I'd love to hear "here's a real-life load, and yes, the ZERO_PAGE logic 
> really does hurt more than it helps, it's time to remove it". At that 
> point I'll happily apply the patch.
> 
> But what I *don't* want to hear is "we screwed up the reference-counting 
> of ZERO_PAGE, so now it's so expensive that we want to remove the page 
> entirely". That just makes me sad. And a bit angry.

I'd say it will be hard to actually get a significant real world
improvement from removing it vs de-refcounting it. Maybe on an mmap_sem
constrained workload, but that seems to be a lot better after the glibc
fix and private futexes...

But is there a good reason to keep it? You say it is cheaper to set up,
but make -j still does 500 extra page faults per second per cpu because
of ZERO_PAGE, making it effectively more expensive even ignoring the
refcounting problem. 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-30  3:56           ` Linus Torvalds
@ 2007-07-30  4:35             ` Nick Piggin
  0 siblings, 0 replies; 22+ messages in thread
From: Nick Piggin @ 2007-07-30  4:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Hugh Dickins, Andrea Arcangeli,
	Linux Memory Management List

On Sun, Jul 29, 2007 at 08:56:54PM -0700, Linus Torvalds wrote:
> 
> 
> On Sun, 29 Jul 2007, Linus Torvalds wrote:
> > 
> > I'd love to hear "here's a real-life load, and yes, the ZERO_PAGE logic 
> > really does hurt more than it helps, it's time to remove it". At that 
> > point I'll happily apply the patch.
> 
> Btw, in the absense of that, I'd at least like to hear an acknowledgement 
> that the complexity isn't worth it, and that what used to work fine was 
> broken because the reference counting overhead gets us on large-scale 
> machines.
> 
> IOW, I certainly like removing lines of code. In that sense I _love_ that 
> patch. I really just react negatively because I really think you first set 
> ZERO_PAGE up to fail.
> 
> So even if you cannot find a load where this all matters, at least point 
> to commit b5810039a54e5babf428e9a1e89fc1940fabff11 (or exactly whichever 
> one it was that started ref-counting ZERO_PAGE) and blame *that* one, 
> rather than blaming ZERO_PAGE for the problem.

OK, as a patch, the changelog could be improved. But I state that
complexity isn't worth it as "ZERO_PAGE appears to be a false optimisation".


> It has served us well for fifteen years, we shouldn't blame it for 
> problems that came from elsewhere.

Can it still serve us well, though? This is what I would really like to talk
about, and I guess would decide whether we again special case the refcounting
or remove it completely. I'm happy to do either (although I do *hope* that it
can be removed, for complexity's sake).


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-27  2:19 [patch][rfc] remove ZERO_PAGE? Nick Piggin
  2007-07-27  5:29 ` Linus Torvalds
  2007-07-27 15:59 ` Hugh Dickins
@ 2007-07-30 13:52 ` Luiz Fernando N. Capitulino
  2007-07-30 18:57   ` Andrew Morton
  2 siblings, 1 reply; 22+ messages in thread
From: Luiz Fernando N. Capitulino @ 2007-07-30 13:52 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Linus Torvalds, Hugh Dickins, Andrea Arcangeli,
	Linux Memory Management List, lcapitulino

Hi Nick,

On 7/26/07, Nick Piggin <npiggin@suse.de> wrote:

> I'd like to see if we can get the ball rolling on this again, and try to
> get it in 2.6.24 maybe. Any comments?

 I'm trying this patch and got this during the weekend (gmail will
probably break lines automatically, grrr):

"""
[29711.081281] BUG: unable to handle kernel NULL pointer dereference
at virtual address 00000004
[29711.081300]  printing eip:
[29711.081305] dcb5aa49
[29711.081308] *pde = 00000000
[29711.081315] Oops: 0000 [#1]
[29711.081319] SMP
[29711.081325] Modules linked in: nfsd exportfs auth_rpcgss nfs lockd
nfs_acl sunrpc capability commoncap af_packet ipv6 ide_cd ide_core
binfmt_misc loop dm_mod floppy pcspkr snd_pcm_oss snd_mixer_oss
snd_via82xx gameport snd_ac97_codec i2c_viapro amd64_agp snd_pcm
snd_timer ac97_bus snd_page_alloc snd_mpu401_uart snd_rawmidi
snd_seq_device snd agpgart ehci_hcd uhci_hcd i2c_core via_rhine k8temp
mii usbcore evdev tsdev soundcore sr_mod sg ext3 jbd sd_mod pata_via
sata_via libata scsi_mod
[29711.081445] CPU:    0
[29711.081446] EIP:    0060:[<dcb5aa49>]    Not tainted VLI
[29711.081447] EFLAGS: 00010246   (2.6.23-rc1-zpage #1)
[29711.081511] EIP is at encode_fsid+0x89/0xb0 [nfsd]
[29711.081529] eax: d99f4000   ebx: d80af064   ecx: 00000000   edx: 00000002
[29711.081549] esi: d809204c   edi: d80af14c   ebp: d8788f04   esp: d8788efc
[29711.081569] ds: 007b   es: 007b   fs: 00d8  gs: 0000  ss: 0068
[29711.081589] Process nfsd (pid: 3474, ti=d8788000 task=dac07740
task.ti=d8788000)
[29711.081609] Stack: 00000000 d809203c d8788f28 dcb5ab25 d80af064
c0945160 dcb59202 00000000
[29711.081644]        d80b0000 dcb5bf90 dcb77404 d8788f38 dcb5bfb3
d80af14c d80b0000 d8788f68
[29711.081679]        dcb4d32c d87b4c44 d87b4a80 d8788f60 00000003
d8092018 d8092000 0000001c
[29711.081714] Call Trace:
[29711.081738]  [<c01053fa>] show_trace_log_lvl+0x1a/0x30
[29711.081760]  [<c01054bb>] show_stack_log_lvl+0xab/0xd0
[29711.081779]  [<c01056b1>] show_registers+0x1d1/0x2d0
[29711.081798]  [<c01058c6>] die+0x116/0x250
[29711.081815]  [<c011bb2b>] do_page_fault+0x28b/0x690
[29711.081836]  [<c02e95ba>] error_code+0x72/0x78
[29711.081856]  [<dcb5ab25>] encode_fattr3+0xb5/0x140 [nfsd]
[29711.081888]  [<dcb5bfb3>] nfs3svc_encode_attrstat+0x23/0x50 [nfsd]
[29711.081921]  [<dcb4d32c>] nfsd_dispatch+0x18c/0x220 [nfsd]
[29711.081950]  [<dcaf41fa>] svc_process+0x42a/0x7b0 [sunrpc]
[29711.081985]  [<dcb4d909>] nfsd+0x169/0x290 [nfsd]
[29711.082013]  [<c0104f9f>] kernel_thread_helper+0x7/0x18
[29711.082032]  =======================
[29711.082047] Code: 48 30 89 cb c1 fb 1f 89 d8 0f c8 89 06 89 c8 0f
c8 89 46 04 8d 46 08 5b 5e 5d c3 8d b4 26 00 00 00 00 8b 83 88 00 00
00 8b 48 34 <8b> 51 04 33 51 0c 8b 01 33 41 08 89 d1 0f c8 0f c9 89 46
04 8d
[29711.082150] EIP: [<dcb5aa49>] encode_fsid+0x89/0xb0 [nfsd] SS:ESP
0068:d8788efc
"""

 Now I'm not sure if this was caused by your patch, or is a bug
somewhere else.

-- 
Luiz Fernando N. Capitulino

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-30 13:52 ` Luiz Fernando N. Capitulino
@ 2007-07-30 18:57   ` Andrew Morton
  2007-07-30 22:39     ` J. Bruce Fields
  0 siblings, 1 reply; 22+ messages in thread
From: Andrew Morton @ 2007-07-30 18:57 UTC (permalink / raw)
  To: Luiz Fernando N. Capitulino
  Cc: Nick Piggin, Linus Torvalds, Hugh Dickins, Andrea Arcangeli,
	Linux Memory Management List, lcapitulino, Neil Brown,
	J. Bruce Fields

On Mon, 30 Jul 2007 10:52:27 -0300
"Luiz Fernando N. Capitulino" <lcapitulino@gmail.com> wrote:

> Hi Nick,
> 
> On 7/26/07, Nick Piggin <npiggin@suse.de> wrote:
> 
> > I'd like to see if we can get the ball rolling on this again, and try to
> > get it in 2.6.24 maybe. Any comments?
> 
>  I'm trying this patch and got this during the weekend (gmail will
> probably break lines automatically, grrr):
> 
> """
> [29711.081281] BUG: unable to handle kernel NULL pointer dereference
> at virtual address 00000004
> [29711.081300]  printing eip:
> [29711.081305] dcb5aa49
> [29711.081308] *pde = 00000000
> [29711.081315] Oops: 0000 [#1]
> [29711.081319] SMP
> [29711.081325] Modules linked in: nfsd exportfs auth_rpcgss nfs lockd
> nfs_acl sunrpc capability commoncap af_packet ipv6 ide_cd ide_core
> binfmt_misc loop dm_mod floppy pcspkr snd_pcm_oss snd_mixer_oss
> snd_via82xx gameport snd_ac97_codec i2c_viapro amd64_agp snd_pcm
> snd_timer ac97_bus snd_page_alloc snd_mpu401_uart snd_rawmidi
> snd_seq_device snd agpgart ehci_hcd uhci_hcd i2c_core via_rhine k8temp
> mii usbcore evdev tsdev soundcore sr_mod sg ext3 jbd sd_mod pata_via
> sata_via libata scsi_mod
> [29711.081445] CPU:    0
> [29711.081446] EIP:    0060:[<dcb5aa49>]    Not tainted VLI
> [29711.081447] EFLAGS: 00010246   (2.6.23-rc1-zpage #1)
> [29711.081511] EIP is at encode_fsid+0x89/0xb0 [nfsd]
> [29711.081529] eax: d99f4000   ebx: d80af064   ecx: 00000000   edx: 00000002
> [29711.081549] esi: d809204c   edi: d80af14c   ebp: d8788f04   esp: d8788efc
> [29711.081569] ds: 007b   es: 007b   fs: 00d8  gs: 0000  ss: 0068
> [29711.081589] Process nfsd (pid: 3474, ti=d8788000 task=dac07740
> task.ti=d8788000)
> [29711.081609] Stack: 00000000 d809203c d8788f28 dcb5ab25 d80af064
> c0945160 dcb59202 00000000
> [29711.081644]        d80b0000 dcb5bf90 dcb77404 d8788f38 dcb5bfb3
> d80af14c d80b0000 d8788f68
> [29711.081679]        dcb4d32c d87b4c44 d87b4a80 d8788f60 00000003
> d8092018 d8092000 0000001c
> [29711.081714] Call Trace:
> [29711.081738]  [<c01053fa>] show_trace_log_lvl+0x1a/0x30
> [29711.081760]  [<c01054bb>] show_stack_log_lvl+0xab/0xd0
> [29711.081779]  [<c01056b1>] show_registers+0x1d1/0x2d0
> [29711.081798]  [<c01058c6>] die+0x116/0x250
> [29711.081815]  [<c011bb2b>] do_page_fault+0x28b/0x690
> [29711.081836]  [<c02e95ba>] error_code+0x72/0x78
> [29711.081856]  [<dcb5ab25>] encode_fattr3+0xb5/0x140 [nfsd]
> [29711.081888]  [<dcb5bfb3>] nfs3svc_encode_attrstat+0x23/0x50 [nfsd]
> [29711.081921]  [<dcb4d32c>] nfsd_dispatch+0x18c/0x220 [nfsd]
> [29711.081950]  [<dcaf41fa>] svc_process+0x42a/0x7b0 [sunrpc]
> [29711.081985]  [<dcb4d909>] nfsd+0x169/0x290 [nfsd]
> [29711.082013]  [<c0104f9f>] kernel_thread_helper+0x7/0x18
> [29711.082032]  =======================
> [29711.082047] Code: 48 30 89 cb c1 fb 1f 89 d8 0f c8 89 06 89 c8 0f
> c8 89 46 04 8d 46 08 5b 5e 5d c3 8d b4 26 00 00 00 00 8b 83 88 00 00
> 00 8b 48 34 <8b> 51 04 33 51 0c 8b 01 33 41 08 89 d1 0f c8 0f c9 89 46
> 04 8d
> [29711.082150] EIP: [<dcb5aa49>] encode_fsid+0x89/0xb0 [nfsd] SS:ESP
> 0068:d8788efc
> """
> 
>  Now I'm not sure if this was caused by your patch, or is a bug
> somewhere else.

It's a little hard to see how Nick's patch could have caused that to
happen.

Neil, Bruce: does this look at all familiar?  We don't appear to have
changed anything in there for months...

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-30 18:57   ` Andrew Morton
@ 2007-07-30 22:39     ` J. Bruce Fields
  2007-07-30 23:09       ` Andrew Morton
  2007-08-01  1:47       ` Nick Piggin
  0 siblings, 2 replies; 22+ messages in thread
From: J. Bruce Fields @ 2007-07-30 22:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Luiz Fernando N. Capitulino, Nick Piggin, Linus Torvalds,
	Hugh Dickins, Andrea Arcangeli, Linux Memory Management List,
	lcapitulino, Neil Brown

On Mon, Jul 30, 2007 at 11:57:51AM -0700, Andrew Morton wrote:
> On Mon, 30 Jul 2007 10:52:27 -0300
> "Luiz Fernando N. Capitulino" <lcapitulino@gmail.com> wrote:
> 
> > Hi Nick,
> > 
> > On 7/26/07, Nick Piggin <npiggin@suse.de> wrote:
> > 
> > > I'd like to see if we can get the ball rolling on this again, and try to
> > > get it in 2.6.24 maybe. Any comments?
> > 
> >  I'm trying this patch and got this during the weekend (gmail will
> > probably break lines automatically, grrr):
> > 
> > """
> > [29711.081281] BUG: unable to handle kernel NULL pointer dereference
> > at virtual address 00000004
> > [29711.081300]  printing eip:
> > [29711.081305] dcb5aa49
> > [29711.081308] *pde = 00000000
> > [29711.081315] Oops: 0000 [#1]
> > [29711.081319] SMP
> > [29711.081325] Modules linked in: nfsd exportfs auth_rpcgss nfs lockd
> > nfs_acl sunrpc capability commoncap af_packet ipv6 ide_cd ide_core
> > binfmt_misc loop dm_mod floppy pcspkr snd_pcm_oss snd_mixer_oss
> > snd_via82xx gameport snd_ac97_codec i2c_viapro amd64_agp snd_pcm
> > snd_timer ac97_bus snd_page_alloc snd_mpu401_uart snd_rawmidi
> > snd_seq_device snd agpgart ehci_hcd uhci_hcd i2c_core via_rhine k8temp
> > mii usbcore evdev tsdev soundcore sr_mod sg ext3 jbd sd_mod pata_via
> > sata_via libata scsi_mod
> > [29711.081445] CPU:    0
> > [29711.081446] EIP:    0060:[<dcb5aa49>]    Not tainted VLI
> > [29711.081447] EFLAGS: 00010246   (2.6.23-rc1-zpage #1)
> > [29711.081511] EIP is at encode_fsid+0x89/0xb0 [nfsd]
> > [29711.081529] eax: d99f4000   ebx: d80af064   ecx: 00000000   edx: 00000002
> > [29711.081549] esi: d809204c   edi: d80af14c   ebp: d8788f04   esp: d8788efc
> > [29711.081569] ds: 007b   es: 007b   fs: 00d8  gs: 0000  ss: 0068
> > [29711.081589] Process nfsd (pid: 3474, ti=d8788000 task=dac07740
> > task.ti=d8788000)
> > [29711.081609] Stack: 00000000 d809203c d8788f28 dcb5ab25 d80af064
> > c0945160 dcb59202 00000000
> > [29711.081644]        d80b0000 dcb5bf90 dcb77404 d8788f38 dcb5bfb3
> > d80af14c d80b0000 d8788f68
> > [29711.081679]        dcb4d32c d87b4c44 d87b4a80 d8788f60 00000003
> > d8092018 d8092000 0000001c
> > [29711.081714] Call Trace:
> > [29711.081738]  [<c01053fa>] show_trace_log_lvl+0x1a/0x30
> > [29711.081760]  [<c01054bb>] show_stack_log_lvl+0xab/0xd0
> > [29711.081779]  [<c01056b1>] show_registers+0x1d1/0x2d0
> > [29711.081798]  [<c01058c6>] die+0x116/0x250
> > [29711.081815]  [<c011bb2b>] do_page_fault+0x28b/0x690
> > [29711.081836]  [<c02e95ba>] error_code+0x72/0x78
> > [29711.081856]  [<dcb5ab25>] encode_fattr3+0xb5/0x140 [nfsd]
> > [29711.081888]  [<dcb5bfb3>] nfs3svc_encode_attrstat+0x23/0x50 [nfsd]
> > [29711.081921]  [<dcb4d32c>] nfsd_dispatch+0x18c/0x220 [nfsd]
> > [29711.081950]  [<dcaf41fa>] svc_process+0x42a/0x7b0 [sunrpc]
> > [29711.081985]  [<dcb4d909>] nfsd+0x169/0x290 [nfsd]
> > [29711.082013]  [<c0104f9f>] kernel_thread_helper+0x7/0x18
> > [29711.082032]  =======================
> > [29711.082047] Code: 48 30 89 cb c1 fb 1f 89 d8 0f c8 89 06 89 c8 0f
> > c8 89 46 04 8d 46 08 5b 5e 5d c3 8d b4 26 00 00 00 00 8b 83 88 00 00
> > 00 8b 48 34 <8b> 51 04 33 51 0c 8b 01 33 41 08 89 d1 0f c8 0f c9 89 46
> > 04 8d
> > [29711.082150] EIP: [<dcb5aa49>] encode_fsid+0x89/0xb0 [nfsd] SS:ESP
> > 0068:d8788efc
> > """
> > 
> >  Now I'm not sure if this was caused by your patch, or is a bug
> > somewhere else.
> 
> It's a little hard to see how Nick's patch could have caused that to
> happen.
> 
> Neil, Bruce: does this look at all familiar?

Not to me.

> We don't appear to have changed anything in there for months...

Well, I've fooled around with the exports in 2.6.23-rc1, and Neil added
the uuid stuff some time around 2.6.21.

It looks to me like it's oopsing at the deference of
fhp->fh_export->ex_uuid in encode_fsid(), which is exactly the case
commit b41eeef14d claims to fix.  Looks like that's been in since
v2.6.22-rc1; what kernel is this?

--b.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-30 22:39     ` J. Bruce Fields
@ 2007-07-30 23:09       ` Andrew Morton
  2007-07-31  0:03         ` Luiz Fernando N. Capitulino
  2007-08-01  1:47       ` Nick Piggin
  1 sibling, 1 reply; 22+ messages in thread
From: Andrew Morton @ 2007-07-30 23:09 UTC (permalink / raw)
  To: J. Bruce Fields
  Cc: Luiz Fernando N. Capitulino, Nick Piggin, Linus Torvalds,
	Hugh Dickins, Andrea Arcangeli, Linux Memory Management List,
	lcapitulino, Neil Brown

On Mon, 30 Jul 2007 18:39:12 -0400
"J. Bruce Fields" <bfields@fieldses.org> wrote:

> On Mon, Jul 30, 2007 at 11:57:51AM -0700, Andrew Morton wrote:
> > On Mon, 30 Jul 2007 10:52:27 -0300
> > "Luiz Fernando N. Capitulino" <lcapitulino@gmail.com> wrote:
> > 
> > > Hi Nick,
> > > 
> > > On 7/26/07, Nick Piggin <npiggin@suse.de> wrote:
> > > 
> > > > I'd like to see if we can get the ball rolling on this again, and try to
> > > > get it in 2.6.24 maybe. Any comments?
> > > 
> > >  I'm trying this patch and got this during the weekend (gmail will
> > > probably break lines automatically, grrr):
> > > 
> > > """
> > > [29711.081281] BUG: unable to handle kernel NULL pointer dereference
> > > at virtual address 00000004
> > > [29711.081300]  printing eip:
> > > [29711.081305] dcb5aa49
> > > [29711.081308] *pde = 00000000
> > > [29711.081315] Oops: 0000 [#1]
> > > [29711.081319] SMP
> > > [29711.081325] Modules linked in: nfsd exportfs auth_rpcgss nfs lockd
> > > nfs_acl sunrpc capability commoncap af_packet ipv6 ide_cd ide_core
> > > binfmt_misc loop dm_mod floppy pcspkr snd_pcm_oss snd_mixer_oss
> > > snd_via82xx gameport snd_ac97_codec i2c_viapro amd64_agp snd_pcm
> > > snd_timer ac97_bus snd_page_alloc snd_mpu401_uart snd_rawmidi
> > > snd_seq_device snd agpgart ehci_hcd uhci_hcd i2c_core via_rhine k8temp
> > > mii usbcore evdev tsdev soundcore sr_mod sg ext3 jbd sd_mod pata_via
> > > sata_via libata scsi_mod
> > > [29711.081445] CPU:    0
> > > [29711.081446] EIP:    0060:[<dcb5aa49>]    Not tainted VLI
> > > [29711.081447] EFLAGS: 00010246   (2.6.23-rc1-zpage #1)
> > > [29711.081511] EIP is at encode_fsid+0x89/0xb0 [nfsd]
> > > [29711.081529] eax: d99f4000   ebx: d80af064   ecx: 00000000   edx: 00000002
> > > [29711.081549] esi: d809204c   edi: d80af14c   ebp: d8788f04   esp: d8788efc
> > > [29711.081569] ds: 007b   es: 007b   fs: 00d8  gs: 0000  ss: 0068
> > > [29711.081589] Process nfsd (pid: 3474, ti=d8788000 task=dac07740
> > > task.ti=d8788000)
> > > [29711.081609] Stack: 00000000 d809203c d8788f28 dcb5ab25 d80af064
> > > c0945160 dcb59202 00000000
> > > [29711.081644]        d80b0000 dcb5bf90 dcb77404 d8788f38 dcb5bfb3
> > > d80af14c d80b0000 d8788f68
> > > [29711.081679]        dcb4d32c d87b4c44 d87b4a80 d8788f60 00000003
> > > d8092018 d8092000 0000001c
> > > [29711.081714] Call Trace:
> > > [29711.081738]  [<c01053fa>] show_trace_log_lvl+0x1a/0x30
> > > [29711.081760]  [<c01054bb>] show_stack_log_lvl+0xab/0xd0
> > > [29711.081779]  [<c01056b1>] show_registers+0x1d1/0x2d0
> > > [29711.081798]  [<c01058c6>] die+0x116/0x250
> > > [29711.081815]  [<c011bb2b>] do_page_fault+0x28b/0x690
> > > [29711.081836]  [<c02e95ba>] error_code+0x72/0x78
> > > [29711.081856]  [<dcb5ab25>] encode_fattr3+0xb5/0x140 [nfsd]
> > > [29711.081888]  [<dcb5bfb3>] nfs3svc_encode_attrstat+0x23/0x50 [nfsd]
> > > [29711.081921]  [<dcb4d32c>] nfsd_dispatch+0x18c/0x220 [nfsd]
> > > [29711.081950]  [<dcaf41fa>] svc_process+0x42a/0x7b0 [sunrpc]
> > > [29711.081985]  [<dcb4d909>] nfsd+0x169/0x290 [nfsd]
> > > [29711.082013]  [<c0104f9f>] kernel_thread_helper+0x7/0x18
> > > [29711.082032]  =======================
> > > [29711.082047] Code: 48 30 89 cb c1 fb 1f 89 d8 0f c8 89 06 89 c8 0f
> > > c8 89 46 04 8d 46 08 5b 5e 5d c3 8d b4 26 00 00 00 00 8b 83 88 00 00
> > > 00 8b 48 34 <8b> 51 04 33 51 0c 8b 01 33 41 08 89 d1 0f c8 0f c9 89 46
> > > 04 8d
> > > [29711.082150] EIP: [<dcb5aa49>] encode_fsid+0x89/0xb0 [nfsd] SS:ESP
> > > 0068:d8788efc
> > > """
> > > 
> > >  Now I'm not sure if this was caused by your patch, or is a bug
> > > somewhere else.
> > 
> > It's a little hard to see how Nick's patch could have caused that to
> > happen.
> > 
> > Neil, Bruce: does this look at all familiar?
> 
> Not to me.
> 
> > We don't appear to have changed anything in there for months...
> 
> Well, I've fooled around with the exports in 2.6.23-rc1, and Neil added
> the uuid stuff some time around 2.6.21.

uh.

> It looks to me like it's oopsing at the deference of
> fhp->fh_export->ex_uuid in encode_fsid(), which is exactly the case
> commit b41eeef14d claims to fix.  Looks like that's been in since
> v2.6.22-rc1; what kernel is this?

I assume it is 2.6.23-rc1 (or later) plus Nick's remove-ZERO_PAGE patch.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-30 23:09       ` Andrew Morton
@ 2007-07-31  0:03         ` Luiz Fernando N. Capitulino
  0 siblings, 0 replies; 22+ messages in thread
From: Luiz Fernando N. Capitulino @ 2007-07-31  0:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: J. Bruce Fields, Nick Piggin, Linus Torvalds, Hugh Dickins,
	Andrea Arcangeli, Linux Memory Management List, lcapitulino,
	Neil Brown

On 7/30/07, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Mon, 30 Jul 2007 18:39:12 -0400
> "J. Bruce Fields" <bfields@fieldses.org> wrote:
>
> > On Mon, Jul 30, 2007 at 11:57:51AM -0700, Andrew Morton wrote:
> > > On Mon, 30 Jul 2007 10:52:27 -0300
> > > "Luiz Fernando N. Capitulino" <lcapitulino@gmail.com> wrote:
> > >
> > > > Hi Nick,
> > > >
> > > > On 7/26/07, Nick Piggin <npiggin@suse.de> wrote:
> > > >
> > > > > I'd like to see if we can get the ball rolling on this again, and try to
> > > > > get it in 2.6.24 maybe. Any comments?
> > > >
> > > >  I'm trying this patch and got this during the weekend (gmail will
> > > > probably break lines automatically, grrr):
> > > >
> > > > """
> > > > [29711.081281] BUG: unable to handle kernel NULL pointer dereference
> > > > at virtual address 00000004
> > > > [29711.081300]  printing eip:
> > > > [29711.081305] dcb5aa49
> > > > [29711.081308] *pde = 00000000
> > > > [29711.081315] Oops: 0000 [#1]
> > > > [29711.081319] SMP
> > > > [29711.081325] Modules linked in: nfsd exportfs auth_rpcgss nfs lockd
> > > > nfs_acl sunrpc capability commoncap af_packet ipv6 ide_cd ide_core
> > > > binfmt_misc loop dm_mod floppy pcspkr snd_pcm_oss snd_mixer_oss
> > > > snd_via82xx gameport snd_ac97_codec i2c_viapro amd64_agp snd_pcm
> > > > snd_timer ac97_bus snd_page_alloc snd_mpu401_uart snd_rawmidi
> > > > snd_seq_device snd agpgart ehci_hcd uhci_hcd i2c_core via_rhine k8temp
> > > > mii usbcore evdev tsdev soundcore sr_mod sg ext3 jbd sd_mod pata_via
> > > > sata_via libata scsi_mod
> > > > [29711.081445] CPU:    0
> > > > [29711.081446] EIP:    0060:[<dcb5aa49>]    Not tainted VLI
> > > > [29711.081447] EFLAGS: 00010246   (2.6.23-rc1-zpage #1)
> > > > [29711.081511] EIP is at encode_fsid+0x89/0xb0 [nfsd]
> > > > [29711.081529] eax: d99f4000   ebx: d80af064   ecx: 00000000   edx: 00000002
> > > > [29711.081549] esi: d809204c   edi: d80af14c   ebp: d8788f04   esp: d8788efc
> > > > [29711.081569] ds: 007b   es: 007b   fs: 00d8  gs: 0000  ss: 0068
> > > > [29711.081589] Process nfsd (pid: 3474, ti=d8788000 task=dac07740
> > > > task.ti=d8788000)
> > > > [29711.081609] Stack: 00000000 d809203c d8788f28 dcb5ab25 d80af064
> > > > c0945160 dcb59202 00000000
> > > > [29711.081644]        d80b0000 dcb5bf90 dcb77404 d8788f38 dcb5bfb3
> > > > d80af14c d80b0000 d8788f68
> > > > [29711.081679]        dcb4d32c d87b4c44 d87b4a80 d8788f60 00000003
> > > > d8092018 d8092000 0000001c
> > > > [29711.081714] Call Trace:
> > > > [29711.081738]  [<c01053fa>] show_trace_log_lvl+0x1a/0x30
> > > > [29711.081760]  [<c01054bb>] show_stack_log_lvl+0xab/0xd0
> > > > [29711.081779]  [<c01056b1>] show_registers+0x1d1/0x2d0
> > > > [29711.081798]  [<c01058c6>] die+0x116/0x250
> > > > [29711.081815]  [<c011bb2b>] do_page_fault+0x28b/0x690
> > > > [29711.081836]  [<c02e95ba>] error_code+0x72/0x78
> > > > [29711.081856]  [<dcb5ab25>] encode_fattr3+0xb5/0x140 [nfsd]
> > > > [29711.081888]  [<dcb5bfb3>] nfs3svc_encode_attrstat+0x23/0x50 [nfsd]
> > > > [29711.081921]  [<dcb4d32c>] nfsd_dispatch+0x18c/0x220 [nfsd]
> > > > [29711.081950]  [<dcaf41fa>] svc_process+0x42a/0x7b0 [sunrpc]
> > > > [29711.081985]  [<dcb4d909>] nfsd+0x169/0x290 [nfsd]
> > > > [29711.082013]  [<c0104f9f>] kernel_thread_helper+0x7/0x18
> > > > [29711.082032]  =======================
> > > > [29711.082047] Code: 48 30 89 cb c1 fb 1f 89 d8 0f c8 89 06 89 c8 0f
> > > > c8 89 46 04 8d 46 08 5b 5e 5d c3 8d b4 26 00 00 00 00 8b 83 88 00 00
> > > > 00 8b 48 34 <8b> 51 04 33 51 0c 8b 01 33 41 08 89 d1 0f c8 0f c9 89 46
> > > > 04 8d
> > > > [29711.082150] EIP: [<dcb5aa49>] encode_fsid+0x89/0xb0 [nfsd] SS:ESP
> > > > 0068:d8788efc
> > > > """
> > > >
> > > >  Now I'm not sure if this was caused by your patch, or is a bug
> > > > somewhere else.
> > >
> > > It's a little hard to see how Nick's patch could have caused that to
> > > happen.
> > >
> > > Neil, Bruce: does this look at all familiar?
> >
> > Not to me.
> >
> > > We don't appear to have changed anything in there for months...
> >
> > Well, I've fooled around with the exports in 2.6.23-rc1, and Neil added
> > the uuid stuff some time around 2.6.21.
>
> uh.
>
> > It looks to me like it's oopsing at the deference of
> > fhp->fh_export->ex_uuid in encode_fsid(), which is exactly the case
> > commit b41eeef14d claims to fix.  Looks like that's been in since
> > v2.6.22-rc1; what kernel is this?
>
> I assume it is 2.6.23-rc1 (or later) plus Nick's remove-ZERO_PAGE patch.

 Yes, it´s 2.6.23-rc1 plus Nick´s patch.

-- 
Luiz Fernando N. Capitulino

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-07-30 22:39     ` J. Bruce Fields
  2007-07-30 23:09       ` Andrew Morton
@ 2007-08-01  1:47       ` Nick Piggin
  2007-08-01  1:53         ` J. Bruce Fields
  2007-08-01  2:17         ` Luiz Fernando N. Capitulino
  1 sibling, 2 replies; 22+ messages in thread
From: Nick Piggin @ 2007-08-01  1:47 UTC (permalink / raw)
  To: J. Bruce Fields
  Cc: Andrew Morton, Luiz Fernando N. Capitulino, Linus Torvalds,
	Hugh Dickins, Andrea Arcangeli, Linux Memory Management List,
	lcapitulino, Neil Brown

On Mon, Jul 30, 2007 at 06:39:12PM -0400, J. Bruce Fields wrote:
> On Mon, Jul 30, 2007 at 11:57:51AM -0700, Andrew Morton wrote:
> > On Mon, 30 Jul 2007 10:52:27 -0300
> > "Luiz Fernando N. Capitulino" <lcapitulino@gmail.com> wrote:
> > 
> > > Hi Nick,
> > > 
> > > On 7/26/07, Nick Piggin <npiggin@suse.de> wrote:
> > > 
> > > > I'd like to see if we can get the ball rolling on this again, and try to
> > > > get it in 2.6.24 maybe. Any comments?
> > > 
> > >  I'm trying this patch and got this during the weekend (gmail will
> > > probably break lines automatically, grrr):
> > > 
> > > """
> > > [29711.081281] BUG: unable to handle kernel NULL pointer dereference
> > > at virtual address 00000004
> > > [29711.081300]  printing eip:
> > > [29711.081305] dcb5aa49
> > > [29711.081308] *pde = 00000000
> > > [29711.081315] Oops: 0000 [#1]
> > > [29711.081319] SMP
> > > [29711.081325] Modules linked in: nfsd exportfs auth_rpcgss nfs lockd
> > > nfs_acl sunrpc capability commoncap af_packet ipv6 ide_cd ide_core
> > > binfmt_misc loop dm_mod floppy pcspkr snd_pcm_oss snd_mixer_oss
> > > snd_via82xx gameport snd_ac97_codec i2c_viapro amd64_agp snd_pcm
> > > snd_timer ac97_bus snd_page_alloc snd_mpu401_uart snd_rawmidi
> > > snd_seq_device snd agpgart ehci_hcd uhci_hcd i2c_core via_rhine k8temp
> > > mii usbcore evdev tsdev soundcore sr_mod sg ext3 jbd sd_mod pata_via
> > > sata_via libata scsi_mod
> > > [29711.081445] CPU:    0
> > > [29711.081446] EIP:    0060:[<dcb5aa49>]    Not tainted VLI
> > > [29711.081447] EFLAGS: 00010246   (2.6.23-rc1-zpage #1)
> > > [29711.081511] EIP is at encode_fsid+0x89/0xb0 [nfsd]
> > > [29711.081529] eax: d99f4000   ebx: d80af064   ecx: 00000000   edx: 00000002
> > > [29711.081549] esi: d809204c   edi: d80af14c   ebp: d8788f04   esp: d8788efc
> > > [29711.081569] ds: 007b   es: 007b   fs: 00d8  gs: 0000  ss: 0068
> > > [29711.081589] Process nfsd (pid: 3474, ti=d8788000 task=dac07740
> > > task.ti=d8788000)
> > > [29711.081609] Stack: 00000000 d809203c d8788f28 dcb5ab25 d80af064
> > > c0945160 dcb59202 00000000
> > > [29711.081644]        d80b0000 dcb5bf90 dcb77404 d8788f38 dcb5bfb3
> > > d80af14c d80b0000 d8788f68
> > > [29711.081679]        dcb4d32c d87b4c44 d87b4a80 d8788f60 00000003
> > > d8092018 d8092000 0000001c
> > > [29711.081714] Call Trace:
> > > [29711.081738]  [<c01053fa>] show_trace_log_lvl+0x1a/0x30
> > > [29711.081760]  [<c01054bb>] show_stack_log_lvl+0xab/0xd0
> > > [29711.081779]  [<c01056b1>] show_registers+0x1d1/0x2d0
> > > [29711.081798]  [<c01058c6>] die+0x116/0x250
> > > [29711.081815]  [<c011bb2b>] do_page_fault+0x28b/0x690
> > > [29711.081836]  [<c02e95ba>] error_code+0x72/0x78
> > > [29711.081856]  [<dcb5ab25>] encode_fattr3+0xb5/0x140 [nfsd]
> > > [29711.081888]  [<dcb5bfb3>] nfs3svc_encode_attrstat+0x23/0x50 [nfsd]
> > > [29711.081921]  [<dcb4d32c>] nfsd_dispatch+0x18c/0x220 [nfsd]
> > > [29711.081950]  [<dcaf41fa>] svc_process+0x42a/0x7b0 [sunrpc]
> > > [29711.081985]  [<dcb4d909>] nfsd+0x169/0x290 [nfsd]
> > > [29711.082013]  [<c0104f9f>] kernel_thread_helper+0x7/0x18
> > > [29711.082032]  =======================
> > > [29711.082047] Code: 48 30 89 cb c1 fb 1f 89 d8 0f c8 89 06 89 c8 0f
> > > c8 89 46 04 8d 46 08 5b 5e 5d c3 8d b4 26 00 00 00 00 8b 83 88 00 00
> > > 00 8b 48 34 <8b> 51 04 33 51 0c 8b 01 33 41 08 89 d1 0f c8 0f c9 89 46
> > > 04 8d
> > > [29711.082150] EIP: [<dcb5aa49>] encode_fsid+0x89/0xb0 [nfsd] SS:ESP
> > > 0068:d8788efc
> > > """
> > > 
> > >  Now I'm not sure if this was caused by your patch, or is a bug
> > > somewhere else.
> > 
> > It's a little hard to see how Nick's patch could have caused that to
> > happen.
> > 
> > Neil, Bruce: does this look at all familiar?
> 
> Not to me.
> 
> > We don't appear to have changed anything in there for months...
> 
> Well, I've fooled around with the exports in 2.6.23-rc1, and Neil added
> the uuid stuff some time around 2.6.21.
> 
> It looks to me like it's oopsing at the deference of
> fhp->fh_export->ex_uuid in encode_fsid(), which is exactly the case
> commit b41eeef14d claims to fix.  Looks like that's been in since
> v2.6.22-rc1; what kernel is this?

Any progress with this? I'm fairly sure ZERO_PAGE removal wouldn't
have triggered it. Luiz, is it reproducable? Without the zero page
removal patch?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-08-01  1:47       ` Nick Piggin
@ 2007-08-01  1:53         ` J. Bruce Fields
  2007-08-01  2:19           ` Luiz Fernando N. Capitulino
  2007-08-01  2:17         ` Luiz Fernando N. Capitulino
  1 sibling, 1 reply; 22+ messages in thread
From: J. Bruce Fields @ 2007-08-01  1:53 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Luiz Fernando N. Capitulino, Linus Torvalds,
	Hugh Dickins, Andrea Arcangeli, Linux Memory Management List,
	lcapitulino, Neil Brown

On Wed, Aug 01, 2007 at 03:47:39AM +0200, Nick Piggin wrote:
> On Mon, Jul 30, 2007 at 06:39:12PM -0400, J. Bruce Fields wrote:
> > It looks to me like it's oopsing at the deference of
> > fhp->fh_export->ex_uuid in encode_fsid(), which is exactly the case
> > commit b41eeef14d claims to fix.  Looks like that's been in since
> > v2.6.22-rc1; what kernel is this?
> 
> Any progress with this? I'm fairly sure ZERO_PAGE removal wouldn't
> have triggered it.

I agree that it's most likely an nfsd bug.  I'll take another look, but
it probably won't be till tommorow afternoon.

--b.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-08-01  1:47       ` Nick Piggin
  2007-08-01  1:53         ` J. Bruce Fields
@ 2007-08-01  2:17         ` Luiz Fernando N. Capitulino
  1 sibling, 0 replies; 22+ messages in thread
From: Luiz Fernando N. Capitulino @ 2007-08-01  2:17 UTC (permalink / raw)
  To: Nick Piggin
  Cc: J. Bruce Fields, Andrew Morton, Linus Torvalds, Hugh Dickins,
	Andrea Arcangeli, Linux Memory Management List, lcapitulino,
	Neil Brown

On 7/31/07, Nick Piggin <npiggin@suse.de> wrote:
> On Mon, Jul 30, 2007 at 06:39:12PM -0400, J. Bruce Fields wrote:
> > On Mon, Jul 30, 2007 at 11:57:51AM -0700, Andrew Morton wrote:
> > > On Mon, 30 Jul 2007 10:52:27 -0300
> > > "Luiz Fernando N. Capitulino" <lcapitulino@gmail.com> wrote:
> > >
> > > > Hi Nick,
> > > >
> > > > On 7/26/07, Nick Piggin <npiggin@suse.de> wrote:
> > > >
> > > > > I'd like to see if we can get the ball rolling on this again, and try to
> > > > > get it in 2.6.24 maybe. Any comments?
> > > >
> > > >  I'm trying this patch and got this during the weekend (gmail will
> > > > probably break lines automatically, grrr):
> > > >
> > > > """
> > > > [29711.081281] BUG: unable to handle kernel NULL pointer dereference
> > > > at virtual address 00000004
> > > > [29711.081300]  printing eip:
> > > > [29711.081305] dcb5aa49
> > > > [29711.081308] *pde = 00000000
> > > > [29711.081315] Oops: 0000 [#1]
> > > > [29711.081319] SMP
> > > > [29711.081325] Modules linked in: nfsd exportfs auth_rpcgss nfs lockd
> > > > nfs_acl sunrpc capability commoncap af_packet ipv6 ide_cd ide_core
> > > > binfmt_misc loop dm_mod floppy pcspkr snd_pcm_oss snd_mixer_oss
> > > > snd_via82xx gameport snd_ac97_codec i2c_viapro amd64_agp snd_pcm
> > > > snd_timer ac97_bus snd_page_alloc snd_mpu401_uart snd_rawmidi
> > > > snd_seq_device snd agpgart ehci_hcd uhci_hcd i2c_core via_rhine k8temp
> > > > mii usbcore evdev tsdev soundcore sr_mod sg ext3 jbd sd_mod pata_via
> > > > sata_via libata scsi_mod
> > > > [29711.081445] CPU:    0
> > > > [29711.081446] EIP:    0060:[<dcb5aa49>]    Not tainted VLI
> > > > [29711.081447] EFLAGS: 00010246   (2.6.23-rc1-zpage #1)
> > > > [29711.081511] EIP is at encode_fsid+0x89/0xb0 [nfsd]
> > > > [29711.081529] eax: d99f4000   ebx: d80af064   ecx: 00000000   edx: 00000002
> > > > [29711.081549] esi: d809204c   edi: d80af14c   ebp: d8788f04   esp: d8788efc
> > > > [29711.081569] ds: 007b   es: 007b   fs: 00d8  gs: 0000  ss: 0068
> > > > [29711.081589] Process nfsd (pid: 3474, ti=d8788000 task=dac07740
> > > > task.ti=d8788000)
> > > > [29711.081609] Stack: 00000000 d809203c d8788f28 dcb5ab25 d80af064
> > > > c0945160 dcb59202 00000000
> > > > [29711.081644]        d80b0000 dcb5bf90 dcb77404 d8788f38 dcb5bfb3
> > > > d80af14c d80b0000 d8788f68
> > > > [29711.081679]        dcb4d32c d87b4c44 d87b4a80 d8788f60 00000003
> > > > d8092018 d8092000 0000001c
> > > > [29711.081714] Call Trace:
> > > > [29711.081738]  [<c01053fa>] show_trace_log_lvl+0x1a/0x30
> > > > [29711.081760]  [<c01054bb>] show_stack_log_lvl+0xab/0xd0
> > > > [29711.081779]  [<c01056b1>] show_registers+0x1d1/0x2d0
> > > > [29711.081798]  [<c01058c6>] die+0x116/0x250
> > > > [29711.081815]  [<c011bb2b>] do_page_fault+0x28b/0x690
> > > > [29711.081836]  [<c02e95ba>] error_code+0x72/0x78
> > > > [29711.081856]  [<dcb5ab25>] encode_fattr3+0xb5/0x140 [nfsd]
> > > > [29711.081888]  [<dcb5bfb3>] nfs3svc_encode_attrstat+0x23/0x50 [nfsd]
> > > > [29711.081921]  [<dcb4d32c>] nfsd_dispatch+0x18c/0x220 [nfsd]
> > > > [29711.081950]  [<dcaf41fa>] svc_process+0x42a/0x7b0 [sunrpc]
> > > > [29711.081985]  [<dcb4d909>] nfsd+0x169/0x290 [nfsd]
> > > > [29711.082013]  [<c0104f9f>] kernel_thread_helper+0x7/0x18
> > > > [29711.082032]  =======================
> > > > [29711.082047] Code: 48 30 89 cb c1 fb 1f 89 d8 0f c8 89 06 89 c8 0f
> > > > c8 89 46 04 8d 46 08 5b 5e 5d c3 8d b4 26 00 00 00 00 8b 83 88 00 00
> > > > 00 8b 48 34 <8b> 51 04 33 51 0c 8b 01 33 41 08 89 d1 0f c8 0f c9 89 46
> > > > 04 8d
> > > > [29711.082150] EIP: [<dcb5aa49>] encode_fsid+0x89/0xb0 [nfsd] SS:ESP
> > > > 0068:d8788efc
> > > > """
> > > >
> > > >  Now I'm not sure if this was caused by your patch, or is a bug
> > > > somewhere else.
> > >
> > > It's a little hard to see how Nick's patch could have caused that to
> > > happen.
> > >
> > > Neil, Bruce: does this look at all familiar?
> >
> > Not to me.
> >
> > > We don't appear to have changed anything in there for months...
> >
> > Well, I've fooled around with the exports in 2.6.23-rc1, and Neil added
> > the uuid stuff some time around 2.6.21.
> >
> > It looks to me like it's oopsing at the deference of
> > fhp->fh_export->ex_uuid in encode_fsid(), which is exactly the case
> > commit b41eeef14d claims to fix.  Looks like that's been in since
> > v2.6.22-rc1; what kernel is this?
>
> Any progress with this? I'm fairly sure ZERO_PAGE removal wouldn't
> have triggered it. Luiz, is it reproducable? Without the zero page
> removal patch?

 I've tried to reproduce it a few times w/o success, the OOPS has happened
during the weekend at the office, so I can't tell when/how it happened.

 The only important info I have for NFS people is that my machine does
export some NFS volumes with three chroots and some kernel packages.

 I know that there're people at the office who mounts that volumes, but it's
probably impossible to say what was the usage pattern at the weekend.

 But it's there, running the kernel since yesterday w/o problems so far.

-- 
Luiz Fernando N. Capitulino

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-08-01  1:53         ` J. Bruce Fields
@ 2007-08-01  2:19           ` Luiz Fernando N. Capitulino
  2007-08-01  3:03             ` J. Bruce Fields
  2007-08-02  4:37             ` J. Bruce Fields
  0 siblings, 2 replies; 22+ messages in thread
From: Luiz Fernando N. Capitulino @ 2007-08-01  2:19 UTC (permalink / raw)
  To: J. Bruce Fields
  Cc: Nick Piggin, Andrew Morton, Linus Torvalds, Hugh Dickins,
	Andrea Arcangeli, Linux Memory Management List, lcapitulino,
	Neil Brown

On 7/31/07, J. Bruce Fields <bfields@fieldses.org> wrote:
> On Wed, Aug 01, 2007 at 03:47:39AM +0200, Nick Piggin wrote:
> > On Mon, Jul 30, 2007 at 06:39:12PM -0400, J. Bruce Fields wrote:
> > > It looks to me like it's oopsing at the deference of
> > > fhp->fh_export->ex_uuid in encode_fsid(), which is exactly the case
> > > commit b41eeef14d claims to fix.  Looks like that's been in since
> > > v2.6.22-rc1; what kernel is this?
> >
> > Any progress with this? I'm fairly sure ZERO_PAGE removal wouldn't
> > have triggered it.
>
> I agree that it's most likely an nfsd bug.  I'll take another look, but
> it probably won't be till tommorow afternoon.

 Bruce, is there a way to reproduce the bug b41eeef14d claims to fix?

-- 
Luiz Fernando N. Capitulino

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-08-01  2:19           ` Luiz Fernando N. Capitulino
@ 2007-08-01  3:03             ` J. Bruce Fields
  2007-08-02  4:37             ` J. Bruce Fields
  1 sibling, 0 replies; 22+ messages in thread
From: J. Bruce Fields @ 2007-08-01  3:03 UTC (permalink / raw)
  To: Luiz Fernando N. Capitulino
  Cc: Nick Piggin, Andrew Morton, Linus Torvalds, Hugh Dickins,
	Andrea Arcangeli, Linux Memory Management List, lcapitulino,
	Neil Brown

On Tue, Jul 31, 2007 at 11:19:00PM -0300, Luiz Fernando N. Capitulino wrote:
> On 7/31/07, J. Bruce Fields <bfields@fieldses.org> wrote:
> > On Wed, Aug 01, 2007 at 03:47:39AM +0200, Nick Piggin wrote:
> > > On Mon, Jul 30, 2007 at 06:39:12PM -0400, J. Bruce Fields wrote:
> > > > It looks to me like it's oopsing at the deference of
> > > > fhp->fh_export->ex_uuid in encode_fsid(), which is exactly the case
> > > > commit b41eeef14d claims to fix.  Looks like that's been in since
> > > > v2.6.22-rc1; what kernel is this?
> > >
> > > Any progress with this? I'm fairly sure ZERO_PAGE removal wouldn't
> > > have triggered it.
> >
> > I agree that it's most likely an nfsd bug.  I'll take another look, but
> > it probably won't be till tommorow afternoon.
> 
>  Bruce, is there a way to reproduce the bug b41eeef14d claims to fix?

Neil might have a test case.  Otherwise I'll try to cook one up for you
tommorow.

--b.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-08-01  2:19           ` Luiz Fernando N. Capitulino
  2007-08-01  3:03             ` J. Bruce Fields
@ 2007-08-02  4:37             ` J. Bruce Fields
  2007-08-03  1:40               ` Neil Brown
  1 sibling, 1 reply; 22+ messages in thread
From: J. Bruce Fields @ 2007-08-02  4:37 UTC (permalink / raw)
  To: Luiz Fernando N. Capitulino
  Cc: Nick Piggin, Andrew Morton, Linus Torvalds, Hugh Dickins,
	Andrea Arcangeli, Linux Memory Management List, lcapitulino,
	Neil Brown

On Tue, Jul 31, 2007 at 11:19:00PM -0300, Luiz Fernando N. Capitulino wrote:
> On 7/31/07, J. Bruce Fields <bfields@fieldses.org> wrote:
> > On Wed, Aug 01, 2007 at 03:47:39AM +0200, Nick Piggin wrote:
> > > On Mon, Jul 30, 2007 at 06:39:12PM -0400, J. Bruce Fields wrote:
> > > > It looks to me like it's oopsing at the deference of
> > > > fhp->fh_export->ex_uuid in encode_fsid(), which is exactly the case
> > > > commit b41eeef14d claims to fix.  Looks like that's been in since
> > > > v2.6.22-rc1; what kernel is this?
> > >
> > > Any progress with this? I'm fairly sure ZERO_PAGE removal wouldn't
> > > have triggered it.
> >
> > I agree that it's most likely an nfsd bug.  I'll take another look, but
> > it probably won't be till tommorow afternoon.
> 
>  Bruce, is there a way to reproduce the bug b41eeef14d claims to fix?

OK, sorry, it's taking me a little time to figure out what's going on.

But fh_verify() was responsible for checking and filling in the fhp that
is wrong here, and I don't see the safeguards in fh_verify() (or in the
export-lookup process) that would ensure that the export associated with
a filehandle has a non-NULL ex_uuid whenever the filehandle has a uuid
fsid type.  But I can't create a test case yet.

--b.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [patch][rfc] remove ZERO_PAGE?
  2007-08-02  4:37             ` J. Bruce Fields
@ 2007-08-03  1:40               ` Neil Brown
  0 siblings, 0 replies; 22+ messages in thread
From: Neil Brown @ 2007-08-03  1:40 UTC (permalink / raw)
  To: J. Bruce Fields
  Cc: Luiz Fernando N. Capitulino, Nick Piggin, Andrew Morton,
	Linus Torvalds, Hugh Dickins, Andrea Arcangeli,
	Linux Memory Management List, lcapitulino

On Thursday August 2, bfields@fieldses.org wrote:
> On Tue, Jul 31, 2007 at 11:19:00PM -0300, Luiz Fernando N. Capitulino wrote:
> > On 7/31/07, J. Bruce Fields <bfields@fieldses.org> wrote:
> > > On Wed, Aug 01, 2007 at 03:47:39AM +0200, Nick Piggin wrote:
> > > > On Mon, Jul 30, 2007 at 06:39:12PM -0400, J. Bruce Fields wrote:
> > > > > It looks to me like it's oopsing at the deference of
> > > > > fhp->fh_export->ex_uuid in encode_fsid(), which is exactly the case
> > > > > commit b41eeef14d claims to fix.  Looks like that's been in since
> > > > > v2.6.22-rc1; what kernel is this?
> > > >
> > > > Any progress with this? I'm fairly sure ZERO_PAGE removal wouldn't
> > > > have triggered it.
> > >
> > > I agree that it's most likely an nfsd bug.  I'll take another look, but
> > > it probably won't be till tommorow afternoon.
> > 
> >  Bruce, is there a way to reproduce the bug b41eeef14d claims to fix?
> 
> OK, sorry, it's taking me a little time to figure out what's going on.
> 
> But fh_verify() was responsible for checking and filling in the fhp that
> is wrong here, and I don't see the safeguards in fh_verify() (or in the
> export-lookup process) that would ensure that the export associated with
> a filehandle has a non-NULL ex_uuid whenever the filehandle has a uuid
> fsid type.  But I can't create a test case yet.

I have seen this bug already.  Here is the patch I made.

But I'm not really sure how it happens.  I think you would need a
buggy mounted, and I don't think one of those has escaped.

I'll ponder a some more see what I can find.

But I'm certain this has nothing to do with ZERO_PAGE, so maybe future
followups should trim the cc list a bit.

NeilBrown

-------------------------------------
Validate filehandle type in fsid_source

fsid_source decided where to get the 'fsid' number to
return for a GETATTR based on the type of filehandle.
It can be from the device, from the fsid, or from the
UUID.

It is possible for the filehandle to be inconsistent
with the export information, so make sure the export information
actually has the info implied by the value returned by
fsid_source.


Signed-off-by: Neil Brown <neilb@suse.de>

### Diffstat output
 ./fs/nfsd/nfsfh.c |   20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff .prev/fs/nfsd/nfsfh.c ./fs/nfsd/nfsfh.c
--- .prev/fs/nfsd/nfsfh.c	2007-07-31 11:19:20.000000000 +1000
+++ ./fs/nfsd/nfsfh.c	2007-08-03 11:31:40.000000000 +1000
@@ -566,13 +566,23 @@ enum fsid_source fsid_source(struct svc_
 	case FSID_DEV:
 	case FSID_ENCODE_DEV:
 	case FSID_MAJOR_MINOR:
-		return FSIDSOURCE_DEV;
+		if (fhp->fh_export->ex_dentry->d_inode->i_sb->s_type->fs_flags
+		    & FS_REQUIRES_DEV)
+			return FSIDSOURCE_DEV;
+		break;
 	case FSID_NUM:
-		return FSIDSOURCE_FSID;
-	default:
 		if (fhp->fh_export->ex_flags & NFSEXP_FSID)
 			return FSIDSOURCE_FSID;
-		else
-			return FSIDSOURCE_UUID;
+		break;
+	default:
+		break;
 	}
+	/* either a UUID type filehandle, or the filehandle doesn't
+	 * match the export.
+	 */
+	if (fhp->fh_export->ex_flags & NFSEXP_FSID)
+		return FSIDSOURCE_FSID;
+	if (fhp->fh_export->ex_uuid)
+		return FSIDSOURCE_UUID;
+	return FSIDSOURCE_DEV;
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2007-08-03  1:40 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-07-27  2:19 [patch][rfc] remove ZERO_PAGE? Nick Piggin
2007-07-27  5:29 ` Linus Torvalds
2007-07-27  5:54   ` Nick Piggin
2007-07-27 15:21     ` Linus Torvalds
2007-07-30  3:08       ` Nick Piggin
2007-07-30  3:45         ` Linus Torvalds
2007-07-30  3:56           ` Linus Torvalds
2007-07-30  4:35             ` Nick Piggin
2007-07-30  4:30           ` Nick Piggin
2007-07-27 15:59 ` Hugh Dickins
2007-07-30 13:52 ` Luiz Fernando N. Capitulino
2007-07-30 18:57   ` Andrew Morton
2007-07-30 22:39     ` J. Bruce Fields
2007-07-30 23:09       ` Andrew Morton
2007-07-31  0:03         ` Luiz Fernando N. Capitulino
2007-08-01  1:47       ` Nick Piggin
2007-08-01  1:53         ` J. Bruce Fields
2007-08-01  2:19           ` Luiz Fernando N. Capitulino
2007-08-01  3:03             ` J. Bruce Fields
2007-08-02  4:37             ` J. Bruce Fields
2007-08-03  1:40               ` Neil Brown
2007-08-01  2:17         ` Luiz Fernando N. Capitulino

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox