[rfc] optimise unlock

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [rfc] optimise unlock_page
       [not found] <20070508113709.GA19294@wotan.suse.de>
@ 2007-05-08 11:40 ` Nick Piggin
  2007-05-08 20:08   ` Hugh Dickins
  2007-05-08 21:30   ` Benjamin Herrenschmidt
  2007-05-08 12:13 ` David Howells
  1 sibling, 2 replies; 23+ messages in thread
From: Nick Piggin @ 2007-05-08 11:40 UTC (permalink / raw)
  To: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

This patch trades a page flag for a significant improvement in the unlock_page
fastpath. Various problems in the previous version were spotted by Hugh and
Ben (and fixed in this one).

Comments?

--

Speed up unlock_page by introducing a new page flag to signal that there are
page waitqueue waiters for PG_locked. This means a memory barrier and a random
waitqueue hash cacheline load can be avoided in the fastpath when there is no
contention.

Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h	2007-05-08 15:30:39.000000000 +1000
+++ linux-2.6/include/linux/page-flags.h	2007-05-08 15:31:00.000000000 +1000
@@ -91,6 +91,8 @@
 #define PG_nosave_free		18	/* Used for system suspend/resume */
 #define PG_buddy		19	/* Page is free, on buddy lists */
 
+#define PG_waiters		20	/* Page has PG_locked waiters */
+
 /* PG_owner_priv_1 users should have descriptive aliases */
 #define PG_checked		PG_owner_priv_1 /* Used by some filesystems */
 
@@ -123,6 +125,10 @@
 #define TestClearPageLocked(page)	\
 		test_and_clear_bit(PG_locked, &(page)->flags)
 
+#define PageWaiters(page)	test_bit(PG_waiters, &(page)->flags)
+#define SetPageWaiters(page)	set_bit(PG_waiters, &(page)->flags)
+#define ClearPageWaiters(page)	clear_bit(PG_waiters, &(page)->flags)
+
 #define PageError(page)		test_bit(PG_error, &(page)->flags)
 #define SetPageError(page)	set_bit(PG_error, &(page)->flags)
 #define ClearPageError(page)	clear_bit(PG_error, &(page)->flags)
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h	2007-05-08 15:30:39.000000000 +1000
+++ linux-2.6/include/linux/pagemap.h	2007-05-08 16:55:07.000000000 +1000
@@ -133,7 +133,8 @@
 
 extern void FASTCALL(__lock_page(struct page *page));
 extern void FASTCALL(__lock_page_nosync(struct page *page));
-extern void FASTCALL(unlock_page(struct page *page));
+extern void FASTCALL(__wait_on_page_locked(struct page *page));
+extern void FASTCALL(__unlock_page(struct page *page));
 
 static inline int trylock_page(struct page *page)
 {
@@ -160,7 +161,15 @@
 	if (!trylock_page(page))
 		__lock_page_nosync(page);
 }
-	
+
+static inline void unlock_page(struct page *page)
+{
+	VM_BUG_ON(!PageLocked(page));
+	ClearPageLocked_Unlock(page);
+	if (unlikely(PageWaiters(page)))
+		__unlock_page(page);
+}
+
 /*
  * This is exported only for wait_on_page_locked/wait_on_page_writeback.
  * Never use this directly!
@@ -176,8 +185,9 @@
  */
 static inline void wait_on_page_locked(struct page *page)
 {
+	might_sleep();
 	if (PageLocked(page))
-		wait_on_page_bit(page, PG_locked);
+		__wait_on_page_locked(page);
 }
 
 /* 
@@ -185,6 +195,7 @@
  */
 static inline void wait_on_page_writeback(struct page *page)
 {
+	might_sleep();
 	if (PageWriteback(page))
 		wait_on_page_bit(page, PG_writeback);
 }
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c	2007-05-08 15:30:55.000000000 +1000
+++ linux-2.6/mm/filemap.c	2007-05-08 18:04:20.000000000 +1000
@@ -169,6 +169,7 @@
 	return 0;
 }
 
+
 /**
  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
  * @mapping:	address space structure to write
@@ -478,12 +479,6 @@
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif
 
-static int __sleep_on_page_lock(void *word)
-{
-	io_schedule();
-	return 0;
-}
-
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -516,26 +511,22 @@
 }
 EXPORT_SYMBOL(wait_on_page_bit);
 
-/**
- * unlock_page - unlock a locked page
- * @page: the page
- *
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechananism between PageLocked pages and PageWriteback pages is shared.
- * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
- *
- * The mb is necessary to enforce ordering between the clear_bit and the read
- * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
+/*
+ * If PageWaiters was found to be set at unlock-time, __unlock_page should
+ * be called to actually perform the wakeups of waiters.
  */
-void fastcall unlock_page(struct page *page)
+void fastcall __unlock_page(struct page *page)
 {
-	VM_BUG_ON(!PageLocked(page));
-	ClearPageLocked_Unlock(page);
+	ClearPageWaiters(page);
+ 	/*
+	 * The mb is necessary to enforce ordering between the clear_bit and
+	 * the read of the waitqueue (to avoid SMP races with a parallel
+	 * wait_on_page_locked()
+	 */
 	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_locked);
 }
-EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(__unlock_page);
 
 /**
  * end_page_writeback - end writeback against a page
@@ -563,10 +554,16 @@
  */
 void fastcall __lock_page(struct page *page)
 {
+	wait_queue_head_t *wq = page_waitqueue(page);
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
-							TASK_UNINTERRUPTIBLE);
+	do {
+		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		SetPageWaiters(page);
+		if (likely(PageLocked(page)))
+			sync_page(page);
+	} while (!trylock_page(page));
+	finish_wait(wq, &wait.wait);
 }
 EXPORT_SYMBOL(__lock_page);
 
@@ -576,10 +573,39 @@
  */
 void fastcall __lock_page_nosync(struct page *page)
 {
+	wait_queue_head_t *wq = page_waitqueue(page);
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
-							TASK_UNINTERRUPTIBLE);
+
+	do {
+		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		SetPageWaiters(page);
+		if (likely(PageLocked(page)))
+			io_schedule();
+	} while (!trylock_page(page));
+	finish_wait(wq, &wait.wait);
+}
+
+void fastcall __wait_on_page_locked(struct page *page)
+{
+	wait_queue_head_t *wq = page_waitqueue(page);
+	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+
+	do {
+		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		SetPageWaiters(page);
+		if (likely(PageLocked(page)))
+			sync_page(page);
+	} while (PageLocked(page));
+	finish_wait(wq, &wait.wait);
+
+	/*
+	 * Could skip this, but that would leave PG_waiters dangling
+	 * for random pages. This keeps it tidy.
+	 */
+	if (unlikely(PageWaiters(page)))
+		__unlock_page(page);
 }
+EXPORT_SYMBOL(__wait_on_page_locked);
 
 /**
  * find_get_page - find and get a page reference
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2007-05-08 15:30:39.000000000 +1000
+++ linux-2.6/mm/page_alloc.c	2007-05-08 15:31:00.000000000 +1000
@@ -203,7 +203,8 @@
 			1 << PG_slab    |
 			1 << PG_swapcache |
 			1 << PG_writeback |
-			1 << PG_buddy );
+			1 << PG_buddy |
+			1 << PG_waiters );
 	set_page_count(page, 0);
 	reset_page_mapcount(page);
 	page->mapping = NULL;
@@ -438,7 +439,8 @@
 			1 << PG_swapcache |
 			1 << PG_writeback |
 			1 << PG_reserved |
-			1 << PG_buddy ))))
+			1 << PG_buddy |
+			1 << PG_waiters ))))
 		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
@@ -588,7 +590,8 @@
 			1 << PG_swapcache |
 			1 << PG_writeback |
 			1 << PG_reserved |
-			1 << PG_buddy ))))
+			1 << PG_buddy |
+			1 << PG_waiters ))))
 		bad_page(page);
 
 	/*

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
       [not found] <20070508113709.GA19294@wotan.suse.de>
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
@ 2007-05-08 12:13 ` David Howells
  2007-05-08 22:35   ` Nick Piggin
  1 sibling, 1 reply; 23+ messages in thread
From: David Howells @ 2007-05-08 12:13 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

Nick Piggin <npiggin@suse.de> wrote:

> This patch trades a page flag for a significant improvement in the unlock_page
> fastpath. Various problems in the previous version were spotted by Hugh and
> Ben (and fixed in this one).

It looks reasonable at first glance, though it does consume yet another page
flag:-/  However, I think that's probably a worthy trade.

>  }
> -	
> +
> +static inline void unlock_page(struct page *page)
> +{
> +	VM_BUG_ON(!PageLocked(page));
> +	ClearPageLocked_Unlock(page);
> +	if (unlikely(PageWaiters(page)))
> +		__unlock_page(page);
> +}
> +

Please don't simply discard the documentation, we have little enough as it is:

> -/**
> - * unlock_page - unlock a locked page
> - * @page: the page
> - *
> - * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
> - * Also wakes sleepers in wait_on_page_writeback() because the wakeup
> - * mechananism between PageLocked pages and PageWriteback pages is shared.
> - * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
> - *
> - * The mb is necessary to enforce ordering between the clear_bit and the read
> - * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).

David

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
@ 2007-05-08 20:08   ` Hugh Dickins
  2007-05-08 21:30   ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 23+ messages in thread
From: Hugh Dickins @ 2007-05-08 20:08 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Tue, 8 May 2007, Nick Piggin wrote:

> This patch trades a page flag for a significant improvement in the unlock_page
> fastpath. Various problems in the previous version were spotted by Hugh and
> Ben (and fixed in this one).
> 
> Comments?

Seems there's still a bug there.  I get hangs on the page lock, on
i386 and on x86_64 and on powerpc: sometimes they unhang themselves
after a while (presume other activity does the wakeup).  Obvious even
while booting (Starting udevd).  Sorry, not had time to investigate.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
  2007-05-08 20:08   ` Hugh Dickins
@ 2007-05-08 21:30   ` Benjamin Herrenschmidt
  2007-05-08 22:41     ` Nick Piggin
  1 sibling, 1 reply; 23+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-08 21:30 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-arch, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Tue, 2007-05-08 at 13:40 +0200, Nick Piggin wrote:
> This patch trades a page flag for a significant improvement in the unlock_page
> fastpath. Various problems in the previous version were spotted by Hugh and
> Ben (and fixed in this one).
> 
> Comments?
> 
> --
> 
> Speed up unlock_page by introducing a new page flag to signal that there are
> page waitqueue waiters for PG_locked. This means a memory barrier and a random
> waitqueue hash cacheline load can be avoided in the fastpath when there is no
> contention.

I'm not 100% familiar with the exclusive vs. non exclusive wait thingy
but wake_up_page() does __wake_up_bit() which calls __wake_up() with
nr_exclusive set to 1. Doesn't that mean that only one waiter will be
woken up ?

If that's the case, then we lose because we'll have clear PG_waiters but
only wake up one of them.

Waking them all would fix it but at the risk of causing other
problems... Maybe PG_waiters need to actually be a counter but if that
is the case, then it complicates things even more.

Any smart idea ?

Ben.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-08 12:13 ` David Howells
@ 2007-05-08 22:35   ` Nick Piggin
  0 siblings, 0 replies; 23+ messages in thread
From: Nick Piggin @ 2007-05-08 22:35 UTC (permalink / raw)
  To: David Howells
  Cc: linux-arch, Benjamin Herrenschmidt, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Tue, May 08, 2007 at 01:13:35PM +0100, David Howells wrote:
> 
> Nick Piggin <npiggin@suse.de> wrote:
> 
> > This patch trades a page flag for a significant improvement in the unlock_page
> > fastpath. Various problems in the previous version were spotted by Hugh and
> > Ben (and fixed in this one).
> 
> It looks reasonable at first glance, though it does consume yet another page
> flag:-/  However, I think that's probably a worthy trade.

Well, that's the big question :)


> >  }
> > -	
> > +
> > +static inline void unlock_page(struct page *page)
> > +{
> > +	VM_BUG_ON(!PageLocked(page));
> > +	ClearPageLocked_Unlock(page);
> > +	if (unlikely(PageWaiters(page)))
> > +		__unlock_page(page);
> > +}
> > +
> 
> Please don't simply discard the documentation, we have little enough as it is:

Oops, right.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-08 21:30   ` Benjamin Herrenschmidt
@ 2007-05-08 22:41     ` Nick Piggin
  2007-05-08 22:50       ` Nick Piggin
  0 siblings, 1 reply; 23+ messages in thread
From: Nick Piggin @ 2007-05-08 22:41 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linux-arch, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Wed, May 09, 2007 at 07:30:27AM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2007-05-08 at 13:40 +0200, Nick Piggin wrote:
> > This patch trades a page flag for a significant improvement in the unlock_page
> > fastpath. Various problems in the previous version were spotted by Hugh and
> > Ben (and fixed in this one).
> > 
> > Comments?
> > 
> > --
> > 
> > Speed up unlock_page by introducing a new page flag to signal that there are
> > page waitqueue waiters for PG_locked. This means a memory barrier and a random
> > waitqueue hash cacheline load can be avoided in the fastpath when there is no
> > contention.
> 
> I'm not 100% familiar with the exclusive vs. non exclusive wait thingy
> but wake_up_page() does __wake_up_bit() which calls __wake_up() with
> nr_exclusive set to 1. Doesn't that mean that only one waiter will be
> woken up ?
> 
> If that's the case, then we lose because we'll have clear PG_waiters but
> only wake up one of them.
> 
> Waking them all would fix it but at the risk of causing other
> problems... Maybe PG_waiters need to actually be a counter but if that
> is the case, then it complicates things even more.
> 
> Any smart idea ?

It will wake up 1 exclusive waiter, but no limit on non exclusive waiters.
Hmm, but it won't wake up waiters behind the exclusive guy... maybe the
wake up code can check whether the waitqueue is still active after the
wakeup, and set PG_waiters again in that case?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-08 22:41     ` Nick Piggin
@ 2007-05-08 22:50       ` Nick Piggin
  2007-05-09 19:33         ` Hugh Dickins
  0 siblings, 1 reply; 23+ messages in thread
From: Nick Piggin @ 2007-05-08 22:50 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linux-arch, Andrew Morton, Linux Kernel Mailing List,
	Linux Memory Management List

On Wed, May 09, 2007 at 12:41:24AM +0200, Nick Piggin wrote:
> On Wed, May 09, 2007 at 07:30:27AM +1000, Benjamin Herrenschmidt wrote:
> > On Tue, 2007-05-08 at 13:40 +0200, Nick Piggin wrote:
> > > This patch trades a page flag for a significant improvement in the unlock_page
> > > fastpath. Various problems in the previous version were spotted by Hugh and
> > > Ben (and fixed in this one).
> > > 
> > > Comments?
> > > 
> > > --
> > > 
> > > Speed up unlock_page by introducing a new page flag to signal that there are
> > > page waitqueue waiters for PG_locked. This means a memory barrier and a random
> > > waitqueue hash cacheline load can be avoided in the fastpath when there is no
> > > contention.
> > 
> > I'm not 100% familiar with the exclusive vs. non exclusive wait thingy
> > but wake_up_page() does __wake_up_bit() which calls __wake_up() with
> > nr_exclusive set to 1. Doesn't that mean that only one waiter will be
> > woken up ?
> > 
> > If that's the case, then we lose because we'll have clear PG_waiters but
> > only wake up one of them.
> > 
> > Waking them all would fix it but at the risk of causing other
> > problems... Maybe PG_waiters need to actually be a counter but if that
> > is the case, then it complicates things even more.
> > 
> > Any smart idea ?
> 
> It will wake up 1 exclusive waiter, but no limit on non exclusive waiters.
> Hmm, but it won't wake up waiters behind the exclusive guy... maybe the
> wake up code can check whether the waitqueue is still active after the
> wakeup, and set PG_waiters again in that case?

Hm, I don't know if we can do that without a race either...

OTOH, waking all non exclusive waiters may not be a really bad idea.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-08 22:50       ` Nick Piggin
@ 2007-05-09 19:33         ` Hugh Dickins
  2007-05-09 21:21           ` Benjamin Herrenschmidt
  2007-05-10  3:37           ` Nick Piggin
  0 siblings, 2 replies; 23+ messages in thread
From: Hugh Dickins @ 2007-05-09 19:33 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Wed, 9 May 2007, Nick Piggin wrote:
> On Wed, May 09, 2007 at 12:41:24AM +0200, Nick Piggin wrote:
> > On Wed, May 09, 2007 at 07:30:27AM +1000, Benjamin Herrenschmidt wrote:
> > > 
> > > Waking them all would fix it but at the risk of causing other
> > > problems... Maybe PG_waiters need to actually be a counter but if that
> > > is the case, then it complicates things even more.
> > 
> > It will wake up 1 exclusive waiter, but no limit on non exclusive waiters.
> > Hmm, but it won't wake up waiters behind the exclusive guy... maybe the
> > wake up code can check whether the waitqueue is still active after the
> > wakeup, and set PG_waiters again in that case?
> 
> Hm, I don't know if we can do that without a race either...
> 
> OTOH, waking all non exclusive waiters may not be a really bad idea.

Not good enough, I'm afraid.  It looks like Ben's right and you need
a count - and counts in the page struct are a lot harder to add than
page flags.

I've now played around with the hangs on my three 4CPU machines
(all of them in io_schedule below __lock_page, waiting on pages
which were neither PG_locked nor PG_waiters when I looked).

Seeing Ben's mail, I thought the answer would be just to remove
the "_exclusive" from your three prepare_to_wait_exclusive()s.
That helped, but it didn't eliminate the hangs.

After fiddling around with different ideas for some while, I came
to realize that the ClearPageWaiters (in very misleadingly named
__unlock_page) is hopeless.  It's just so easy for it to clear the
PG_waiters that a third task relies upon for wakeup (and which
cannot loop around to set it again, because it simply won't be
woken by unlock_page/__unlock_page without it already being set).

Below is the patch I've applied to see some tests actually running
with your patches, but it's just a joke: absurdly racy and
presumptuous in itself (the "3" stands for us and the cache and one
waiter; I deleted the neighbouring mb and comment, not because I
disagree, but because it's ridiculous to pay so much attention to
such unlikely races when there's much worse nearby).  Though I've
not checked: if I've got the counting wrong, then maybe all my
pages are left marked PG_waiters by now.

(I did imagine we could go back to prepare_to_wait_exclusive
once I'd put in the page_count test before ClearPageWaiters;
but apparently not, that still hung.)

My intention had been to apply the patches to what I tested before
with lmbench, to get comparative numbers; but I don't think this
is worth the time, it's too far from being a real solution.

I was puzzled as to how you came up with any performance numbers
yourself, when I could hardly boot.  I see you mentioned 2CPU G5,
I guess you need a CPU or two more; or maybe it's that you didn't
watch what happened as it booted, often those hangs recover later.

Hugh

--- a/mm/filemap.c	2007-05-08 20:17:31.000000000 +0100
+++ b/mm/filemap.c	2007-05-09 19:14:03.000000000 +0100
@@ -517,13 +517,8 @@ EXPORT_SYMBOL(wait_on_page_bit);
  */
 void fastcall __unlock_page(struct page *page)
 {
-	ClearPageWaiters(page);
- 	/*
-	 * The mb is necessary to enforce ordering between the clear_bit and
-	 * the read of the waitqueue (to avoid SMP races with a parallel
-	 * wait_on_page_locked()
-	 */
-	smp_mb__after_clear_bit();
+	if (page_count(page) <= 3 + page_has_buffers(page)+page_mapcount(page))
+		ClearPageWaiters(page);
 	wake_up_page(page, PG_locked);
 }
 EXPORT_SYMBOL(__unlock_page);
@@ -558,7 +553,7 @@ void fastcall __lock_page(struct page *p
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

 	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 		SetPageWaiters(page);
 		if (likely(PageLocked(page)))
 			sync_page(page);
@@ -577,7 +572,7 @@ void fastcall __lock_page_nosync(struct 
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

 	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 		SetPageWaiters(page);
 		if (likely(PageLocked(page)))
 			io_schedule();
@@ -591,7 +586,7 @@ void fastcall __wait_on_page_locked(stru
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

 	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 		SetPageWaiters(page);
 		if (likely(PageLocked(page)))
 			sync_page(page);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-09 19:33         ` Hugh Dickins
@ 2007-05-09 21:21           ` Benjamin Herrenschmidt
  2007-05-10  3:37           ` Nick Piggin
  1 sibling, 0 replies; 23+ messages in thread
From: Benjamin Herrenschmidt @ 2007-05-09 21:21 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Nick Piggin, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

> Not good enough, I'm afraid.  It looks like Ben's right and you need
> a count - and counts in the page struct are a lot harder to add than
> page flags.
> 
> I've now played around with the hangs on my three 4CPU machines
> (all of them in io_schedule below __lock_page, waiting on pages
> which were neither PG_locked nor PG_waiters when I looked).
> 
> Seeing Ben's mail, I thought the answer would be just to remove
> the "_exclusive" from your three prepare_to_wait_exclusive()s.
> That helped, but it didn't eliminate the hangs.

There might be a way ... by having the flags manipulation always
atomically deal with PG_locked and PG_waiters together. This is possible
but we would need even more weirdo bitops abstractions from the arch I'm
afraid... unless we start using atomic_* rather that bitops in order to
manipulate multiple bits at a time.

Ben.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-09 19:33         ` Hugh Dickins
  2007-05-09 21:21           ` Benjamin Herrenschmidt
@ 2007-05-10  3:37           ` Nick Piggin
  2007-05-10 19:14             ` Hugh Dickins
  1 sibling, 1 reply; 23+ messages in thread
From: Nick Piggin @ 2007-05-10  3:37 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Wed, May 09, 2007 at 08:33:15PM +0100, Hugh Dickins wrote:
> 
> Not good enough, I'm afraid.  It looks like Ben's right and you need
> a count - and counts in the page struct are a lot harder to add than
> page flags.
> 
> I've now played around with the hangs on my three 4CPU machines
> (all of them in io_schedule below __lock_page, waiting on pages
> which were neither PG_locked nor PG_waiters when I looked).
> 
> Seeing Ben's mail, I thought the answer would be just to remove
> the "_exclusive" from your three prepare_to_wait_exclusive()s.
> That helped, but it didn't eliminate the hangs.
> 
> After fiddling around with different ideas for some while, I came
> to realize that the ClearPageWaiters (in very misleadingly named
> __unlock_page) is hopeless.  It's just so easy for it to clear the
> PG_waiters that a third task relies upon for wakeup (and which
> cannot loop around to set it again, because it simply won't be
> woken by unlock_page/__unlock_page without it already being set).

OK, I found a simple bug after pulling out my hair for a while :)
With this, a 4-way system survives a couple of concurrent make -j250s
quite nicely (wheras they eventually locked up before).

The problem is that the bit wakeup function did not go through with
the wakeup if it found the bit (ie. PG_locked) set. This meant that
waiters would not get a chance to reset PG_waiters.

However you probably weren't referring to that particular problem
when you imagined the need for a full count, or the slippery 3rd
task... I wasn't able to derive any such problems with the basic
logic, so if there was a bug there, it would still be unfixed in this
patch.
---

Speed up unlock_page by introducing a new page flag to signal that there are
page waitqueue waiters for PG_locked. This means a memory barrier and a random
waitqueue hash cacheline load can be avoided in the fastpath when there is no
contention.

Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h	2007-05-10 10:22:05.000000000 +1000
+++ linux-2.6/include/linux/page-flags.h	2007-05-10 10:22:06.000000000 +1000
@@ -91,6 +91,8 @@
 #define PG_nosave_free		18	/* Used for system suspend/resume */
 #define PG_buddy		19	/* Page is free, on buddy lists */
 
+#define PG_waiters		20	/* Page has PG_locked waiters */
+
 /* PG_owner_priv_1 users should have descriptive aliases */
 #define PG_checked		PG_owner_priv_1 /* Used by some filesystems */
 
@@ -113,6 +115,10 @@
 #define SetPageLocked(page)		\
 		set_bit(PG_locked, &(page)->flags)
 
+#define PageWaiters(page)	test_bit(PG_waiters, &(page)->flags)
+#define SetPageWaiters(page)	set_bit(PG_waiters, &(page)->flags)
+#define ClearPageWaiters(page)	clear_bit(PG_waiters, &(page)->flags)
+
 #define PageError(page)		test_bit(PG_error, &(page)->flags)
 #define SetPageError(page)	set_bit(PG_error, &(page)->flags)
 #define ClearPageError(page)	clear_bit(PG_error, &(page)->flags)
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h	2007-05-10 10:22:05.000000000 +1000
+++ linux-2.6/include/linux/pagemap.h	2007-05-10 10:22:06.000000000 +1000
@@ -133,7 +133,8 @@
 
 extern void FASTCALL(__lock_page(struct page *page));
 extern void FASTCALL(__lock_page_nosync(struct page *page));
-extern void FASTCALL(unlock_page(struct page *page));
+extern void FASTCALL(__wait_on_page_locked(struct page *page));
+extern void FASTCALL(__unlock_page(struct page *page));
 
 static inline int trylock_page(struct page *page)
 {
@@ -160,7 +161,15 @@
 	if (!trylock_page(page))
 		__lock_page_nosync(page);
 }
-	
+
+static inline void unlock_page(struct page *page)
+{
+	VM_BUG_ON(!PageLocked(page));
+	clear_bit_unlock(PG_locked, &page->flags);
+	if (unlikely(PageWaiters(page)))
+		__unlock_page(page);
+}
+
 /*
  * This is exported only for wait_on_page_locked/wait_on_page_writeback.
  * Never use this directly!
@@ -176,8 +185,9 @@
  */
 static inline void wait_on_page_locked(struct page *page)
 {
+	might_sleep();
 	if (PageLocked(page))
-		wait_on_page_bit(page, PG_locked);
+		__wait_on_page_locked(page);
 }
 
 /* 
@@ -185,6 +195,7 @@
  */
 static inline void wait_on_page_writeback(struct page *page)
 {
+	might_sleep();
 	if (PageWriteback(page))
 		wait_on_page_bit(page, PG_writeback);
 }
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c	2007-05-10 10:22:05.000000000 +1000
+++ linux-2.6/mm/filemap.c	2007-05-10 11:03:10.000000000 +1000
@@ -165,10 +165,12 @@
 	mapping = page_mapping(page);
 	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 		mapping->a_ops->sync_page(page);
-	io_schedule();
+	if (io_schedule_timeout(20*HZ) == 0)
+		printk("page->flags = %lx\n", page->flags);
 	return 0;
 }
 
+
 /**
  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
  * @mapping:	address space structure to write
@@ -478,12 +480,6 @@
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif
 
-static int __sleep_on_page_lock(void *word)
-{
-	io_schedule();
-	return 0;
-}
-
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -516,26 +512,23 @@
 }
 EXPORT_SYMBOL(wait_on_page_bit);
 
-/**
- * unlock_page - unlock a locked page
- * @page: the page
- *
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechananism between PageLocked pages and PageWriteback pages is shared.
- * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
- *
- * The mb is necessary to enforce ordering between the clear_bit and the read
- * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
+/*
+ * If PageWaiters was found to be set at unlock-time, __unlock_page should
+ * be called to actually perform the wakeups of waiters.
  */
-void fastcall unlock_page(struct page *page)
+void fastcall __unlock_page(struct page *page)
 {
-	VM_BUG_ON(!PageLocked(page));
-	clear_bit_unlock(PG_locked, &page->flags);
+	ClearPageWaiters(page);
+ 	/*
+	 * The mb is necessary to enforce ordering between the clear_bit and
+	 * the read of the waitqueue (to avoid SMP races with a parallel
+	 * wait_on_page_locked()
+	 */
 	smp_mb__after_clear_bit();
+
 	wake_up_page(page, PG_locked);
 }
-EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(__unlock_page);
 
 /**
  * end_page_writeback - end writeback against a page
@@ -563,10 +556,16 @@
  */
 void fastcall __lock_page(struct page *page)
 {
+	wait_queue_head_t *wq = page_waitqueue(page);
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
-							TASK_UNINTERRUPTIBLE);
+	do {
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		SetPageWaiters(page);
+		if (likely(PageLocked(page)))
+			sync_page(page);
+	} while (!trylock_page(page));
+	finish_wait(wq, &wait.wait);
 }
 EXPORT_SYMBOL(__lock_page);
 
@@ -576,10 +575,41 @@
  */
 void fastcall __lock_page_nosync(struct page *page)
 {
+	wait_queue_head_t *wq = page_waitqueue(page);
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
-							TASK_UNINTERRUPTIBLE);
+
+	do {
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		SetPageWaiters(page);
+		if (likely(PageLocked(page))) {
+			if (io_schedule_timeout(20*HZ) == 0)
+				printk("page->flags = %lx\n", page->flags);
+		}
+	} while (!trylock_page(page));
+	finish_wait(wq, &wait.wait);
+}
+
+void fastcall __wait_on_page_locked(struct page *page)
+{
+	wait_queue_head_t *wq = page_waitqueue(page);
+	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+
+	do {
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		SetPageWaiters(page);
+		if (likely(PageLocked(page)))
+			sync_page(page);
+	} while (PageLocked(page));
+	finish_wait(wq, &wait.wait);
+
+	/*
+	 * Could skip this, but that would leave PG_waiters dangling
+	 * for random pages. This keeps it tidy.
+	 */
+	if (unlikely(PageWaiters(page)))
+		__unlock_page(page);
 }
+EXPORT_SYMBOL(__wait_on_page_locked);
 
 /**
  * find_get_page - find and get a page reference
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2007-05-10 10:21:32.000000000 +1000
+++ linux-2.6/mm/page_alloc.c	2007-05-10 10:22:06.000000000 +1000
@@ -203,7 +203,8 @@
 			1 << PG_slab    |
 			1 << PG_swapcache |
 			1 << PG_writeback |
-			1 << PG_buddy );
+			1 << PG_buddy |
+			1 << PG_waiters );
 	set_page_count(page, 0);
 	reset_page_mapcount(page);
 	page->mapping = NULL;
@@ -438,7 +439,8 @@
 			1 << PG_swapcache |
 			1 << PG_writeback |
 			1 << PG_reserved |
-			1 << PG_buddy ))))
+			1 << PG_buddy |
+			1 << PG_waiters ))))
 		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
@@ -588,7 +590,8 @@
 			1 << PG_swapcache |
 			1 << PG_writeback |
 			1 << PG_reserved |
-			1 << PG_buddy ))))
+			1 << PG_buddy |
+			1 << PG_waiters ))))
 		bad_page(page);
 
 	/*
Index: linux-2.6/kernel/wait.c
===================================================================
--- linux-2.6.orig/kernel/wait.c	2007-05-10 11:06:43.000000000 +1000
+++ linux-2.6/kernel/wait.c	2007-05-10 11:17:25.000000000 +1000
@@ -144,8 +144,7 @@
 		= container_of(wait, struct wait_bit_queue, wait);
 
 	if (wait_bit->key.flags != key->flags ||
-			wait_bit->key.bit_nr != key->bit_nr ||
-			test_bit(key->bit_nr, key->flags))
+			wait_bit->key.bit_nr != key->bit_nr)
 		return 0;
 	else
 		return autoremove_wake_function(wait, mode, sync, key);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-10  3:37           ` Nick Piggin
@ 2007-05-10 19:14             ` Hugh Dickins
  2007-05-11  8:54               ` Nick Piggin
  0 siblings, 1 reply; 23+ messages in thread
From: Hugh Dickins @ 2007-05-10 19:14 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Thu, 10 May 2007, Nick Piggin wrote:
> 
> OK, I found a simple bug after pulling out my hair for a while :)
> With this, a 4-way system survives a couple of concurrent make -j250s
> quite nicely (wheras they eventually locked up before).
> 
> The problem is that the bit wakeup function did not go through with
> the wakeup if it found the bit (ie. PG_locked) set. This meant that
> waiters would not get a chance to reset PG_waiters.

That makes a lot of sense.  And this version seems stable to me,
I've found no problems so far: magic!

Well, on the x86_64 I have seen a few of your io_schedule_timeout
printks under load; but suspect those are no fault of your changes,
but reflect some actual misbehaviour down towards the disk end (when
kernel default moved from AS to CFQ, I had to stick with AS because
CFQ ran my tests very much slower on that one machine: something odd
going on that I've occasionally wasted time looking into but never
tracked down - certainly long-locked pages are a feature of that).

> However you probably weren't referring to that particular problem
> when you imagined the need for a full count, or the slippery 3rd
> task... I wasn't able to derive any such problems with the basic
> logic, so if there was a bug there, it would still be unfixed in this
> patch.

I've been struggling to conjure up and exorcise the race that seemed
so obvious to me yesterday.  I was certainly imagining one task on
its way between SetPageWaiters and io_schedule, when the unlock_page
comes, wakes, and lets another waiter take the lock.  Probably I was
forgetting the essence of prepare_to_wait, that this task would then
fall through io_schedule as if woken as part of that batch.  Until
demonstrated otherwise, let's assume I was utterly mistaken.

In addition to 3 hours of load on the three machines, I've gone back
and applied this new patch (and the lock bitops; remembering to shift
PG_waiters up) to 2.6.21-rc3-mm2 on which I did the earlier lmbench
testing, on those three machines.

On the PowerPC G5, these changes pretty much balance out your earlier
changes (not just the one fix-fault-vs-invalidate patch, but the whole
group which came in with that - it'd take me a while to tell exactly
what, easiest to send you a diff if you want it), in those lmbench
fork, exec, sh, mmap, fault tests.  On the P4 Xeons, they improve
the numbers significantly, but only retrieve half the regression.

So here it looks like a good change; but not enough to atone ;)

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-10 19:14             ` Hugh Dickins
@ 2007-05-11  8:54               ` Nick Piggin
  2007-05-11 13:15                 ` Hugh Dickins
  0 siblings, 1 reply; 23+ messages in thread
From: Nick Piggin @ 2007-05-11  8:54 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Thu, May 10, 2007 at 08:14:52PM +0100, Hugh Dickins wrote:
> On Thu, 10 May 2007, Nick Piggin wrote:
> > 
> > OK, I found a simple bug after pulling out my hair for a while :)
> > With this, a 4-way system survives a couple of concurrent make -j250s
> > quite nicely (wheras they eventually locked up before).
> > 
> > The problem is that the bit wakeup function did not go through with
> > the wakeup if it found the bit (ie. PG_locked) set. This meant that
> > waiters would not get a chance to reset PG_waiters.
> 
> That makes a lot of sense.  And this version seems stable to me,
> I've found no problems so far: magic!
> 
> Well, on the x86_64 I have seen a few of your io_schedule_timeout
> printks under load; but suspect those are no fault of your changes,

Hmm, I see... well I forgot to remove those from the page I sent,
the timeouts will kick things off again if they get stalled, so
maybe it just hides a problem? (OTOH, I *think* the logic is pretty
sound).


> In addition to 3 hours of load on the three machines, I've gone back
> and applied this new patch (and the lock bitops; remembering to shift
> PG_waiters up) to 2.6.21-rc3-mm2 on which I did the earlier lmbench
> testing, on those three machines.
> 
> On the PowerPC G5, these changes pretty much balance out your earlier
> changes (not just the one fix-fault-vs-invalidate patch, but the whole
> group which came in with that - it'd take me a while to tell exactly
> what, easiest to send you a diff if you want it), in those lmbench
> fork, exec, sh, mmap, fault tests.  On the P4 Xeons, they improve
> the numbers significantly, but only retrieve half the regression.
> 
> So here it looks like a good change; but not enough to atone ;)

Don't worry, I'm only just beginning ;) Can we then do something crazy
like this?  (working on x86-64 only, so far. It seems to eliminate
lat_pagefault and lat_proc regressions here).

What architecture and workloads are you testing with, btw?

--

Put PG_locked in its own byte from other PG_bits, so we can use non-atomic
stores to unlock it.

Index: linux-2.6/include/asm-x86_64/bitops.h
===================================================================
--- linux-2.6.orig/include/asm-x86_64/bitops.h
+++ linux-2.6/include/asm-x86_64/bitops.h
@@ -68,6 +68,38 @@ static __inline__ void clear_bit(int nr,
 		:"dIr" (nr));
 }
 
+/**
+ * clear_bit_unlock - Clears a bit in memory with unlock semantics
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ */
+static __inline__ void clear_bit_unlock(int nr, volatile void * addr)
+{
+	barrier();
+	__asm__ __volatile__( LOCK_PREFIX
+		"btrl %1,%0"
+		:ADDR
+		:"dIr" (nr));
+}
+
+/**
+ * __clear_bit_unlock_byte - same as clear_bit_unlock but uses a byte sized
+ *			     non-atomic store
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * __clear_bit_unlock() is non-atomic, however it implements unlock ordering,
+ * so it cannot be reordered arbitrarily.
+ */
+static __inline__ void __clear_bit_unlock_byte(int nr, void *addr)
+{
+        unsigned char mask = 1UL << (nr % BITS_PER_BYTE);
+        unsigned char *p = addr + nr / BITS_PER_BYTE;
+
+        barrier();
+        *p &= ~mask;
+}
+
 static __inline__ void __clear_bit(int nr, volatile void * addr)
 {
 	__asm__ __volatile__(
@@ -132,6 +164,26 @@ static __inline__ int test_and_set_bit(i
 	return oldbit;
 }
 
+
+/**
+ * test_and_set_bit_lock - Set a bit and return its old value for locking
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and has lock barrier semantics.
+ */
+static __inline__ int test_and_set_bit_lock(int nr, volatile void * addr)
+{
+	int oldbit;
+
+	__asm__ __volatile__( LOCK_PREFIX
+		"btsl %2,%1\n\tsbbl %0,%0"
+		:"=r" (oldbit),ADDR
+		:"dIr" (nr));
+	barrier();
+	return oldbit;
+}
+
 /**
  * __test_and_set_bit - Set a bit and return its old value
  * @nr: Bit to set
@@ -408,7 +460,6 @@ static __inline__ int fls(int x)
 #define ARCH_HAS_FAST_MULTIPLIER 1
 
 #include <asm-generic/bitops/hweight.h>
-#include <asm-generic/bitops/lock.h>
 
 #endif /* __KERNEL__ */
 
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h
+++ linux-2.6/include/linux/mmzone.h
@@ -615,13 +615,13 @@ extern struct zone *next_zone(struct zon
  * with 32 bit page->flags field, we reserve 9 bits for node/zone info.
  * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes.
  */
-#define FLAGS_RESERVED		9
+#define FLAGS_RESERVED		7
 
 #elif BITS_PER_LONG == 64
 /*
  * with 64 bit flags field, there's plenty of room.
  */
-#define FLAGS_RESERVED		32
+#define FLAGS_RESERVED		31
 
 #else
 
Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -67,7 +67,6 @@
  * FLAGS_RESERVED which defines the width of the fields section
  * (see linux/mmzone.h).  New flags must _not_ overlap with this area.
  */
-#define PG_locked	 	 0	/* Page is locked. Don't touch. */
 #define PG_error		 1
 #define PG_referenced		 2
 #define PG_uptodate		 3
@@ -104,6 +103,14 @@
  *         63                            32                              0
  */
 #define PG_uncached		31	/* Page has been mapped as uncached */
+
+/*
+ * PG_locked sits in a different byte to the rest of the flags. This allows
+ * optimised implementations to use a non-atomic store to unlock.
+ */
+#define PG_locked		32
+#else
+#define PG_locked		24
 #endif
 
 /*
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -187,7 +187,11 @@ static inline void lock_page_nosync(stru
 static inline void unlock_page(struct page *page)
 {
 	VM_BUG_ON(!PageLocked(page));
-	clear_bit_unlock(PG_locked, &page->flags);
+	/*
+	 * PG_locked sits in its own byte in page->flags, away from normal
+	 * flags, so we can do a non-atomic unlock here
+	 */
+	__clear_bit_unlock_byte(PG_locked, &page->flags);
 	if (unlikely(PageWaiters(page)))
 		__unlock_page(page);
 }
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -192,17 +192,17 @@ static void bad_page(struct page *page)
 		(unsigned long)page->flags, page->mapping,
 		page_mapcount(page), page_count(page));
 	dump_stack();
-	page->flags &= ~(1 << PG_lru	|
-			1 << PG_private |
-			1 << PG_locked	|
-			1 << PG_active	|
-			1 << PG_dirty	|
-			1 << PG_reclaim |
-			1 << PG_slab    |
-			1 << PG_swapcache |
-			1 << PG_writeback |
-			1 << PG_buddy |
-			1 << PG_waiters );
+	page->flags &= ~(1UL << PG_lru	 |
+			1UL << PG_private|
+			1UL << PG_locked |
+			1UL << PG_active |
+			1UL << PG_dirty	 |
+			1UL << PG_reclaim|
+			1UL << PG_slab	 |
+			1UL << PG_swapcache|
+			1UL << PG_writeback|
+			1UL << PG_buddy	 |
+			1UL << PG_waiters );
 	set_page_count(page, 0);
 	reset_page_mapcount(page);
 	page->mapping = NULL;
@@ -427,19 +427,19 @@ static inline void __free_one_page(struc
 static inline int free_pages_check(struct page *page)
 {
 	if (unlikely(page_mapcount(page) |
-		(page->mapping != NULL)  |
-		(page_count(page) != 0)  |
+		(page->mapping != NULL)	 |
+		(page_count(page) != 0)	 |
 		(page->flags & (
-			1 << PG_lru	|
-			1 << PG_private |
-			1 << PG_locked	|
-			1 << PG_active	|
-			1 << PG_slab	|
-			1 << PG_swapcache |
-			1 << PG_writeback |
-			1 << PG_reserved |
-			1 << PG_buddy |
-			1 << PG_waiters ))))
+			1UL << PG_lru	 |
+			1UL << PG_private|
+			1UL << PG_locked |
+			1UL << PG_active |
+			1UL << PG_slab	 |
+			1UL << PG_swapcache|
+			1UL << PG_writeback|
+			1UL << PG_reserved|
+			1UL << PG_buddy	 |
+			1UL << PG_waiters ))))
 		bad_page(page);
 	/*
 	 * PageReclaim == PageTail. It is only an error
@@ -582,21 +582,21 @@ static inline void expand(struct zone *z
 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	if (unlikely(page_mapcount(page) |
-		(page->mapping != NULL)  |
-		(page_count(page) != 0)  |
+		(page->mapping != NULL)	 |
+		(page_count(page) != 0)	 |
 		(page->flags & (
-			1 << PG_lru	|
-			1 << PG_private	|
-			1 << PG_locked	|
-			1 << PG_active	|
-			1 << PG_dirty	|
-			1 << PG_reclaim	|
-			1 << PG_slab    |
-			1 << PG_swapcache |
-			1 << PG_writeback |
-			1 << PG_reserved |
-			1 << PG_buddy |
-			1 << PG_waiters ))))
+			1UL << PG_lru	 |
+			1UL << PG_private|
+			1UL << PG_locked |
+			1UL << PG_active |
+			1UL << PG_dirty	 |
+			1UL << PG_reclaim|
+			1UL << PG_slab	 |
+			1UL << PG_swapcache|
+			1UL << PG_writeback|
+			1UL << PG_reserved|
+			1UL << PG_buddy	 |
+			1UL << PG_waiters ))))
 		bad_page(page);
 
 	/*
@@ -606,9 +606,9 @@ static int prep_new_page(struct page *pa
 	if (PageReserved(page))
 		return 1;
 
-	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
-			1 << PG_referenced | 1 << PG_arch_1 |
-			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
+	page->flags &= ~(1UL << PG_uptodate | 1UL << PG_error |
+			1UL << PG_referenced | 1UL << PG_arch_1 |
+			1UL << PG_owner_priv_1 | 1UL << PG_mappedtodisk);
 	set_page_private(page, 0);
 	set_page_refcounted(page);
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-11  8:54               ` Nick Piggin
@ 2007-05-11 13:15                 ` Hugh Dickins
  2007-05-13  3:32                   ` Nick Piggin
  0 siblings, 1 reply; 23+ messages in thread
From: Hugh Dickins @ 2007-05-11 13:15 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Fri, 11 May 2007, Nick Piggin wrote:
> On Thu, May 10, 2007 at 08:14:52PM +0100, Hugh Dickins wrote:
> > 
> > Well, on the x86_64 I have seen a few of your io_schedule_timeout
> > printks under load; but suspect those are no fault of your changes,
> 
> Hmm, I see... well I forgot to remove those from the page I sent,
> the timeouts will kick things off again if they get stalled, so
> maybe it just hides a problem? (OTOH, I *think* the logic is pretty
> sound).

As I said in what you snipped, I believe your debug there is showing
up an existing problem on my machine, not a problem in your changes.

> > So here it looks like a good change; but not enough to atone ;)
> 
> Don't worry, I'm only just beginning ;) Can we then do something crazy
> like this?  (working on x86-64 only, so far. It seems to eliminate
> lat_pagefault and lat_proc regressions here).

I think Mr __NickPiggin_Lock is squirming ever more desperately.

So, in essence, you'd like to expand PG_locked from 1 to 8 bits,
despite the fact that page flags are known to be in short supply?
Ah, no, you're keeping it near the static mmzone FLAGS_RESERVED.

Hmm, well, I think that's fairly horrid, and would it even be
guaranteed to work on all architectures?  Playing with one char
of an unsigned long in one way, while playing with the whole of
the unsigned long in another way (bitops) sounds very dodgy to me.

I think I'd rather just accept that your changes have slowed some
microbenchmarks down: it is not always possible to fix a serious
bug without slowing something down.  That's probably what you're
trying to push me into saying by this patch ;)

But again I wonder just what the gain has been, once your double
unmap_mapping_range is factored in.  When I suggested before that
perhaps the double (well, treble including the one in truncate.c)
unmap_mapping_range might solve the problem you set out to solve
(I've lost sight of that!) without pagelock when faulting, you said:

> Well aside from being terribly ugly, it means we can still drop
> the dirty bit where we'd otherwise rather not, so I don't think
> we can do that.

but that didn't give me enough information to agree or disagree.

> 
> What architecture and workloads are you testing with, btw?

i386 (2*HT P4 Xeons), x86_64 (2*HT P4 Xeons), PowerPC (G5 Quad).

Workloads mostly lmbench and my usual pair of make -j20 kernel builds,
one to tmpfs and one to ext2 looped on tmpfs, restricted to 512M RAM
plus swap.  Which is ever so old but still finds enough to keep me busy.

Hugh

> --
> 
> Put PG_locked in its own byte from other PG_bits, so we can use non-atomic
> stores to unlock it.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-11 13:15                 ` Hugh Dickins
@ 2007-05-13  3:32                   ` Nick Piggin
  2007-05-13  4:39                     ` Hugh Dickins
  2007-05-16 17:21                     ` Hugh Dickins
  0 siblings, 2 replies; 23+ messages in thread
From: Nick Piggin @ 2007-05-13  3:32 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote:
> On Fri, 11 May 2007, Nick Piggin wrote:
> > 
> > Don't worry, I'm only just beginning ;) Can we then do something crazy
> > like this?  (working on x86-64 only, so far. It seems to eliminate
> > lat_pagefault and lat_proc regressions here).
> 
> I think Mr __NickPiggin_Lock is squirming ever more desperately.

Really? I thought it was pretty cool to be able to shave several
hundreds of cycles off our page lock :)


> So, in essence, you'd like to expand PG_locked from 1 to 8 bits,
> despite the fact that page flags are known to be in short supply?
> Ah, no, you're keeping it near the static mmzone FLAGS_RESERVED.

Yep, no flags bloating at all.


> Hmm, well, I think that's fairly horrid, and would it even be
> guaranteed to work on all architectures?  Playing with one char
> of an unsigned long in one way, while playing with the whole of
> the unsigned long in another way (bitops) sounds very dodgy to me.

Of course not, but they can just use a regular atomic word sized
bitop. The problem with i386 is that its atomic ops also imply
memory barriers that you obviously don't need on unlock. So I think
getting rid of them is pretty good. A grep of mm/ and fs/ for
lock_page tells me we want this to be as fast as possible even
if it isn't being used in the nopage fastpath.


> I think I'd rather just accept that your changes have slowed some
> microbenchmarks down: it is not always possible to fix a serious
> bug without slowing something down.  That's probably what you're
> trying to push me into saying by this patch ;)

Well I was resigned to that few % regression in the page fault path
until the G5 numbers showed that we needed to improve things. But
now it looks like (at least on my 2*HT P4 Xeon) that we don't have to
have any regression there.


> But again I wonder just what the gain has been, once your double
> unmap_mapping_range is factored in.  When I suggested before that
> perhaps the double (well, treble including the one in truncate.c)
> unmap_mapping_range might solve the problem you set out to solve
> (I've lost sight of that!) without pagelock when faulting, you said:
> 
> > Well aside from being terribly ugly, it means we can still drop
> > the dirty bit where we'd otherwise rather not, so I don't think
> > we can do that.
> 
> but that didn't give me enough information to agree or disagree.

Oh, well invalidate wants to be able to skip dirty pages or have the
filesystem do something special with them first. Once you have taken
the page out of the pagecache but still mapped shared, then blowing
it away doesn't actually solve the data loss problem... only makes
the window of VM inconsistency smaller.


> > What architecture and workloads are you testing with, btw?
> 
> i386 (2*HT P4 Xeons), x86_64 (2*HT P4 Xeons), PowerPC (G5 Quad).
> 
> Workloads mostly lmbench and my usual pair of make -j20 kernel builds,
> one to tmpfs and one to ext2 looped on tmpfs, restricted to 512M RAM
> plus swap.  Which is ever so old but still finds enough to keep me busy.

Thanks,
Nick

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-13  3:32                   ` Nick Piggin
@ 2007-05-13  4:39                     ` Hugh Dickins
  2007-05-13  6:52                       ` Nick Piggin
  2007-05-16 17:21                     ` Hugh Dickins
  1 sibling, 1 reply; 23+ messages in thread
From: Hugh Dickins @ 2007-05-13  4:39 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Sun, 13 May 2007, Nick Piggin wrote:
> On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote:
> 
> > Hmm, well, I think that's fairly horrid, and would it even be
> > guaranteed to work on all architectures?  Playing with one char
> > of an unsigned long in one way, while playing with the whole of
> > the unsigned long in another way (bitops) sounds very dodgy to me.
> 
> Of course not, but they can just use a regular atomic word sized
> bitop. The problem with i386 is that its atomic ops also imply
> memory barriers that you obviously don't need on unlock.

But is it even a valid procedure on i386?

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-13  4:39                     ` Hugh Dickins
@ 2007-05-13  6:52                       ` Nick Piggin
  2007-05-16 17:54                         ` Hugh Dickins
  0 siblings, 1 reply; 23+ messages in thread
From: Nick Piggin @ 2007-05-13  6:52 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Sun, May 13, 2007 at 05:39:03AM +0100, Hugh Dickins wrote:
> On Sun, 13 May 2007, Nick Piggin wrote:
> > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote:
> > 
> > > Hmm, well, I think that's fairly horrid, and would it even be
> > > guaranteed to work on all architectures?  Playing with one char
> > > of an unsigned long in one way, while playing with the whole of
> > > the unsigned long in another way (bitops) sounds very dodgy to me.
> > 
> > Of course not, but they can just use a regular atomic word sized
> > bitop. The problem with i386 is that its atomic ops also imply
> > memory barriers that you obviously don't need on unlock.
> 
> But is it even a valid procedure on i386?

Well I think so, but not completely sure. OTOH, I admit this is one
of the more contentious speedups ;) It is likely to be vary a lot by
the arch (I think the P4 is infamous for expensive locked ops, others
may prefer not to mix the byte sized ops with word length ones).

But that aside, I'd still like to do the lock page in nopage and get
this bug fixed. Now it is possible to fix some other way, eg we could
use another page flag (I'd say it would be better to use that flag for
PG_waiters and speed up all PG_locked users), however I think it is fine
to lock the page over fault. It gets rid of some complexity of memory
ordering there, and we already have to do the wait_on_page_locked thing
to prevent the page_mkclean data loss thingy.

I haven't seen a non-microbenchmark where it hurts.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-13  3:32                   ` Nick Piggin
  2007-05-13  4:39                     ` Hugh Dickins
@ 2007-05-16 17:21                     ` Hugh Dickins
  2007-05-16 17:38                       ` Nick Piggin
  1 sibling, 1 reply; 23+ messages in thread
From: Hugh Dickins @ 2007-05-16 17:21 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Sun, 13 May 2007, Nick Piggin wrote:
> On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote:
> 
> > But again I wonder just what the gain has been, once your double
> > unmap_mapping_range is factored in.  When I suggested before that
> > perhaps the double (well, treble including the one in truncate.c)
> > unmap_mapping_range might solve the problem you set out to solve
> > (I've lost sight of that!) without pagelock when faulting, you said:
> > 
> > > Well aside from being terribly ugly, it means we can still drop
> > > the dirty bit where we'd otherwise rather not, so I don't think
> > > we can do that.
> > 
> > but that didn't give me enough information to agree or disagree.
> 
> Oh, well invalidate wants to be able to skip dirty pages or have the
> filesystem do something special with them first. Once you have taken
> the page out of the pagecache but still mapped shared, then blowing
> it away doesn't actually solve the data loss problem... only makes
> the window of VM inconsistency smaller.

Right, I think I see what you mean now, thanks: userspace
must not for a moment be allowed to write to orphaned pages.

Whereas it's not an issue for the privately COWed pages you added
the second unmap_mapping_range for: because it's only truncation
that has to worry about them, so they're heading for SIGBUS anyway.

Yes, and the page_mapped tests in mm/truncate.c are just racy
heuristics without the page lock you now put into faulting.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-16 17:21                     ` Hugh Dickins
@ 2007-05-16 17:38                       ` Nick Piggin
  0 siblings, 0 replies; 23+ messages in thread
From: Nick Piggin @ 2007-05-16 17:38 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Wed, May 16, 2007 at 06:21:09PM +0100, Hugh Dickins wrote:
> On Sun, 13 May 2007, Nick Piggin wrote:
> > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote:
> > 
> > > But again I wonder just what the gain has been, once your double
> > > unmap_mapping_range is factored in.  When I suggested before that
> > > perhaps the double (well, treble including the one in truncate.c)
> > > unmap_mapping_range might solve the problem you set out to solve
> > > (I've lost sight of that!) without pagelock when faulting, you said:
> > > 
> > > > Well aside from being terribly ugly, it means we can still drop
> > > > the dirty bit where we'd otherwise rather not, so I don't think
> > > > we can do that.
> > > 
> > > but that didn't give me enough information to agree or disagree.
> > 
> > Oh, well invalidate wants to be able to skip dirty pages or have the
> > filesystem do something special with them first. Once you have taken
> > the page out of the pagecache but still mapped shared, then blowing
> > it away doesn't actually solve the data loss problem... only makes
> > the window of VM inconsistency smaller.
> 
> Right, I think I see what you mean now, thanks: userspace
> must not for a moment be allowed to write to orphaned pages.

Yep.


> Whereas it's not an issue for the privately COWed pages you added
> the second unmap_mapping_range for: because it's only truncation
> that has to worry about them, so they're heading for SIGBUS anyway.
> 
> Yes, and the page_mapped tests in mm/truncate.c are just racy
> heuristics without the page lock you now put into faulting.

Yes.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-13  6:52                       ` Nick Piggin
@ 2007-05-16 17:54                         ` Hugh Dickins
  2007-05-16 18:18                           ` Nick Piggin
  0 siblings, 1 reply; 23+ messages in thread
From: Hugh Dickins @ 2007-05-16 17:54 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Sun, 13 May 2007, Nick Piggin wrote:
> On Sun, May 13, 2007 at 05:39:03AM +0100, Hugh Dickins wrote:
> > On Sun, 13 May 2007, Nick Piggin wrote:
> > > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote:
> > > 
> > > > Hmm, well, I think that's fairly horrid, and would it even be
> > > > guaranteed to work on all architectures?  Playing with one char
> > > > of an unsigned long in one way, while playing with the whole of
> > > > the unsigned long in another way (bitops) sounds very dodgy to me.
> > > 
> > > Of course not, but they can just use a regular atomic word sized
> > > bitop. The problem with i386 is that its atomic ops also imply
> > > memory barriers that you obviously don't need on unlock.
> > 
> > But is it even a valid procedure on i386?
> 
> Well I think so, but not completely sure.

That's not quite enough to convince me!

I do retract my "fairly horrid" remark, that was a kneejerk reaction
to cleverness; it's quite nice, if it can be guaranteed to work (and
if lowering FLAGS_RESERVED from 9 to 7 doesn't upset whoever carefully
chose 9).

Please seek out those guarantees.  Like you, I can't really see how
it would go wrong (how could moving in the unlocked char mess with
the flag bits in the rest of the long? how could atomically modifying
the long have a chance of undoing that move?), but it feels like it
might take us into errata territory.

Hugh

> OTOH, I admit this is one
> of the more contentious speedups ;) It is likely to be vary a lot by
> the arch (I think the P4 is infamous for expensive locked ops, others
> may prefer not to mix the byte sized ops with word length ones).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-16 17:54                         ` Hugh Dickins
@ 2007-05-16 18:18                           ` Nick Piggin
  2007-05-16 19:28                             ` Hugh Dickins
  0 siblings, 1 reply; 23+ messages in thread
From: Nick Piggin @ 2007-05-16 18:18 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List,
	Linus Torvalds

On Wed, May 16, 2007 at 06:54:15PM +0100, Hugh Dickins wrote:
> On Sun, 13 May 2007, Nick Piggin wrote:
> > On Sun, May 13, 2007 at 05:39:03AM +0100, Hugh Dickins wrote:
> > > On Sun, 13 May 2007, Nick Piggin wrote:
> > > > On Fri, May 11, 2007 at 02:15:03PM +0100, Hugh Dickins wrote:
> > > > 
> > > > > Hmm, well, I think that's fairly horrid, and would it even be
> > > > > guaranteed to work on all architectures?  Playing with one char
> > > > > of an unsigned long in one way, while playing with the whole of
> > > > > the unsigned long in another way (bitops) sounds very dodgy to me.
> > > > 
> > > > Of course not, but they can just use a regular atomic word sized
> > > > bitop. The problem with i386 is that its atomic ops also imply
> > > > memory barriers that you obviously don't need on unlock.
> > > 
> > > But is it even a valid procedure on i386?
> > 
> > Well I think so, but not completely sure.
> 
> That's not quite enough to convince me!

I did ask Linus, and he was very sure it works.

> I do retract my "fairly horrid" remark, that was a kneejerk reaction
> to cleverness; it's quite nice, if it can be guaranteed to work (and
> if lowering FLAGS_RESERVED from 9 to 7 doesn't upset whoever carefully
> chose 9).

Hmm, it is a _little_ bit horrid ;) Maybe slightly clever, but definitely
slightly horrid as well!

Not so much from a high level view (although it does put more constraints
on the flags layout), but from a CPU level... the way we intermix different
sized loads and stores can run into store forwarding issues[*] which might
be expensive as well. Not to mention that we can't do the non-atomic
unlock on all architectures.

OTOH, it did _seem_ to eliminate the pagefault regression on my P4 Xeon
here, in one round of tests.

The other option of moving the bit into ->mapping hopefully avoids all
the issues, and would probably be a little faster again on the P4, at the
expense of being a more intrusive (but it doesn't look too bad, at first
glance)...

[*] I did mention to Linus that we might be able to avoid the store
forwarding stall by loading just a single byte to test PG_waiters after
the byte store to clear PG_locked. At this point he puked. We decided
that using another field altogether might be better.

> Please seek out those guarantees.  Like you, I can't really see how
> it would go wrong (how could moving in the unlocked char mess with
> the flag bits in the rest of the long? how could atomically modifying
> the long have a chance of undoing that move?), but it feels like it
> might take us into errata territory.

I think we can just rely on the cache coherency protocol taking care of
it for us, on x86. movb would not affect other data other than the dest.
A non-atomic op _could_ of course undo the movb, but it could likewise
undo any other store to the word or byte. An atomic op on the flags does
not modify the movb byte so the movb before/after possibilities should
look exactly the same regardless of the atomic operations happening.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-16 18:18                           ` Nick Piggin
@ 2007-05-16 19:28                             ` Hugh Dickins
  2007-05-16 19:47                               ` Linus Torvalds
  0 siblings, 1 reply; 23+ messages in thread
From: Hugh Dickins @ 2007-05-16 19:28 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List,
	Linus Torvalds

On Wed, 16 May 2007, Nick Piggin wrote:
> On Wed, May 16, 2007 at 06:54:15PM +0100, Hugh Dickins wrote:
> > On Sun, 13 May 2007, Nick Piggin wrote:
> > > 
> > > Well I think so, but not completely sure.
> > 
> > That's not quite enough to convince me!
> 
> I did ask Linus, and he was very sure it works.

Good, that's very encouraging.

> Not so much from a high level view (although it does put more constraints
> on the flags layout), but from a CPU level... the way we intermix different
> sized loads and stores can run into store forwarding issues[*] which might
> be expensive as well. Not to mention that we can't do the non-atomic
> unlock on all architectures.

Ah yes, that's easier to envisage than an actual correctness problem.

> The other option of moving the bit into ->mapping hopefully avoids all
> the issues, and would probably be a little faster again on the P4, at the
> expense of being a more intrusive (but it doesn't look too bad, at first
> glance)...

Hmm, I'm so happy with PG_swapcache in there, that I'm reluctant to
cede it to your PG_locked, though I can't deny your use should take
precedence.  Perhaps we could enforce 8-byte alignment of struct
address_space and struct anon_vma to make both bits available
(along with the anon bit).

But I think you may not be appreciating how intrusive PG_locked
will be.  There are many references to page->mapping (often ->host)
throughout fs/ : when we keep anon/swap flags in page->mapping, we
know the filesystems will never see those bits set in their pages,
so no page_mapping-like conversion is needed; just a few places in
common code need to adapt.

And given our deprecation discipline for in-kernel interfaces,
wouldn't we have to wait a similar period before making page->mapping
unavailable to out-of-tree filesystems?

> > Please seek out those guarantees.  Like you, I can't really see how
> > it would go wrong (how could moving in the unlocked char mess with
> > the flag bits in the rest of the long? how could atomically modifying
> > the long have a chance of undoing that move?), but it feels like it
> > might take us into errata territory.
> 
> I think we can just rely on the cache coherency protocol taking care of
> it for us, on x86. movb would not affect other data other than the dest.
> A non-atomic op _could_ of course undo the movb, but it could likewise
> undo any other store to the word or byte. An atomic op on the flags does
> not modify the movb byte so the movb before/after possibilities should
> look exactly the same regardless of the atomic operations happening.

Yes, I've gone through that same thought process (my questions were
intended as rhetorical exclamations of inconceivabilty, rather than
actual queries).  But if you do go that way, I'd still like you to
check with Intel and AMD for errata.  See include/asm-i386/spinlock.h
for the CONFIG_X86_OOSTORE || CONFIG_X86_PPRO_FENCE __raw_spin_unlock
using xchgb: doesn't that hint that exceptions may be needed?

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-16 19:28                             ` Hugh Dickins
@ 2007-05-16 19:47                               ` Linus Torvalds
  2007-05-17  6:27                                 ` Nick Piggin
  0 siblings, 1 reply; 23+ messages in thread
From: Linus Torvalds @ 2007-05-16 19:47 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Nick Piggin, Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List


On Wed, 16 May 2007, Hugh Dickins wrote:

> On Wed, 16 May 2007, Nick Piggin wrote:
> > On Wed, May 16, 2007 at 06:54:15PM +0100, Hugh Dickins wrote:
> > > On Sun, 13 May 2007, Nick Piggin wrote:
> > > > 
> > > > Well I think so, but not completely sure.
> > > 
> > > That's not quite enough to convince me!
> > 
> > I did ask Linus, and he was very sure it works.
> 
> Good, that's very encouraging.

Note that our default spinlocks _depend_ on a bog-standard store just 
working as an unlock, so this wouldn't even be anything half-way new:

	static inline void __raw_spin_unlock(raw_spinlock_t *lock)
	{
		asm volatile("movb $1,%0" : "+m" (lock->slock) :: "memory");
	}

There are some Opteron errata (and a really old P6 bug) wrt this, but they 
are definitely CPU bugs, and we haven't really worked out what the Opteron 
solution should be (the bug is apparently pretty close to impossible to 
trigger in practice, so it's not been a high priority).

> > The other option of moving the bit into ->mapping hopefully avoids all
> > the issues, and would probably be a little faster again on the P4, at the
> > expense of being a more intrusive (but it doesn't look too bad, at first
> > glance)...
> 
> Hmm, I'm so happy with PG_swapcache in there, that I'm reluctant to
> cede it to your PG_locked, though I can't deny your use should take
> precedence.  Perhaps we could enforce 8-byte alignment of struct
> address_space and struct anon_vma to make both bits available
> (along with the anon bit).

We probably could. It should be easy enough to mark "struct address_space" 
to be 8-byte aligned.

> But I think you may not be appreciating how intrusive PG_locked
> will be.  There are many references to page->mapping (often ->host)
> throughout fs/ : when we keep anon/swap flags in page->mapping, we
> know the filesystems will never see those bits set in their pages,
> so no page_mapping-like conversion is needed; just a few places in
> common code need to adapt.

You're right, it could be really painful. We'd have to rename the field, 
and use some inline function to access it (which masks off the low bits).

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [rfc] optimise unlock_page
  2007-05-16 19:47                               ` Linus Torvalds
@ 2007-05-17  6:27                                 ` Nick Piggin
  0 siblings, 0 replies; 23+ messages in thread
From: Nick Piggin @ 2007-05-17  6:27 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Hugh Dickins, Benjamin Herrenschmidt, linux-arch, Andrew Morton,
	Linux Kernel Mailing List, Linux Memory Management List

On Wed, May 16, 2007 at 12:47:54PM -0700, Linus Torvalds wrote:
> 
> On Wed, 16 May 2007, Hugh Dickins wrote:
> 
> > > The other option of moving the bit into ->mapping hopefully avoids all
> > > the issues, and would probably be a little faster again on the P4, at the
> > > expense of being a more intrusive (but it doesn't look too bad, at first
> > > glance)...
> > 
> > Hmm, I'm so happy with PG_swapcache in there, that I'm reluctant to
> > cede it to your PG_locked, though I can't deny your use should take
> > precedence.  Perhaps we could enforce 8-byte alignment of struct
> > address_space and struct anon_vma to make both bits available
> > (along with the anon bit).
> 
> We probably could. It should be easy enough to mark "struct address_space" 
> to be 8-byte aligned.

Yeah, it might be worthwhile, because I agree that PG_swapcache would
work nicely there too.


> > But I think you may not be appreciating how intrusive PG_locked
> > will be.  There are many references to page->mapping (often ->host)
> > throughout fs/ : when we keep anon/swap flags in page->mapping, we
> > know the filesystems will never see those bits set in their pages,
> > so no page_mapping-like conversion is needed; just a few places in
> > common code need to adapt.
> 
> You're right, it could be really painful. We'd have to rename the field, 
> and use some inline function to access it (which masks off the low bits).

Yeah, I realise that the change is intrusive in terms of lines touched,
but AFAIKS, it should not be much more complex than a search/replace...

As far as deprecating things goes... I don't think we have to wait too
long, its more for features, drivers, or more fundamental APIs isn't it?
If we just point out that one must use set_page_mapping/page_mapping
rather than page->mapping, it is trivial to fix any out of tree breakage.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2007-05-17  6:27 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20070508113709.GA19294@wotan.suse.de>
2007-05-08 11:40 ` [rfc] optimise unlock_page Nick Piggin
2007-05-08 20:08   ` Hugh Dickins
2007-05-08 21:30   ` Benjamin Herrenschmidt
2007-05-08 22:41     ` Nick Piggin
2007-05-08 22:50       ` Nick Piggin
2007-05-09 19:33         ` Hugh Dickins
2007-05-09 21:21           ` Benjamin Herrenschmidt
2007-05-10  3:37           ` Nick Piggin
2007-05-10 19:14             ` Hugh Dickins
2007-05-11  8:54               ` Nick Piggin
2007-05-11 13:15                 ` Hugh Dickins
2007-05-13  3:32                   ` Nick Piggin
2007-05-13  4:39                     ` Hugh Dickins
2007-05-13  6:52                       ` Nick Piggin
2007-05-16 17:54                         ` Hugh Dickins
2007-05-16 18:18                           ` Nick Piggin
2007-05-16 19:28                             ` Hugh Dickins
2007-05-16 19:47                               ` Linus Torvalds
2007-05-17  6:27                                 ` Nick Piggin
2007-05-16 17:21                     ` Hugh Dickins
2007-05-16 17:38                       ` Nick Piggin
2007-05-08 12:13 ` David Howells
2007-05-08 22:35   ` Nick Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox