linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Nick Piggin <nickpiggin@yahoo.com.au>
To: linux-kernel <linux-kernel@vger.kernel.org>,
	Linux Memory Management <linux-mm@kvack.org>
Subject: [patch 2] mm: speculative get_page
Date: Mon, 27 Jun 2005 16:32:38 +1000	[thread overview]
Message-ID: <42BF9D86.90204@yahoo.com.au> (raw)
In-Reply-To: <42BF9D67.10509@yahoo.com.au>

[-- Attachment #1: Type: text/plain, Size: 28 bytes --]


-- 
SUSE Labs, Novell Inc.

[-- Attachment #2: mm-speculative-get_page.patch --]
[-- Type: text/plain, Size: 6511 bytes --]

If we can be sure that elevating the page_count on a pagecache
page will pin it, we can speculatively run this operation, and
subsequently check to see if we hit the right page rather than
relying on holding a lock or otherwise pinning a reference to
the page.

This can be done if get_page/put_page behaves in the same manner
throughout the whole tree (i.e. if we "get" the page after it has
been used for something else, we must be able to free it with a
put_page).

There needs to be some careful logic for freed pages so they aren't
freed again, and also some careful logic for pages in the process
of being removed from pagecache.

Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -77,6 +77,7 @@
 #define PG_uncached		19	/* Page has been mapped as uncached */
 
 #define PG_free			20	/* Page is on the free lists */
+#define PG_freeing		21	/* Pagecache page is about to be freed */
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -312,6 +313,11 @@ extern void __mod_page_state(unsigned lo
 #define __SetPageFree(page)	__set_bit(PG_free, &(page)->flags)
 #define __ClearPageFree(page)	__clear_bit(PG_free, &(page)->flags)
 
+#define PageFreeing(page)	test_bit(PG_freeing, &(page)->flags)
+#define SetPageFreeing(page)	set_bit(PG_freeing, &(page)->flags)
+#define ClearPageFreeing(page)	clear_bit(PG_freeing, &(page)->flags)
+#define __ClearPageFreeing(page) __clear_bit(PG_freeing, &(page)->flags)
+
 struct page;	/* forward declaration */
 
 int test_clear_page_dirty(struct page *page);
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -50,6 +50,42 @@ static inline void mapping_set_gfp_mask(
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
+static inline struct page *page_cache_get_speculative(struct page **pagep)
+{
+	struct page *page;
+
+	preempt_disable();
+	page = *pagep;
+	if (!page)
+		goto out_failed;
+
+	if (unlikely(get_page_testone(page))) {
+		/* Picked up a freed page */
+		__put_page(page);
+		goto out_failed;
+	}
+	/*
+	 * Preemption could be enabled here: it only needs to be disabled
+	 * because page allocation can spin on the elevated refcount. But
+	 * we don't want to hold a reference on an unrelated page for too
+	 * long, so keep preempt off until we know we have the right page.
+	 */
+
+	if (unlikely(PageFreeing(page)) ||
+			unlikely(page != *pagep)) {
+		/* Picked up a page being freed, or one that's been reused */
+		put_page(page);
+		goto out_failed;
+	}
+	preempt_enable();
+
+	return page;
+
+out_failed:
+	preempt_enable();
+	return NULL;
+}
+
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
 	return alloc_pages(mapping_gfp_mask(x)|__GFP_NORECLAIM, 0);
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -116,7 +116,6 @@ static void bad_page(const char *functio
 			1 << PG_writeback |
 			1 << PG_reserved |
 			1 << PG_free );
-	set_page_count(page, 0);
 	reset_page_mapcount(page);
 	page->mapping = NULL;
 	tainted |= TAINT_BAD_PAGE;
@@ -316,7 +315,6 @@ static inline void free_pages_check(cons
 {
 	if (	page_mapcount(page) ||
 		page->mapping != NULL ||
-		page_count(page) != 0 ||
 		(page->flags & (
 			1 << PG_lru	|
 			1 << PG_private |
@@ -424,7 +422,7 @@ expand(struct zone *zone, struct page *p
 void set_page_refs(struct page *page, int order)
 {
 #ifdef CONFIG_MMU
-	set_page_count(page, 1);
+	get_page(page);
 #else
 	int i;
 
@@ -434,7 +432,7 @@ void set_page_refs(struct page *page, in
 	 * - eg: access_process_vm()
 	 */
 	for (i = 0; i < (1 << order); i++)
-		set_page_count(page + i, 1);
+		get_page(page + i);
 #endif /* CONFIG_MMU */
 }
 
@@ -445,7 +443,6 @@ static void prep_new_page(struct page *p
 {
 	if (	page_mapcount(page) ||
 		page->mapping != NULL ||
-		page_count(page) != 0 ||
 		(page->flags & (
 			1 << PG_lru	|
 			1 << PG_private	|
@@ -464,7 +461,13 @@ static void prep_new_page(struct page *p
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_checked | 1 << PG_mappedtodisk);
 	page->private = 0;
+
 	set_page_refs(page, order);
+	smp_mb();
+	/* Wait for speculative get_page after count has been elevated. */
+	while (unlikely(page_count(page) > 1))
+		cpu_relax();
+
 	kernel_map_pages(page, 1 << order, 1);
 }
 
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c
+++ linux-2.6/mm/vmscan.c
@@ -504,6 +504,7 @@ static int shrink_list(struct list_head 
 		if (!mapping)
 			goto keep_locked;	/* truncate got there first */
 
+		SetPageFreeing(page);
 		write_lock_irq(&mapping->tree_lock);
 
 		/*
@@ -513,6 +514,7 @@ static int shrink_list(struct list_head 
 		 */
 		if (page_count(page) != 2 || PageDirty(page)) {
 			write_unlock_irq(&mapping->tree_lock);
+			ClearPageFreeing(page);
 			goto keep_locked;
 		}
 
@@ -533,6 +535,7 @@ static int shrink_list(struct list_head 
 
 free_it:
 		unlock_page(page);
+		__ClearPageFreeing(page);
 		reclaimed++;
 		if (!pagevec_add(&freed_pvec, page))
 			__pagevec_release_nonlru(&freed_pvec);
Index: linux-2.6/mm/bootmem.c
===================================================================
--- linux-2.6.orig/mm/bootmem.c
+++ linux-2.6/mm/bootmem.c
@@ -278,17 +278,19 @@ static unsigned long __init free_all_boo
 		if (gofast && v == ~0UL) {
 			int j, order;
 
+			prefetchw(page);
 			count += BITS_PER_LONG;
-			__ClearPageReserved(page);
+
 			order = ffs(BITS_PER_LONG) - 1;
-			set_page_refs(page, order);
-			for (j = 1; j < BITS_PER_LONG; j++) {
-				if (j + 16 < BITS_PER_LONG)
-					prefetchw(page + j + 16);
+			for (j = 0; j < BITS_PER_LONG; j++) {
+				if (j + 1 < BITS_PER_LONG)
+					prefetchw(page + j + 1);
 				__ClearPageReserved(page + j);
 				set_page_count(page + j, 0);
 			}
+			set_page_refs(page, order);
 			__free_pages(page, order);
+
 			i += BITS_PER_LONG;
 			page += BITS_PER_LONG;
 		} else if (v) {
@@ -297,6 +299,7 @@ static unsigned long __init free_all_boo
 				if (v & m) {
 					count++;
 					__ClearPageReserved(page);
+					set_page_count(page, 0);
 					set_page_refs(page, 0);
 					__free_page(page);
 				}

  reply	other threads:[~2005-06-27  6:32 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-06-27  6:29 [rfc] lockless pagecache Nick Piggin
2005-06-27  6:32 ` [patch 1] mm: PG_free flag Nick Piggin
2005-06-27  6:32   ` Nick Piggin [this message]
2005-06-27  6:33     ` [patch 3] radix tree: lookup_slot Nick Piggin
2005-06-27  6:34       ` [patch 4] radix tree: lockless readside Nick Piggin
2005-06-27  6:34         ` [patch 5] mm: lockless pagecache lookups Nick Piggin
2005-06-27  6:35           ` [patch 6] mm: spinlock tree_lock Nick Piggin
2005-06-27 14:12     ` [patch 2] mm: speculative get_page William Lee Irwin III
2005-06-28  0:03       ` Nick Piggin
2005-06-28  0:56         ` Nick Piggin
2005-06-28  1:22         ` William Lee Irwin III
2005-06-28  1:42           ` Nick Piggin
2005-06-28  4:06             ` William Lee Irwin III
2005-06-28  4:50               ` Nick Piggin
2005-06-28  5:08                 ` [patch 2] mm: speculative get_page, " David S. Miller, Nick Piggin
2005-06-28  5:34                   ` Nick Piggin
2005-06-28 14:19                   ` William Lee Irwin III
2005-06-28 15:43                     ` Nick Piggin
2005-06-28 17:01                       ` Christoph Lameter
2005-06-28 23:10                         ` Nick Piggin
2005-06-28 21:32                   ` Jesse Barnes
2005-06-28 22:17                     ` Christoph Lameter
2005-06-28 12:45     ` Andy Whitcroft
2005-06-28 13:16       ` Nick Piggin
2005-06-28 16:02         ` Dave Hansen
2005-06-29 16:31           ` Pavel Machek
2005-06-29 18:43             ` Dave Hansen
2005-06-29 21:22               ` Pavel Machek
2005-06-29 16:31         ` Pavel Machek
2005-06-27  6:43 ` VFS scalability (was: [rfc] lockless pagecache) Nick Piggin
2005-06-27  7:13   ` Andi Kleen
2005-06-27  7:33     ` VFS scalability Nick Piggin
2005-06-27  7:44       ` Andi Kleen
2005-06-27  8:03         ` Nick Piggin
2005-06-27  7:46 ` [rfc] lockless pagecache Andrew Morton
2005-06-27  8:02   ` Nick Piggin
2005-06-27  8:15     ` Andrew Morton
2005-06-27  8:28       ` Nick Piggin
2005-06-27  8:56     ` Lincoln Dale
2005-06-27  9:04       ` Nick Piggin
2005-06-27 18:14         ` Chen, Kenneth W
2005-06-27 18:50           ` Badari Pulavarty
2005-06-27 19:05             ` Chen, Kenneth W
2005-06-27 19:22               ` Christoph Lameter
2005-06-27 19:42                 ` Chen, Kenneth W
2005-07-05 15:11                   ` Sonny Rao
2005-07-05 15:31                     ` Martin J. Bligh
2005-07-05 15:37                       ` Sonny Rao
2005-06-27 13:17     ` Benjamin LaHaise
2005-06-28  0:32       ` Nick Piggin
2005-06-28  1:26         ` William Lee Irwin III
2005-06-27 14:08   ` Martin J. Bligh
2005-06-27 17:49   ` Christoph Lameter
2005-06-29 10:49 ` Hirokazu Takahashi
2005-06-29 11:38   ` Nick Piggin
2005-06-30  3:32     ` Hirokazu Takahashi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=42BF9D86.90204@yahoo.com.au \
    --to=nickpiggin@yahoo.com.au \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox