* [patch 1/3] radix tree: RCU lockless read-side
2006-04-04 9:31 [patch 0/3] lockless pagecache Nick Piggin
@ 2006-04-04 9:31 ` Nick Piggin
2006-04-04 9:32 ` [patch 2/3] mm: speculative get_page Nick Piggin
2006-04-04 9:32 ` [patch 3/3] mm: lockless pagecache lookups Nick Piggin
2 siblings, 0 replies; 11+ messages in thread
From: Nick Piggin @ 2006-04-04 9:31 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Kernel, Nick Piggin, Linux Memory Management
Make radix tree lookups safe to be performed without locks. Readers
are protected against nodes being deleted by using RCU based freeing.
Readers are protected against new node insertion by using memory
barriers to ensure the node itself will be properly written before it
is visible in the radix tree.
Each radix tree node keeps a record of its height (above the leaf
nodes). This height does not change after insertion -- when the radix
tree is extended, higher nodes are only inserted at the top. So a
lookup can take the pointer to what is *now* the root node, and
traverse down it even if the tree is concurrently extended and this
node becomes a subtree of a new root.
When a reader wants to traverse the next branch, it takes a copy of
the pointer. This pointer will be either NULL (the branch is empty)
or non-NULL (and will point to a valid node).
Also introduce a lock-free radix_tree_gang_lookup_slot which will be
used by a future patch.
Signed-off-by: Nick Piggin <npiggin@suse.de>
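As an illustration (not part of the diff below), a lockless read-side
caller would look roughly like this; 'mapping' and 'index' are stand-ins
for the caller's tree root and key, and the caller must still pin the
returned item before leaving the RCU critical section (which is what the
speculative get_page patch provides for pagecache pages):

	void *item;

	rcu_read_lock();
	/*
	 * No tree_lock is taken: root->rnode and the node slots are
	 * sampled with rcu_dereference() inside the lookup, and deleted
	 * nodes are only freed after a grace period.
	 */
	item = radix_tree_lookup(&mapping->page_tree, index);
	if (item) {
		/* take a reference on 'item' here, before rcu_read_unlock,
		 * or it may be freed under us */
	}
	rcu_read_unlock();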
Index: linux-2.6/lib/radix-tree.c
===================================================================
--- linux-2.6.orig/lib/radix-tree.c
+++ linux-2.6/lib/radix-tree.c
@@ -30,6 +30,7 @@
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/bitops.h>
+#include <linux/rcupdate.h>
#ifdef __KERNEL__
@@ -45,7 +46,9 @@
((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
struct radix_tree_node {
+ unsigned int height; /* Height from the bottom */
unsigned int count;
+ struct rcu_head rcu_head;
void *slots[RADIX_TREE_MAP_SIZE];
unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
};
@@ -97,10 +100,17 @@ radix_tree_node_alloc(struct radix_tree_
return ret;
}
+static void radix_tree_node_rcu_free(struct rcu_head *head)
+{
+ struct radix_tree_node *node =
+ container_of(head, struct radix_tree_node, rcu_head);
+ kmem_cache_free(radix_tree_node_cachep, node);
+}
+
static inline void
radix_tree_node_free(struct radix_tree_node *node)
{
- kmem_cache_free(radix_tree_node_cachep, node);
+ call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}
/*
@@ -206,6 +216,7 @@ static int radix_tree_extend(struct radi
}
do {
+ unsigned int newheight;
if (!(node = radix_tree_node_alloc(root)))
return -ENOMEM;
@@ -218,9 +229,11 @@ static int radix_tree_extend(struct radi
tag_set(node, tag, 0);
}
+ newheight = root->height+1;
+ node->height = newheight;
node->count = 1;
- root->rnode = node;
- root->height++;
+ rcu_assign_pointer(root->rnode, node);
+ root->height = newheight;
} while (height > root->height);
out:
return 0;
@@ -260,11 +273,12 @@ int radix_tree_insert(struct radix_tree_
/* Have to add a child node. */
if (!(slot = radix_tree_node_alloc(root)))
return -ENOMEM;
+ slot->height = height;
if (node) {
- node->slots[offset] = slot;
+ rcu_assign_pointer(node->slots[offset], slot);
node->count++;
} else
- root->rnode = slot;
+ rcu_assign_pointer(root->rnode, slot);
}
/* Go a level down */
@@ -280,7 +294,7 @@ int radix_tree_insert(struct radix_tree_
BUG_ON(!node);
node->count++;
- node->slots[offset] = item;
+ rcu_assign_pointer(node->slots[offset], item);
BUG_ON(tag_get(node, 0, offset));
BUG_ON(tag_get(node, 1, offset));
@@ -292,25 +306,29 @@ static inline void **__lookup_slot(struc
unsigned long index)
{
unsigned int height, shift;
- struct radix_tree_node **slot;
+ struct radix_tree_node *node, **slot;
- height = root->height;
+ /* Must take a copy now because root->rnode may change */
+ node = rcu_dereference(root->rnode);
+ if (node == NULL)
+ return NULL;
+
+ height = node->height;
if (index > radix_tree_maxindex(height))
return NULL;
shift = (height-1) * RADIX_TREE_MAP_SHIFT;
- slot = &root->rnode;
- while (height > 0) {
- if (*slot == NULL)
+ do {
+ slot = (struct radix_tree_node **)
+ (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
+ node = rcu_dereference(*slot);
+ if (node == NULL)
return NULL;
- slot = (struct radix_tree_node **)
- ((*slot)->slots +
- ((index >> shift) & RADIX_TREE_MAP_MASK));
shift -= RADIX_TREE_MAP_SHIFT;
height--;
- }
+ } while (height > 0);
return (void **)slot;
}
@@ -322,6 +340,12 @@ static inline void **__lookup_slot(struc
*
* Lookup the slot corresponding to the position @index in the radix tree
* @root. This is useful for update-if-exists operations.
+ *
+ * This function can be called under rcu_read_lock, however it is the
+ * duty of the caller to manage the lifetimes of the leaf nodes (ie.
+ * they would usually be RCU protected as well). Also, dereferencing
+ * the slot pointer would require rcu_dereference, and modifying it
+ * would require rcu_assign_pointer.
*/
void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
{
@@ -335,13 +359,18 @@ EXPORT_SYMBOL(radix_tree_lookup_slot);
* @index: index key
*
* Lookup the item at the position @index in the radix tree @root.
+ *
+ * Like radix_tree_lookup_slot, this function can be called under
+ * rcu_read_lock, and likewise the caller must manage lifetimes of
+ * leaf nodes. No RCU barriers are required to access or modify the
+ * returned item, however.
*/
void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
{
void **slot;
slot = __lookup_slot(root, index);
- return slot != NULL ? *slot : NULL;
+ return slot != NULL ? rcu_dereference(*slot) : NULL;
}
EXPORT_SYMBOL(radix_tree_lookup);
@@ -505,7 +534,7 @@ EXPORT_SYMBOL(radix_tree_tag_get);
#endif
static unsigned int
-__lookup(struct radix_tree_root *root, void **results, unsigned long index,
+__lookup(struct radix_tree_root *root, void ***results, unsigned long index,
unsigned int max_items, unsigned long *next_index)
{
unsigned int nr_found = 0;
@@ -513,18 +542,20 @@ __lookup(struct radix_tree_root *root, v
struct radix_tree_node *slot;
unsigned long i;
- height = root->height;
- if (height == 0)
+ slot = rcu_dereference(root->rnode);
+ if (!slot || slot->height == 0)
goto out;
+ height = slot->height;
shift = (height-1) * RADIX_TREE_MAP_SHIFT;
- slot = root->rnode;
for ( ; height > 1; height--) {
+ struct radix_tree_node *__s;
for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
i < RADIX_TREE_MAP_SIZE; i++) {
- if (slot->slots[i] != NULL)
+ __s = rcu_dereference(slot->slots[i]);
+ if (__s != NULL)
break;
index &= ~((1UL << shift) - 1);
index += 1UL << shift;
@@ -535,14 +566,14 @@ __lookup(struct radix_tree_root *root, v
goto out;
shift -= RADIX_TREE_MAP_SHIFT;
- slot = slot->slots[i];
+ slot = __s;
}
/* Bottom level: grab some items */
for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
index++;
if (slot->slots[i]) {
- results[nr_found++] = slot->slots[i];
+ results[nr_found++] = &slot->slots[i];
if (nr_found == max_items)
goto out;
}
@@ -572,6 +603,46 @@ radix_tree_gang_lookup(struct radix_tree
const unsigned long max_index = radix_tree_maxindex(root->height);
unsigned long cur_index = first_index;
unsigned int ret = 0;
+ void ***__results = (void ***)results; /* use results as a temporary
+ * store for the pointers to
+ * the actual results */
+
+ while (ret < max_items) {
+ unsigned int nr_found, i;
+ unsigned long next_index; /* Index of next search */
+
+ if (cur_index > max_index)
+ break;
+ nr_found = __lookup(root, __results + ret, cur_index,
+ max_items - ret, &next_index);
+ for (i = 0; i < nr_found; i++)
+ results[ret + i] = *rcu_dereference(__results[ret + i]);
+ ret += nr_found;
+ if (next_index == 0)
+ break;
+ cur_index = next_index;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(radix_tree_gang_lookup);
+
+/**
+ * radix_tree_gang_lookup_slot - perform multiple lookup on a radix tree
+ * @root: radix tree root
+ * @results: where the results of the lookup are placed
+ * @first_index: start the lookup from this key
+ * @max_items: place up to this many items at *results
+ *
+ * Same as radix_tree_gang_lookup, but returns an array of pointers
+ * (slots) to the stored items instead of the items themselves.
+ */
+unsigned int
+radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
+ unsigned long first_index, unsigned int max_items)
+{
+ const unsigned long max_index = radix_tree_maxindex(root->height);
+ unsigned long cur_index = first_index;
+ unsigned int ret = 0;
while (ret < max_items) {
unsigned int nr_found;
@@ -588,7 +659,8 @@ radix_tree_gang_lookup(struct radix_tree
}
return ret;
}
-EXPORT_SYMBOL(radix_tree_gang_lookup);
+EXPORT_SYMBOL_GPL(radix_tree_gang_lookup_slot);
+
/*
* FIXME: the two tag_get()s here should use find_next_bit() instead of
@@ -694,6 +766,11 @@ static inline void radix_tree_shrink(str
root->rnode->slots[0]) {
struct radix_tree_node *to_free = root->rnode;
+ /*
+ * this doesn't need an rcu_assign_pointer, because
+ * we aren't touching the object that to_free->slots[0]
+ * points to.
+ */
root->rnode = to_free->slots[0];
root->height--;
/* must only free zeroed nodes into the slab */
@@ -809,7 +886,7 @@ EXPORT_SYMBOL(radix_tree_delete);
int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
{
struct radix_tree_node *rnode;
- rnode = root->rnode;
+ rnode = rcu_dereference(root->rnode);
if (!rnode)
return 0;
return any_tag_set(rnode, tag);
Index: linux-2.6/include/linux/radix-tree.h
===================================================================
--- linux-2.6.orig/include/linux/radix-tree.h
+++ linux-2.6/include/linux/radix-tree.h
@@ -54,6 +54,9 @@ void *radix_tree_delete(struct radix_tre
unsigned int
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
unsigned long first_index, unsigned int max_items);
+unsigned int
+radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
+ unsigned long first_index, unsigned int max_items);
int radix_tree_preload(gfp_t gfp_mask);
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *root,
* [patch 2/3] mm: speculative get_page
2006-04-04 9:31 [patch 0/3] lockless pagecache Nick Piggin
2006-04-04 9:31 ` [patch 1/3] radix tree: RCU lockless read-side Nick Piggin
@ 2006-04-04 9:32 ` Nick Piggin
2006-04-04 9:47 ` Andrew Morton
` (2 more replies)
2006-04-04 9:32 ` [patch 3/3] mm: lockless pagecache lookups Nick Piggin
2 siblings, 3 replies; 11+ messages in thread
From: Nick Piggin @ 2006-04-04 9:32 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Kernel, Nick Piggin, Linux Memory Management
If we can be sure that elevating the page_count on a pagecache
page will pin it, we can speculatively run this operation, and
subsequently check to see if we hit the right page rather than
relying on holding a lock or otherwise pinning a reference to the
page.
This can be done if get_page/put_page behaves consistently
throughout the whole tree (ie. if we "get" the page after it has
been used for something else, we must be able to free it with a
put_page).
Actually, there is a period where the count behaves differently:
when the page is free or if it is a constituent page of a compound
page. We need an atomic_inc_not_zero operation to ensure we don't
try to grab the page in either case.
Signed-off-by: Nick Piggin <npiggin@suse.de>
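Condensed, the speculative get boils down to the following sketch (the
real page_cache_get_speculative() below additionally spins while
PageNoNewRefs is set and inserts the barriers described in its comments;
'retry_lookup' is a stand-in for redoing the radix tree lookup):

	page = rcu_dereference(*pagep);
	if (!page)
		return NULL;
	if (!get_page_unless_zero(page))
		goto retry_lookup;	/* count was zero: page was free */
	if (page != *pagep) {
		/* page was removed or replaced after we pinned it */
		put_page(page);
		goto retry_lookup;
	}
	return page;			/* the reference now pins the page */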
Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -76,6 +76,9 @@
#define PG_nosave_free 18 /* Free, should not be written */
#define PG_uncached 19 /* Page has been mapped as uncached */
+#define PG_nonewrefs 20 /* Block concurrent pagecache lookups
+ * while testing refcount */
+
/*
* Global page accounting. One instance per CPU. Only unsigned longs are
* allowed.
@@ -346,6 +349,11 @@ extern void __mod_page_state_offset(unsi
#define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags)
#define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags)
+#define PageNoNewRefs(page) test_bit(PG_nonewrefs, &(page)->flags)
+#define SetPageNoNewRefs(page) set_bit(PG_nonewrefs, &(page)->flags)
+#define ClearPageNoNewRefs(page) clear_bit(PG_nonewrefs, &(page)->flags)
+#define __ClearPageNoNewRefs(page) __clear_bit(PG_nonewrefs, &(page)->flags)
+
struct page; /* forward declaration */
int test_clear_page_dirty(struct page *page);
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -11,6 +11,8 @@
#include <linux/compiler.h>
#include <asm/uaccess.h>
#include <linux/gfp.h>
+#include <linux/page-flags.h>
+#include <linux/hardirq.h> /* for in_interrupt() */
/*
* Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page
@@ -51,6 +53,91 @@ static inline void mapping_set_gfp_mask(
#define page_cache_release(page) put_page(page)
void release_pages(struct page **pages, int nr, int cold);
+static inline struct page *page_cache_get_speculative(struct page **pagep)
+{
+ struct page *page;
+
+ VM_BUG_ON(in_interrupt());
+
+#ifndef CONFIG_SMP
+ page = *pagep;
+ if (unlikely(!page))
+ return NULL;
+
+ VM_BUG_ON(!in_atomic());
+ /*
+ * Preempt must be disabled here - we rely on rcu_read_lock doing
+ * this for us.
+ *
+ * Pagecache won't be truncated from interrupt context, so if we have
+ * found a page in the radix tree here, we have pinned its refcount by
+ * disabling preempt, and hence no need for the "speculative get" that
+ * SMP requires.
+ */
+ VM_BUG_ON(page_count(page) == 0);
+ atomic_inc(&page->_count);
+ VM_BUG_ON(page != *pagep);
+
+#else
+ again:
+ page = rcu_dereference(*pagep);
+ if (unlikely(!page))
+ return NULL;
+
+ if (unlikely(!get_page_unless_zero(page)))
+ goto again; /* page has been freed */
+
+ /*
+ * Note that get_page_unless_zero provides a memory barrier.
+ * This is needed to ensure PageNoNewRefs is evaluated after the
+ * page refcount has been raised. See below comment.
+ */
+
+ /*
+ * PageNoNewRefs is set in order to prevent new references to the
+ * page (eg. before it gets removed from pagecache). Wait until it
+ * becomes clear (and checks below will ensure we still have the
+ * correct one).
+ */
+ while (unlikely(PageNoNewRefs(page)))
+ cpu_relax();
+
+ /*
+ * smp_rmb is to ensure the load of page->flags (for PageNoNewRefs())
+ * is performed before the load of *pagep in the below comparison.
+ *
+ * Those places that set PageNoNewRefs have the following pattern:
+ * SetPageNoNewRefs(page)
+ * wmb();
+ * if (page_count(page) == X)
+ * remove page from pagecache
+ * wmb();
+ * ClearPageNoNewRefs(page)
+ *
+ * So PageNoNewRefs() becomes clear _after_ we've elevated page
+ * refcount, then either the page will be safely pinned in pagecache,
+ * or it will have been already removed. In the latter case, *pagep
+ * will be changed in the below test - provided it is loaded after
+ * testing PageNoNewRefs() (which is what the smp_rmb is for).
+ *
+ * If the load was out of order, *pagep might be loaded before the
+ * page is removed from pagecache while PageNoNewRefs evaluated after
+ * the ClearPageNoNewRefs().
+ */
+ smp_rmb();
+
+ if (unlikely(page != *pagep)) {
+ /* page no longer at *pagep */
+ put_page(page);
+ goto again;
+ }
+
+#endif
+ VM_BUG_ON(PageCompound(page) && (struct page *)page_private(page) != page);
+
+ return page;
+}
+
#ifdef CONFIG_NUMA
extern struct page *page_cache_alloc(struct address_space *x);
extern struct page *page_cache_alloc_cold(struct address_space *x);
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c
+++ linux-2.6/mm/vmscan.c
@@ -365,6 +365,7 @@ int remove_mapping(struct address_space
if (!mapping)
return 0; /* truncate got there first */
+ SetPageNoNewRefs(page);
write_lock_irq(&mapping->tree_lock);
/*
@@ -383,17 +384,20 @@ int remove_mapping(struct address_space
__delete_from_swap_cache(page);
write_unlock_irq(&mapping->tree_lock);
swap_free(swap);
- __put_page(page); /* The pagecache ref */
- return 1;
+ goto free_it;
}
__remove_from_page_cache(page);
write_unlock_irq(&mapping->tree_lock);
- __put_page(page);
+
+free_it:
+ __ClearPageNoNewRefs(page);
+ __put_page(page); /* The pagecache ref */
return 1;
cannot_free:
write_unlock_irq(&mapping->tree_lock);
+ ClearPageNoNewRefs(page);
return 0;
}
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -407,6 +407,7 @@ int add_to_page_cache(struct page *page,
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
+ SetPageNoNewRefs(page);
write_lock_irq(&mapping->tree_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
if (!error) {
@@ -418,6 +419,7 @@ int add_to_page_cache(struct page *page,
pagecache_acct(1);
}
write_unlock_irq(&mapping->tree_lock);
+ ClearPageNoNewRefs(page);
radix_tree_preload_end();
}
return error;
Index: linux-2.6/mm/swap_state.c
===================================================================
--- linux-2.6.orig/mm/swap_state.c
+++ linux-2.6/mm/swap_state.c
@@ -78,6 +78,7 @@ static int __add_to_swap_cache(struct pa
BUG_ON(PagePrivate(page));
error = radix_tree_preload(gfp_mask);
if (!error) {
+ SetPageNoNewRefs(page);
write_lock_irq(&swapper_space.tree_lock);
error = radix_tree_insert(&swapper_space.page_tree,
entry.val, page);
@@ -90,6 +91,7 @@ static int __add_to_swap_cache(struct pa
pagecache_acct(1);
}
write_unlock_irq(&swapper_space.tree_lock);
+ ClearPageNoNewRefs(page);
radix_tree_preload_end();
}
return error;
Index: linux-2.6/mm/migrate.c
===================================================================
--- linux-2.6.orig/mm/migrate.c
+++ linux-2.6/mm/migrate.c
@@ -28,8 +28,6 @@
#include "internal.h"
-#include "internal.h"
-
/* The maximum number of pages to take off the LRU for migration */
#define MIGRATE_CHUNK_SIZE 256
@@ -225,6 +223,7 @@ int migrate_page_remove_references(struc
if (page_mapcount(page))
return -EAGAIN;
+ SetPageNoNewRefs(page);
write_lock_irq(&mapping->tree_lock);
radix_pointer = (struct page **)radix_tree_lookup_slot(
@@ -234,6 +233,7 @@ int migrate_page_remove_references(struc
if (!page_mapping(page) || page_count(page) != nr_refs ||
*radix_pointer != page) {
write_unlock_irq(&mapping->tree_lock);
+ ClearPageNoNewRefs(page);
return 1;
}
@@ -253,9 +253,13 @@ int migrate_page_remove_references(struc
set_page_private(newpage, page_private(page));
}
- *radix_pointer = newpage;
+ SetPageNoNewRefs(newpage);
+ rcu_assign_pointer(*radix_pointer, newpage);
+
+ write_unlock_irq(&mapping->tree_lock);
__put_page(page);
- write_unlock_irq(&mapping->tree_lock);
+ ClearPageNoNewRefs(page);
+ ClearPageNoNewRefs(newpage);
return 0;
}
* Re: [patch 2/3] mm: speculative get_page
2006-04-04 9:32 ` [patch 2/3] mm: speculative get_page Nick Piggin
@ 2006-04-04 9:47 ` Andrew Morton
2006-04-04 10:21 ` Nick Piggin
2006-04-04 15:20 ` Christoph Lameter
2006-04-04 15:21 ` Christoph Lameter
2 siblings, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2006-04-04 9:47 UTC (permalink / raw)
To: Nick Piggin; +Cc: linux-kernel, linux-mm
Nick Piggin <npiggin@suse.de> wrote:
>
> +static inline struct page *page_cache_get_speculative(struct page **pagep)
Seems rather large to inline.
> +{
> + struct page *page;
> +
> + VM_BUG_ON(in_interrupt());
> +
> +#ifndef CONFIG_SMP
> + page = *pagep;
> + if (unlikely(!page))
> + return NULL;
> +
> + VM_BUG_ON(!in_atomic());
This will go blam if !CONFIG_PREEMPT.
* Re: [patch 2/3] mm: speculative get_page
2006-04-04 9:47 ` Andrew Morton
@ 2006-04-04 10:21 ` Nick Piggin
0 siblings, 0 replies; 11+ messages in thread
From: Nick Piggin @ 2006-04-04 10:21 UTC (permalink / raw)
To: Andrew Morton; +Cc: Nick Piggin, linux-kernel, linux-mm
On Tue, Apr 04, 2006 at 02:47:15AM -0700, Andrew Morton wrote:
> Nick Piggin <npiggin@suse.de> wrote:
> >
> > +static inline struct page *page_cache_get_speculative(struct page **pagep)
>
> Seems rather large to inline.
>
Possibly... with all the debugging turned off, it is only atomic_inc
on UP, and atomic_inc_not_zero + several branches and barriers on SMP.
With only two callsites, I figure it is probably OK to be inline. It
probably looks bigger than it is...
> > +{
> > + struct page *page;
> > +
> > + VM_BUG_ON(in_interrupt());
> > +
> > +#ifndef CONFIG_SMP
> > + page = *pagep;
> > + if (unlikely(!page))
> > + return NULL;
> > +
> > + VM_BUG_ON(!in_atomic());
>
> This will go blam if !CONFIG_PREEMPT.
Hmm yes. Is there a safe way to do that? I guess it is pretty trivially
safe under rcu_read_lock, so that check can probably just be removed.
* Re: [patch 2/3] mm: speculative get_page
2006-04-04 9:32 ` [patch 2/3] mm: speculative get_page Nick Piggin
2006-04-04 9:47 ` Andrew Morton
@ 2006-04-04 15:20 ` Christoph Lameter
2006-04-05 0:22 ` Nick Piggin
2006-04-04 15:21 ` Christoph Lameter
2 siblings, 1 reply; 11+ messages in thread
From: Christoph Lameter @ 2006-04-04 15:20 UTC (permalink / raw)
To: Nick Piggin; +Cc: Andrew Morton, Linux Kernel, Linux Memory Management
Looks like the NoNewRefs flag is mostly ==
spin_is_locked(mapping->tree_lock)? Would it not be better to check the
tree_lock?
> --- linux-2.6.orig/mm/migrate.c
> +++ linux-2.6/mm/migrate.c
>
> + SetPageNoNewRefs(page);
> write_lock_irq(&mapping->tree_lock);
A dream come true! If this is really working as it sounds then we can
move the SetPageNoNewRefs up and avoid the final check under
mapping->tree_lock. Then keep SetPageNoNewRefs until the page has been
copied. It would basically play the same role as locking the page.
* Re: [patch 2/3] mm: speculative get_page
2006-04-04 15:20 ` Christoph Lameter
@ 2006-04-05 0:22 ` Nick Piggin
0 siblings, 0 replies; 11+ messages in thread
From: Nick Piggin @ 2006-04-05 0:22 UTC (permalink / raw)
To: Christoph Lameter
Cc: Nick Piggin, Andrew Morton, Linux Kernel, Linux Memory Management
Christoph Lameter wrote:
> Looks like the NoNewRefs flag is mostly ==
> spin_is_locked(mapping->tree_lock)? Would it not be better to check the
> tree_lock?
>
Well there are other uses for the tree_lock (eg. tag operations)
which do not need the "no new references" guarantee.
>
>
>>--- linux-2.6.orig/mm/migrate.c
>>+++ linux-2.6/mm/migrate.c
>>
>>+ SetPageNoNewRefs(page);
>> write_lock_irq(&mapping->tree_lock);
>
>
> A dream come true! If this is really working as it sounds then we can
> move the SetPageNoNewRefs up and avoid the final check under
> mapping->tree_lock. Then keep SetPageNoNewRefs until the page has been
> copied. It would basically play the same role as locking the page.
>
Yes we could do that, but at this stage I wouldn't like to separate
SetPageNoNewRefs from tree_lock, as it is replacing a traditional
guarantee that tree_lock no longer provides.
--
SUSE Labs, Novell Inc.
* Re: [patch 2/3] mm: speculative get_page
2006-04-04 9:32 ` [patch 2/3] mm: speculative get_page Nick Piggin
2006-04-04 9:47 ` Andrew Morton
2006-04-04 15:20 ` Christoph Lameter
@ 2006-04-04 15:21 ` Christoph Lameter
2006-04-05 0:27 ` Nick Piggin
2 siblings, 1 reply; 11+ messages in thread
From: Christoph Lameter @ 2006-04-04 15:21 UTC (permalink / raw)
To: Nick Piggin; +Cc: Andrew Morton, Linux Kernel, Linux Memory Management
On Tue, 4 Apr 2006, Nick Piggin wrote:
> + /*
> + * PageNoNewRefs is set in order to prevent new references to the
> + * page (eg. before it gets removed from pagecache). Wait until it
> + * becomes clear (and checks below will ensure we still have the
> + * correct one).
> + */
> + while (unlikely(PageNoNewRefs(page)))
> + cpu_relax();
That part looks suspiciously like we need some sort of lock here.
* Re: [patch 2/3] mm: speculative get_page
2006-04-04 15:21 ` Christoph Lameter
@ 2006-04-05 0:27 ` Nick Piggin
0 siblings, 0 replies; 11+ messages in thread
From: Nick Piggin @ 2006-04-05 0:27 UTC (permalink / raw)
To: Christoph Lameter
Cc: Nick Piggin, Andrew Morton, Linux Kernel, Linux Memory Management
Christoph Lameter wrote:
> On Tue, 4 Apr 2006, Nick Piggin wrote:
>
>
>>+ /*
>>+ * PageNoNewRefs is set in order to prevent new references to the
>>+ * page (eg. before it gets removed from pagecache). Wait until it
>>+ * becomes clear (and checks below will ensure we still have the
>>+ * correct one).
>>+ */
>>+ while (unlikely(PageNoNewRefs(page)))
>>+ cpu_relax();
>
>
> That part looks suspiciously like we need some sort of lock here.
>
It's very lightweight now. A lock of course would only be page-local,
so it wouldn't really harm scalability; however, it would slow down the
single-threaded case. At the moment, single-threaded find_get_page is
anywhere from about 15-100% faster than before the lockless patches.
I don't see why you think there needs to be a lock. Before the write
side clears PageNoNewRefs, it will have moved 'page' out of the pagecache,
so when this loop breaks, the subsequent test will fail and this
function will be retried.
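Spelled out, the write-side ordering being relied on is roughly the
following (a condensed sketch of remove_mapping() with the patch applied;
'expected_refs' and 'removed' are stand-ins for the caller's known
reference count and result, and the barrier details are elided -- see the
pattern documented in the page_cache_get_speculative() comment):

	SetPageNoNewRefs(page);
	write_lock_irq(&mapping->tree_lock);
	if (page_count(page) == expected_refs) {
		/* no speculative reference snuck in: safe to remove */
		__remove_from_page_cache(page);
		removed = 1;
	}
	write_unlock_irq(&mapping->tree_lock);
	ClearPageNoNewRefs(page);
	/*
	 * A getter that raised the count before the check above prevents
	 * the removal; one that raced later will see *pagep change and
	 * drop its reference.
	 */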
--
SUSE Labs, Novell Inc.
* [patch 3/3] mm: lockless pagecache lookups
2006-04-04 9:31 [patch 0/3] lockless pagecache Nick Piggin
2006-04-04 9:31 ` [patch 1/3] radix tree: RCU lockless read-side Nick Piggin
2006-04-04 9:32 ` [patch 2/3] mm: speculative get_page Nick Piggin
@ 2006-04-04 9:32 ` Nick Piggin
2 siblings, 0 replies; 11+ messages in thread
From: Nick Piggin @ 2006-04-04 9:32 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linux Kernel, Nick Piggin, Linux Memory Management
Use page_cache_get_speculative and lockless radix tree lookups to
introduce lockless page cache lookups (ie. no mapping->tree_lock).
The only atomicity change this should introduce is the use of a
non-atomic pagevec lookup for truncate; however, whatever atomicity
guarantees there might have been were not used anyway, because
the size of the pagevec is not guaranteed (eg. it might be 1).
Signed-off-by: Nick Piggin <npiggin@suse.de>
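For callers, the converted truncate loop ends up looking roughly like
this (a condensed sketch; 'mapping' and 'start' are the caller's, and the
per-page work is elided):

	struct pagevec pvec;
	pgoff_t next = start;
	int i;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup_nonatomic(&pvec, mapping, next, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			next = page->index + 1;
			/*
			 * Each page carries its own speculative reference;
			 * the pagevec as a whole is not an atomic snapshot,
			 * so page->mapping and page->index must be
			 * re-checked under the page lock, as truncate
			 * already does.
			 */
		}
		pagevec_release(&pvec);
	}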
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -424,7 +424,6 @@ int add_to_page_cache(struct page *page,
}
return error;
}
-
EXPORT_SYMBOL(add_to_page_cache);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
@@ -547,21 +546,21 @@ void fastcall __lock_page(struct page *p
EXPORT_SYMBOL(__lock_page);
/*
- * a rather lightweight function, finding and getting a reference to a
- * hashed page atomically.
+ * find_get_page finds and gets a reference to a pagecache page.
*/
-struct page * find_get_page(struct address_space *mapping, unsigned long offset)
+struct page *find_get_page(struct address_space *mapping, unsigned long offset)
{
- struct page *page;
+ struct page **pagep;
+ struct page *page = NULL;
- read_lock_irq(&mapping->tree_lock);
- page = radix_tree_lookup(&mapping->page_tree, offset);
- if (page)
- page_cache_get(page);
- read_unlock_irq(&mapping->tree_lock);
+ rcu_read_lock();
+ pagep = (struct page **)radix_tree_lookup_slot(&mapping->page_tree,
+ offset);
+ if (likely(pagep))
+ page = page_cache_get_speculative(pagep);
+ rcu_read_unlock();
return page;
}
-
EXPORT_SYMBOL(find_get_page);
/*
@@ -597,26 +596,17 @@ struct page *find_lock_page(struct addre
{
struct page *page;
- read_lock_irq(&mapping->tree_lock);
repeat:
- page = radix_tree_lookup(&mapping->page_tree, offset);
+ page = find_get_page(mapping, offset);
if (page) {
- page_cache_get(page);
- if (TestSetPageLocked(page)) {
- read_unlock_irq(&mapping->tree_lock);
- __lock_page(page);
- read_lock_irq(&mapping->tree_lock);
-
- /* Has the page been truncated while we slept? */
- if (unlikely(page->mapping != mapping ||
- page->index != offset)) {
- unlock_page(page);
- page_cache_release(page);
- goto repeat;
- }
+ lock_page(page);
+ /* Has the page been truncated while we slept? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
}
}
- read_unlock_irq(&mapping->tree_lock);
return page;
}
@@ -699,6 +689,32 @@ unsigned find_get_pages(struct address_s
return ret;
}
+unsigned find_get_pages_nonatomic(struct address_space *mapping, pgoff_t start,
+ unsigned int nr_pages, struct page **pages)
+{
+ unsigned int i;
+ unsigned int nr_found;
+ unsigned int ret;
+
+ /*
+ * We do some unsightly casting to use the array first for storing
+ * pointers to the page pointers, and then for the pointers to
+ * the pages themselves that the caller wants.
+ */
+ rcu_read_lock();
+ nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
+ (void ***)pages, start, nr_pages);
+ ret = 0;
+ for (i = 0; i < nr_found; i++) {
+ struct page *page;
+ page = page_cache_get_speculative(((struct page ***)pages)[i]);
+ if (likely(page))
+ pages[ret++] = page;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
/*
* Like find_get_pages, except we only return pages which are tagged with
* `tag'. We update *index to index the next page for the traversal.
Index: linux-2.6/mm/readahead.c
===================================================================
--- linux-2.6.orig/mm/readahead.c
+++ linux-2.6/mm/readahead.c
@@ -286,27 +286,26 @@ __do_page_cache_readahead(struct address
/*
* Preallocate as many pages as we will need.
*/
- read_lock_irq(&mapping->tree_lock);
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
pgoff_t page_offset = offset + page_idx;
if (page_offset > end_index)
break;
+ /* Don't need mapping->tree_lock - lookup can be racy */
+ rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, page_offset);
+ rcu_read_unlock();
if (page)
continue;
- read_unlock_irq(&mapping->tree_lock);
page = page_cache_alloc_cold(mapping);
- read_lock_irq(&mapping->tree_lock);
if (!page)
break;
page->index = page_offset;
list_add(&page->lru, &page_pool);
ret++;
}
- read_unlock_irq(&mapping->tree_lock);
/*
* Now start the IO. We ignore I/O errors - if the page is not
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -165,6 +165,8 @@ extern struct page * find_or_create_page
unsigned long index, gfp_t gfp_mask);
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages);
+unsigned find_get_pages_nonatomic(struct address_space *mapping, pgoff_t start,
+ unsigned int nr_pages, struct page **pages);
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
int tag, unsigned int nr_pages, struct page **pages);
Index: linux-2.6/include/linux/pagevec.h
===================================================================
--- linux-2.6.orig/include/linux/pagevec.h
+++ linux-2.6/include/linux/pagevec.h
@@ -28,6 +28,8 @@ void __pagevec_lru_add_active(struct pag
void pagevec_strip(struct pagevec *pvec);
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned nr_pages);
+unsigned pagevec_lookup_nonatomic(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t start, unsigned nr_pages);
unsigned pagevec_lookup_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, int tag,
unsigned nr_pages);
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c
+++ linux-2.6/mm/swap.c
@@ -427,6 +427,20 @@ unsigned pagevec_lookup(struct pagevec *
EXPORT_SYMBOL(pagevec_lookup);
+/**
+ * pagevec_lookup_nonatomic - non-atomic pagevec_lookup
+ *
+ * Like pagevec_lookup, but the result is not an atomic snapshot of the mapping.
+ */
+unsigned pagevec_lookup_nonatomic(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t start, unsigned nr_pages)
+{
+ pvec->nr = find_get_pages_nonatomic(mapping, start,
+ nr_pages, pvec->pages);
+ return pagevec_count(pvec);
+}
+EXPORT_SYMBOL(pagevec_lookup_nonatomic);
+
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
pgoff_t *index, int tag, unsigned nr_pages)
{
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c
+++ linux-2.6/mm/truncate.c
@@ -124,7 +124,7 @@ void truncate_inode_pages_range(struct a
pagevec_init(&pvec, 0);
next = start;
while (next <= end &&
- pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ pagevec_lookup_nonatomic(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t page_index = page->index;
@@ -163,7 +163,7 @@ void truncate_inode_pages_range(struct a
next = start;
for ( ; ; ) {
cond_resched();
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ if (!pagevec_lookup_nonatomic(&pvec, mapping, next, PAGEVEC_SIZE)) {
if (next == start)
break;
next = start;
@@ -227,7 +227,7 @@ unsigned long invalidate_mapping_pages(s
pagevec_init(&pvec, 0);
while (next <= end &&
- pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ pagevec_lookup_nonatomic(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -284,7 +284,7 @@ int invalidate_inode_pages2_range(struct
pagevec_init(&pvec, 0);
next = start;
while (next <= end && !ret && !wrapped &&
- pagevec_lookup(&pvec, mapping, next,
+ pagevec_lookup_nonatomic(&pvec, mapping, next,
min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -816,17 +816,15 @@ int test_set_page_writeback(struct page
EXPORT_SYMBOL(test_set_page_writeback);
/*
- * Return true if any of the pages in the mapping are marged with the
+ * Return true if any of the pages in the mapping are marked with the
* passed tag.
*/
int mapping_tagged(struct address_space *mapping, int tag)
{
- unsigned long flags;
int ret;
-
- read_lock_irqsave(&mapping->tree_lock, flags);
+ rcu_read_lock();
ret = radix_tree_tagged(&mapping->page_tree, tag);
- read_unlock_irqrestore(&mapping->tree_lock, flags);
+ rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL(mapping_tagged);
* [patch 2/3] mm: speculative get_page
2006-03-10 15:18 A lockless pagecache for Linux Nick Piggin
@ 2006-03-10 15:18 ` Nick Piggin
0 siblings, 0 replies; 11+ messages in thread
From: Nick Piggin @ 2006-03-10 15:18 UTC (permalink / raw)
To: Linux Kernel, Linux Memory Management; +Cc: Nick Piggin
If we can be sure that elevating the page_count on a pagecache
page will pin it, we can speculatively run this operation, and
subsequently check to see if we hit the right page rather than
relying on holding a lock or otherwise pinning a reference to the
page.
This can be done if get_page/put_page behaves consistently
throughout the whole tree (ie. if we "get" the page after it has
been used for something else, we must be able to free it with a
put_page).
Actually, there is a period where the count behaves differently:
when the page is free or if it is a constituent page of a compound
page. We need an atomic_inc_not_zero operation to ensure we don't
try to grab the page in either case.
Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -76,6 +76,9 @@
#define PG_nosave_free 18 /* Free, should not be written */
#define PG_uncached 19 /* Page has been mapped as uncached */
+#define PG_nonewrefs 20 /* Block concurrent pagecache lookups
+ * while testing refcount */
+
/*
* Global page accounting. One instance per CPU. Only unsigned longs are
* allowed.
@@ -346,6 +349,11 @@ extern void __mod_page_state_offset(unsi
#define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags)
#define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags)
+#define PageNoNewRefs(page) test_bit(PG_nonewrefs, &(page)->flags)
+#define SetPageNoNewRefs(page) set_bit(PG_nonewrefs, &(page)->flags)
+#define ClearPageNoNewRefs(page) clear_bit(PG_nonewrefs, &(page)->flags)
+#define __ClearPageNoNewRefs(page) __clear_bit(PG_nonewrefs, &(page)->flags)
+
struct page; /* forward declaration */
int test_clear_page_dirty(struct page *page);
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -11,6 +11,8 @@
#include <linux/compiler.h>
#include <asm/uaccess.h>
#include <linux/gfp.h>
+#include <linux/page-flags.h>
+#include <linux/hardirq.h> /* for in_interrupt() */
/*
* Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page
@@ -51,6 +53,91 @@ static inline void mapping_set_gfp_mask(
#define page_cache_release(page) put_page(page)
void release_pages(struct page **pages, int nr, int cold);
+static inline struct page *page_cache_get_speculative(struct page **pagep)
+{
+ struct page *page;
+
+ VM_BUG_ON(in_interrupt());
+
+#ifndef CONFIG_SMP
+ page = *pagep;
+ if (unlikely(!page))
+ return NULL;
+
+ VM_BUG_ON(!in_atomic());
+ /*
+ * Preempt must be disabled here - we rely on rcu_read_lock doing
+ * this for us.
+ *
+ * Pagecache won't be truncated from interrupt context, so if we have
+ * found a page in the radix tree here, we have pinned its refcount by
+ * disabling preempt, and hence no need for the "speculative get" that
+ * SMP requires.
+ */
+ VM_BUG_ON(page_count(page) == 0);
+ atomic_inc(&page->_count);
+ VM_BUG_ON(page != *pagep);
+
+#else
+ again:
+ page = rcu_dereference(*pagep);
+ if (unlikely(!page))
+ return NULL;
+
+ if (unlikely(!get_page_unless_zero(page)))
+ goto again; /* page has been freed */
+
+ /*
+ * Note that get_page_unless_zero provides a memory barrier.
+ * This is needed to ensure PageNoNewRefs is evaluated after the
+ * page refcount has been raised. See below comment.
+ */
+
+ /*
+ * PageNoNewRefs is set in order to prevent new references to the
+ * page (eg. before it gets removed from pagecache). Wait until it
+ * becomes clear (and checks below will ensure we still have the
+ * correct one).
+ */
+ while (unlikely(PageNoNewRefs(page)))
+ cpu_relax();
+
+ /*
+ * smp_rmb is to ensure the load of page->flags (for PageNoNewRefs())
+ * is performed before the load of *pagep in the below comparison.
+ *
+ * Those places that set PageNoNewRefs have the following pattern:
+ * SetPageNoNewRefs(page)
+ * wmb();
+ * if (page_count(page) == X)
+ * remove page from pagecache
+ * wmb();
+ * ClearPageNoNewRefs(page)
+ *
+ * So PageNoNewRefs() becomes clear _after_ we've elevated page
+ * refcount, then either the page will be safely pinned in pagecache,
+ * or it will have been already removed. In the latter case, *pagep
+ * will be changed in the below test - provided it is loaded after
+ * testing PageNoNewRefs() (which is what the smp_rmb is for).
+ *
+ * If the load was out of order, *pagep might be loaded before the
+ * page is removed from pagecache while PageNoNewRefs evaluated after
+ * the ClearPageNoNewRefs().
+ */
+ smp_rmb();
+
+ if (unlikely(page != *pagep)) {
+ /* page no longer at *pagep */
+ put_page(page);
+ goto again;
+ }
+
+#endif
+ VM_BUG_ON(PageCompound(page) && (struct page *)page_private(page) != page);
+
+ return page;
+}
+
static inline struct page *page_cache_alloc(struct address_space *x)
{
return alloc_pages(mapping_gfp_mask(x), 0);
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c
+++ linux-2.6/mm/vmscan.c
@@ -383,6 +383,7 @@ static int remove_mapping(struct address
if (!mapping)
return 0; /* truncate got there first */
+ SetPageNoNewRefs(page);
write_lock_irq(&mapping->tree_lock);
/*
@@ -401,17 +402,20 @@ static int remove_mapping(struct address
__delete_from_swap_cache(page);
write_unlock_irq(&mapping->tree_lock);
swap_free(swap);
- __put_page(page); /* The pagecache ref */
- return 1;
+ goto free_it;
}
__remove_from_page_cache(page);
write_unlock_irq(&mapping->tree_lock);
- __put_page(page);
+
+free_it:
+ __ClearPageNoNewRefs(page);
+ __put_page(page); /* The pagecache ref */
return 1;
cannot_free:
write_unlock_irq(&mapping->tree_lock);
+ ClearPageNoNewRefs(page);
return 0;
}
@@ -731,6 +735,7 @@ int migrate_page_remove_references(struc
if (page_mapcount(page))
return 1;
+ SetPageNoNewRefs(page);
write_lock_irq(&mapping->tree_lock);
radix_pointer = (struct page **)radix_tree_lookup_slot(
@@ -740,6 +745,7 @@ int migrate_page_remove_references(struc
if (!page_mapping(page) || page_count(page) != nr_refs ||
*radix_pointer != page) {
write_unlock_irq(&mapping->tree_lock);
+ ClearPageNoNewRefs(page);
return 1;
}
@@ -758,10 +764,14 @@ int migrate_page_remove_references(struc
SetPageSwapCache(newpage);
set_page_private(newpage, page_private(page));
}
+ SetPageNoNewRefs(newpage);
+
+ rcu_assign_pointer(*radix_pointer, newpage);
- *radix_pointer = newpage;
- __put_page(page);
write_unlock_irq(&mapping->tree_lock);
+ __put_page(page);
+ ClearPageNoNewRefs(page);
+ ClearPageNoNewRefs(newpage);
return 0;
}
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -400,6 +400,7 @@ int add_to_page_cache(struct page *page,
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
+ SetPageNoNewRefs(page);
write_lock_irq(&mapping->tree_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
if (!error) {
@@ -411,6 +412,7 @@ int add_to_page_cache(struct page *page,
pagecache_acct(1);
}
write_unlock_irq(&mapping->tree_lock);
+ ClearPageNoNewRefs(page);
radix_tree_preload_end();
}
return error;
Index: linux-2.6/mm/swap_state.c
===================================================================
--- linux-2.6.orig/mm/swap_state.c
+++ linux-2.6/mm/swap_state.c
@@ -77,6 +77,7 @@ static int __add_to_swap_cache(struct pa
BUG_ON(PagePrivate(page));
error = radix_tree_preload(gfp_mask);
if (!error) {
+ SetPageNoNewRefs(page);
write_lock_irq(&swapper_space.tree_lock);
error = radix_tree_insert(&swapper_space.page_tree,
entry.val, page);
@@ -89,6 +90,7 @@ static int __add_to_swap_cache(struct pa
pagecache_acct(1);
}
write_unlock_irq(&swapper_space.tree_lock);
+ ClearPageNoNewRefs(page);
radix_tree_preload_end();
}
return error;