* [patch 2/2] mm: lockless pagecache
@ 2006-07-26  6:39 Nick Piggin
From: Nick Piggin @ 2006-07-26  6:39 UTC (permalink / raw)
  To: Andrew Morton, Linux Memory Management List

Combine page_cache_get_speculative with lockless radix tree lookups to
introduce lockless page cache lookups (ie. no mapping->tree_lock on
the read-side).
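
For context, page_cache_get_speculative comes from patch 1/2. The
following is a minimal sketch of the pattern it is assumed to implement,
based purely on how it is used below; the real helper (including the
NoNewRefs handshake) lives in that patch and may differ in detail:

static inline struct page *page_cache_get_speculative(struct page *page)
{
	/* refuse a new reference if the count already hit zero */
	if (unlikely(!atomic_inc_not_zero(&page->_count)))
		return NULL;	/* caller retries the lookup */
	/*
	 * Assumed: patch 1/2 marks pages NoNewRefs while removing them
	 * from the radix tree, so spin until any such remover is done.
	 */
	while (unlikely(PageNoNewRefs(page)))
		cpu_relax();
	smp_rmb();	/* order the refcount against later checks */
	return page;
}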

The only atomicity change this introduces is that the gang pagecache
lookup functions now behave as if they are implemented with multiple
find_get_page calls, rather than operating on a snapshot of the pages.
In practice, this atomicity guarantee is not used anyway, and it is
difficult to see how it could be. Gang pagecache lookups are designed
to replace individual lookups, so these semantics are natural.

Swapcache can no longer use find_get_page, because it has a different
method of encoding swapcache position into the page. Introduce a new
find_get_swap_page for it.

Signed-off-by: Nick Piggin <npiggin@suse.de>

 include/linux/swap.h |    1
 mm/filemap.c         |  161 +++++++++++++++++++++++++++++++++++++--------------
 mm/page-writeback.c  |    8 --
 mm/readahead.c       |    7 --
 mm/swap_state.c      |   27 +++++++-
 mm/swapfile.c        |    2
 6 files changed, 150 insertions(+), 56 deletions(-)

Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -613,11 +613,22 @@ struct page *find_trylock_page(struct ad
 {
 	struct page *page;
 
-	read_lock_irq(&mapping->tree_lock);
+	rcu_read_lock();
+repeat:
 	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page && TestSetPageLocked(page))
-		page = NULL;
-	read_unlock_irq(&mapping->tree_lock);
+	if (page) {
+		page = page_cache_get_speculative(page);
+		if (unlikely(!page))
+			goto repeat;
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping
+				|| page->index != offset)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+
 	return page;
 }
 EXPORT_SYMBOL(find_trylock_page);
@@ -637,26 +648,25 @@ struct page *find_lock_page(struct addre
 {
 	struct page *page;
 
-	read_lock_irq(&mapping->tree_lock);
 repeat:
+	rcu_read_lock();
 	page = radix_tree_lookup(&mapping->page_tree, offset);
 	if (page) {
-		page_cache_get(page);
-		if (TestSetPageLocked(page)) {
-			read_unlock_irq(&mapping->tree_lock);
-			__lock_page(page);
-			read_lock_irq(&mapping->tree_lock);
-
-			/* Has the page been truncated while we slept? */
-			if (unlikely(page->mapping != mapping ||
-				     page->index != offset)) {
-				unlock_page(page);
-				page_cache_release(page);
-				goto repeat;
-			}
+		page = page_cache_get_speculative(page);
+		rcu_read_unlock();
+		if (unlikely(!page))
+			goto repeat;
+		lock_page(page);
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping
+				|| page->index != offset)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
 		}
-	}
-	read_unlock_irq(&mapping->tree_lock);
+	} else
+		rcu_read_unlock();
+
 	return page;
 }
 EXPORT_SYMBOL(find_lock_page);
@@ -724,16 +734,40 @@ EXPORT_SYMBOL(find_or_create_page);
 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 			    unsigned int nr_pages, struct page **pages)
 {
+
 	unsigned int i;
-	unsigned int ret;
+	unsigned int nr_found;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
+	rcu_read_lock();
+repeat:
+	nr_found = radix_tree_gang_lookup(&mapping->page_tree,
 				(void **)pages, start, nr_pages);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
-	read_unlock_irq(&mapping->tree_lock);
-	return ret;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+		page = page_cache_get_speculative(pages[i]);
+		if (unlikely(!page)) {
+bail:
+			/*
+			 * must return at least 1 page, so caller continues
+			 * calling in.
+			 */
+			if (i == 0)
+				goto repeat;
+			break;
+		}
+
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping
+				|| page->index < start)) {
+			page_cache_release(page);
+			goto bail;
+		}
+
+		/* ensure we don't pick up pages that have moved behind us */
+		start = page->index+1;
+	}
+	rcu_read_unlock();
+	return i;
 }
 
 /**
@@ -752,19 +786,35 @@ unsigned find_get_pages_contig(struct ad
 			       unsigned int nr_pages, struct page **pages)
 {
 	unsigned int i;
-	unsigned int ret;
+	unsigned int nr_found;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
+	rcu_read_lock();
+repeat:
+	nr_found = radix_tree_gang_lookup(&mapping->page_tree,
 				(void **)pages, index, nr_pages);
-	for (i = 0; i < ret; i++) {
-		if (pages[i]->mapping == NULL || pages[i]->index != index)
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+		page = page_cache_get_speculative(pages[i]);
+		if (unlikely(!page)) {
+bail:
+			/*
+			 * must return at least 1 page, so caller continues
+			 * calling in.
+			 */
+			if (i == 0)
+				goto repeat;
 			break;
+		}
 
-		page_cache_get(pages[i]);
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping
+				|| page->index != index)) {
+			page_cache_release(page);
+			goto bail;
+		}
 		index++;
 	}
-	read_unlock_irq(&mapping->tree_lock);
+	rcu_read_unlock();
 	return i;
 }
 
@@ -783,17 +833,40 @@ unsigned find_get_pages_tag(struct addre
 			int tag, unsigned int nr_pages, struct page **pages)
 {
 	unsigned int i;
-	unsigned int ret;
+	unsigned int nr_found;
+	pgoff_t start = *index;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
-				(void **)pages, *index, nr_pages, tag);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
-	if (ret)
-		*index = pages[ret - 1]->index + 1;
-	read_unlock_irq(&mapping->tree_lock);
-	return ret;
+	rcu_read_lock();
+repeat:
+	nr_found = radix_tree_gang_lookup_tag(&mapping->page_tree,
+				(void **)pages, start, nr_pages, tag);
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+		page = page_cache_get_speculative(pages[i]);
+		if (unlikely(!page)) {
+bail:
+			/*
+			 * must return at least 1 page, so caller continues
+			 * calling in.
+			 */
+			if (i == 0)
+				goto repeat;
+			break;
+		}
+
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping
+				|| page->index < start)) {
+			page_cache_release(page);
+			goto bail;
+		}
+
+		/* ensure we don't pick up pages that have moved behind us */
+		start = page->index+1;
+	}
+	rcu_read_unlock();
+	*index = start;
+	return i;
 }
 
 /**
Index: linux-2.6/mm/readahead.c
===================================================================
--- linux-2.6.orig/mm/readahead.c
+++ linux-2.6/mm/readahead.c
@@ -282,27 +282,26 @@ __do_page_cache_readahead(struct address
 	/*
 	 * Preallocate as many pages as we will need.
 	 */
-	read_lock_irq(&mapping->tree_lock);
 	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
 		pgoff_t page_offset = offset + page_idx;
 		
 		if (page_offset > end_index)
 			break;
 
+		/* Don't need mapping->tree_lock - lookup can be racy */
+		rcu_read_lock();
 		page = radix_tree_lookup(&mapping->page_tree, page_offset);
+		rcu_read_unlock();
 		if (page)
 			continue;
 
-		read_unlock_irq(&mapping->tree_lock);
 		page = page_cache_alloc_cold(mapping);
-		read_lock_irq(&mapping->tree_lock);
 		if (!page)
 			break;
 		page->index = page_offset;
 		list_add(&page->lru, &page_pool);
 		ret++;
 	}
-	read_unlock_irq(&mapping->tree_lock);
 
 	/*
 	 * Now start the IO.  We ignore I/O errors - if the page is not
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -803,17 +803,15 @@ int test_set_page_writeback(struct page 
 EXPORT_SYMBOL(test_set_page_writeback);
 
 /*
- * Return true if any of the pages in the mapping are marged with the
+ * Return true if any of the pages in the mapping are marked with the
  * passed tag.
  */
 int mapping_tagged(struct address_space *mapping, int tag)
 {
-	unsigned long flags;
 	int ret;
-
-	read_lock_irqsave(&mapping->tree_lock, flags);
+	rcu_read_lock();
 	ret = radix_tree_tagged(&mapping->page_tree, tag);
-	read_unlock_irqrestore(&mapping->tree_lock, flags);
+	rcu_read_unlock();
 	return ret;
 }
 EXPORT_SYMBOL(mapping_tagged);
Index: linux-2.6/include/linux/swap.h
===================================================================
--- linux-2.6.orig/include/linux/swap.h
+++ linux-2.6/include/linux/swap.h
@@ -226,6 +226,7 @@ extern int move_from_swap_cache(struct p
 		struct address_space *);
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
+extern struct page * find_get_swap_page(swp_entry_t);
 extern struct page * lookup_swap_cache(swp_entry_t);
 extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma,
 					   unsigned long addr);
Index: linux-2.6/mm/swap_state.c
===================================================================
--- linux-2.6.orig/mm/swap_state.c
+++ linux-2.6/mm/swap_state.c
@@ -293,6 +293,29 @@ void free_pages_and_swap_cache(struct pa
 	}
 }
 
+struct page *find_get_swap_page(swp_entry_t entry)
+{
+	struct page *page;
+
+	rcu_read_lock();
+repeat:
+	page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+	if (page) {
+		page = page_cache_get_speculative(page);
+		if (unlikely(!page))
+			goto repeat;
+		/* Has the page been truncated? */
+		if (unlikely(!PageSwapCache(page)
+				|| page_private(page) != entry.val)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+
+	return page;
+}
+
 /*
  * Lookup a swap entry in the swap cache. A found page will be returned
  * unlocked and with its refcount incremented - we rely on the kernel
@@ -303,7 +326,7 @@ struct page * lookup_swap_cache(swp_entr
 {
 	struct page *page;
 
-	page = find_get_page(&swapper_space, entry.val);
+	page = find_get_swap_page(entry);
 
 	if (page)
 		INC_CACHE_INFO(find_success);
@@ -330,7 +353,7 @@ struct page *read_swap_cache_async(swp_e
 		 * called after lookup_swap_cache() failed, re-calling
 		 * that would confuse statistics.
 		 */
-		found_page = find_get_page(&swapper_space, entry.val);
+		found_page = find_get_swap_page(entry);
 		if (found_page)
 			break;
 
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c
+++ linux-2.6/mm/swapfile.c
@@ -400,7 +400,7 @@ void free_swap_and_cache(swp_entry_t ent
 	p = swap_info_get(entry);
 	if (p) {
 		if (swap_entry_free(p, swp_offset(entry)) == 1) {
-			page = find_get_page(&swapper_space, entry.val);
+			page = find_get_swap_page(entry);
 			if (page && unlikely(TestSetPageLocked(page))) {
 				page_cache_release(page);
 				page = NULL;


* Re: [patch 2/2] mm: lockless pagecache
From: Andy Whitcroft @ 2006-07-31 15:36 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Linux Memory Management List

Nick Piggin wrote:
> Combine page_cache_get_speculative with lockless radix tree lookups to
> introduce lockless page cache lookups (ie. no mapping->tree_lock on
> the read-side).
> 
> The only atomicity change this introduces is that the gang pagecache
> lookup functions now behave as if they are implemented with multiple
> find_get_page calls, rather than operating on a snapshot of the pages.
> In practice, this atomicity guarantee is not used anyway, and it is
> difficult to see how it could be. Gang pagecache lookups are designed
> to replace individual lookups, so these semantics are natural.
> 
> Swapcache can no longer use find_get_page, because it has a different
> method of encoding swapcache position into the page. Introduce a new
> find_get_swap_page for it.
> 
> Signed-off-by: Nick Piggin <npiggin@suse.de>
> 
>  include/linux/swap.h |    1
>  mm/filemap.c         |  161 +++++++++++++++++++++++++++++++++++++--------------
>  mm/page-writeback.c  |    8 --
>  mm/readahead.c       |    7 --
>  mm/swap_state.c      |   27 +++++++-
>  mm/swapfile.c        |    2
>  6 files changed, 150 insertions(+), 56 deletions(-)
>

It seems in these routines you have two different placements for the rcu 
locking, either outside or inside the repeat: loop.  Should we assume that 
those where the lock is outside the repeat: loop have very light payloads?

> Index: linux-2.6/mm/filemap.c
> ===================================================================
> --- linux-2.6.orig/mm/filemap.c
> +++ linux-2.6/mm/filemap.c
> @@ -613,11 +613,22 @@ struct page *find_trylock_page(struct ad
>  {
>  	struct page *page;
>  
> -	read_lock_irq(&mapping->tree_lock);
> +	rcu_read_lock();
> +repeat:
>  	page = radix_tree_lookup(&mapping->page_tree, offset);
> -	if (page && TestSetPageLocked(page))
> -		page = NULL;
> -	read_unlock_irq(&mapping->tree_lock);
> +	if (page) {
> +		page = page_cache_get_speculative(page);
> +		if (unlikely(!page))
> +			goto repeat;
> +		/* Has the page been truncated? */
> +		if (unlikely(page->mapping != mapping
> +				|| page->index != offset)) {
> +			page_cache_release(page);
> +			goto repeat;
> +		}
> +	}
> +	rcu_read_unlock();
> +
>  	return page;
>  }

This one has me puzzled.  This seems to no longer lock the page at all 
when returning it, so the semantics have changed wildly.  Also, 
find_lock_page below still seems to lock the page; the semantics seem 
maintained there?  I think I am expecting to find a 
TestSetPageLocked() in the new version too?

>  EXPORT_SYMBOL(find_trylock_page);
> @@ -637,26 +648,25 @@ struct page *find_lock_page(struct addre
>  {
>  	struct page *page;
>  
> -	read_lock_irq(&mapping->tree_lock);
>  repeat:
> +	rcu_read_lock();
>  	page = radix_tree_lookup(&mapping->page_tree, offset);
>  	if (page) {
> -		page_cache_get(page);
> -		if (TestSetPageLocked(page)) {
> -			read_unlock_irq(&mapping->tree_lock);
> -			__lock_page(page);
> -			read_lock_irq(&mapping->tree_lock);
> -
> -			/* Has the page been truncated while we slept? */
> -			if (unlikely(page->mapping != mapping ||
> -				     page->index != offset)) {
> -				unlock_page(page);
> -				page_cache_release(page);
> -				goto repeat;
> -			}
> +		page = page_cache_get_speculative(page);
> +		rcu_read_unlock();
> +		if (unlikely(!page))
> +			goto repeat;
> +		lock_page(page);
> +		/* Has the page been truncated? */
> +		if (unlikely(page->mapping != mapping
> +				|| page->index != offset)) {
> +			unlock_page(page);
> +			page_cache_release(page);
> +			goto repeat;
>  		}
> -	}
> -	read_unlock_irq(&mapping->tree_lock);
> +	} else
> +		rcu_read_unlock();
> +
>  	return page;
>  }
>  EXPORT_SYMBOL(find_lock_page);
> @@ -724,16 +734,40 @@ EXPORT_SYMBOL(find_or_create_page);
>  unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
>  			    unsigned int nr_pages, struct page **pages)
>  {
> +
>  	unsigned int i;
> -	unsigned int ret;
> +	unsigned int nr_found;
>  
> -	read_lock_irq(&mapping->tree_lock);
> -	ret = radix_tree_gang_lookup(&mapping->page_tree,
> +	rcu_read_lock();
> +repeat:
> +	nr_found = radix_tree_gang_lookup(&mapping->page_tree,
>  				(void **)pages, start, nr_pages);
> -	for (i = 0; i < ret; i++)
> -		page_cache_get(pages[i]);
> -	read_unlock_irq(&mapping->tree_lock);
> -	return ret;
> +	for (i = 0; i < nr_found; i++) {
> +		struct page *page;
> +		page = page_cache_get_speculative(pages[i]);
> +		if (unlikely(!page)) {
> +bail:
> +			/*
> +			 * must return at least 1 page, so caller continues
> +			 * calling in.

Although that is a reasonable semantic, several callers seem to expect 
all-or-nothing semantics here, mostly the direct callers to 
find_get_pages().  The callers using pagevec_lookup() at least seem to 
cope with a partial fill as implemented here.

> +			 */
> +			if (i == 0)
> +				goto repeat;
> +			break;
> +		}
> +
> +		/* Has the page been truncated? */
> +		if (unlikely(page->mapping != mapping
> +				|| page->index < start)) {
> +			page_cache_release(page);
> +			goto bail;

I have looked at this check for a while now and I can say I am troubled 
by it.  We do not know which page we are looking up, so can we truly say 
the index check here is sufficient?  Also, could not the start= below 
lead us to follow a moving page and skip pages?  Perhaps there is no way 
to get any sort of guarantee with this interface before or after this 
change; and all is well?  Tell me it is :).

> +		}
> +
> +		/* ensure we don't pick up pages that have moved behind us */
> +		start = page->index+1;
> +	}
> +	rcu_read_unlock();
> +	return i;
>  }
>  
>  /**
> @@ -752,19 +786,35 @@ unsigned find_get_pages_contig(struct ad
>  			       unsigned int nr_pages, struct page **pages)
>  {
>  	unsigned int i;
> -	unsigned int ret;
> +	unsigned int nr_found;
>  
> -	read_lock_irq(&mapping->tree_lock);
> -	ret = radix_tree_gang_lookup(&mapping->page_tree,
> +	rcu_read_lock();
> +repeat:
> +	nr_found = radix_tree_gang_lookup(&mapping->page_tree,
>  				(void **)pages, index, nr_pages);
> -	for (i = 0; i < ret; i++) {
> -		if (pages[i]->mapping == NULL || pages[i]->index != index)
> +	for (i = 0; i < nr_found; i++) {
> +		struct page *page;
> +		page = page_cache_get_speculative(pages[i]);
> +		if (unlikely(!page)) {
> +bail:
> +			/*
> +			 * must return at least 1 page, so caller continues
> +			 * calling in.
> +			 */
> +			if (i == 0)
> +				goto repeat;
>  			break;
> +		}
>  
> -		page_cache_get(pages[i]);
> +		/* Has the page been truncated? */
> +		if (unlikely(page->mapping != mapping
> +				|| page->index != index)) {
> +			page_cache_release(page);
> +			goto bail;
> +		}

Ok, normally this construct is checking against the page at 
(mapping,index), so it is very unlikely that the index does not match. 
However, in this case we are doing a contiguity scan, so in fact the 
likelihood of this mismatching is more defined by the likelihood of 
contiguity in the mapping.  The check originally had no such hints?  Is 
it appropriate to have a hint here?

>  		index++;
>  	}
> -	read_unlock_irq(&mapping->tree_lock);
> +	rcu_read_unlock();
>  	return i;
>  }
>  
> @@ -783,17 +833,40 @@ unsigned find_get_pages_tag(struct addre
>  			int tag, unsigned int nr_pages, struct page **pages)
>  {
>  	unsigned int i;
> -	unsigned int ret;
> +	unsigned int nr_found;
> +	pgoff_t start = *index;
>  
> -	read_lock_irq(&mapping->tree_lock);
> -	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
> -				(void **)pages, *index, nr_pages, tag);
> -	for (i = 0; i < ret; i++)
> -		page_cache_get(pages[i]);
> -	if (ret)
> -		*index = pages[ret - 1]->index + 1;
> -	read_unlock_irq(&mapping->tree_lock);
> -	return ret;
> +	rcu_read_lock();
> +repeat:
> +	nr_found = radix_tree_gang_lookup_tag(&mapping->page_tree,
> +				(void **)pages, start, nr_pages, tag);
> +	for (i = 0; i < nr_found; i++) {
> +		struct page *page;
> +		page = page_cache_get_speculative(pages[i]);
> +		if (unlikely(!page)) {
> +bail:
> +			/*
> +			 * must return at least 1 page, so caller continues
> +			 * calling in.
> +			 */
> +			if (i == 0)
> +				goto repeat;
> +			break;
> +		}
> +
> +		/* Has the page been truncated? */
> +		if (unlikely(page->mapping != mapping
> +				|| page->index < start)) {
> +			page_cache_release(page);
> +			goto bail;
> +		}

Same concern about < start for this one too.

> +
> +		/* ensure we don't pick up pages that have moved behind us */
> +		start = page->index+1;
> +	}
> +	rcu_read_unlock();
> +	*index = start;
> +	return i;
>  }
>  
>  /**
> Index: linux-2.6/mm/readahead.c
> ===================================================================
> --- linux-2.6.orig/mm/readahead.c
> +++ linux-2.6/mm/readahead.c
> @@ -282,27 +282,26 @@ __do_page_cache_readahead(struct address
>  	/*
>  	 * Preallocate as many pages as we will need.
>  	 */
> -	read_lock_irq(&mapping->tree_lock);
>  	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
>  		pgoff_t page_offset = offset + page_idx;
>  		
>  		if (page_offset > end_index)
>  			break;
>  
> +		/* Don't need mapping->tree_lock - lookup can be racy */
> +		rcu_read_lock();
>  		page = radix_tree_lookup(&mapping->page_tree, page_offset);
> +		rcu_read_unlock();
>  		if (page)
>  			continue;
>  
> -		read_unlock_irq(&mapping->tree_lock);
>  		page = page_cache_alloc_cold(mapping);
> -		read_lock_irq(&mapping->tree_lock);
>  		if (!page)
>  			break;
>  		page->index = page_offset;
>  		list_add(&page->lru, &page_pool);
>  		ret++;
>  	}
> -	read_unlock_irq(&mapping->tree_lock);
>  
>  	/*
>  	 * Now start the IO.  We ignore I/O errors - if the page is not
> Index: linux-2.6/mm/page-writeback.c
> ===================================================================
> --- linux-2.6.orig/mm/page-writeback.c
> +++ linux-2.6/mm/page-writeback.c
> @@ -803,17 +803,15 @@ int test_set_page_writeback(struct page 
>  EXPORT_SYMBOL(test_set_page_writeback);
>  
>  /*
> - * Return true if any of the pages in the mapping are marged with the
> + * Return true if any of the pages in the mapping are marked with the
>   * passed tag.
>   */
>  int mapping_tagged(struct address_space *mapping, int tag)
>  {
> -	unsigned long flags;
>  	int ret;
> -
> -	read_lock_irqsave(&mapping->tree_lock, flags);
> +	rcu_read_lock();
>  	ret = radix_tree_tagged(&mapping->page_tree, tag);
> -	read_unlock_irqrestore(&mapping->tree_lock, flags);
> +	rcu_read_unlock();
>  	return ret;
>  }
>  EXPORT_SYMBOL(mapping_tagged);
> Index: linux-2.6/include/linux/swap.h
> ===================================================================
> --- linux-2.6.orig/include/linux/swap.h
> +++ linux-2.6/include/linux/swap.h
> @@ -226,6 +226,7 @@ extern int move_from_swap_cache(struct p
>  		struct address_space *);
>  extern void free_page_and_swap_cache(struct page *);
>  extern void free_pages_and_swap_cache(struct page **, int);
> +extern struct page * find_get_swap_page(swp_entry_t);
>  extern struct page * lookup_swap_cache(swp_entry_t);
>  extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma,
>  					   unsigned long addr);
> Index: linux-2.6/mm/swap_state.c
> ===================================================================
> --- linux-2.6.orig/mm/swap_state.c
> +++ linux-2.6/mm/swap_state.c
> @@ -293,6 +293,29 @@ void free_pages_and_swap_cache(struct pa
>  	}
>  }
>  
> +struct page *find_get_swap_page(swp_entry_t entry)
> +{
> +	struct page *page;
> +
> +	rcu_read_lock();
> +repeat:
> +	page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
> +	if (page) {
> +		page = page_cache_get_speculative(page);
> +		if (unlikely(!page))
> +			goto repeat;
> +		/* Has the page been truncated? */
> +		if (unlikely(!PageSwapCache(page)
> +				|| page_private(page) != entry.val)) {
> +			page_cache_release(page);
> +			goto repeat;
> +		}
> +	}
> +	rcu_read_unlock();
> +
> +	return page;
> +}
> +
>  /*
>   * Lookup a swap entry in the swap cache. A found page will be returned
>   * unlocked and with its refcount incremented - we rely on the kernel
> @@ -303,7 +326,7 @@ struct page * lookup_swap_cache(swp_entr
>  {
>  	struct page *page;
>  
> -	page = find_get_page(&swapper_space, entry.val);
> +	page = find_get_swap_page(entry);
>  
>  	if (page)
>  		INC_CACHE_INFO(find_success);
> @@ -330,7 +353,7 @@ struct page *read_swap_cache_async(swp_e
>  		 * called after lookup_swap_cache() failed, re-calling
>  		 * that would confuse statistics.
>  		 */
> -		found_page = find_get_page(&swapper_space, entry.val);
> +		found_page = find_get_swap_page(entry);
>  		if (found_page)
>  			break;
>  
> Index: linux-2.6/mm/swapfile.c
> ===================================================================
> --- linux-2.6.orig/mm/swapfile.c
> +++ linux-2.6/mm/swapfile.c
> @@ -400,7 +400,7 @@ void free_swap_and_cache(swp_entry_t ent
>  	p = swap_info_get(entry);
>  	if (p) {
>  		if (swap_entry_free(p, swp_offset(entry)) == 1) {
> -			page = find_get_page(&swapper_space, entry.val);
> +			page = find_get_swap_page(entry);
>  			if (page && unlikely(TestSetPageLocked(page))) {
>  				page_cache_release(page);
>  				page = NULL;
> 

-apw


* Re: [patch 2/2] mm: lockless pagecache
From: Hugh Dickins @ 2006-07-31 18:34 UTC (permalink / raw)
  To: Andy Whitcroft; +Cc: Nick Piggin, Andrew Morton, Linux Memory Management List

On Mon, 31 Jul 2006, Andy Whitcroft wrote:
> > Index: linux-2.6/mm/filemap.c
> > ===================================================================
> > --- linux-2.6.orig/mm/filemap.c
> > +++ linux-2.6/mm/filemap.c
> > @@ -613,11 +613,22 @@ struct page *find_trylock_page(struct ad
> ....
> 
> This one has me puzzled.  This seems to no longer lock the page at all
> when returning it, so the semantics have changed wildly.  Also,
> find_lock_page below still seems to lock the page; the semantics seem
> maintained there?  I think I am expecting to find a TestSetPageLocked()
> in the new version too?

Whereas find_get_page, which should be the centre-piece of the patch,
is unchanged and using read_lock_irq(&mapping->tree_lock) as before.

It looks like the code seen in find_trylock_page is actually what should
be in find_get_page.  It doesn't matter too much what find_trylock_page
does, since it's deprecated and nothing in tree now uses it; but it ought
to TestSetPageLocked and page_cache_release somewhere, to suit remaining
out-of-tree users.
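
A rough sketch of the shape described above, keeping the old contract
(locked page returned without an extra reference) on top of a speculative
find_get_page; this is a hypothetical reconstruction, not code from
either patch:

struct page *find_trylock_page(struct address_space *mapping,
			       unsigned long offset)
{
	struct page *page;

	page = find_get_page(mapping, offset);	/* speculative body */
	if (page && TestSetPageLocked(page)) {
		page_cache_release(page);
		return NULL;
	}
	if (page)
		page_cache_release(page);	/* old callers expect no
						   reference; PG_locked keeps
						   truncation away meanwhile */
	return page;
}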

Hugh


* Re: [patch 2/2] mm: lockless pagecache
From: Nick Piggin @ 2006-08-01  9:04 UTC (permalink / raw)
  To: Andy Whitcroft; +Cc: Andrew Morton, Linux Memory Management List

On Mon, Jul 31, 2006 at 04:36:05PM +0100, Andy Whitcroft wrote:
> Nick Piggin wrote:
> >Combine page_cache_get_speculative with lockless radix tree lookups to
> >introduce lockless page cache lookups (ie. no mapping->tree_lock on
> >the read-side).
> >
> >The only atomicity change this introduces is that the gang pagecache
> >lookup functions now behave as if they are implemented with multiple
> >find_get_page calls, rather than operating on a snapshot of the pages.
> >In practice, this atomicity guarantee is not used anyway, and it is
> >difficult to see how it could be. Gang pagecache lookups are designed
> >to replace individual lookups, so these semantics are natural.
> >
> >Swapcache can no longer use find_get_page, because it has a different
> >method of encoding swapcache position into the page. Introduce a new
> >find_get_swap_page for it.
> >
> >Signed-off-by: Nick Piggin <npiggin@suse.de>
> >
> > include/linux/swap.h |    1
> > mm/filemap.c         |  161 +++++++++++++++++++++++++++++++++++++--------------
> > mm/page-writeback.c  |    8 --
> > mm/readahead.c       |    7 --
> > mm/swap_state.c      |   27 +++++++-
> > mm/swapfile.c        |    2
> > 6 files changed, 150 insertions(+), 56 deletions(-)
> >
> 
> It seems in these routines you have two different placements for the rcu 
> locking, either outside or inside the repeat: loop.  Should we assume that 
> those where the lock is outside the repeat: loop have very light payloads?

Yeah, I think the "inside" rcu locking is just used where we have to
sleep.  Indeed, the loop should be very light; I expect it will almost
never retry, and only rarely spin on NoNewRefs.
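
Condensed, the two placements being contrasted look like this
(illustration only):

/*
 * Non-sleeping lookups hold the RCU read lock across the whole retry
 * loop:
 *
 *	rcu_read_lock();
 * repeat:
 *	page = radix_tree_lookup(...);
 *	... speculative get, maybe goto repeat ...
 *	rcu_read_unlock();
 *
 * find_lock_page cannot sleep inside an RCU read-side section, so it
 * takes the lock inside the loop and drops it before lock_page():
 *
 * repeat:
 *	rcu_read_lock();
 *	page = radix_tree_lookup(...);
 *	... speculative get ...
 *	rcu_read_unlock();
 *	lock_page(page);
 */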

> >@@ -613,11 +613,22 @@ struct page *find_trylock_page(struct ad
> > {
> > 	struct page *page;
> > 
> >-	read_lock_irq(&mapping->tree_lock);
> >+	rcu_read_lock();
> >+repeat:
> > 	page = radix_tree_lookup(&mapping->page_tree, offset);
> >-	if (page && TestSetPageLocked(page))
> >-		page = NULL;
> >-	read_unlock_irq(&mapping->tree_lock);
> >+	if (page) {
> >+		page = page_cache_get_speculative(page);
> >+		if (unlikely(!page))
> >+			goto repeat;
> >+		/* Has the page been truncated? */
> >+		if (unlikely(page->mapping != mapping
> >+				|| page->index != offset)) {
> >+			page_cache_release(page);
> >+			goto repeat;
> >+		}
> >+	}
> >+	rcu_read_unlock();
> >+
> > 	return page;
> > }
> 
> This one has me puzzled.  This seems to no longer lock the page at all 
> when returning it, so the semantics have changed wildly.  Also, 
> find_lock_page below still seems to lock the page; the semantics seem 
> maintained there?  I think I am expecting to find a 
> TestSetPageLocked() in the new version too?

Yeah, looks like I stuffed up merging it. Didn't notice because nobody
uses find_trylock_page (as Hugh points out, this should be the find_get_page
body, and find_trylock should be unchanged).

> >+	rcu_read_lock();
> >+repeat:
> >+	nr_found = radix_tree_gang_lookup(&mapping->page_tree,
> > 				(void **)pages, start, nr_pages);
> >-	for (i = 0; i < ret; i++)
> >-		page_cache_get(pages[i]);
> >-	read_unlock_irq(&mapping->tree_lock);
> >-	return ret;
> >+	for (i = 0; i < nr_found; i++) {
> >+		struct page *page;
> >+		page = page_cache_get_speculative(pages[i]);
> >+		if (unlikely(!page)) {
> >+bail:
> >+			/*
> >+			 * must return at least 1 page, so caller continues
> >+			 * calling in.
> 
> Although that is a reasonable semantic, several callers seem to expect 
> all-or-nothing semantics here, mostly the direct callers to 
> find_get_pages().  The callers using pagevec_lookup() at least seem to 
> cope with a partial fill as implemented here.

I don't see any problems with find_get_pages callers (splice and ramfs).
(hmm, ramfs should be moved over to find_get_pages_contig).  You see, if
the pages being looked up are getting chucked out of pagecache anyway,
then it could be valid to return a short count.

Actually: find_get_pages is a little iffy, but ramfs is happy with that.
Maybe I'll reintroduce my find_get_pages_nonatomic, and leave fgps under
lock.

pagevec_lookup users seem to be fine. Maybe an API rename is in order
for them too, though.
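
For reference, a typical pagevec_lookup consumer has roughly this shape,
which is why a short (but non-empty) fill is harmless there.  This is an
illustrative sketch, not any specific in-tree caller, and the function
name is made up:

static void walk_mapping_pages(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t next = 0;
	unsigned int i, nr;

	pagevec_init(&pvec, 0);
	while ((nr = pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))) {
		for (i = 0; i < nr; i++) {
			struct page *page = pvec.pages[i];

			/* resume after the last page actually seen, so
			 * a short fill just costs another iteration */
			next = page->index + 1;
			/* ... per-page work ... */
		}
		pagevec_release(&pvec);
	}
}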

> 
> >+			 */
> >+			if (i == 0)
> >+				goto repeat;
> >+			break;
> >+		}
> >+
> >+		/* Has the page been truncated? */
> >+		if (unlikely(page->mapping != mapping
> >+				|| page->index < start)) {
> >+			page_cache_release(page);
> >+			goto bail;
> 
> I have looked at this check for a while now and I can say I am troubled 
> by it.  We do not know which page we are looking up, so can we truly say 
> the index check here is sufficient?  Also, could not the start= below 
> lead us to follow a moving page and skip pages?  Perhaps there is no way

Hmm, it may move upwards and lead us to skip I think. Good point, I'll
look at that.
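
One concrete interleaving of the kind described, with made-up indices:

/*
 * Hypothetical race, indices for illustration only:
 *
 *   gang lookup returns the pages at indices 4, 5 and 6;
 *   we take a reference on the page at 4, so start = 5;
 *   the page we saw at 5 is truncated, and its struct page is reused
 *   at index 9 of the same mapping;
 *   the check sees mapping intact and 9 >= start, so we keep it and
 *   set start = 10;
 *   the page at 6 now fails the index check and we bail, so the
 *   caller's next call resumes at index 10, skipping 6..8.
 */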

> to get any sort of guarantee with this interface before or after this 
> change; and all is well?  Tell me it is :).

Well the gang lookups were generally introduced to replace multiple calls
to fgp, so callers don't use the requirement of an atomic snapshot. However
you can still use the tree_lock to get the old semantics.
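
A caller that really wants the old snapshot semantics can wrap the gang
lookup in the tree lock itself; this just mirrors the code the patch
removes:

	read_lock_irq(&mapping->tree_lock);
	ret = radix_tree_gang_lookup(&mapping->page_tree,
				(void **)pages, start, nr_pages);
	for (i = 0; i < ret; i++)
		page_cache_get(pages[i]);
	read_unlock_irq(&mapping->tree_lock);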

> 
> >+		}
> >+
> >+		/* ensure we don't pick up pages that have moved behind us */
> >+		start = page->index+1;
> >+	}
> >+	rcu_read_unlock();
> >+	return i;
> > }
> > 
> > /**
> >@@ -752,19 +786,35 @@ unsigned find_get_pages_contig(struct ad
> > 			       unsigned int nr_pages, struct page **pages)
> > {
> > 	unsigned int i;
> >-	unsigned int ret;
> >+	unsigned int nr_found;
> > 
> >-	read_lock_irq(&mapping->tree_lock);
> >-	ret = radix_tree_gang_lookup(&mapping->page_tree,
> >+	rcu_read_lock();
> >+repeat:
> >+	nr_found = radix_tree_gang_lookup(&mapping->page_tree,
> > 				(void **)pages, index, nr_pages);
> >-	for (i = 0; i < ret; i++) {
> >-		if (pages[i]->mapping == NULL || pages[i]->index != index)
> >+	for (i = 0; i < nr_found; i++) {
> >+		struct page *page;
> >+		page = page_cache_get_speculative(pages[i]);
> >+		if (unlikely(!page)) {
> >+bail:
> >+			/*
> >+			 * must return at least 1 page, so caller continues
> >+			 * calling in.
> >+			 */
> >+			if (i == 0)
> >+				goto repeat;
> > 			break;
> >+		}
> > 
> >-		page_cache_get(pages[i]);
> >+		/* Has the page been truncated? */
> >+		if (unlikely(page->mapping != mapping
> >+				|| page->index != index)) {
> >+			page_cache_release(page);
> >+			goto bail;
> >+		}
> 
> Ok, normally this construct is checking against the page at 
> (mapping,index), so it is very unlikely that the index does not match. 
> However, in this case we are doing a contiguity scan, so in fact the 
> likelihood of this mismatching is more defined by the likelihood of 
> contiguity in the mapping.  The check originally had no such hints?  Is 
> it appropriate to have a hint here?

Yes.  We have nothing protecting the page against being truncated or
moved by splice (whereas previously tree_lock would do it).

Thanks again for the review. I'll try to fix up the outstanding
issues you identified.

Nick

