[rfc][patch] shmem: don't zero full-page writes
From: Nick Piggin @ 2006-10-14 5:59 UTC
To: Hugh Dickins, Linux Memory Management List
Just while looking at the peripheral code around the pagecache deadlocks
problem, I noticed we might be able to speed up shmem a bit. This patch
isn't well tested when shmem goes into swap, but before wasting more time on
it I just wanted to see if there is a fundamental reason why we're not doing
this?
--
Don't zero out newly allocated tmpfs pages if we're about to write a full
page to them anyway, and also don't bother to read them in from swap.
Increases aligned write bandwidth by about 30% for 4M writes to shmfs
in RAM, and about 7% for 4K writes. Not tested with swap backed shm yet;
the improvement will be much larger but it should be much less common.
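
For reference, a user-space probe along these lines can reproduce the
aligned-write measurement (a rough sketch only, not part of the patch; it
assumes /dev/shm is a tmpfs mount, and the filename is arbitrary):

/*
 * Throwaway tmpfs write-bandwidth probe.  Writes 4MB per pass in
 * fixed-size, page-aligned chunks; set chunk to 4096 or 4 << 20 to
 * mimic the 4K / 4M cases quoted above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

int main(void)
{
	const size_t chunk = 4096;		/* or 4 << 20 */
	const size_t total = 4 << 20;		/* 4MB written per pass */
	const int passes = 1000;
	char *buf = malloc(chunk);
	struct timeval t0, t1;
	double secs;
	size_t done;
	int fd, pass;

	if (!buf)
		return 1;
	memset(buf, 0xab, chunk);
	fd = open("/dev/shm/shmem-bw", O_CREAT | O_TRUNC | O_WRONLY, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	gettimeofday(&t0, NULL);
	for (pass = 0; pass < passes; pass++) {
		lseek(fd, 0, SEEK_SET);
		for (done = 0; done < total; done += chunk)
			if (write(fd, buf, chunk) != (ssize_t)chunk) {
				perror("write");
				return 1;
			}
	}
	gettimeofday(&t1, NULL);

	secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) / 1e6;
	printf("%.1f MB/s\n", passes * (total / (double)(1 << 20)) / secs);
	unlink("/dev/shm/shmem-bw");
	return 0;
}
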
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c
+++ linux-2.6/mm/shmem.c
@@ -81,6 +81,7 @@ enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
SGP_WRITE, /* may exceed i_size, may allocate page */
+ SGP_WRITE_FULL, /* same as SGP_WRITE, full page write */
SGP_FAULT, /* same as SGP_CACHE, return with page locked */
};
@@ -348,7 +349,7 @@ static swp_entry_t *shmem_swp_alloc(stru
struct page *page = NULL;
swp_entry_t *entry;
- if (sgp != SGP_WRITE &&
+ if (sgp != SGP_WRITE && sgp != SGP_WRITE_FULL &&
((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
return ERR_PTR(-EINVAL);
@@ -381,7 +382,7 @@ static swp_entry_t *shmem_swp_alloc(stru
shmem_free_blocks(inode, 1);
return ERR_PTR(-ENOMEM);
}
- if (sgp != SGP_WRITE &&
+ if (sgp != SGP_WRITE && sgp != SGP_WRITE_FULL &&
((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
entry = ERR_PTR(-EINVAL);
break;
@@ -976,7 +977,7 @@ shmem_alloc_page(gfp_t gfp, struct shmem
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
pvma.vm_pgoff = idx;
pvma.vm_end = PAGE_SIZE;
- page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
+ page = alloc_page_vma(gfp, &pvma, 0);
mpol_free(pvma.vm_policy);
return page;
}
@@ -996,10 +997,30 @@ shmem_swapin(struct shmem_inode_info *in
static inline struct page *
shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
{
- return alloc_page(gfp | __GFP_ZERO);
+ return alloc_page(gfp);
}
#endif
+static inline int
+shmem_acct_page(struct inode *inode, struct shmem_inode_info *info)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+ if (sbinfo->max_blocks) {
+ spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->free_blocks == 0 ||
+ shmem_acct_block(info->flags)) {
+ spin_unlock(&sbinfo->stat_lock);
+ return -ENOSPC;
+ }
+ sbinfo->free_blocks--;
+ inode->i_blocks += BLOCKS_PER_PAGE;
+ spin_unlock(&sbinfo->stat_lock);
+ } else if (shmem_acct_block(info->flags)) {
+ return -ENOSPC;
+ }
+ return 0;
+}
/*
* shmem_getpage - either get the page from swap or allocate a new one
*
@@ -1012,12 +1033,13 @@ static int shmem_getpage(struct inode *i
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo;
+ struct page *cache = NULL;
struct page *filepage = *pagep;
struct page *swappage;
swp_entry_t *entry;
swp_entry_t swap;
int error;
+ int free_swap;
if (idx >= SHMEM_MAX_INDEX)
return -EFBIG;
@@ -1034,10 +1056,26 @@ static int shmem_getpage(struct inode *i
* and may need to be copied from the swappage read in.
*/
repeat:
+ free_swap = 0;
if (!filepage)
filepage = find_lock_page(mapping, idx);
- if (filepage && PageUptodate(filepage))
- goto done;
+ if (filepage) {
+ if (PageUptodate(filepage) || sgp == SGP_WRITE_FULL)
+ goto done;
+ } else if (sgp != SGP_QUICK && sgp != SGP_READ) {
+ gfp_t gfp = mapping_gfp_mask(mapping);
+ if (sgp != SGP_WRITE_FULL)
+ gfp |= __GFP_ZERO;
+ cache = shmem_alloc_page(mapping_gfp_mask(mapping), info, idx);
+ if (sgp != SGP_WRITE_FULL) {
+ flush_dcache_page(filepage);
+ SetPageUptodate(filepage); /* could be non-atomic */
+ }
+
+ if (!cache)
+ return -ENOMEM;
+ }
+
error = 0;
if (sgp == SGP_QUICK)
goto failed;
@@ -1056,6 +1094,12 @@ repeat:
/* Look it up and read it in.. */
swappage = lookup_swap_cache(swap);
if (!swappage) {
+ if (sgp == SGP_WRITE_FULL) {
+ /* May throw away the backing swap */
+ free_swap = 1;
+ goto not_swap_backed;
+ }
+
shmem_swp_unmap(entry);
/* here we actually do the io */
if (type && *type == VM_FAULT_MINOR) {
@@ -1154,63 +1198,38 @@ repeat:
spin_unlock(&info->lock);
} else {
shmem_swp_unmap(entry);
- sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks == 0 ||
- shmem_acct_block(info->flags)) {
- spin_unlock(&sbinfo->stat_lock);
- spin_unlock(&info->lock);
- error = -ENOSPC;
- goto failed;
- }
- sbinfo->free_blocks--;
- inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
- } else if (shmem_acct_block(info->flags)) {
+ error = shmem_acct_page(inode, info);
+ if (error) {
spin_unlock(&info->lock);
- error = -ENOSPC;
goto failed;
}
if (!filepage) {
- spin_unlock(&info->lock);
- filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
- info,
- idx);
- if (!filepage) {
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- error = -ENOMEM;
- goto failed;
- }
-
- spin_lock(&info->lock);
- entry = shmem_swp_alloc(info, idx, sgp);
- if (IS_ERR(entry))
- error = PTR_ERR(entry);
- else {
- swap = *entry;
- shmem_swp_unmap(entry);
- }
- if (error || swap.val || 0 != add_to_page_cache_lru(
- filepage, mapping, idx, GFP_ATOMIC)) {
+not_swap_backed:
+ BUG_ON(!cache);
+ filepage = cache;
+ if (add_to_page_cache_lru(filepage, mapping,
+ idx, GFP_ATOMIC)) {
+ if (free_swap)
+ shmem_swp_unmap(entry);
spin_unlock(&info->lock);
- page_cache_release(filepage);
shmem_unacct_blocks(info->flags, 1);
shmem_free_blocks(inode, 1);
filepage = NULL;
- if (error)
- goto failed;
goto repeat;
}
+ cache = NULL;
info->flags |= SHMEM_PAGEIN;
}
info->alloced++;
+ if (free_swap) {
+ shmem_swp_set(info, entry, 0);
+ shmem_swp_unmap(entry);
+ }
spin_unlock(&info->lock);
- flush_dcache_page(filepage);
- SetPageUptodate(filepage);
+ if (free_swap)
+ swap_free(swap);
}
done:
if (*pagep != filepage) {
@@ -1219,14 +1238,20 @@ done:
unlock_page(filepage);
}
- return 0;
+
+ error = 0;
+out:
+ if (cache)
+ page_cache_release(cache);
+
+ return error;
failed:
if (*pagep != filepage) {
unlock_page(filepage);
page_cache_release(filepage);
}
- return error;
+ goto out;
}
struct page *shmem_fault(struct vm_area_struct *vma, struct fault_data *fdata)
@@ -1379,7 +1404,9 @@ static int
shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
{
struct inode *inode = page->mapping->host;
- return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
+ enum sgp_type sgp = (to - offset == PAGE_CACHE_SIZE) ?
+ SGP_WRITE_FULL : SGP_WRITE;
+ return shmem_getpage(inode, page->index, &page, sgp, NULL);
}
static ssize_t
@@ -1412,8 +1439,10 @@ shmem_file_write(struct file *file, cons
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
do {
+ volatile unsigned char dummy;
struct page *page = NULL;
unsigned long bytes, index, offset;
+ enum sgp_type sgp;
char *kaddr;
int left;
@@ -1429,26 +1458,19 @@ shmem_file_write(struct file *file, cons
* But it still may be a good idea to prefault below.
*/
- err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
+ sgp = (bytes == PAGE_CACHE_SIZE) ? SGP_WRITE_FULL : SGP_WRITE;
+retry:
+ __get_user(dummy, buf);
+ __get_user(dummy, buf + bytes - 1);
+ err = shmem_getpage(inode, index, &page, sgp, NULL);
if (err)
break;
- left = bytes;
- if (PageHighMem(page)) {
- volatile unsigned char dummy;
- __get_user(dummy, buf);
- __get_user(dummy, buf + bytes - 1);
-
- kaddr = kmap_atomic(page, KM_USER0);
- left = __copy_from_user_inatomic(kaddr + offset,
- buf, bytes);
- kunmap_atomic(kaddr, KM_USER0);
- }
- if (left) {
- kaddr = kmap(page);
- left = __copy_from_user(kaddr + offset, buf, bytes);
- kunmap(page);
- }
+ kaddr = kmap_atomic(page, KM_USER0);
+ left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+ if (left)
+ goto retry;
written += bytes;
count -= bytes;
@@ -1458,6 +1480,8 @@ shmem_file_write(struct file *file, cons
i_size_write(inode, pos);
flush_dcache_page(page);
+ if (sgp == SGP_WRITE_FULL)
+ SetPageUptodate(page);
set_page_dirty(page);
mark_page_accessed(page);
page_cache_release(page);
--
Re: [rfc][patch] shmem: don't zero full-page writes
From: Hugh Dickins @ 2006-10-15 17:57 UTC
To: Nick Piggin; +Cc: Linux Memory Management List
On Sat, 14 Oct 2006, Nick Piggin wrote:
> Just while looking at the peripheral code around the pagecache deadlocks
> problem, I noticed we might be able to speed up shmem a bit. This patch
> isn't well tested when shmem goes into swap, but before wasting more time on
> it I just wanted to see if there is a fundamental reason why we're not doing
> this?
No fundamental reason: I did consider doing so a couple of times,
but it's rather messy and nobody was complaining, so I didn't bother.
>
> --
> Don't zero out newly allocated tmpfs pages if we're about to write a full
> page to them anyway, and also don't bother to read them in from swap.
> Increases aligned write bandwidth by about 30% for 4M writes to shmfs
> in RAM, and about 7% for 4K writes. Not tested with swap backed shm yet;
> the improvement will be much larger but it should be much less common.
That's quite nice: though as I say, nobody had complained (I think the
high performance people are usually looking at the shm end rather than
at tmpfs files, so wouldn't see any benefit).
I'd like your patch _so_ much more if it didn't scatter SGP_WRITE_FULL
(and free_swap) conditionals all over: we'd all prefer a simpler
shmem_getpage to a more complex one. That's the messiness that
put me off. I wonder if there's a better way, but don't see it.
I wonder if the patch you tested is this one you've sent out: it
seems uncertain what to do with PageUptodate, and mishandles it.
> - if (filepage && PageUptodate(filepage))
> - goto done;
> + if (filepage) {
> + if (PageUptodate(filepage) || sgp == SGP_WRITE_FULL)
> + goto done;
> + } else if (sgp != SGP_QUICK && sgp != SGP_READ) {
> + gfp_t gfp = mapping_gfp_mask(mapping);
> + if (sgp != SGP_WRITE_FULL)
> + gfp |= __GFP_ZERO;
> + cache = shmem_alloc_page(mapping_gfp_mask(mapping), info, idx);
> + if (sgp != SGP_WRITE_FULL) {
> + flush_dcache_page(filepage);
> + SetPageUptodate(filepage); /* could be non-atomic */
> + }
That's a very odd place to SetPageUptodate(filepage), where filepage
is NULL. Those lines seem to have crept up from much further down.
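
Presumably the intent was more like this (only a guess at the intended
code, a sketch, untested): allocate with the gfp mask computed just
above, and mark the freshly zeroed cache page uptodate rather than the
still-NULL filepage:

	} else if (sgp != SGP_QUICK && sgp != SGP_READ) {
		gfp_t gfp = mapping_gfp_mask(mapping);

		if (sgp != SGP_WRITE_FULL)
			gfp |= __GFP_ZERO;	/* zero unless a full page will be written */
		cache = shmem_alloc_page(gfp, info, idx);	/* use the computed mask */
		if (!cache)
			return -ENOMEM;
		if (sgp != SGP_WRITE_FULL) {
			flush_dcache_page(cache);	/* cache, not the NULL filepage */
			SetPageUptodate(cache);		/* page is private until added below */
		}
	}
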
> +not_swap_backed:
> + BUG_ON(!cache);
> + filepage = cache;
> + if (add_to_page_cache_lru(filepage, mapping,
> + idx, GFP_ATOMIC)) {
> + if (free_swap)
> + shmem_swp_unmap(entry);
> spin_unlock(&info->lock);
> - page_cache_release(filepage);
> shmem_unacct_blocks(info->flags, 1);
> shmem_free_blocks(inode, 1);
> filepage = NULL;
> - if (error)
> - goto failed;
> goto repeat;
You're in danger of leaking your cache page when you go back to repeat
there. But you do indeed want to retry a memory allocation, otherwise
it might cycle around without any memory for the add_to_page_cache_lru.
Perhaps you just need to reinstate the page_cache_release(filepage)?
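
Something like this on that failure path, perhaps (only a sketch of the
suggestion, untested): drop the preallocated page and forget it, so the
repeat pass allocates afresh:

	if (add_to_page_cache_lru(filepage, mapping, idx, GFP_ATOMIC)) {
		if (free_swap)
			shmem_swp_unmap(entry);
		spin_unlock(&info->lock);
		shmem_unacct_blocks(info->flags, 1);
		shmem_free_blocks(inode, 1);
		page_cache_release(filepage);	/* don't leak the cache page */
		cache = NULL;			/* repeat will allocate a new one */
		filepage = NULL;
		goto repeat;
	}
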
But the main problem is over in shmem_file_write:
> @@ -1458,6 +1480,8 @@ shmem_file_write(struct file *file, cons
> i_size_write(inode, pos);
>
> flush_dcache_page(page);
> + if (sgp == SGP_WRITE_FULL)
> + SetPageUptodate(page);
> set_page_dirty(page);
> mark_page_accessed(page);
> page_cache_release(page);
If someone might be waiting for PageUptodate, you ought to wake them.
But in fact nobody does wait for that PageUptodate, and you don't
have pagelock, which is bad news: someone can get in to peep at the
uninitialized page (just as I think you've discovered in ramfs).
I suggest you set this patch aside for now. If your various
truncation/pagelock/deadlock efforts work out, then it should
become natural for us to change shmem_getpage to return with
pagelock held, and then it might as well SetPageUptodate itself
as it does now (prematurely, but that's no matter while the pagelock
is held, since even reading demands the pagelock for a moment).
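
Very roughly, the tail of shmem_getpage might then look like this
(a sketch of the idea only, not code from this patch):

	flush_dcache_page(filepage);
	SetPageUptodate(filepage);	/* premature for a full-page write, but safe:
					 * any reader must take the page lock first */
	*pagep = filepage;		/* returned locked; the caller copies the
					 * data in and only then unlocks the page */
	return 0;
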
And meanwhile we'll maybe think of a more appealing way to do it.
Hugh
--