[rfc][patch] shmem: don't zero full-page writes
From: Nick Piggin @ 2006-10-14 5:59 UTC
To: Hugh Dickins, Linux Memory Management List
Just while looking at the peripheral code around the pagecache deadlocks
problem, I noticed we might be able to speed up shmem a bit. This patch
isn't well tested when shmem goes into swap, but before wasting more time on
it I just wanted to see if there is a fundamental reason why we're not doing
this?
--
Don't zero out newly allocated tmpfs pages if we're about to write a full
page to them anyway, and also don't bother to read them in from swap.
Increases aligned write bandwidth by about 30% for 4M writes to shmfs
in RAM, and about 7% for 4K writes. Not tested with swap backed shm yet;
the improvement will be much larger but it should be much less common.
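
For reference, a user-space probe along these lines can reproduce the
aligned-write measurement (a rough sketch only, not part of the patch; it
assumes /dev/shm is a tmpfs mount, and the filename is arbitrary):

/*
 * Throwaway tmpfs write-bandwidth probe.  Writes 4MB per pass in
 * fixed-size, page-aligned chunks; set chunk to 4096 or 4 << 20 to
 * mimic the 4K / 4M cases quoted above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

int main(void)
{
	const size_t chunk = 4096;		/* or 4 << 20 */
	const size_t total = 4 << 20;		/* 4MB written per pass */
	const int passes = 1000;
	char *buf = malloc(chunk);
	struct timeval t0, t1;
	double secs;
	size_t done;
	int fd, pass;

	if (!buf)
		return 1;
	memset(buf, 0xab, chunk);
	fd = open("/dev/shm/shmem-bw", O_CREAT | O_TRUNC | O_WRONLY, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	gettimeofday(&t0, NULL);
	for (pass = 0; pass < passes; pass++) {
		lseek(fd, 0, SEEK_SET);
		for (done = 0; done < total; done += chunk)
			if (write(fd, buf, chunk) != (ssize_t)chunk) {
				perror("write");
				return 1;
			}
	}
	gettimeofday(&t1, NULL);

	secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) / 1e6;
	printf("%.1f MB/s\n", passes * (total / (double)(1 << 20)) / secs);
	unlink("/dev/shm/shmem-bw");
	return 0;
}
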
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c
+++ linux-2.6/mm/shmem.c
@@ -81,6 +81,7 @@ enum sgp_type {
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
SGP_WRITE, /* may exceed i_size, may allocate page */
+ SGP_WRITE_FULL, /* same as SGP_WRITE, full page write */
SGP_FAULT, /* same as SGP_CACHE, return with page locked */
};
@@ -348,7 +349,7 @@ static swp_entry_t *shmem_swp_alloc(stru
struct page *page = NULL;
swp_entry_t *entry;
- if (sgp != SGP_WRITE &&
+ if (sgp != SGP_WRITE && sgp != SGP_WRITE_FULL &&
((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
return ERR_PTR(-EINVAL);
@@ -381,7 +382,7 @@ static swp_entry_t *shmem_swp_alloc(stru
shmem_free_blocks(inode, 1);
return ERR_PTR(-ENOMEM);
}
- if (sgp != SGP_WRITE &&
+ if (sgp != SGP_WRITE && sgp != SGP_WRITE_FULL &&
((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
entry = ERR_PTR(-EINVAL);
break;
@@ -976,7 +977,7 @@ shmem_alloc_page(gfp_t gfp, struct shmem
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
pvma.vm_pgoff = idx;
pvma.vm_end = PAGE_SIZE;
- page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
+ page = alloc_page_vma(gfp, &pvma, 0);
mpol_free(pvma.vm_policy);
return page;
}
@@ -996,10 +997,30 @@ shmem_swapin(struct shmem_inode_info *in
static inline struct page *
shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
{
- return alloc_page(gfp | __GFP_ZERO);
+ return alloc_page(gfp);
}
#endif
+static inline int
+shmem_acct_page(struct inode *inode, struct shmem_inode_info *info)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+ if (sbinfo->max_blocks) {
+ spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->free_blocks == 0 ||
+ shmem_acct_block(info->flags)) {
+ spin_unlock(&sbinfo->stat_lock);
+ return -ENOSPC;
+ }
+ sbinfo->free_blocks--;
+ inode->i_blocks += BLOCKS_PER_PAGE;
+ spin_unlock(&sbinfo->stat_lock);
+ } else if (shmem_acct_block(info->flags)) {
+ return -ENOSPC;
+ }
+ return 0;
+}
/*
* shmem_getpage - either get the page from swap or allocate a new one
*
@@ -1012,12 +1033,13 @@ static int shmem_getpage(struct inode *i
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo;
+ struct page *cache = NULL;
struct page *filepage = *pagep;
struct page *swappage;
swp_entry_t *entry;
swp_entry_t swap;
int error;
+ int free_swap;
if (idx >= SHMEM_MAX_INDEX)
return -EFBIG;
@@ -1034,10 +1056,26 @@ static int shmem_getpage(struct inode *i
* and may need to be copied from the swappage read in.
*/
repeat:
+ free_swap = 0;
if (!filepage)
filepage = find_lock_page(mapping, idx);
- if (filepage && PageUptodate(filepage))
- goto done;
+ if (filepage) {
+ if (PageUptodate(filepage) || sgp == SGP_WRITE_FULL)
+ goto done;
+ } else if (sgp != SGP_QUICK && sgp != SGP_READ) {
+ gfp_t gfp = mapping_gfp_mask(mapping);
+ if (sgp != SGP_WRITE_FULL)
+ gfp |= __GFP_ZERO;
+ cache = shmem_alloc_page(mapping_gfp_mask(mapping), info, idx);
+ if (sgp != SGP_WRITE_FULL) {
+ flush_dcache_page(filepage);
+ SetPageUptodate(filepage); /* could be non-atomic */
+ }
+
+ if (!cache)
+ return -ENOMEM;
+ }
+
error = 0;
if (sgp == SGP_QUICK)
goto failed;
@@ -1056,6 +1094,12 @@ repeat:
/* Look it up and read it in.. */
swappage = lookup_swap_cache(swap);
if (!swappage) {
+ if (sgp == SGP_WRITE_FULL) {
+ /* May throw away the backing swap */
+ free_swap = 1;
+ goto not_swap_backed;
+ }
+
shmem_swp_unmap(entry);
/* here we actually do the io */
if (type && *type == VM_FAULT_MINOR) {
@@ -1154,63 +1198,38 @@ repeat:
spin_unlock(&info->lock);
} else {
shmem_swp_unmap(entry);
- sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo->max_blocks) {
- spin_lock(&sbinfo->stat_lock);
- if (sbinfo->free_blocks == 0 ||
- shmem_acct_block(info->flags)) {
- spin_unlock(&sbinfo->stat_lock);
- spin_unlock(&info->lock);
- error = -ENOSPC;
- goto failed;
- }
- sbinfo->free_blocks--;
- inode->i_blocks += BLOCKS_PER_PAGE;
- spin_unlock(&sbinfo->stat_lock);
- } else if (shmem_acct_block(info->flags)) {
+ error = shmem_acct_page(inode, info);
+ if (error) {
spin_unlock(&info->lock);
- error = -ENOSPC;
goto failed;
}
if (!filepage) {
- spin_unlock(&info->lock);
- filepage = shmem_alloc_page(mapping_gfp_mask(mapping),
- info,
- idx);
- if (!filepage) {
- shmem_unacct_blocks(info->flags, 1);
- shmem_free_blocks(inode, 1);
- error = -ENOMEM;
- goto failed;
- }
-
- spin_lock(&info->lock);
- entry = shmem_swp_alloc(info, idx, sgp);
- if (IS_ERR(entry))
- error = PTR_ERR(entry);
- else {
- swap = *entry;
- shmem_swp_unmap(entry);
- }
- if (error || swap.val || 0 != add_to_page_cache_lru(
- filepage, mapping, idx, GFP_ATOMIC)) {
+not_swap_backed:
+ BUG_ON(!cache);
+ filepage = cache;
+ if (add_to_page_cache_lru(filepage, mapping,
+ idx, GFP_ATOMIC)) {
+ if (free_swap)
+ shmem_swp_unmap(entry);
spin_unlock(&info->lock);
- page_cache_release(filepage);
shmem_unacct_blocks(info->flags, 1);
shmem_free_blocks(inode, 1);
filepage = NULL;
- if (error)
- goto failed;
goto repeat;
}
+ cache = NULL;
info->flags |= SHMEM_PAGEIN;
}
info->alloced++;
+ if (free_swap) {
+ shmem_swp_set(info, entry, 0);
+ shmem_swp_unmap(entry);
+ }
spin_unlock(&info->lock);
- flush_dcache_page(filepage);
- SetPageUptodate(filepage);
+ if (free_swap)
+ swap_free(swap);
}
done:
if (*pagep != filepage) {
@@ -1219,14 +1238,20 @@ done:
unlock_page(filepage);
}
- return 0;
+
+ error = 0;
+out:
+ if (cache)
+ page_cache_release(cache);
+
+ return error;
failed:
if (*pagep != filepage) {
unlock_page(filepage);
page_cache_release(filepage);
}
- return error;
+ goto out;
}
struct page *shmem_fault(struct vm_area_struct *vma, struct fault_data *fdata)
@@ -1379,7 +1404,9 @@ static int
shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
{
struct inode *inode = page->mapping->host;
- return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
+ enum sgp_type sgp = (to - offset == PAGE_CACHE_SIZE) ?
+ SGP_WRITE_FULL : SGP_WRITE;
+ return shmem_getpage(inode, page->index, &page, sgp, NULL);
}
static ssize_t
@@ -1412,8 +1439,10 @@ shmem_file_write(struct file *file, cons
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
do {
+ volatile unsigned char dummy;
struct page *page = NULL;
unsigned long bytes, index, offset;
+ enum sgp_type sgp;
char *kaddr;
int left;
@@ -1429,26 +1458,19 @@ shmem_file_write(struct file *file, cons
* But it still may be a good idea to prefault below.
*/
- err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
+ sgp = (bytes == PAGE_CACHE_SIZE) ? SGP_WRITE_FULL : SGP_WRITE;
+retry:
+ __get_user(dummy, buf);
+ __get_user(dummy, buf + bytes - 1);
+ err = shmem_getpage(inode, index, &page, sgp, NULL);
if (err)
break;
- left = bytes;
- if (PageHighMem(page)) {
- volatile unsigned char dummy;
- __get_user(dummy, buf);
- __get_user(dummy, buf + bytes - 1);
-
- kaddr = kmap_atomic(page, KM_USER0);
- left = __copy_from_user_inatomic(kaddr + offset,
- buf, bytes);
- kunmap_atomic(kaddr, KM_USER0);
- }
- if (left) {
- kaddr = kmap(page);
- left = __copy_from_user(kaddr + offset, buf, bytes);
- kunmap(page);
- }
+ kaddr = kmap_atomic(page, KM_USER0);
+ left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+ if (left)
+ goto retry;
written += bytes;
count -= bytes;
@@ -1458,6 +1480,8 @@ shmem_file_write(struct file *file, cons
i_size_write(inode, pos);
flush_dcache_page(page);
+ if (sgp == SGP_WRITE_FULL)
+ SetPageUptodate(page);
set_page_dirty(page);
mark_page_accessed(page);
page_cache_release(page);
--
Re: [rfc][patch] shmem: don't zero full-page writes
From: Hugh Dickins @ 2006-10-15 17:57 UTC
To: Nick Piggin; +Cc: Linux Memory Management List
On Sat, 14 Oct 2006, Nick Piggin wrote:
> Just while looking at the peripheral code around the pagecache deadlocks
> problem, I noticed we might be able to speed up shmem a bit. This patch
> isn't well tested when shmem goes into swap, but before wasting more time on
> it I just wanted to see if there is a fundamental reason why we're not doing
> this?
No fundamental reason: I did consider doing so a couple of times,
but it's rather messy and nobody was complaining, so I didn't bother.
>
> --
> Don't zero out newly allocated tmpfs pages if we're about to write a full
> page to them anyway, and also don't bother to read them in from swap.
> Increases aligned write bandwidth by about 30% for 4M writes to shmfs
> in RAM, and about 7% for 4K writes. Not tested with swap backed shm yet;
> the improvement will be much larger but it should be much less common.
That's quite nice: though as I say, nobody had complained (I think the
high performance people are usually looking at the shm end rather than
at tmpfs files, so wouldn't see any benefit).
I'd like your patch _so_ much more if it didn't scatter SGP_WRITE_FULL
(and free_swap) conditionals all over: we'd all prefer a simpler
shmem_getpage to a more complex one. That's the messiness that
put me off. I wonder if there's a better way, but don't see it.
I wonder if the patch you tested is this one you've sent out: it
seems uncertain what to do with PageUptodate, and mishandles it.
> - if (filepage && PageUptodate(filepage))
> - goto done;
> + if (filepage) {
> + if (PageUptodate(filepage) || sgp == SGP_WRITE_FULL)
> + goto done;
> + } else if (sgp != SGP_QUICK && sgp != SGP_READ) {
> + gfp_t gfp = mapping_gfp_mask(mapping);
> + if (sgp != SGP_WRITE_FULL)
> + gfp |= __GFP_ZERO;
> + cache = shmem_alloc_page(mapping_gfp_mask(mapping), info, idx);
> + if (sgp != SGP_WRITE_FULL) {
> + flush_dcache_page(filepage);
> + SetPageUptodate(filepage); /* could be non-atomic */
> + }
That's a very odd place to SetPageUptodate(filepage), where filepage
is NULL. Those lines seem to have crept up from much further down.
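
Presumably the intent was more like this (only a guess at the intended
code, a sketch, untested): allocate with the gfp mask computed just
above, and mark the freshly zeroed cache page uptodate rather than the
still-NULL filepage:

	} else if (sgp != SGP_QUICK && sgp != SGP_READ) {
		gfp_t gfp = mapping_gfp_mask(mapping);

		if (sgp != SGP_WRITE_FULL)
			gfp |= __GFP_ZERO;	/* zero unless a full page will be written */
		cache = shmem_alloc_page(gfp, info, idx);	/* use the computed mask */
		if (!cache)
			return -ENOMEM;
		if (sgp != SGP_WRITE_FULL) {
			flush_dcache_page(cache);	/* cache, not the NULL filepage */
			SetPageUptodate(cache);		/* page is private until added below */
		}
	}
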
> +not_swap_backed:
> + BUG_ON(!cache);
> + filepage = cache;
> + if (add_to_page_cache_lru(filepage, mapping,
> + idx, GFP_ATOMIC)) {
> + if (free_swap)
> + shmem_swp_unmap(entry);
> spin_unlock(&info->lock);
> - page_cache_release(filepage);
> shmem_unacct_blocks(info->flags, 1);
> shmem_free_blocks(inode, 1);
> filepage = NULL;
> - if (error)
> - goto failed;
> goto repeat;
You're in danger of leaking your cache page when you go back to repeat
there. But you do indeed want to retry a memory allocation, otherwise
it might cycle around without any memory for the add_to_page_cache_lru.
Perhaps you just need to reinstate the page_cache_release(filepage)?
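
Something like this on that failure path, perhaps (only a sketch of the
suggestion, untested): drop the preallocated page and forget it, so the
repeat pass allocates afresh:

	if (add_to_page_cache_lru(filepage, mapping, idx, GFP_ATOMIC)) {
		if (free_swap)
			shmem_swp_unmap(entry);
		spin_unlock(&info->lock);
		shmem_unacct_blocks(info->flags, 1);
		shmem_free_blocks(inode, 1);
		page_cache_release(filepage);	/* don't leak the cache page */
		cache = NULL;			/* repeat will allocate a new one */
		filepage = NULL;
		goto repeat;
	}
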
But the main problem is over in shmem_file_write:
> @@ -1458,6 +1480,8 @@ shmem_file_write(struct file *file, cons
> i_size_write(inode, pos);
>
> flush_dcache_page(page);
> + if (sgp == SGP_WRITE_FULL)
> + SetPageUptodate(page);
> set_page_dirty(page);
> mark_page_accessed(page);
> page_cache_release(page);
If someone might be waiting for PageUptodate, you ought to wake them.
But in fact nobody does wait for that PageUptodate, and you don't
have pagelock, which is bad news: someone can get in to peep at the
uninitialized page (just as I think you've discovered in ramfs).
I suggest you set this patch aside for now. If your various
truncation/pagelock/deadlock efforts work out, then it should
become natural for us to change shmem_getpage to return with
pagelock held, and then it might as well SetPageUptodate itself
as it does now (prematurely, but that's no matter while the pagelock
is held, since even reading demands the pagelock for a moment).
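
Very roughly, the tail of shmem_getpage might then look like this
(a sketch of the idea only, not code from this patch):

	flush_dcache_page(filepage);
	SetPageUptodate(filepage);	/* premature for a full-page write, but safe:
					 * any reader must take the page lock first */
	*pagep = filepage;		/* returned locked; the caller copies the
					 * data in and only then unlocks the page */
	return 0;
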
And meanwhile we'll maybe think of a more appealing way to do it.
Hugh
--