Re: [PATCH v7 07/22] Replace the XIP page fault handler with the DAX page fault handler

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Jan Kara <jack@suse.cz>
To: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org, willy@linux.intel.com
Subject: Re: [PATCH v7 07/22] Replace the XIP page fault handler with the DAX page fault handler
Date: Wed, 9 Apr 2014 00:05:25 +0200	[thread overview]
Message-ID: <20140408220525.GC26019@quack.suse.cz> (raw)
In-Reply-To: <c2e602f401a580c4fac54b9b8f4a6f8dd0ac1071.1395591795.git.matthew.r.wilcox@intel.com>

On Sun 23-03-14 15:08:33, Matthew Wilcox wrote:
> Instead of calling aops->get_xip_mem from the fault handler, the
> filesystem passes a get_block_t that is used to find the appropriate
> blocks.
  I have some suggestions below...

> Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
> ---
>  fs/dax.c           | 207 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext2/file.c     |  35 ++++++++-
>  include/linux/fs.h |   4 +-
>  mm/filemap_xip.c   | 206 ----------------------------------------------------
>  4 files changed, 243 insertions(+), 209 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 66a6bda..863749c 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -19,8 +19,12 @@
>  #include <linux/buffer_head.h>
>  #include <linux/fs.h>
>  #include <linux/genhd.h>
> +#include <linux/highmem.h>
> +#include <linux/memcontrol.h>
> +#include <linux/mm.h>
>  #include <linux/mutex.h>
>  #include <linux/uio.h>
> +#include <linux/vmstat.h>
>  
>  static long dax_get_addr(struct inode *inode, struct buffer_head *bh,
>  								void **addr)
> @@ -32,6 +36,16 @@ static long dax_get_addr(struct inode *inode, struct buffer_head *bh,
>  	return ops->direct_access(bdev, sector, addr, &pfn, bh->b_size);
>  }
>  
> +static long dax_get_pfn(struct inode *inode, struct buffer_head *bh,
> +							unsigned long *pfn)
> +{
> +	struct block_device *bdev = bh->b_bdev;
> +	const struct block_device_operations *ops = bdev->bd_disk->fops;
> +	void *addr;
> +	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
> +	return ops->direct_access(bdev, sector, &addr, pfn, bh->b_size);
> +}
> +
>  static void dax_new_buf(void *addr, unsigned size, unsigned first,
>  					loff_t offset, loff_t end, int rw)
>  {
> @@ -214,3 +228,196 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
>  	return retval;
>  }
>  EXPORT_SYMBOL_GPL(dax_do_io);
> +
> +/*
> + * The user has performed a load from a hole in the file.  Allocating
> + * a new page in the file would cause excessive storage usage for
> + * workloads with sparse files.  We allocate a page cache page instead.
> + * We'll kick it out of the page cache if it's ever written to,
> + * otherwise it will simply fall out of the page cache under memory
> + * pressure without ever having been dirtied.
> + */
> +static int dax_load_hole(struct address_space *mapping, struct page *page,
> +							struct vm_fault *vmf)
> +{
> +	unsigned long size;
> +	struct inode *inode = mapping->host;
> +	if (!page)
> +		page = find_or_create_page(mapping, vmf->pgoff,
> +						GFP_KERNEL | __GFP_ZERO);
> +	if (!page)
> +		return VM_FAULT_OOM;
> +	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +	if (vmf->pgoff >= size) {
  Maybe comment here that we have to recheck i_size so that we don't create
pages in the area truncate_pagecache() has already evicted.

> +		unlock_page(page);
> +		page_cache_release(page);
> +		return VM_FAULT_SIGBUS;
> +	}
> +
> +	vmf->page = page;
> +	return VM_FAULT_LOCKED;
> +}
> +
> +static void copy_user_bh(struct page *to, struct inode *inode,
> +				struct buffer_head *bh, unsigned long vaddr)
> +{
> +	void *vfrom, *vto;
> +	dax_get_addr(inode, bh, &vfrom);	/* XXX: error handling */
  The error handling here is missing as the comment suggests :)

> +	vto = kmap_atomic(to);
> +	copy_user_page(vto, vfrom, vaddr, to);
> +	kunmap_atomic(vto);
> +}
> +
> +static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> +			get_block_t get_block)
> +{
> +	struct file *file = vma->vm_file;
> +	struct inode *inode = file_inode(file);
> +	struct address_space *mapping = file->f_mapping;
> +	struct page *page;
> +	struct buffer_head bh;
> +	unsigned long vaddr = (unsigned long)vmf->virtual_address;
> +	sector_t block;
> +	pgoff_t size;
> +	unsigned long pfn;
> +	int error;
> +	int major = 0;
> +
> +	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +	if (vmf->pgoff >= size)
> +		return VM_FAULT_SIGBUS;
> +
> +	memset(&bh, 0, sizeof(bh));
> +	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - inode->i_blkbits);
> +	bh.b_size = PAGE_SIZE;
> +
> + repeat:
> +	page = find_get_page(mapping, vmf->pgoff);
> +	if (page) {
> +		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
> +			page_cache_release(page);
> +			return VM_FAULT_RETRY;
> +		}
> +		if (unlikely(page->mapping != mapping)) {
> +			unlock_page(page);
> +			page_cache_release(page);
> +			goto repeat;
> +		}
> +	}
> +
> +	error = get_block(inode, block, &bh, 0);
> +	if (error || bh.b_size < PAGE_SIZE)
> +		goto sigbus;
> +
> +	if (!buffer_written(&bh) && !vmf->cow_page) {
> +		if (vmf->flags & FAULT_FLAG_WRITE) {
> +			error = get_block(inode, block, &bh, 1);
> +			count_vm_event(PGMAJFAULT);
> +			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
> +			major = VM_FAULT_MAJOR;
> +			if (error || bh.b_size < PAGE_SIZE)
> +				goto sigbus;
> +		} else {
> +			return dax_load_hole(mapping, page, vmf);
> +		}
> +	}
> +
> +	/* Recheck i_size under i_mmap_mutex */
> +	mutex_lock(&mapping->i_mmap_mutex);
> +	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +	if (unlikely(vmf->pgoff >= size)) {
> +		mutex_unlock(&mapping->i_mmap_mutex);
> +		goto sigbus;
> +	}
> +	if (vmf->cow_page) {
> +		if (buffer_written(&bh))
> +			copy_user_bh(vmf->cow_page, inode, &bh, vaddr);
> +		else
> +			clear_user_highpage(vmf->cow_page, vaddr);
> +		if (page) {
> +			unlock_page(page);
> +			page_cache_release(page);
> +		}
> +		/* do_cow_fault() will release the i_mmap_mutex */
> +		return VM_FAULT_COWED;
> +	}
> +
> +	if (buffer_unwritten(&bh) || buffer_new(&bh))
> +		dax_clear_blocks(inode, bh.b_blocknr, bh.b_size);
  Where is dax_clear_blocks() defined?

> +
> +	error = dax_get_pfn(inode, &bh, &pfn);
> +	if (error > 0)
> +		error = vm_insert_mixed(vma, vaddr, pfn);
  When there's a hole (thus page != NULL) and we are called from
dax_mkwrite(), this will always return EBUSY, correct?

> +	mutex_unlock(&mapping->i_mmap_mutex);
> +
> +	if (page) {
> +		delete_from_page_cache(page);
> +		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
> +							PAGE_CACHE_SIZE, 0);
  Here we unmap the PTE pointing to the hole page but then we'll have to
retry the fault again to fill in the pfn we've got? This seems wrong. I'd
say we want to remap the PTE from the hole page to a pfn we've got while
holding i_mmap_mutex. remap_pfn_range() almost does what you need, except
that you also need that to work for normal pages. So you might need to
create a new helper in mm layer for that.

> +		unlock_page(page);
> +		page_cache_release(page);
> +	}
> +
> +	if (error == -ENOMEM)
> +		return VM_FAULT_OOM;
> +	/* -EBUSY is fine, somebody else faulted on the same PTE */
> +	if (error != -EBUSY)
> +		BUG_ON(error);
> +	return VM_FAULT_NOPAGE | major;
> +
> + sigbus:
> +	if (page) {
> +		unlock_page(page);
> +		page_cache_release(page);
> +	}
> +	return VM_FAULT_SIGBUS;
> +}
> +
> +/**
> + * dax_fault - handle a page fault on an XIP file
> + * @vma: The virtual memory area where the fault occurred
> + * @vmf: The description of the fault
> + * @get_block: The filesystem method used to translate file offsets to blocks
> + *
> + * When a page fault occurs, filesystems may call this helper in their
> + * fault handler for XIP files.
> + */
> +int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> +			get_block_t get_block)
> +{
> +	int result;
> +	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
> +
> +	sb_start_pagefault(sb);
  You don't need any filesystem freeze protection for the fault handler
since that's not going to modify the filesystem.

> +	file_update_time(vma->vm_file);
  Why do you update m/ctime? We are only reading the file...

> +	result = do_dax_fault(vma, vmf, get_block);
> +	sb_end_pagefault(sb);
> +
> +	return result;
> +}
> +EXPORT_SYMBOL_GPL(dax_fault);
> +
> +/**
> + * dax_mkwrite - convert a read-only page to read-write in an XIP file
> + * @vma: The virtual memory area where the fault occurred
> + * @vmf: The description of the fault
> + * @get_block: The filesystem method used to translate file offsets to blocks
> + *
> + * XIP handles reads of holes by adding pages full of zeroes into the
> + * mapping.  If the page is subsequenty written to, we have to allocate
> + * the page on media and free the page that was in the cache.
> + */
> +int dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
> +			get_block_t get_block)
> +{
> +	int result;
> +	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
> +
> +	sb_start_pagefault(sb);
> +	file_update_time(vma->vm_file);
> +	result = do_dax_fault(vma, vmf, get_block);
> +	sb_end_pagefault(sb);
> +
> +	return result;
> +}
> +EXPORT_SYMBOL_GPL(dax_mkwrite);
> diff --git a/fs/ext2/file.c b/fs/ext2/file.c
> index ef5cf96..e3ce10d 100644
> --- a/fs/ext2/file.c
> +++ b/fs/ext2/file.c
> @@ -25,6 +25,37 @@
>  #include "xattr.h"
>  #include "acl.h"
>  
> +#ifdef CONFIG_EXT2_FS_XIP
> +static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	return dax_fault(vma, vmf, ext2_get_block);
> +}
> +
> +static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	return dax_mkwrite(vma, vmf, ext2_get_block);
> +}
> +
> +static const struct vm_operations_struct ext2_dax_vm_ops = {
> +	.fault		= ext2_dax_fault,
> +	.page_mkwrite	= ext2_dax_mkwrite,
> +	.remap_pages	= generic_file_remap_pages,
> +};
> +
> +static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	if (!IS_DAX(file_inode(file)))
> +		return generic_file_mmap(file, vma);
> +
> +	file_accessed(file);
> +	vma->vm_ops = &ext2_dax_vm_ops;
> +	vma->vm_flags |= VM_MIXEDMAP;
> +	return 0;
> +}
> +#else
> +#define ext2_file_mmap	generic_file_mmap
> +#endif
> +
>  /*
>   * Called when filp is released. This happens when all file descriptors
>   * for a single struct file are closed. Note that different open() calls
> @@ -70,7 +101,7 @@ const struct file_operations ext2_file_operations = {
>  #ifdef CONFIG_COMPAT
>  	.compat_ioctl	= ext2_compat_ioctl,
>  #endif
> -	.mmap		= generic_file_mmap,
> +	.mmap		= ext2_file_mmap,
  So what's the point of ext2_file_operations ever handling IS_DAX()
inodes? Actually ext2_file_operations and ext2_xip_file_operations seem to
be the same after this patch so either you drop ext2_xip_file_operations
(I'm for this) or you can leave generic_file_mmap here and assume
ext2_file_mmap is always called for IS_DAX() inodes.

>  	.open		= dquot_file_open,
>  	.release	= ext2_release_file,
>  	.fsync		= ext2_fsync,
> @@ -89,7 +120,7 @@ const struct file_operations ext2_xip_file_operations = {
>  #ifdef CONFIG_COMPAT
>  	.compat_ioctl	= ext2_compat_ioctl,
>  #endif
> -	.mmap		= xip_file_mmap,
> +	.mmap		= ext2_file_mmap,
>  	.open		= dquot_file_open,
>  	.release	= ext2_release_file,
>  	.fsync		= ext2_fsync,

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

next prev parent reply	other threads:[~2014-04-08 22:05 UTC|newest]

Thread overview: 90+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-03-23 19:08 [PATCH v7 00/22] Support ext4 on NV-DIMMs Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 01/22] Fix XIP fault vs truncate race Matthew Wilcox
2014-03-29 15:57   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 02/22] Allow page fault handlers to perform the COW Matthew Wilcox
2014-04-08 16:34   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 03/22] axonram: Fix bug in direct_access Matthew Wilcox
2014-03-29 16:22   ` Jan Kara
2014-04-02 19:24     ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 04/22] Change direct_access calling convention Matthew Wilcox
2014-03-29 16:30   ` Jan Kara
2014-04-02 19:27     ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 05/22] Introduce IS_DAX(inode) Matthew Wilcox
2014-04-08 15:32   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 06/22] Replace XIP read and write with DAX I/O Matthew Wilcox
2014-04-08 17:56   ` Jan Kara
2014-04-08 20:21     ` Matthew Wilcox
2014-04-09  9:14       ` Jan Kara
2014-04-09 15:19         ` Matthew Wilcox
2014-04-09 20:55           ` Jan Kara
2014-04-13 18:05             ` Matthew Wilcox
2014-04-09 12:04   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 07/22] Replace the XIP page fault handler with the DAX page fault handler Matthew Wilcox
2014-04-08 22:05   ` Jan Kara [this message]
2014-04-09 20:48     ` Matthew Wilcox
2014-04-09 21:12       ` Jan Kara
2014-04-13 11:21         ` Matthew Wilcox
2014-04-14 16:04           ` Jan Kara
2014-04-09 10:27   ` Jan Kara
2014-04-09 20:51     ` Matthew Wilcox
2014-04-09 21:43       ` Jan Kara
2014-04-13 18:03         ` Matthew Wilcox
2014-07-29 12:12         ` Matthew Wilcox
2014-07-29 21:04           ` Jan Kara
2014-07-29 21:23             ` Matthew Wilcox
2014-07-30  9:52               ` Jan Kara
2014-07-30 21:02                 ` Matthew Wilcox
2014-08-09 11:00                 ` Matthew Wilcox
2014-08-11  8:51                   ` Jan Kara
2014-08-11 14:13                     ` Matthew Wilcox
2014-08-11 14:35                       ` Jan Kara
2014-08-11 15:02                         ` Matthew Wilcox
2014-08-11 15:25                           ` Jan Kara
2014-05-21 20:35   ` Toshi Kani
2014-06-05 22:38     ` Toshi Kani
2014-03-23 19:08 ` [PATCH v7 08/22] Replace xip_truncate_page with dax_truncate_page Matthew Wilcox
2014-04-08 22:17   ` Jan Kara
2014-04-09  9:26     ` Jan Kara
2014-04-13 19:07       ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 09/22] Remove mm/filemap_xip.c Matthew Wilcox
2014-04-08 18:21   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 10/22] Remove get_xip_mem Matthew Wilcox
2014-04-08 18:20   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 11/22] Replace ext2_clear_xip_target with dax_clear_blocks Matthew Wilcox
2014-04-09  9:46   ` Jan Kara
2014-04-10 14:16     ` Matthew Wilcox
2014-04-10 18:31       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 12/22] ext2: Remove ext2_xip_verify_sb() Matthew Wilcox
2014-04-09  9:52   ` Jan Kara
2014-04-10 14:22     ` Matthew Wilcox
2014-04-10 18:35       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 13/22] ext2: Remove ext2_use_xip Matthew Wilcox
2014-04-09  9:55   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 14/22] ext2: Remove xip.c and xip.h Matthew Wilcox
2014-04-09  9:59   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 15/22] Remove CONFIG_EXT2_FS_XIP and rename CONFIG_FS_XIP to CONFIG_FS_DAX Matthew Wilcox
2014-04-09  9:59   ` Jan Kara
2014-04-10 14:23     ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 16/22] ext2: Remove ext2_aops_xip Matthew Wilcox
2014-04-09 10:02   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 17/22] Get rid of most mentions of XIP in ext2 Matthew Wilcox
2014-04-09 10:04   ` Jan Kara
2014-04-10 14:26     ` Matthew Wilcox
2014-04-10 18:40       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 18/22] xip: Add xip_zero_page_range Matthew Wilcox
2014-04-09 10:15   ` Jan Kara
2014-04-10 14:27     ` Matthew Wilcox
2014-04-10 18:43       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 19/22] ext4: Make ext4_block_zero_page_range static Matthew Wilcox
2014-03-24 19:11   ` tytso
2014-03-23 19:08 ` [PATCH v7 20/22] ext4: Add DAX functionality Matthew Wilcox
2014-04-09 12:17   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 21/22] ext4: Fix typos Matthew Wilcox
2014-03-24 19:16   ` tytso
2014-03-23 19:08 ` [PATCH v7 22/22] brd: Rename XIP to DAX Matthew Wilcox
2014-04-09 10:07   ` Jan Kara
2014-05-18 14:58 ` [PATCH v7 00/22] Support ext4 on NV-DIMMs Boaz Harrosh
2014-05-18 23:24   ` Matthew Wilcox
2014-06-17 18:11 ` Boaz Harrosh
2014-06-17 18:19   ` Matthew Wilcox
2014-06-17 18:39     ` Boaz Harrosh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140408220525.GC26019@quack.suse.cz \
    --to=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=matthew.r.wilcox@intel.com \
    --cc=willy@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox