linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Matthew Wilcox <willy@linux.intel.com>
To: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>,
	linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH v7 07/22] Replace the XIP page fault handler with the DAX page fault handler
Date: Sun, 13 Apr 2014 07:21:32 -0400	[thread overview]
Message-ID: <20140413112132.GP5727@linux.intel.com> (raw)
In-Reply-To: <20140409211203.GP32103@quack.suse.cz>

On Wed, Apr 09, 2014 at 11:12:03PM +0200, Jan Kara wrote:
>   This would be fine except that unmap_mapping_range() grabs i_mmap_mutex
> again :-|. But it might be easier to provide a version of that function
> which assumes i_mmap_mutex is already locked than what I was suggesting.

*sigh*.  I knew that once ... which was why the call was after dropping
the lock.  OK, another try at fixing the problem; handle it down in the
insert_pfn code:

diff --git a/fs/dax.c b/fs/dax.c
index 6a8725b..2453025 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -390,7 +390,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	error = dax_get_pfn(&bh, &pfn, blkbits);
 	if (error > 0)
-		error = vm_insert_mixed(vma, vaddr, pfn);
+		error = vm_replace_mixed(vma, vaddr, pfn);
 	mutex_unlock(&mapping->i_mmap_mutex);
 
 	if (page) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ba72c54..df25410 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1944,8 +1944,12 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn);
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-			unsigned long pfn);
+int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn, bool replace);
+#define vm_insert_mixed(vma, addr, pfn)	\
+	__vm_insert_mixed(vma, addr, pfn, false)
+#define vm_replace_mixed(vma, addr, pfn)	\
+	__vm_insert_mixed(vma, addr, pfn, true)
 int vm_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
 			unsigned long pfn);
 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
diff --git a/mm/memory.c b/mm/memory.c
index 76fd657..ec59239 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2100,7 +2100,7 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
  * pages reserved for the old functions anyway.
  */
 static int insert_page(struct vm_area_struct *vma, unsigned long addr,
-			struct page *page, pgprot_t prot)
+			struct page *page, pgprot_t prot, bool replace)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	int retval;
@@ -2116,8 +2116,12 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 	if (!pte)
 		goto out;
 	retval = -EBUSY;
-	if (!pte_none(*pte))
-		goto out_unlock;
+	if (!pte_none(*pte)) {
+		if (!replace)
+			goto out_unlock;
+		VM_BUG_ON(!mutex_is_locked(&vma->vm_file->f_mapping->i_mmap_mutex));
+		zap_page_range_single(vma, addr, PAGE_SIZE, NULL);
+	}
 
 	/* Ok, finally just insert the thing.. */
 	get_page(page);
@@ -2173,12 +2177,12 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 		BUG_ON(vma->vm_flags & VM_PFNMAP);
 		vma->vm_flags |= VM_MIXEDMAP;
 	}
-	return insert_page(vma, addr, page, vma->vm_page_prot);
+	return insert_page(vma, addr, page, vma->vm_page_prot, false);
 }
 EXPORT_SYMBOL(vm_insert_page);
 
 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-			unsigned long pfn, pgprot_t prot)
+			unsigned long pfn, pgprot_t prot, bool replace)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	int retval;
@@ -2190,8 +2194,12 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	if (!pte)
 		goto out;
 	retval = -EBUSY;
-	if (!pte_none(*pte))
-		goto out_unlock;
+	if (!pte_none(*pte)) {
+		if (!replace)
+			goto out_unlock;
+		VM_BUG_ON(!mutex_is_locked(&vma->vm_file->f_mapping->i_mmap_mutex));
+		zap_page_range_single(vma, addr, PAGE_SIZE, NULL);
+	}
 
 	/* Ok, finally just insert the thing.. */
 	entry = pte_mkspecial(pfn_pte(pfn, prot));
@@ -2244,14 +2252,14 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	if (track_pfn_insert(vma, &pgprot, pfn))
 		return -EINVAL;
 
-	ret = insert_pfn(vma, addr, pfn, pgprot);
+	ret = insert_pfn(vma, addr, pfn, pgprot, false);
 
 	return ret;
 }
 EXPORT_SYMBOL(vm_insert_pfn);
 
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-			unsigned long pfn)
+int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn, bool replace)
 {
 	BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
 
@@ -2269,11 +2277,11 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 		struct page *page;
 
 		page = pfn_to_page(pfn);
-		return insert_page(vma, addr, page, vma->vm_page_prot);
+		return insert_page(vma, addr, page, vma->vm_page_prot, replace);
 	}
-	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+	return insert_pfn(vma, addr, pfn, vma->vm_page_prot, replace);
 }
-EXPORT_SYMBOL(vm_insert_mixed);
+EXPORT_SYMBOL(__vm_insert_mixed);
 
 static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 			pmd_t *pmd, unsigned long pfn, pgprot_t prot)

> > > > +int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> > > > +			get_block_t get_block)
> > > > +{
> > > > +	int result;
> > > > +	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
> > > > +
> > > > +	sb_start_pagefault(sb);
> > >   You don't need any filesystem freeze protection for the fault handler
> > > since that's not going to modify the filesystem.
> > 
> > Err ... we might allocate a block as a result of doing a write to a hole.
> > Or does that not count as 'modifying the filesystem' in this context?
>   Ah, it does. But it would be nice to avoid doing sb_start_pagefault() if
> it's not a write fault - because you don't want to block reading from a
> frozen filesystem (imagine what would happen when you freeze your root
> filesystem to do a snapshot...).
> 
> I have somewhat a mindset of standard pagecache mmap where filemap_fault()
> only reads in data regardless of FAULT_FLAG_WRITE setting so I was confused
> by your difference :).

Understood!  So this should work:

diff --git a/fs/dax.c b/fs/dax.c
index 2453025..e4d00fc 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -431,10 +431,13 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	int result;
 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
 
-	sb_start_pagefault(sb);
-	file_update_time(vma->vm_file);
+	if (vmf->flags & FAULT_FLAG_WRITE) {
+		sb_start_pagefault(sb);
+		file_update_time(vma->vm_file);
+	}
 	result = do_dax_fault(vma, vmf, get_block);
-	sb_end_pagefault(sb);
+	if (vmf->flags & FAULT_FLAG_WRITE)
+		sb_end_pagefault(sb);
 
 	return result;
 }
@@ -453,15 +456,7 @@ EXPORT_SYMBOL_GPL(dax_fault);
 int dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 			get_block_t get_block)
 {
-	int result;
-	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
-
-	sb_start_pagefault(sb);
-	file_update_time(vma->vm_file);
-	result = do_dax_fault(vma, vmf, get_block);
-	sb_end_pagefault(sb);
-
-	return result;
+	return dax_fault(vma, vmf, get_block);
 }
 EXPORT_SYMBOL_GPL(dax_mkwrite);
 

> > > > +	file_update_time(vma->vm_file);
> > >   Why do you update m/ctime? We are only reading the file...
> > 
> > ... except that it might be a write fault.  I think we modify the file
> > iff we return VM_FAULT_MAJOR from do_dax_fault().  So I'd be open to
> > something like this:
> > 
> > 	sb_start_pagefault(sb);
> > 	result = do_dax_fault(vma, vmf, get_block);
> > 	if (result & VM_FAULT_MAJOR)
> > 		file_update_time(vma->vm_file);
> > 	sb_end_pagefault(sb);
> > 
> > Would that work better for you?
>   Definitely. It's also a performance thing BTW - updating time stamps is
> relatively expensive for journalling filesystems - you have to start a
> transaction, add block with inode to the journal, stop a transaction - not
> something you want to do unless you have to.

I realised that this isn't right.  If you do a store to an mmaped file,
you should update the timestamps, whether or not the fs had to allocate
blocks.  Hence the version above that only checks whether the fault is
for write or not.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2014-04-13 11:21 UTC|newest]

Thread overview: 90+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-03-23 19:08 [PATCH v7 00/22] Support ext4 on NV-DIMMs Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 01/22] Fix XIP fault vs truncate race Matthew Wilcox
2014-03-29 15:57   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 02/22] Allow page fault handlers to perform the COW Matthew Wilcox
2014-04-08 16:34   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 03/22] axonram: Fix bug in direct_access Matthew Wilcox
2014-03-29 16:22   ` Jan Kara
2014-04-02 19:24     ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 04/22] Change direct_access calling convention Matthew Wilcox
2014-03-29 16:30   ` Jan Kara
2014-04-02 19:27     ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 05/22] Introduce IS_DAX(inode) Matthew Wilcox
2014-04-08 15:32   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 06/22] Replace XIP read and write with DAX I/O Matthew Wilcox
2014-04-08 17:56   ` Jan Kara
2014-04-08 20:21     ` Matthew Wilcox
2014-04-09  9:14       ` Jan Kara
2014-04-09 15:19         ` Matthew Wilcox
2014-04-09 20:55           ` Jan Kara
2014-04-13 18:05             ` Matthew Wilcox
2014-04-09 12:04   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 07/22] Replace the XIP page fault handler with the DAX page fault handler Matthew Wilcox
2014-04-08 22:05   ` Jan Kara
2014-04-09 20:48     ` Matthew Wilcox
2014-04-09 21:12       ` Jan Kara
2014-04-13 11:21         ` Matthew Wilcox [this message]
2014-04-14 16:04           ` Jan Kara
2014-04-09 10:27   ` Jan Kara
2014-04-09 20:51     ` Matthew Wilcox
2014-04-09 21:43       ` Jan Kara
2014-04-13 18:03         ` Matthew Wilcox
2014-07-29 12:12         ` Matthew Wilcox
2014-07-29 21:04           ` Jan Kara
2014-07-29 21:23             ` Matthew Wilcox
2014-07-30  9:52               ` Jan Kara
2014-07-30 21:02                 ` Matthew Wilcox
2014-08-09 11:00                 ` Matthew Wilcox
2014-08-11  8:51                   ` Jan Kara
2014-08-11 14:13                     ` Matthew Wilcox
2014-08-11 14:35                       ` Jan Kara
2014-08-11 15:02                         ` Matthew Wilcox
2014-08-11 15:25                           ` Jan Kara
2014-05-21 20:35   ` Toshi Kani
2014-06-05 22:38     ` Toshi Kani
2014-03-23 19:08 ` [PATCH v7 08/22] Replace xip_truncate_page with dax_truncate_page Matthew Wilcox
2014-04-08 22:17   ` Jan Kara
2014-04-09  9:26     ` Jan Kara
2014-04-13 19:07       ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 09/22] Remove mm/filemap_xip.c Matthew Wilcox
2014-04-08 18:21   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 10/22] Remove get_xip_mem Matthew Wilcox
2014-04-08 18:20   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 11/22] Replace ext2_clear_xip_target with dax_clear_blocks Matthew Wilcox
2014-04-09  9:46   ` Jan Kara
2014-04-10 14:16     ` Matthew Wilcox
2014-04-10 18:31       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 12/22] ext2: Remove ext2_xip_verify_sb() Matthew Wilcox
2014-04-09  9:52   ` Jan Kara
2014-04-10 14:22     ` Matthew Wilcox
2014-04-10 18:35       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 13/22] ext2: Remove ext2_use_xip Matthew Wilcox
2014-04-09  9:55   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 14/22] ext2: Remove xip.c and xip.h Matthew Wilcox
2014-04-09  9:59   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 15/22] Remove CONFIG_EXT2_FS_XIP and rename CONFIG_FS_XIP to CONFIG_FS_DAX Matthew Wilcox
2014-04-09  9:59   ` Jan Kara
2014-04-10 14:23     ` Matthew Wilcox
2014-03-23 19:08 ` [PATCH v7 16/22] ext2: Remove ext2_aops_xip Matthew Wilcox
2014-04-09 10:02   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 17/22] Get rid of most mentions of XIP in ext2 Matthew Wilcox
2014-04-09 10:04   ` Jan Kara
2014-04-10 14:26     ` Matthew Wilcox
2014-04-10 18:40       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 18/22] xip: Add xip_zero_page_range Matthew Wilcox
2014-04-09 10:15   ` Jan Kara
2014-04-10 14:27     ` Matthew Wilcox
2014-04-10 18:43       ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 19/22] ext4: Make ext4_block_zero_page_range static Matthew Wilcox
2014-03-24 19:11   ` tytso
2014-03-23 19:08 ` [PATCH v7 20/22] ext4: Add DAX functionality Matthew Wilcox
2014-04-09 12:17   ` Jan Kara
2014-03-23 19:08 ` [PATCH v7 21/22] ext4: Fix typos Matthew Wilcox
2014-03-24 19:16   ` tytso
2014-03-23 19:08 ` [PATCH v7 22/22] brd: Rename XIP to DAX Matthew Wilcox
2014-04-09 10:07   ` Jan Kara
2014-05-18 14:58 ` [PATCH v7 00/22] Support ext4 on NV-DIMMs Boaz Harrosh
2014-05-18 23:24   ` Matthew Wilcox
2014-06-17 18:11 ` Boaz Harrosh
2014-06-17 18:19   ` Matthew Wilcox
2014-06-17 18:39     ` Boaz Harrosh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140413112132.GP5727@linux.intel.com \
    --to=willy@linux.intel.com \
    --cc=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=matthew.r.wilcox@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox