[PATCH v9 06/20] fs/dax: Always remove DAX page-cache entries when breaking layouts

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Alistair Popple <apopple@nvidia.com>
To: akpm@linux-foundation.org, dan.j.williams@intel.com, linux-mm@kvack.org
Cc: Alistair Popple <apopple@nvidia.com>,
	Alison Schofield <alison.schofield@intel.com>,
	lina@asahilina.net, zhang.lyra@gmail.com,
	gerald.schaefer@linux.ibm.com, vishal.l.verma@intel.com,
	dave.jiang@intel.com, logang@deltatee.com, bhelgaas@google.com,
	jack@suse.cz, jgg@ziepe.ca, catalin.marinas@arm.com,
	will@kernel.org, mpe@ellerman.id.au, npiggin@gmail.com,
	dave.hansen@linux.intel.com, ira.weiny@intel.com,
	willy@infradead.org, djwong@kernel.org, tytso@mit.edu,
	linmiaohe@huawei.com, david@redhat.com, peterx@redhat.com,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org, nvdimm@lists.linux.dev,
	linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-ext4@vger.kernel.org, linux-xfs@vger.kernel.org,
	jhubbard@nvidia.com, hch@lst.de, david@fromorbit.com,
	chenhuacai@kernel.org, kernel@xen0n.name,
	loongarch@lists.linux.dev
Subject: [PATCH v9 06/20] fs/dax: Always remove DAX page-cache entries when breaking layouts
Date: Fri, 28 Feb 2025 14:31:01 +1100	[thread overview]
Message-ID: <3be6115eaaa8d28fee37fcba3287be4f226a7d24.1740713401.git-series.apopple@nvidia.com> (raw)
In-Reply-To: <cover.8068ad144a7eea4a813670301f4d2a86a8e68ec4.1740713401.git-series.apopple@nvidia.com>

Prior to any truncation operations file systems call
dax_break_mapping() to ensure pages in the range are not under going
DMA. Later DAX page-cache entries will be removed by
truncate_folio_batch_exceptionals() in the generic page-cache code.

However this makes it possible for folios to be removed from the
page-cache even though they are still DMA busy if the file-system
hasn't called dax_break_mapping(). It also means they can never be
waited on in future because FS DAX will lose track of them once the
page-cache entry has been deleted.

Instead it is better to delete the FS DAX entry when the file-system
calls dax_break_mapping() as part of it's truncate operation. This
ensures only idle pages can be removed from the FS DAX page-cache and
makes it easy to detect if a file-system hasn't called
dax_break_mapping() prior to a truncate operation.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>

---

Changes for v7:

 - s/dax_break_mapping/dax_break_layout/ suggested by Dan.
 - Rework dax_break_mapping() to take a NULL callback for NOWAIT
   behaviour as suggested by Dan.
---
 fs/dax.c            | 40 ++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_inode.c  |  5 ++---
 include/linux/dax.h |  2 ++
 mm/truncate.c       | 16 +++++++++++++++-
 4 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index f1945aa..14fbe51 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -846,6 +846,36 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 	return ret;
 }
 
+void dax_delete_mapping_range(struct address_space *mapping,
+				loff_t start, loff_t end)
+{
+	void *entry;
+	pgoff_t start_idx = start >> PAGE_SHIFT;
+	pgoff_t end_idx;
+	XA_STATE(xas, &mapping->i_pages, start_idx);
+
+	/* If end == LLONG_MAX, all pages from start to till end of file */
+	if (end == LLONG_MAX)
+		end_idx = ULONG_MAX;
+	else
+		end_idx = end >> PAGE_SHIFT;
+
+	xas_lock_irq(&xas);
+	xas_for_each(&xas, entry, end_idx) {
+		if (!xa_is_value(entry))
+			continue;
+		entry = wait_entry_unlocked_exclusive(&xas, entry);
+		if (!entry)
+			continue;
+		dax_disassociate_entry(entry, mapping, true);
+		xas_store(&xas, NULL);
+		mapping->nrpages -= 1UL << dax_entry_order(entry);
+		put_unlocked_entry(&xas, entry, WAKE_ALL);
+	}
+	xas_unlock_irq(&xas);
+}
+EXPORT_SYMBOL_GPL(dax_delete_mapping_range);
+
 static int wait_page_idle(struct page *page,
 			void (cb)(struct inode *),
 			struct inode *inode)
@@ -857,6 +887,9 @@ static int wait_page_idle(struct page *page,
 /*
  * Unmaps the inode and waits for any DMA to complete prior to deleting the
  * DAX mapping entries for the range.
+ *
+ * For NOWAIT behavior, pass @cb as NULL to early-exit on first found
+ * busy page
  */
 int dax_break_layout(struct inode *inode, loff_t start, loff_t end,
 		void (cb)(struct inode *))
@@ -871,10 +904,17 @@ int dax_break_layout(struct inode *inode, loff_t start, loff_t end,
 		page = dax_layout_busy_page_range(inode->i_mapping, start, end);
 		if (!page)
 			break;
+		if (!cb) {
+			error = -ERESTARTSYS;
+			break;
+		}
 
 		error = wait_page_idle(page, cb, inode);
 	} while (error == 0);
 
+	if (!page)
+		dax_delete_mapping_range(inode->i_mapping, start, end);
+
 	return error;
 }
 EXPORT_SYMBOL_GPL(dax_break_layout);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d4f07e0..8008337 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2735,7 +2735,6 @@ xfs_mmaplock_two_inodes_and_break_dax_layout(
 	struct xfs_inode	*ip2)
 {
 	int			error;
-	struct page		*page;
 
 	if (ip1->i_ino > ip2->i_ino)
 		swap(ip1, ip2);
@@ -2759,8 +2758,8 @@ xfs_mmaplock_two_inodes_and_break_dax_layout(
 	 * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
 	 * for this nested lock case.
 	 */
-	page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
-	if (!dax_page_is_idle(page)) {
+	error = dax_break_layout(VFS_I(ip2), 0, -1, NULL);
+	if (error) {
 		xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
 		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
 		goto again;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index a6b277f..2fbb262 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -255,6 +255,8 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
 vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
 		unsigned int order, pfn_t pfn);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
+void dax_delete_mapping_range(struct address_space *mapping,
+				loff_t start, loff_t end);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);
 int __must_check dax_break_layout(struct inode *inode, loff_t start,
diff --git a/mm/truncate.c b/mm/truncate.c
index 0994817..031d0be 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -78,8 +78,22 @@ static void truncate_folio_batch_exceptionals(struct address_space *mapping,
 
 	if (dax_mapping(mapping)) {
 		for (i = j; i < nr; i++) {
-			if (xa_is_value(fbatch->folios[i]))
+			if (xa_is_value(fbatch->folios[i])) {
+				/*
+				 * File systems should already have called
+				 * dax_break_layout_entry() to remove all DAX
+				 * entries while holding a lock to prevent
+				 * establishing new entries. Therefore we
+				 * shouldn't find any here.
+				 */
+				WARN_ON_ONCE(1);
+
+				/*
+				 * Delete the mapping so truncate_pagecache()
+				 * doesn't loop forever.
+				 */
 				dax_delete_mapping_entry(mapping, indices[i]);
+			}
 		}
 		goto out;
 	}
-- 
git-series 0.9.1

next prev parent reply	other threads:[~2025-02-28  3:32 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-28  3:30 [PATCH v9 00/20] fs/dax: Fix ZONE_DEVICE page reference counts Alistair Popple
2025-02-28  3:30 ` [PATCH v9 01/20] fuse: Fix dax truncate/punch_hole fault path Alistair Popple
2025-02-28  3:30 ` [PATCH v9 02/20] fs/dax: Return unmapped busy pages from dax_layout_busy_page_range() Alistair Popple
2025-02-28  3:30 ` [PATCH v9 03/20] fs/dax: Don't skip locked entries when scanning entries Alistair Popple
2025-02-28  3:30 ` [PATCH v9 04/20] fs/dax: Refactor wait for dax idle page Alistair Popple
2025-02-28  3:31 ` [PATCH v9 05/20] fs/dax: Create a common implementation to break DAX layouts Alistair Popple
2025-02-28  3:31 ` Alistair Popple [this message]
2025-02-28  3:31 ` [PATCH v9 07/20] fs/dax: Ensure all pages are idle prior to filesystem unmount Alistair Popple
2025-02-28  3:31 ` [PATCH v9 08/20] fs/dax: Remove PAGE_MAPPING_DAX_SHARED mapping flag Alistair Popple
2025-02-28  3:31 ` [PATCH v9 09/20] mm/gup: Remove redundant check for PCI P2PDMA page Alistair Popple
2025-02-28  3:31 ` [PATCH v9 10/20] mm/mm_init: Move p2pdma page refcount initialisation to p2pdma Alistair Popple
2025-02-28  3:31 ` [PATCH v9 11/20] mm: Allow compound zone device pages Alistair Popple
2025-02-28  3:31 ` [PATCH v9 12/20] mm/memory: Enhance insert_page_into_pte_locked() to create writable mappings Alistair Popple
2025-02-28  3:31 ` [PATCH v9 13/20] mm/memory: Add vmf_insert_page_mkwrite() Alistair Popple
2025-02-28  3:31 ` [PATCH v9 14/20] mm/rmap: Add support for PUD sized mappings to rmap Alistair Popple
2025-02-28  3:31 ` [PATCH v9 15/20] mm/huge_memory: Add vmf_insert_folio_pud() Alistair Popple
2025-02-28  3:31 ` [PATCH v9 16/20] mm/huge_memory: Add vmf_insert_folio_pmd() Alistair Popple
2025-02-28  3:31 ` [PATCH v9 17/20] mm/gup: Don't allow FOLL_LONGTERM pinning of FS DAX pages Alistair Popple
2025-02-28  3:31 ` [PATCH v9 18/20] dcssblk: Mark DAX broken, remove FS_DAX_LIMITED support Alistair Popple
2025-02-28  3:31 ` [PATCH v9 19/20] fs/dax: Properly refcount fs dax pages Alistair Popple
2025-03-03  8:58   ` David Hildenbrand
2025-03-26 21:04     ` Dan Williams
2025-02-28  3:31 ` [PATCH v9 20/20] device/dax: Properly refcount device dax pages when mapping Alistair Popple
2025-02-28  3:42 ` [PATCH v9 00/20] fs/dax: Fix ZONE_DEVICE page reference counts Alistair Popple
2025-03-04  4:46   ` Andrew Morton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3be6115eaaa8d28fee37fcba3287be4f226a7d24.1740713401.git-series.apopple@nvidia.com \
    --to=apopple@nvidia.com \
    --cc=akpm@linux-foundation.org \
    --cc=alison.schofield@intel.com \
    --cc=bhelgaas@google.com \
    --cc=catalin.marinas@arm.com \
    --cc=chenhuacai@kernel.org \
    --cc=dan.j.williams@intel.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=dave.jiang@intel.com \
    --cc=david@fromorbit.com \
    --cc=david@redhat.com \
    --cc=djwong@kernel.org \
    --cc=gerald.schaefer@linux.ibm.com \
    --cc=hch@lst.de \
    --cc=ira.weiny@intel.com \
    --cc=jack@suse.cz \
    --cc=jgg@ziepe.ca \
    --cc=jhubbard@nvidia.com \
    --cc=kernel@xen0n.name \
    --cc=lina@asahilina.net \
    --cc=linmiaohe@huawei.com \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-cxl@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=logang@deltatee.com \
    --cc=loongarch@lists.linux.dev \
    --cc=mpe@ellerman.id.au \
    --cc=npiggin@gmail.com \
    --cc=nvdimm@lists.linux.dev \
    --cc=peterx@redhat.com \
    --cc=tytso@mit.edu \
    --cc=vishal.l.verma@intel.com \
    --cc=will@kernel.org \
    --cc=willy@infradead.org \
    --cc=zhang.lyra@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox