From: Alistair Popple <apopple@nvidia.com>
To: akpm@linux-foundation.org, dan.j.williams@intel.com, linux-mm@kvack.org
Cc: Alistair Popple <apopple@nvidia.com>,
	Alison Schofield <alison.schofield@intel.com>,
	lina@asahilina.net, zhang.lyra@gmail.com,
	gerald.schaefer@linux.ibm.com, vishal.l.verma@intel.com,
	dave.jiang@intel.com, logang@deltatee.com, bhelgaas@google.com,
	jack@suse.cz, jgg@ziepe.ca, catalin.marinas@arm.com,
	will@kernel.org, mpe@ellerman.id.au, npiggin@gmail.com,
	dave.hansen@linux.intel.com, ira.weiny@intel.com,
	willy@infradead.org, djwong@kernel.org, tytso@mit.edu,
	linmiaohe@huawei.com, david@redhat.com, peterx@redhat.com,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linuxppc-dev@lists.ozlabs.org, nvdimm@lists.linux.dev,
	linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-ext4@vger.kernel.org, linux-xfs@vger.kernel.org,
	jhubbard@nvidia.com, hch@lst.de, david@fromorbit.com,
	chenhuacai@kernel.org, kernel@xen0n.name,
	loongarch@lists.linux.dev
Subject: [PATCH v9 03/20] fs/dax: Don't skip locked entries when scanning entries
Date: Fri, 28 Feb 2025 14:30:58 +1100	[thread overview]
Message-ID: <b11b2baed7157dc900bf07a4571bf71b7cd82d97.1740713401.git-series.apopple@nvidia.com> (raw)
In-Reply-To: <cover.8068ad144a7eea4a813670301f4d2a86a8e68ec4.1740713401.git-series.apopple@nvidia.com>

Several functions internal to FS DAX use the following pattern when
trying to obtain an unlocked entry:

    xas_for_each(&xas, entry, end_idx) {
            if (dax_is_locked(entry))
                    entry = get_unlocked_entry(&xas, 0);
            ...
    }

This is problematic because get_unlocked_entry() returns the next
present entry in the range, which may be a different entry from the
one that was found locked. Any processing of the original locked
entry is therefore skipped. This can cause
dax_layout_busy_page_range() to miss DMA-busy pages in the range,
allowing file systems to free blocks whilst DMA operations are still
ongoing, which can corrupt the file system.
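
For example (hypothetical indices, purely for illustration): if the
entry at index 5 is locked and the entry at index 7 is not, the loop
body ends up operating on the index 7 entry while index 5 is never
rechecked:

    xas_for_each(&xas, entry, end_idx) {
            if (dax_is_locked(entry))                /* say, index 5 */
                    entry = get_unlocked_entry(&xas, 0); /* may return index 7 */
            /* index 5 is skipped; a DMA-busy page there goes unnoticed */
    }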

Instead, callers within an xas_for_each() loop should wait for the
current entry to become unlocked without advancing the XArray state.
Introduce a new function, wait_entry_unlocked_exclusive(), to do
exactly that; the resulting pattern is sketched below.
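
For illustration, mirroring the dax_layout_busy_page_range() hunk in
the diff below, the scanning pattern becomes:

    xas_for_each(&xas, entry, end_idx) {
            entry = wait_entry_unlocked_exclusive(&xas, entry);
            if (entry)
                    page = dax_busy_page(entry);
            put_unlocked_entry(&xas, entry, WAKE_NEXT);
            ...
    }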

While we are here, also rename get_unlocked_entry() to
get_next_unlocked_entry() to make it clear that it may advance the
iterator state.
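
The new name highlights that callers must cope with the iterator
having moved on to a different entry, as dax_writeback_one() does in
the diff below (body of the check elided here):

    entry = get_next_unlocked_entry(xas, 0);

    /* Entry got punched out / reallocated? */
    if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
            ...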

Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
---
 fs/dax.c | 50 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 41 insertions(+), 9 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index b35f538..f5fdb43 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -206,7 +206,7 @@ static void dax_wake_entry(struct xa_state *xas, void *entry,
  *
  * Must be called with the i_pages lock held.
  */
-static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
+static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order)
 {
 	void *entry;
 	struct wait_exceptional_entry_queue ewait;
@@ -236,6 +236,37 @@ static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
 }
 
 /*
+ * Wait for the given entry to become unlocked. Caller must hold the i_pages
+ * lock and call either put_unlocked_entry() if it did not lock the entry or
+ * dax_unlock_entry() if it did. Returns an unlocked entry if still present.
+ */
+static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry)
+{
+	struct wait_exceptional_entry_queue ewait;
+	wait_queue_head_t *wq;
+
+	init_wait(&ewait.wait);
+	ewait.wait.func = wake_exceptional_entry_func;
+
+	while (unlikely(dax_is_locked(entry))) {
+		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+		prepare_to_wait_exclusive(wq, &ewait.wait,
+					TASK_UNINTERRUPTIBLE);
+		xas_pause(xas);
+		xas_unlock_irq(xas);
+		schedule();
+		finish_wait(wq, &ewait.wait);
+		xas_lock_irq(xas);
+		entry = xas_load(xas);
+	}
+
+	if (xa_is_internal(entry))
+		return NULL;
+
+	return entry;
+}
+
+/*
  * The only thing keeping the address space around is the i_pages lock
  * (it's cycled in clear_inode() after removing the entries from i_pages)
  * After we call xas_unlock_irq(), we cannot touch xas->xa.
@@ -250,7 +281,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
 
 	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
 	/*
-	 * Unlike get_unlocked_entry() there is no guarantee that this
+	 * Unlike get_next_unlocked_entry() there is no guarantee that this
 	 * path ever successfully retrieves an unlocked entry before an
 	 * inode dies. Perform a non-exclusive wait in case this path
 	 * never successfully performs its own wake up.
@@ -581,7 +612,7 @@ static void *grab_mapping_entry(struct xa_state *xas,
 retry:
 	pmd_downgrade = false;
 	xas_lock_irq(xas);
-	entry = get_unlocked_entry(xas, order);
+	entry = get_next_unlocked_entry(xas, order);
 
 	if (entry) {
 		if (dax_is_conflict(entry))
@@ -717,8 +748,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
 	xas_for_each(&xas, entry, end_idx) {
 		if (WARN_ON_ONCE(!xa_is_value(entry)))
 			continue;
-		if (unlikely(dax_is_locked(entry)))
-			entry = get_unlocked_entry(&xas, 0);
+		entry = wait_entry_unlocked_exclusive(&xas, entry);
 		if (entry)
 			page = dax_busy_page(entry);
 		put_unlocked_entry(&xas, entry, WAKE_NEXT);
@@ -751,7 +781,7 @@ static int __dax_invalidate_entry(struct address_space *mapping,
 	void *entry;
 
 	xas_lock_irq(&xas);
-	entry = get_unlocked_entry(&xas, 0);
+	entry = get_next_unlocked_entry(&xas, 0);
 	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
 		goto out;
 	if (!trunc &&
@@ -777,7 +807,9 @@ static int __dax_clear_dirty_range(struct address_space *mapping,
 
 	xas_lock_irq(&xas);
 	xas_for_each(&xas, entry, end) {
-		entry = get_unlocked_entry(&xas, 0);
+		entry = wait_entry_unlocked_exclusive(&xas, entry);
+		if (!entry)
+			continue;
 		xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
 		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
 		put_unlocked_entry(&xas, entry, WAKE_NEXT);
@@ -941,7 +973,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
 	if (unlikely(dax_is_locked(entry))) {
 		void *old_entry = entry;
 
-		entry = get_unlocked_entry(xas, 0);
+		entry = get_next_unlocked_entry(xas, 0);
 
 		/* Entry got punched out / reallocated? */
 		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
@@ -1950,7 +1982,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
 	vm_fault_t ret;
 
 	xas_lock_irq(&xas);
-	entry = get_unlocked_entry(&xas, order);
+	entry = get_next_unlocked_entry(&xas, order);
 	/* Did we race with someone splitting entry or so? */
 	if (!entry || dax_is_conflict(entry) ||
 	    (order == 0 && !dax_is_pte_entry(entry))) {
-- 
git-series 0.9.1


Thread overview: 25+ messages
2025-02-28  3:30 [PATCH v9 00/20] fs/dax: Fix ZONE_DEVICE page reference counts Alistair Popple
2025-02-28  3:30 ` [PATCH v9 01/20] fuse: Fix dax truncate/punch_hole fault path Alistair Popple
2025-02-28  3:30 ` [PATCH v9 02/20] fs/dax: Return unmapped busy pages from dax_layout_busy_page_range() Alistair Popple
2025-02-28  3:30 ` Alistair Popple [this message]
2025-02-28  3:30 ` [PATCH v9 04/20] fs/dax: Refactor wait for dax idle page Alistair Popple
2025-02-28  3:31 ` [PATCH v9 05/20] fs/dax: Create a common implementation to break DAX layouts Alistair Popple
2025-02-28  3:31 ` [PATCH v9 06/20] fs/dax: Always remove DAX page-cache entries when breaking layouts Alistair Popple
2025-02-28  3:31 ` [PATCH v9 07/20] fs/dax: Ensure all pages are idle prior to filesystem unmount Alistair Popple
2025-02-28  3:31 ` [PATCH v9 08/20] fs/dax: Remove PAGE_MAPPING_DAX_SHARED mapping flag Alistair Popple
2025-02-28  3:31 ` [PATCH v9 09/20] mm/gup: Remove redundant check for PCI P2PDMA page Alistair Popple
2025-02-28  3:31 ` [PATCH v9 10/20] mm/mm_init: Move p2pdma page refcount initialisation to p2pdma Alistair Popple
2025-02-28  3:31 ` [PATCH v9 11/20] mm: Allow compound zone device pages Alistair Popple
2025-02-28  3:31 ` [PATCH v9 12/20] mm/memory: Enhance insert_page_into_pte_locked() to create writable mappings Alistair Popple
2025-02-28  3:31 ` [PATCH v9 13/20] mm/memory: Add vmf_insert_page_mkwrite() Alistair Popple
2025-02-28  3:31 ` [PATCH v9 14/20] mm/rmap: Add support for PUD sized mappings to rmap Alistair Popple
2025-02-28  3:31 ` [PATCH v9 15/20] mm/huge_memory: Add vmf_insert_folio_pud() Alistair Popple
2025-02-28  3:31 ` [PATCH v9 16/20] mm/huge_memory: Add vmf_insert_folio_pmd() Alistair Popple
2025-02-28  3:31 ` [PATCH v9 17/20] mm/gup: Don't allow FOLL_LONGTERM pinning of FS DAX pages Alistair Popple
2025-02-28  3:31 ` [PATCH v9 18/20] dcssblk: Mark DAX broken, remove FS_DAX_LIMITED support Alistair Popple
2025-02-28  3:31 ` [PATCH v9 19/20] fs/dax: Properly refcount fs dax pages Alistair Popple
2025-03-03  8:58   ` David Hildenbrand
2025-03-26 21:04     ` Dan Williams
2025-02-28  3:31 ` [PATCH v9 20/20] device/dax: Properly refcount device dax pages when mapping Alistair Popple
2025-02-28  3:42 ` [PATCH v9 00/20] fs/dax: Fix ZONE_DEVICE page reference counts Alistair Popple
2025-03-04  4:46   ` Andrew Morton
