From: Alistair Popple <apopple@nvidia.com>
To: akpm@linux-foundation.org, dan.j.williams@intel.com, linux-mm@kvack.org
Cc: Alistair Popple <apopple@nvidia.com>,
lina@asahilina.net, zhang.lyra@gmail.com,
gerald.schaefer@linux.ibm.com, vishal.l.verma@intel.com,
dave.jiang@intel.com, logang@deltatee.com, bhelgaas@google.com,
jack@suse.cz, jgg@ziepe.ca, catalin.marinas@arm.com,
will@kernel.org, mpe@ellerman.id.au, npiggin@gmail.com,
dave.hansen@linux.intel.com, ira.weiny@intel.com,
willy@infradead.org, djwong@kernel.org, tytso@mit.edu,
linmiaohe@huawei.com, david@redhat.com, peterx@redhat.com,
linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-arm-kernel@lists.infradead.org,
linuxppc-dev@lists.ozlabs.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
linux-ext4@vger.kernel.org, linux-xfs@vger.kernel.org,
jhubbard@nvidia.com, hch@lst.de, david@fromorbit.com
Subject: [PATCH v4 07/25] fs/dax: Ensure all pages are idle prior to filesystem unmount
Date: Tue, 17 Dec 2024 16:12:50 +1100 [thread overview]
Message-ID: <f6aea86fad2d670a35ff9d60ba8e9f3f748bbd8c.1734407924.git-series.apopple@nvidia.com> (raw)
In-Reply-To: <cover.18cbcff3638c6aacc051c44533ebc6c002bf2bd9.1734407924.git-series.apopple@nvidia.com>
File systems call dax_break_mapping() prior to reallocating file
system blocks to ensure the page is not undergoing any DMA or other
accesses. Generally this is needed when a file is truncated to ensure
that if a block is reallocated nothing is writing to it. However,
file systems currently don't call this when an FS DAX inode is evicted.
This can cause problems when the file system is unmounted as a page
can still be undergoing DMA or other remote access after
unmount. This means if the file system is remounted any truncate or
other operation which requires the underlying file system block to be
freed will not wait for the remote access to complete. Therefore a
busy block may be reallocated to a new file leading to corruption.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
---
fs/dax.c | 26 ++++++++++++++++++++++++++
fs/ext4/inode.c | 32 ++++++++++++++------------------
fs/xfs/xfs_inode.c | 9 +++++++++
fs/xfs/xfs_inode.h | 1 +
fs/xfs/xfs_super.c | 18 ++++++++++++++++++
include/linux/dax.h | 2 ++
6 files changed, 70 insertions(+), 18 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index cd6cca8..34a7690 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -883,6 +883,14 @@ static int wait_page_idle(struct page *page,
TASK_INTERRUPTIBLE, 0, 0, cb(inode));
}
+static void wait_page_idle_uninterruptible(struct page *page,
+ void (cb)(struct inode *),
+ struct inode *inode)
+{
+ ___wait_var_event(page, page_ref_count(page) == 1,
+ TASK_UNINTERRUPTIBLE, 0, 0, cb(inode));
+}
+
/*
* Unmaps the inode and waits for any DMA to complete prior to deleting the
* DAX mapping entries for the range.
@@ -908,6 +916,24 @@ int dax_break_mapping(struct inode *inode, loff_t start, loff_t end,
}
EXPORT_SYMBOL_GPL(dax_break_mapping);
+void dax_break_mapping_uninterruptible(struct inode *inode,
+ void (cb)(struct inode *))
+{
+ struct page *page;
+
+ do {
+ page = dax_layout_busy_page_range(inode->i_mapping, 0,
+ LLONG_MAX);
+ if (!page)
+ break;
+
+ wait_page_idle_uninterruptible(page, cb, inode);
+ } while (true);
+
+ dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
+}
+EXPORT_SYMBOL_GPL(dax_break_mapping_uninterruptible);
+
/*
* Invalidate DAX entry if it is clean.
*/
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ee8e83f..fa35161 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -163,6 +163,18 @@ int ext4_inode_is_fast_symlink(struct inode *inode)
(inode->i_size < EXT4_N_BLOCKS * 4);
}
+static void ext4_wait_dax_page(struct inode *inode)
+{
+ filemap_invalidate_unlock(inode->i_mapping);
+ schedule();
+ filemap_invalidate_lock(inode->i_mapping);
+}
+
+int ext4_break_layouts(struct inode *inode)
+{
+ return dax_break_mapping_inode(inode, ext4_wait_dax_page);
+}
+
/*
* Called at the last iput() if i_nlink is zero.
*/
@@ -181,6 +193,8 @@ void ext4_evict_inode(struct inode *inode)
trace_ext4_evict_inode(inode);
+ dax_break_mapping_uninterruptible(inode, ext4_wait_dax_page);
+
if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
ext4_evict_ea_inode(inode);
if (inode->i_nlink) {
@@ -3902,24 +3916,6 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
return ret;
}
-static void ext4_wait_dax_page(struct inode *inode)
-{
- filemap_invalidate_unlock(inode->i_mapping);
- schedule();
- filemap_invalidate_lock(inode->i_mapping);
-}
-
-int ext4_break_layouts(struct inode *inode)
-{
- struct page *page;
- int error;
-
- if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
- return -EINVAL;
-
- return dax_break_mapping_inode(inode, ext4_wait_dax_page);
-}
-
/*
* ext4_punch_hole: punches a hole in a file by releasing the blocks
* associated with the given offset and length
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4410b42..c7ec5ab 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2997,6 +2997,15 @@ xfs_break_dax_layouts(
return dax_break_mapping_inode(inode, xfs_wait_dax_page);
}
+void
+xfs_break_dax_layouts_uninterruptible(
+ struct inode *inode)
+{
+ xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
+
+ dax_break_mapping_uninterruptible(inode, xfs_wait_dax_page);
+}
+
int
xfs_break_layouts(
struct inode *inode,
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c4f03f6..613797a 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -594,6 +594,7 @@ xfs_itruncate_extents(
}
int xfs_break_dax_layouts(struct inode *inode);
+void xfs_break_dax_layouts_uninterruptible(struct inode *inode);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8524b9d..73ec060 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -751,6 +751,23 @@ xfs_fs_drop_inode(
return generic_drop_inode(inode);
}
+STATIC void
+xfs_fs_evict_inode(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+
+ if (IS_DAX(inode)) {
+ xfs_ilock(ip, iolock);
+ xfs_break_dax_layouts_uninterruptible(inode);
+ xfs_iunlock(ip, iolock);
+ }
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
static void
xfs_mount_free(
struct xfs_mount *mp)
@@ -1189,6 +1206,7 @@ static const struct super_operations xfs_super_operations = {
.destroy_inode = xfs_fs_destroy_inode,
.dirty_inode = xfs_fs_dirty_inode,
.drop_inode = xfs_fs_drop_inode,
+ .evict_inode = xfs_fs_evict_inode,
.put_super = xfs_fs_put_super,
.sync_fs = xfs_fs_sync_fs,
.freeze_fs = xfs_fs_freeze,
diff --git a/include/linux/dax.h b/include/linux/dax.h
index ef9e02c..7c3773f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -274,6 +274,8 @@ static inline int __must_check dax_break_mapping_inode(struct inode *inode,
{
return dax_break_mapping(inode, 0, LLONG_MAX, cb);
}
+void dax_break_mapping_uninterruptible(struct inode *inode,
+ void (cb)(struct inode *));
int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
struct inode *dest, loff_t destoff,
loff_t len, bool *is_same,
--
git-series 0.9.1
next prev parent reply other threads:[~2024-12-17 5:17 UTC|newest]
Thread overview: 47+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-12-17 5:12 [PATCH v4 00/25] fs/dax: Fix ZONE_DEVICE page reference counts Alistair Popple
2024-12-17 5:12 ` [PATCH v4 01/25] fuse: Fix dax truncate/punch_hole fault path Alistair Popple
2024-12-17 5:12 ` [PATCH v4 02/25] fs/dax: Return unmapped busy pages from dax_layout_busy_page_range() Alistair Popple
2024-12-17 5:12 ` [PATCH v4 03/25] fs/dax: Don't skip locked entries when scanning entries Alistair Popple
2024-12-17 5:12 ` [PATCH v4 04/25] fs/dax: Refactor wait for dax idle page Alistair Popple
2024-12-17 5:12 ` [PATCH v4 05/25] fs/dax: Create a common implementation to break DAX layouts Alistair Popple
2024-12-17 5:12 ` [PATCH v4 06/25] fs/dax: Always remove DAX page-cache entries when breaking layouts Alistair Popple
2024-12-17 5:12 ` Alistair Popple [this message]
2024-12-17 5:12 ` [PATCH v4 08/25] fs/dax: Remove PAGE_MAPPING_DAX_SHARED mapping flag Alistair Popple
2024-12-17 5:12 ` [PATCH v4 09/25] mm/gup.c: Remove redundant check for PCI P2PDMA page Alistair Popple
2024-12-17 22:06 ` David Hildenbrand
2024-12-17 5:12 ` [PATCH v4 10/25] mm/mm_init: Move p2pdma page refcount initialisation to p2pdma Alistair Popple
2024-12-17 22:14 ` David Hildenbrand
2024-12-18 22:49 ` Alistair Popple
2024-12-20 18:29 ` David Hildenbrand
2024-12-17 5:12 ` [PATCH v4 11/25] mm: Allow compound zone device pages Alistair Popple
2024-12-17 5:12 ` [PATCH v4 12/25] mm/memory: Enhance insert_page_into_pte_locked() to create writable mappings Alistair Popple
2024-12-20 19:01 ` David Hildenbrand
2024-12-20 19:06 ` David Hildenbrand
2025-01-06 2:07 ` Alistair Popple
2025-01-07 11:29 ` David Hildenbrand
2024-12-17 5:12 ` [PATCH v4 13/25] mm/memory: Add vmf_insert_page_mkwrite() Alistair Popple
2024-12-17 5:12 ` [PATCH v4 14/25] rmap: Add support for PUD sized mappings to rmap Alistair Popple
2024-12-17 22:27 ` David Hildenbrand
2024-12-18 22:55 ` Alistair Popple
2024-12-20 18:31 ` David Hildenbrand
2024-12-17 5:12 ` [PATCH v4 15/25] huge_memory: Add vmf_insert_folio_pud() Alistair Popple
2024-12-20 18:52 ` David Hildenbrand
2025-01-06 6:39 ` Alistair Popple
2024-12-17 5:12 ` [PATCH v4 16/25] huge_memory: Add vmf_insert_folio_pmd() Alistair Popple
2024-12-20 18:54 ` David Hildenbrand
2024-12-17 5:13 ` [PATCH v4 17/25] memremap: Add is_device_dax_page() and is_fsdax_page() helpers Alistair Popple
2024-12-20 18:39 ` David Hildenbrand
2024-12-17 5:13 ` [PATCH v4 18/25] gup: Don't allow FOLL_LONGTERM pinning of FS DAX pages Alistair Popple
2024-12-17 22:33 ` David Hildenbrand
2024-12-17 5:13 ` [PATCH v4 19/25] proc/task_mmu: Ignore ZONE_DEVICE pages Alistair Popple
2024-12-17 22:31 ` David Hildenbrand
2024-12-18 23:11 ` Alistair Popple
2024-12-20 18:32 ` David Hildenbrand
2025-01-06 6:43 ` Alistair Popple
2024-12-17 5:13 ` [PATCH v4 20/25] mm/mlock: Skip ZONE_DEVICE PMDs during mlock Alistair Popple
2024-12-17 22:28 ` David Hildenbrand
2024-12-17 5:13 ` [PATCH v4 21/25] fs/dax: Properly refcount fs dax pages Alistair Popple
2024-12-17 5:13 ` [PATCH v4 22/25] device/dax: Properly refcount device dax pages when mapping Alistair Popple
2024-12-17 5:13 ` [PATCH v4 23/25] mm: Remove pXX_devmap callers Alistair Popple
2024-12-17 5:13 ` [PATCH v4 24/25] mm: Remove devmap related functions and page table bits Alistair Popple
2024-12-17 5:13 ` [PATCH v4 25/25] Revert "riscv: mm: Add support for ZONE_DEVICE" Alistair Popple
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=f6aea86fad2d670a35ff9d60ba8e9f3f748bbd8c.1734407924.git-series.apopple@nvidia.com \
--to=apopple@nvidia.com \
--cc=akpm@linux-foundation.org \
--cc=bhelgaas@google.com \
--cc=catalin.marinas@arm.com \
--cc=dan.j.williams@intel.com \
--cc=dave.hansen@linux.intel.com \
--cc=dave.jiang@intel.com \
--cc=david@fromorbit.com \
--cc=david@redhat.com \
--cc=djwong@kernel.org \
--cc=gerald.schaefer@linux.ibm.com \
--cc=hch@lst.de \
--cc=ira.weiny@intel.com \
--cc=jack@suse.cz \
--cc=jgg@ziepe.ca \
--cc=jhubbard@nvidia.com \
--cc=lina@asahilina.net \
--cc=linmiaohe@huawei.com \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-cxl@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-xfs@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=logang@deltatee.com \
--cc=mpe@ellerman.id.au \
--cc=npiggin@gmail.com \
--cc=nvdimm@lists.linux.dev \
--cc=peterx@redhat.com \
--cc=tytso@mit.edu \
--cc=vishal.l.verma@intel.com \
--cc=will@kernel.org \
--cc=willy@infradead.org \
--cc=zhang.lyra@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox