From: Josef Bacik <josef@toxicpanda.com>
To: kernel-team@fb.com, linux-fsdevel@vger.kernel.org, jack@suse.cz,
amir73il@gmail.com, brauner@kernel.org,
linux-xfs@vger.kernel.org, linux-bcachefs@vger.kernel.org,
linux-btrfs@vger.kernel.org, linux-mm@kvack.org
Subject: [PATCH v5 14/18] fsnotify: generate pre-content permission event on page fault
Date: Wed, 4 Sep 2024 16:28:04 -0400 [thread overview]
Message-ID: <eb208a363df0afccfafad8078d7563d54513f295.1725481503.git.josef@toxicpanda.com> (raw)
In-Reply-To: <cover.1725481503.git.josef@toxicpanda.com>
FS_PRE_ACCESS or FS_PRE_MODIFY will be generated on page fault depending
on the faulting method.
This pre-content event is meant to be used by hierarchical storage
managers that want to fill in the file content on first read access.
Export a simple helper that file systems that have their own ->fault()
will use, and have a more complicated helper to do fancy things with
in filemap_fault.
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
include/linux/mm.h | 1 +
mm/filemap.c | 116 ++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 110 insertions(+), 7 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ab3d78116043..89665732b404 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3503,6 +3503,7 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf);
extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
+extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf);
extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
diff --git a/mm/filemap.c b/mm/filemap.c
index 8b1684b62177..b2d29947ce7f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -46,6 +46,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
+#include <linux/fsnotify.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -3112,13 +3113,13 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
* that. If we didn't pin a file then we return NULL. The file that is
* returned needs to be fput()'ed when we're done with it.
*/
-static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf,
+ struct file *fpin)
{
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
- struct file *fpin = NULL;
unsigned long vm_flags = vmf->vma->vm_flags;
unsigned int mmap_miss;
@@ -3190,12 +3191,12 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* was pinned if we have to drop the mmap_lock in order to do IO.
*/
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
- struct folio *folio)
+ struct folio *folio,
+ struct file *fpin)
{
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
- struct file *fpin = NULL;
unsigned int mmap_miss;
/* See comment in do_sync_mmap_readahead. */
@@ -3260,6 +3261,93 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
return ret;
}
+/*
+ * If we have pre-content watches on this file we will need to emit an event for
+ * this range. We will handle dropping the lock and emitting the event.
+ *
+ * If this is a retried fault (FAULT_FLAG_TRIED is set) or the file has no
+ * pre-content watches, no event is emitted, *fpin is left untouched and we
+ * return 0.
+ *
+ * If an event would be needed but FAULT_FLAG_RETRY_NOWAIT is set then we
+ * return VM_FAULT_RETRY without emitting anything.
+ *
+ * If any error occurred we will return VM_FAULT_SIGBUS, *fpin could still be
+ * set and will need to have fput() called on it.
+ *
+ * If we emitted the event then we will return 0 and *fpin will be set, this
+ * must have fput() called on it, and the caller must return VM_FAULT_RETRY
+ * after any other operations it does in order to re-fault the page and make
+ * sure the appropriate locking is maintained.
+ *
+ * Return: the appropriate vm_fault_t return code, 0 on success.
+ */
+static vm_fault_t __filemap_fsnotify_fault(struct vm_fault *vmf,
+					   struct file **fpin)
+{
+	struct file *file = vmf->vma->vm_file;
+	/*
+	 * Widen the page index to loff_t before shifting so the byte offset
+	 * is not truncated when the index type is narrower than loff_t.
+	 */
+	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
+	int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
+	int ret;
+
+	/*
+	 * We already did this and now we're retrying with everything locked,
+	 * don't emit the event and continue.
+	 */
+	if (vmf->flags & FAULT_FLAG_TRIED)
+		return 0;
+
+	/* No watches, nothing to emit: report success without touching *fpin. */
+	if (!fsnotify_file_has_pre_content_watches(file))
+		return 0;
+
+	/* We are NOWAIT, we can't wait for the event, ask for a retry instead. */
+	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+		return VM_FAULT_RETRY;
+
+	/*
+	 * If this fails then we're not allowed to drop the fault lock, return a
+	 * SIGBUS so we don't errantly populate pagecache with bogus data for
+	 * this file.
+	 */
+	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
+	if (*fpin == NULL)
+		return VM_FAULT_SIGBUS;
+
+	/*
+	 * We can't fput(*fpin) at this point because we could have been passed
+	 * in fpin from a previous call.
+	 */
+	ret = fsnotify_file_area_perm(*fpin, mask, &pos, PAGE_SIZE);
+	if (ret)
+		return VM_FAULT_SIGBUS;
+
+	return 0;
+}
+
+/**
+ * filemap_fsnotify_fault - maybe emit a pre-content event.
+ * @vmf: struct vm_fault containing details of the fault.
+ *
+ * Thin wrapper around __filemap_fsnotify_fault() for fault handlers that do
+ * not track a pinned file themselves.  Any file reference handed back by the
+ * helper is dropped here.  When a reference was handed back and the helper
+ * reported success, the result is turned into VM_FAULT_RETRY so that the
+ * fault is triggered again and the handler runs a second time through.
+ *
+ * A caller that gets a non-zero value back should return it immediately.
+ *
+ * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
+ */
+vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
+{
+	struct file *pinned = NULL;
+	vm_fault_t rc = __filemap_fsnotify_fault(vmf, &pinned);
+
+	/* Nothing was pinned: pass the helper's verdict straight through. */
+	if (!pinned)
+		return rc;
+
+	fput(pinned);
+
+	/* Keep an error code as-is; otherwise ask for the fault to be retried. */
+	return rc ? rc : VM_FAULT_RETRY;
+}
+EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
+
/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
@@ -3299,6 +3387,17 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
if (unlikely(index >= max_idx))
return VM_FAULT_SIGBUS;
+ /*
+ * If we have pre-content watchers then we need to generate events on
+ * page fault so that we can populate any data before the fault.
+ */
+ ret = __filemap_fsnotify_fault(vmf, &fpin);
+ if (unlikely(ret)) {
+ if (fpin)
+ fput(fpin);
+ return ret;
+ }
+
/*
* Do we have something in the page cache already?
*/
@@ -3309,21 +3408,24 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
* the lock.
*/
if (!(vmf->flags & FAULT_FLAG_TRIED))
- fpin = do_async_mmap_readahead(vmf, folio);
+ fpin = do_async_mmap_readahead(vmf, folio, fpin);
if (unlikely(!folio_test_uptodate(folio))) {
filemap_invalidate_lock_shared(mapping);
mapping_locked = true;
}
} else {
ret = filemap_fault_recheck_pte_none(vmf);
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ if (fpin)
+ goto out_retry;
return ret;
+ }
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
- fpin = do_sync_mmap_readahead(vmf);
+ fpin = do_sync_mmap_readahead(vmf, fpin);
retry_find:
/*
* See comment in filemap_create_folio() why we need
--
2.43.0
next prev parent reply other threads:[~2024-09-04 20:29 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-09-04 20:27 [PATCH v5 00/18] fanotify: add pre-content hooks Josef Bacik
2024-09-04 20:27 ` [PATCH v5 01/18] fanotify: don't skip extra event info if no info_mode is set Josef Bacik
2024-09-05 7:49 ` Amir Goldstein
2024-09-04 20:27 ` [PATCH v5 02/18] fsnotify: introduce pre-content permission event Josef Bacik
2024-09-04 20:27 ` [PATCH v5 03/18] fsnotify: generate pre-content permission event on open Josef Bacik
2024-10-24 13:06 ` Amir Goldstein
2024-09-04 20:27 ` [PATCH v5 04/18] fanotify: introduce FAN_PRE_ACCESS permission event Josef Bacik
2024-09-04 20:27 ` [PATCH v5 05/18] fanotify: introduce FAN_PRE_MODIFY " Josef Bacik
2024-09-04 20:27 ` [PATCH v5 06/18] fanotify: pass optional file access range in pre-content event Josef Bacik
2024-09-04 20:27 ` [PATCH v5 07/18] fanotify: rename a misnamed constant Josef Bacik
2024-09-04 20:27 ` [PATCH v5 08/18] fanotify: report file range info with pre-content events Josef Bacik
2024-09-04 20:27 ` [PATCH v5 09/18] fanotify: allow to set errno in FAN_DENY permission response Josef Bacik
2024-09-04 20:28 ` [PATCH v5 10/18] fs: add a flag to indicate the fs supports pre-content events Josef Bacik
2024-09-05 8:07 ` Amir Goldstein
2024-09-04 20:28 ` [PATCH v5 11/18] fanotify: add a helper to check for pre content events Josef Bacik
2024-09-05 8:09 ` Amir Goldstein
2024-09-04 20:28 ` [PATCH v5 12/18] fanotify: disable readahead if we have pre-content watches Josef Bacik
2024-09-05 8:12 ` Amir Goldstein
2024-09-04 20:28 ` [PATCH v5 13/18] mm: don't allow huge faults for files with pre content watches Josef Bacik
2024-09-05 8:14 ` Amir Goldstein
2024-09-04 20:28 ` Josef Bacik [this message]
2024-09-04 20:28 ` [PATCH v5 15/18] bcachefs: add pre-content fsnotify hook to fault Josef Bacik
2024-09-04 20:28 ` [PATCH v5 16/18] xfs: add pre-content fsnotify hook for write faults Josef Bacik
2024-09-05 8:29 ` Amir Goldstein
2024-09-04 20:28 ` [PATCH v5 17/18] btrfs: disable defrag on pre-content watched files Josef Bacik
2024-09-05 8:23 ` Amir Goldstein
2024-09-04 20:28 ` [PATCH v5 18/18] fs: enable pre-content events on supported file systems Josef Bacik
2024-09-05 8:27 ` Amir Goldstein
2024-09-05 10:36 ` Jan Kara
[not found] ` <CAOQ4uxikusW_q=zdqDKCHz8kGoTyUg1htWhPR1OFAFGHdj-vcQ@mail.gmail.com>
2024-09-05 10:32 ` [PATCH v5 00/18] fanotify: add pre-content hooks Jan Kara
2024-09-05 19:30 ` Josef Bacik
2024-09-05 12:08 ` Jan Kara
2024-09-05 19:29 ` Josef Bacik
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=eb208a363df0afccfafad8078d7563d54513f295.1725481503.git.josef@toxicpanda.com \
--to=josef@toxicpanda.com \
--cc=amir73il@gmail.com \
--cc=brauner@kernel.org \
--cc=jack@suse.cz \
--cc=kernel-team@fb.com \
--cc=linux-bcachefs@vger.kernel.org \
--cc=linux-btrfs@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-xfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox