From: Usama Arif <usama.arif@linux.dev>
To: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Cc: Usama Arif <usama.arif@linux.dev>,
Andrew Morton <akpm@linux-foundation.org>,
Peter Xu <peterx@redhat.com>,
David Hildenbrand <david@kernel.org>,
Lorenzo Stoakes <ljs@kernel.org>, Mike Rapoport <rppt@kernel.org>,
Suren Baghdasaryan <surenb@google.com>,
Vlastimil Babka <vbabka@kernel.org>,
"Liam R . Howlett" <Liam.Howlett@oracle.com>,
Zi Yan <ziy@nvidia.com>, Jonathan Corbet <corbet@lwn.net>,
Shuah Khan <skhan@linuxfoundation.org>,
Sean Christopherson <seanjc@google.com>,
Paolo Bonzini <pbonzini@redhat.com>,
linux-mm@kvack.org, linux-kernel@vger.kernel.org,
linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
kvm@vger.kernel.org
Subject: Re: [RFC, PATCH 10/12] userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle
Date: Wed, 15 Apr 2026 08:08:59 -0700 [thread overview]
Message-ID: <20260415150900.3660575-1-usama.arif@linux.dev> (raw)
In-Reply-To: <20260414142354.1465950-11-kas@kernel.org>
On Tue, 14 Apr 2026 15:23:44 +0100 "Kiryl Shutsemau (Meta)" <kas@kernel.org> wrote:
> Add UFFDIO_SET_MODE ioctl to toggle UFFD_FEATURE_MINOR_ASYNC at
> runtime. Takes mmap_write_lock for serialization against all in-flight
> faults. On sync-to-async transition, wake threads blocked in
> handle_userfault() so they retry and auto-resolve.
>
> Since ctx->features can now be modified concurrently, add
> userfaultfd_features() helper that wraps READ_ONCE() and convert
> all ctx->features reads to use it.
>
> Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
> Assisted-by: Claude:claude-opus-4-6
> ---
> fs/userfaultfd.c | 95 ++++++++++++++++++++++++++++----
> include/uapi/linux/userfaultfd.h | 13 +++++
> 2 files changed, 96 insertions(+), 12 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 43064238fd8d..0edb33599491 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -79,24 +79,33 @@ struct userfaultfd_wake_range {
> /* internal indication that UFFD_API ioctl was successfully executed */
> #define UFFD_FEATURE_INITIALIZED (1u << 31)
>
> +/*
> + * Read ctx->features with READ_ONCE() since UFFDIO_SET_MODE can
> + * modify it concurrently.
> + */
> +static unsigned int userfaultfd_features(struct userfaultfd_ctx *ctx)
> +{
> + return READ_ONCE(ctx->features);
> +}
> +
> static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
> {
> - return ctx->features & UFFD_FEATURE_INITIALIZED;
> + return userfaultfd_features(ctx) & UFFD_FEATURE_INITIALIZED;
> }
>
> static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
> {
> - return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
> + return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_WP_ASYNC);
> }
>
> static bool userfaultfd_minor_anon_ctx(struct userfaultfd_ctx *ctx)
> {
> - return ctx && (ctx->features & UFFD_FEATURE_MINOR_ANON);
> + return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_MINOR_ANON);
> }
>
> static bool userfaultfd_minor_async_ctx(struct userfaultfd_ctx *ctx)
> {
> - return ctx && (ctx->features & UFFD_FEATURE_MINOR_ASYNC);
> + return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_MINOR_ASYNC);
> }
>
> static unsigned int userfaultfd_ctx_flags(struct userfaultfd_ctx *ctx)
> @@ -122,7 +131,7 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
> if (!ctx)
> return false;
>
> - return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
> + return userfaultfd_features(ctx) & UFFD_FEATURE_WP_UNPOPULATED;
> }
>
> static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
> @@ -435,7 +444,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
> /* 0 or > 1 flags set is a bug; we expect exactly 1. */
> VM_WARN_ON_ONCE(!reason || (reason & (reason - 1)));
>
> - if (ctx->features & UFFD_FEATURE_SIGBUS)
> + if (userfaultfd_features(ctx) & UFFD_FEATURE_SIGBUS)
> goto out;
> if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
> goto out;
> @@ -506,7 +515,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
> init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
> uwq.wq.private = current;
> uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
> - reason, ctx->features);
> + reason, userfaultfd_features(ctx));
> uwq.ctx = ctx;
> uwq.waken = false;
>
> @@ -668,7 +677,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> if (!octx)
> return 0;
>
> - if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
> + if (!(userfaultfd_features(octx) & UFFD_FEATURE_EVENT_FORK)) {
> userfaultfd_reset_ctx(vma);
> return 0;
> }
> @@ -774,7 +783,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
> if (!ctx)
> return;
>
> - if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
> + if (userfaultfd_features(ctx) & UFFD_FEATURE_EVENT_REMAP) {
> vm_ctx->ctx = ctx;
> userfaultfd_ctx_get(ctx);
> down_write(&ctx->map_changing_lock);
> @@ -824,7 +833,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> struct userfaultfd_wait_queue ewq;
>
> ctx = vma->vm_userfaultfd_ctx.ctx;
> - if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
> + if (!ctx || !(userfaultfd_features(ctx) & UFFD_FEATURE_EVENT_REMOVE))
> return true;
>
> userfaultfd_ctx_get(ctx);
> @@ -863,7 +872,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
> struct userfaultfd_unmap_ctx *unmap_ctx;
> struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
>
> - if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
> + if (!ctx || !(userfaultfd_features(ctx) & UFFD_FEATURE_EVENT_UNMAP) ||
> has_unmap_ctx(ctx, unmaps, start, end))
> return 0;
>
> @@ -1826,6 +1835,65 @@ static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
> return ret;
> }
>
> +/*
> + * Features that can be toggled at runtime via UFFDIO_SET_MODE.
> + * Only async features that were enabled at UFFDIO_API time may be toggled.
> + */
> +#define UFFD_FEATURE_TOGGLEABLE (UFFD_FEATURE_MINOR_ASYNC)
> +
> +static int userfaultfd_set_mode(struct userfaultfd_ctx *ctx,
> + unsigned long arg)
> +{
> + struct uffdio_set_mode mode;
> + struct mm_struct *mm = ctx->mm;
> +
> + if (copy_from_user(&mode, (void __user *)arg, sizeof(mode)))
> + return -EFAULT;
> +
> + /* enable and disable must not overlap */
> + if (mode.enable & mode.disable)
> + return -EINVAL;
> +
> + /* only toggleable features are allowed */
> + if ((mode.enable | mode.disable) & ~UFFD_FEATURE_TOGGLEABLE)
> + return -EINVAL;
The comment above UFFD_FEATURE_TOGGLEABLE states "Only async features
that were enabled at UFFDIO_API time may be toggled." However, the code
only checks that the requested bits fall within UFFD_FEATURE_TOGGLEABLE;
it never compares them against the features that were actually enabled
at UFFDIO_API time.

Is it intentional that a user who opened a uffd without
UFFD_FEATURE_MINOR_ASYNC can still enable it later via
UFFDIO_SET_MODE?
> +
> + if (!mmget_not_zero(mm))
> + return -ESRCH;
> +
> + /*
> + * mmap_write_lock serializes against all page faults.
> + * After we release, no in-flight faults from the old mode exist.
> + */
> + {
> + unsigned int new_features;
> +
> + mmap_write_lock(mm);
> + new_features = userfaultfd_features(ctx);
> + new_features |= mode.enable;
> + new_features &= ~mode.disable;
> + WRITE_ONCE(ctx->features, new_features);
> + mmap_write_unlock(mm);
> + }
> +
> + /*
> + * If switching to async, wake threads blocked in handle_userfault().
> + * They will retry the fault and auto-resolve under the new mode.
> + * len=0 means wake all pending faults on this context.
> + */
> + if (mode.enable & UFFD_FEATURE_MINOR_ASYNC) {
> + struct userfaultfd_wake_range range = { .len = 0 };
> +
> + spin_lock_irq(&ctx->fault_pending_wqh.lock);
> + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
> + &range);
> + __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
> + spin_unlock_irq(&ctx->fault_pending_wqh.lock);
> + }
> +
> + mmput(mm);
> + return 0;
> +}
>
> static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
> {
> @@ -2150,6 +2218,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
> case UFFDIO_DEACTIVATE:
> ret = userfaultfd_deactivate(ctx, arg);
> break;
> + case UFFDIO_SET_MODE:
> + ret = userfaultfd_set_mode(ctx, arg);
> + break;
> }
> return ret;
> }
> @@ -2177,7 +2248,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
> * protocols: aa:... bb:...
> */
> seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
> - pending, total, UFFD_API, ctx->features,
> + pending, total, UFFD_API, userfaultfd_features(ctx),
> UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
> }
> #endif
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index 775825da2596..f0f14f9db06c 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -84,6 +84,7 @@
> #define _UFFDIO_CONTINUE (0x07)
> #define _UFFDIO_POISON (0x08)
> #define _UFFDIO_DEACTIVATE (0x09)
> +#define _UFFDIO_SET_MODE (0x0A)
> #define _UFFDIO_API (0x3F)
>
> /* userfaultfd ioctl ids */
> @@ -110,6 +111,8 @@
> struct uffdio_poison)
> #define UFFDIO_DEACTIVATE _IOR(UFFDIO, _UFFDIO_DEACTIVATE, \
> struct uffdio_range)
> +#define UFFDIO_SET_MODE _IOW(UFFDIO, _UFFDIO_SET_MODE, \
> + struct uffdio_set_mode)
>
> /* read() structure */
> struct uffd_msg {
> @@ -395,6 +398,16 @@ struct uffdio_move {
> __s64 move;
> };
>
> +struct uffdio_set_mode {
> + /*
> + * Toggle async mode for features at runtime.
> + * Supported: UFFD_FEATURE_MINOR_ASYNC.
> + * Setting a bit in both enable and disable is invalid.
> + */
> + __u64 enable;
> + __u64 disable;
> +};
> +
> /*
> * Flags for the userfaultfd(2) system call itself.
> */
> --
> 2.51.2
>
>
next prev parent reply other threads:[~2026-04-15 15:09 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-14 14:23 [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 01/12] userfaultfd: define UAPI constants for anonymous minor faults Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 02/12] userfaultfd: add UFFD_FEATURE_MINOR_ANON registration support Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 03/12] userfaultfd: implement UFFDIO_DEACTIVATE ioctl Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 04/12] userfaultfd: UFFDIO_CONTINUE for anonymous memory Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 05/12] mm: intercept protnone faults on VM_UFFD_MINOR anonymous VMAs Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 06/12] userfaultfd: auto-resolve shmem and hugetlbfs minor faults in async mode Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 07/12] sched/numa: skip scanning anonymous VM_UFFD_MINOR VMAs Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 08/12] userfaultfd: enable UFFD_FEATURE_MINOR_ANON Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 09/12] mm/pagemap: add PAGE_IS_UFFD_DEACTIVATED to PAGEMAP_SCAN Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 10/12] userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle Kiryl Shutsemau (Meta)
2026-04-15 15:08 ` Usama Arif [this message]
2026-04-14 14:23 ` [RFC, PATCH 11/12] selftests/mm: add userfaultfd anonymous minor fault tests Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 12/12] Documentation/userfaultfd: document working set tracking Kiryl Shutsemau (Meta)
2026-04-14 15:28 ` [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory Peter Xu
2026-04-14 17:08 ` Kiryl Shutsemau
2026-04-14 17:45 ` Peter Xu
2026-04-14 15:37 ` David Hildenbrand (Arm)
2026-04-14 17:10 ` Kiryl Shutsemau
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260415150900.3660575-1-usama.arif@linux.dev \
--to=usama.arif@linux.dev \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=corbet@lwn.net \
--cc=david@kernel.org \
--cc=kas@kernel.org \
--cc=kvm@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=pbonzini@redhat.com \
--cc=peterx@redhat.com \
--cc=rppt@kernel.org \
--cc=seanjc@google.com \
--cc=skhan@linuxfoundation.org \
--cc=surenb@google.com \
--cc=vbabka@kernel.org \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox