linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Xu <peterx@redhat.com>,
	David Hildenbrand <david@kernel.org>,
	Lorenzo Stoakes <ljs@kernel.org>, Mike Rapoport <rppt@kernel.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Vlastimil Babka <vbabka@kernel.org>,
	"Liam R . Howlett" <Liam.Howlett@oracle.com>,
	Zi Yan <ziy@nvidia.com>, Jonathan Corbet <corbet@lwn.net>,
	Shuah Khan <skhan@linuxfoundation.org>,
	Sean Christopherson <seanjc@google.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
	kvm@vger.kernel.org, "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Subject: [RFC, PATCH 03/12] userfaultfd: implement UFFDIO_DEACTIVATE ioctl
Date: Tue, 14 Apr 2026 15:23:37 +0100	[thread overview]
Message-ID: <20260414142354.1465950-4-kas@kernel.org> (raw)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>

UFFDIO_DEACTIVATE marks pages as deactivated within a VM_UFFD_MINOR
range:

- Anonymous memory: set protnone via change_protection(MM_CP_UFFD_DEACTIVATE).
  Pages stay resident with PFNs preserved, only permissions removed.
  MM_CP_UFFD_DEACTIVATE is handled independently of MM_CP_PROT_NUMA,
  bypassing the folio_can_map_prot_numa() and CONFIG_NUMA_BALANCING guards.

- Shared shmem/hugetlbfs: zap PTEs via zap_page_range_single().
  Pages stay in page cache.

- Private hugetlb: rejected with -EINVAL (zapping would destroy content).

Cleanup on unregister/close: restore protnone PTEs to normal permissions
in userfaultfd_clear_vma(), so pages are not left permanently inaccessible.

Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
 fs/userfaultfd.c              | 35 ++++++++++++++++
 include/linux/mm.h            |  2 +
 include/linux/userfaultfd_k.h |  2 +
 mm/huge_memory.c              |  9 ++--
 mm/mprotect.c                 |  9 +++-
 mm/userfaultfd.c              | 78 +++++++++++++++++++++++++++++++++--
 6 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8d508ad19e89..b317c9854b86 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1441,6 +1441,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
 			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
 
+		/* DEACTIVATE is only supported for MINOR ranges. */
+		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+			ioctls_out &= ~((__u64)1 << _UFFDIO_DEACTIVATE);
+
 		/*
 		 * Now that we scanned all vmas we can already tell
 		 * userland which ioctls methods are guaranteed to
@@ -1788,6 +1792,34 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 	return ret;
 }
 
+static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
+				  unsigned long arg)
+{
+	int ret;
+	struct uffdio_range uffdio_range;
+
+	if (atomic_read(&ctx->mmap_changing))
+		return -EAGAIN;
+
+	if (copy_from_user(&uffdio_range, (void __user *)arg,
+			   sizeof(uffdio_range)))
+		return -EFAULT;
+
+	ret = validate_range(ctx->mm, uffdio_range.start, uffdio_range.len);
+	if (ret)
+		return ret;
+
+	if (mmget_not_zero(ctx->mm)) {
+		ret = mdeactivate_range(ctx, uffdio_range.start,
+					uffdio_range.len);
+		mmput(ctx->mm);
+	} else {
+		return -ESRCH;
+	}
+
+	return ret;
+}
+
 static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 {
 	__s64 ret;
@@ -2108,6 +2140,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 	case UFFDIO_POISON:
 		ret = userfaultfd_poison(ctx, arg);
 		break;
+	case UFFDIO_DEACTIVATE:
+		ret = userfaultfd_deactivate(ctx, arg);
+		break;
 	}
 	return ret;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abb4963c1f06..fc2841264d56 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3036,6 +3036,8 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 #define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
 #define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
 					    MM_CP_UFFD_WP_RESOLVE)
+/* Whether this change is for uffd deactivation */
+#define  MM_CP_UFFD_DEACTIVATE             (1UL << 4)
 
 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index d1d4ed4a08b0..c94b5c5b5f24 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -130,6 +130,8 @@ extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
 			       unsigned long len, bool enable_wp);
 extern long uffd_wp_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long len, bool enable_wp);
+extern int mdeactivate_range(struct userfaultfd_ctx *ctx, unsigned long start,
+			     unsigned long len);
 
 /* move_pages */
 void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b298cba853ab..2ad736ff007c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2563,6 +2563,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pmd_t oldpmd, entry;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_deactivate = cp_flags & MM_CP_UFFD_DEACTIVATE;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 	int ret = 1;
@@ -2582,8 +2583,11 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		goto unlock;
 	}
 
-	if (prot_numa) {
+	/* Already protnone — nothing to do for either NUMA or uffd */
+	if ((prot_numa || uffd_deactivate) && pmd_protnone(*pmd))
+		goto unlock;
 
+	if (prot_numa) {
 		/*
 		 * Avoid trapping faults against the zero page. The read-only
 		 * data is likely to be read-cached on the local CPU and
@@ -2592,9 +2596,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		if (is_huge_zero_pmd(*pmd))
 			goto unlock;
 
-		if (pmd_protnone(*pmd))
-			goto unlock;
-
 		if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
 					     vma_is_single_threaded_private(vma)))
 			goto unlock;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c0571445bef7..7c612a680014 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -220,6 +220,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 	long pages = 0;
 	bool is_private_single_threaded;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+	bool uffd_deactivate = cp_flags & MM_CP_UFFD_DEACTIVATE;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 	int nr_ptes;
@@ -245,7 +246,8 @@ static long change_pte_range(struct mmu_gather *tlb,
 			pte_t ptent;
 
 			/* Already in the desired state. */
-			if (prot_numa && pte_protnone(oldpte))
+			if ((prot_numa || uffd_deactivate) &&
+			    pte_protnone(oldpte))
 				continue;
 
 			page = vm_normal_page(vma, addr, oldpte);
@@ -255,6 +257,8 @@ static long change_pte_range(struct mmu_gather *tlb,
 			/*
 			 * Avoid trapping faults against the zero or KSM
 			 * pages. See similar comment in change_huge_pmd.
+			 * Skip this filter for uffd deactivation which
+			 * must set protnone regardless of NUMA placement.
 			 */
 			if (prot_numa &&
 			    !folio_can_map_prot_numa(folio, vma,
@@ -651,6 +655,9 @@ long change_protection(struct mmu_gather *tlb,
 	WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
 #endif
 
+	if (cp_flags & MM_CP_UFFD_DEACTIVATE)
+		newprot = PAGE_NONE;
+
 	if (is_vm_hugetlb_page(vma))
 		pages = hugetlb_change_protection(vma, start, end, newprot,
 						  cp_flags);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index dba1ea26fdfe..3373b11b9d83 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -775,7 +775,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
-	if (!vma_is_shmem(dst_vma) &&
+	if (!vma_is_shmem(dst_vma) && !vma_is_anonymous(dst_vma) &&
 	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
 		goto out_unlock;
 
@@ -797,13 +797,16 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 			break;
 		}
 		dst_pmdval = pmdp_get_lockless(dst_pmd);
+		if (unlikely(!pmd_present(dst_pmdval))) {
+			err = -EEXIST;
+			break;
+		}
 		/*
 		 * If the dst_pmd is THP don't override it and just be strict.
 		 * (This includes the case where the PMD used to be THP and
 		 * changed back to none after __pte_alloc().)
 		 */
-		if (unlikely(!pmd_present(dst_pmdval) ||
-				pmd_trans_huge(dst_pmdval))) {
+		if (unlikely(pmd_trans_huge(dst_pmdval))) {
 			err = -EEXIST;
 			break;
 		}
@@ -996,6 +999,65 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
 	return err;
 }
 
+int mdeactivate_range(struct userfaultfd_ctx *ctx, unsigned long start,
+		      unsigned long len)
+{
+	struct mm_struct *dst_mm = ctx->mm;
+	unsigned long end = start + len;
+	struct vm_area_struct *dst_vma;
+	long err;
+	VMA_ITERATOR(vmi, dst_mm, start);
+
+	VM_WARN_ON_ONCE(start & ~PAGE_MASK);
+	VM_WARN_ON_ONCE(len & ~PAGE_MASK);
+	VM_WARN_ON_ONCE(start + len <= start);
+
+	guard(mmap_read_lock)(dst_mm);
+	guard(rwsem_read)(&ctx->map_changing_lock);
+
+	if (atomic_read(&ctx->mmap_changing))
+		return -EAGAIN;
+
+	err = -ENOENT;
+	for_each_vma_range(vmi, dst_vma, end) {
+		unsigned long vma_start = max(dst_vma->vm_start, start);
+		unsigned long vma_end = min(dst_vma->vm_end, end);
+
+		if (!userfaultfd_minor(dst_vma)) {
+			err = -ENOENT;
+			break;
+		}
+
+		/*
+		 * Private hugetlb has no page cache to fall back on —
+		 * zapping PTEs would destroy page content.
+		 */
+		if (is_vm_hugetlb_page(dst_vma) &&
+		    !(dst_vma->vm_flags & VM_SHARED)) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (vma_is_anonymous(dst_vma)) {
+			/* Anonymous: set protnone, pages stay resident */
+			struct mmu_gather tlb;
+
+			tlb_gather_mmu(&tlb, dst_mm);
+			err = change_protection(&tlb, dst_vma, vma_start,
+						vma_end,
+						MM_CP_UFFD_DEACTIVATE);
+			tlb_finish_mmu(&tlb);
+			if (err < 0)
+				break;
+		} else {
+			/* Shared shmem/hugetlb: zap PTEs, pages stay in page cache */
+			zap_page_range_single(dst_vma, vma_start,
+					      vma_end - vma_start, NULL);
+		}
+		err = 0;
+	}
+	return err;
+}
 
 void double_pt_lock(spinlock_t *ptl1,
 		    spinlock_t *ptl2)
@@ -1988,6 +2050,16 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
 	if (userfaultfd_wp(vma))
 		uffd_wp_range(vma, start, end - start, false);
 
+	/* Restore protnone PTEs to normal permissions */
+	if (userfaultfd_minor(vma) && vma_is_anonymous(vma)) {
+		struct mmu_gather tlb;
+
+		tlb_gather_mmu(&tlb, vma->vm_mm);
+		change_protection(&tlb, vma, start, end,
+				  MM_CP_TRY_CHANGE_WRITABLE);
+		tlb_finish_mmu(&tlb);
+	}
+
 	ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
 				    vma->vm_flags & ~__VM_UFFD_FLAGS,
 				    NULL_VM_UFFD_CTX, give_up_on_oom);
-- 
2.51.2



  parent reply	other threads:[~2026-04-14 14:24 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-14 14:23 [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 01/12] userfaultfd: define UAPI constants for anonymous minor faults Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 02/12] userfaultfd: add UFFD_FEATURE_MINOR_ANON registration support Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` Kiryl Shutsemau (Meta) [this message]
2026-04-14 14:23 ` [RFC, PATCH 04/12] userfaultfd: UFFDIO_CONTINUE for anonymous memory Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 05/12] mm: intercept protnone faults on VM_UFFD_MINOR anonymous VMAs Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 06/12] userfaultfd: auto-resolve shmem and hugetlbfs minor faults in async mode Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 07/12] sched/numa: skip scanning anonymous VM_UFFD_MINOR VMAs Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 08/12] userfaultfd: enable UFFD_FEATURE_MINOR_ANON Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 09/12] mm/pagemap: add PAGE_IS_UFFD_DEACTIVATED to PAGEMAP_SCAN Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 10/12] userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 11/12] selftests/mm: add userfaultfd anonymous minor fault tests Kiryl Shutsemau (Meta)
2026-04-14 14:23 ` [RFC, PATCH 12/12] Documentation/userfaultfd: document working set tracking Kiryl Shutsemau (Meta)
2026-04-14 15:28 ` [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory Peter Xu
2026-04-14 17:08   ` Kiryl Shutsemau
2026-04-14 17:45     ` Peter Xu
2026-04-14 15:37 ` David Hildenbrand (Arm)
2026-04-14 17:10   ` Kiryl Shutsemau

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260414142354.1465950-4-kas@kernel.org \
    --to=kas@kernel.org \
    --cc=Liam.Howlett@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=corbet@lwn.net \
    --cc=david@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=pbonzini@redhat.com \
    --cc=peterx@redhat.com \
    --cc=rppt@kernel.org \
    --cc=seanjc@google.com \
    --cc=skhan@linuxfoundation.org \
    --cc=surenb@google.com \
    --cc=vbabka@kernel.org \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox