From: Wei Yang <richard.weiyang@gmail.com>
To: akpm@linux-foundation.org
Cc: david@redhat.com, lorenzo.stoakes@oracle.com, riel@surriel.com,
vbabka@suse.cz, harry.yoo@oracle.com, jannh@google.com,
baohua@kernel.org, linux-mm@kvack.org,
Wei Yang <richard.weiyang@gmail.com>
Subject: [RFC Patch 1/5] mm: move anon_vma manipulation functions to own file
Date: Tue, 29 Apr 2025 09:06:35 +0000 [thread overview]
Message-ID: <20250429090639.784-2-richard.weiyang@gmail.com> (raw)
In-Reply-To: <20250429090639.784-1-richard.weiyang@gmail.com>
This patch introduces mm/anon_vma.c and moves the anon_vma manipulation
functions into it from mm/rmap.c.
This allows us to build userland test code around them and verify their
functionality ahead of further changes.
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Harry Yoo <harry.yoo@oracle.com>
---
MAINTAINERS | 3 +
include/linux/anon_vma.h | 163 ++++++++++++++++
include/linux/rmap.h | 147 +--------------
mm/Makefile | 2 +-
mm/anon_vma.c | 396 +++++++++++++++++++++++++++++++++++++++
mm/anon_vma_internal.h | 14 ++
mm/rmap.c | 391 --------------------------------------
7 files changed, 578 insertions(+), 538 deletions(-)
create mode 100644 include/linux/anon_vma.h
create mode 100644 mm/anon_vma.c
create mode 100644 mm/anon_vma_internal.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 395cfe3c757d..2b4edb27307f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15575,7 +15575,10 @@ R: Harry Yoo <harry.yoo@oracle.com>
L: linux-mm@kvack.org
S: Maintained
F: include/linux/rmap.h
+F: include/linux/anon_vma.h
F: mm/rmap.c
+F: mm/anon_vma.c
+F: mm/anon_vma_internal.h
MEMORY MANAGEMENT - SECRETMEM
M: Andrew Morton <akpm@linux-foundation.org>
diff --git a/include/linux/anon_vma.h b/include/linux/anon_vma.h
new file mode 100644
index 000000000000..c2f190c786a9
--- /dev/null
+++ b/include/linux/anon_vma.h
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * anon_vma.h
+ */
+#ifndef __ANON_VMA_H
+#define __ANON_VMA_H
+
+#include <linux/mm_types.h>
+
+/*
+ * The anon_vma heads a list of private "related" vmas, to scan if
+ * an anonymous page pointing to this anon_vma needs to be unmapped:
+ * the vmas on the list will be related by forking, or by splitting.
+ *
+ * Since vmas come and go as they are split and merged (particularly
+ * in mprotect), the mapping field of an anonymous page cannot point
+ * directly to a vma: instead it points to an anon_vma, on whose list
+ * the related vmas can be easily linked or unlinked.
+ *
+ * After unlinking the last vma on the list, we must garbage collect
+ * the anon_vma object itself: we're guaranteed no page can be
+ * pointing to this anon_vma once its vma list is empty.
+ */
+struct anon_vma {
+ struct anon_vma *root; /* Root of this anon_vma tree */
+ struct rw_semaphore rwsem; /* W: modification, R: walking the list */
+ /*
+ * The refcount is taken on an anon_vma when there is no
+ * guarantee that the vma of page tables will exist for
+ * the duration of the operation. A caller that takes
+ * the reference is responsible for clearing up the
+ * anon_vma if they are the last user on release
+ */
+ atomic_t refcount;
+
+ /*
+ * Count of child anon_vmas. Equals to the count of all anon_vmas that
+ * have ->parent pointing to this one, including itself.
+ *
+ * This counter is used for making decision about reusing anon_vma
+ * instead of forking new one. See comments in function anon_vma_clone.
+ */
+ unsigned long num_children;
+ /* Count of VMAs whose ->anon_vma pointer points to this object. */
+ unsigned long num_active_vmas;
+
+ struct anon_vma *parent; /* Parent of this anon_vma */
+
+ /*
+ * NOTE: the LSB of the rb_root.rb_node is set by
+ * mm_take_all_locks() _after_ taking the above lock. So the
+ * rb_root must only be read/written after taking the above lock
+ * to be sure to see a valid next pointer. The LSB bit itself
+ * is serialized by a system wide lock only visible to
+ * mm_take_all_locks() (mm_all_locks_mutex).
+ */
+
+ /* Interval tree of private "related" vmas */
+ struct rb_root_cached rb_root;
+};
+
+/*
+ * The copy-on-write semantics of fork mean that an anon_vma
+ * can become associated with multiple processes. Furthermore,
+ * each child process will have its own anon_vma, where new
+ * pages for that process are instantiated.
+ *
+ * This structure allows us to find the anon_vmas associated
+ * with a VMA, or the VMAs associated with an anon_vma.
+ * The "same_vma" list contains the anon_vma_chains linking
+ * all the anon_vmas associated with this VMA.
+ * The "rb" field indexes on an interval tree the anon_vma_chains
+ * which link all the VMAs associated with this anon_vma.
+ */
+struct anon_vma_chain {
+ struct vm_area_struct *vma;
+ struct anon_vma *anon_vma;
+ struct list_head same_vma; /* locked by mmap_lock & page_table_lock */
+ struct rb_node rb; /* locked by anon_vma->rwsem */
+ unsigned long rb_subtree_last;
+#ifdef CONFIG_DEBUG_VM_RB
+ unsigned long cached_vma_start, cached_vma_last;
+#endif
+};
+
+#ifdef CONFIG_MMU
+
+static inline void get_anon_vma(struct anon_vma *anon_vma)
+{
+ atomic_inc(&anon_vma->refcount);
+}
+
+void __put_anon_vma(struct anon_vma *anon_vma);
+
+static inline void put_anon_vma(struct anon_vma *anon_vma)
+{
+ if (atomic_dec_and_test(&anon_vma->refcount))
+ __put_anon_vma(anon_vma);
+}
+
+static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
+{
+ down_write(&anon_vma->root->rwsem);
+}
+
+static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
+{
+ return down_write_trylock(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
+{
+ up_write(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
+{
+ down_read(&anon_vma->root->rwsem);
+}
+
+static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
+{
+ return down_read_trylock(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
+{
+ up_read(&anon_vma->root->rwsem);
+}
+
+
+/*
+ * anon_vma helper functions.
+ */
+void anon_vma_init(void); /* create anon_vma_cachep */
+int __anon_vma_prepare(struct vm_area_struct *);
+void unlink_anon_vmas(struct vm_area_struct *);
+int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
+int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
+
+static inline int anon_vma_prepare(struct vm_area_struct *vma)
+{
+ if (likely(vma->anon_vma))
+ return 0;
+
+ return __anon_vma_prepare(vma);
+}
+
+static inline void anon_vma_merge(struct vm_area_struct *vma,
+ struct vm_area_struct *next)
+{
+ VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
+ unlink_anon_vmas(next);
+}
+
+#else /* !CONFIG_MMU */
+
+#define anon_vma_init() do {} while (0)
+#define anon_vma_prepare(vma) (0)
+
+#endif /* CONFIG_MMU */
+
+#endif /* __ANON_VMA_H */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6b82b618846e..5116d72b8f79 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -14,82 +14,7 @@
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/bit_spinlock.h>
-
-/*
- * The anon_vma heads a list of private "related" vmas, to scan if
- * an anonymous page pointing to this anon_vma needs to be unmapped:
- * the vmas on the list will be related by forking, or by splitting.
- *
- * Since vmas come and go as they are split and merged (particularly
- * in mprotect), the mapping field of an anonymous page cannot point
- * directly to a vma: instead it points to an anon_vma, on whose list
- * the related vmas can be easily linked or unlinked.
- *
- * After unlinking the last vma on the list, we must garbage collect
- * the anon_vma object itself: we're guaranteed no page can be
- * pointing to this anon_vma once its vma list is empty.
- */
-struct anon_vma {
- struct anon_vma *root; /* Root of this anon_vma tree */
- struct rw_semaphore rwsem; /* W: modification, R: walking the list */
- /*
- * The refcount is taken on an anon_vma when there is no
- * guarantee that the vma of page tables will exist for
- * the duration of the operation. A caller that takes
- * the reference is responsible for clearing up the
- * anon_vma if they are the last user on release
- */
- atomic_t refcount;
-
- /*
- * Count of child anon_vmas. Equals to the count of all anon_vmas that
- * have ->parent pointing to this one, including itself.
- *
- * This counter is used for making decision about reusing anon_vma
- * instead of forking new one. See comments in function anon_vma_clone.
- */
- unsigned long num_children;
- /* Count of VMAs whose ->anon_vma pointer points to this object. */
- unsigned long num_active_vmas;
-
- struct anon_vma *parent; /* Parent of this anon_vma */
-
- /*
- * NOTE: the LSB of the rb_root.rb_node is set by
- * mm_take_all_locks() _after_ taking the above lock. So the
- * rb_root must only be read/written after taking the above lock
- * to be sure to see a valid next pointer. The LSB bit itself
- * is serialized by a system wide lock only visible to
- * mm_take_all_locks() (mm_all_locks_mutex).
- */
-
- /* Interval tree of private "related" vmas */
- struct rb_root_cached rb_root;
-};
-
-/*
- * The copy-on-write semantics of fork mean that an anon_vma
- * can become associated with multiple processes. Furthermore,
- * each child process will have its own anon_vma, where new
- * pages for that process are instantiated.
- *
- * This structure allows us to find the anon_vmas associated
- * with a VMA, or the VMAs associated with an anon_vma.
- * The "same_vma" list contains the anon_vma_chains linking
- * all the anon_vmas associated with this VMA.
- * The "rb" field indexes on an interval tree the anon_vma_chains
- * which link all the VMAs associated with this anon_vma.
- */
-struct anon_vma_chain {
- struct vm_area_struct *vma;
- struct anon_vma *anon_vma;
- struct list_head same_vma; /* locked by mmap_lock & page_table_lock */
- struct rb_node rb; /* locked by anon_vma->rwsem */
- unsigned long rb_subtree_last;
-#ifdef CONFIG_DEBUG_VM_RB
- unsigned long cached_vma_start, cached_vma_last;
-#endif
-};
+#include <linux/anon_vma.h>
enum ttu_flags {
TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */
@@ -104,73 +29,6 @@ enum ttu_flags {
};
#ifdef CONFIG_MMU
-static inline void get_anon_vma(struct anon_vma *anon_vma)
-{
- atomic_inc(&anon_vma->refcount);
-}
-
-void __put_anon_vma(struct anon_vma *anon_vma);
-
-static inline void put_anon_vma(struct anon_vma *anon_vma)
-{
- if (atomic_dec_and_test(&anon_vma->refcount))
- __put_anon_vma(anon_vma);
-}
-
-static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
-{
- down_write(&anon_vma->root->rwsem);
-}
-
-static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
-{
- return down_write_trylock(&anon_vma->root->rwsem);
-}
-
-static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
-{
- up_write(&anon_vma->root->rwsem);
-}
-
-static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
-{
- down_read(&anon_vma->root->rwsem);
-}
-
-static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
-{
- return down_read_trylock(&anon_vma->root->rwsem);
-}
-
-static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
-{
- up_read(&anon_vma->root->rwsem);
-}
-
-
-/*
- * anon_vma helper functions.
- */
-void anon_vma_init(void); /* create anon_vma_cachep */
-int __anon_vma_prepare(struct vm_area_struct *);
-void unlink_anon_vmas(struct vm_area_struct *);
-int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
-int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
-
-static inline int anon_vma_prepare(struct vm_area_struct *vma)
-{
- if (likely(vma->anon_vma))
- return 0;
-
- return __anon_vma_prepare(vma);
-}
-
-static inline void anon_vma_merge(struct vm_area_struct *vma,
- struct vm_area_struct *next)
-{
- VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
- unlink_anon_vmas(next);
-}
struct anon_vma *folio_get_anon_vma(const struct folio *folio);
@@ -1020,9 +878,6 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
#else /* !CONFIG_MMU */
-#define anon_vma_init() do {} while (0)
-#define anon_vma_prepare(vma) (0)
-
static inline int folio_referenced(struct folio *folio, int is_locked,
struct mem_cgroup *memcg,
unsigned long *vm_flags)
diff --git a/mm/Makefile b/mm/Makefile
index e7f6bbf8ae5f..468a4a076832 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o vma.o
+ pgtable-generic.o rmap.o vmalloc.o vma.o anon_vma.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
diff --git a/mm/anon_vma.c b/mm/anon_vma.c
new file mode 100644
index 000000000000..321784e1c3eb
--- /dev/null
+++ b/mm/anon_vma.c
@@ -0,0 +1,396 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "anon_vma_internal.h"
+#include <linux/anon_vma.h>
+
+static struct kmem_cache *anon_vma_cachep;
+static struct kmem_cache *anon_vma_chain_cachep;
+
+static inline struct anon_vma *anon_vma_alloc(void)
+{
+ struct anon_vma *anon_vma;
+
+ anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+ if (anon_vma) {
+ atomic_set(&anon_vma->refcount, 1);
+ anon_vma->num_children = 0;
+ anon_vma->num_active_vmas = 0;
+ anon_vma->parent = anon_vma;
+ /*
+ * Initialise the anon_vma root to point to itself. If called
+ * from fork, the root will be reset to the parents anon_vma.
+ */
+ anon_vma->root = anon_vma;
+ }
+
+ return anon_vma;
+}
+
+static inline void anon_vma_free(struct anon_vma *anon_vma)
+{
+ VM_BUG_ON(atomic_read(&anon_vma->refcount));
+
+ /*
+ * Synchronize against folio_lock_anon_vma_read() such that
+ * we can safely hold the lock without the anon_vma getting
+ * freed.
+ *
+ * Relies on the full mb implied by the atomic_dec_and_test() from
+ * put_anon_vma() against the acquire barrier implied by
+ * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
+ *
+ * folio_lock_anon_vma_read() VS put_anon_vma()
+ * down_read_trylock() atomic_dec_and_test()
+ * LOCK MB
+ * atomic_read() rwsem_is_locked()
+ *
+ * LOCK should suffice since the actual taking of the lock must
+ * happen _before_ what follows.
+ */
+ might_sleep();
+ if (rwsem_is_locked(&anon_vma->root->rwsem)) {
+ anon_vma_lock_write(anon_vma);
+ anon_vma_unlock_write(anon_vma);
+ }
+
+ kmem_cache_free(anon_vma_cachep, anon_vma);
+}
+
+void __put_anon_vma(struct anon_vma *anon_vma)
+{
+ struct anon_vma *root = anon_vma->root;
+
+ anon_vma_free(anon_vma);
+ if (root != anon_vma && atomic_dec_and_test(&root->refcount))
+ anon_vma_free(root);
+}
+
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
+{
+ return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
+}
+
+static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
+{
+ kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
+}
+
+static void anon_vma_chain_link(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc,
+ struct anon_vma *anon_vma)
+{
+ avc->vma = vma;
+ avc->anon_vma = anon_vma;
+ list_add(&avc->same_vma, &vma->anon_vma_chain);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
+}
+
+/**
+ * __anon_vma_prepare - attach an anon_vma to a memory region
+ * @vma: the memory region in question
+ *
+ * This makes sure the memory mapping described by 'vma' has
+ * an 'anon_vma' attached to it, so that we can associate the
+ * anonymous pages mapped into it with that anon_vma.
+ *
+ * The common case will be that we already have one, which
+ * is handled inline by anon_vma_prepare(). But if
+ * not we either need to find an adjacent mapping that we
+ * can re-use the anon_vma from (very common when the only
+ * reason for splitting a vma has been mprotect()), or we
+ * allocate a new one.
+ *
+ * Anon-vma allocations are very subtle, because we may have
+ * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
+ * and that may actually touch the rwsem even in the newly
+ * allocated vma (it depends on RCU to make sure that the
+ * anon_vma isn't actually destroyed).
+ *
+ * As a result, we need to do proper anon_vma locking even
+ * for the new allocation. At the same time, we do not want
+ * to do any locking for the common case of already having
+ * an anon_vma.
+ */
+int __anon_vma_prepare(struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct anon_vma *anon_vma, *allocated;
+ struct anon_vma_chain *avc;
+
+ mmap_assert_locked(mm);
+ might_sleep();
+
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto out_enomem;
+
+ anon_vma = find_mergeable_anon_vma(vma);
+ allocated = NULL;
+ if (!anon_vma) {
+ anon_vma = anon_vma_alloc();
+ if (unlikely(!anon_vma))
+ goto out_enomem_free_avc;
+ anon_vma->num_children++; /* self-parent link for new root */
+ allocated = anon_vma;
+ }
+
+ anon_vma_lock_write(anon_vma);
+ /* page_table_lock to protect against threads */
+ spin_lock(&mm->page_table_lock);
+ if (likely(!vma->anon_vma)) {
+ vma->anon_vma = anon_vma;
+ anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma->num_active_vmas++;
+ allocated = NULL;
+ avc = NULL;
+ }
+ spin_unlock(&mm->page_table_lock);
+ anon_vma_unlock_write(anon_vma);
+
+ if (unlikely(allocated))
+ put_anon_vma(allocated);
+ if (unlikely(avc))
+ anon_vma_chain_free(avc);
+
+ return 0;
+
+ out_enomem_free_avc:
+ anon_vma_chain_free(avc);
+ out_enomem:
+ return -ENOMEM;
+}
+
+/*
+ * This is a useful helper function for locking the anon_vma root as
+ * we traverse the vma->anon_vma_chain, looping over anon_vma's that
+ * have the same vma.
+ *
+ * Such anon_vma's should have the same root, so you'd expect to see
+ * just a single down_write() of the root rwsem for the whole traversal.
+ */
+static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
+{
+ struct anon_vma *new_root = anon_vma->root;
+ if (new_root != root) {
+ if (WARN_ON_ONCE(root))
+ up_write(&root->rwsem);
+ root = new_root;
+ down_write(&root->rwsem);
+ }
+ return root;
+}
+
+static inline void unlock_anon_vma_root(struct anon_vma *root)
+{
+ if (root)
+ up_write(&root->rwsem);
+}
+
+/*
+ * Attach the anon_vmas from src to dst.
+ * Returns 0 on success, -ENOMEM on failure.
+ *
+ * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
+ * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
+ * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
+ * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
+ * call, we can identify this case by checking (!dst->anon_vma &&
+ * src->anon_vma).
+ *
+ * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
+ * and reuse existing anon_vma which has no vmas and only one child anon_vma.
+ * This prevents degradation of anon_vma hierarchy to endless linear chain in
+ * case of constantly forking task. On the other hand, an anon_vma with more
+ * than one child isn't reused even if there was no alive vma, thus rmap
+ * walker has a good chance of avoiding scanning the whole hierarchy when it
+ * searches where page is mapped.
+ */
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+{
+ struct anon_vma_chain *avc, *pavc;
+ struct anon_vma *root = NULL;
+
+ list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma;
+
+ avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+ if (unlikely(!avc)) {
+ unlock_anon_vma_root(root);
+ root = NULL;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto enomem_failure;
+ }
+ anon_vma = pavc->anon_vma;
+ root = lock_anon_vma_root(root, anon_vma);
+ anon_vma_chain_link(dst, avc, anon_vma);
+
+ /*
+ * Reuse existing anon_vma if it has no vma and only one
+ * anon_vma child.
+ *
+ * Root anon_vma is never reused:
+ * it has self-parent reference and at least one child.
+ */
+ if (!dst->anon_vma && src->anon_vma &&
+ anon_vma->num_children < 2 &&
+ anon_vma->num_active_vmas == 0)
+ dst->anon_vma = anon_vma;
+ }
+ if (dst->anon_vma)
+ dst->anon_vma->num_active_vmas++;
+ unlock_anon_vma_root(root);
+ return 0;
+
+ enomem_failure:
+ /*
+ * dst->anon_vma is dropped here otherwise its num_active_vmas can
+ * be incorrectly decremented in unlink_anon_vmas().
+ * We can safely do this because callers of anon_vma_clone() don't care
+ * about dst->anon_vma if anon_vma_clone() failed.
+ */
+ dst->anon_vma = NULL;
+ unlink_anon_vmas(dst);
+ return -ENOMEM;
+}
+
+/*
+ * Attach vma to its own anon_vma, as well as to the anon_vmas that
+ * the corresponding VMA in the parent process is attached to.
+ * Returns 0 on success, non-zero on failure.
+ */
+int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
+{
+ struct anon_vma_chain *avc;
+ struct anon_vma *anon_vma;
+ int error;
+
+ /* Don't bother if the parent process has no anon_vma here. */
+ if (!pvma->anon_vma)
+ return 0;
+
+ /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
+ vma->anon_vma = NULL;
+
+ /*
+ * First, attach the new VMA to the parent VMA's anon_vmas,
+ * so rmap can find non-COWed pages in child processes.
+ */
+ error = anon_vma_clone(vma, pvma);
+ if (error)
+ return error;
+
+ /* An existing anon_vma has been reused, all done then. */
+ if (vma->anon_vma)
+ return 0;
+
+ /* Then add our own anon_vma. */
+ anon_vma = anon_vma_alloc();
+ if (!anon_vma)
+ goto out_error;
+ anon_vma->num_active_vmas++;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto out_error_free_anon_vma;
+
+ /*
+ * The root anon_vma's rwsem is the lock actually used when we
+ * lock any of the anon_vmas in this anon_vma tree.
+ */
+ anon_vma->root = pvma->anon_vma->root;
+ anon_vma->parent = pvma->anon_vma;
+ /*
+ * With refcounts, an anon_vma can stay around longer than the
+ * process it belongs to. The root anon_vma needs to be pinned until
+ * this anon_vma is freed, because the lock lives in the root.
+ */
+ get_anon_vma(anon_vma->root);
+ /* Mark this anon_vma as the one where our new (COWed) pages go. */
+ vma->anon_vma = anon_vma;
+ anon_vma_lock_write(anon_vma);
+ anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma->parent->num_children++;
+ anon_vma_unlock_write(anon_vma);
+
+ return 0;
+
+ out_error_free_anon_vma:
+ put_anon_vma(anon_vma);
+ out_error:
+ unlink_anon_vmas(vma);
+ return -ENOMEM;
+}
+
+void unlink_anon_vmas(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc, *next;
+ struct anon_vma *root = NULL;
+
+ /*
+ * Unlink each anon_vma chained to the VMA. This list is ordered
+ * from newest to oldest, ensuring the root anon_vma gets freed last.
+ */
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ root = lock_anon_vma_root(root, anon_vma);
+ anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
+
+ /*
+ * Leave empty anon_vmas on the list - we'll need
+ * to free them outside the lock.
+ */
+ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
+ anon_vma->parent->num_children--;
+ continue;
+ }
+
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
+ }
+ if (vma->anon_vma) {
+ vma->anon_vma->num_active_vmas--;
+
+ /*
+ * The vma may still be needed after unlinking; its anon_vma will be
+ * prepared again when a fault is handled.
+ */
+ vma->anon_vma = NULL;
+ }
+ unlock_anon_vma_root(root);
+
+ /*
+ * Iterate the list once more, it now only contains empty and unlinked
+ * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
+ * needing to write-acquire the anon_vma->root->rwsem.
+ */
+ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ VM_WARN_ON(anon_vma->num_children);
+ VM_WARN_ON(anon_vma->num_active_vmas);
+ put_anon_vma(anon_vma);
+
+ list_del(&avc->same_vma);
+ anon_vma_chain_free(avc);
+ }
+}
+
+static void anon_vma_ctor(void *data)
+{
+ struct anon_vma *anon_vma = data;
+
+ init_rwsem(&anon_vma->rwsem);
+ atomic_set(&anon_vma->refcount, 0);
+ anon_vma->rb_root = RB_ROOT_CACHED;
+}
+
+void __init anon_vma_init(void)
+{
+ anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
+ 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
+ anon_vma_ctor);
+ anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
+ SLAB_PANIC|SLAB_ACCOUNT);
+}
+
diff --git a/mm/anon_vma_internal.h b/mm/anon_vma_internal.h
new file mode 100644
index 000000000000..fa364649dc96
--- /dev/null
+++ b/mm/anon_vma_internal.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * anon_vma_internal.h
+ *
+ * Headers required by anon_vma.c, which can be substituted accordingly when
+ * testing anon_vma functionality.
+ */
+
+#ifndef __MM_ANON_VMA_INTERNAL_H
+#define __MM_ANON_VMA_INTERNAL_H
+
+#include "internal.h"
+
+#endif /* __MM_ANON_VMA_INTERNAL_H */
diff --git a/mm/rmap.c b/mm/rmap.c
index 67bb273dfb80..ec70360b51f2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -58,7 +58,6 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
-#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
@@ -84,387 +83,6 @@
#include "internal.h"
-static struct kmem_cache *anon_vma_cachep;
-static struct kmem_cache *anon_vma_chain_cachep;
-
-static inline struct anon_vma *anon_vma_alloc(void)
-{
- struct anon_vma *anon_vma;
-
- anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
- if (anon_vma) {
- atomic_set(&anon_vma->refcount, 1);
- anon_vma->num_children = 0;
- anon_vma->num_active_vmas = 0;
- anon_vma->parent = anon_vma;
- /*
- * Initialise the anon_vma root to point to itself. If called
- * from fork, the root will be reset to the parents anon_vma.
- */
- anon_vma->root = anon_vma;
- }
-
- return anon_vma;
-}
-
-static inline void anon_vma_free(struct anon_vma *anon_vma)
-{
- VM_BUG_ON(atomic_read(&anon_vma->refcount));
-
- /*
- * Synchronize against folio_lock_anon_vma_read() such that
- * we can safely hold the lock without the anon_vma getting
- * freed.
- *
- * Relies on the full mb implied by the atomic_dec_and_test() from
- * put_anon_vma() against the acquire barrier implied by
- * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
- *
- * folio_lock_anon_vma_read() VS put_anon_vma()
- * down_read_trylock() atomic_dec_and_test()
- * LOCK MB
- * atomic_read() rwsem_is_locked()
- *
- * LOCK should suffice since the actual taking of the lock must
- * happen _before_ what follows.
- */
- might_sleep();
- if (rwsem_is_locked(&anon_vma->root->rwsem)) {
- anon_vma_lock_write(anon_vma);
- anon_vma_unlock_write(anon_vma);
- }
-
- kmem_cache_free(anon_vma_cachep, anon_vma);
-}
-
-static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
-{
- return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
-}
-
-static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
-{
- kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
-}
-
-static void anon_vma_chain_link(struct vm_area_struct *vma,
- struct anon_vma_chain *avc,
- struct anon_vma *anon_vma)
-{
- avc->vma = vma;
- avc->anon_vma = anon_vma;
- list_add(&avc->same_vma, &vma->anon_vma_chain);
- anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
-}
-
-/**
- * __anon_vma_prepare - attach an anon_vma to a memory region
- * @vma: the memory region in question
- *
- * This makes sure the memory mapping described by 'vma' has
- * an 'anon_vma' attached to it, so that we can associate the
- * anonymous pages mapped into it with that anon_vma.
- *
- * The common case will be that we already have one, which
- * is handled inline by anon_vma_prepare(). But if
- * not we either need to find an adjacent mapping that we
- * can re-use the anon_vma from (very common when the only
- * reason for splitting a vma has been mprotect()), or we
- * allocate a new one.
- *
- * Anon-vma allocations are very subtle, because we may have
- * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
- * and that may actually touch the rwsem even in the newly
- * allocated vma (it depends on RCU to make sure that the
- * anon_vma isn't actually destroyed).
- *
- * As a result, we need to do proper anon_vma locking even
- * for the new allocation. At the same time, we do not want
- * to do any locking for the common case of already having
- * an anon_vma.
- */
-int __anon_vma_prepare(struct vm_area_struct *vma)
-{
- struct mm_struct *mm = vma->vm_mm;
- struct anon_vma *anon_vma, *allocated;
- struct anon_vma_chain *avc;
-
- mmap_assert_locked(mm);
- might_sleep();
-
- avc = anon_vma_chain_alloc(GFP_KERNEL);
- if (!avc)
- goto out_enomem;
-
- anon_vma = find_mergeable_anon_vma(vma);
- allocated = NULL;
- if (!anon_vma) {
- anon_vma = anon_vma_alloc();
- if (unlikely(!anon_vma))
- goto out_enomem_free_avc;
- anon_vma->num_children++; /* self-parent link for new root */
- allocated = anon_vma;
- }
-
- anon_vma_lock_write(anon_vma);
- /* page_table_lock to protect against threads */
- spin_lock(&mm->page_table_lock);
- if (likely(!vma->anon_vma)) {
- vma->anon_vma = anon_vma;
- anon_vma_chain_link(vma, avc, anon_vma);
- anon_vma->num_active_vmas++;
- allocated = NULL;
- avc = NULL;
- }
- spin_unlock(&mm->page_table_lock);
- anon_vma_unlock_write(anon_vma);
-
- if (unlikely(allocated))
- put_anon_vma(allocated);
- if (unlikely(avc))
- anon_vma_chain_free(avc);
-
- return 0;
-
- out_enomem_free_avc:
- anon_vma_chain_free(avc);
- out_enomem:
- return -ENOMEM;
-}
-
-/*
- * This is a useful helper function for locking the anon_vma root as
- * we traverse the vma->anon_vma_chain, looping over anon_vma's that
- * have the same vma.
- *
- * Such anon_vma's should have the same root, so you'd expect to see
- * just a single mutex_lock for the whole traversal.
- */
-static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
-{
- struct anon_vma *new_root = anon_vma->root;
- if (new_root != root) {
- if (WARN_ON_ONCE(root))
- up_write(&root->rwsem);
- root = new_root;
- down_write(&root->rwsem);
- }
- return root;
-}
-
-static inline void unlock_anon_vma_root(struct anon_vma *root)
-{
- if (root)
- up_write(&root->rwsem);
-}
-
-/*
- * Attach the anon_vmas from src to dst.
- * Returns 0 on success, -ENOMEM on failure.
- *
- * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
- * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
- * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
- * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
- * call, we can identify this case by checking (!dst->anon_vma &&
- * src->anon_vma).
- *
- * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
- * and reuse existing anon_vma which has no vmas and only one child anon_vma.
- * This prevents degradation of anon_vma hierarchy to endless linear chain in
- * case of constantly forking task. On the other hand, an anon_vma with more
- * than one child isn't reused even if there was no alive vma, thus rmap
- * walker has a good chance of avoiding scanning the whole hierarchy when it
- * searches where page is mapped.
- */
-int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
-{
- struct anon_vma_chain *avc, *pavc;
- struct anon_vma *root = NULL;
-
- list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
- struct anon_vma *anon_vma;
-
- avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
- if (unlikely(!avc)) {
- unlock_anon_vma_root(root);
- root = NULL;
- avc = anon_vma_chain_alloc(GFP_KERNEL);
- if (!avc)
- goto enomem_failure;
- }
- anon_vma = pavc->anon_vma;
- root = lock_anon_vma_root(root, anon_vma);
- anon_vma_chain_link(dst, avc, anon_vma);
-
- /*
- * Reuse existing anon_vma if it has no vma and only one
- * anon_vma child.
- *
- * Root anon_vma is never reused:
- * it has self-parent reference and at least one child.
- */
- if (!dst->anon_vma && src->anon_vma &&
- anon_vma->num_children < 2 &&
- anon_vma->num_active_vmas == 0)
- dst->anon_vma = anon_vma;
- }
- if (dst->anon_vma)
- dst->anon_vma->num_active_vmas++;
- unlock_anon_vma_root(root);
- return 0;
-
- enomem_failure:
- /*
- * dst->anon_vma is dropped here otherwise its num_active_vmas can
- * be incorrectly decremented in unlink_anon_vmas().
- * We can safely do this because callers of anon_vma_clone() don't care
- * about dst->anon_vma if anon_vma_clone() failed.
- */
- dst->anon_vma = NULL;
- unlink_anon_vmas(dst);
- return -ENOMEM;
-}
-
-/*
- * Attach vma to its own anon_vma, as well as to the anon_vmas that
- * the corresponding VMA in the parent process is attached to.
- * Returns 0 on success, non-zero on failure.
- */
-int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
-{
- struct anon_vma_chain *avc;
- struct anon_vma *anon_vma;
- int error;
-
- /* Don't bother if the parent process has no anon_vma here. */
- if (!pvma->anon_vma)
- return 0;
-
- /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
- vma->anon_vma = NULL;
-
- /*
- * First, attach the new VMA to the parent VMA's anon_vmas,
- * so rmap can find non-COWed pages in child processes.
- */
- error = anon_vma_clone(vma, pvma);
- if (error)
- return error;
-
- /* An existing anon_vma has been reused, all done then. */
- if (vma->anon_vma)
- return 0;
-
- /* Then add our own anon_vma. */
- anon_vma = anon_vma_alloc();
- if (!anon_vma)
- goto out_error;
- anon_vma->num_active_vmas++;
- avc = anon_vma_chain_alloc(GFP_KERNEL);
- if (!avc)
- goto out_error_free_anon_vma;
-
- /*
- * The root anon_vma's rwsem is the lock actually used when we
- * lock any of the anon_vmas in this anon_vma tree.
- */
- anon_vma->root = pvma->anon_vma->root;
- anon_vma->parent = pvma->anon_vma;
- /*
- * With refcounts, an anon_vma can stay around longer than the
- * process it belongs to. The root anon_vma needs to be pinned until
- * this anon_vma is freed, because the lock lives in the root.
- */
- get_anon_vma(anon_vma->root);
- /* Mark this anon_vma as the one where our new (COWed) pages go. */
- vma->anon_vma = anon_vma;
- anon_vma_lock_write(anon_vma);
- anon_vma_chain_link(vma, avc, anon_vma);
- anon_vma->parent->num_children++;
- anon_vma_unlock_write(anon_vma);
-
- return 0;
-
- out_error_free_anon_vma:
- put_anon_vma(anon_vma);
- out_error:
- unlink_anon_vmas(vma);
- return -ENOMEM;
-}
-
-void unlink_anon_vmas(struct vm_area_struct *vma)
-{
- struct anon_vma_chain *avc, *next;
- struct anon_vma *root = NULL;
-
- /*
- * Unlink each anon_vma chained to the VMA. This list is ordered
- * from newest to oldest, ensuring the root anon_vma gets freed last.
- */
- list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
- struct anon_vma *anon_vma = avc->anon_vma;
-
- root = lock_anon_vma_root(root, anon_vma);
- anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
-
- /*
- * Leave empty anon_vmas on the list - we'll need
- * to free them outside the lock.
- */
- if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
- anon_vma->parent->num_children--;
- continue;
- }
-
- list_del(&avc->same_vma);
- anon_vma_chain_free(avc);
- }
- if (vma->anon_vma) {
- vma->anon_vma->num_active_vmas--;
-
- /*
- * vma would still be needed after unlink, and anon_vma will be prepared
- * when handle fault.
- */
- vma->anon_vma = NULL;
- }
- unlock_anon_vma_root(root);
-
- /*
- * Iterate the list once more, it now only contains empty and unlinked
- * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
- * needing to write-acquire the anon_vma->root->rwsem.
- */
- list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
- struct anon_vma *anon_vma = avc->anon_vma;
-
- VM_WARN_ON(anon_vma->num_children);
- VM_WARN_ON(anon_vma->num_active_vmas);
- put_anon_vma(anon_vma);
-
- list_del(&avc->same_vma);
- anon_vma_chain_free(avc);
- }
-}
-
-static void anon_vma_ctor(void *data)
-{
- struct anon_vma *anon_vma = data;
-
- init_rwsem(&anon_vma->rwsem);
- atomic_set(&anon_vma->refcount, 0);
- anon_vma->rb_root = RB_ROOT_CACHED;
-}
-
-void __init anon_vma_init(void)
-{
- anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
- 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
- anon_vma_ctor);
- anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
- SLAB_PANIC|SLAB_ACCOUNT);
-}
/*
* Getting a lock on a stable anon_vma from a page off the LRU is tricky!
@@ -2749,15 +2367,6 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif
-void __put_anon_vma(struct anon_vma *anon_vma)
-{
- struct anon_vma *root = anon_vma->root;
-
- anon_vma_free(anon_vma);
- if (root != anon_vma && atomic_dec_and_test(&root->refcount))
- anon_vma_free(root);
-}
-
static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
struct rmap_walk_control *rwc)
{
--
2.34.1
next prev parent reply other threads:[~2025-04-29 9:07 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-04-29 9:06 [RFC Patch 0/5] Make anon_vma operations testable Wei Yang
2025-04-29 9:06 ` Wei Yang [this message]
2025-04-29 9:06 ` [RFC Patch 2/5] anon_vma: add skeleton code for userland testing of anon_vma logic Wei Yang
2025-05-01 1:31 ` Wei Yang
2025-05-01 9:41 ` Lorenzo Stoakes
2025-05-01 14:45 ` Wei Yang
2025-04-29 9:06 ` [RFC Patch 3/5] anon_vma: add test for mergeable anon_vma Wei Yang
2025-04-29 9:06 ` [RFC Patch 4/5] anon_vma: add test for reusable anon_vma Wei Yang
2025-04-29 9:06 ` [RFC Patch 5/5] anon_vma: add test to assert no double-reuse Wei Yang
2025-04-29 9:31 ` [RFC Patch 0/5] Make anon_vma operations testable Lorenzo Stoakes
2025-04-29 9:38 ` David Hildenbrand
2025-04-29 9:41 ` Lorenzo Stoakes
2025-04-29 23:56 ` Wei Yang
2025-04-30 7:47 ` David Hildenbrand
2025-04-30 15:44 ` Wei Yang
2025-04-30 21:36 ` David Hildenbrand
2025-05-14 1:23 ` Wei Yang
2025-05-27 6:34 ` Wei Yang
2025-05-27 11:31 ` David Hildenbrand
2025-05-28 1:17 ` Wei Yang
2025-05-30 2:11 ` Wei Yang
2025-05-30 8:00 ` David Hildenbrand
2025-05-30 14:05 ` Wei Yang
2025-05-30 14:39 ` David Hildenbrand
2025-05-30 23:23 ` Wei Yang
2025-06-03 21:31 ` David Hildenbrand
2025-04-29 23:15 ` Wei Yang
2025-04-30 14:38 ` Lorenzo Stoakes
2025-04-30 15:41 ` Wei Yang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250429090639.784-2-richard.weiyang@gmail.com \
--to=richard.weiyang@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=baohua@kernel.org \
--cc=david@redhat.com \
--cc=harry.yoo@oracle.com \
--cc=jannh@google.com \
--cc=linux-mm@kvack.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=riel@surriel.com \
--cc=vbabka@suse.cz \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox