* [PATCH 1/8] mm/rmap: improve anon_vma_clone(), unlink_anon_vmas() comments, add asserts
2025-12-17 12:27 [PATCH 0/8] mm: clean up anon_vma implementation Lorenzo Stoakes
@ 2025-12-17 12:27 ` Lorenzo Stoakes
2025-12-19 18:22 ` Liam R. Howlett
2025-12-17 12:27 ` [PATCH 2/8] mm/rmap: skip unfaulted VMAs on anon_vma clone, unlink Lorenzo Stoakes
` (6 subsequent siblings)
7 siblings, 1 reply; 20+ messages in thread
From: Lorenzo Stoakes @ 2025-12-17 12:27 UTC (permalink / raw)
To: Andrew Morton
Cc: Suren Baghdasaryan, Liam R . Howlett, Vlastimil Babka,
Shakeel Butt, David Hildenbrand, Rik van Riel, Harry Yoo,
Jann Horn, Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li,
Barry Song, linux-mm, linux-kernel
Add kdoc comments, describe exactly what these functions are used for in
detail, pointing out importantly that the anon_vma_clone() !dst->anon_vma
&& src->anon_vma dance is ONLY for fork.
Both are confusing functions that will be refactored in a subsequent patch
but the first stage is establishing documentation and some invariants.
Add some basic CONFIG_DEBUG_VM asserts that help document expected state,
specifically:
anon_vma_clone()
- mmap write lock held.
- We do nothing if src VMA is not faulted.
- The destination VMA has no anon_vma_chain yet.
- We are always operating on the same active VMA (i.e. vma->anon-vma).
- If not forking, must operate on the same mm_struct.
unlink_anon_vmas()
- mmap lock held (read on unmap downgraded).
- That unfaulted VMAs are no-ops.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/rmap.c | 82 +++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 64 insertions(+), 18 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index d6799afe1114..0e34c0a69fbc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -257,30 +257,60 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
up_write(&root->rwsem);
}
-/*
- * Attach the anon_vmas from src to dst.
- * Returns 0 on success, -ENOMEM on failure.
- *
- * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
- * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
- * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
- * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
- * call, we can identify this case by checking (!dst->anon_vma &&
- * src->anon_vma).
- *
- * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
- * and reuse existing anon_vma which has no vmas and only one child anon_vma.
- * This prevents degradation of anon_vma hierarchy to endless linear chain in
- * case of constantly forking task. On the other hand, an anon_vma with more
- * than one child isn't reused even if there was no alive vma, thus rmap
- * walker has a good chance of avoiding scanning the whole hierarchy when it
- * searches where page is mapped.
+static void check_anon_vma_clone(struct vm_area_struct *dst,
+ struct vm_area_struct *src)
+{
+ /* The write lock must be held. */
+ mmap_assert_write_locked(src->vm_mm);
+ /* If not a fork (implied by dst->anon_vma) then must be on same mm. */
+ VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm);
+
+ /* No source anon_vma is a no-op. */
+ VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
+ VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma);
+ /* We are establishing a new anon_vma_chain. */
+ VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain));
+ /*
+ * On fork, dst->anon_vma is set NULL (temporarily). Otherwise, anon_vma
+ * must be the same across dst and src.
+ */
+ VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
+}
+
+/**
+ * anon_vma_clone - Establishes new anon_vma_chain objects in @dst linking to
+ * all of the anon_vma objects contained within @src anon_vma_chain's.
+ * @dst: The destination VMA with an empty anon_vma_chain.
+ * @src: The source VMA we wish to duplicate.
+ *
+ * This is the heart of the VMA side of the anon_vma implementation - we invoke
+ * this function whenever we need to set up a new VMA's anon_vma state.
+ *
+ * This is invoked for:
+ *
+ * - VMA Merge, but only when @dst is unfaulted and @src is faulted - meaning we
+ * clone @src into @dst.
+ * - VMA split.
+ * - VMA (m)remap.
+ * - Fork of faulted VMA.
+ *
+ * In all cases other than fork this is simply a duplication. Fork additionally
+ * adds a new active anon_vma.
+ *
+ * ONLY in the case of fork do we try to 'reuse' existing anon_vma's in an
+ * anon_vma hierarchy, reusing anon_vma's which have no VMA associated with them
+ * but do have a single child. This is to avoid waste of memory when repeatedly
+ * forking.
+ *
+ * Returns: 0 on success, -ENOMEM on failure.
*/
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
struct anon_vma_chain *avc, *pavc;
struct anon_vma *root = NULL;
+ check_anon_vma_clone(dst, src);
+
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma;
@@ -392,11 +422,27 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
return -ENOMEM;
}
+/**
+ * unlink_anon_vmas() - remove all links between a VMA and anon_vma's, freeing
+ * anon_vma_chain objects.
+ * @vma: The VMA whose links to anon_vma objects is to be severed.
+ *
+ * As part of the process anon_vma_chain's are freed,
+ * anon_vma->num_children,num_active_vmas is updated as required and, if the
+ * relevant anon_vma references no further VMAs, its reference count is
+ * decremented.
+ */
void unlink_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
struct anon_vma *root = NULL;
+ /* Always hold mmap lock, read-lock on unmap possibly. */
+ mmap_assert_locked(vma->vm_mm);
+
+ /* Unfaulted is a no-op. */
+ VM_WARN_ON_ONCE(!vma->anon_vma && !list_empty(&vma->anon_vma_chain));
+
/*
* Unlink each anon_vma chained to the VMA. This list is ordered
* from newest to oldest, ensuring the root anon_vma gets freed last.
--
2.52.0
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 1/8] mm/rmap: improve anon_vma_clone(), unlink_anon_vmas() comments, add asserts
2025-12-17 12:27 ` [PATCH 1/8] mm/rmap: improve anon_vma_clone(), unlink_anon_vmas() comments, add asserts Lorenzo Stoakes
@ 2025-12-19 18:22 ` Liam R. Howlett
2025-12-29 21:18 ` Suren Baghdasaryan
0 siblings, 1 reply; 20+ messages in thread
From: Liam R. Howlett @ 2025-12-19 18:22 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Andrew Morton, Suren Baghdasaryan, Vlastimil Babka, Shakeel Butt,
David Hildenbrand, Rik van Riel, Harry Yoo, Jann Horn,
Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li, Barry Song,
linux-mm, linux-kernel
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [251217 07:27]:
> Add kdoc comments, describe exactly what these functions are used for in
> detail, pointing out importantly that the anon_vma_clone() !dst->anon_vma
> && src->anon_vma dance is ONLY for fork.
>
> Both are confusing functions that will be refactored in a subsequent patch
> but the first stage is establishing documentation and some invariants.
>
> Add some basic CONFIG_DEBUG_VM asserts that help document expected state,
> specifically:
>
> anon_vma_clone()
> - mmap write lock held.
> - We do nothing if src VMA is not faulted.
> - The destination VMA has no anon_vma_chain yet.
> - We are always operating on the same active VMA (i.e. vma->anon-vma).
> - If not forking, must operate on the same mm_struct.
>
> unlink_anon_vmas()
> - mmap lock held (read on unmap downgraded).
> - That unfaulted VMAs are no-ops.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
> ---
> mm/rmap.c | 82 +++++++++++++++++++++++++++++++++++++++++++------------
> 1 file changed, 64 insertions(+), 18 deletions(-)
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index d6799afe1114..0e34c0a69fbc 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -257,30 +257,60 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
> up_write(&root->rwsem);
> }
>
> -/*
> - * Attach the anon_vmas from src to dst.
> - * Returns 0 on success, -ENOMEM on failure.
> - *
> - * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
> - * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
> - * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
> - * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
> - * call, we can identify this case by checking (!dst->anon_vma &&
> - * src->anon_vma).
> - *
> - * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
> - * and reuse existing anon_vma which has no vmas and only one child anon_vma.
> - * This prevents degradation of anon_vma hierarchy to endless linear chain in
> - * case of constantly forking task. On the other hand, an anon_vma with more
> - * than one child isn't reused even if there was no alive vma, thus rmap
> - * walker has a good chance of avoiding scanning the whole hierarchy when it
> - * searches where page is mapped.
> +static void check_anon_vma_clone(struct vm_area_struct *dst,
> + struct vm_area_struct *src)
> +{
> + /* The write lock must be held. */
> + mmap_assert_write_locked(src->vm_mm);
> + /* If not a fork (implied by dst->anon_vma) then must be on same mm. */
> + VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm);
> +
> + /* No source anon_vma is a no-op. */
> + VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
> + VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma);
> + /* We are establishing a new anon_vma_chain. */
> + VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain));
> + /*
> + * On fork, dst->anon_vma is set NULL (temporarily). Otherwise, anon_vma
> + * must be the same across dst and src.
> + */
> + VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
> +}
> +
> +/**
> + * anon_vma_clone - Establishes new anon_vma_chain objects in @dst linking to
> + * all of the anon_vma objects contained within @src anon_vma_chain's.
> + * @dst: The destination VMA with an empty anon_vma_chain.
> + * @src: The source VMA we wish to duplicate.
> + *
> + * This is the heart of the VMA side of the anon_vma implementation - we invoke
> + * this function whenever we need to set up a new VMA's anon_vma state.
> + *
> + * This is invoked for:
> + *
> + * - VMA Merge, but only when @dst is unfaulted and @src is faulted - meaning we
> + * clone @src into @dst.
> + * - VMA split.
> + * - VMA (m)remap.
> + * - Fork of faulted VMA.
> + *
> + * In all cases other than fork this is simply a duplication. Fork additionally
> + * adds a new active anon_vma.
> + *
> + * ONLY in the case of fork do we try to 'reuse' existing anon_vma's in an
> + * anon_vma hierarchy, reusing anon_vma's which have no VMA associated with them
> + * but do have a single child. This is to avoid waste of memory when repeatedly
> + * forking.
> + *
> + * Returns: 0 on success, -ENOMEM on failure.
> */
> int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> {
> struct anon_vma_chain *avc, *pavc;
> struct anon_vma *root = NULL;
>
> + check_anon_vma_clone(dst, src);
> +
> list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
> struct anon_vma *anon_vma;
>
> @@ -392,11 +422,27 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
> return -ENOMEM;
> }
>
> +/**
> + * unlink_anon_vmas() - remove all links between a VMA and anon_vma's, freeing
> + * anon_vma_chain objects.
> + * @vma: The VMA whose links to anon_vma objects is to be severed.
> + *
> + * As part of the process anon_vma_chain's are freed,
> + * anon_vma->num_children,num_active_vmas is updated as required and, if the
> + * relevant anon_vma references no further VMAs, its reference count is
> + * decremented.
> + */
> void unlink_anon_vmas(struct vm_area_struct *vma)
> {
> struct anon_vma_chain *avc, *next;
> struct anon_vma *root = NULL;
>
> + /* Always hold mmap lock, read-lock on unmap possibly. */
> + mmap_assert_locked(vma->vm_mm);
> +
> + /* Unfaulted is a no-op. */
> + VM_WARN_ON_ONCE(!vma->anon_vma && !list_empty(&vma->anon_vma_chain));
> +
> /*
> * Unlink each anon_vma chained to the VMA. This list is ordered
> * from newest to oldest, ensuring the root anon_vma gets freed last.
> --
> 2.52.0
>
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 1/8] mm/rmap: improve anon_vma_clone(), unlink_anon_vmas() comments, add asserts
2025-12-19 18:22 ` Liam R. Howlett
@ 2025-12-29 21:18 ` Suren Baghdasaryan
2025-12-30 21:21 ` Suren Baghdasaryan
0 siblings, 1 reply; 20+ messages in thread
From: Suren Baghdasaryan @ 2025-12-29 21:18 UTC (permalink / raw)
To: Liam R. Howlett, Lorenzo Stoakes, Andrew Morton,
Suren Baghdasaryan, Vlastimil Babka, Shakeel Butt,
David Hildenbrand, Rik van Riel, Harry Yoo, Jann Horn,
Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li, Barry Song,
linux-mm, linux-kernel
On Fri, Dec 19, 2025 at 10:22 AM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [251217 07:27]:
> > Add kdoc comments, describe exactly what these functions are used for in
> > detail, pointing out importantly that the anon_vma_clone() !dst->anon_vma
> > && src->anon_vma dance is ONLY for fork.
> >
> > Both are confusing functions that will be refactored in a subsequent patch
> > but the first stage is establishing documentation and some invariants.
> >
> > Add some basic CONFIG_DEBUG_VM asserts that help document expected state,
> > specifically:
> >
> > anon_vma_clone()
> > - mmap write lock held.
> > - We do nothing if src VMA is not faulted.
> > - The destination VMA has no anon_vma_chain yet.
> > - We are always operating on the same active VMA (i.e. vma->anon-vma).
nit: s/vma->anon-vma/vma->anon_vma
> > - If not forking, must operate on the same mm_struct.
> >
> > unlink_anon_vmas()
> > - mmap lock held (read on unmap downgraded).
Out of curiosity I looked for the place where unlink_anon_vmas() is
called with mmap_lock downgraded to read but could not find it. Could
you please point me to it?
> > - That unfaulted VMAs are no-ops.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
>
> > ---
> > mm/rmap.c | 82 +++++++++++++++++++++++++++++++++++++++++++------------
> > 1 file changed, 64 insertions(+), 18 deletions(-)
> >
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index d6799afe1114..0e34c0a69fbc 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -257,30 +257,60 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
> > up_write(&root->rwsem);
> > }
> >
> > -/*
> > - * Attach the anon_vmas from src to dst.
> > - * Returns 0 on success, -ENOMEM on failure.
> > - *
> > - * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
> > - * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
> > - * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
> > - * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
> > - * call, we can identify this case by checking (!dst->anon_vma &&
> > - * src->anon_vma).
> > - *
> > - * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
> > - * and reuse existing anon_vma which has no vmas and only one child anon_vma.
> > - * This prevents degradation of anon_vma hierarchy to endless linear chain in
> > - * case of constantly forking task. On the other hand, an anon_vma with more
> > - * than one child isn't reused even if there was no alive vma, thus rmap
> > - * walker has a good chance of avoiding scanning the whole hierarchy when it
> > - * searches where page is mapped.
> > +static void check_anon_vma_clone(struct vm_area_struct *dst,
> > + struct vm_area_struct *src)
> > +{
> > + /* The write lock must be held. */
> > + mmap_assert_write_locked(src->vm_mm);
> > + /* If not a fork (implied by dst->anon_vma) then must be on same mm. */
> > + VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm);
> > +
> > + /* No source anon_vma is a no-op. */
I'm confused about the above comment. Do you mean that if
!src->anon_vma then it's a no-op and therefore this function shouldn't
be called? If so, we could simply have VM_WARN_ON_ONCE(!src->anon_vma)
but checks below have more conditions. Can this comment be perhaps
expanded please so that the reader clearly understands what is allowed
and what is not. For example, combination (!src->anon_vma &&
!dst->anon_vma) is allowed and we are correctly not triggering a warning
here, however that's still a no-op IIUC.
> > + VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
> > + VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma);
> > + /* We are establishing a new anon_vma_chain. */
> > + VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain));
> > + /*
> > + * On fork, dst->anon_vma is set NULL (temporarily). Otherwise, anon_vma
> > + * must be the same across dst and src.
This is the second time in this small function where we have to remind
that dst->anon_vma==NULL means that we are forking. Maybe it's better
to introduce a `bool forking = dst->anon_vma==NULL;` variable at the
beginning and use it in all these checks?
I know, I'm nitpicking but as you said, anon_vma code is very
complicated, so the more clarity we can bring to it the better.
> > + */
> > + VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
> > +}
> > +
> > +/**
> > + * anon_vma_clone - Establishes new anon_vma_chain objects in @dst linking to
> > + * all of the anon_vma objects contained within @src anon_vma_chain's.
> > + * @dst: The destination VMA with an empty anon_vma_chain.
> > + * @src: The source VMA we wish to duplicate.
> > + *
> > + * This is the heart of the VMA side of the anon_vma implementation - we invoke
> > + * this function whenever we need to set up a new VMA's anon_vma state.
> > + *
> > + * This is invoked for:
> > + *
> > + * - VMA Merge, but only when @dst is unfaulted and @src is faulted - meaning we
> > + * clone @src into @dst.
> > + * - VMA split.
> > + * - VMA (m)remap.
> > + * - Fork of faulted VMA.
> > + *
> > + * In all cases other than fork this is simply a duplication. Fork additionally
> > + * adds a new active anon_vma.
> > + *
> > + * ONLY in the case of fork do we try to 'reuse' existing anon_vma's in an
> > + * anon_vma hierarchy, reusing anon_vma's which have no VMA associated with them
> > + * but do have a single child. This is to avoid waste of memory when repeatedly
> > + * forking.
> > + *
> > + * Returns: 0 on success, -ENOMEM on failure.
> > */
> > int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> > {
> > struct anon_vma_chain *avc, *pavc;
> > struct anon_vma *root = NULL;
> >
> > + check_anon_vma_clone(dst, src);
> > +
> > list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
> > struct anon_vma *anon_vma;
> >
> > @@ -392,11 +422,27 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
> > return -ENOMEM;
> > }
> >
> > +/**
> > + * unlink_anon_vmas() - remove all links between a VMA and anon_vma's, freeing
> > + * anon_vma_chain objects.
> > + * @vma: The VMA whose links to anon_vma objects is to be severed.
> > + *
> > + * As part of the process anon_vma_chain's are freed,
> > + * anon_vma->num_children,num_active_vmas is updated as required and, if the
> > + * relevant anon_vma references no further VMAs, its reference count is
> > + * decremented.
> > + */
> > void unlink_anon_vmas(struct vm_area_struct *vma)
> > {
> > struct anon_vma_chain *avc, *next;
> > struct anon_vma *root = NULL;
> >
> > + /* Always hold mmap lock, read-lock on unmap possibly. */
> > + mmap_assert_locked(vma->vm_mm);
> > +
> > + /* Unfaulted is a no-op. */
> > + VM_WARN_ON_ONCE(!vma->anon_vma && !list_empty(&vma->anon_vma_chain));
Hmm. anon_vma_clone() calls unlink_anon_vmas() after setting
dst->anon_vma=NULL in the enomem_failure path. This warning would
imply that in such case dst->anon_vma_chain is always non-empty. But I
don't think we can always expect that... What if the very first call
to anon_vma_chain_alloc() in anon_vma_clone()'s loop failed, I think
this would result in dst->anon_vma_chain being empty, no?
> > +
> > /*
> > * Unlink each anon_vma chained to the VMA. This list is ordered
> > * from newest to oldest, ensuring the root anon_vma gets freed last.
> > --
> > 2.52.0
> >
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 1/8] mm/rmap: improve anon_vma_clone(), unlink_anon_vmas() comments, add asserts
2025-12-29 21:18 ` Suren Baghdasaryan
@ 2025-12-30 21:21 ` Suren Baghdasaryan
0 siblings, 0 replies; 20+ messages in thread
From: Suren Baghdasaryan @ 2025-12-30 21:21 UTC (permalink / raw)
To: Liam R. Howlett, Lorenzo Stoakes, Andrew Morton,
Suren Baghdasaryan, Vlastimil Babka, Shakeel Butt,
David Hildenbrand, Rik van Riel, Harry Yoo, Jann Horn,
Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li, Barry Song,
linux-mm, linux-kernel
On Mon, Dec 29, 2025 at 1:18 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, Dec 19, 2025 at 10:22 AM Liam R. Howlett
> <Liam.Howlett@oracle.com> wrote:
> >
> > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [251217 07:27]:
> > > Add kdoc comments, describe exactly what these functions are used for in
> > > detail, pointing out importantly that the anon_vma_clone() !dst->anon_vma
> > > && src->anon_vma dance is ONLY for fork.
> > >
> > > Both are confusing functions that will be refactored in a subsequent patch
> > > but the first stage is establishing documentation and some invariants.
> > >
> > > Add some basic CONFIG_DEBUG_VM asserts that help document expected state,
> > > specifically:
> > >
> > > anon_vma_clone()
> > > - mmap write lock held.
> > > - We do nothing if src VMA is not faulted.
> > > - The destination VMA has no anon_vma_chain yet.
> > > - We are always operating on the same active VMA (i.e. vma->anon-vma).
>
> nit: s/vma->anon-vma/vma->anon_vma
>
> > > - If not forking, must operate on the same mm_struct.
> > >
> > > unlink_anon_vmas()
> > > - mmap lock held (read on unmap downgraded).
>
> Out of curiosity I looked for the place where unlink_anon_vmas() is
> called with mmap_lock downgraded to read but could not find it. Could
> you please point me to it?
>
> > > - That unfaulted VMAs are no-ops.
> > >
> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> >
> > Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
> >
> > > ---
> > > mm/rmap.c | 82 +++++++++++++++++++++++++++++++++++++++++++------------
> > > 1 file changed, 64 insertions(+), 18 deletions(-)
> > >
> > > diff --git a/mm/rmap.c b/mm/rmap.c
> > > index d6799afe1114..0e34c0a69fbc 100644
> > > --- a/mm/rmap.c
> > > +++ b/mm/rmap.c
> > > @@ -257,30 +257,60 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
> > > up_write(&root->rwsem);
> > > }
> > >
> > > -/*
> > > - * Attach the anon_vmas from src to dst.
> > > - * Returns 0 on success, -ENOMEM on failure.
> > > - *
> > > - * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
> > > - * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
> > > - * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
> > > - * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
> > > - * call, we can identify this case by checking (!dst->anon_vma &&
> > > - * src->anon_vma).
> > > - *
> > > - * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
> > > - * and reuse existing anon_vma which has no vmas and only one child anon_vma.
> > > - * This prevents degradation of anon_vma hierarchy to endless linear chain in
> > > - * case of constantly forking task. On the other hand, an anon_vma with more
> > > - * than one child isn't reused even if there was no alive vma, thus rmap
> > > - * walker has a good chance of avoiding scanning the whole hierarchy when it
> > > - * searches where page is mapped.
> > > +static void check_anon_vma_clone(struct vm_area_struct *dst,
> > > + struct vm_area_struct *src)
> > > +{
> > > + /* The write lock must be held. */
> > > + mmap_assert_write_locked(src->vm_mm);
As a side-note, I think we might be able to claim
vma_assert_locked(src) here if we move vma_start_write(vma) at
https://elixir.bootlin.com/linux/v6.19-rc3/source/mm/vma.c#L538 to
line 527. I looked over other places where we call anon_vma_clone()
and they seem to write-lock src VMA before cloning anon_vma.
mmap_assert_write_locked() is sufficient here because we do not change
anon_vma under VMA lock (see __vmf_anon_prepare() upgrading to mmap
lock if a new anon_vma has to be established during page fault) but I
think we can be more strict here.
> > > + /* If not a fork (implied by dst->anon_vma) then must be on same mm. */
> > > + VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm);
> > > +
> > > + /* No source anon_vma is a no-op. */
>
> I'm confused about the above comment. Do you mean that if
> !src->anon_vma then it's a no-op and therefore this function shouldn't
> be called? If so, we could simply have VM_WARN_ON_ONCE(!src->anon_vma)
> but checks below have more conditions. Can this comment be perhaps
> expanded please so that the reader clearly understands what is allowed
> and what is not. For example, combination (!src->anon_vma &&
> !dst->anon_vma) is allowed and we are correctly not triggering a warning
> here, however that's still a no-op IIUC.
>
> > > + VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
> > > + VM_WARN_ON_ONCE(!src->anon_vma && dst->anon_vma);
> > > + /* We are establishing a new anon_vma_chain. */
> > > + VM_WARN_ON_ONCE(!list_empty(&dst->anon_vma_chain));
> > > + /*
> > > + * On fork, dst->anon_vma is set NULL (temporarily). Otherwise, anon_vma
> > > + * must be the same across dst and src.
>
> This is the second time in this small function where we have to remind
> that dst->anon_vma==NULL means that we are forking. Maybe it's better
> to introduce a `bool forking = dst->anon_vma==NULL;` variable at the
> beginning and use it in all these checks?
>
> I know, I'm nitpicking but as you said, anon_vma code is very
> complicated, so the more clarity we can bring to it the better.
>
> > > + */
> > > + VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
> > > +}
> > > +
> > > +/**
> > > + * anon_vma_clone - Establishes new anon_vma_chain objects in @dst linking to
> > > + * all of the anon_vma objects contained within @src anon_vma_chain's.
> > > + * @dst: The destination VMA with an empty anon_vma_chain.
> > > + * @src: The source VMA we wish to duplicate.
> > > + *
> > > + * This is the heart of the VMA side of the anon_vma implementation - we invoke
> > > + * this function whenever we need to set up a new VMA's anon_vma state.
> > > + *
> > > + * This is invoked for:
> > > + *
> > > + * - VMA Merge, but only when @dst is unfaulted and @src is faulted - meaning we
> > > + * clone @src into @dst.
> > > + * - VMA split.
> > > + * - VMA (m)remap.
> > > + * - Fork of faulted VMA.
> > > + *
> > > + * In all cases other than fork this is simply a duplication. Fork additionally
> > > + * adds a new active anon_vma.
> > > + *
> > > + * ONLY in the case of fork do we try to 'reuse' existing anon_vma's in an
> > > + * anon_vma hierarchy, reusing anon_vma's which have no VMA associated with them
> > > + * but do have a single child. This is to avoid waste of memory when repeatedly
> > > + * forking.
> > > + *
> > > + * Returns: 0 on success, -ENOMEM on failure.
> > > */
> > > int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> > > {
> > > struct anon_vma_chain *avc, *pavc;
> > > struct anon_vma *root = NULL;
> > >
> > > + check_anon_vma_clone(dst, src);
> > > +
> > > list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
> > > struct anon_vma *anon_vma;
> > >
> > > @@ -392,11 +422,27 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
> > > return -ENOMEM;
> > > }
> > >
> > > +/**
> > > + * unlink_anon_vmas() - remove all links between a VMA and anon_vma's, freeing
> > > + * anon_vma_chain objects.
> > > + * @vma: The VMA whose links to anon_vma objects is to be severed.
> > > + *
> > > + * As part of the process anon_vma_chain's are freed,
> > > + * anon_vma->num_children,num_active_vmas is updated as required and, if the
> > > + * relevant anon_vma references no further VMAs, its reference count is
> > > + * decremented.
> > > + */
> > > void unlink_anon_vmas(struct vm_area_struct *vma)
> > > {
> > > struct anon_vma_chain *avc, *next;
> > > struct anon_vma *root = NULL;
> > >
> > > + /* Always hold mmap lock, read-lock on unmap possibly. */
> > > + mmap_assert_locked(vma->vm_mm);
> > > +
> > > + /* Unfaulted is a no-op. */
> > > + VM_WARN_ON_ONCE(!vma->anon_vma && !list_empty(&vma->anon_vma_chain));
>
> Hmm. anon_vma_clone() calls unlink_anon_vmas() after setting
> dst->anon_vma=NULL in the enomem_failure path. This warning would
> imply that in such case dst->anon_vma_chain is always non-empty. But I
> don't think we can always expect that... What if the very first call
> to anon_vma_chain_alloc() in anon_vma_clone()'s loop failed, I think
> this would result in dst->anon_vma_chain being empty, no?
>
> > > +
> > > /*
> > > * Unlink each anon_vma chained to the VMA. This list is ordered
> > > * from newest to oldest, ensuring the root anon_vma gets freed last.
> > > --
> > > 2.52.0
> > >
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 2/8] mm/rmap: skip unfaulted VMAs on anon_vma clone, unlink
2025-12-17 12:27 [PATCH 0/8] mm: clean up anon_vma implementation Lorenzo Stoakes
2025-12-17 12:27 ` [PATCH 1/8] mm/rmap: improve anon_vma_clone(), unlink_anon_vmas() comments, add asserts Lorenzo Stoakes
@ 2025-12-17 12:27 ` Lorenzo Stoakes
2025-12-19 18:28 ` Liam R. Howlett
2025-12-17 12:27 ` [PATCH 3/8] mm/rmap: remove unnecessary root lock dance in anon_vma clone, unmap Lorenzo Stoakes
` (5 subsequent siblings)
7 siblings, 1 reply; 20+ messages in thread
From: Lorenzo Stoakes @ 2025-12-17 12:27 UTC (permalink / raw)
To: Andrew Morton
Cc: Suren Baghdasaryan, Liam R . Howlett, Vlastimil Babka,
Shakeel Butt, David Hildenbrand, Rik van Riel, Harry Yoo,
Jann Horn, Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li,
Barry Song, linux-mm, linux-kernel
For both anon_vma_clone() and unlink_anon_vmas(), if the source VMA or the
VMA to be linked are unfaulted (e.g. !vma->anon_vma), then the functions do
nothing. Simply exit early in these cases.
In the unlink_anon_vmas() case we can also remove a conditional that checks
whether vma->anon_vma is set.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/rmap.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index 0e34c0a69fbc..9332d1cbc643 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -309,6 +309,9 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
struct anon_vma_chain *avc, *pavc;
struct anon_vma *root = NULL;
+ if (!src->anon_vma)
+ return 0;
+
check_anon_vma_clone(dst, src);
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
@@ -441,7 +444,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
mmap_assert_locked(vma->vm_mm);
/* Unfaulted is a no-op. */
- VM_WARN_ON_ONCE(!vma->anon_vma && !list_empty(&vma->anon_vma_chain));
+ if (!vma->anon_vma)
+ return;
/*
* Unlink each anon_vma chained to the VMA. This list is ordered
@@ -465,15 +469,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
- if (vma->anon_vma) {
- vma->anon_vma->num_active_vmas--;
- /*
- * vma would still be needed after unlink, and anon_vma will be prepared
- * when handle fault.
- */
- vma->anon_vma = NULL;
- }
+ vma->anon_vma->num_active_vmas--;
+ /*
+ * vma would still be needed after unlink, and anon_vma will be prepared
+ * when handle fault.
+ */
+ vma->anon_vma = NULL;
unlock_anon_vma_root(root);
/*
--
2.52.0
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 2/8] mm/rmap: skip unfaulted VMAs on anon_vma clone, unlink
2025-12-17 12:27 ` [PATCH 2/8] mm/rmap: skip unfaulted VMAs on anon_vma clone, unlink Lorenzo Stoakes
@ 2025-12-19 18:28 ` Liam R. Howlett
2025-12-29 21:41 ` Suren Baghdasaryan
0 siblings, 1 reply; 20+ messages in thread
From: Liam R. Howlett @ 2025-12-19 18:28 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Andrew Morton, Suren Baghdasaryan, Vlastimil Babka, Shakeel Butt,
David Hildenbrand, Rik van Riel, Harry Yoo, Jann Horn,
Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li, Barry Song,
linux-mm, linux-kernel
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [251217 07:27]:
> For both anon_vma_clone() and unlink_anon_vmas(), if the source VMA or the
> VMA to be linked are unfaulted (e.g. !vma->anon_vma), then the functions do
> nothing. Simply exit early in these cases.
>
> In the unlink_anon_vmas() case we can also remove a conditional that checks
> whether vma->anon_vma is set.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/rmap.c | 20 +++++++++++---------
> 1 file changed, 11 insertions(+), 9 deletions(-)
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 0e34c0a69fbc..9332d1cbc643 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -309,6 +309,9 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> struct anon_vma_chain *avc, *pavc;
> struct anon_vma *root = NULL;
>
> + if (!src->anon_vma)
> + return 0;
> +
> check_anon_vma_clone(dst, src);
>
> list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
> @@ -441,7 +444,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
> mmap_assert_locked(vma->vm_mm);
>
> /* Unfaulted is a no-op. */
> - VM_WARN_ON_ONCE(!vma->anon_vma && !list_empty(&vma->anon_vma_chain));
> + if (!vma->anon_vma)
> + return;
I guess it doesn't matter because you just added the !list_empty()
check, but did you mean to drop that part?
>
> /*
> * Unlink each anon_vma chained to the VMA. This list is ordered
> @@ -465,15 +469,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
> list_del(&avc->same_vma);
> anon_vma_chain_free(avc);
> }
> - if (vma->anon_vma) {
> - vma->anon_vma->num_active_vmas--;
>
> - /*
> - * vma would still be needed after unlink, and anon_vma will be prepared
> - * when handle fault.
> - */
> - vma->anon_vma = NULL;
> - }
> + vma->anon_vma->num_active_vmas--;
> + /*
> + * vma would still be needed after unlink, and anon_vma will be prepared
> + * when handle fault.
> + */
> + vma->anon_vma = NULL;
> unlock_anon_vma_root(root);
>
> /*
> --
> 2.52.0
>
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 2/8] mm/rmap: skip unfaulted VMAs on anon_vma clone, unlink
2025-12-19 18:28 ` Liam R. Howlett
@ 2025-12-29 21:41 ` Suren Baghdasaryan
0 siblings, 0 replies; 20+ messages in thread
From: Suren Baghdasaryan @ 2025-12-29 21:41 UTC (permalink / raw)
To: Liam R. Howlett, Lorenzo Stoakes, Andrew Morton,
Suren Baghdasaryan, Vlastimil Babka, Shakeel Butt,
David Hildenbrand, Rik van Riel, Harry Yoo, Jann Horn,
Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li, Barry Song,
linux-mm, linux-kernel
On Fri, Dec 19, 2025 at 10:28 AM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [251217 07:27]:
> > For both anon_vma_clone() and unlink_anon_vmas(), if the source VMA or the
> > VMA to be linked are unfaulted (e.g. !vma->anon_vma), then the functions do
> > nothing. Simply exit early in these cases.
> >
> > In the unlink_anon_vmas() case we can also remove a conditional that checks
> > whether vma->anon_vma is set.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > mm/rmap.c | 20 +++++++++++---------
> > 1 file changed, 11 insertions(+), 9 deletions(-)
> >
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index 0e34c0a69fbc..9332d1cbc643 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -309,6 +309,9 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> > struct anon_vma_chain *avc, *pavc;
> > struct anon_vma *root = NULL;
> >
> > + if (!src->anon_vma)
> > + return 0;
> > +
> > check_anon_vma_clone(dst, src);
check_anon_vma_clone() is used only here and contains a couple of
warnings with "!src->anon_vma && ..." conditions, which now will never
be triggered even if the second part of those conditions was true.
This seems like a regression (you are not checking conditions you were
checking before this change). To avoid that, you can place this early
exit after the call to check_anon_vma_clone(dst, src).
> >
> > list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
> > @@ -441,7 +444,8 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
> > mmap_assert_locked(vma->vm_mm);
> >
> > /* Unfaulted is a no-op. */
> > - VM_WARN_ON_ONCE(!vma->anon_vma && !list_empty(&vma->anon_vma_chain));
> > + if (!vma->anon_vma)
> > + return;
>
> I guess it doesn't matter because you just added the !list_empty()
> check, but did you mean to drop that part?
>
> >
> > /*
> > * Unlink each anon_vma chained to the VMA. This list is ordered
> > @@ -465,15 +469,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
> > list_del(&avc->same_vma);
> > anon_vma_chain_free(avc);
> > }
> > - if (vma->anon_vma) {
> > - vma->anon_vma->num_active_vmas--;
> >
> > - /*
> > - * vma would still be needed after unlink, and anon_vma will be prepared
> > - * when handle fault.
> > - */
> > - vma->anon_vma = NULL;
> > - }
> > + vma->anon_vma->num_active_vmas--;
> > + /*
> > + * vma would still be needed after unlink, and anon_vma will be prepared
> > + * when handle fault.
> > + */
> > + vma->anon_vma = NULL;
> > unlock_anon_vma_root(root);
> >
> > /*
> > --
> > 2.52.0
> >
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 3/8] mm/rmap: remove unnecessary root lock dance in anon_vma clone, unmap
2025-12-17 12:27 [PATCH 0/8] mm: clean up anon_vma implementation Lorenzo Stoakes
2025-12-17 12:27 ` [PATCH 1/8] mm/rmap: improve anon_vma_clone(), unlink_anon_vmas() comments, add asserts Lorenzo Stoakes
2025-12-17 12:27 ` [PATCH 2/8] mm/rmap: skip unfaulted VMAs on anon_vma clone, unlink Lorenzo Stoakes
@ 2025-12-17 12:27 ` Lorenzo Stoakes
2025-12-29 22:17 ` Suren Baghdasaryan
2025-12-17 12:27 ` [PATCH 4/8] mm/rmap: remove anon_vma_merge() function Lorenzo Stoakes
` (4 subsequent siblings)
7 siblings, 1 reply; 20+ messages in thread
From: Lorenzo Stoakes @ 2025-12-17 12:27 UTC (permalink / raw)
To: Andrew Morton
Cc: Suren Baghdasaryan, Liam R . Howlett, Vlastimil Babka,
Shakeel Butt, David Hildenbrand, Rik van Riel, Harry Yoo,
Jann Horn, Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li,
Barry Song, linux-mm, linux-kernel
The root anon_vma of all anon_vma's linked to a VMA must by definition be
the same - a VMA and all of its descendants/ancestors must exist in the
same CoW chain.
Commit bb4aa39676f7 ("mm: avoid repeated anon_vma lock/unlock sequences in
anon_vma_clone()") introduced paranoid checking of the root anon_vma
remaining the same throughout all AVC's in 2011.
I think 15 years later we can safely assume that this is always the case.
Additionally, since unfaulted VMAs being cloned from or unlinked are
no-op's, we can simply lock the anon_vma's associated with this rather than
doing any specific dance around this.
This removes unnecessary checks and makes it clear that the root anon_vma
is shared between all anon_vma's in a given VMA's anon_vma_chain.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/rmap.c | 48 ++++++++++++------------------------------------
1 file changed, 12 insertions(+), 36 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index 9332d1cbc643..60134a566073 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -231,32 +231,6 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
return -ENOMEM;
}
-/*
- * This is a useful helper function for locking the anon_vma root as
- * we traverse the vma->anon_vma_chain, looping over anon_vma's that
- * have the same vma.
- *
- * Such anon_vma's should have the same root, so you'd expect to see
- * just a single mutex_lock for the whole traversal.
- */
-static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
-{
- struct anon_vma *new_root = anon_vma->root;
- if (new_root != root) {
- if (WARN_ON_ONCE(root))
- up_write(&root->rwsem);
- root = new_root;
- down_write(&root->rwsem);
- }
- return root;
-}
-
-static inline void unlock_anon_vma_root(struct anon_vma *root)
-{
- if (root)
- up_write(&root->rwsem);
-}
-
static void check_anon_vma_clone(struct vm_area_struct *dst,
struct vm_area_struct *src)
{
@@ -307,26 +281,25 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
struct anon_vma_chain *avc, *pavc;
- struct anon_vma *root = NULL;
if (!src->anon_vma)
return 0;
check_anon_vma_clone(dst, src);
+ anon_vma_lock_write(src->anon_vma);
list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma;
avc = anon_vma_chain_alloc(GFP_NOWAIT);
if (unlikely(!avc)) {
- unlock_anon_vma_root(root);
- root = NULL;
+ anon_vma_unlock_write(src->anon_vma);
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto enomem_failure;
+ anon_vma_lock_write(src->anon_vma);
}
anon_vma = pavc->anon_vma;
- root = lock_anon_vma_root(root, anon_vma);
anon_vma_chain_link(dst, avc, anon_vma);
/*
@@ -343,7 +316,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
}
if (dst->anon_vma)
dst->anon_vma->num_active_vmas++;
- unlock_anon_vma_root(root);
+
+ anon_vma_unlock_write(src->anon_vma);
return 0;
enomem_failure:
@@ -438,15 +412,17 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
void unlink_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
- struct anon_vma *root = NULL;
+ struct anon_vma *active_anon_vma = vma->anon_vma;
/* Always hold mmap lock, read-lock on unmap possibly. */
mmap_assert_locked(vma->vm_mm);
/* Unfaulted is a no-op. */
- if (!vma->anon_vma)
+ if (!active_anon_vma)
return;
+ anon_vma_lock_write(active_anon_vma);
+
/*
* Unlink each anon_vma chained to the VMA. This list is ordered
* from newest to oldest, ensuring the root anon_vma gets freed last.
@@ -454,7 +430,6 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
- root = lock_anon_vma_root(root, anon_vma);
anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
/*
@@ -470,13 +445,14 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
anon_vma_chain_free(avc);
}
- vma->anon_vma->num_active_vmas--;
+ active_anon_vma->num_active_vmas--;
/*
* vma would still be needed after unlink, and anon_vma will be prepared
* when handle fault.
*/
vma->anon_vma = NULL;
- unlock_anon_vma_root(root);
+ anon_vma_unlock_write(active_anon_vma);
+
/*
* Iterate the list once more, it now only contains empty and unlinked
--
2.52.0
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 3/8] mm/rmap: remove unnecessary root lock dance in anon_vma clone, unmap
2025-12-17 12:27 ` [PATCH 3/8] mm/rmap: remove unnecessary root lock dance in anon_vma clone, unmap Lorenzo Stoakes
@ 2025-12-29 22:17 ` Suren Baghdasaryan
0 siblings, 0 replies; 20+ messages in thread
From: Suren Baghdasaryan @ 2025-12-29 22:17 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Andrew Morton, Liam R . Howlett, Vlastimil Babka, Shakeel Butt,
David Hildenbrand, Rik van Riel, Harry Yoo, Jann Horn,
Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li, Barry Song,
linux-mm, linux-kernel
On Wed, Dec 17, 2025 at 4:27 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> The root anon_vma of all anon_vma's linked to a VMA must by definition be
> the same - a VMA and all of its descendants/ancestors must exist in the
> same CoW chain.
>
> Commit bb4aa39676f7 ("mm: avoid repeated anon_vma lock/unlock sequences in
> anon_vma_clone()") introduced paranoid checking of the root anon_vma
> remaining the same throughout all AVC's in 2011.
>
> I think 15 years later we can safely assume that this is always the case.
>
> Additionally, since unfaulted VMAs being cloned from or unlinked are
> no-op's, we can simply lock the anon_vma's associated with this rather than
> doing any specific dance around this.
>
> This removes unnecessary checks and makes it clear that the root anon_vma
> is shared between all anon_vma's in a given VMA's anon_vma_chain.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/rmap.c | 48 ++++++++++++------------------------------------
> 1 file changed, 12 insertions(+), 36 deletions(-)
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 9332d1cbc643..60134a566073 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -231,32 +231,6 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
> return -ENOMEM;
> }
>
> -/*
> - * This is a useful helper function for locking the anon_vma root as
> - * we traverse the vma->anon_vma_chain, looping over anon_vma's that
> - * have the same vma.
> - *
> - * Such anon_vma's should have the same root, so you'd expect to see
> - * just a single mutex_lock for the whole traversal.
> - */
> -static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
> -{
> - struct anon_vma *new_root = anon_vma->root;
> - if (new_root != root) {
> - if (WARN_ON_ONCE(root))
> - up_write(&root->rwsem);
> - root = new_root;
> - down_write(&root->rwsem);
> - }
> - return root;
> -}
> -
> -static inline void unlock_anon_vma_root(struct anon_vma *root)
> -{
> - if (root)
> - up_write(&root->rwsem);
> -}
> -
> static void check_anon_vma_clone(struct vm_area_struct *dst,
> struct vm_area_struct *src)
> {
> @@ -307,26 +281,25 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
> int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> {
> struct anon_vma_chain *avc, *pavc;
> - struct anon_vma *root = NULL;
>
> if (!src->anon_vma)
> return 0;
>
> check_anon_vma_clone(dst, src);
>
> + anon_vma_lock_write(src->anon_vma);
> list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
> struct anon_vma *anon_vma;
>
> avc = anon_vma_chain_alloc(GFP_NOWAIT);
> if (unlikely(!avc)) {
> - unlock_anon_vma_root(root);
> - root = NULL;
> + anon_vma_unlock_write(src->anon_vma);
> avc = anon_vma_chain_alloc(GFP_KERNEL);
> if (!avc)
> goto enomem_failure;
> + anon_vma_lock_write(src->anon_vma);
So, we drop and then reacquire src->anon_vma->root->rwsem, expecting
src->anon_vma and src->anon_vma->root to be the same. And IIUC
src->vm_mm's mmap lock is what guarantees all this. If so, could you
please add a clarifying comment here?
> }
> anon_vma = pavc->anon_vma;
> - root = lock_anon_vma_root(root, anon_vma);
> anon_vma_chain_link(dst, avc, anon_vma);
>
> /*
> @@ -343,7 +316,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> }
> if (dst->anon_vma)
> dst->anon_vma->num_active_vmas++;
> - unlock_anon_vma_root(root);
> +
> + anon_vma_unlock_write(src->anon_vma);
> return 0;
>
> enomem_failure:
> @@ -438,15 +412,17 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
> void unlink_anon_vmas(struct vm_area_struct *vma)
> {
> struct anon_vma_chain *avc, *next;
> - struct anon_vma *root = NULL;
> + struct anon_vma *active_anon_vma = vma->anon_vma;
>
> /* Always hold mmap lock, read-lock on unmap possibly. */
> mmap_assert_locked(vma->vm_mm);
>
> /* Unfaulted is a no-op. */
> - if (!vma->anon_vma)
> + if (!active_anon_vma)
> return;
>
> + anon_vma_lock_write(active_anon_vma);
> +
> /*
> * Unlink each anon_vma chained to the VMA. This list is ordered
> * from newest to oldest, ensuring the root anon_vma gets freed last.
> @@ -454,7 +430,6 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
> list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
> struct anon_vma *anon_vma = avc->anon_vma;
>
> - root = lock_anon_vma_root(root, anon_vma);
> anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
>
> /*
> @@ -470,13 +445,14 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
> anon_vma_chain_free(avc);
> }
>
> - vma->anon_vma->num_active_vmas--;
> + active_anon_vma->num_active_vmas--;
> /*
> * vma would still be needed after unlink, and anon_vma will be prepared
> * when handle fault.
> */
> vma->anon_vma = NULL;
> - unlock_anon_vma_root(root);
> + anon_vma_unlock_write(active_anon_vma);
> +
>
> /*
> * Iterate the list once more, it now only contains empty and unlinked
> --
> 2.52.0
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 4/8] mm/rmap: remove anon_vma_merge() function
2025-12-17 12:27 [PATCH 0/8] mm: clean up anon_vma implementation Lorenzo Stoakes
` (2 preceding siblings ...)
2025-12-17 12:27 ` [PATCH 3/8] mm/rmap: remove unnecessary root lock dance in anon_vma clone, unmap Lorenzo Stoakes
@ 2025-12-17 12:27 ` Lorenzo Stoakes
2025-12-30 19:35 ` Suren Baghdasaryan
2025-12-17 12:27 ` [PATCH 5/8] mm/rmap: make anon_vma functions internal Lorenzo Stoakes
` (3 subsequent siblings)
7 siblings, 1 reply; 20+ messages in thread
From: Lorenzo Stoakes @ 2025-12-17 12:27 UTC (permalink / raw)
To: Andrew Morton
Cc: Suren Baghdasaryan, Liam R . Howlett, Vlastimil Babka,
Shakeel Butt, David Hildenbrand, Rik van Riel, Harry Yoo,
Jann Horn, Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li,
Barry Song, linux-mm, linux-kernel
This function is confusing: we already have the concept of anon_vma merge
to adjacent VMA's anon_vma's to increase probability of anon_vma
compatibility and therefore VMA merge (see is_mergeable_anon_vma() etc.),
as well as anon_vma reuse, alongside the usual VMA merge logic.
We can remove the anon_vma check as it is redundant - a merge would not
have been permitted with removal if the anon_vma's were not the same (and
in the case of an unfaulted/faulted merge, we would have already set the
unfaulted VMA's anon_vma to vp->remove->anon_vma in dup_anon_vma()).
Avoid overloading this term when we're very simply unlinking anon_vma state
from a removed VMA upon merge.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
include/linux/rmap.h | 7 -------
mm/vma.c | 2 +-
tools/testing/vma/vma_internal.h | 5 -----
3 files changed, 1 insertion(+), 13 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index daa92a58585d..832bfc0ccfc6 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -165,13 +165,6 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma)
return __anon_vma_prepare(vma);
}
-static inline void anon_vma_merge(struct vm_area_struct *vma,
- struct vm_area_struct *next)
-{
- VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
- unlink_anon_vmas(next);
-}
-
struct anon_vma *folio_get_anon_vma(const struct folio *folio);
#ifdef CONFIG_MM_ID
diff --git a/mm/vma.c b/mm/vma.c
index fc90befd162f..feb4bbd3b259 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -376,7 +376,7 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
fput(vp->file);
}
if (vp->remove->anon_vma)
- anon_vma_merge(vp->vma, vp->remove);
+ unlink_anon_vmas(vp->remove);
mm->map_count--;
mpol_put(vma_policy(vp->remove));
if (!vp->remove2)
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 9f0a9f5ed0fe..93e5792306d9 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1265,11 +1265,6 @@ static inline void i_mmap_unlock_write(struct address_space *mapping)
{
}
-static inline void anon_vma_merge(struct vm_area_struct *vma,
- struct vm_area_struct *next)
-{
-}
-
static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
--
2.52.0
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 4/8] mm/rmap: remove anon_vma_merge() function
2025-12-17 12:27 ` [PATCH 4/8] mm/rmap: remove anon_vma_merge() function Lorenzo Stoakes
@ 2025-12-30 19:35 ` Suren Baghdasaryan
0 siblings, 0 replies; 20+ messages in thread
From: Suren Baghdasaryan @ 2025-12-30 19:35 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Andrew Morton, Liam R . Howlett, Vlastimil Babka, Shakeel Butt,
David Hildenbrand, Rik van Riel, Harry Yoo, Jann Horn,
Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li, Barry Song,
linux-mm, linux-kernel
On Wed, Dec 17, 2025 at 4:27 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> This function is confusing, we already have the concept of anon_vma merge
> to adjacent VMA's anon_vma's to increase probability of anon_vma
> compatibility and therefore VMA merge (see is_mergeable_anon_vma() etc.),
> as well as anon_vma reuse, along side the usual VMA merge logic.
>
> We can remove the anon_vma check as it is redundant - a merge would not
> have been permitted with removal if the anon_vma's were not the same (and
> in the case of an unfaulted/faulted merge, we would have already set the
> unfaulted VMA's anon_vma to vp->remove->anon_vma in dup_anon_vma()).
>
> Avoid overloading this term when we're very simply unlinking anon_vma state
> from a removed VMA upon merge.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> ---
> include/linux/rmap.h | 7 -------
> mm/vma.c | 2 +-
> tools/testing/vma/vma_internal.h | 5 -----
> 3 files changed, 1 insertion(+), 13 deletions(-)
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index daa92a58585d..832bfc0ccfc6 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -165,13 +165,6 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma)
> return __anon_vma_prepare(vma);
> }
>
> -static inline void anon_vma_merge(struct vm_area_struct *vma,
> - struct vm_area_struct *next)
> -{
> - VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
> - unlink_anon_vmas(next);
> -}
> -
> struct anon_vma *folio_get_anon_vma(const struct folio *folio);
>
> #ifdef CONFIG_MM_ID
> diff --git a/mm/vma.c b/mm/vma.c
> index fc90befd162f..feb4bbd3b259 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -376,7 +376,7 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
> fput(vp->file);
> }
> if (vp->remove->anon_vma)
> - anon_vma_merge(vp->vma, vp->remove);
> + unlink_anon_vmas(vp->remove);
> mm->map_count--;
> mpol_put(vma_policy(vp->remove));
> if (!vp->remove2)
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 9f0a9f5ed0fe..93e5792306d9 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -1265,11 +1265,6 @@ static inline void i_mmap_unlock_write(struct address_space *mapping)
> {
> }
>
> -static inline void anon_vma_merge(struct vm_area_struct *vma,
> - struct vm_area_struct *next)
> -{
> -}
> -
> static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
> unsigned long start,
> unsigned long end,
> --
> 2.52.0
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 5/8] mm/rmap: make anon_vma functions internal
2025-12-17 12:27 [PATCH 0/8] mm: clean up anon_vma implementation Lorenzo Stoakes
` (3 preceding siblings ...)
2025-12-17 12:27 ` [PATCH 4/8] mm/rmap: remove anon_vma_merge() function Lorenzo Stoakes
@ 2025-12-17 12:27 ` Lorenzo Stoakes
2025-12-30 19:38 ` Suren Baghdasaryan
2025-12-17 12:27 ` [PATCH 6/8] mm/mmap_lock: add vma_is_attached() helper Lorenzo Stoakes
` (2 subsequent siblings)
7 siblings, 1 reply; 20+ messages in thread
From: Lorenzo Stoakes @ 2025-12-17 12:27 UTC (permalink / raw)
To: Andrew Morton
Cc: Suren Baghdasaryan, Liam R . Howlett, Vlastimil Babka,
Shakeel Butt, David Hildenbrand, Rik van Riel, Harry Yoo,
Jann Horn, Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li,
Barry Song, linux-mm, linux-kernel
The bulk of the anon_vma operations are only used by mm, so formalise this
by putting the function prototypes and inlines in mm/internal.h. This
allows us to make changes without having to worry about the rest of the
kernel.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
include/linux/rmap.h | 60 --------------------------------------------
mm/internal.h | 58 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 58 insertions(+), 60 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 832bfc0ccfc6..dd764951b03d 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -104,68 +104,8 @@ enum ttu_flags {
};
#ifdef CONFIG_MMU
-static inline void get_anon_vma(struct anon_vma *anon_vma)
-{
- atomic_inc(&anon_vma->refcount);
-}
-
-void __put_anon_vma(struct anon_vma *anon_vma);
-
-static inline void put_anon_vma(struct anon_vma *anon_vma)
-{
- if (atomic_dec_and_test(&anon_vma->refcount))
- __put_anon_vma(anon_vma);
-}
-
-static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
-{
- down_write(&anon_vma->root->rwsem);
-}
-static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
-{
- return down_write_trylock(&anon_vma->root->rwsem);
-}
-
-static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
-{
- up_write(&anon_vma->root->rwsem);
-}
-
-static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
-{
- down_read(&anon_vma->root->rwsem);
-}
-
-static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
-{
- return down_read_trylock(&anon_vma->root->rwsem);
-}
-
-static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
-{
- up_read(&anon_vma->root->rwsem);
-}
-
-
-/*
- * anon_vma helper functions.
- */
void anon_vma_init(void); /* create anon_vma_cachep */
-int __anon_vma_prepare(struct vm_area_struct *);
-void unlink_anon_vmas(struct vm_area_struct *);
-int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
-int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
-
-static inline int anon_vma_prepare(struct vm_area_struct *vma)
-{
- if (likely(vma->anon_vma))
- return 0;
-
- return __anon_vma_prepare(vma);
-}
-
-struct anon_vma *folio_get_anon_vma(const struct folio *folio);
#ifdef CONFIG_MM_ID
static __always_inline void folio_lock_large_mapcount(struct folio *folio)
diff --git a/mm/internal.h b/mm/internal.h
index e430da900430..469d4ef1ccc5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -199,6 +199,64 @@ static inline void vma_close(struct vm_area_struct *vma)
#ifdef CONFIG_MMU
+static inline void get_anon_vma(struct anon_vma *anon_vma)
+{
+ atomic_inc(&anon_vma->refcount);
+}
+
+void __put_anon_vma(struct anon_vma *anon_vma);
+
+static inline void put_anon_vma(struct anon_vma *anon_vma)
+{
+ if (atomic_dec_and_test(&anon_vma->refcount))
+ __put_anon_vma(anon_vma);
+}
+
+static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
+{
+ down_write(&anon_vma->root->rwsem);
+}
+
+static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
+{
+ return down_write_trylock(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
+{
+ up_write(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
+{
+ down_read(&anon_vma->root->rwsem);
+}
+
+static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
+{
+ return down_read_trylock(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
+{
+ up_read(&anon_vma->root->rwsem);
+}
+
+struct anon_vma *folio_get_anon_vma(const struct folio *folio);
+
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src);
+int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma);
+int __anon_vma_prepare(struct vm_area_struct *vma);
+void unlink_anon_vmas(struct vm_area_struct *vma);
+
+static inline int anon_vma_prepare(struct vm_area_struct *vma)
+{
+ if (likely(vma->anon_vma))
+ return 0;
+
+ return __anon_vma_prepare(vma);
+}
+
/* Flags for folio_pte_batch(). */
typedef int __bitwise fpb_t;
--
2.52.0
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 5/8] mm/rmap: make anon_vma functions internal
2025-12-17 12:27 ` [PATCH 5/8] mm/rmap: make anon_vma functions internal Lorenzo Stoakes
@ 2025-12-30 19:38 ` Suren Baghdasaryan
0 siblings, 0 replies; 20+ messages in thread
From: Suren Baghdasaryan @ 2025-12-30 19:38 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Andrew Morton, Liam R . Howlett, Vlastimil Babka, Shakeel Butt,
David Hildenbrand, Rik van Riel, Harry Yoo, Jann Horn,
Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li, Barry Song,
linux-mm, linux-kernel
On Wed, Dec 17, 2025 at 4:27 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> The bulk of the anon_vma operations are only used by mm, so formalise this
> by putting the function prototypes and inlines in mm/internal.h. This
> allows us to make changes without having to worry about the rest of the
> kernel.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> ---
> include/linux/rmap.h | 60 --------------------------------------------
> mm/internal.h | 58 ++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 58 insertions(+), 60 deletions(-)
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 832bfc0ccfc6..dd764951b03d 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -104,68 +104,8 @@ enum ttu_flags {
> };
>
> #ifdef CONFIG_MMU
> -static inline void get_anon_vma(struct anon_vma *anon_vma)
> -{
> - atomic_inc(&anon_vma->refcount);
> -}
> -
> -void __put_anon_vma(struct anon_vma *anon_vma);
> -
> -static inline void put_anon_vma(struct anon_vma *anon_vma)
> -{
> - if (atomic_dec_and_test(&anon_vma->refcount))
> - __put_anon_vma(anon_vma);
> -}
> -
> -static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
> -{
> - down_write(&anon_vma->root->rwsem);
> -}
>
> -static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
> -{
> - return down_write_trylock(&anon_vma->root->rwsem);
> -}
> -
> -static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
> -{
> - up_write(&anon_vma->root->rwsem);
> -}
> -
> -static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
> -{
> - down_read(&anon_vma->root->rwsem);
> -}
> -
> -static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
> -{
> - return down_read_trylock(&anon_vma->root->rwsem);
> -}
> -
> -static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
> -{
> - up_read(&anon_vma->root->rwsem);
> -}
> -
> -
> -/*
> - * anon_vma helper functions.
> - */
> void anon_vma_init(void); /* create anon_vma_cachep */
> -int __anon_vma_prepare(struct vm_area_struct *);
> -void unlink_anon_vmas(struct vm_area_struct *);
> -int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
> -int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
> -
> -static inline int anon_vma_prepare(struct vm_area_struct *vma)
> -{
> - if (likely(vma->anon_vma))
> - return 0;
> -
> - return __anon_vma_prepare(vma);
> -}
> -
> -struct anon_vma *folio_get_anon_vma(const struct folio *folio);
>
> #ifdef CONFIG_MM_ID
> static __always_inline void folio_lock_large_mapcount(struct folio *folio)
> diff --git a/mm/internal.h b/mm/internal.h
> index e430da900430..469d4ef1ccc5 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -199,6 +199,64 @@ static inline void vma_close(struct vm_area_struct *vma)
>
> #ifdef CONFIG_MMU
>
> +static inline void get_anon_vma(struct anon_vma *anon_vma)
> +{
> + atomic_inc(&anon_vma->refcount);
> +}
> +
> +void __put_anon_vma(struct anon_vma *anon_vma);
> +
> +static inline void put_anon_vma(struct anon_vma *anon_vma)
> +{
> + if (atomic_dec_and_test(&anon_vma->refcount))
> + __put_anon_vma(anon_vma);
> +}
> +
> +static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
> +{
> + down_write(&anon_vma->root->rwsem);
> +}
> +
> +static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
> +{
> + return down_write_trylock(&anon_vma->root->rwsem);
> +}
> +
> +static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
> +{
> + up_write(&anon_vma->root->rwsem);
> +}
> +
> +static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
> +{
> + down_read(&anon_vma->root->rwsem);
> +}
> +
> +static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
> +{
> + return down_read_trylock(&anon_vma->root->rwsem);
> +}
> +
> +static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
> +{
> + up_read(&anon_vma->root->rwsem);
> +}
> +
> +struct anon_vma *folio_get_anon_vma(const struct folio *folio);
> +
> +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src);
> +int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma);
> +int __anon_vma_prepare(struct vm_area_struct *vma);
> +void unlink_anon_vmas(struct vm_area_struct *vma);
> +
> +static inline int anon_vma_prepare(struct vm_area_struct *vma)
> +{
> + if (likely(vma->anon_vma))
> + return 0;
> +
> + return __anon_vma_prepare(vma);
> +}
> +
> /* Flags for folio_pte_batch(). */
> typedef int __bitwise fpb_t;
>
> --
> 2.52.0
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 6/8] mm/mmap_lock: add vma_is_attached() helper
2025-12-17 12:27 [PATCH 0/8] mm: clean up anon_vma implementation Lorenzo Stoakes
` (4 preceding siblings ...)
2025-12-17 12:27 ` [PATCH 5/8] mm/rmap: make anon_vma functions internal Lorenzo Stoakes
@ 2025-12-17 12:27 ` Lorenzo Stoakes
2025-12-30 19:50 ` Suren Baghdasaryan
2025-12-17 12:27 ` [PATCH 7/8] mm/rmap: allocate anon_vma_chain objects unlocked when possible Lorenzo Stoakes
2025-12-17 12:27 ` [PATCH 8/8] mm/rmap: separate out fork-only logic on anon_vma_clone() Lorenzo Stoakes
7 siblings, 1 reply; 20+ messages in thread
From: Lorenzo Stoakes @ 2025-12-17 12:27 UTC (permalink / raw)
To: Andrew Morton
Cc: Suren Baghdasaryan, Liam R . Howlett, Vlastimil Babka,
Shakeel Butt, David Hildenbrand, Rik van Riel, Harry Yoo,
Jann Horn, Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li,
Barry Song, linux-mm, linux-kernel
This makes it easy to explicitly check for VMA detachment, which is useful
for things like asserts.
Note that we intentionally do not allow this function to be available
should CONFIG_PER_VMA_LOCK not be set - this is because vma_assert_attached()
and vma_assert_detached() are no-ops if !CONFIG_PER_VMA_LOCK, so there is
no correct state for vma_is_attached() to be in if this configuration
option is not specified.
Therefore users elsewhere must invoke this function only after checking for
CONFIG_PER_VMA_LOCK.
We rework the assert functions to utilise this.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
include/linux/mmap_lock.h | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index d53f72dba7fe..b50416fbba20 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -251,6 +251,11 @@ static inline void vma_assert_locked(struct vm_area_struct *vma)
!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}
+static inline bool vma_is_attached(struct vm_area_struct *vma)
+{
+ return refcount_read(&vma->vm_refcnt);
+}
+
/*
* WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
* assertions should be made either under mmap_write_lock or when the object
@@ -258,12 +263,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma)
*/
static inline void vma_assert_attached(struct vm_area_struct *vma)
{
- WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
+ WARN_ON_ONCE(!vma_is_attached(vma));
}
static inline void vma_assert_detached(struct vm_area_struct *vma)
{
- WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
+ WARN_ON_ONCE(vma_is_attached(vma));
}
static inline void vma_mark_attached(struct vm_area_struct *vma)
--
2.52.0
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 6/8] mm/mmap_lock: add vma_is_attached() helper
2025-12-17 12:27 ` [PATCH 6/8] mm/mmap_lock: add vma_is_attached() helper Lorenzo Stoakes
@ 2025-12-30 19:50 ` Suren Baghdasaryan
0 siblings, 0 replies; 20+ messages in thread
From: Suren Baghdasaryan @ 2025-12-30 19:50 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Andrew Morton, Liam R . Howlett, Vlastimil Babka, Shakeel Butt,
David Hildenbrand, Rik van Riel, Harry Yoo, Jann Horn,
Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li, Barry Song,
linux-mm, linux-kernel
On Wed, Dec 17, 2025 at 4:27 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> This makes it easy to explicitly check for VMA detachment, which is useful
> for things like asserts.
>
> Note that we intentionally do not allow this function to be available
> should CONFIG_PER_VMA_LOCK not be set - this is because vma_assert_attached()
> and vma_assert_detached() are no-ops if !CONFIG_PER_VMA_LOCK, so there is
> no correct state for vma_is_attached() to be in if this configuration
> option is not specified.
>
> Therefore users elsewhere must invoke this function only after checking for
> CONFIG_PER_VMA_LOCK.
>
> We rework the assert functions to utilise this.
Thank you! This nicely documents vm_refcnt attached state. Another
step in this direction is adding:
static inline bool vma_is_read_locked(struct vm_area_struct *vma)
{
return refcount_read(&vma->vm_refcnt) > 1;
}
and changing vma_assert_locked() to use it.
But I can do that in a separate patch, so LGTM.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> ---
> include/linux/mmap_lock.h | 9 +++++++--
> 1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> index d53f72dba7fe..b50416fbba20 100644
> --- a/include/linux/mmap_lock.h
> +++ b/include/linux/mmap_lock.h
> @@ -251,6 +251,11 @@ static inline void vma_assert_locked(struct vm_area_struct *vma)
> !__is_vma_write_locked(vma, &mm_lock_seq), vma);
> }
>
> +static inline bool vma_is_attached(struct vm_area_struct *vma)
> +{
> + return refcount_read(&vma->vm_refcnt);
> +}
> +
> /*
> * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
> * assertions should be made either under mmap_write_lock or when the object
> @@ -258,12 +263,12 @@ static inline void vma_assert_locked(struct vm_area_struct *vma)
> */
> static inline void vma_assert_attached(struct vm_area_struct *vma)
> {
> - WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
> + WARN_ON_ONCE(!vma_is_attached(vma));
> }
>
> static inline void vma_assert_detached(struct vm_area_struct *vma)
> {
> - WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
> + WARN_ON_ONCE(vma_is_attached(vma));
> }
>
> static inline void vma_mark_attached(struct vm_area_struct *vma)
> --
> 2.52.0
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 7/8] mm/rmap: allocate anon_vma_chain objects unlocked when possible
2025-12-17 12:27 [PATCH 0/8] mm: clean up anon_vma implementation Lorenzo Stoakes
` (5 preceding siblings ...)
2025-12-17 12:27 ` [PATCH 6/8] mm/mmap_lock: add vma_is_attached() helper Lorenzo Stoakes
@ 2025-12-17 12:27 ` Lorenzo Stoakes
2025-12-30 21:35 ` Suren Baghdasaryan
2025-12-17 12:27 ` [PATCH 8/8] mm/rmap: separate out fork-only logic on anon_vma_clone() Lorenzo Stoakes
7 siblings, 1 reply; 20+ messages in thread
From: Lorenzo Stoakes @ 2025-12-17 12:27 UTC (permalink / raw)
To: Andrew Morton
Cc: Suren Baghdasaryan, Liam R . Howlett, Vlastimil Babka,
Shakeel Butt, David Hildenbrand, Rik van Riel, Harry Yoo,
Jann Horn, Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li,
Barry Song, linux-mm, linux-kernel
There is no reason to allocate the anon_vma_chain under the anon_vma write
lock when cloning - we can in fact assign these to the destination VMA
safely as we hold the exclusive mmap lock and therefore preclude anybody
else accessing these fields.
We only need take the anon_vma write lock when we link rbtree edges from
the anon_vma to the newly established AVCs.
This also allows us to eliminate the weird GFP_NOWAIT, GFP_KERNEL dance
introduced in commit dd34739c03f2 ("mm: avoid anon_vma_chain allocation
under anon_vma lock"), further simplifying this logic.
This should reduce anon_vma lock contention, and clarify exactly where
the anon_vma lock is required.
We cannot adjust __anon_vma_prepare() in the same way as this is only
protected by VMA read lock, so we have to perform the allocation here under
the anon_vma write lock and page_table_lock (to protect against racing
threads), and we wish to retain the lock ordering.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/rmap.c | 49 +++++++++++++++++++++++++++++--------------------
1 file changed, 29 insertions(+), 20 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index 60134a566073..de9de6d71c23 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -146,14 +146,13 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}
-static void anon_vma_chain_link(struct vm_area_struct *vma,
- struct anon_vma_chain *avc,
- struct anon_vma *anon_vma)
+static void anon_vma_chain_assign(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc,
+ struct anon_vma *anon_vma)
{
avc->vma = vma;
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}
/**
@@ -210,7 +209,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
- anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_chain_assign(vma, avc, anon_vma);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
anon_vma->num_active_vmas++;
allocated = NULL;
avc = NULL;
@@ -287,20 +287,28 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
check_anon_vma_clone(dst, src);
+ /*
+ * Allocate AVCs. We don't need an anon_vma lock for this as we
+ * are not updating the anon_vma rbtree nor are we changing
+ * anon_vma statistics.
+ *
+ * We hold the mmap write lock so there's no possibility of
+ * the unlinked AVC's being observed yet.
+ */
+ list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto enomem_failure;
+
+ anon_vma_chain_assign(dst, avc, pavc->anon_vma);
+ }
+
+ /* Now link the anon_vma's back to the newly inserted AVCs. */
anon_vma_lock_write(src->anon_vma);
- list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
- struct anon_vma *anon_vma;
-
- avc = anon_vma_chain_alloc(GFP_NOWAIT);
- if (unlikely(!avc)) {
- anon_vma_unlock_write(src->anon_vma);
- avc = anon_vma_chain_alloc(GFP_KERNEL);
- if (!avc)
- goto enomem_failure;
- anon_vma_lock_write(src->anon_vma);
- }
- anon_vma = pavc->anon_vma;
- anon_vma_chain_link(dst, avc, anon_vma);
+ list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
/*
* Reuse existing anon_vma if it has no vma and only one
@@ -316,7 +324,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
}
if (dst->anon_vma)
dst->anon_vma->num_active_vmas++;
-
anon_vma_unlock_write(src->anon_vma);
return 0;
@@ -385,8 +392,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
+ anon_vma_chain_assign(vma, avc, anon_vma);
+ /* Now let rmap see it. */
anon_vma_lock_write(anon_vma);
- anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
anon_vma->parent->num_children++;
anon_vma_unlock_write(anon_vma);
--
2.52.0
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 7/8] mm/rmap: allocate anon_vma_chain objects unlocked when possible
2025-12-17 12:27 ` [PATCH 7/8] mm/rmap: allocate anon_vma_chain objects unlocked when possible Lorenzo Stoakes
@ 2025-12-30 21:35 ` Suren Baghdasaryan
0 siblings, 0 replies; 20+ messages in thread
From: Suren Baghdasaryan @ 2025-12-30 21:35 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Andrew Morton, Liam R . Howlett, Vlastimil Babka, Shakeel Butt,
David Hildenbrand, Rik van Riel, Harry Yoo, Jann Horn,
Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li, Barry Song,
linux-mm, linux-kernel
On Wed, Dec 17, 2025 at 4:27 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> There is no reason to allocate the anon_vma_chain under the anon_vma write
> lock when cloning - we can in fact assign these to the destination VMA
> safely as we hold the exclusive mmap lock and therefore preclude anybody
> else accessing these fields.
>
> We only need take the anon_vma write lock when we link rbtree edges from
> the anon_vma to the newly established AVCs.
>
> This also allows us to eliminate the weird GFP_NOWAIT, GFP_KERNEL dance
> introduced in commit dd34739c03f2 ("mm: avoid anon_vma_chain allocation
> under anon_vma lock"), further simplifying this logic.
>
> This should reduce lock anon_vma contention, and clarifies exactly where
> the anon_vma lock is required.
>
> We cannot adjust __anon_vma_prepare() in the same way as this is only
> protected by VMA read lock, so we have to perform the allocation here under
> the anon_vma write lock and page_table_lock (to protect against racing
> threads), and we wish to retain the lock ordering.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
One nit but otherwise nice cleanup.
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> ---
> mm/rmap.c | 49 +++++++++++++++++++++++++++++--------------------
> 1 file changed, 29 insertions(+), 20 deletions(-)
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 60134a566073..de9de6d71c23 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -146,14 +146,13 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
> kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
> }
>
> -static void anon_vma_chain_link(struct vm_area_struct *vma,
> - struct anon_vma_chain *avc,
> - struct anon_vma *anon_vma)
> +static void anon_vma_chain_assign(struct vm_area_struct *vma,
> + struct anon_vma_chain *avc,
> + struct anon_vma *anon_vma)
> {
> avc->vma = vma;
> avc->anon_vma = anon_vma;
> list_add(&avc->same_vma, &vma->anon_vma_chain);
> - anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
> }
>
> /**
> @@ -210,7 +209,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
> spin_lock(&mm->page_table_lock);
> if (likely(!vma->anon_vma)) {
> vma->anon_vma = anon_vma;
> - anon_vma_chain_link(vma, avc, anon_vma);
> + anon_vma_chain_assign(vma, avc, anon_vma);
> + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
> anon_vma->num_active_vmas++;
> allocated = NULL;
> avc = NULL;
> @@ -287,20 +287,28 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
>
> check_anon_vma_clone(dst, src);
>
> + /*
> + * Allocate AVCs. We don't need an anon_vma lock for this as we
> + * are not updating the anon_vma rbtree nor are we changing
> + * anon_vma statistics.
> + *
> + * We hold the mmap write lock so there's no possibility of
To be more specific, we are holding src's mmap write lock. I think
clarifying that will avoid any confusion.
> + * the unlinked AVC's being observed yet.
> + */
> + list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
> + avc = anon_vma_chain_alloc(GFP_KERNEL);
> + if (!avc)
> + goto enomem_failure;
> +
> + anon_vma_chain_assign(dst, avc, pavc->anon_vma);
> + }
> +
> + /* Now link the anon_vma's back to the newly inserted AVCs. */
> anon_vma_lock_write(src->anon_vma);
> - list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
> - struct anon_vma *anon_vma;
> -
> - avc = anon_vma_chain_alloc(GFP_NOWAIT);
> - if (unlikely(!avc)) {
> - anon_vma_unlock_write(src->anon_vma);
> - avc = anon_vma_chain_alloc(GFP_KERNEL);
> - if (!avc)
> - goto enomem_failure;
> - anon_vma_lock_write(src->anon_vma);
> - }
> - anon_vma = pavc->anon_vma;
> - anon_vma_chain_link(dst, avc, anon_vma);
> + list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) {
> + struct anon_vma *anon_vma = avc->anon_vma;
> +
> + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
>
> /*
> * Reuse existing anon_vma if it has no vma and only one
> @@ -316,7 +324,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> }
> if (dst->anon_vma)
> dst->anon_vma->num_active_vmas++;
> -
> anon_vma_unlock_write(src->anon_vma);
> return 0;
>
> @@ -385,8 +392,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
> get_anon_vma(anon_vma->root);
> /* Mark this anon_vma as the one where our new (COWed) pages go. */
> vma->anon_vma = anon_vma;
> + anon_vma_chain_assign(vma, avc, anon_vma);
> + /* Now let rmap see it. */
> anon_vma_lock_write(anon_vma);
> - anon_vma_chain_link(vma, avc, anon_vma);
> + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
> anon_vma->parent->num_children++;
> anon_vma_unlock_write(anon_vma);
>
> --
> 2.52.0
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 8/8] mm/rmap: separate out fork-only logic on anon_vma_clone()
2025-12-17 12:27 [PATCH 0/8] mm: clean up anon_vma implementation Lorenzo Stoakes
` (6 preceding siblings ...)
2025-12-17 12:27 ` [PATCH 7/8] mm/rmap: allocate anon_vma_chain objects unlocked when possible Lorenzo Stoakes
@ 2025-12-17 12:27 ` Lorenzo Stoakes
2025-12-30 22:02 ` Suren Baghdasaryan
7 siblings, 1 reply; 20+ messages in thread
From: Lorenzo Stoakes @ 2025-12-17 12:27 UTC (permalink / raw)
To: Andrew Morton
Cc: Suren Baghdasaryan, Liam R . Howlett, Vlastimil Babka,
Shakeel Butt, David Hildenbrand, Rik van Riel, Harry Yoo,
Jann Horn, Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li,
Barry Song, linux-mm, linux-kernel
Specify which operation is being performed to anon_vma_clone(), which
allows us to do checks specific to each operation type, as well as to
separate out and make clear that the anon_vma reuse logic is absolutely
specific to fork only.
This opens the door to further refactorings and refinements later as we
have more information to work with.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/internal.h | 11 +++++-
mm/rmap.c | 67 ++++++++++++++++++++++----------
mm/vma.c | 6 +--
tools/testing/vma/vma_internal.h | 11 +++++-
4 files changed, 69 insertions(+), 26 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 469d4ef1ccc5..b4d4bca0f9a7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -244,7 +244,16 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
struct anon_vma *folio_get_anon_vma(const struct folio *folio);
-int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src);
+/* Operations which modify VMAs. */
+enum vma_operation {
+ VMA_OP_SPLIT,
+ VMA_OP_MERGE_UNFAULTED,
+ VMA_OP_REMAP,
+ VMA_OP_FORK,
+};
+
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+ enum vma_operation operation);
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma);
int __anon_vma_prepare(struct vm_area_struct *vma);
void unlink_anon_vmas(struct vm_area_struct *vma);
diff --git a/mm/rmap.c b/mm/rmap.c
index de9de6d71c23..f08e6bc57379 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -232,12 +232,13 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
}
static void check_anon_vma_clone(struct vm_area_struct *dst,
- struct vm_area_struct *src)
+ struct vm_area_struct *src,
+ enum vma_operation operation)
{
/* The write lock must be held. */
mmap_assert_write_locked(src->vm_mm);
- /* If not a fork (implied by dst->anon_vma) then must be on same mm. */
- VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm);
+ /* If not a fork then must be on same mm. */
+ VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm);
/* No source anon_vma is a no-op. */
VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
@@ -249,6 +250,35 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
* must be the same across dst and src.
*/
VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
+
+ /* For the anon_vma to be compatible, it can only be singular. */
+ VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED &&
+ !list_is_singular(&src->anon_vma_chain));
+#ifdef CONFIG_PER_VMA_LOCK
+ /* Only merging an unfaulted VMA leaves the destination attached. */
+ VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED &&
+ vma_is_attached(dst));
+#endif
+}
+
+static void find_reusable_anon_vma(struct vm_area_struct *vma,
+ struct anon_vma *anon_vma)
+{
+ /* If already populated, nothing to do. */
+ if (vma->anon_vma)
+ return;
+
+ /*
+ * We reuse an anon_vma if any linking VMAs were unmapped and it has
+ * only a single child at most.
+ */
+ if (anon_vma->num_active_vmas > 0)
+ return;
+ if (anon_vma->num_children > 1)
+ return;
+
+ vma->anon_vma = anon_vma;
+ anon_vma->num_active_vmas++;
}
/**
@@ -256,6 +286,7 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
* all of the anon_vma objects contained within @src anon_vma_chain's.
* @dst: The destination VMA with an empty anon_vma_chain.
* @src: The source VMA we wish to duplicate.
+ * @operation: The type of operation which resulted in the clone.
*
* This is the heart of the VMA side of the anon_vma implementation - we invoke
* this function whenever we need to set up a new VMA's anon_vma state.
@@ -278,14 +309,16 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
*
* Returns: 0 on success, -ENOMEM on failure.
*/
-int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+ enum vma_operation operation)
{
struct anon_vma_chain *avc, *pavc;
+ struct anon_vma *active_anon_vma = src->anon_vma;
- if (!src->anon_vma)
+ if (!active_anon_vma)
return 0;
- check_anon_vma_clone(dst, src);
+ check_anon_vma_clone(dst, src, operation);
/*
* Allocate AVCs. We don't need an anon_vma lock for this as we
@@ -309,22 +342,14 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
struct anon_vma *anon_vma = avc->anon_vma;
anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
-
- /*
- * Reuse existing anon_vma if it has no vma and only one
- * anon_vma child.
- *
- * Root anon_vma is never reused:
- * it has self-parent reference and at least one child.
- */
- if (!dst->anon_vma && src->anon_vma &&
- anon_vma->num_children < 2 &&
- anon_vma->num_active_vmas == 0)
- dst->anon_vma = anon_vma;
+ if (operation == VMA_OP_FORK)
+ find_reusable_anon_vma(dst, anon_vma);
}
- if (dst->anon_vma)
+
+ if (operation != VMA_OP_FORK)
dst->anon_vma->num_active_vmas++;
- anon_vma_unlock_write(src->anon_vma);
+
+ anon_vma_unlock_write(active_anon_vma);
return 0;
enomem_failure:
@@ -361,7 +386,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
* First, attach the new VMA to the parent VMA's anon_vmas,
* so rmap can find non-COWed pages in child processes.
*/
- error = anon_vma_clone(vma, pvma);
+ error = anon_vma_clone(vma, pvma, VMA_OP_FORK);
if (error)
return error;
diff --git a/mm/vma.c b/mm/vma.c
index feb4bbd3b259..e297c3a94133 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -525,7 +525,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (err)
goto out_free_vmi;
- err = anon_vma_clone(new, vma);
+ err = anon_vma_clone(new, vma, VMA_OP_SPLIT);
if (err)
goto out_free_mpol;
@@ -623,7 +623,7 @@ static int dup_anon_vma(struct vm_area_struct *dst,
vma_assert_write_locked(dst);
dst->anon_vma = src->anon_vma;
- ret = anon_vma_clone(dst, src);
+ ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED);
if (ret)
return ret;
@@ -1862,7 +1862,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
vma_set_range(new_vma, addr, addr + len, pgoff);
if (vma_dup_policy(vma, new_vma))
goto out_free_vma;
- if (anon_vma_clone(new_vma, vma))
+ if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP))
goto out_free_mempol;
if (new_vma->vm_file)
get_file(new_vma->vm_file);
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 93e5792306d9..7fa56dcc53a6 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -600,6 +600,14 @@ struct mmap_action {
bool hide_from_rmap_until_complete :1;
};
+/* Operations which modify VMAs. */
+enum vma_operation {
+ VMA_OP_SPLIT,
+ VMA_OP_MERGE_UNFAULTED,
+ VMA_OP_REMAP,
+ VMA_OP_FORK,
+};
+
/*
* Describes a VMA that is about to be mmap()'ed. Drivers may choose to
* manipulate mutable fields which will cause those fields to be updated in the
@@ -1157,7 +1165,8 @@ static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_stru
return 0;
}
-static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+ enum vma_operation operation)
{
/* For testing purposes. We indicate that an anon_vma has been cloned. */
if (src->anon_vma != NULL) {
--
2.52.0
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 8/8] mm/rmap: separate out fork-only logic on anon_vma_clone()
2025-12-17 12:27 ` [PATCH 8/8] mm/rmap: separate out fork-only logic on anon_vma_clone() Lorenzo Stoakes
@ 2025-12-30 22:02 ` Suren Baghdasaryan
0 siblings, 0 replies; 20+ messages in thread
From: Suren Baghdasaryan @ 2025-12-30 22:02 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Andrew Morton, Liam R . Howlett, Vlastimil Babka, Shakeel Butt,
David Hildenbrand, Rik van Riel, Harry Yoo, Jann Horn,
Mike Rapoport, Michal Hocko, Pedro Falcato, Chris Li, Barry Song,
linux-mm, linux-kernel
On Wed, Dec 17, 2025 at 4:27 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> Specify which operation is being performed to anon_vma_clone(), which
> allows us to do checks specific to each operation type, as well as to
> separate out and make clear that the anon_vma reuse logic is absolutely
> specific to fork only.
>
> This opens the door to further refactorings and refinements later as we
> have more information to work with.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/internal.h | 11 +++++-
> mm/rmap.c | 67 ++++++++++++++++++++++----------
> mm/vma.c | 6 +--
> tools/testing/vma/vma_internal.h | 11 +++++-
> 4 files changed, 69 insertions(+), 26 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 469d4ef1ccc5..b4d4bca0f9a7 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -244,7 +244,16 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
>
> struct anon_vma *folio_get_anon_vma(const struct folio *folio);
>
> -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src);
> +/* Operations which modify VMAs. */
> +enum vma_operation {
> + VMA_OP_SPLIT,
> + VMA_OP_MERGE_UNFAULTED,
> + VMA_OP_REMAP,
> + VMA_OP_FORK,
> +};
> +
> +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
> + enum vma_operation operation);
> int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma);
> int __anon_vma_prepare(struct vm_area_struct *vma);
> void unlink_anon_vmas(struct vm_area_struct *vma);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index de9de6d71c23..f08e6bc57379 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -232,12 +232,13 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
> }
>
> static void check_anon_vma_clone(struct vm_area_struct *dst,
> - struct vm_area_struct *src)
> + struct vm_area_struct *src,
> + enum vma_operation operation)
> {
> /* The write lock must be held. */
> mmap_assert_write_locked(src->vm_mm);
> - /* If not a fork (implied by dst->anon_vma) then must be on same mm. */
> - VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm);
> + /* If not a fork then must be on same mm. */
> + VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm);
>
> /* No source anon_vma is a no-op. */
> VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
> @@ -249,6 +250,35 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
> * must be the same across dst and src.
> */
> VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
> +
> + /* For the anon_vma to be compatible, it can only be singular. */
> + VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED &&
> + !list_is_singular(&src->anon_vma_chain));
> +#ifdef CONFIG_PER_VMA_LOCK
> + /* Only merging an unfaulted VMA leaves the destination attached. */
Do you mean to say that for all other operations dst should not be yet
attached? If so, "leaves the destination attached" is a bit confusing
choice of words to indicate that attaching the VMA in other cases has
not yet happened. Maybe reword as: "For cases other than merging an
unfaulted VMA, the VMA should not be attached yet."
> + VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED &&
> + vma_is_attached(dst));
> +#endif
> +}
> +
> +static void find_reusable_anon_vma(struct vm_area_struct *vma,
> + struct anon_vma *anon_vma)
Naming is hard but a function that assigns vma->anon_vma and
increments num_active_vmas should not be called
"find_reusable_anon_vma". I would suggest keeping the name but making
it return the anon_vma (or NULL if it's not reusable), letting the
caller do the assignment and the increment.
> +{
> + /* If already populated, nothing to do.*/
> + if (vma->anon_vma)
> + return;
> +
> + /*
> + * We reuse an anon_vma if any linking VMAs were unmapped and it has
> + * only a single child at most.
> + */
> + if (anon_vma->num_active_vmas > 0)
> + return;
> + if (anon_vma->num_children > 1)
> + return;
> +
> + vma->anon_vma = anon_vma;
> + anon_vma->num_active_vmas++;
You moved num_active_vmas++ here to fix the accounting later... Sneaky
but correct AFAICT :)
> }
>
> /**
> @@ -256,6 +286,7 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
> * all of the anon_vma objects contained within @src anon_vma_chain's.
> * @dst: The destination VMA with an empty anon_vma_chain.
> * @src: The source VMA we wish to duplicate.
> + * @operation: The type of operation which resulted in the clone.
> *
> * This is the heart of the VMA side of the anon_vma implementation - we invoke
> * this function whenever we need to set up a new VMA's anon_vma state.
> @@ -278,14 +309,16 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
> *
> * Returns: 0 on success, -ENOMEM on failure.
> */
> -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
> + enum vma_operation operation)
> {
> struct anon_vma_chain *avc, *pavc;
> + struct anon_vma *active_anon_vma = src->anon_vma;
>
> - if (!src->anon_vma)
> + if (!active_anon_vma)
> return 0;
>
> - check_anon_vma_clone(dst, src);
> + check_anon_vma_clone(dst, src, operation);
>
> /*
> * Allocate AVCs. We don't need an anon_vma lock for this as we
> @@ -309,22 +342,14 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> struct anon_vma *anon_vma = avc->anon_vma;
>
> anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
> -
> - /*
> - * Reuse existing anon_vma if it has no vma and only one
> - * anon_vma child.
> - *
> - * Root anon_vma is never reused:
> - * it has self-parent reference and at least one child.
> - */
> - if (!dst->anon_vma && src->anon_vma &&
> - anon_vma->num_children < 2 &&
> - anon_vma->num_active_vmas == 0)
> - dst->anon_vma = anon_vma;
> + if (operation == VMA_OP_FORK)
> + find_reusable_anon_vma(dst, anon_vma);
> }
> - if (dst->anon_vma)
> +
> + if (operation != VMA_OP_FORK)
> dst->anon_vma->num_active_vmas++;
> - anon_vma_unlock_write(src->anon_vma);
> +
> + anon_vma_unlock_write(active_anon_vma);
> return 0;
>
> enomem_failure:
> @@ -361,7 +386,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
> * First, attach the new VMA to the parent VMA's anon_vmas,
> * so rmap can find non-COWed pages in child processes.
> */
> - error = anon_vma_clone(vma, pvma);
> + error = anon_vma_clone(vma, pvma, VMA_OP_FORK);
> if (error)
> return error;
>
> diff --git a/mm/vma.c b/mm/vma.c
> index feb4bbd3b259..e297c3a94133 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -525,7 +525,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> if (err)
> goto out_free_vmi;
>
> - err = anon_vma_clone(new, vma);
> + err = anon_vma_clone(new, vma, VMA_OP_SPLIT);
> if (err)
> goto out_free_mpol;
>
> @@ -623,7 +623,7 @@ static int dup_anon_vma(struct vm_area_struct *dst,
>
> vma_assert_write_locked(dst);
> dst->anon_vma = src->anon_vma;
> - ret = anon_vma_clone(dst, src);
> + ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED);
> if (ret)
> return ret;
>
> @@ -1862,7 +1862,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> vma_set_range(new_vma, addr, addr + len, pgoff);
> if (vma_dup_policy(vma, new_vma))
> goto out_free_vma;
> - if (anon_vma_clone(new_vma, vma))
> + if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP))
> goto out_free_mempol;
> if (new_vma->vm_file)
> get_file(new_vma->vm_file);
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 93e5792306d9..7fa56dcc53a6 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -600,6 +600,14 @@ struct mmap_action {
> bool hide_from_rmap_until_complete :1;
> };
>
> +/* Operations which modify VMAs. */
> +enum vma_operation {
> + VMA_OP_SPLIT,
> + VMA_OP_MERGE_UNFAULTED,
> + VMA_OP_REMAP,
> + VMA_OP_FORK,
> +};
> +
> /*
> * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
> * manipulate mutable fields which will cause those fields to be updated in the
> @@ -1157,7 +1165,8 @@ static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_stru
> return 0;
> }
>
> -static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
> + enum vma_operation operation)
> {
> /* For testing purposes. We indicate that an anon_vma has been cloned. */
> if (src->anon_vma != NULL) {
> --
> 2.52.0
>
^ permalink raw reply [flat|nested] 20+ messages in thread