linux-mm.kvack.org archive mirror
* [PATCH 0/2] vma_start_write_killable
@ 2025-11-03 18:03 Matthew Wilcox (Oracle)
  2025-11-03 18:03 ` [PATCH 1/2] mm: Add vma_start_write_killable() Matthew Wilcox (Oracle)
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Matthew Wilcox (Oracle) @ 2025-11-03 18:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox (Oracle),
	linux-mm, Suren Baghdasaryan, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Shakeel Butt, Jann Horn, Pedro Falcato,
	Chris Li

When we added the VMA lock, we made a major oversight in not adding a
killable variant.  That can run us into trouble where a thread takes
the VMA lock for read (eg handling a page fault) and then goes out to
lunch for an hour (eg doing reclaim).  Another thread tries to modify
the VMA, taking the mmap_lock for write, then attempts to lock the VMA
for write.  That blocks behind the first thread, and since the
mmap_lock is now held for write, every page fault which falls back to
the mmap_lock blocks as well.  Because everything's in an
uninterruptible sleep, we can't kill the task, which makes me angry.

This patch set just adds vma_start_write_killable() and converts one
caller to use it.  Most callers are somewhat tricky to convert, so
expect individual follow-up patches per call-site; each needs careful
analysis to make sure we do the proper cleanup.
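
For anyone converting other call-sites, the pattern looks roughly like
this (a minimal sketch; the error label and the cleanup it jumps to are
per call-site and made up here):

	/* was: vma_start_write(vma); */
	err = vma_start_write_killable(vma);
	if (err)			/* -EINTR: fatal signal while waiting */
		goto out_unwind;	/* undo whatever this caller set up, return err */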

Matthew Wilcox (Oracle) (2):
  mm: Add vma_start_write_killable()
  mm: Use vma_start_write_killable() in dup_mmap()

 include/linux/mmap_lock.h        | 31 +++++++++++++++++++++++++++++--
 mm/mmap.c                        | 12 +++---------
 mm/mmap_lock.c                   | 27 ++++++++++++++++++---------
 tools/testing/vma/vma_internal.h |  8 ++++++++
 4 files changed, 58 insertions(+), 20 deletions(-)

-- 
2.47.2




* [PATCH 1/2] mm: Add vma_start_write_killable()
  2025-11-03 18:03 [PATCH 0/2] vma_start_write_killable Matthew Wilcox (Oracle)
@ 2025-11-03 18:03 ` Matthew Wilcox (Oracle)
  2025-11-03 21:53   ` Suren Baghdasaryan
                     ` (2 more replies)
  2025-11-03 18:03 ` [PATCH 2/2] mm: Use vma_start_write_killable() in dup_mmap() Matthew Wilcox (Oracle)
  2025-11-04  9:08 ` [syzbot ci] Re: vma_start_write_killable syzbot ci
  2 siblings, 3 replies; 11+ messages in thread
From: Matthew Wilcox (Oracle) @ 2025-11-03 18:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox (Oracle),
	linux-mm, Suren Baghdasaryan, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Shakeel Butt, Jann Horn, Pedro Falcato,
	Chris Li

The vma can be held read-locked for a substantial period of time, eg if
memory allocation needs to go into reclaim.  It's useful to be able to
send fatal signals to threads which are waiting for the write lock.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/mmap_lock.h        | 31 +++++++++++++++++++++++++++++--
 mm/mmap_lock.c                   | 27 ++++++++++++++++++---------
 tools/testing/vma/vma_internal.h |  8 ++++++++
 3 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 2c9fffa58714..b198d6443355 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -195,7 +195,8 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l
 	return (vma->vm_lock_seq == *mm_lock_seq);
 }
 
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);
+int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
+		int state);
 
 /*
  * Begin writing to a VMA.
@@ -209,7 +210,30 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 	if (__is_vma_write_locked(vma, &mm_lock_seq))
 		return;
 
-	__vma_start_write(vma, mm_lock_seq);
+	__vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE);
+}
+
+/**
+ * vma_start_write_killable - Begin writing to a VMA.
+ * @vma: The VMA we are going to modify.
+ *
+ * Exclude concurrent readers under the per-VMA lock until the currently
+ * write-locked mmap_lock is dropped or downgraded.
+ *
+ * Context: May sleep while waiting for readers to drop the vma read lock.
+ * Caller must already hold the mmap_lock for write.
+ *
+ * Return: 0 for a successful acquisition.  -EINTR if a fatal signal was
+ * received.
+ */
+static inline
+int __must_check vma_start_write_killable(struct vm_area_struct *vma)
+{
+	unsigned int mm_lock_seq;
+
+	if (__is_vma_write_locked(vma, &mm_lock_seq))
+		return 0;
+	return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE);
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -286,6 +310,9 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
 		{ return NULL; }
 static inline void vma_end_read(struct vm_area_struct *vma) {}
 static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline
+int __must_check vma_start_write_killable(struct vm_area_struct *vma)
+{ return 0; }
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 		{ mmap_assert_write_locked(vma->vm_mm); }
 static inline void vma_assert_attached(struct vm_area_struct *vma) {}
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 0a0db5849b8e..dbaa6376a870 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -45,8 +45,10 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released);
 
 #ifdef CONFIG_MMU
 #ifdef CONFIG_PER_VMA_LOCK
-static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+static inline int __vma_enter_locked(struct vm_area_struct *vma,
+		bool detaching, int state)
 {
+	int err;
 	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
 
 	/* Additional refcnt if the vma is attached. */
@@ -58,15 +60,17 @@ static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching
 	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
 	 */
 	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
-		return false;
+		return 0;
 
 	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
-	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
 		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
-		   TASK_UNINTERRUPTIBLE);
+		   state);
+	if (err)
+		return err;
 	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
 
-	return true;
+	return 1;
 }
 
 static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
@@ -75,16 +79,19 @@ static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
 	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
 }
 
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
+int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
+		int state)
 {
-	bool locked;
+	int locked;
 
 	/*
 	 * __vma_enter_locked() returns false immediately if the vma is not
 	 * attached, otherwise it waits until refcnt is indicating that vma
 	 * is attached with no readers.
 	 */
-	locked = __vma_enter_locked(vma, false);
+	locked = __vma_enter_locked(vma, false, state);
+	if (locked < 0)
+		return locked;
 
 	/*
 	 * We should use WRITE_ONCE() here because we can have concurrent reads
@@ -100,6 +107,8 @@ void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
 		__vma_exit_locked(vma, &detached);
 		WARN_ON_ONCE(detached); /* vma should remain attached */
 	}
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(__vma_start_write);
 
@@ -118,7 +127,7 @@ void vma_mark_detached(struct vm_area_struct *vma)
 	 */
 	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
 		/* Wait until vma is detached with no readers. */
-		if (__vma_enter_locked(vma, true)) {
+		if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
 			bool detached;
 
 			__vma_exit_locked(vma, &detached);
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index dc976a285ad2..917062cfbc69 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -844,6 +844,14 @@ static inline void vma_start_write(struct vm_area_struct *vma)
 	vma->vm_lock_seq++;
 }
 
+static inline __must_check
+int vma_start_write_killable(struct vm_area_struct *vma)
+{
+	/* Used to indicate to tests that a write operation has begun. */
+	vma->vm_lock_seq++;
+	return 0;
+}
+
 static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
 					 unsigned long start,
 					 unsigned long end,
-- 
2.47.2




* [PATCH 2/2] mm: Use vma_start_write_killable() in dup_mmap()
  2025-11-03 18:03 [PATCH 0/2] vma_start_write_killable Matthew Wilcox (Oracle)
  2025-11-03 18:03 ` [PATCH 1/2] mm: Add vma_start_write_killable() Matthew Wilcox (Oracle)
@ 2025-11-03 18:03 ` Matthew Wilcox (Oracle)
  2025-11-03 21:56   ` Suren Baghdasaryan
  2025-11-07 19:12   ` Liam R. Howlett
  2025-11-04  9:08 ` [syzbot ci] Re: vma_start_write_killable syzbot ci
  2 siblings, 2 replies; 11+ messages in thread
From: Matthew Wilcox (Oracle) @ 2025-11-03 18:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox (Oracle),
	linux-mm, Suren Baghdasaryan, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Shakeel Butt, Jann Horn, Pedro Falcato,
	Chris Li

Allow waiting for the VMA write lock to be interrupted by fatal signals.
We can remove the explicit check for fatal_signal_pending() as we will
check it during vma_start_write_killable().  This is an improvement as
we will not wait for the reader to finish before checking for signals.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/mmap.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 5fd3b80fda1d..03773fe777b7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1750,7 +1750,9 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	for_each_vma(vmi, mpnt) {
 		struct file *file;
 
-		vma_start_write(mpnt);
+		retval = vma_start_write_killable(mpnt);
+		if (retval < 0)
+			goto loop_out;
 		if (mpnt->vm_flags & VM_DONTCOPY) {
 			retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
 						    mpnt->vm_end, GFP_KERNEL);
@@ -1761,14 +1763,6 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			continue;
 		}
 		charge = 0;
-		/*
-		 * Don't duplicate many vmas if we've been oom-killed (for
-		 * example)
-		 */
-		if (fatal_signal_pending(current)) {
-			retval = -EINTR;
-			goto loop_out;
-		}
 		if (mpnt->vm_flags & VM_ACCOUNT) {
 			unsigned long len = vma_pages(mpnt);
 
-- 
2.47.2




* Re: [PATCH 1/2] mm: Add vma_start_write_killable()
  2025-11-03 18:03 ` [PATCH 1/2] mm: Add vma_start_write_killable() Matthew Wilcox (Oracle)
@ 2025-11-03 21:53   ` Suren Baghdasaryan
  2025-11-03 23:14     ` Matthew Wilcox
  2025-11-04 16:09   ` Matthew Wilcox
  2025-11-07 19:12   ` Liam R. Howlett
  2 siblings, 1 reply; 11+ messages in thread
From: Suren Baghdasaryan @ 2025-11-03 21:53 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Andrew Morton, linux-mm, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Shakeel Butt, Jann Horn, Pedro Falcato,
	Chris Li

On Mon, Nov 3, 2025 at 10:03 AM Matthew Wilcox (Oracle)
<willy@infradead.org> wrote:
>
> The vma can be held read-locked for a substantial period of time, eg if
> memory allocation needs to go into reclaim.  It's useful to be able to
> send fatal signals to threads which are waiting for the write lock.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Thanks Matthew!
With the below suggestions addressed,

Reviewed-by: Suren Baghdasaryan <surenb@google.com>

> ---
>  include/linux/mmap_lock.h        | 31 +++++++++++++++++++++++++++++--
>  mm/mmap_lock.c                   | 27 ++++++++++++++++++---------
>  tools/testing/vma/vma_internal.h |  8 ++++++++
>  3 files changed, 55 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> index 2c9fffa58714..b198d6443355 100644
> --- a/include/linux/mmap_lock.h
> +++ b/include/linux/mmap_lock.h
> @@ -195,7 +195,8 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l
>         return (vma->vm_lock_seq == *mm_lock_seq);
>  }
>
> -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);
> +int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
> +               int state);
>
>  /*
>   * Begin writing to a VMA.
> @@ -209,7 +210,30 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>         if (__is_vma_write_locked(vma, &mm_lock_seq))
>                 return;
>
> -       __vma_start_write(vma, mm_lock_seq);
> +       __vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE);
> +}
> +
> +/**
> + * vma_start_write_killable - Begin writing to a VMA.
> + * @vma: The VMA we are going to modify.
> + *
> + * Exclude concurrent readers under the per-VMA lock until the currently
> + * write-locked mmap_lock is dropped or downgraded.
> + *
> + * Context: May sleep while waiting for readers to drop the vma read lock.
> + * Caller must already hold the mmap_lock for write.
> + *
> + * Return: 0 for a successful acquisition.  -EINTR if a fatal signal was
> + * received.
> + */
> +static inline
> +int __must_check vma_start_write_killable(struct vm_area_struct *vma)
> +{
> +       unsigned int mm_lock_seq;
> +
> +       if (__is_vma_write_locked(vma, &mm_lock_seq))
> +               return 0;
> +       return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE);
>  }
>
>  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> @@ -286,6 +310,9 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
>                 { return NULL; }
>  static inline void vma_end_read(struct vm_area_struct *vma) {}
>  static inline void vma_start_write(struct vm_area_struct *vma) {}
> +static inline
> +int __must_check vma_start_write_killable(struct vm_area_struct *vma)
> +{ return 0; }

nit: a tab for consistency with other stubs please.

>  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
>                 { mmap_assert_write_locked(vma->vm_mm); }
>  static inline void vma_assert_attached(struct vm_area_struct *vma) {}
> diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
> index 0a0db5849b8e..dbaa6376a870 100644
> --- a/mm/mmap_lock.c
> +++ b/mm/mmap_lock.c
> @@ -45,8 +45,10 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released);
>
>  #ifdef CONFIG_MMU
>  #ifdef CONFIG_PER_VMA_LOCK

Let's add a comment to list possible return values:
0 - the vma is not attached;
1 - the vma is attached with no readers;
negative - an error code;

> -static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
> +static inline int __vma_enter_locked(struct vm_area_struct *vma,
> +               bool detaching, int state)
>  {
> +       int err;
>         unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
>
>         /* Additional refcnt if the vma is attached. */
> @@ -58,15 +60,17 @@ static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching
>          * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
>          */
>         if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
> -               return false;
> +               return 0;
>
>         rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> -       rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
> +       err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
>                    refcount_read(&vma->vm_refcnt) == tgt_refcnt,
> -                  TASK_UNINTERRUPTIBLE);
> +                  state);
> +       if (err)
> +               return err;
>         lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
>
> -       return true;
> +       return 1;
>  }
>
>  static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
> @@ -75,16 +79,19 @@ static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
>         rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
>  }
>
> -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
> +int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
> +               int state)
>  {
> -       bool locked;
> +       int locked;
>
>         /*
>          * __vma_enter_locked() returns false immediately if the vma is not
>          * attached, otherwise it waits until refcnt is indicating that vma
>          * is attached with no readers.
>          */
> -       locked = __vma_enter_locked(vma, false);
> +       locked = __vma_enter_locked(vma, false, state);
> +       if (locked < 0)
> +               return locked;
>
>         /*
>          * We should use WRITE_ONCE() here because we can have concurrent reads
> @@ -100,6 +107,8 @@ void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
>                 __vma_exit_locked(vma, &detached);
>                 WARN_ON_ONCE(detached); /* vma should remain attached */
>         }
> +
> +       return 0;
>  }
>  EXPORT_SYMBOL_GPL(__vma_start_write);
>
> @@ -118,7 +127,7 @@ void vma_mark_detached(struct vm_area_struct *vma)
>          */
>         if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
>                 /* Wait until vma is detached with no readers. */
> -               if (__vma_enter_locked(vma, true)) {
> +               if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
>                         bool detached;
>
>                         __vma_exit_locked(vma, &detached);
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index dc976a285ad2..917062cfbc69 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -844,6 +844,14 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>         vma->vm_lock_seq++;
>  }
>
> +static inline __must_check
> +int vma_start_write_killable(struct vm_area_struct *vma)
> +{
> +       /* Used to indicate to tests that a write operation has begun. */
> +       vma->vm_lock_seq++;
> +       return 0;
> +}
> +
>  static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
>                                          unsigned long start,
>                                          unsigned long end,
> --
> 2.47.2
>



* Re: [PATCH 2/2] mm: Use vma_start_write_killable() in dup_mmap()
  2025-11-03 18:03 ` [PATCH 2/2] mm: Use vma_start_write_killable() in dup_mmap() Matthew Wilcox (Oracle)
@ 2025-11-03 21:56   ` Suren Baghdasaryan
  2025-11-07 19:12   ` Liam R. Howlett
  1 sibling, 0 replies; 11+ messages in thread
From: Suren Baghdasaryan @ 2025-11-03 21:56 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Andrew Morton, linux-mm, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Shakeel Butt, Jann Horn, Pedro Falcato,
	Chris Li

On Mon, Nov 3, 2025 at 10:03 AM Matthew Wilcox (Oracle)
<willy@infradead.org> wrote:
>
> Allow waiting for the VMA write lock to be interrupted by fatal signals.
> We can remove the explicit check for fatal_signal_pending() as we will
> check it during vma_start_write_killable().  This is an improvement as
> we will not wait for the reader to finish before checking for signals.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Reviewed-by: Suren Baghdasaryan <surenb@google.com>

> ---
>  mm/mmap.c | 12 +++---------
>  1 file changed, 3 insertions(+), 9 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 5fd3b80fda1d..03773fe777b7 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1750,7 +1750,9 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
>         for_each_vma(vmi, mpnt) {
>                 struct file *file;
>
> -               vma_start_write(mpnt);
> +               retval = vma_start_write_killable(mpnt);
> +               if (retval < 0)
> +                       goto loop_out;
>                 if (mpnt->vm_flags & VM_DONTCOPY) {
>                         retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
>                                                     mpnt->vm_end, GFP_KERNEL);
> @@ -1761,14 +1763,6 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
>                         continue;
>                 }
>                 charge = 0;
> -               /*
> -                * Don't duplicate many vmas if we've been oom-killed (for
> -                * example)
> -                */
> -               if (fatal_signal_pending(current)) {
> -                       retval = -EINTR;
> -                       goto loop_out;
> -               }
>                 if (mpnt->vm_flags & VM_ACCOUNT) {
>                         unsigned long len = vma_pages(mpnt);
>
> --
> 2.47.2
>



* Re: [PATCH 1/2] mm: Add vma_start_write_killable()
  2025-11-03 21:53   ` Suren Baghdasaryan
@ 2025-11-03 23:14     ` Matthew Wilcox
  2025-11-03 23:17       ` Suren Baghdasaryan
  0 siblings, 1 reply; 11+ messages in thread
From: Matthew Wilcox @ 2025-11-03 23:14 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Andrew Morton, linux-mm, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Shakeel Butt, Jann Horn, Pedro Falcato,
	Chris Li

On Mon, Nov 03, 2025 at 01:53:44PM -0800, Suren Baghdasaryan wrote:
> > @@ -286,6 +310,9 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
> >                 { return NULL; }
> >  static inline void vma_end_read(struct vm_area_struct *vma) {}
> >  static inline void vma_start_write(struct vm_area_struct *vma) {}
> > +static inline
> > +int __must_check vma_start_write_killable(struct vm_area_struct *vma)
> > +{ return 0; }
> 
> nit: a tab for consistency with other stubs please.

No.  This is a stupid indentation style that's not in use anywhere else.
There is not one inline function in linux/mm.h that uses this formatting.
If anything, I'd correct the other stubs to match the rest of the kernel.

> Let's add a comment to list possible return values:
> 0 - the vma is not attached;
> 1 - the vma is attached with no readers;
> negative - an error code;

I considered doing that, but decided it wasn't worth doing since there
are only two callers and it's a static function.  If you feel strongly
I'll add it though.




* Re: [PATCH 1/2] mm: Add vma_start_write_killable()
  2025-11-03 23:14     ` Matthew Wilcox
@ 2025-11-03 23:17       ` Suren Baghdasaryan
  0 siblings, 0 replies; 11+ messages in thread
From: Suren Baghdasaryan @ 2025-11-03 23:17 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Andrew Morton, linux-mm, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Shakeel Butt, Jann Horn, Pedro Falcato,
	Chris Li

On Mon, Nov 3, 2025 at 3:15 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Mon, Nov 03, 2025 at 01:53:44PM -0800, Suren Baghdasaryan wrote:
> > > @@ -286,6 +310,9 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
> > >                 { return NULL; }
> > >  static inline void vma_end_read(struct vm_area_struct *vma) {}
> > >  static inline void vma_start_write(struct vm_area_struct *vma) {}
> > > +static inline
> > > +int __must_check vma_start_write_killable(struct vm_area_struct *vma)
> > > +{ return 0; }
> >
> > nit: a tab for consistency with other stubs please.
>
> No.  This is a stupid indentation style that's not in use anywhere else.
> There is not one inline function in linux/mm.h that uses this formatting.
> If anything, I'd correct the other stubs to match the rest of the kernel.

Ok, I wasn't aware of that.

>
> > Let's add a comment to list possible return values:
> > 0 - the vma is not attached;
> > 1 - the vma is attached with no readers;
> > negative - an error code;
>
> I considered doing that, but decided it wasn't worth doing since there
> are only two callers and it's a static function.  If you feel strongly
> I'll add it though.

If it's not too much trouble. I think it's not very clear from reading
the code, so a comment would be helpful.


>



* [syzbot ci] Re: vma_start_write_killable
  2025-11-03 18:03 [PATCH 0/2] vma_start_write_killable Matthew Wilcox (Oracle)
  2025-11-03 18:03 ` [PATCH 1/2] mm: Add vma_start_write_killable() Matthew Wilcox (Oracle)
  2025-11-03 18:03 ` [PATCH 2/2] mm: Use vma_start_write_killable() in dup_mmap() Matthew Wilcox (Oracle)
@ 2025-11-04  9:08 ` syzbot ci
  2 siblings, 0 replies; 11+ messages in thread
From: syzbot ci @ 2025-11-04  9:08 UTC (permalink / raw)
  To: akpm, chriscli, jannh, liam.howlett, linux-mm, lorenzo.stoakes,
	pfalcato, shakeel.butt, surenb, vbabka, willy
  Cc: syzbot, syzkaller-bugs

syzbot ci has tested the following series

[v1] vma_start_write_killable
https://lore.kernel.org/all/20251103180348.3368668-1-willy@infradead.org
* [PATCH 1/2] mm: Add vma_start_write_killable()
* [PATCH 2/2] mm: Use vma_start_write_killable() in dup_mmap()

and found the following issue:
possible deadlock in exit_mmap

Full report is available here:
https://ci.syzbot.org/series/3eb0eeab-3254-43ec-ad2d-e439ab81ad3e

***

possible deadlock in exit_mmap

tree:      torvalds
URL:       https://kernel.googlesource.com/pub/scm/linux/kernel/git/torvalds/linux
base:      fd57572253bc356330dbe5b233c2e1d8426c66fd
arch:      amd64
compiler:  Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8
config:    https://ci.syzbot.org/builds/b55422d4-9fe5-4a2c-9938-3e67223a358f/config
C repro:   https://ci.syzbot.org/findings/ecb8b9e5-1c01-48c4-b7b6-9155dea27118/c_repro
syz repro: https://ci.syzbot.org/findings/ecb8b9e5-1c01-48c4-b7b6-9155dea27118/syz_repro

======================================================
WARNING: possible circular locking dependency detected
syzkaller #0 Not tainted
------------------------------------------------------
syz.0.24/6004 is trying to acquire lock:
ffff88810f7fcd20 (&mm->mmap_lock){++++}-{4:4}, at: mmap_read_lock include/linux/mmap_lock.h:395 [inline]
ffff88810f7fcd20 (&mm->mmap_lock){++++}-{4:4}, at: exit_mmap+0x126/0xb40 mm/mmap.c:1265

but task is already holding lock:
ffff888111ad0f88 (vm_lock){++++}-{0:0}, at: __vma_start_write+0x23/0x140 mm/mmap_lock.c:92

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (vm_lock){++++}-{0:0}:
       lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
       __vma_enter_locked+0x1a0/0x570 mm/mmap_lock.c:65
       __vma_start_write+0x23/0x140 mm/mmap_lock.c:92
       vma_start_write include/linux/mmap_lock.h:213 [inline]
       mprotect_fixup+0x57d/0x9c0 mm/mprotect.c:828
       setup_arg_pages+0x52a/0xa90 fs/exec.c:670
       load_elf_binary+0xba4/0x2740 fs/binfmt_elf.c:1028
       search_binary_handler fs/exec.c:1670 [inline]
       exec_binprm fs/exec.c:1702 [inline]
       bprm_execve+0x99c/0x1450 fs/exec.c:1754
       kernel_execve+0x8f0/0x9f0 fs/exec.c:1920
       try_to_run_init_process+0x13/0x60 init/main.c:1411
       kernel_init+0xad/0x1d0 init/main.c:1539
       ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158
       ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245

-> #0 (&mm->mmap_lock){++++}-{4:4}:
       check_prev_add kernel/locking/lockdep.c:3165 [inline]
       check_prevs_add kernel/locking/lockdep.c:3284 [inline]
       validate_chain+0xb9b/0x2140 kernel/locking/lockdep.c:3908
       __lock_acquire+0xab9/0xd20 kernel/locking/lockdep.c:5237
       lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
       down_read+0x46/0x2e0 kernel/locking/rwsem.c:1537
       mmap_read_lock include/linux/mmap_lock.h:395 [inline]
       exit_mmap+0x126/0xb40 mm/mmap.c:1265
       __mmput+0x118/0x430 kernel/fork.c:1133
       mmput kernel/fork.c:1156 [inline]
       dup_mm kernel/fork.c:1506 [inline]
       copy_mm+0x1f3/0x4b0 kernel/fork.c:1541
       copy_process+0x1706/0x3c00 kernel/fork.c:2181
       kernel_clone+0x21e/0x840 kernel/fork.c:2609
       __do_sys_clone kernel/fork.c:2750 [inline]
       __se_sys_clone kernel/fork.c:2734 [inline]
       __x64_sys_clone+0x18b/0x1e0 kernel/fork.c:2734
       do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
       do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94
       entry_SYSCALL_64_after_hwframe+0x77/0x7f

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(vm_lock);
                               lock(&mm->mmap_lock);
                               lock(vm_lock);
  rlock(&mm->mmap_lock);

 *** DEADLOCK ***

2 locks held by syz.0.24/6004:
 #0: ffffffff8dff64d0 (dup_mmap_sem){.+.+}-{0:0}, at: dup_mm kernel/fork.c:1488 [inline]
 #0: ffffffff8dff64d0 (dup_mmap_sem){.+.+}-{0:0}, at: copy_mm+0x131/0x4b0 kernel/fork.c:1541
 #1: ffff888111ad0f88 (vm_lock){++++}-{0:0}, at: __vma_start_write+0x23/0x140 mm/mmap_lock.c:92

stack backtrace:
CPU: 0 UID: 0 PID: 6004 Comm: syz.0.24 Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120
 print_circular_bug+0x2ee/0x310 kernel/locking/lockdep.c:2043
 check_noncircular+0x134/0x160 kernel/locking/lockdep.c:2175
 check_prev_add kernel/locking/lockdep.c:3165 [inline]
 check_prevs_add kernel/locking/lockdep.c:3284 [inline]
 validate_chain+0xb9b/0x2140 kernel/locking/lockdep.c:3908
 __lock_acquire+0xab9/0xd20 kernel/locking/lockdep.c:5237
 lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
 down_read+0x46/0x2e0 kernel/locking/rwsem.c:1537
 mmap_read_lock include/linux/mmap_lock.h:395 [inline]
 exit_mmap+0x126/0xb40 mm/mmap.c:1265
 __mmput+0x118/0x430 kernel/fork.c:1133
 mmput kernel/fork.c:1156 [inline]
 dup_mm kernel/fork.c:1506 [inline]
 copy_mm+0x1f3/0x4b0 kernel/fork.c:1541
 copy_process+0x1706/0x3c00 kernel/fork.c:2181
 kernel_clone+0x21e/0x840 kernel/fork.c:2609
 __do_sys_clone kernel/fork.c:2750 [inline]
 __se_sys_clone kernel/fork.c:2734 [inline]
 __x64_sys_clone+0x18b/0x1e0 kernel/fork.c:2734
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f45ab18efc9
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f45abfaefe8 EFLAGS: 00000206 ORIG_RAX: 0000000000000038
RAX: ffffffffffffffda RBX: 00007f45ab3e6090 RCX: 00007f45ab18efc9
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000001000
RBP: 00007f45ab211f91 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000
R13: 00007f45ab3e6128 R14: 00007f45ab3e6090 R15: 00007ffe9b2a02c8
 </TASK>
------------[ cut here ]------------
refcount_t: saturated; leaking memory.
WARNING: CPU: 0 PID: 6004 at lib/refcount.c:19 refcount_warn_saturate+0x13a/0x1d0 lib/refcount.c:19
Modules linked in:
CPU: 0 UID: 0 PID: 6004 Comm: syz.0.24 Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:refcount_warn_saturate+0x13a/0x1d0 lib/refcount.c:19
Code: 20 57 be 8b e8 87 a8 f9 fc 90 0f 0b 90 90 eb b7 e8 6b 8c 36 fd c6 05 1b 75 dd 0a 01 90 48 c7 c7 60 56 be 8b e8 67 a8 f9 fc 90 <0f> 0b 90 90 eb 97 e8 4b 8c 36 fd c6 05 ff 74 dd 0a 01 90 48 c7 c7
RSP: 0018:ffffc900036f75a8 EFLAGS: 00010246
RAX: 37ca69179b3b1c00 RBX: 0000000000000000 RCX: ffff88810c6c0000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000002
RBP: ffffc900036f76b0 R08: 0000000000000003 R09: 0000000000000004
R10: dffffc0000000000 R11: fffffbfff1bba650 R12: ffff888111ad0f80
R13: 1ffff920006deec4 R14: ffff888111ad0f80 R15: 0000000000000000
FS:  00007f45abfaf6c0(0000) GS:ffff88818eb3e000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f45abfaefc8 CR3: 00000001bbaba000 CR4: 00000000000006f0
Call Trace:
 <TASK>
 __refcount_add_not_zero include/linux/refcount.h:187 [inline]
 refcount_add_not_zero include/linux/refcount.h:212 [inline]
 __vma_enter_locked+0x534/0x570 mm/mmap_lock.c:62
 __vma_start_write+0x23/0x140 mm/mmap_lock.c:92
 vma_start_write include/linux/mmap_lock.h:213 [inline]
 vma_merge_existing_range mm/vma.c:905 [inline]
 vma_modify+0xce0/0x1970 mm/vma.c:1608
 vma_modify_flags_uffd+0x204/0x250 mm/vma.c:1697
 userfaultfd_clear_vma mm/userfaultfd.c:2030 [inline]
 userfaultfd_release_all+0x34c/0x5d0 mm/userfaultfd.c:2149
 userfaultfd_release+0xe7/0x1b0 fs/userfaultfd.c:868
 __fput+0x44c/0xa70 fs/file_table.c:468
 task_work_run+0x1d4/0x260 kernel/task_work.c:227
 get_signal+0x11ec/0x1340 kernel/signal.c:2807
 arch_do_signal_or_restart+0xa0/0x790 arch/x86/kernel/signal.c:337
 exit_to_user_mode_loop+0x72/0x130 kernel/entry/common.c:40
 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline]
 syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline]
 syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline]
 do_syscall_64+0x2bd/0xfa0 arch/x86/entry/syscall_64.c:100
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f45ab18efc9
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f45abfaefe8 EFLAGS: 00000206 ORIG_RAX: 0000000000000038
RAX: fffffffffffffff4 RBX: 00007f45ab3e6090 RCX: 00007f45ab18efc9
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000001000
RBP: 00007f45ab211f91 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000
R13: 00007f45ab3e6128 R14: 00007f45ab3e6090 R15: 00007ffe9b2a02c8
 </TASK>


***

If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
  Tested-by: syzbot@syzkaller.appspotmail.com

---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.



* Re: [PATCH 1/2] mm: Add vma_start_write_killable()
  2025-11-03 18:03 ` [PATCH 1/2] mm: Add vma_start_write_killable() Matthew Wilcox (Oracle)
  2025-11-03 21:53   ` Suren Baghdasaryan
@ 2025-11-04 16:09   ` Matthew Wilcox
  2025-11-07 19:12   ` Liam R. Howlett
  2 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2025-11-04 16:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, Suren Baghdasaryan, Liam R. Howlett, Lorenzo Stoakes,
	Vlastimil Babka, Shakeel Butt, Jann Horn, Pedro Falcato,
	Chris Li

On Mon, Nov 03, 2025 at 06:03:45PM +0000, Matthew Wilcox (Oracle) wrote:
> @@ -58,15 +60,17 @@ static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching
>  	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
>  	 */
>  	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
> -		return false;
> +		return 0;
>  
>  	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> -	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
> +	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
>  		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
> -		   TASK_UNINTERRUPTIBLE);
> +		   state);
> +	if (err)
> +		return err;
>  	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
>  
> -	return true;
> +	return 1;

I'll wait a couple more days for review before sending v2, but the fix
for the syzbot issue is:

@@ -66,8 +71,10 @@ static inline int __vma_enter_locked(struct vm_area_struct *vma,
        err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
                   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
                   state);
-       if (err)
+       if (err) {
+               rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
                return err;
+       }
        lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

        return 1;
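
With that folded in, the error path of __vma_enter_locked() ends up
looking roughly like this (the combined result shown only for
reference, not a separate patch):

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
		   state);
	if (err) {
		/* Drop the lockdep acquisition taken just above. */
		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
		return err;
	}
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return 1;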




* Re: [PATCH 1/2] mm: Add vma_start_write_killable()
  2025-11-03 18:03 ` [PATCH 1/2] mm: Add vma_start_write_killable() Matthew Wilcox (Oracle)
  2025-11-03 21:53   ` Suren Baghdasaryan
  2025-11-04 16:09   ` Matthew Wilcox
@ 2025-11-07 19:12   ` Liam R. Howlett
  2 siblings, 0 replies; 11+ messages in thread
From: Liam R. Howlett @ 2025-11-07 19:12 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Andrew Morton, linux-mm, Suren Baghdasaryan, Lorenzo Stoakes,
	Vlastimil Babka, Shakeel Butt, Jann Horn, Pedro Falcato,
	Chris Li

* Matthew Wilcox (Oracle) <willy@infradead.org> [251103 13:04]:
> The vma can be held read-locked for a substantial period of time, eg if
> memory allocation needs to go into reclaim.  It's useful to be able to
> send fatal signals to threads which are waiting for the write lock.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>

> ---
>  include/linux/mmap_lock.h        | 31 +++++++++++++++++++++++++++++--
>  mm/mmap_lock.c                   | 27 ++++++++++++++++++---------
>  tools/testing/vma/vma_internal.h |  8 ++++++++
>  3 files changed, 55 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> index 2c9fffa58714..b198d6443355 100644
> --- a/include/linux/mmap_lock.h
> +++ b/include/linux/mmap_lock.h
> @@ -195,7 +195,8 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l
>  	return (vma->vm_lock_seq == *mm_lock_seq);
>  }
>  
> -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);
> +int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
> +		int state);
>  
>  /*
>   * Begin writing to a VMA.
> @@ -209,7 +210,30 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>  	if (__is_vma_write_locked(vma, &mm_lock_seq))
>  		return;
>  
> -	__vma_start_write(vma, mm_lock_seq);
> +	__vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE);
> +}
> +
> +/**
> + * vma_start_write_killable - Begin writing to a VMA.
> + * @vma: The VMA we are going to modify.
> + *
> + * Exclude concurrent readers under the per-VMA lock until the currently
> + * write-locked mmap_lock is dropped or downgraded.
> + *
> + * Context: May sleep while waiting for readers to drop the vma read lock.
> + * Caller must already hold the mmap_lock for write.
> + *
> + * Return: 0 for a successful acquisition.  -EINTR if a fatal signal was
> + * received.
> + */
> +static inline
> +int __must_check vma_start_write_killable(struct vm_area_struct *vma)
> +{
> +	unsigned int mm_lock_seq;
> +
> +	if (__is_vma_write_locked(vma, &mm_lock_seq))
> +		return 0;
> +	return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE);
>  }
>  
>  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> @@ -286,6 +310,9 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
>  		{ return NULL; }
>  static inline void vma_end_read(struct vm_area_struct *vma) {}
>  static inline void vma_start_write(struct vm_area_struct *vma) {}
> +static inline
> +int __must_check vma_start_write_killable(struct vm_area_struct *vma)
> +{ return 0; }
>  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
>  		{ mmap_assert_write_locked(vma->vm_mm); }
>  static inline void vma_assert_attached(struct vm_area_struct *vma) {}
> diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
> index 0a0db5849b8e..dbaa6376a870 100644
> --- a/mm/mmap_lock.c
> +++ b/mm/mmap_lock.c
> @@ -45,8 +45,10 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released);
>  
>  #ifdef CONFIG_MMU
>  #ifdef CONFIG_PER_VMA_LOCK
> -static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
> +static inline int __vma_enter_locked(struct vm_area_struct *vma,
> +		bool detaching, int state)
>  {
> +	int err;
>  	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
>  
>  	/* Additional refcnt if the vma is attached. */
> @@ -58,15 +60,17 @@ static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching
>  	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
>  	 */
>  	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
> -		return false;
> +		return 0;
>  
>  	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> -	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
> +	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
>  		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
> -		   TASK_UNINTERRUPTIBLE);
> +		   state);
> +	if (err)
> +		return err;
>  	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
>  
> -	return true;
> +	return 1;
>  }
>  
>  static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
> @@ -75,16 +79,19 @@ static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
>  	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
>  }
>  
> -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
> +int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
> +		int state)
>  {
> -	bool locked;
> +	int locked;
>  
>  	/*
>  	 * __vma_enter_locked() returns false immediately if the vma is not
>  	 * attached, otherwise it waits until refcnt is indicating that vma
>  	 * is attached with no readers.
>  	 */
> -	locked = __vma_enter_locked(vma, false);
> +	locked = __vma_enter_locked(vma, false, state);
> +	if (locked < 0)
> +		return locked;
>  
>  	/*
>  	 * We should use WRITE_ONCE() here because we can have concurrent reads
> @@ -100,6 +107,8 @@ void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
>  		__vma_exit_locked(vma, &detached);
>  		WARN_ON_ONCE(detached); /* vma should remain attached */
>  	}
> +
> +	return 0;
>  }
>  EXPORT_SYMBOL_GPL(__vma_start_write);
>  
> @@ -118,7 +127,7 @@ void vma_mark_detached(struct vm_area_struct *vma)
>  	 */
>  	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
>  		/* Wait until vma is detached with no readers. */
> -		if (__vma_enter_locked(vma, true)) {
> +		if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
>  			bool detached;
>  
>  			__vma_exit_locked(vma, &detached);
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index dc976a285ad2..917062cfbc69 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -844,6 +844,14 @@ static inline void vma_start_write(struct vm_area_struct *vma)
>  	vma->vm_lock_seq++;
>  }
>  
> +static inline __must_check
> +int vma_start_write_killable(struct vm_area_struct *vma)
> +{
> +	/* Used to indicate to tests that a write operation has begun. */
> +	vma->vm_lock_seq++;
> +	return 0;
> +}
> +
>  static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
>  					 unsigned long start,
>  					 unsigned long end,
> -- 
> 2.47.2
> 



* Re: [PATCH 2/2] mm: Use vma_start_write_killable() in dup_mmap()
  2025-11-03 18:03 ` [PATCH 2/2] mm: Use vma_start_write_killable() in dup_mmap() Matthew Wilcox (Oracle)
  2025-11-03 21:56   ` Suren Baghdasaryan
@ 2025-11-07 19:12   ` Liam R. Howlett
  1 sibling, 0 replies; 11+ messages in thread
From: Liam R. Howlett @ 2025-11-07 19:12 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Andrew Morton, linux-mm, Suren Baghdasaryan, Lorenzo Stoakes,
	Vlastimil Babka, Shakeel Butt, Jann Horn, Pedro Falcato,
	Chris Li

* Matthew Wilcox (Oracle) <willy@infradead.org> [251103 13:04]:
> Allow waiting for the VMA write lock to be interrupted by fatal signals.
> We can remove the explicit check for fatal_signal_pending() as we will
> check it during vma_start_write_killable().  This is an improvement as
> we will not wait for the reader to finish before checking for signals.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>

> ---
>  mm/mmap.c | 12 +++---------
>  1 file changed, 3 insertions(+), 9 deletions(-)
> 
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 5fd3b80fda1d..03773fe777b7 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1750,7 +1750,9 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
>  	for_each_vma(vmi, mpnt) {
>  		struct file *file;
>  
> -		vma_start_write(mpnt);
> +		retval = vma_start_write_killable(mpnt);
> +		if (retval < 0)
> +			goto loop_out;
>  		if (mpnt->vm_flags & VM_DONTCOPY) {
>  			retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
>  						    mpnt->vm_end, GFP_KERNEL);
> @@ -1761,14 +1763,6 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
>  			continue;
>  		}
>  		charge = 0;
> -		/*
> -		 * Don't duplicate many vmas if we've been oom-killed (for
> -		 * example)
> -		 */
> -		if (fatal_signal_pending(current)) {
> -			retval = -EINTR;
> -			goto loop_out;
> -		}
>  		if (mpnt->vm_flags & VM_ACCOUNT) {
>  			unsigned long len = vma_pages(mpnt);
>  
> -- 
> 2.47.2
> 



end of thread

Thread overview: 11+ messages
2025-11-03 18:03 [PATCH 0/2] vma_start_write_killable Matthew Wilcox (Oracle)
2025-11-03 18:03 ` [PATCH 1/2] mm: Add vma_start_write_killable() Matthew Wilcox (Oracle)
2025-11-03 21:53   ` Suren Baghdasaryan
2025-11-03 23:14     ` Matthew Wilcox
2025-11-03 23:17       ` Suren Baghdasaryan
2025-11-04 16:09   ` Matthew Wilcox
2025-11-07 19:12   ` Liam R. Howlett
2025-11-03 18:03 ` [PATCH 2/2] mm: Use vma_start_write_killable() in dup_mmap() Matthew Wilcox (Oracle)
2025-11-03 21:56   ` Suren Baghdasaryan
2025-11-07 19:12   ` Liam R. Howlett
2025-11-04  9:08 ` [syzbot ci] Re: vma_start_write_killable syzbot ci
