From: Andrii Nakryiko <andrii@kernel.org>
To: linux-trace-kernel@vger.kernel.org, linux-mm@kvack.org,
akpm@linux-foundation.org, peterz@infradead.org
Cc: oleg@redhat.com, rostedt@goodmis.org, mhiramat@kernel.org,
bpf@vger.kernel.org, linux-kernel@vger.kernel.org,
jolsa@kernel.org, paulmck@kernel.org, willy@infradead.org,
surenb@google.com, mjguzik@gmail.com, brauner@kernel.org,
jannh@google.com, mhocko@kernel.org, vbabka@suse.cz,
shakeel.butt@linux.dev, hannes@cmpxchg.org,
Liam.Howlett@oracle.com, lorenzo.stoakes@oracle.com,
david@redhat.com, arnd@arndb.de, richard.weiyang@gmail.com,
zhangpeng.00@bytedance.com, linmiaohe@huawei.com,
viro@zeniv.linux.org.uk, hca@linux.ibm.com,
Andrii Nakryiko <andrii@kernel.org>
Subject: [PATCH v4 tip/perf/core 1/4] mm: Convert mm_lock_seq to a proper seqcount
Date: Sun, 27 Oct 2024 18:08:15 -0700 [thread overview]
Message-ID: <20241028010818.2487581-2-andrii@kernel.org> (raw)
In-Reply-To: <20241028010818.2487581-1-andrii@kernel.org>
From: Suren Baghdasaryan <surenb@google.com>
Convert mm_lock_seq to be seqcount_t and change all mmap_write_lock
variants to increment it, in line with the usual seqcount usage pattern.
This lets us check whether the mmap_lock is write-locked by checking
the mm_lock_seq.sequence counter (odd=locked, even=unlocked). This will be
used when implementing mmap_lock speculation functions.
As a result, vm_lock_seq is also changed to be unsigned to match the type
of mm_lock_seq.sequence.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
include/linux/mm.h | 12 +++----
include/linux/mm_types.h | 7 ++--
include/linux/mmap_lock.h | 58 +++++++++++++++++++++-----------
kernel/fork.c | 5 +--
mm/init-mm.c | 2 +-
tools/testing/vma/vma.c | 4 +--
tools/testing/vma/vma_internal.h | 4 +--
7 files changed, 56 insertions(+), 36 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecf63d2b0582..94b537088142 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -698,7 +698,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* we don't rely on for anything - the mm_lock_seq read against which we
* need ordering is below.
*/
- if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
return false;
if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
@@ -715,7 +715,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* after it has been unlocked.
* This pairs with RELEASE semantics in vma_end_write_all().
*/
- if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
+ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
up_read(&vma->vm_lock->lock);
return false;
}
@@ -730,7 +730,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
}
/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
+static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
{
mmap_assert_write_locked(vma->vm_mm);
@@ -738,7 +738,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
* current task is holding mmap_write_lock, both vma->vm_lock_seq and
* mm->mm_lock_seq can't be concurrently modified.
*/
- *mm_lock_seq = vma->vm_mm->mm_lock_seq;
+ *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
return (vma->vm_lock_seq == *mm_lock_seq);
}
@@ -749,7 +749,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
*/
static inline void vma_start_write(struct vm_area_struct *vma)
{
- int mm_lock_seq;
+ unsigned int mm_lock_seq;
if (__is_vma_write_locked(vma, &mm_lock_seq))
return;
@@ -767,7 +767,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
- int mm_lock_seq;
+ unsigned int mm_lock_seq;
VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6e3bdf8e38bc..76e0cdc0462b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -715,7 +715,7 @@ struct vm_area_struct {
* counter reuse can only lead to occasional unnecessary use of the
* slowpath.
*/
- int vm_lock_seq;
+ unsigned int vm_lock_seq;
/* Unstable RCU readers are allowed to read this. */
struct vma_lock *vm_lock;
#endif
@@ -887,6 +887,9 @@ struct mm_struct {
* Roughly speaking, incrementing the sequence number is
* equivalent to releasing locks on VMAs; reading the sequence
* number can be part of taking a read lock on a VMA.
+ * Incremented every time mmap_lock is write-locked/unlocked.
+ * Initialized to 0, therefore odd values indicate mmap_lock
+ * is write-locked and even values that it's released.
*
* Can be modified under write mmap_lock using RELEASE
* semantics.
@@ -895,7 +898,7 @@ struct mm_struct {
* Can be read with ACQUIRE semantics if not holding write
* mmap_lock.
*/
- int mm_lock_seq;
+ seqcount_t mm_lock_seq;
#endif
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index de9dc20b01ba..6b3272686860 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -71,39 +71,38 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm)
}
#ifdef CONFIG_PER_VMA_LOCK
-/*
- * Drop all currently-held per-VMA locks.
- * This is called from the mmap_lock implementation directly before releasing
- * a write-locked mmap_lock (or downgrading it to read-locked).
- * This should normally NOT be called manually from other places.
- * If you want to call this manually anyway, keep in mind that this will release
- * *all* VMA write locks, including ones from further up the stack.
- */
-static inline void vma_end_write_all(struct mm_struct *mm)
+static inline void mm_lock_seqcount_init(struct mm_struct *mm)
{
- mmap_assert_write_locked(mm);
- /*
- * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
- * mmap_lock being held.
- * We need RELEASE semantics here to ensure that preceding stores into
- * the VMA take effect before we unlock it with this store.
- * Pairs with ACQUIRE semantics in vma_start_read().
- */
- smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
+ seqcount_init(&mm->mm_lock_seq);
+}
+
+static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
+{
+ do_raw_write_seqcount_begin(&mm->mm_lock_seq);
+}
+
+static inline void mm_lock_seqcount_end(struct mm_struct *mm)
+{
+ do_raw_write_seqcount_end(&mm->mm_lock_seq);
}
+
#else
-static inline void vma_end_write_all(struct mm_struct *mm) {}
+static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
+static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
+static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}
#endif
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
+ mm_lock_seqcount_init(mm);
}
static inline void mmap_write_lock(struct mm_struct *mm)
{
__mmap_lock_trace_start_locking(mm, true);
down_write(&mm->mmap_lock);
+ mm_lock_seqcount_begin(mm);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
@@ -111,6 +110,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
__mmap_lock_trace_start_locking(mm, true);
down_write_nested(&mm->mmap_lock, subclass);
+ mm_lock_seqcount_begin(mm);
__mmap_lock_trace_acquire_returned(mm, true, true);
}
@@ -120,10 +120,30 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm)
__mmap_lock_trace_start_locking(mm, true);
ret = down_write_killable(&mm->mmap_lock);
+ if (!ret)
+ mm_lock_seqcount_begin(mm);
__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
return ret;
}
+/*
+ * Drop all currently-held per-VMA locks.
+ * This is called from the mmap_lock implementation directly before releasing
+ * a write-locked mmap_lock (or downgrading it to read-locked).
+ * This should normally NOT be called manually from other places.
+ * If you want to call this manually anyway, keep in mind that this will release
+ * *all* VMA write locks, including ones from further up the stack.
+ */
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+ mmap_assert_write_locked(mm);
+ /*
+ * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
+ * mmap_lock being held.
+ */
+ mm_lock_seqcount_end(mm);
+}
+
static inline void mmap_write_unlock(struct mm_struct *mm)
{
__mmap_lock_trace_released(mm, true);
diff --git a/kernel/fork.c b/kernel/fork.c
index 89ceb4a68af2..55c4088543dc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -448,7 +448,7 @@ static bool vma_lock_alloc(struct vm_area_struct *vma)
return false;
init_rwsem(&vma->vm_lock->lock);
- vma->vm_lock_seq = -1;
+ vma->vm_lock_seq = UINT_MAX;
return true;
}
@@ -1261,9 +1261,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
-#ifdef CONFIG_PER_VMA_LOCK
- mm->mm_lock_seq = 0;
-#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 24c809379274..6af3ad675930 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -40,7 +40,7 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
#ifdef CONFIG_PER_VMA_LOCK
- .mm_lock_seq = 0,
+ .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
#endif
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
index c53f220eb6cc..bcdf831dfe3e 100644
--- a/tools/testing/vma/vma.c
+++ b/tools/testing/vma/vma.c
@@ -87,7 +87,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
* begun. Linking to the tree will have caused this to be incremented,
* which means we will get a false positive otherwise.
*/
- vma->vm_lock_seq = -1;
+ vma->vm_lock_seq = UINT_MAX;
return vma;
}
@@ -212,7 +212,7 @@ static bool vma_write_started(struct vm_area_struct *vma)
int seq = vma->vm_lock_seq;
/* We reset after each check. */
- vma->vm_lock_seq = -1;
+ vma->vm_lock_seq = UINT_MAX;
/* The vma_start_write() stub simply increments this value. */
return seq > -1;
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index c5b9da034511..4007ec580f85 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -231,7 +231,7 @@ struct vm_area_struct {
* counter reuse can only lead to occasional unnecessary use of the
* slowpath.
*/
- int vm_lock_seq;
+ unsigned int vm_lock_seq;
struct vma_lock *vm_lock;
#endif
@@ -406,7 +406,7 @@ static inline bool vma_lock_alloc(struct vm_area_struct *vma)
return false;
init_rwsem(&vma->vm_lock->lock);
- vma->vm_lock_seq = -1;
+ vma->vm_lock_seq = UINT_MAX;
return true;
}
--
2.43.5
next prev parent reply other threads:[~2024-10-28 1:09 UTC|newest]
Thread overview: 27+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-28 1:08 [PATCH v4 tip/perf/core 0/4] uprobes,mm: speculative lockless VMA-to-uprobe lookup Andrii Nakryiko
2024-10-28 1:08 ` Andrii Nakryiko [this message]
2024-10-29 11:52 ` [PATCH v4 tip/perf/core 1/4] mm: Convert mm_lock_seq to a proper seqcount Vlastimil Babka
2024-11-21 12:40 ` Peter Zijlstra
2024-11-21 15:35 ` Suren Baghdasaryan
2024-10-28 1:08 ` [PATCH v4 tip/perf/core 2/4] mm: Introduce mmap_lock_speculation_{begin|end} Andrii Nakryiko
2024-10-29 16:48 ` Vlastimil Babka
2024-11-21 14:44 ` Peter Zijlstra
2024-11-21 15:22 ` Peter Zijlstra
2024-11-21 15:36 ` Suren Baghdasaryan
2024-11-21 16:32 ` Suren Baghdasaryan
2024-10-28 1:08 ` [PATCH v4 tip/perf/core 3/4] uprobes: simplify find_active_uprobe_rcu() VMA checks Andrii Nakryiko
2024-10-28 1:51 ` Masami Hiramatsu
2024-10-28 1:08 ` [PATCH v4 tip/perf/core 4/4] uprobes: add speculative lockless VMA-to-inode-to-uprobe resolution Andrii Nakryiko
2024-11-12 0:28 ` Masami Hiramatsu
2024-11-12 1:04 ` Suren Baghdasaryan
2024-11-12 18:09 ` Andrii Nakryiko
2024-11-12 23:53 ` Masami Hiramatsu
2024-11-06 2:01 ` [PATCH v4 tip/perf/core 0/4] uprobes,mm: speculative lockless VMA-to-uprobe lookup Andrii Nakryiko
2024-11-11 17:26 ` Andrii Nakryiko
2024-11-20 15:40 ` Andrii Nakryiko
2024-11-20 15:43 ` Peter Zijlstra
2024-11-20 16:03 ` Ingo Molnar
2024-11-20 17:23 ` Andrii Nakryiko
2024-11-21 9:33 ` Ingo Molnar
2024-11-21 14:43 ` Andrii Nakryiko
2024-11-20 17:23 ` Andrii Nakryiko
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241028010818.2487581-2-andrii@kernel.org \
--to=andrii@kernel.org \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=arnd@arndb.de \
--cc=bpf@vger.kernel.org \
--cc=brauner@kernel.org \
--cc=david@redhat.com \
--cc=hannes@cmpxchg.org \
--cc=hca@linux.ibm.com \
--cc=jannh@google.com \
--cc=jolsa@kernel.org \
--cc=linmiaohe@huawei.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-trace-kernel@vger.kernel.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=mhiramat@kernel.org \
--cc=mhocko@kernel.org \
--cc=mjguzik@gmail.com \
--cc=oleg@redhat.com \
--cc=paulmck@kernel.org \
--cc=peterz@infradead.org \
--cc=richard.weiyang@gmail.com \
--cc=rostedt@goodmis.org \
--cc=shakeel.butt@linux.dev \
--cc=surenb@google.com \
--cc=vbabka@suse.cz \
--cc=viro@zeniv.linux.org.uk \
--cc=willy@infradead.org \
--cc=zhangpeng.00@bytedance.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox