From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: willy@infradead.org, liam.howlett@oracle.com,
lorenzo.stoakes@oracle.com, mhocko@suse.com, vbabka@suse.cz,
hannes@cmpxchg.org, mjguzik@gmail.com, oliver.sang@intel.com,
mgorman@techsingularity.net, david@redhat.com,
peterx@redhat.com, oleg@redhat.com, dave@stgolabs.net,
paulmck@kernel.org, brauner@kernel.org, dhowells@redhat.com,
hdanton@sina.com, hughd@google.com, minchan@google.com,
jannh@google.com, shakeel.butt@linux.dev,
souravpanda@google.com, pasha.tatashin@soleen.com,
linux-mm@kvack.org, linux-kernel@vger.kernel.org,
kernel-team@android.com, surenb@google.com
Subject: [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock
Date: Mon, 11 Nov 2024 12:55:05 -0800 [thread overview]
Message-ID: <20241111205506.3404479-4-surenb@google.com> (raw)
In-Reply-To: <20241111205506.3404479-1-surenb@google.com>
rw_semaphore is a sizable structure of 40 bytes and consumes
considerable space for each vm_area_struct. However vma_lock has
two important specifics which can be used to replace rw_semaphore
with a simpler structure:
1. Readers never wait. They try to take the vma_lock and fall back to
mmap_lock if that fails.
2. Only one writer at a time will ever try to write-lock a vma_lock
because writers first take mmap_lock in write mode.
Because of these requirements, full rw_semaphore functionality is not
needed and we can replace rw_semaphore with an atomic variable.
When a reader takes read lock, it increments the atomic, unless the
top two bits are set indicating a writer is present.
When writer takes write lock, it sets VMA_LOCK_WR_LOCKED bit if there
are no readers or VMA_LOCK_WR_WAIT bit if readers are holding the lock
and puts itself onto newly introduced mm.vma_writer_wait. Since all
writers take mmap_lock in write mode first, there can be only one writer
at a time. The last reader to release the lock will signal the writer
to wake up.
atomic_t might overflow if there are many competing readers, therefore
vma_start_read() implements an overflow check and if that occurs it
exits with a failure to lock. vma_start_read_locked{_nested} may cause
an overflow but it is later handled by __vma_end_read().
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/mm.h | 142 ++++++++++++++++++++++++++++++++++----
include/linux/mm_types.h | 18 ++++-
include/linux/mmap_lock.h | 3 +
kernel/fork.c | 2 +-
mm/init-mm.c | 2 +
5 files changed, 151 insertions(+), 16 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c1c2899464db..27c0e9ba81c4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -686,7 +686,41 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
#ifdef CONFIG_PER_VMA_LOCK
static inline void vma_lock_init(struct vma_lock *vm_lock)
{
- init_rwsem(&vm_lock->lock);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ static struct lock_class_key lockdep_key;
+
+ lockdep_init_map(&vm_lock->dep_map, "vm_lock", &lockdep_key, 0);
+#endif
+ atomic_set(&vm_lock->count, VMA_LOCK_UNLOCKED);
+}
+
+static inline unsigned int vma_lock_reader_count(unsigned int counter)
+{
+ return counter & VMA_LOCK_RD_COUNT_MASK;
+}
+
+static inline void __vma_end_read(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ unsigned int count, prev, new;
+
+ count = (unsigned int)atomic_read(&vma->vm_lock.count);
+ for (;;) {
+ if (unlikely(vma_lock_reader_count(count) == 0)) {
+ /*
+ * Overflow was possible in vma_start_read_locked().
+ * When detected, wrap around preserving writer bits.
+ */
+ new = count | ~(VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT);
+ } else
+ new = count - 1;
+ prev = atomic_cmpxchg(&vma->vm_lock.count, count, new);
+ if (prev == count)
+ break;
+ count = prev;
+ }
+ rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
+ if (vma_lock_reader_count(new) == 0 && (new & VMA_LOCK_WR_WAIT))
+ wake_up(&mm->vma_writer_wait);
}
/*
@@ -696,6 +730,9 @@ static inline void vma_lock_init(struct vma_lock *vm_lock)
*/
static inline bool vma_start_read(struct vm_area_struct *vma)
{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned int count, prev, new;
+
/*
* Check before locking. A race might cause false locked result.
* We can use READ_ONCE() for the mm_lock_seq here, and don't need
@@ -703,11 +740,35 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* we don't rely on for anything - the mm_lock_seq read against which we
* need ordering is below.
*/
- if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
return false;
- if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0))
- return false;
+ rwsem_acquire_read(&vma->vm_lock.dep_map, 0, 0, _RET_IP_);
+ count = (unsigned int)atomic_read(&vma->vm_lock.count);
+ for (;;) {
+ /* Is VMA is write-locked or writer is waiting? */
+ if (count & (VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT)) {
+ rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
+ return false;
+ }
+
+ new = count + 1;
+ /* If atomic_t overflows, fail to lock. */
+ if (new & (VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT)) {
+ rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
+ return false;
+ }
+
+ /*
+ * Atomic RMW will provide implicit mb on success to pair with smp_wmb in
+ * vma_start_write, on failure we retry.
+ */
+ prev = atomic_cmpxchg(&vma->vm_lock.count, count, new);
+ if (prev == count)
+ break;
+ count = prev;
+ }
+ lock_acquired(&vma->vm_lock.dep_map, _RET_IP_);
/*
* Overflow might produce false locked result.
@@ -720,8 +781,8 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* after it has been unlocked.
* This pairs with RELEASE semantics in vma_end_write_all().
*/
- if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
- up_read(&vma->vm_lock.lock);
+ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
+ __vma_end_read(mm, vma);
return false;
}
return true;
@@ -733,8 +794,30 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
*/
static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
{
- mmap_assert_locked(vma->vm_mm);
- down_read_nested(&vma->vm_lock.lock, subclass);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned int count, prev, new;
+
+ mmap_assert_locked(mm);
+
+ rwsem_acquire_read(&vma->vm_lock.dep_map, subclass, 0, _RET_IP_);
+ count = (unsigned int)atomic_read(&vma->vm_lock.count);
+ for (;;) {
+ /* We are holding mmap_lock, no active or waiting writers are possible. */
+ VM_BUG_ON_VMA(count & (VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT), vma);
+ new = count + 1;
+ /* Unlikely but if atomic_t overflows, wrap around to. */
+ if (WARN_ON(new & (VMA_LOCK_WR_LOCKED | VMA_LOCK_WR_WAIT)))
+ new = 0;
+ /*
+ * Atomic RMW will provide implicit mb on success to pair with smp_wmb in
+ * vma_start_write, on failure we retry.
+ */
+ prev = atomic_cmpxchg(&vma->vm_lock.count, count, new);
+ if (prev == count)
+ break;
+ count = prev;
+ }
+ lock_acquired(&vma->vm_lock.dep_map, _RET_IP_);
}
/*
@@ -743,14 +826,15 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int
*/
static inline void vma_start_read_locked(struct vm_area_struct *vma)
{
- mmap_assert_locked(vma->vm_mm);
- down_read(&vma->vm_lock.lock);
+ vma_start_read_locked_nested(vma, 0);
}
static inline void vma_end_read(struct vm_area_struct *vma)
{
+ struct mm_struct *mm = vma->vm_mm;
+
rcu_read_lock(); /* keeps vma alive till the end of up_read */
- up_read(&vma->vm_lock.lock);
+ __vma_end_read(mm, vma);
rcu_read_unlock();
}
@@ -774,12 +858,34 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l
*/
static inline void vma_start_write(struct vm_area_struct *vma)
{
+ unsigned int count, prev, new;
unsigned int mm_lock_seq;
+ might_sleep();
if (__is_vma_write_locked(vma, &mm_lock_seq))
return;
- down_write(&vma->vm_lock.lock);
+ rwsem_acquire(&vma->vm_lock.dep_map, 0, 0, _RET_IP_);
+ count = (unsigned int)atomic_read(&vma->vm_lock.count);
+ for (;;) {
+ if (vma_lock_reader_count(count) > 0)
+ new = count | VMA_LOCK_WR_WAIT;
+ else
+ new = count | VMA_LOCK_WR_LOCKED;
+ prev = atomic_cmpxchg(&vma->vm_lock.count, count, new);
+ if (prev == count)
+ break;
+ count = prev;
+ }
+ if (new & VMA_LOCK_WR_WAIT) {
+ lock_contended(&vma->vm_lock.dep_map, _RET_IP_);
+ wait_event(vma->vm_mm->vma_writer_wait,
+ atomic_cmpxchg(&vma->vm_lock.count, VMA_LOCK_WR_WAIT,
+ VMA_LOCK_WR_LOCKED) == VMA_LOCK_WR_WAIT);
+
+ }
+ lock_acquired(&vma->vm_lock.dep_map, _RET_IP_);
+
/*
* We should use WRITE_ONCE() here because we can have concurrent reads
* from the early lockless pessimistic check in vma_start_read().
@@ -787,7 +893,10 @@ static inline void vma_start_write(struct vm_area_struct *vma)
* we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
*/
WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
- up_write(&vma->vm_lock.lock);
+ /* Write barrier to ensure vm_lock_seq change is visible before count */
+ smp_wmb();
+ rwsem_release(&vma->vm_lock.dep_map, _RET_IP_);
+ atomic_set(&vma->vm_lock.count, VMA_LOCK_UNLOCKED);
}
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -797,9 +906,14 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}
+static inline bool is_vma_read_locked(struct vm_area_struct *vma)
+{
+ return vma_lock_reader_count((unsigned int)atomic_read(&vma->vm_lock.count)) > 0;
+}
+
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
- if (!rwsem_is_locked(&vma->vm_lock.lock))
+ if (!is_vma_read_locked(vma))
vma_assert_write_locked(vma);
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5c4bfdcfac72..789bccc05520 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -615,8 +615,23 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
}
#endif
+#define VMA_LOCK_UNLOCKED 0
+#define VMA_LOCK_WR_LOCKED (1 << 31)
+#define VMA_LOCK_WR_WAIT (1 << 30)
+
+#define VMA_LOCK_RD_COUNT_MASK (VMA_LOCK_WR_WAIT - 1)
+
struct vma_lock {
- struct rw_semaphore lock;
+ /*
+ * count & VMA_LOCK_RD_COUNT_MASK > 0 ==> read-locked with 'count' number of readers
+ * count & VMA_LOCK_WR_LOCKED != 0 ==> write-locked
+ * count & VMA_LOCK_WR_WAIT != 0 ==> writer is waiting
+ * count = 0 ==> unlocked
+ */
+ atomic_t count;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
};
struct vma_numab_state {
@@ -883,6 +898,7 @@ struct mm_struct {
* by mmlist_lock
*/
#ifdef CONFIG_PER_VMA_LOCK
+ struct wait_queue_head vma_writer_wait;
/*
* This field has lock-like semantics, meaning it is sometimes
* accessed with ACQUIRE/RELEASE semantics.
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 58dde2e35f7e..769ab97fff3e 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -121,6 +121,9 @@ static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
mm_lock_seqcount_init(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+ init_waitqueue_head(&mm->vma_writer_wait);
+#endif
}
static inline void mmap_write_lock(struct mm_struct *mm)
diff --git a/kernel/fork.c b/kernel/fork.c
index 9e504105f24f..726050c557e2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -486,7 +486,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
vm_rcu);
/* The vma should not be locked while being destroyed. */
- VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma);
+ VM_BUG_ON_VMA(is_vma_read_locked(vma), vma);
__vm_area_free(vma);
}
#endif
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 6af3ad675930..db058873ba18 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -40,6 +40,8 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
#ifdef CONFIG_PER_VMA_LOCK
+ .vma_writer_wait =
+ __WAIT_QUEUE_HEAD_INITIALIZER(init_mm.vma_writer_wait),
.mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
#endif
.user_ns = &init_user_ns,
--
2.47.0.277.g8800431eea-goog
next prev parent reply other threads:[~2024-11-11 20:55 UTC|newest]
Thread overview: 49+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-11-11 20:55 [PATCH 0/4] move per-vma lock into vm_area_struct Suren Baghdasaryan
2024-11-11 20:55 ` [PATCH 1/4] mm: introduce vma_start_read_locked{_nested} helpers Suren Baghdasaryan
2024-11-11 20:55 ` [PATCH 2/4] mm: move per-vma lock into vm_area_struct Suren Baghdasaryan
2024-11-12 9:36 ` kernel test robot
2024-11-12 9:47 ` kernel test robot
2024-11-12 10:18 ` kernel test robot
2024-11-12 15:57 ` Vlastimil Babka
2024-11-12 16:08 ` Suren Baghdasaryan
2024-11-12 16:58 ` Suren Baghdasaryan
2024-11-11 20:55 ` Suren Baghdasaryan [this message]
2024-11-12 0:06 ` [PATCH 3/4] mm: replace rw_semaphore with atomic_t in vma_lock Andrew Morton
2024-11-12 0:52 ` Suren Baghdasaryan
2024-11-12 0:35 ` Davidlohr Bueso
2024-11-12 0:59 ` Suren Baghdasaryan
2024-11-12 4:58 ` Matthew Wilcox
2024-11-12 15:18 ` Suren Baghdasaryan
2024-12-10 22:38 ` Peter Zijlstra
2024-12-10 23:37 ` Suren Baghdasaryan
2024-12-11 8:25 ` Peter Zijlstra
2024-12-11 15:20 ` Suren Baghdasaryan
2024-12-12 3:01 ` Suren Baghdasaryan
2024-12-12 9:16 ` Peter Zijlstra
2024-12-12 14:17 ` Suren Baghdasaryan
2024-12-12 14:19 ` Suren Baghdasaryan
2024-12-13 4:48 ` Suren Baghdasaryan
2024-12-13 9:57 ` Peter Zijlstra
2024-12-13 17:45 ` Suren Baghdasaryan
2024-12-13 18:19 ` Matthew Wilcox
2024-12-13 18:35 ` Peter Zijlstra
2024-12-13 18:37 ` Suren Baghdasaryan
2024-12-16 19:32 ` Suren Baghdasaryan
2024-12-13 9:22 ` Peter Zijlstra
2024-12-13 17:41 ` Suren Baghdasaryan
2024-11-11 20:55 ` [PATCH 4/4] mm: move lesser used vma_area_struct members into the last cacheline Suren Baghdasaryan
2024-11-12 10:07 ` Vlastimil Babka
2024-11-12 15:45 ` Suren Baghdasaryan
2024-11-11 21:41 ` [PATCH 0/4] move per-vma lock into vm_area_struct Suren Baghdasaryan
2024-11-12 2:48 ` Liam R. Howlett
2024-11-12 3:27 ` Suren Baghdasaryan
2024-11-11 22:18 ` Davidlohr Bueso
2024-11-11 23:19 ` Suren Baghdasaryan
2024-11-12 0:03 ` Davidlohr Bueso
2024-11-12 0:43 ` Suren Baghdasaryan
2024-11-12 0:11 ` Andrew Morton
2024-11-12 0:48 ` Suren Baghdasaryan
2024-11-12 2:35 ` Liam R. Howlett
2024-11-12 3:23 ` Suren Baghdasaryan
2024-11-12 9:39 ` Lorenzo Stoakes
2024-11-12 15:38 ` Suren Baghdasaryan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241111205506.3404479-4-surenb@google.com \
--to=surenb@google.com \
--cc=akpm@linux-foundation.org \
--cc=brauner@kernel.org \
--cc=dave@stgolabs.net \
--cc=david@redhat.com \
--cc=dhowells@redhat.com \
--cc=hannes@cmpxchg.org \
--cc=hdanton@sina.com \
--cc=hughd@google.com \
--cc=jannh@google.com \
--cc=kernel-team@android.com \
--cc=liam.howlett@oracle.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=mgorman@techsingularity.net \
--cc=mhocko@suse.com \
--cc=minchan@google.com \
--cc=mjguzik@gmail.com \
--cc=oleg@redhat.com \
--cc=oliver.sang@intel.com \
--cc=pasha.tatashin@soleen.com \
--cc=paulmck@kernel.org \
--cc=peterx@redhat.com \
--cc=shakeel.butt@linux.dev \
--cc=souravpanda@google.com \
--cc=vbabka@suse.cz \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox