linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: lizhe.67@bytedance.com
To: peterz@infradead.org, mingo@redhat.com, will@kernel.org,
	longman@redhat.com, boqun.feng@gmail.com,
	akpm@linux-foundation.org
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org, lizhe.67@bytedance.com
Subject: [RFC 1/2] rwsem: introduce upgrade_read interface
Date: Wed, 16 Oct 2024 12:35:59 +0800	[thread overview]
Message-ID: <20241016043600.35139-2-lizhe.67@bytedance.com> (raw)
In-Reply-To: <20241016043600.35139-1-lizhe.67@bytedance.com>

From: Li Zhe <lizhe.67@bytedance.com>

Introduce a new rwsem interface, upgrade_read(). A task that already
holds the read lock can call it to upgrade to the write lock. The
call waits for all other readers to release the lock before the write
lock is obtained. In addition, the upgrading task takes priority over
any task already waiting for the write lock and over tasks that
subsequently try to obtain the read lock.

Signed-off-by: Li Zhe <lizhe.67@bytedance.com>
---
 include/linux/rwsem.h  |  1 +
 kernel/locking/rwsem.c | 87 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index c8b543d428b0..90183ab5ea79 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -249,6 +249,7 @@ DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
  * downgrade write lock to read lock
  */
 extern void downgrade_write(struct rw_semaphore *sem);
+extern int upgrade_read(struct rw_semaphore *sem);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 /*
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2bbb6eca5144..0583e1be3dbf 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -37,6 +37,7 @@
  * meanings when set.
  *  - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
  *  - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
+ *  - Bit 2: RWSEM_UPGRADING    - a read-to-write upgrade is in progress
  *
  * When the rwsem is reader-owned and a spinning writer has timed out,
  * the nonspinnable bit will be set to disable optimistic spinning.
@@ -62,7 +63,8 @@
  */
 #define RWSEM_READER_OWNED	(1UL << 0)
 #define RWSEM_NONSPINNABLE	(1UL << 1)
-#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
+#define RWSEM_UPGRADING		(1UL << 2)
+#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE | RWSEM_UPGRADING)
 
 #ifdef CONFIG_DEBUG_RWSEMS
 # define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\
@@ -93,7 +95,8 @@
  * Bit  0    - writer locked bit
  * Bit  1    - waiters present bit
  * Bit  2    - lock handoff bit
- * Bits 3-7  - reserved
+ * Bit  3    - upgrade read bit
+ * Bits 4-7  - reserved
  * Bits 8-30 - 23-bit reader count
  * Bit  31   - read fail bit
  *
@@ -117,6 +120,7 @@
 #define RWSEM_WRITER_LOCKED	(1UL << 0)
 #define RWSEM_FLAG_WAITERS	(1UL << 1)
 #define RWSEM_FLAG_HANDOFF	(1UL << 2)
+#define RWSEM_FLAG_UPGRADE_READ	(1UL << 3)
 #define RWSEM_FLAG_READFAIL	(1UL << (BITS_PER_LONG - 1))
 
 #define RWSEM_READER_SHIFT	8
@@ -143,6 +147,13 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem)
 	atomic_long_set(&sem->owner, (long)current);
 }
 
+static inline void rwsem_set_owner_upgrade(struct rw_semaphore *sem)
+{
+	lockdep_assert_preemption_disabled();
+	atomic_long_set(&sem->owner, (long)current | RWSEM_UPGRADING |
+			RWSEM_READER_OWNED | RWSEM_NONSPINNABLE); /* owner = current + all 3 flags */
+}
+
 static inline void rwsem_clear_owner(struct rw_semaphore *sem)
 {
 	lockdep_assert_preemption_disabled();
@@ -201,7 +212,7 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
 	 */
 	long count = atomic_long_read(&sem->count);
 
-	if (count & RWSEM_WRITER_MASK)
+	if ((count & RWSEM_WRITER_MASK) && !(count & RWSEM_FLAG_UPGRADE_READ))
 		return false;
 	return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
 }
@@ -1336,6 +1347,8 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
 static inline void __up_read(struct rw_semaphore *sem)
 {
 	long tmp;
+	unsigned long flags;
+	struct task_struct *owner;
 
 	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
 	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
@@ -1349,6 +1362,9 @@ static inline void __up_read(struct rw_semaphore *sem)
 		clear_nonspinnable(sem);
 		rwsem_wake(sem);
 	}
+	owner = rwsem_owner_flags(sem, &flags);
+	if (unlikely(!(tmp & RWSEM_READER_MASK) && (flags & RWSEM_UPGRADING)))
+		wake_up_process(owner);
 	preempt_enable();
 }
 
@@ -1641,6 +1657,71 @@ void downgrade_write(struct rw_semaphore *sem)
 }
 EXPORT_SYMBOL(downgrade_write);
 
+static inline void rwsem_clear_upgrade_flag(struct rw_semaphore *sem)
+{
+	atomic_long_andnot(RWSEM_FLAG_UPGRADE_READ, &sem->count);	/* drop upgrade bit */
+}
+
+/*
+ * Upgrade a held read lock to a write lock; -EBUSY if writer/upgrade bit is set.
+ */
+static inline int __upgrade_read(struct rw_semaphore *sem)
+{
+	long tmp;
+
+	preempt_disable();
+
+	tmp = atomic_long_read(&sem->count);
+	do {	/* swap one reader bias for WRITER_LOCKED + UPGRADE_READ */
+		if (tmp & (RWSEM_WRITER_MASK | RWSEM_FLAG_UPGRADE_READ)) {
+			preempt_enable();
+			return -EBUSY;	/* read lock is still held by the caller */
+		}
+	} while (!atomic_long_try_cmpxchg(&sem->count, &tmp,
+		tmp + RWSEM_FLAG_UPGRADE_READ + RWSEM_WRITER_LOCKED - RWSEM_READER_BIAS));
+
+	if ((tmp & RWSEM_READER_MASK) == RWSEM_READER_BIAS) {
+		/* fast path: the old count held exactly one reader bias */
+		DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+		rwsem_clear_upgrade_flag(sem);
+		rwsem_set_owner(sem);
+		preempt_enable();
+		return 0;
+	}
+	/* slow path: sleep until the reader bits in count drop to zero */
+	raw_spin_lock_irq(&sem->wait_lock);
+	rwsem_set_owner_upgrade(sem);
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+
+	for (;;) {
+		if (!(atomic_long_read(&sem->count) & RWSEM_READER_MASK))
+			break;
+		raw_spin_unlock_irq(&sem->wait_lock);
+		schedule_preempt_disabled();	/* __up_read() wakes the RWSEM_UPGRADING owner */
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		raw_spin_lock_irq(&sem->wait_lock);
+	}
+
+	rwsem_clear_upgrade_flag(sem);
+	rwsem_set_owner(sem);
+	__set_current_state(TASK_RUNNING);
+	raw_spin_unlock_irq(&sem->wait_lock);
+	preempt_enable();
+	return 0;
+}
+
+/*
+ * upgrade_read - upgrade a held read lock to a write lock
+ *
+ * Return: 0 on success, -EBUSY if a writer bit or another upgrade is already set
+ */
+int upgrade_read(struct rw_semaphore *sem)
+{
+	return __upgrade_read(sem);
+}
+EXPORT_SYMBOL(upgrade_read);
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 void down_read_nested(struct rw_semaphore *sem, int subclass)
-- 
2.20.1



  reply	other threads:[~2024-10-16  4:36 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-10-16  4:35 [RFC 0/2] " lizhe.67
2024-10-16  4:35 ` lizhe.67 [this message]
2024-10-16  4:56   ` [RFC 1/2] " Christoph Hellwig
2024-10-16  7:33     ` lizhe.67
2024-10-16  7:36       ` Christoph Hellwig
2024-10-16  8:00         ` lizhe.67
2024-10-16  8:03           ` Christoph Hellwig
2024-10-16  8:13             ` lizhe.67
2024-10-16 11:51     ` Matthew Wilcox
2024-10-16 12:21       ` Christoph Hellwig
2024-10-16 11:49   ` Matthew Wilcox
2024-10-17  6:23     ` lizhe.67
2024-10-16 14:23   ` Waiman Long
2024-10-16 18:05     ` Matthew Wilcox
2024-10-16 18:39       ` Waiman Long
2024-10-17  6:46     ` lizhe.67
2024-10-17 15:05     ` Christoph Hellwig
2024-10-17 17:36       ` Waiman Long
2024-10-18  5:06         ` Christoph Hellwig
2024-10-16  4:36 ` [RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page lizhe.67
2024-10-16 11:53   ` Matthew Wilcox
2024-10-17  6:18     ` lizhe.67
2024-10-17 13:20       ` Matthew Wilcox
2024-10-18  6:37         ` lizhe.67
2024-10-23  7:27   ` kernel test robot
2024-10-16  8:09 ` [RFC 0/2] rwsem: introduce upgrade_read interface Peter Zijlstra
2024-10-16  8:53   ` lizhe.67
2024-10-16 12:10     ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241016043600.35139-2-lizhe.67@bytedance.com \
    --to=lizhe.67@bytedance.com \
    --cc=akpm@linux-foundation.org \
    --cc=boqun.feng@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=longman@redhat.com \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox