* [RFC 0/2] rwsem: introduce upgrade_read interface
@ 2024-10-16 4:35 lizhe.67
2024-10-16 4:35 ` [RFC 1/2] " lizhe.67
` (2 more replies)
0 siblings, 3 replies; 28+ messages in thread
From: lizhe.67 @ 2024-10-16 4:35 UTC (permalink / raw)
To: peterz, mingo, will, longman, boqun.feng, akpm
Cc: linux-kernel, linux-mm, lizhe.67
From: Li Zhe <lizhe.67@bytedance.com>
In the current kernel rwsem implementation, there is an interface to
downgrade a write lock to a read lock, but there is no interface to
upgrade a read lock to a write lock. This means that, in order to
acquire the write lock while holding the read lock, we have to release
the read lock first and then acquire the write lock, which complicates
concurrent programming. This patch set provides the 'upgrade_read'
interface to solve this problem: it converts a held read lock into a
write lock.
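For illustration, here is a minimal caller-side sketch (hypothetical
caller code, following the semantics of the patches below, where a
failed upgrade leaves the read lock held):

        down_read(&sem);
        /* ... inspect state under the read lock ... */

        if (upgrade_read(&sem)) {
                /*
                 * Upgrade not possible (a writer or another upgrader got
                 * in first): fall back to the old pattern and revalidate,
                 * since other tasks may have taken the lock in between.
                 */
                up_read(&sem);
                down_write(&sem);
                /* ... revalidate state ... */
        }
        /* write lock held either way */
        up_write(&sem);

Without upgrade_read(), the up_read()/down_write() fallback (and the
revalidation it requires) is the only option.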
Li Zhe (2):
rwsem: introduce upgrade_read interface
khugepaged: use upgrade_read() to optimize collapse_huge_page
include/linux/rwsem.h | 1 +
kernel/locking/rwsem.c | 87 ++++++++++++++++++++++++++++++++++++++++--
mm/khugepaged.c | 36 ++++++++---------
3 files changed, 104 insertions(+), 20 deletions(-)
--
2.20.1
^ permalink raw reply [flat|nested] 28+ messages in thread
* [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 4:35 [RFC 0/2] rwsem: introduce upgrade_read interface lizhe.67
@ 2024-10-16 4:35 ` lizhe.67
2024-10-16 4:56 ` Christoph Hellwig
` (2 more replies)
2024-10-16 4:36 ` [RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page lizhe.67
2024-10-16 8:09 ` [RFC 0/2] rwsem: introduce upgrade_read interface Peter Zijlstra
2 siblings, 3 replies; 28+ messages in thread
From: lizhe.67 @ 2024-10-16 4:35 UTC (permalink / raw)
To: peterz, mingo, will, longman, boqun.feng, akpm
Cc: linux-kernel, linux-mm, lizhe.67
From: Li Zhe <lizhe.67@bytedance.com>
Introduce a new rwsem interface, upgrade_read(), which can be called to
upgrade a held read lock into a write lock. The interface waits for all
other readers to exit before taking the write lock. In addition, it
takes priority over any process already waiting for the write lock and
over any subsequent threads that want to obtain the read lock.
Signed-off-by: Li Zhe <lizhe.67@bytedance.com>
---
include/linux/rwsem.h | 1 +
kernel/locking/rwsem.c | 87 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 85 insertions(+), 3 deletions(-)
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index c8b543d428b0..90183ab5ea79 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -249,6 +249,7 @@ DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
* downgrade write lock to read lock
*/
extern void downgrade_write(struct rw_semaphore *sem);
+extern int upgrade_read(struct rw_semaphore *sem);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2bbb6eca5144..0583e1be3dbf 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -37,6 +37,7 @@
* meanings when set.
* - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
* - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
+ * - Bit 2: RWSEM_UPGRADING - doing upgrade read process
*
* When the rwsem is reader-owned and a spinning writer has timed out,
* the nonspinnable bit will be set to disable optimistic spinning.
@@ -62,7 +63,8 @@
*/
#define RWSEM_READER_OWNED (1UL << 0)
#define RWSEM_NONSPINNABLE (1UL << 1)
-#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
+#define RWSEM_UPGRADING (1UL << 2)
+#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE | RWSEM_UPGRADING)
#ifdef CONFIG_DEBUG_RWSEMS
# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
@@ -93,7 +95,8 @@
* Bit 0 - writer locked bit
* Bit 1 - waiters present bit
* Bit 2 - lock handoff bit
- * Bits 3-7 - reserved
+ * Bit 3 - upgrade read bit
+ * Bits 4-7 - reserved
* Bits 8-30 - 23-bit reader count
* Bit 31 - read fail bit
*
@@ -117,6 +120,7 @@
#define RWSEM_WRITER_LOCKED (1UL << 0)
#define RWSEM_FLAG_WAITERS (1UL << 1)
#define RWSEM_FLAG_HANDOFF (1UL << 2)
+#define RWSEM_FLAG_UPGRADE_READ (1UL << 3)
#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))
#define RWSEM_READER_SHIFT 8
@@ -143,6 +147,13 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem)
atomic_long_set(&sem->owner, (long)current);
}
+static inline void rwsem_set_owner_upgrade(struct rw_semaphore *sem)
+{
+ lockdep_assert_preemption_disabled();
+ atomic_long_set(&sem->owner, (long)current | RWSEM_UPGRADING |
+ RWSEM_READER_OWNED | RWSEM_NONSPINNABLE);
+}
+
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
lockdep_assert_preemption_disabled();
@@ -201,7 +212,7 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
*/
long count = atomic_long_read(&sem->count);
- if (count & RWSEM_WRITER_MASK)
+ if ((count & RWSEM_WRITER_MASK) && !(count & RWSEM_FLAG_UPGRADE_READ))
return false;
return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
}
@@ -1336,6 +1347,8 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
static inline void __up_read(struct rw_semaphore *sem)
{
long tmp;
+ unsigned long flags;
+ struct task_struct *owner;
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
@@ -1349,6 +1362,9 @@ static inline void __up_read(struct rw_semaphore *sem)
clear_nonspinnable(sem);
rwsem_wake(sem);
}
+ owner = rwsem_owner_flags(sem, &flags);
+ if (unlikely(!(tmp & RWSEM_READER_MASK) && (flags & RWSEM_UPGRADING)))
+ wake_up_process(owner);
preempt_enable();
}
@@ -1641,6 +1657,71 @@ void downgrade_write(struct rw_semaphore *sem)
}
EXPORT_SYMBOL(downgrade_write);
+static inline void rwsem_clear_upgrade_flag(struct rw_semaphore *sem)
+{
+ atomic_long_andnot(RWSEM_FLAG_UPGRADE_READ, &sem->count);
+}
+
+/*
+ * upgrade read lock to write lock
+ */
+static inline int __upgrade_read(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ preempt_disable();
+
+ tmp = atomic_long_read(&sem->count);
+ do {
+ if (tmp & (RWSEM_WRITER_MASK | RWSEM_FLAG_UPGRADE_READ)) {
+ preempt_enable();
+ return -EBUSY;
+ }
+ } while (!atomic_long_try_cmpxchg(&sem->count, &tmp,
+ tmp + RWSEM_FLAG_UPGRADE_READ + RWSEM_WRITER_LOCKED - RWSEM_READER_BIAS));
+
+ if ((tmp & RWSEM_READER_MASK) == RWSEM_READER_BIAS) {
+ /* fast path */
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ rwsem_clear_upgrade_flag(sem);
+ rwsem_set_owner(sem);
+ preempt_enable();
+ return 0;
+ }
+ /* slow path */
+ raw_spin_lock_irq(&sem->wait_lock);
+ rwsem_set_owner_upgrade(sem);
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ for (;;) {
+ if (!(atomic_long_read(&sem->count) & RWSEM_READER_MASK))
+ break;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ schedule_preempt_disabled();
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ raw_spin_lock_irq(&sem->wait_lock);
+ }
+
+ rwsem_clear_upgrade_flag(sem);
+ rwsem_set_owner(sem);
+ __set_current_state(TASK_RUNNING);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ preempt_enable();
+ return 0;
+}
+
+/*
+ * upgrade read lock to write lock
+ *
+ * Return: 0 on success, error code on failure
+ */
+int upgrade_read(struct rw_semaphore *sem)
+{
+ return __upgrade_read(sem);
+}
+EXPORT_SYMBOL(upgrade_read);
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void down_read_nested(struct rw_semaphore *sem, int subclass)
--
2.20.1
^ permalink raw reply [flat|nested] 28+ messages in thread
* [RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page
2024-10-16 4:35 [RFC 0/2] rwsem: introduce upgrade_read interface lizhe.67
2024-10-16 4:35 ` [RFC 1/2] " lizhe.67
@ 2024-10-16 4:36 ` lizhe.67
2024-10-16 11:53 ` Matthew Wilcox
2024-10-23 7:27 ` kernel test robot
2024-10-16 8:09 ` [RFC 0/2] rwsem: introduce upgrade_read interface Peter Zijlstra
2 siblings, 2 replies; 28+ messages in thread
From: lizhe.67 @ 2024-10-16 4:36 UTC (permalink / raw)
To: peterz, mingo, will, longman, boqun.feng, akpm
Cc: linux-kernel, linux-mm, lizhe.67
From: Li Zhe <lizhe.67@bytedance.com>
In collapse_huge_page(), we drop the mmap read lock and then take the
mmap write lock to prevent most accesses to the page tables. This
leaves a small window in which other tasks can acquire the mmap lock.
By using upgrade_read(), we avoid having to re-check the vma and pmd
in most cases.
Signed-off-by: Li Zhe <lizhe.67@bytedance.com>
---
mm/khugepaged.c | 36 +++++++++++++++++++-----------------
1 file changed, 19 insertions(+), 17 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f9c39898eaff..934051274f7a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1142,23 +1142,25 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
goto out_nolock;
}
- mmap_read_unlock(mm);
- /*
- * Prevent all access to pagetables with the exception of
- * gup_fast later handled by the ptep_clear_flush and the VM
- * handled by the anon_vma lock + PG_lock.
- *
- * UFFDIO_MOVE is prevented to race as well thanks to the
- * mmap_lock.
- */
- mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
- if (result != SCAN_SUCCEED)
- goto out_up_write;
- /* check if the pmd is still valid */
- result = check_pmd_still_valid(mm, address, pmd);
- if (result != SCAN_SUCCEED)
- goto out_up_write;
+ if (upgrade_read(&mm->mmap_lock)) {
+ mmap_read_unlock(mm);
+ /*
+ * Prevent all access to pagetables with the exception of
+ * gup_fast later handled by the ptep_clear_flush and the VM
+ * handled by the anon_vma lock + PG_lock.
+ *
+ * UFFDIO_MOVE is prevented to race as well thanks to the
+ * mmap_lock.
+ */
+ mmap_write_lock(mm);
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ if (result != SCAN_SUCCEED)
+ goto out_up_write;
+ /* check if the pmd is still valid */
+ result = check_pmd_still_valid(mm, address, pmd);
+ if (result != SCAN_SUCCEED)
+ goto out_up_write;
+ }
vma_start_write(vma);
anon_vma_lock_write(vma->anon_vma);
--
2.20.1
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 4:35 ` [RFC 1/2] " lizhe.67
@ 2024-10-16 4:56 ` Christoph Hellwig
2024-10-16 7:33 ` lizhe.67
2024-10-16 11:51 ` Matthew Wilcox
2024-10-16 11:49 ` Matthew Wilcox
2024-10-16 14:23 ` Waiman Long
2 siblings, 2 replies; 28+ messages in thread
From: Christoph Hellwig @ 2024-10-16 4:56 UTC (permalink / raw)
To: lizhe.67
Cc: peterz, mingo, will, longman, boqun.feng, akpm, linux-kernel, linux-mm
On Wed, Oct 16, 2024 at 12:35:59PM +0800, lizhe.67@bytedance.com wrote:
> From: Li Zhe <lizhe.67@bytedance.com>
>
> Introduce a new rwsem interface upgrade_read(). We can call it
It's obviously a try_upgrade_read, right?
> +extern int upgrade_read(struct rw_semaphore *sem);
No need for the extern. Also please make any new advanced functionality
like this an EXPORT_SYMBOL_GPL. That is if you actually had a modular
user, the current one is built-in. (although we could use this in
XFS if it lands).
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 4:56 ` Christoph Hellwig
@ 2024-10-16 7:33 ` lizhe.67
2024-10-16 7:36 ` Christoph Hellwig
2024-10-16 11:51 ` Matthew Wilcox
1 sibling, 1 reply; 28+ messages in thread
From: lizhe.67 @ 2024-10-16 7:33 UTC (permalink / raw)
To: hch
Cc: akpm, boqun.feng, linux-kernel, linux-mm, lizhe.67, longman,
mingo, peterz, will
On Tue, 15 Oct 2024 21:56:51 -0700, hch@infradead.org wrote:
>On Wed, Oct 16, 2024 at 12:35:59PM +0800, lizhe.67@bytedance.com wrote:
>> From: Li Zhe <lizhe.67@bytedance.com>
>>
>> Introduce a new rwsem interface upgrade_read(). We can call it
>
>It's obviously a try_upgrade_read, right?
Yes, try_upgrade_read is a better name, I will fix it in v2. Thanks!
>
>> +extern int upgrade_read(struct rw_semaphore *sem);
>
>No need for the extern. Also please make any new advanced functionality
Sorry I don't understand why extern is not needed. If we remove it,
we will encounter the following compilation error:
mm/khugepaged.c:1145:6: error: implicit declaration of function 'upgrade_read'; did you mean 'i_gid_read'? [-Werror=implicit-function-declaration]
if (upgrade_read(&mm->mmap_lock)) {
^~~~~~~~~~~~
>like this an EXPORT_SYMBOL_GPL. That is if you actually had a modular
>user, the current one is built-in. (although we could use this in
>XFS if it lands).
I see all advanced functionality in kernel/locking/rwsem.c is tagged with
EXPORT_SYMBOL (which is used in my patch), and I'm not sure if I need to
change it to EXPORT_SYMBOL_GPL.
Hope to see this interface used by XFS.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 7:33 ` lizhe.67
@ 2024-10-16 7:36 ` Christoph Hellwig
2024-10-16 8:00 ` lizhe.67
0 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2024-10-16 7:36 UTC (permalink / raw)
To: lizhe.67
Cc: hch, akpm, boqun.feng, linux-kernel, linux-mm, longman, mingo,
peterz, will
On Wed, Oct 16, 2024 at 03:33:40PM +0800, lizhe.67@bytedance.com wrote:
> >> +extern int upgrade_read(struct rw_semaphore *sem);
> >
> >No need for the extern. Also please make any new advanced functionality
>
> Sorry I don't understand why extern is not needed. If we remove it,
> we will encounter the following compilation error:
That sounds like you dropped the entire line above, and not just the
"extern ".
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 7:36 ` Christoph Hellwig
@ 2024-10-16 8:00 ` lizhe.67
2024-10-16 8:03 ` Christoph Hellwig
0 siblings, 1 reply; 28+ messages in thread
From: lizhe.67 @ 2024-10-16 8:00 UTC (permalink / raw)
To: hch
Cc: akpm, boqun.feng, linux-kernel, linux-mm, lizhe.67, longman,
mingo, peterz, will
On Wed, 16 Oct 2024 00:36:19 -0700, hch@infradead.org wrote:
>> >> +extern int upgrade_read(struct rw_semaphore *sem);
>> >
>> >No need for the extern. Also please make any new advanced functionality
>>
>> Sorry I don't understand why extern is not needed. If we remove it,
>> we will encounter the following compilation error:
>
>That sounds like you dropped the entire line above, and not just the
>"extern ".
OK I know what you mean. It is indeed OK to remove "extern", but all function
declarations in rwsem.h have the "extern" prefix. I think it would be
better to keep it consistent.
Thanks!
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 8:00 ` lizhe.67
@ 2024-10-16 8:03 ` Christoph Hellwig
2024-10-16 8:13 ` lizhe.67
0 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2024-10-16 8:03 UTC (permalink / raw)
To: lizhe.67
Cc: hch, akpm, boqun.feng, linux-kernel, linux-mm, longman, mingo,
peterz, will
On Wed, Oct 16, 2024 at 04:00:57PM +0800, lizhe.67@bytedance.com wrote:
> OK I know what you mean. It is indeed OK to remove "extern", but all function
> declarations in the rwsem.h have the "extern" prefix. I think it would be
> better to keep it consistent.
They are pointless and should never be added.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 0/2] rwsem: introduce upgrade_read interface
2024-10-16 4:35 [RFC 0/2] rwsem: introduce upgrade_read interface lizhe.67
2024-10-16 4:35 ` [RFC 1/2] " lizhe.67
2024-10-16 4:36 ` [RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page lizhe.67
@ 2024-10-16 8:09 ` Peter Zijlstra
2024-10-16 8:53 ` lizhe.67
2 siblings, 1 reply; 28+ messages in thread
From: Peter Zijlstra @ 2024-10-16 8:09 UTC (permalink / raw)
To: lizhe.67; +Cc: mingo, will, longman, boqun.feng, akpm, linux-kernel, linux-mm
On Wed, Oct 16, 2024 at 12:35:58PM +0800, lizhe.67@bytedance.com wrote:
> From: Li Zhe <lizhe.67@bytedance.com>
>
> In the current kernel rwsem implementation, there is an interface to
> downgrade write lock to read lock, but there is no interface to upgrade
> a read lock to write lock. This means that in order to acquire write
> lock while holding read lock, we have to release the read lock first and
> then acquire the write lock, which will introduce some troubles in
> concurrent programming. This patch set provides the 'upgrade_read' interface
> to solve this problem. This interface can change a read lock to a write
> lock.
upgrade-read is fundamentally prone to deadlocks. Imagine two concurrent
invocations, each waiting for all readers to go away before proceeding
to upgrade to a writer.
Any solution to fixing that will end up being semantically similar to
dropping the read lock and acquiring a write lock -- there will not be a
single continuous critical section.
As such, this interface makes no sense.
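To illustrate the scenario described above, assuming both upgraders block
waiting for the reader count to drain rather than failing:

        Task A                          Task B
        ------                          ------
        down_read(&sem);                down_read(&sem);
        upgrade_read(&sem);             upgrade_read(&sem);
          waits for all other             waits for all other
          readers, including B            readers, including A

Neither side can make progress unless one of them backs off by dropping
its read lock -- at which point the "upgrade" is no longer a single
continuous critical section.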
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 8:03 ` Christoph Hellwig
@ 2024-10-16 8:13 ` lizhe.67
0 siblings, 0 replies; 28+ messages in thread
From: lizhe.67 @ 2024-10-16 8:13 UTC (permalink / raw)
To: hch
Cc: akpm, boqun.feng, linux-kernel, linux-mm, lizhe.67, longman,
mingo, peterz, will
On Wed, 16 Oct 2024 01:03:33 -0700, hch@infradead.org wrote:
>> OK I know what you mean. It is indeed OK to remove "extern", but all function
>> declarations in the rwsem.h have the "extern" prefix. I think it would be
>> better to keep it consistent.
>
>They are pointless and should never be added.
OK I will remove it in v2. Thanks!
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 0/2] rwsem: introduce upgrade_read interface
2024-10-16 8:09 ` [RFC 0/2] rwsem: introduce upgrade_read interface Peter Zijlstra
@ 2024-10-16 8:53 ` lizhe.67
2024-10-16 12:10 ` Peter Zijlstra
0 siblings, 1 reply; 28+ messages in thread
From: lizhe.67 @ 2024-10-16 8:53 UTC (permalink / raw)
To: peterz
Cc: akpm, boqun.feng, linux-kernel, linux-mm, lizhe.67, longman, mingo, will
On Wed, 16 Oct 2024 10:09:55 +0200, peterz@infradead.org wrote:
> On Wed, Oct 16, 2024 at 12:35:58PM +0800, lizhe.67@bytedance.com wrote:
> > From: Li Zhe <lizhe.67@bytedance.com>
> >
> > In the current kernel rwsem implementation, there is an interface to
> > downgrade write lock to read lock, but there is no interface to upgrade
> > a read lock to write lock. This means that in order to acquire write
> > lock while holding read lock, we have to release the read lock first and
> > then acquire the write lock, which will introduce some troubles in
> > concurrent programming. This patch set provides the 'upgrade_read' interface
> > to solve this problem. This interface can change a read lock to a write
> > lock.
>
> upgrade-read is fundamentally prone to deadlocks. Imagine two concurrent
> invocations, each waiting for all readers to go away before proceeding
> to upgrade to a writer.
>
> Any solution to fixing that will end up being semantically similar to
> dropping the read lock and acquiring a write lock -- there will not be a
> single continuous critical section.
According to the implementation of this patch, one of the invocations will
get '-EBUSY' in this case. If -EBUSY is returned and the invoking thread
keeps retrying instead of dropping the read lock and acquiring the write lock,
it may cause problems. Of course, this patchset only tries its best to achieve
a single continuous critical section; there is no guarantee.
> As such, this interface makes no sense.
This interface just tries to reduce, as much as possible, the overhead of
the additional checks required by non-continuous critical sections, rather
than eliminating it in all scenarios. Would it be better to change the error
code to something else, so that callers know not to retry this interface?
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 4:35 ` [RFC 1/2] " lizhe.67
2024-10-16 4:56 ` Christoph Hellwig
@ 2024-10-16 11:49 ` Matthew Wilcox
2024-10-17 6:23 ` lizhe.67
2024-10-16 14:23 ` Waiman Long
2 siblings, 1 reply; 28+ messages in thread
From: Matthew Wilcox @ 2024-10-16 11:49 UTC (permalink / raw)
To: lizhe.67
Cc: peterz, mingo, will, longman, boqun.feng, akpm, linux-kernel, linux-mm
On Wed, Oct 16, 2024 at 12:35:59PM +0800, lizhe.67@bytedance.com wrote:
> +++ b/include/linux/rwsem.h
> @@ -249,6 +249,7 @@ DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
> * downgrade write lock to read lock
> */
> extern void downgrade_write(struct rw_semaphore *sem);
> +extern int upgrade_read(struct rw_semaphore *sem);
This needs __must_check, and I think this should be a bool, not an errno
return.
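(Sketching the suggested form, rather than the patch as posted:)

        bool __must_check upgrade_read(struct rw_semaphore *sem);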
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 4:56 ` Christoph Hellwig
2024-10-16 7:33 ` lizhe.67
@ 2024-10-16 11:51 ` Matthew Wilcox
2024-10-16 12:21 ` Christoph Hellwig
1 sibling, 1 reply; 28+ messages in thread
From: Matthew Wilcox @ 2024-10-16 11:51 UTC (permalink / raw)
To: Christoph Hellwig
Cc: lizhe.67, peterz, mingo, will, longman, boqun.feng, akpm,
linux-kernel, linux-mm
On Tue, Oct 15, 2024 at 09:56:51PM -0700, Christoph Hellwig wrote:
> On Wed, Oct 16, 2024 at 12:35:59PM +0800, lizhe.67@bytedance.com wrote:
> > From: Li Zhe <lizhe.67@bytedance.com>
> >
> > Introduce a new rwsem interface upgrade_read(). We can call it
>
> It's obviously a try_upgrade_read, right?
Well, that's confusing. "try" usually means "don't sleep", and this
sleeps. Maybe it shouldn't sleep; ie we make this fail if there's any
other reader? It'll succeed less often, but it'll be easier to
understand.
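A rough sketch of such a non-sleeping variant, against the existing rwsem
count layout; owner and lockdep bookkeeping are omitted, so this is
illustrative only:

        static inline bool try_upgrade_read(struct rw_semaphore *sem)
        {
                /* exactly one reader (the caller), no waiters, no writer */
                long old = RWSEM_READER_BIAS;

                /*
                 * Atomically trade our reader reference for the writer
                 * bit; fail immediately if anyone else is involved.
                 */
                return atomic_long_try_cmpxchg_acquire(&sem->count, &old,
                                                       RWSEM_WRITER_LOCKED);
        }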
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page
2024-10-16 4:36 ` [RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page lizhe.67
@ 2024-10-16 11:53 ` Matthew Wilcox
2024-10-17 6:18 ` lizhe.67
2024-10-23 7:27 ` kernel test robot
1 sibling, 1 reply; 28+ messages in thread
From: Matthew Wilcox @ 2024-10-16 11:53 UTC (permalink / raw)
To: lizhe.67
Cc: peterz, mingo, will, longman, boqun.feng, akpm, linux-kernel, linux-mm
On Wed, Oct 16, 2024 at 12:36:00PM +0800, lizhe.67@bytedance.com wrote:
> From: Li Zhe <lizhe.67@bytedance.com>
>
> In function collapse_huge_page(), we drop mmap read lock and get
> mmap write lock to prevent most accesses to pagetables. There is
> a small time window to allow other tasks to acquire the mmap lock.
> With the use of upgrade_read(), we don't need to check vma and pmd
> again in most cases.
This is clearly a performance optimisation. So you must have some
numbers that justify this, please include them.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 0/2] rwsem: introduce upgrade_read interface
2024-10-16 8:53 ` lizhe.67
@ 2024-10-16 12:10 ` Peter Zijlstra
0 siblings, 0 replies; 28+ messages in thread
From: Peter Zijlstra @ 2024-10-16 12:10 UTC (permalink / raw)
To: lizhe.67; +Cc: akpm, boqun.feng, linux-kernel, linux-mm, longman, mingo, will
On Wed, Oct 16, 2024 at 04:53:45PM +0800, lizhe.67@bytedance.com wrote:
> On Wed, 16 Oct 2024 10:09:55 +0200, peterz@infradead.org wrote:
>
> > On Wed, Oct 16, 2024 at 12:35:58PM +0800, lizhe.67@bytedance.com wrote:
> > > From: Li Zhe <lizhe.67@bytedance.com>
> > >
> > > In the current kernel rwsem implementation, there is an interface to
> > > downgrade write lock to read lock, but there is no interface to upgrade
> > > a read lock to write lock. This means that in order to acquire write
> > > lock while holding read lock, we have to release the read lock first and
> > > then acquire the write lock, which will introduce some troubles in
> > > concurrent programming. This patch set provides the 'upgrade_read' interface
> > > to solve this problem. This interface can change a read lock to a write
> > > lock.
> >
> > upgrade-read is fundamentally prone to deadlocks. Imagine two concurrent
> > invocations, each waiting for all readers to go away before proceeding
> > to upgrade to a writer.
> >
> > Any solution to fixing that will end up being semantically similar to
> > dropping the read lock and acquiring a write lock -- there will not be a
> > single continuous critical section.
>
> According to the implementation of this patch, one of the invocation will
Since the premise as described here is utter nonsense, I didn't get to
actually reading the implementation -- why continue to waste time etc.
> get '-EBUSY' in this case. If -EBUSY is obtained and the invocation thread
> continues to retry instead of dropping the read lock and acquiring a write lock,
> it may cause problems.
Failure should drop the read lock, otherwise it is too easy to mess
things up.
> Of course, this patchset only try it's best to achieve a
> single continuous critical section as much as possible, and there is no guarantee.
As already stated, nothing like that was mentioned.
> > As such, this interface makes no sense.
>
> This interface is just trying to reduce the overhead caused by the
> additional checks, which is caused by non-continuous critical
> sections, as much as possible. Rather than eliminating it in all
> scenarios. So would it be better to change the error code to something
> else? So that the caller will not retry this interface?
You fail to quantify the gains. How am I supposed to know if the
(significant?) increase in complexity is worth it?
Why should I accept this increase in complexity for the sake of
khugepaged, something which I care very little about?
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 11:51 ` Matthew Wilcox
@ 2024-10-16 12:21 ` Christoph Hellwig
0 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2024-10-16 12:21 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Christoph Hellwig, lizhe.67, peterz, mingo, will, longman,
boqun.feng, akpm, linux-kernel, linux-mm
On Wed, Oct 16, 2024 at 12:51:23PM +0100, Matthew Wilcox wrote:
> > It's obviously a try_upgrade_read, right?
>
> Well, that's confusing. "try" usually means "don't sleep", and this
> sleeps. Maybe it shouldn't sleep; ie we make this fail if there's any
> other reader? It'll succeed less often, but it'll be easier to
> understand.
To me try primarily implies that it can fail and the return value
needs to be checked. But I guess it has different implications to
different people.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 4:35 ` [RFC 1/2] " lizhe.67
2024-10-16 4:56 ` Christoph Hellwig
2024-10-16 11:49 ` Matthew Wilcox
@ 2024-10-16 14:23 ` Waiman Long
2024-10-16 18:05 ` Matthew Wilcox
` (2 more replies)
2 siblings, 3 replies; 28+ messages in thread
From: Waiman Long @ 2024-10-16 14:23 UTC (permalink / raw)
To: lizhe.67, peterz, mingo, will, boqun.feng, akpm; +Cc: linux-kernel, linux-mm
On 10/16/24 12:35 AM, lizhe.67@bytedance.com wrote:
> From: Li Zhe <lizhe.67@bytedance.com>
>
> Introduce a new rwsem interface upgrade_read(). We can call it
> to upgrade the lock into write rwsem lock after we get read lock.
> This interface will wait for all readers to exit before obtaining
> the write lock. In addition, this interface has a higher priority
> than any process waiting for the write lock and subsequent threads
> that want to obtain the read lock.
>
> Signed-off-by: Li Zhe <lizhe.67@bytedance.com>
> ---
> include/linux/rwsem.h | 1 +
> kernel/locking/rwsem.c | 87 ++++++++++++++++++++++++++++++++++++++++--
> 2 files changed, 85 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
> index c8b543d428b0..90183ab5ea79 100644
> --- a/include/linux/rwsem.h
> +++ b/include/linux/rwsem.h
> @@ -249,6 +249,7 @@ DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
> * downgrade write lock to read lock
> */
> extern void downgrade_write(struct rw_semaphore *sem);
> +extern int upgrade_read(struct rw_semaphore *sem);
>
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> /*
> diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> index 2bbb6eca5144..0583e1be3dbf 100644
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -37,6 +37,7 @@
> * meanings when set.
> * - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
> * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
> + * - Bit 2: RWSEM_UPGRADING - doing upgrade read process
> *
> * When the rwsem is reader-owned and a spinning writer has timed out,
> * the nonspinnable bit will be set to disable optimistic spinning.
> @@ -62,7 +63,8 @@
> */
> #define RWSEM_READER_OWNED (1UL << 0)
> #define RWSEM_NONSPINNABLE (1UL << 1)
> -#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
> +#define RWSEM_UPGRADING (1UL << 2)
> +#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE | RWSEM_UPGRADING)
>
> #ifdef CONFIG_DEBUG_RWSEMS
> # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
> @@ -93,7 +95,8 @@
> * Bit 0 - writer locked bit
> * Bit 1 - waiters present bit
> * Bit 2 - lock handoff bit
> - * Bits 3-7 - reserved
> + * Bit 3 - upgrade read bit
> + * Bits 4-7 - reserved
> * Bits 8-30 - 23-bit reader count
> * Bit 31 - read fail bit
> *
> @@ -117,6 +120,7 @@
> #define RWSEM_WRITER_LOCKED (1UL << 0)
> #define RWSEM_FLAG_WAITERS (1UL << 1)
> #define RWSEM_FLAG_HANDOFF (1UL << 2)
> +#define RWSEM_FLAG_UPGRADE_READ (1UL << 3)
> #define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))
>
> #define RWSEM_READER_SHIFT 8
> @@ -143,6 +147,13 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem)
> atomic_long_set(&sem->owner, (long)current);
> }
>
> +static inline void rwsem_set_owner_upgrade(struct rw_semaphore *sem)
> +{
> + lockdep_assert_preemption_disabled();
> + atomic_long_set(&sem->owner, (long)current | RWSEM_UPGRADING |
> + RWSEM_READER_OWNED | RWSEM_NONSPINNABLE);
> +}
Because of possible racing between 2 competing upgraders, the read lock
owner setting has to be atomic to avoid one overwriting the other.
> +
> static inline void rwsem_clear_owner(struct rw_semaphore *sem)
> {
> lockdep_assert_preemption_disabled();
> @@ -201,7 +212,7 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
> */
> long count = atomic_long_read(&sem->count);
>
> - if (count & RWSEM_WRITER_MASK)
> + if ((count & RWSEM_WRITER_MASK) && !(count & RWSEM_FLAG_UPGRADE_READ))
> return false;
> return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
> }
> @@ -1336,6 +1347,8 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
> static inline void __up_read(struct rw_semaphore *sem)
> {
> long tmp;
> + unsigned long flags;
> + struct task_struct *owner;
>
> DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
> DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
> @@ -1349,6 +1362,9 @@ static inline void __up_read(struct rw_semaphore *sem)
> clear_nonspinnable(sem);
> rwsem_wake(sem);
> }
> + owner = rwsem_owner_flags(sem, &flags);
> + if (unlikely(!(tmp & RWSEM_READER_MASK) && (flags & RWSEM_UPGRADING)))
> + wake_up_process(owner);
> preempt_enable();
> }
>
> @@ -1641,6 +1657,71 @@ void downgrade_write(struct rw_semaphore *sem)
> }
> EXPORT_SYMBOL(downgrade_write);
>
> +static inline void rwsem_clear_upgrade_flag(struct rw_semaphore *sem)
> +{
> + atomic_long_andnot(RWSEM_FLAG_UPGRADE_READ, &sem->count);
> +}
> +
> +/*
> + * upgrade read lock to write lock
> + */
> +static inline int __upgrade_read(struct rw_semaphore *sem)
> +{
> + long tmp;
> +
> + preempt_disable();
> +
> + tmp = atomic_long_read(&sem->count);
> + do {
> + if (tmp & (RWSEM_WRITER_MASK | RWSEM_FLAG_UPGRADE_READ)) {
> + preempt_enable();
> + return -EBUSY;
> + }
> + } while (!atomic_long_try_cmpxchg(&sem->count, &tmp,
> + tmp + RWSEM_FLAG_UPGRADE_READ + RWSEM_WRITER_LOCKED - RWSEM_READER_BIAS));
> +
> + if ((tmp & RWSEM_READER_MASK) == RWSEM_READER_BIAS) {
> + /* fast path */
> + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
> + rwsem_clear_upgrade_flag(sem);
> + rwsem_set_owner(sem);
> + preempt_enable();
> + return 0;
> + }
> + /* slow path */
> + raw_spin_lock_irq(&sem->wait_lock);
> + rwsem_set_owner_upgrade(sem);
> +
> + set_current_state(TASK_UNINTERRUPTIBLE);
> +
> + for (;;) {
> + if (!(atomic_long_read(&sem->count) & RWSEM_READER_MASK))
> + break;
> + raw_spin_unlock_irq(&sem->wait_lock);
> + schedule_preempt_disabled();
> + set_current_state(TASK_UNINTERRUPTIBLE);
> + raw_spin_lock_irq(&sem->wait_lock);
> + }
> +
> + rwsem_clear_upgrade_flag(sem);
> + rwsem_set_owner(sem);
> + __set_current_state(TASK_RUNNING);
> + raw_spin_unlock_irq(&sem->wait_lock);
> + preempt_enable();
> + return 0;
> +}
> +
> +/*
> + * upgrade read lock to write lock
> + *
> + * Return: 0 on success, error code on failure
> + */
> +int upgrade_read(struct rw_semaphore *sem)
> +{
> + return __upgrade_read(sem);
> +}
> +EXPORT_SYMBOL(upgrade_read);
This new interface should have an API similar to a trylock. True if
successful, false otherwise. I like the read_try_upgrade() name.
Another alternative that I have been thinking about is a down_read()
variant with intention to upgrade later. This will ensure that only one
active reader is allowed to upgrade later. With this, upgrade_read()
will always succeed, maybe with some sleeping, as long as the correct
down_read() is used.
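Roughly, with hypothetical names -- nothing like this exists in rwsem today:

        down_read_intent(&sem);    /* shared with plain readers, exclusive
                                      against writers and other intent
                                      holders */
        /* ... read-side work ... */
        upgrade_read(&sem);        /* only plain readers can be ahead of us,
                                      so this may sleep but cannot fail or
                                      deadlock */
        /* ... write-side work ... */
        up_write(&sem);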
Cheers,
Longman
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 14:23 ` Waiman Long
@ 2024-10-16 18:05 ` Matthew Wilcox
2024-10-16 18:39 ` Waiman Long
2024-10-17 6:46 ` lizhe.67
2024-10-17 15:05 ` Christoph Hellwig
2 siblings, 1 reply; 28+ messages in thread
From: Matthew Wilcox @ 2024-10-16 18:05 UTC (permalink / raw)
To: Waiman Long
Cc: lizhe.67, peterz, mingo, will, boqun.feng, akpm, linux-kernel, linux-mm
On Wed, Oct 16, 2024 at 10:23:14AM -0400, Waiman Long wrote:
> Another alternative that I have been thinking about is a down_read() variant
> with intention to upgrade later. This will ensure that only one active
> reader is allowed to upgrade later. With this, upgrade_read() will always
> succeed, maybe with some sleeping, as long as the correct down_read() is
> used.
How is that different from Kent's SIX locks other than you can take an
rwsem for write immediately (SIX locks have to be taken for Intent and
then Upgraded)?
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 18:05 ` Matthew Wilcox
@ 2024-10-16 18:39 ` Waiman Long
0 siblings, 0 replies; 28+ messages in thread
From: Waiman Long @ 2024-10-16 18:39 UTC (permalink / raw)
To: Matthew Wilcox, Waiman Long
Cc: lizhe.67, peterz, mingo, will, boqun.feng, akpm, linux-kernel, linux-mm
On 10/16/24 2:05 PM, Matthew Wilcox wrote:
> On Wed, Oct 16, 2024 at 10:23:14AM -0400, Waiman Long wrote:
>> Another alternative that I have been thinking about is a down_read() variant
>> with intention to upgrade later. This will ensure that only one active
>> reader is allowed to upgrade later. With this, upgrade_read() will always
>> succeed, maybe with some sleeping, as long as the correct down_read() is
>> used.
> How is that different from Kent's SIX locks other than you can take an
> rwsem for write immediately (SIX locks have to be taken for Intent and
> then Upgraded)?
Yes, it is modeled after Kent's SIX locks. The goal is to eventually
eliminate the need for a separate SIX lock.
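For context, the SIX lock state compatibility is roughly as follows (write
is normally only taken by the task that already holds intent):

                  read    intent   write
        read       ok       ok      --
        intent     ok       --      --
        write      --       --      --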
Cheers,
Longman
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page
2024-10-16 11:53 ` Matthew Wilcox
@ 2024-10-17 6:18 ` lizhe.67
2024-10-17 13:20 ` Matthew Wilcox
0 siblings, 1 reply; 28+ messages in thread
From: lizhe.67 @ 2024-10-17 6:18 UTC (permalink / raw)
To: willy
Cc: akpm, boqun.feng, linux-kernel, linux-mm, lizhe.67, longman,
mingo, peterz, will
On Wed, 16 Oct 2024 12:53:15 +0100, willy@infradead.org wrote:
>On Wed, Oct 16, 2024 at 12:36:00PM +0800, lizhe.67@bytedance.com wrote:
>> From: Li Zhe <lizhe.67@bytedance.com>
>>
>> In function collapse_huge_page(), we drop mmap read lock and get
>> mmap write lock to prevent most accesses to pagetables. There is
>> a small time window to allow other tasks to acquire the mmap lock.
>> With the use of upgrade_read(), we don't need to check vma and pmd
>> again in most cases.
>
>This is clearly a performance optimisation. So you must have some
>numbers that justify this, please include them.
Yes, I will add the relevant data to the v2 patch.
Thanks!
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 11:49 ` Matthew Wilcox
@ 2024-10-17 6:23 ` lizhe.67
0 siblings, 0 replies; 28+ messages in thread
From: lizhe.67 @ 2024-10-17 6:23 UTC (permalink / raw)
To: willy
Cc: akpm, boqun.feng, linux-kernel, linux-mm, lizhe.67, longman,
mingo, peterz, will
On Wed, 16 Oct 2024 12:49:44 +0100, willy@infradead.org wrote:
>On Wed, Oct 16, 2024 at 12:35:59PM +0800, lizhe.67@bytedance.com wrote:
>> +++ b/include/linux/rwsem.h
>> @@ -249,6 +249,7 @@ DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
>> * downgrade write lock to read lock
>> */
>> extern void downgrade_write(struct rw_semaphore *sem);
>> +extern int upgrade_read(struct rw_semaphore *sem);
>
>This needs __must_check, and I think this should be a bool, not an errno
>return.
I can't agree more. I will fix it in v2.
Thanks!
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 14:23 ` Waiman Long
2024-10-16 18:05 ` Matthew Wilcox
@ 2024-10-17 6:46 ` lizhe.67
2024-10-17 15:05 ` Christoph Hellwig
2 siblings, 0 replies; 28+ messages in thread
From: lizhe.67 @ 2024-10-17 6:46 UTC (permalink / raw)
To: llong
Cc: akpm, boqun.feng, linux-kernel, linux-mm, lizhe.67, mingo, peterz, will
On Wed, 16 Oct 2024 10:23:14 -0400, llong@redhat.com wrote:
>> +static inline void rwsem_set_owner_upgrade(struct rw_semaphore *sem)
>> +{
>> + lockdep_assert_preemption_disabled();
>> + atomic_long_set(&sem->owner, (long)current | RWSEM_UPGRADING |
>> + RWSEM_READER_OWNED | RWSEM_NONSPINNABLE);
>> +}
>
>Because of possible racing between 2 competing upgraders, read lock
>owner setting has to be atomic to avoid one overwriting the others.
I already handle the race between competing upgraders at the very beginning
of __upgrade_read(), via the cmpxchg on sem->count. Is it still necessary to
handle that race here? The relevant code is as follows.
+static inline int __upgrade_read(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ preempt_disable();
+
+ tmp = atomic_long_read(&sem->count);
+ do {
+ if (tmp & (RWSEM_WRITER_MASK | RWSEM_FLAG_UPGRADE_READ)) {
+ preempt_enable();
+ return -EBUSY;
+ }
+ } while (!atomic_long_try_cmpxchg(&sem->count, &tmp,
+ tmp + RWSEM_FLAG_UPGRADE_READ + RWSEM_WRITER_LOCKED - RWSEM_READER_BIAS));
>This new interface should have an API similar to a trylock. True if
>successful, false otherwise. I like the read_try_upgrade() name.
I can't agree more. I will add a read_try_upgrade() API in v2.
>Another alternative that I have been thinking about is a down_read()
>variant with intention to upgrade later. This will ensure that only one
>active reader is allowed to upgrade later. With this, upgrade_read()
>will always succeed, maybe with some sleeping, as long as the correct
>down_read() is used.
I haven't figured out how to do this yet. If the current upgrade_read()
idea is also acceptable, should I continue to complete this patchset
along those lines?
>Cheers,
>Longman
Thanks for your review!
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page
2024-10-17 6:18 ` lizhe.67
@ 2024-10-17 13:20 ` Matthew Wilcox
2024-10-18 6:37 ` lizhe.67
0 siblings, 1 reply; 28+ messages in thread
From: Matthew Wilcox @ 2024-10-17 13:20 UTC (permalink / raw)
To: lizhe.67
Cc: akpm, boqun.feng, linux-kernel, linux-mm, longman, mingo, peterz, will
On Thu, Oct 17, 2024 at 02:18:41PM +0800, lizhe.67@bytedance.com wrote:
> On Wed, 16 Oct 2024 12:53:15 +0100, willy@infradead.org wrote:
>
> >On Wed, Oct 16, 2024 at 12:36:00PM +0800, lizhe.67@bytedance.com wrote:
> >> From: Li Zhe <lizhe.67@bytedance.com>
> >>
> >> In function collapse_huge_page(), we drop mmap read lock and get
> >> mmap write lock to prevent most accesses to pagetables. There is
> >> a small time window to allow other tasks to acquire the mmap lock.
> >> With the use of upgrade_read(), we don't need to check vma and pmd
> >> again in most cases.
> >
> >This is clearly a performance optimisation. So you must have some
> >numbers that justify this, please include them.
>
> Yes, I will add the relevant data to v2 patch.
How about telling us all now so we know whether to continue discussing
this?
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-16 14:23 ` Waiman Long
2024-10-16 18:05 ` Matthew Wilcox
2024-10-17 6:46 ` lizhe.67
@ 2024-10-17 15:05 ` Christoph Hellwig
2024-10-17 17:36 ` Waiman Long
2 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2024-10-17 15:05 UTC (permalink / raw)
To: Waiman Long
Cc: lizhe.67, peterz, mingo, will, boqun.feng, akpm, linux-kernel, linux-mm
On Wed, Oct 16, 2024 at 10:23:14AM -0400, Waiman Long wrote:
> Another alternative that I have been thinking about is a down_read() variant
> with intention to upgrade later. This will ensure that only one active
> reader is allowed to upgrade later. With this, upgrade_read() will always
> succeed, maybe with some sleeping, as long as the correct down_read() is
> used.
At least for the XFS use case, where direct I/O takes a shared lock
that needs to be replaced with an exclusive one for certain kinds of
I/O, this would be useless. But then again we've survived without this
operation for a long time, despite the initial port bringing one over
from IRIX.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-17 15:05 ` Christoph Hellwig
@ 2024-10-17 17:36 ` Waiman Long
2024-10-18 5:06 ` Christoph Hellwig
0 siblings, 1 reply; 28+ messages in thread
From: Waiman Long @ 2024-10-17 17:36 UTC (permalink / raw)
To: Christoph Hellwig, Waiman Long
Cc: lizhe.67, peterz, mingo, will, boqun.feng, akpm, linux-kernel, linux-mm
On 10/17/24 11:05 AM, Christoph Hellwig wrote:
> On Wed, Oct 16, 2024 at 10:23:14AM -0400, Waiman Long wrote:
>> Another alternative that I have been thinking about is a down_read() variant
>> with intention to upgrade later. This will ensure that only one active
>> reader is allowed to upgrade later. With this, upgrade_read() will always
>> succeed, maybe with some sleeping, as long as the correct down_read() is
>> used.
> At least for the XFS use case where direct I/O takes a share lock
> that needs to be replaced with an exclusive one for certain kinds of
> I/O would be useless. But then again we've survived without this
> operation for a long time, despite the initial port bringing one over
> from IRIX.
That means XFS only needs to upgrade to a write lock in certain cases,
not all of them. Right? In that case, a read_try_upgrade() that
attempts to upgrade to a write lock would be useful.
Cheers,
Longman
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 1/2] rwsem: introduce upgrade_read interface
2024-10-17 17:36 ` Waiman Long
@ 2024-10-18 5:06 ` Christoph Hellwig
0 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2024-10-18 5:06 UTC (permalink / raw)
To: Waiman Long
Cc: Christoph Hellwig, lizhe.67, peterz, mingo, will, boqun.feng,
akpm, linux-kernel, linux-mm
On Thu, Oct 17, 2024 at 01:36:41PM -0400, Waiman Long wrote:
> > At least for the XFS use case where direct I/O takes a share lock
> > that needs to be replaced with an exclusive one for certain kinds of
> > I/O would be useless. But then again we've survived without this
> > operation for a long time, despite the initial port bringing one over
> > from IRIX.
>
> That means XFS only needs to upgrade to a write lock in certain cases only,
> not all of them.
Yes. Basically when we detect that a direct I/O write needs to clear the
SUID bit or other metadata, or when it is an extending write.
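For reference, the current shape of that path is roughly the following
(illustrative only, not actual XFS code; dio_write_needs_exclusive() is a
made-up placeholder for those checks):

        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        if (dio_write_needs_exclusive(iocb)) {
                /* e.g. the write clears SUID or extends EOF */
                xfs_iunlock(ip, XFS_IOLOCK_SHARED);
                xfs_ilock(ip, XFS_IOLOCK_EXCL);
                /* re-check: things may have changed while unlocked */
        }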
> Right? In that case, read_try_upgrade() that attempts to
> upgrade to a write lock will be useful.
Yes. But I also understand Peter's reasoning that it will be very hard
to actually have a useful implementation that does better than just
unlocking (which is what we do currently).
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page
2024-10-17 13:20 ` Matthew Wilcox
@ 2024-10-18 6:37 ` lizhe.67
0 siblings, 0 replies; 28+ messages in thread
From: lizhe.67 @ 2024-10-18 6:37 UTC (permalink / raw)
To: willy
Cc: akpm, boqun.feng, linux-kernel, linux-mm, lizhe.67, longman,
mingo, peterz, will
On Thu, 17 Oct 2024 14:20:12 +0100, willy@infradead.org wrote:
> On Thu, Oct 17, 2024 at 02:18:41PM +0800, lizhe.67@bytedance.com wrote:
> > On Wed, 16 Oct 2024 12:53:15 +0100, willy@infradead.org wrote:
> >
> > >On Wed, Oct 16, 2024 at 12:36:00PM +0800, lizhe.67@bytedance.com wrote:
> > >> From: Li Zhe <lizhe.67@bytedance.com>
> > >>
> > >> In function collapse_huge_page(), we drop mmap read lock and get
> > >> mmap write lock to prevent most accesses to pagetables. There is
> > >> a small time window to allow other tasks to acquire the mmap lock.
> > >> With the use of upgrade_read(), we don't need to check vma and pmd
> > >> again in most cases.
> > >
> > >This is clearly a performance optimisation. So you must have some
> > >numebrs that justify this, please include them.
> >
> > Yes, I will add the relevant data to v2 patch.
>
> How about telling us all now so we know whether to continue discussing
> this?
In my test environment, collapse_huge_page() only achieved a 0.25%
performance improvement. I used ftrace to measure the execution time of
collapse_huge_page(). The test code and test command are as follows.
(1) Test result:
average execution time of collapse_huge_page()
before this patch: 1611.06283 us
after this patch: 1597.01474 us
(2) Test code:
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

#define MMAP_SIZE      (2ul*1024*1024)
#define ALIGN(x, mask) (((x) + ((mask)-1)) & ~((mask)-1))

int main(void)
{
        int num = 100;
        size_t page_sz = getpagesize();

        while (num--) {
                size_t index;
                unsigned char *p_map;
                unsigned char *p_map_real;

                /* map 4M so the start can be aligned to a 2M boundary */
                p_map = (unsigned char *)mmap(0, 2 * MMAP_SIZE, PROT_READ | PROT_WRITE,
                                              MAP_PRIVATE|MAP_ANON, -1, 0);
                if (p_map == MAP_FAILED) {
                        printf("mmap fail\n");
                        return -1;
                } else {
                        p_map_real = (unsigned char *)ALIGN((unsigned long)p_map, MMAP_SIZE);
                        printf("mmap get %p, align to %p\n", p_map, p_map_real);
                }

                /* touch every page so the 2M range is fully populated */
                for (index = 0; index < MMAP_SIZE; index += page_sz)
                        p_map_real[index] = 6;

                /* 25 == MADV_COLLAPSE */
                int ret = madvise(p_map_real, MMAP_SIZE, 25);
                printf("ret is %d\n", ret);

                munmap(p_map, 2 * MMAP_SIZE);
        }
        return 0;
}
(3) Test command:
echo never > /sys/kernel/mm/transparent_hugepage/enabled
gcc test.c -o test
trace-cmd record -p function_graph -g collapse_huge_page --max-graph-depth 1 ./test
The optimization of collapse_huge_page() seems insignificant. I am not
sure whether it would have a more noticeable effect in other scenarios.
^ permalink raw reply [flat|nested] 28+ messages in thread
* Re: [RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page
2024-10-16 4:36 ` [RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page lizhe.67
2024-10-16 11:53 ` Matthew Wilcox
@ 2024-10-23 7:27 ` kernel test robot
1 sibling, 0 replies; 28+ messages in thread
From: kernel test robot @ 2024-10-23 7:27 UTC (permalink / raw)
To: lizhe.67
Cc: oe-lkp, lkp, linux-mm, peterz, mingo, will, longman, boqun.feng,
akpm, linux-kernel, lizhe.67, oliver.sang
Hello,
kernel test robot noticed "WARNING:at_include/linux/rwsem.h:#collapse_huge_page" on:
commit: 6604438065fc95d188ffcde12f687dfc319ef2cc ("[RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page")
url: https://github.com/intel-lab-lkp/linux/commits/lizhe-67-bytedance-com/rwsem-introduce-upgrade_read-interface/20241016-123810
base: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git 823a566221a5639f6c69424897218e5d6431a970
patch link: https://lore.kernel.org/all/20241016043600.35139-3-lizhe.67@bytedance.com/
patch subject: [RFC 2/2] khugepaged: use upgrade_read() to optimize collapse_huge_page
in testcase: boot
config: x86_64-rhel-8.3-kselftests
compiler: gcc-12
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
(please refer to attached dmesg/kmsg for entire log/backtrace)
+------------------------------------------------------+------------+------------+
| | 6a6ad5e040 | 6604438065 |
+------------------------------------------------------+------------+------------+
| WARNING:at_include/linux/rwsem.h:#collapse_huge_page | 0 | 18 |
| RIP:collapse_huge_page | 0 | 18 |
+------------------------------------------------------+------------+------------+
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202410231434.e98a7287-oliver.sang@intel.com
[ 19.879799][ T39] ------------[ cut here ]------------
[ 19.880779][ T39] WARNING: CPU: 0 PID: 39 at include/linux/rwsem.h:203 collapse_huge_page (include/linux/rwsem.h:203 include/linux/mmap_lock.h:70 include/linux/mm.h:735 include/linux/mm.h:754 mm/khugepaged.c:1165)
[ 19.881951][ T39] Modules linked in: sch_fq_codel
[ 19.882624][ T39] CPU: 0 UID: 0 PID: 39 Comm: khugepaged Not tainted 6.12.0-rc2-00004-g6604438065fc #1
[ 19.883821][ T39] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 19.885086][ T39] RIP: 0010:collapse_huge_page (include/linux/rwsem.h:203 include/linux/mmap_lock.h:70 include/linux/mm.h:735 include/linux/mm.h:754 mm/khugepaged.c:1165)
[ 19.885835][ T39] Code: 0f 85 7f f9 ff ff 48 c7 c2 00 e0 59 84 be 6e 03 00 00 48 c7 c7 60 e0 59 84 c6 05 fd 58 49 04 01 e8 88 4e 7e ff e9 5b f9 ff ff <0f> 0b 48 b8 00 00 00 00 00 fc ff df 4c 89 c2 48 c1 ea 03 80 3c 02
All code
========
0: 0f 85 7f f9 ff ff jne 0xfffffffffffff985
6: 48 c7 c2 00 e0 59 84 mov $0xffffffff8459e000,%rdx
d: be 6e 03 00 00 mov $0x36e,%esi
12: 48 c7 c7 60 e0 59 84 mov $0xffffffff8459e060,%rdi
19: c6 05 fd 58 49 04 01 movb $0x1,0x44958fd(%rip) # 0x449591d
20: e8 88 4e 7e ff callq 0xffffffffff7e4ead
25: e9 5b f9 ff ff jmpq 0xfffffffffffff985
2a:* 0f 0b ud2 <-- trapping instruction
2c: 48 b8 00 00 00 00 00 movabs $0xdffffc0000000000,%rax
33: fc ff df
36: 4c 89 c2 mov %r8,%rdx
39: 48 c1 ea 03 shr $0x3,%rdx
3d: 80 .byte 0x80
3e: 3c 02 cmp $0x2,%al
Code starting with the faulting instruction
===========================================
0: 0f 0b ud2
2: 48 b8 00 00 00 00 00 movabs $0xdffffc0000000000,%rax
9: fc ff df
c: 4c 89 c2 mov %r8,%rdx
f: 48 c1 ea 03 shr $0x3,%rdx
13: 80 .byte 0x80
14: 3c 02 cmp $0x2,%al
[ 19.888059][ T39] RSP: 0000:ffffc90000297998 EFLAGS: 00010246
[ 19.888832][ T39] RAX: 0000000000000000 RBX: 1ffff92000052f39 RCX: 0000000000000000
[ 19.889839][ T39] RDX: 0000000000000000 RSI: ffff88813394afd8 RDI: ffff88810625c320
[ 19.890860][ T39] RBP: ffff8882e908a6c8 R08: ffff8882e908a6d8 R09: ffffed10267295ee
[ 19.891844][ T39] R10: ffff88813394af77 R11: ffff8882ea258440 R12: ffff88813394ae40
[ 19.892865][ T39] R13: ffffffff859ef180 R14: ffffc90000297ab8 R15: ffff88813394af68
[ 19.893868][ T39] FS: 0000000000000000(0000) GS:ffff8883aee00000(0000) knlGS:0000000000000000
[ 19.894985][ T39] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 19.895775][ T39] CR2: 0000000028a5b608 CR3: 0000000133892000 CR4: 00000000000406f0
[ 19.896782][ T39] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 19.897783][ T39] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 19.898793][ T39] Call Trace:
[ 19.902237][ T39] <TASK>
[ 19.902820][ T39] ? collapse_huge_page (include/linux/rwsem.h:203 include/linux/mmap_lock.h:70 include/linux/mm.h:735 include/linux/mm.h:754 mm/khugepaged.c:1165)
[ 19.903524][ T39] ? __warn (kernel/panic.c:748)
[ 19.904099][ T39] ? collapse_huge_page (include/linux/rwsem.h:203 include/linux/mmap_lock.h:70 include/linux/mm.h:735 include/linux/mm.h:754 mm/khugepaged.c:1165)
[ 19.904863][ T39] ? report_bug (lib/bug.c:180 lib/bug.c:219)
[ 19.905477][ T39] ? handle_bug (arch/x86/kernel/traps.c:285)
[ 19.906057][ T39] ? exc_invalid_op (arch/x86/kernel/traps.c:309 (discriminator 1))
[ 19.906674][ T39] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:621)
[ 19.907345][ T39] ? collapse_huge_page (include/linux/rwsem.h:203 include/linux/mmap_lock.h:70 include/linux/mm.h:735 include/linux/mm.h:754 mm/khugepaged.c:1165)
[ 19.908036][ T39] ? collapse_huge_page (include/linux/rwsem.h:203 include/linux/mmap_lock.h:70 include/linux/mm.h:735 include/linux/mm.h:754 mm/khugepaged.c:1165)
[ 19.908739][ T39] ? __pte_offset_map_lock (include/linux/pgtable.h:324 include/linux/pgtable.h:594 mm/pgtable-generic.c:376)
[ 19.909447][ T39] ? __pfx_collapse_huge_page (mm/khugepaged.c:1095)
[ 19.910150][ T39] ? __pfx_lock_acquire (kernel/locking/lockdep.c:5793)
[ 19.910817][ T39] ? __lock_acquire (kernel/locking/lockdep.c:5202)
[ 19.911460][ T39] ? do_raw_spin_lock (arch/x86/include/asm/atomic.h:107 include/linux/atomic/atomic-arch-fallback.h:2170 include/linux/atomic/atomic-instrumented.h:1302 include/asm-generic/qspinlock.h:111 kernel/locking/spinlock_debug.c:116)
[ 19.912111][ T39] ? find_held_lock (kernel/locking/lockdep.c:5315)
[ 19.912753][ T39] ? find_held_lock (kernel/locking/lockdep.c:5315)
[ 19.913389][ T39] ? hpage_collapse_scan_pmd (include/linux/rcupdate.h:347 include/linux/rcupdate.h:880 include/linux/pgtable.h:115 mm/khugepaged.c:1422)
[ 19.914118][ T39] ? __lock_release+0x10b/0x440
[ 19.914823][ T39] ? hpage_collapse_scan_pmd (include/linux/rcupdate.h:347 include/linux/rcupdate.h:880 include/linux/pgtable.h:115 mm/khugepaged.c:1422)
[ 19.915528][ T39] ? hpage_collapse_scan_pmd (include/linux/rcupdate.h:347 include/linux/rcupdate.h:880 include/linux/pgtable.h:115 mm/khugepaged.c:1422)
[ 19.916202][ T39] hpage_collapse_scan_pmd (mm/khugepaged.c:1427)
[ 19.916915][ T39] ? __pfx_hpage_collapse_scan_pmd (mm/khugepaged.c:1261)
[ 19.917678][ T39] ? __thp_vma_allowable_orders (arch/x86/include/asm/bitops.h:206 (discriminator 1) arch/x86/include/asm/bitops.h:238 (discriminator 1) include/asm-generic/bitops/instrumented-non-atomic.h:142 (discriminator 1) mm/huge_memory.c:118 (discriminator 1))
[ 19.918441][ T39] ? __pfx___might_resched (kernel/sched/core.c:8586)
[ 19.919187][ T39] khugepaged_scan_mm_slot+0x8bd/0xd90
[ 19.920082][ T39] ? __pfx_khugepaged_scan_mm_slot+0x10/0x10
[ 19.921041][ T39] ? __pfx_lock_acquire (kernel/locking/lockdep.c:5793)
[ 19.921756][ T39] ? __pfx_do_raw_spin_lock (kernel/locking/spinlock_debug.c:114)
[ 19.922464][ T39] ? __pfx___might_resched (kernel/sched/core.c:8586)
[ 19.923200][ T39] khugepaged (include/linux/spinlock.h:391 mm/khugepaged.c:2521 mm/khugepaged.c:2573)
[ 19.923801][ T39] ? __pfx_khugepaged (mm/khugepaged.c:2566)
[ 19.924471][ T39] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4347 kernel/locking/lockdep.c:4406)
[ 19.925235][ T39] ? __kthread_parkme (arch/x86/include/asm/bitops.h:206 arch/x86/include/asm/bitops.h:238 include/asm-generic/bitops/instrumented-non-atomic.h:142 kernel/kthread.c:280)
[ 19.925865][ T39] ? __pfx_khugepaged (mm/khugepaged.c:2566)
[ 19.926499][ T39] kthread (kernel/kthread.c:389)
[ 19.927083][ T39] ? __pfx_kthread (kernel/kthread.c:342)
[ 19.927775][ T39] ret_from_fork (arch/x86/kernel/process.c:153)
[ 19.928395][ T39] ? __pfx_kthread (kernel/kthread.c:342)
[ 19.929026][ T39] ret_from_fork_asm (arch/x86/entry/entry_64.S:257)
[ 19.929685][ T39] </TASK>
[ 19.930123][ T39] irq event stamp: 951
[ 19.930689][ T39] hardirqs last enabled at (965): __up_console_sem (arch/x86/include/asm/irqflags.h:42 (discriminator 1) arch/x86/include/asm/irqflags.h:97 (discriminator 1) arch/x86/include/asm/irqflags.h:155 (discriminator 1) kernel/printk/printk.c:344 (discriminator 1))
[ 19.931960][ T39] hardirqs last disabled at (978): __up_console_sem (kernel/printk/printk.c:342 (discriminator 1))
[ 19.933120][ T39] softirqs last enabled at (892): handle_softirqs (arch/x86/include/asm/preempt.h:26 kernel/softirq.c:401 kernel/softirq.c:582)
[ 19.934254][ T39] softirqs last disabled at (885): __irq_exit_rcu (kernel/softirq.c:589 kernel/softirq.c:428 kernel/softirq.c:637)
[ 19.935397][ T39] ---[ end trace 0000000000000000 ]---
The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20241023/202410231434.e98a7287-oliver.sang@intel.com
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 28+ messages in thread