* [PATCH 1/1] mm: implement page refcount locking via dedicated bit
2026-02-26 16:27 [PATCH 0/1] mm: improve folio refcount scalability Gladyshev Ilya
@ 2026-02-26 16:27 ` Gladyshev Ilya
2026-02-28 22:19 ` [PATCH 0/1] mm: improve folio refcount scalability Andrew Morton
1 sibling, 0 replies; 4+ messages in thread
From: Gladyshev Ilya @ 2026-02-26 16:27 UTC (permalink / raw)
To: Ilya Gladyshev
Cc: Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Zi Yan, Harry Yoo,
Matthew Wilcox, Yu Zhao, Baolin Wang, Alistair Popple,
Gorbunov Ivan, Muchun Song, linux-mm, linux-kernel,
Kiryl Shutsemau
The current atomic-based page refcount implementation treats zero
counter as dead and requires a compare-and-swap loop in folio_try_get()
to prevent incrementing a dead refcount. This CAS loop acts as a
serialization point and can become a significant bottleneck during
high-frequency file read operations.
This patch introduces PAGEREF_LOCKED_BIT to distinguish between a
(temporary) zero refcount and a locked (dead/frozen) state. Because now
incrementing the counter doesn't affect its locked/unlocked state, it is
possible to use an optimistic atomic_add_return() in
page_ref_add_unless_zero() that operates independently of the locked bit.
The locked state is handled after the increment attempt, eliminating the
need for the CAS loop.
If a locked state is detected after atomic_add_return(), the pageref
counter will be reset using a CAS loop, eliminating the theoretical
possibility of overflow.
Co-developed-by: Gorbunov Ivan <gorbunov.ivan@h-partners.com>
Signed-off-by: Gorbunov Ivan <gorbunov.ivan@h-partners.com>
Signed-off-by: Gladyshev Ilya <gladyshev.ilya1@h-partners.com>
---
include/linux/page-flags.h | 5 ++++-
include/linux/page_ref.h | 28 ++++++++++++++++++++++++----
2 files changed, 28 insertions(+), 5 deletions(-)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 7c2195baf4c1..f2a9302104eb 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -196,6 +196,9 @@ enum pageflags {
#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
+/* Most significant bit in page refcount */
+#define PAGEREF_LOCKED_BIT (1 << 31)
+
#ifndef __GENERATING_BOUNDS_H
#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
@@ -257,7 +260,7 @@ static __always_inline bool page_count_writable(const struct page *page)
* The refcount check also prevents modification attempts to other (r/o)
* tail pages that are not fake heads.
*/
- if (!atomic_read_acquire(&page->_refcount))
+ if (atomic_read_acquire(&page->_refcount) & PAGEREF_LOCKED_BIT)
return false;
return page_fixed_fake_head(page) == page;
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index b0e3f4a4b4b8..f2f2775af4bb 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -64,7 +64,12 @@ static inline void __page_ref_unfreeze(struct page *page, int v)
static inline int page_ref_count(const struct page *page)
{
- return atomic_read(&page->_refcount);
+ int val = atomic_read(&page->_refcount);
+
+ if (unlikely(val & PAGEREF_LOCKED_BIT))
+ return 0;
+
+ return val;
}
/**
@@ -176,6 +181,9 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
{
int ret = atomic_sub_and_test(nr, &page->_refcount);
+ if (ret)
+ ret = !atomic_cmpxchg_relaxed(&page->_refcount, 0, PAGEREF_LOCKED_BIT);
+
if (page_ref_tracepoint_active(page_ref_mod_and_test))
__page_ref_mod_and_test(page, -nr, ret);
return ret;
@@ -204,6 +212,9 @@ static inline int page_ref_dec_and_test(struct page *page)
{
int ret = atomic_dec_and_test(&page->_refcount);
+ if (ret)
+ ret = !atomic_cmpxchg_relaxed(&page->_refcount, 0, PAGEREF_LOCKED_BIT);
+
if (page_ref_tracepoint_active(page_ref_mod_and_test))
__page_ref_mod_and_test(page, -1, ret);
return ret;
@@ -228,14 +239,23 @@ static inline int folio_ref_dec_return(struct folio *folio)
return page_ref_dec_return(&folio->page);
}
+#define _PAGEREF_LOCKED_LIMIT ((1 << 30) | PAGEREF_LOCKED_BIT)
+
static inline bool page_ref_add_unless_zero(struct page *page, int nr)
{
bool ret = false;
+ int val;
rcu_read_lock();
/* avoid writing to the vmemmap area being remapped */
- if (page_count_writable(page))
- ret = atomic_add_unless(&page->_refcount, nr, 0);
+ if (page_count_writable(page)) {
+ val = atomic_add_return(nr, &page->_refcount);
+ ret = !(val & PAGEREF_LOCKED_BIT);
+
+ /* Undo atomic_add() if counter is locked and scary big */
+ while (unlikely((unsigned int)val >= _PAGEREF_LOCKED_LIMIT))
+ val = atomic_cmpxchg_relaxed(&page->_refcount, val, PAGEREF_LOCKED_BIT);
+ }
rcu_read_unlock();
if (page_ref_tracepoint_active(page_ref_mod_unless))
@@ -271,7 +291,7 @@ static inline bool folio_ref_try_add(struct folio *folio, int count)
static inline int page_ref_freeze(struct page *page, int count)
{
- int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
+ int ret = likely(atomic_cmpxchg(&page->_refcount, count, PAGEREF_LOCKED_BIT) == count);
if (page_ref_tracepoint_active(page_ref_freeze))
__page_ref_freeze(page, count, ret);
--
2.43.0
^ permalink raw reply [flat|nested] 4+ messages in thread

* Re: [PATCH 0/1] mm: improve folio refcount scalability
2026-02-26 16:27 [PATCH 0/1] mm: improve folio refcount scalability Gladyshev Ilya
2026-02-26 16:27 ` [PATCH 1/1] mm: implement page refcount locking via dedicated bit Gladyshev Ilya
@ 2026-02-28 22:19 ` Andrew Morton
2026-03-01 3:27 ` Linus Torvalds
1 sibling, 1 reply; 4+ messages in thread
From: Andrew Morton @ 2026-02-28 22:19 UTC (permalink / raw)
To: Gladyshev Ilya
Cc: David Hildenbrand, Lorenzo Stoakes, Liam R . Howlett,
Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
Zi Yan, Harry Yoo, Matthew Wilcox, Yu Zhao, Baolin Wang,
Alistair Popple, Gorbunov Ivan, Muchun Song, linux-mm,
linux-kernel, Kiryl Shutsemau, Dave Chinner, Linus Torvalds
On Thu, 26 Feb 2026 16:27:22 +0000 Gladyshev Ilya <gladyshev.ilya1@h-partners.com> wrote:
> This patch was previously posted as an RFC and received positive, but
> little, feedback. So I decided to fix remaining drawbacks and repost it
> as non-RFC patch. Overall logic, as well as performance, remained the
> same.
>
> Intro
> =====
> This patch optimizes small file read performance and overall folio refcount
> scalability by refactoring page_ref_add_unless [core of folio_try_get].
> This is alternative approach to previous attempts to fix small read
> performance by avoiding refcount bumps [1][2].
>
> Overview
> ========
> Current refcount implementation is using zero counter as locked (dead/frozen)
> state, which required CAS loop for increments to avoid temporary unlocks in
> try_get functions. These CAS loops became a serialization point for otherwise
> scalable and fast read side.
>
> Proposed implementation separates "locked" logic from the counting, allowing
> the use of optimistic fetch_add() instead of CAS. For more details, please
> refer to the commit message of the patch itself.
>
> Proposed logic maintains the same public API as before, including all existing
> memory barrier guarantees.
>
> Performance
> ===========
> Performance was measured using a simple custom benchmark based on
> will-it-scale[3]. This benchmark spawns N pinned threads/processes that
> execute the following loop:
> ``
> char buf[]
> fd = open(/* same file in tmpfs */);
>
> while (true) {
> pread(fd, buf, /* read size = */ 64, /* offset = */0)
> }
> ``
> While this is a synthetic load, it does highlight existing issue and
> doesn't differ a lot from benchmarking in [2] patch.
Well it's nice to see the performance benefits from Kiryl's ill-fated
patch
(https://lore.kernel.org/linux-mm/20251017141536.577466-1-kirill@shutemov.name/)
And this approach looks far simpler.
I'll paste the single patch below for others - I think it's not
desirable to prepare a [0/N] for a single-patch "series"!
Thanks, I'll await reviewer feedback for a couple of days then I'll
look at adding this to linux-next for some runtime testing.
> This benchmark measures operations per second in the inner loop and the
> results across all workers. Performance was tested on top of v6.15 kernel[4]
> on two platforms. Since threads and processes showed similar performance on
> both systems, only the thread results are provided below. The performance
> improvement scales linearly between the CPU counts shown.
>
> Platform 1: 2 x E5-2690 v3, 12C/12T each [disabled SMT]
>
> #threads | vanilla | patched | boost (%)
> 1 | 1343381 | 1344401 | +0.1
> 2 | 2186160 | 2455837 | +12.3
> 5 | 5277092 | 6108030 | +15.7
> 10 | 5858123 | 7506328 | +28.1
> 12 | 6484445 | 8137706 | +25.5
> /* Cross socket NUMA */
> 14 | 3145860 | 4247391 | +35.0
> 16 | 2350840 | 4262707 | +81.3
> 18 | 2378825 | 4121415 | +73.2
> 20 | 2438475 | 4683548 | +92.1
> 24 | 2325998 | 4529737 | +94.7
>
> Platform 2: 2 x AMD EPYC 9654, 96C/192T each [enabled SMT]
>
> #threads | vanilla | patched | boost (%)
> 1 | 1077276 | 1081653 | +0.4
> 5 | 4286838 | 4682513 | +9.2
> 10 | 1698095 | 1902753 | +12.1
> 20 | 1662266 | 1921603 | +15.6
> 49 | 1486745 | 1828926 | +23.0
> 97 | 1617365 | 2052635 | +26.9
> /* Cross socket NUMA */
> 105 | 1368319 | 1798862 | +31.5
> 136 | 1008071 | 1393055 | +38.2
> 168 | 879332 | 1245210 | +41.6
> /* SMT */
> 193 | 905432 | 1294833 | +43.0
> 289 | 851988 | 1313110 | +54.1
> 353 | 771288 | 1347165 | +74.7
>
> [1] https://lore.kernel.org/linux-mm/CAHk-=wj00-nGmXEkxY=-=Z_qP6kiGUziSFvxHJ9N-cLWry5zpA@mail.gmail.com/
> [2] https://lore.kernel.org/linux-mm/20251017141536.577466-1-kirill@shutemov.name/
> [3] https://github.com/antonblanchard/will-it-scale
> [4] There were no changes to page_ref.h between v6.15 and v6.18 or any
> significant performance changes on the read side in mm/filemap.c
>
> The current atomic-based page refcount implementation treats zero
> counter as dead and requires a compare-and-swap loop in folio_try_get()
> to prevent incrementing a dead refcount. This CAS loop acts as a
> serialization point and can become a significant bottleneck during
> high-frequency file read operations.
>
> This patch introduces PAGEREF_LOCKED_BIT to distinguish between a
> (temporary) zero refcount and a locked (dead/frozen) state. Because now
> incrementing the counter doesn't affect its locked/unlocked state, it is
> possible to use an optimistic atomic_add_return() in
> page_ref_add_unless_zero() that operates independently of the locked bit.
> The locked state is handled after the increment attempt, eliminating the
> need for the CAS loop.
>
> If a locked state is detected after atomic_add_return(), the pageref
> counter will be reset using a CAS loop, eliminating the theoretical
> possibility of overflow.
>
> Co-developed-by: Gorbunov Ivan <gorbunov.ivan@h-partners.com>
> Signed-off-by: Gorbunov Ivan <gorbunov.ivan@h-partners.com>
> Signed-off-by: Gladyshev Ilya <gladyshev.ilya1@h-partners.com>
> ---
> include/linux/page-flags.h | 5 ++++-
> include/linux/page_ref.h | 28 ++++++++++++++++++++++++----
> 2 files changed, 28 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 7c2195baf4c1..f2a9302104eb 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -196,6 +196,9 @@ enum pageflags {
>
> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
>
> +/* Most significant bit in page refcount */
> +#define PAGEREF_LOCKED_BIT (1 << 31)
> +
> #ifndef __GENERATING_BOUNDS_H
>
> #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> @@ -257,7 +260,7 @@ static __always_inline bool page_count_writable(const struct page *page)
> * The refcount check also prevents modification attempts to other (r/o)
> * tail pages that are not fake heads.
> */
> - if (!atomic_read_acquire(&page->_refcount))
> + if (atomic_read_acquire(&page->_refcount) & PAGEREF_LOCKED_BIT)
> return false;
>
> return page_fixed_fake_head(page) == page;
> diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
> index b0e3f4a4b4b8..f2f2775af4bb 100644
> --- a/include/linux/page_ref.h
> +++ b/include/linux/page_ref.h
> @@ -64,7 +64,12 @@ static inline void __page_ref_unfreeze(struct page *page, int v)
>
> static inline int page_ref_count(const struct page *page)
> {
> - return atomic_read(&page->_refcount);
> + int val = atomic_read(&page->_refcount);
> +
> + if (unlikely(val & PAGEREF_LOCKED_BIT))
> + return 0;
> +
> + return val;
> }
>
> /**
> @@ -176,6 +181,9 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
> {
> int ret = atomic_sub_and_test(nr, &page->_refcount);
>
> + if (ret)
> + ret = !atomic_cmpxchg_relaxed(&page->_refcount, 0, PAGEREF_LOCKED_BIT);
> +
> if (page_ref_tracepoint_active(page_ref_mod_and_test))
> __page_ref_mod_and_test(page, -nr, ret);
> return ret;
> @@ -204,6 +212,9 @@ static inline int page_ref_dec_and_test(struct page *page)
> {
> int ret = atomic_dec_and_test(&page->_refcount);
>
> + if (ret)
> + ret = !atomic_cmpxchg_relaxed(&page->_refcount, 0, PAGEREF_LOCKED_BIT);
> +
> if (page_ref_tracepoint_active(page_ref_mod_and_test))
> __page_ref_mod_and_test(page, -1, ret);
> return ret;
> @@ -228,14 +239,23 @@ static inline int folio_ref_dec_return(struct folio *folio)
> return page_ref_dec_return(&folio->page);
> }
>
> +#define _PAGEREF_LOCKED_LIMIT ((1 << 30) | PAGEREF_LOCKED_BIT)
> +
> static inline bool page_ref_add_unless_zero(struct page *page, int nr)
> {
> bool ret = false;
> + int val;
>
> rcu_read_lock();
> /* avoid writing to the vmemmap area being remapped */
> - if (page_count_writable(page))
> - ret = atomic_add_unless(&page->_refcount, nr, 0);
> + if (page_count_writable(page)) {
> + val = atomic_add_return(nr, &page->_refcount);
> + ret = !(val & PAGEREF_LOCKED_BIT);
> +
> + /* Undo atomic_add() if counter is locked and scary big */
> + while (unlikely((unsigned int)val >= _PAGEREF_LOCKED_LIMIT))
> + val = atomic_cmpxchg_relaxed(&page->_refcount, val, PAGEREF_LOCKED_BIT);
> + }
> rcu_read_unlock();
>
> if (page_ref_tracepoint_active(page_ref_mod_unless))
> @@ -271,7 +291,7 @@ static inline bool folio_ref_try_add(struct folio *folio, int count)
>
> static inline int page_ref_freeze(struct page *page, int count)
> {
> - int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
> + int ret = likely(atomic_cmpxchg(&page->_refcount, count, PAGEREF_LOCKED_BIT) == count);
>
> if (page_ref_tracepoint_active(page_ref_freeze))
> __page_ref_freeze(page, count, ret);
> --
>
>
^ permalink raw reply [flat|nested] 4+ messages in thread