* [RFC] making nested spin_trylock() work on UP?
@ 2026-02-13 11:57 Vlastimil Babka
2026-02-14 6:28 ` Matthew Wilcox
0 siblings, 1 reply; 6+ messages in thread
From: Vlastimil Babka @ 2026-02-13 11:57 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Will Deacon, Sebastian Andrzej Siewior
Cc: LKML, linux-mm, Linus Torvalds, Waiman Long, Mel Gorman,
Matthew Wilcox, Steven Rostedt
Hi,
this is not a real RFC PATCH, but more of a discussion about a possible
direction. I wanted to have a patch at hand, but the layers of spinlock APIs
are rather complex for me to untangle, so I'd rather know first whether it's
even worth trying.
The page allocator has been using a locking scheme for its percpu page
caches (pcp) for years now, based on spin_trylock() with no _irqsave() part.
The point is that if we interrupt the locked section, we fail the trylock
and just fall back to something more expensive; but that's rare, so we don't
need to pay the irqsave cost all the time in the fastpaths.
It's similar to, but not exactly, local_trylock_t (which is also newer
anyway), because in some cases we lock the pcp of a non-local cpu to flush
it, in a way that's cheaper than an IPI or queue_work_on().
The complication of this scheme has been the UP non-debug spinlock
implementation, which assumes spin_trylock() can't fail on UP and has no
state to track it. It just doesn't anticipate this usage scenario. So to
work around that we disable IRQs on UP, complicating the implementation.
Also, we recently found a years-old bug in the implementation - see
038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n").
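To illustrate the problem concretely, here is a minimal userspace sketch (not kernel code; the type and function names are made up for the example) of what the stateless UP non-debug trylock amounts to - with no lock state at all, a trylock from an interrupting context succeeds even while the interrupted context is inside the critical section:

```c
#include <assert.h>

/* Stateless "UP spinlock": mirrors the non-debug UP behavior where the
 * trylock unconditionally reports success because there is no state. */
typedef struct { char dummy; } up_spinlock_t;

static int up_spin_trylock(up_spinlock_t *lock)
{
	(void)lock;
	return 1;	/* always "succeeds" */
}

static void up_spin_unlock(up_spinlock_t *lock)
{
	(void)lock;	/* nothing to release */
}

/* Simulate an interrupt arriving inside the locked section: the nested
 * trylock cannot fail, so both contexts enter the critical section. */
static int nested_trylock_succeeds(void)
{
	up_spinlock_t lock = { 0 };
	int outer = up_spin_trylock(&lock);
	int nested = up_spin_trylock(&lock);	/* should fail, but can't */

	up_spin_unlock(&lock);
	return outer && nested;
}
```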
So my question is whether we could have a spinlock implementation supporting
this nested spin_trylock() usage, or whether the UP optimization is still
considered too important to lose. I was thinking:
- remove the UP implementation completely - would it increase the overhead
on SMP=n systems too much, and do we still care?
- make the non-debug implementation a bit like the debug one, so we do have
the 'locked' state (see include/linux/spinlock_up.h and lock->slock). This
also adds some overhead, but not as much as the full SMP implementation?
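The second option could look roughly like this userspace sketch (hypothetical names; assuming a one-byte 'locked' field similar to the debug implementation's lock->slock), which gives the trylock an actual failure mode:

```c
#include <assert.h>

/* UP spinlock with a tracked 'locked' state, like the debug variant. */
typedef struct { unsigned char locked; } up_slock_t;

static int up_slock_trylock(up_slock_t *lock)
{
	if (lock->locked)
		return 0;	/* a nested attempt now correctly fails */
	lock->locked = 1;
	return 1;
}

static void up_slock_unlock(up_slock_t *lock)
{
	lock->locked = 0;
}
```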
Below is how this would simplify page_alloc.c.
Thanks,
Vlastimil
From 7a0f233ec0ae46324b2db6a09944e93c7cb14459 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Fri, 13 Feb 2026 12:51:02 +0100
Subject: [PATCH] mm/page_alloc: simplify as if UP spin_trylock() was reliable
---
mm/page_alloc.c | 111 +++++++++++++-----------------------------------
1 file changed, 30 insertions(+), 81 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d312ebaa1e77..f147126b6c06 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -95,23 +95,6 @@ typedef int __bitwise fpi_t;
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
-/*
- * On SMP, spin_trylock is sufficient protection.
- * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
- * Pass flags to a no-op inline function to typecheck and silence the unused
- * variable warning.
- */
-static inline void __pcp_trylock_noop(unsigned long *flags) { }
-#define pcp_trylock_prepare(flags) __pcp_trylock_noop(&(flags))
-#define pcp_trylock_finish(flags) __pcp_trylock_noop(&(flags))
-#else
-
-/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
-#define pcp_trylock_prepare(flags) local_irq_save(flags)
-#define pcp_trylock_finish(flags) local_irq_restore(flags)
-#endif
-
/*
* Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
* a migration causing the wrong PCP to be locked and remote memory being
@@ -151,48 +134,22 @@ static inline void __pcp_trylock_noop(unsigned long *flags) { }
})
/* struct per_cpu_pages specific helpers. */
-#define pcp_spin_trylock(ptr, UP_flags) \
+#define pcp_spin_trylock(ptr) \
({ \
struct per_cpu_pages *__ret; \
- pcp_trylock_prepare(UP_flags); \
__ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \
- if (!__ret) \
- pcp_trylock_finish(UP_flags); \
__ret; \
})
-#define pcp_spin_unlock(ptr, UP_flags) \
+#define pcp_spin_unlock(ptr) \
({ \
pcpu_spin_unlock(lock, ptr); \
- pcp_trylock_finish(UP_flags); \
})
-/*
- * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (for i.e.
- * a potentially remote cpu drain) and get interrupted by an operation that
- * attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP
- * spinlock assumptions making the trylock a no-op. So we have to turn that
- * spin_lock() to a spin_lock_irqsave(). This works because on UP there are no
- * remote cpu's so we can only be locking the only existing local one.
- */
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
-static inline void __flags_noop(unsigned long *flags) { }
-#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
-({ \
- __flags_noop(&(flags)); \
- spin_lock(&(ptr)->lock); \
-})
-#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
-({ \
- spin_unlock(&(ptr)->lock); \
- __flags_noop(&(flags)); \
-})
-#else
-#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
- spin_lock_irqsave(&(ptr)->lock, flags)
-#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
- spin_unlock_irqrestore(&(ptr)->lock, flags)
-#endif
+#define pcp_spin_lock_nopin(ptr) \
+ spin_lock(&(ptr)->lock)
+#define pcp_spin_unlock_nopin(ptr) \
+ spin_unlock(&(ptr)->lock)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -2583,7 +2540,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
{
int high_min, to_drain, to_drain_batched, batch;
- unsigned long UP_flags;
bool todo = false;
high_min = READ_ONCE(pcp->high_min);
@@ -2603,9 +2559,9 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
to_drain = pcp->count - pcp->high;
while (to_drain > 0) {
to_drain_batched = min(to_drain, batch);
- pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ pcp_spin_lock_nopin(pcp);
free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
- pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ pcp_spin_unlock_nopin(pcp);
todo = true;
to_drain -= to_drain_batched;
@@ -2622,15 +2578,14 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
- unsigned long UP_flags;
int to_drain, batch;
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0) {
- pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ pcp_spin_lock_nopin(pcp);
free_pcppages_bulk(zone, to_drain, pcp, 0);
- pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ pcp_spin_unlock_nopin(pcp);
}
}
#endif
@@ -2641,11 +2596,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- unsigned long UP_flags;
int count;
do {
- pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ pcp_spin_lock_nopin(pcp);
count = pcp->count;
if (count) {
int to_drain = min(count,
@@ -2654,7 +2608,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
free_pcppages_bulk(zone, to_drain, pcp, 0);
count -= to_drain;
}
- pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ pcp_spin_unlock_nopin(pcp);
} while (count);
}
@@ -2853,7 +2807,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
*/
static bool free_frozen_page_commit(struct zone *zone,
struct per_cpu_pages *pcp, struct page *page, int migratetype,
- unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags)
+ unsigned int order, fpi_t fpi_flags)
{
int high, batch;
int to_free, to_free_batched;
@@ -2913,9 +2867,9 @@ static bool free_frozen_page_commit(struct zone *zone,
if (to_free == 0 || pcp->count == 0)
break;
- pcp_spin_unlock(pcp, *UP_flags);
+ pcp_spin_unlock(pcp);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp) {
ret = false;
break;
@@ -2927,7 +2881,7 @@ static bool free_frozen_page_commit(struct zone *zone,
* returned in an unlocked state.
*/
if (smp_processor_id() != cpu) {
- pcp_spin_unlock(pcp, *UP_flags);
+ pcp_spin_unlock(pcp);
ret = false;
break;
}
@@ -2959,7 +2913,6 @@ static bool free_frozen_page_commit(struct zone *zone,
static void __free_frozen_pages(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
- unsigned long UP_flags;
struct per_cpu_pages *pcp;
struct zone *zone;
unsigned long pfn = page_to_pfn(page);
@@ -2995,12 +2948,12 @@ static void __free_frozen_pages(struct page *page, unsigned int order,
add_page_to_zone_llist(zone, page, order);
return;
}
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
if (!free_frozen_page_commit(zone, pcp, page, migratetype,
- order, fpi_flags, &UP_flags))
+ order, fpi_flags))
return;
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
} else {
free_one_page(zone, page, pfn, order, fpi_flags);
}
@@ -3021,7 +2974,6 @@ void free_frozen_pages_nolock(struct page *page, unsigned int order)
*/
void free_unref_folios(struct folio_batch *folios)
{
- unsigned long UP_flags;
struct per_cpu_pages *pcp = NULL;
struct zone *locked_zone = NULL;
int i, j;
@@ -3064,7 +3016,7 @@ void free_unref_folios(struct folio_batch *folios)
if (zone != locked_zone ||
is_migrate_isolate(migratetype)) {
if (pcp) {
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
locked_zone = NULL;
pcp = NULL;
}
@@ -3083,7 +3035,7 @@ void free_unref_folios(struct folio_batch *folios)
* trylock is necessary as folios may be getting freed
* from IRQ or SoftIRQ context after an IO completion.
*/
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (unlikely(!pcp)) {
free_one_page(zone, &folio->page, pfn,
order, FPI_NONE);
@@ -3101,14 +3053,14 @@ void free_unref_folios(struct folio_batch *folios)
trace_mm_page_free_batched(&folio->page);
if (!free_frozen_page_commit(zone, pcp, &folio->page,
- migratetype, order, FPI_NONE, &UP_flags)) {
+ migratetype, order, FPI_NONE)) {
pcp = NULL;
locked_zone = NULL;
}
}
if (pcp)
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
folio_batch_reinit(folios);
}
@@ -3359,10 +3311,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
- unsigned long UP_flags;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp)
return NULL;
@@ -3374,7 +3325,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
pcp->free_count >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
@@ -5062,7 +5013,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
struct page **page_array)
{
struct page *page;
- unsigned long UP_flags;
struct zone *zone;
struct zoneref *z;
struct per_cpu_pages *pcp;
@@ -5156,7 +5106,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto failed;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp)
goto failed;
@@ -5175,7 +5125,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
if (unlikely(!page)) {
/* Try and allocate at least one page */
if (!nr_account) {
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
goto failed;
}
break;
@@ -5187,7 +5137,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
page_array[nr_populated++] = page;
}
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);
@@ -6144,7 +6094,6 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
{
struct per_cpu_pages *pcp;
struct cpu_cacheinfo *cci;
- unsigned long UP_flags;
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
cci = get_cpu_cacheinfo(cpu);
@@ -6155,12 +6104,12 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
* This can reduce zone lock contention without hurting
* cache-hot pages sharing.
*/
- pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ pcp_spin_lock_nopin(pcp);
if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
pcp->flags |= PCPF_FREE_HIGH_BATCH;
else
pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
- pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ pcp_spin_unlock_nopin(pcp);
}
void setup_pcp_cacheinfo(unsigned int cpu)
--
2.53.0
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [RFC] making nested spin_trylock() work on UP?
2026-02-13 11:57 [RFC] making nested spin_trylock() work on UP? Vlastimil Babka
@ 2026-02-14 6:28 ` Matthew Wilcox
2026-02-14 16:32 ` Linus Torvalds
2026-04-15 18:44 ` Harry Yoo (Oracle)
0 siblings, 2 replies; 6+ messages in thread
From: Matthew Wilcox @ 2026-02-14 6:28 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Peter Zijlstra, Ingo Molnar, Will Deacon,
Sebastian Andrzej Siewior, LKML, linux-mm, Linus Torvalds,
Waiman Long, Mel Gorman, Steven Rostedt
On Fri, Feb 13, 2026 at 12:57:43PM +0100, Vlastimil Babka wrote:
> The page allocator has been using a locking scheme for its percpu page
> caches (pcp) for years now, based on spin_trylock() with no _irqsave() part.
> The point is that if we interrupt the locked section, we fail the trylock
> and just fallback to something that's more expensive, but it's rare so we
> don't need to pay the irqsave cost all the time in the fastpaths.
>
> It's similar to but not exactly local_trylock_t (which is also newer anyway)
> because in some cases we do lock the pcp of a non-local cpu to flush it, in
> a way that's cheaper than IPI or queue_work_on().
>
> The complication of this scheme has been UP non-debug spinlock
> implementation which assumes spin_trylock() can't fail on UP and has no
> state to track it. It just doesn't anticipate this usage scenario. So to
> work around that we disable IRQs on UP, complicating the implementation.
> Also recently we found years old bug in the implementation - see
> 038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n").
>
> So my question is if we could have spinlock implementation supporting this
> nested spin_trylock() usage, or if the UP optimization is still considered
> too important to lose it. I was thinking:
>
> - remove the UP implementation completely - would it increase the overhead
> on SMP=n systems too much and do we still care?
>
> - make the non-debug implementation a bit like the debug one so we do have
> the 'locked' state (see include/linux/spinlock_up.h and lock->slock). This
> also adds some overhead but not as much as the full SMP implementation?
What if we use an atomic_t on UP to simulate there being a spinlock,
but only for pcp? Your demo shows pcp_spin_trylock() continuing to
exist, so how about doing something like:
#ifdef CONFIG_SMP
#define pcp_spin_trylock(ptr) \
({ \
struct per_cpu_pages *__ret; \
__ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \
__ret; \
})
#else
static atomic_t pcp_UP_lock = ATOMIC_INIT(0);
#define pcp_spin_trylock(ptr) \
({ \
struct per_cpu_pages *__ret = NULL; \
if (atomic_try_cmpxchg(&pcp_UP_lock, 0, 1)) \
__ret = (void *)&pcp_UP_lock; \
__ret; \
})
#endif
(obviously you need pcp_spin_lock/pcp_spin_unlock also defined)
That only costs us 4 extra bytes on UP, rather than 4 bytes per spinlock.
And some people still use routers with tiny amounts of memory and a
single CPU, or retrocomputers with single CPUs.
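In case it helps, the lock/unlock counterparts could be sketched like this in userspace C11 (stdatomic standing in for the kernel's atomic_t; names are hypothetical, and the real thing would presumably also keep preemption disabled across the section):

```c
#include <assert.h>
#include <stdatomic.h>
#include <stddef.h>

/* A single global flag stands in for every pcp "lock" on UP. */
static atomic_int pcp_UP_lock;

static void *pcp_up_trylock(void)
{
	int expected = 0;

	if (atomic_compare_exchange_strong(&pcp_UP_lock, &expected, 1))
		return (void *)&pcp_UP_lock;	/* non-NULL cookie on success */
	return NULL;				/* already held: nested caller */
}

static void pcp_up_unlock(void)
{
	atomic_store(&pcp_UP_lock, 0);
}
```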
* Re: [RFC] making nested spin_trylock() work on UP?
2026-02-14 6:28 ` Matthew Wilcox
@ 2026-02-14 16:32 ` Linus Torvalds
2026-02-16 10:32 ` Vlastimil Babka
2026-04-15 18:44 ` Harry Yoo (Oracle)
1 sibling, 1 reply; 6+ messages in thread
From: Linus Torvalds @ 2026-02-14 16:32 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Vlastimil Babka, Peter Zijlstra, Ingo Molnar, Will Deacon,
Sebastian Andrzej Siewior, LKML, linux-mm, Waiman Long,
Mel Gorman, Steven Rostedt
On Fri, 13 Feb 2026 at 22:29, Matthew Wilcox <willy@infradead.org> wrote:
>
> What if we use an atomic_t on UP to simulate there being a spinlock,
> but only for pcp?
Yes. Please just wrap this - very unusual - use case with a special
wrapper that can then be entirely different for UP and SMP, and use
something like the suggested "pcp_lock" that becomes a spinlock on
smp, and just a tracking variable on UP.
And I don't think it needs to even be marked as 'atomic_t' on UP - the
value is going to be idempotent even when modified from interrupts
(because it will just be modified back), so no need for any special
logic, I think. The generic 'atomic_t' ops on UP disable interrupts,
which is horrendous.
Changing spinlocks globally on UP to be something they haven't been
before does not sound like a good idea, particularly since no actual
developer uses UP any more (and honestly, UP is dead outside of very
low-end platforms or legacy like 68k).
Linus
* Re: [RFC] making nested spin_trylock() work on UP?
2026-02-14 16:32 ` Linus Torvalds
@ 2026-02-16 10:32 ` Vlastimil Babka
0 siblings, 0 replies; 6+ messages in thread
From: Vlastimil Babka @ 2026-02-16 10:32 UTC (permalink / raw)
To: Linus Torvalds, Matthew Wilcox
Cc: Peter Zijlstra, Ingo Molnar, Will Deacon,
Sebastian Andrzej Siewior, LKML, linux-mm, Waiman Long,
Mel Gorman, Steven Rostedt, David Hildenbrand (Arm)
On 2/14/26 17:32, Linus Torvalds wrote:
> On Fri, 13 Feb 2026 at 22:29, Matthew Wilcox <willy@infradead.org> wrote:
>>
>> What if we use an atomic_t on UP to simulate there being a spinlock,
>> but only for pcp?
>
> Yes. Please just wrap this - very unusual - use case with a special
> wrapper that can then be entirely different for UP and SMP, and use
> something like the suggested "pcp_lock" that becomes a spinlock on
> smp, and just a tracking variable on UP.
>
> And I don't think it needs to even be marked as 'atomic_t' on UP - the
> value is going to be idempotent even when modified from interrupts
> (because it will just be modified back), so no need for any special
> logic, I think. The generic 'atomic_t' ops on UP disable interrupts,
> which is horrendous.
I think we could be using local_trylock_t, which does exactly this without
atomic_t.
However, David had an even better suggestion (in a chat): the pcp cache is
unnecessary for scalability on UP anyway, so we can just have the trylock
wrapper do nothing and fail unconditionally with SMP=n. Then all the
cleanups I wanted are immediately possible, we'll even save some memory that
would be cached unnecessarily on those small systems, and some code will be
eliminated as dead. So I'll be going with that.
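As a sketch of that direction (not the eventual patch - the names here are illustrative), the SMP=n trylock wrapper would reduce to an unconditional failure, so every caller compiles down to its fallback path and the pcp fastpath becomes dead code:

```c
#include <assert.h>
#include <stddef.h>

struct per_cpu_pages;	/* opaque for the sketch */

/* SMP=n: no pcp cache at all, so the trylock never succeeds. */
static struct per_cpu_pages *pcp_trylock_up(void *pcp_ptr)
{
	(void)pcp_ptr;
	return NULL;
}

/* Caller pattern: try the pcp fastpath, otherwise fall back
 * (e.g. to freeing directly to the zone's buddy lists). */
static int free_page_path(void)
{
	if (pcp_trylock_up(NULL))
		return 0;	/* pcp fastpath: unreachable with SMP=n */
	return 1;		/* fallback path, always taken */
}
```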
> Changing spinlocks globally on UP to be something they haven't been
> before does not sound like a good idea, particularly since no actual
> developer uses UP any more (and honestly, UP is dead outside of very
> low-end platforms or legacy like 68k).
Ack, thanks.
> Linus
* Re: [RFC] making nested spin_trylock() work on UP?
2026-02-14 6:28 ` Matthew Wilcox
2026-02-14 16:32 ` Linus Torvalds
@ 2026-04-15 18:44 ` Harry Yoo (Oracle)
1 sibling, 0 replies; 6+ messages in thread
From: Harry Yoo (Oracle) @ 2026-04-15 18:44 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Vlastimil Babka, Peter Zijlstra, Ingo Molnar, Will Deacon,
Sebastian Andrzej Siewior, LKML, linux-mm, Linus Torvalds,
Waiman Long, Mel Gorman, Steven Rostedt, Alexei Starovoitov,
Hao Li, Andrew Morton, Suren Baghdasaryan, Michal Hocko,
Brendan Jackman, Johannes Weiner, Zi Yan, Christoph Lameter,
David Rientjes, Roman Gushchin
[+Cc Alexei for _nolock() APIs]
[+Cc SLAB ALLOCATOR and PAGE ALLOCATOR folks]
I was testing kmalloc_nolock() on UP and I think I'm dealing with a
similar issue...
On Sat, Feb 14, 2026 at 06:28:43AM +0000, Matthew Wilcox wrote:
> On Fri, Feb 13, 2026 at 12:57:43PM +0100, Vlastimil Babka wrote:
> > The page allocator has been using a locking scheme for its percpu page
> > caches (pcp) for years now, based on spin_trylock() with no _irqsave() part.
> > The point is that if we interrupt the locked section, we fail the trylock
> > and just fallback to something that's more expensive, but it's rare so we
> > don't need to pay the irqsave cost all the time in the fastpaths.
> >
> > It's similar to but not exactly local_trylock_t (which is also newer anyway)
> > because in some cases we do lock the pcp of a non-local cpu to flush it, in
> > a way that's cheaper than IPI or queue_work_on().
> >
> > The complication of this scheme has been UP non-debug spinlock
> > implementation which assumes spin_trylock() can't fail on UP and has no
> > state to track it. It just doesn't anticipate this usage scenario.
This is not the only scenario that doesn't work.
I was testing the "calling {kmalloc,kfree}_nolock() in an NMI handler
while the CPU is calling kmalloc() & kfree()" scenario [1].
Weirdly, it has been broken on UP (dmesg at the end of the email) since
v6.18, when the {kmalloc,kfree}_nolock() APIs were introduced.
[1] https://lore.kernel.org/linux-mm/20260406090907.11710-3-harry@kernel.org
> > So to
> > work around that we disable IRQs on UP, complicating the implementation.
> > Also recently we found years old bug in the implementation - see
> > 038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n").
In the case mentioned above, disabling IRQs doesn't work, as the handler
can be called in NMI context.
{kmalloc,kfree}_nolock()->spin_trylock_irqsave() can succeed on UP even
when the CPU has already acquired the spinlock with IRQs disabled.
> > So my question is if we could have spinlock implementation supporting this
> > nested spin_trylock() usage, or if the UP optimization is still considered
> > too important to lose it. I was thinking:
> >
> > - remove the UP implementation completely - would it increase the overhead
> > on SMP=n systems too much and do we still care?
> >
> > - make the non-debug implementation a bit like the debug one so we do have
> > the 'locked' state (see include/linux/spinlock_up.h and lock->slock). This
> > also adds some overhead but not as much as the full SMP implementation?
>
> What if we use an atomic_t on UP to simulate there being a spinlock,
> but only for pcp? Your demo shows pcp_spin_trylock() continuing to
> exist, so how about doing something like:
>
> #ifdef CONFIG_SMP
> #define pcp_spin_trylock(ptr) \
> ({ \
> struct per_cpu_pages *__ret; \
> __ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \
> __ret; \
> })
> #else
> static atomic_t pcp_UP_lock = ATOMIC_INIT(0);
> #define pcp_spin_trylock(ptr) \
> ({ \
> struct per_cpu_pages *__ret = NULL; \
> if (atomic_try_cmpxchg(&pcp_UP_lock, 0, 1)) \
> __ret = (void *)&pcp_UP_lock; \
> __ret; \
> });
> #endif
>
> (obviously you need pcp_spin_lock/pcp_spin_unlock also defined)
>
> That only costs us 4 extra bytes on UP, rather than 4 bytes per spinlock.
> And some people still use routers with tiny amounts of memory and a
> single CPU, or retrocomputers with single CPUs.
I think we need a special spinlock type that wraps something like this,
to be used when spinlocks can be trylock'd from an unknown context:
the pcp lock, zone lock, per-node partial slab list lock,
per-node barn lock, etc.
The dmesg is below; HEAD is a commit that adds the test case, on top of
commit af92793e52c3a ("slab: Introduce kmalloc_nolock() and
kfree_nolock()."):
>
> [ 3.658916] ------------[ cut here ]------------
> [ 3.659492] perf: interrupt took too long (5015 > 5005), lowering kernel.perf_event_max_sample_rate to 39000
> [ 3.660800] kernel BUG at mm/slub.c:4382!
This is the BUG_ON(new.frozen) in freeze_slab(), which implies that
somebody else has already taken the slab off the list and frozen it
(which should have been prevented by the spinlock).
> [ 3.661674] Oops: invalid opcode: 0000 [#1] NOPTI
> [ 3.662427] CPU: 0 UID: 0 PID: 256 Comm: kunit_try_catch Tainted: G E N 6.17.0-rc3+ #24 PREEMPTLAZY
> [ 3.663270] Tainted: [E]=UNSIGNED_MODULE, [N]=TEST
> [ 3.663658] Hardware name: QEMU Ubuntu 24.04 PC v2 (i440FX + PIIX, arch_caps fix, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> [ 3.664571] RIP: 0010:___slab_alloc (mm/slub.c:4382 (discriminator 1) mm/slub.c:4599 (discriminator 1))
> [ 3.664949] Code: 4c 24 78 e8 32 cc ff ff 84 c0 0f 85 09 fa ff ff 49 8b 4c 24 28 4d 8b 6c 24 20 48 89 c8 48 89 4c 24 78 48 c1 e8 18 84 c0 79 b3 <0f> 0b 41 8b 46 10 a9 87 04 00 00 74 a1 a8 80 75 24 49 89 dd e9 09
--
Cheers,
Harry / Hyeonggon
> All code
> ========
> 0: 4c 24 78 rex.WR and $0x78,%al
> 3: e8 32 cc ff ff call 0xffffffffffffcc3a
> 8: 84 c0 test %al,%al
> a: 0f 85 09 fa ff ff jne 0xfffffffffffffa19
> 10: 49 8b 4c 24 28 mov 0x28(%r12),%rcx
> 15: 4d 8b 6c 24 20 mov 0x20(%r12),%r13
> 1a: 48 89 c8 mov %rcx,%rax
> 1d: 48 89 4c 24 78 mov %rcx,0x78(%rsp)
> 22: 48 c1 e8 18 shr $0x18,%rax
> 26: 84 c0 test %al,%al
> 28: 79 b3 jns 0xffffffffffffffdd
> 2a:* 0f 0b ud2 <-- trapping instruction
> 2c: 41 8b 46 10 mov 0x10(%r14),%eax
> 30: a9 87 04 00 00 test $0x487,%eax
> 35: 74 a1 je 0xffffffffffffffd8
> 37: a8 80 test $0x80,%al
> 39: 75 24 jne 0x5f
> 3b: 49 89 dd mov %rbx,%r13
> 3e: e9 .byte 0xe9
> 3f: 09 .byte 0x9
>
> Code starting with the faulting instruction
> ===========================================
> 0: 0f 0b ud2
> 2: 41 8b 46 10 mov 0x10(%r14),%eax
> 6: a9 87 04 00 00 test $0x487,%eax
> b: 74 a1 je 0xffffffffffffffae
> d: a8 80 test $0x80,%al
> f: 75 24 jne 0x35
> 11: 49 89 dd mov %rbx,%r13
> 14: e9 .byte 0xe9
> 15: 09 .byte 0x9
> [ 3.666437] RSP: 0018:ffffc9d4001d3c80 EFLAGS: 00010282
> [ 3.666865] RAX: 0000000000000080 RBX: ffff8990fffd2e20 RCX: 0000000080400040
> [ 3.667440] RDX: ffff8990c0051c48 RSI: 0000000000400cc0 RDI: ffff8990c0054100
> [ 3.668018] RBP: ffffc9d4001d3d40 R08: 0000000000400cc0 R09: ffff8990c0051c40
> [ 3.668628] R10: ffff8990fffd2e20 R11: ffff8990fffd2e20 R12: ffffec0e04031cc0
> [ 3.669222] R13: 0000000000000000 R14: ffff8990c0054100 R15: ffffffffc04e8174
> [ 3.669815] FS: 0000000000000000(0000) GS:0000000000000000(0000) knlGS:0000000000000000
> [ 3.670475] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 3.670960] CR2: 00007ffcf4c95a68 CR3: 000000001f052000 CR4: 0000000000750ef0
> [ 3.671554] PKRU: 55555554
> [ 3.671799] Call Trace:
> [ 3.672012] <TASK>
> [ 3.672199] ? test_kmalloc_kfree_nolock (lib/tests/slub_kunit.c:357 (discriminator 4)) slub_kunit
> [ 3.672704] ? test_kmalloc_kfree_nolock (lib/tests/slub_kunit.c:357 (discriminator 4)) slub_kunit
> [ 3.673211] __kmalloc_cache_noprof (mm/slub.c:4722 mm/slub.c:4798 mm/slub.c:5209 mm/slub.c:5695)
> [ 3.673595] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:183)
> [ 3.674003] test_kmalloc_kfree_nolock (lib/tests/slub_kunit.c:357 (discriminator 4)) slub_kunit
> [ 3.674475] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:183)
> [ 3.674869] ? test_kmalloc_kfree_nolock (lib/tests/slub_kunit.c:357 (discriminator 4)) slub_kunit
> [ 3.675354] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:183)
> [ 3.675754] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:183)
> [ 3.676144] ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:183)
> [ 3.676535] ? __switch_to (./arch/x86/include/asm/cpufeature.h:101 arch/x86/kernel/process_64.c:378 arch/x86/kernel/process_64.c:666)
> [ 3.676848] ? __pfx_kunit_generic_run_threadfn_adapter (lib/kunit/try-catch.c:26) kunit
> [ 3.677395] kunit_try_run_case (lib/kunit/test.c:441 lib/kunit/test.c:484) kunit
> [ 3.677802] ? __pfx_kunit_generic_run_threadfn_adapter (lib/kunit/try-catch.c:26) kunit
> [ 3.678355] kunit_generic_run_threadfn_adapter (lib/kunit/try-catch.c:31) kunit
> [ 3.678857] kthread (kernel/kthread.c:463)
> [ 3.679130] ? __pfx_kthread (kernel/kthread.c:412)
> [ 3.679442] ret_from_fork (arch/x86/kernel/process.c:154)
> [ 3.679759] ? __pfx_kthread (kernel/kthread.c:412)
> [ 3.680071] ret_from_fork_asm (arch/x86/entry/entry_64.S:255)
> [ 3.680397] </TASK>
> [ 3.680585] Modules linked in: slub_kunit(E) kunit(E) intel_rapl_msr(E) intel_rapl_common(E) aesni_intel(E) ghash_clmulni_intel(E) kvm_amd(E) ccp(E) kvm(E) irqbypass(E) input_leds(E) i2c_piix4(E) i2c_smbus(E) mac_hid(E)
> [ 3.682187] ---[ end trace 0000000000000000 ]---
> [ 3.683108] RIP: 0010:___slab_alloc (mm/slub.c:4382 (discriminator 1) mm/slub.c:4599 (discriminator 1))
> [ 3.684032] Code: 4c 24 78 e8 32 cc ff ff 84 c0 0f 85 09 fa ff ff 49 8b 4c 24 28 4d 8b 6c 24 20 48 89 c8 48 89 4c 24 78 48 c1 e8 18 84 c0 79 b3 <0f> 0b 41 8b 46 10 a9 87 04 00 00 74 a1 a8 80 75 24 49 89 dd e9 09
> All code
> ========
> 0: 4c 24 78 rex.WR and $0x78,%al
> 3: e8 32 cc ff ff call 0xffffffffffffcc3a
> 8: 84 c0 test %al,%al
> a: 0f 85 09 fa ff ff jne 0xfffffffffffffa19
> 10: 49 8b 4c 24 28 mov 0x28(%r12),%rcx
> 15: 4d 8b 6c 24 20 mov 0x20(%r12),%r13
> 1a: 48 89 c8 mov %rcx,%rax
> 1d: 48 89 4c 24 78 mov %rcx,0x78(%rsp)
> 22: 48 c1 e8 18 shr $0x18,%rax
> 26: 84 c0 test %al,%al
> 28: 79 b3 jns 0xffffffffffffffdd
> 2a:* 0f 0b ud2 <-- trapping instruction
> 2c: 41 8b 46 10 mov 0x10(%r14),%eax
> 30: a9 87 04 00 00 test $0x487,%eax
> 35: 74 a1 je 0xffffffffffffffd8
> 37: a8 80 test $0x80,%al
> 39: 75 24 jne 0x5f
> 3b: 49 89 dd mov %rbx,%r13
> 3e: e9 .byte 0xe9
> 3f: 09 .byte 0x9
>
> Code starting with the faulting instruction
> ===========================================
> 0: 0f 0b ud2
> 2: 41 8b 46 10 mov 0x10(%r14),%eax
> 6: a9 87 04 00 00 test $0x487,%eax
> b: 74 a1 je 0xffffffffffffffae
> d: a8 80 test $0x80,%al
> f: 75 24 jne 0x35
> 11: 49 89 dd mov %rbx,%r13
> 14: e9 .byte 0xe9
> 15: 09 .byte 0x9
> [ 3.686093] RSP: 0018:ffffc9d4001d3c80 EFLAGS: 00010282
> [ 3.687036] RAX: 0000000000000080 RBX: ffff8990fffd2e20 RCX: 0000000080400040
> [ 3.688128] RDX: ffff8990c0051c48 RSI: 0000000000400cc0 RDI: ffff8990c0054100
> [ 3.689244] RBP: ffffc9d4001d3d40 R08: 0000000000400cc0 R09: ffff8990c0051c40
> [ 3.690353] R10: ffff8990fffd2e20 R11: ffff8990fffd2e20 R12: ffffec0e04031cc0
> [ 3.691476] R13: 0000000000000000 R14: ffff8990c0054100 R15: ffffffffc04e8174
> [ 3.692864] FS: 0000000000000000(0000) GS:0000000000000000(0000) knlGS:0000000000000000
> [ 3.694016] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 3.694997] CR2: 00007ffcf4c95a68 CR3: 000000001f052000 CR4: 0000000000750ef0
> [ 3.696109] PKRU: 55555554
> [ 3.696834] note: kunit_try_catch[256] exited with preempt_count 1
> [ 3.696910] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: vprintk_store (kernel/printk/printk.c:2358)
> [ 3.698650] Kernel Offset: 0x1d000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
^ permalink raw reply [flat|nested] 6+ messages in thread
* [RFC] making nested spin_trylock() work on UP?
@ 2026-02-13 11:57 Vlastimil Babka
0 siblings, 0 replies; 6+ messages in thread
From: Vlastimil Babka @ 2026-02-13 11:57 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Will Deacon, Sebastian Andrzej Siewior
Cc: LKML, linux-mm, Linus Torvalds, Waiman Long, Mel Gorman,
Matthew Wilcox, Steven Rostedt
Hi,
this is not a real RFC PATCH, but more like discussion about possible
direction. I wanted to have a patch at hand, but the layers of spinlock APIs
are rather complex for me to untangle, so I'd rather know first if it's even
worth trying.
The page allocator has been using a locking scheme for its percpu page
caches (pcp) for years now, based on spin_trylock() with no _irqsave() part.
The point is that if we interrupt the locked section, we fail the trylock
and just fall back to something more expensive; but that's rare, so we don't
need to pay the irqsave cost all the time in the fastpaths.
It's similar to, but not exactly, local_trylock_t (which is also newer
anyway), because in some cases we do lock the pcp of a non-local cpu to
flush it, in a way that's cheaper than an IPI or queue_work_on().
The complication of this scheme has been the UP non-debug spinlock
implementation, which assumes spin_trylock() can't fail on UP and keeps no
state to track that the lock is held. It simply doesn't anticipate this
usage scenario, so to work around it we disable IRQs on UP, complicating the
implementation. We also recently found a years-old bug in the implementation
- see 038a102535eb ("mm/page_alloc: prevent pcp corruption with SMP=n").
So my question is whether we could have a spinlock implementation that
supports this nested spin_trylock() usage, or whether the UP optimization is
still considered too important to lose. I was thinking:
- remove the UP implementation completely - would it increase the overhead
on SMP=n systems too much, and do we still care?
- make the non-debug implementation a bit like the debug one, so we do have
the 'locked' state (see include/linux/spinlock_up.h and lock->slock). This
also adds some overhead, but not as much as the full SMP implementation?
Below is how this would simplify page_alloc.c.
Thanks,
Vlastimil
From 7a0f233ec0ae46324b2db6a09944e93c7cb14459 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Fri, 13 Feb 2026 12:51:02 +0100
Subject: [PATCH] mm/page_alloc: simplify as if UP spin_trylock() was reliable
---
mm/page_alloc.c | 111 +++++++++++++-----------------------------------
1 file changed, 30 insertions(+), 81 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d312ebaa1e77..f147126b6c06 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -95,23 +95,6 @@ typedef int __bitwise fpi_t;
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
-/*
- * On SMP, spin_trylock is sufficient protection.
- * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
- * Pass flags to a no-op inline function to typecheck and silence the unused
- * variable warning.
- */
-static inline void __pcp_trylock_noop(unsigned long *flags) { }
-#define pcp_trylock_prepare(flags) __pcp_trylock_noop(&(flags))
-#define pcp_trylock_finish(flags) __pcp_trylock_noop(&(flags))
-#else
-
-/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
-#define pcp_trylock_prepare(flags) local_irq_save(flags)
-#define pcp_trylock_finish(flags) local_irq_restore(flags)
-#endif
-
/*
* Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
* a migration causing the wrong PCP to be locked and remote memory being
@@ -151,48 +134,22 @@ static inline void __pcp_trylock_noop(unsigned long *flags) { }
})
/* struct per_cpu_pages specific helpers. */
-#define pcp_spin_trylock(ptr, UP_flags) \
+#define pcp_spin_trylock(ptr) \
({ \
struct per_cpu_pages *__ret; \
- pcp_trylock_prepare(UP_flags); \
__ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \
- if (!__ret) \
- pcp_trylock_finish(UP_flags); \
__ret; \
})
-#define pcp_spin_unlock(ptr, UP_flags) \
+#define pcp_spin_unlock(ptr) \
({ \
pcpu_spin_unlock(lock, ptr); \
- pcp_trylock_finish(UP_flags); \
})
-/*
- * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (for i.e.
- * a potentially remote cpu drain) and get interrupted by an operation that
- * attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP
- * spinlock assumptions making the trylock a no-op. So we have to turn that
- * spin_lock() to a spin_lock_irqsave(). This works because on UP there are no
- * remote cpu's so we can only be locking the only existing local one.
- */
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
-static inline void __flags_noop(unsigned long *flags) { }
-#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
-({ \
- __flags_noop(&(flags)); \
- spin_lock(&(ptr)->lock); \
-})
-#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
-({ \
- spin_unlock(&(ptr)->lock); \
- __flags_noop(&(flags)); \
-})
-#else
-#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
- spin_lock_irqsave(&(ptr)->lock, flags)
-#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
- spin_unlock_irqrestore(&(ptr)->lock, flags)
-#endif
+#define pcp_spin_lock_nopin(ptr) \
+ spin_lock(&(ptr)->lock)
+#define pcp_spin_unlock_nopin(ptr) \
+ spin_unlock(&(ptr)->lock)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -2583,7 +2540,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
{
int high_min, to_drain, to_drain_batched, batch;
- unsigned long UP_flags;
bool todo = false;
high_min = READ_ONCE(pcp->high_min);
@@ -2603,9 +2559,9 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
to_drain = pcp->count - pcp->high;
while (to_drain > 0) {
to_drain_batched = min(to_drain, batch);
- pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ pcp_spin_lock_nopin(pcp);
free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
- pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ pcp_spin_unlock_nopin(pcp);
todo = true;
to_drain -= to_drain_batched;
@@ -2622,15 +2578,14 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
- unsigned long UP_flags;
int to_drain, batch;
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0) {
- pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ pcp_spin_lock_nopin(pcp);
free_pcppages_bulk(zone, to_drain, pcp, 0);
- pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ pcp_spin_unlock_nopin(pcp);
}
}
#endif
@@ -2641,11 +2596,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- unsigned long UP_flags;
int count;
do {
- pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ pcp_spin_lock_nopin(pcp);
count = pcp->count;
if (count) {
int to_drain = min(count,
@@ -2654,7 +2608,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
free_pcppages_bulk(zone, to_drain, pcp, 0);
count -= to_drain;
}
- pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ pcp_spin_unlock_nopin(pcp);
} while (count);
}
@@ -2853,7 +2807,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
*/
static bool free_frozen_page_commit(struct zone *zone,
struct per_cpu_pages *pcp, struct page *page, int migratetype,
- unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags)
+ unsigned int order, fpi_t fpi_flags)
{
int high, batch;
int to_free, to_free_batched;
@@ -2913,9 +2867,9 @@ static bool free_frozen_page_commit(struct zone *zone,
if (to_free == 0 || pcp->count == 0)
break;
- pcp_spin_unlock(pcp, *UP_flags);
+ pcp_spin_unlock(pcp);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp) {
ret = false;
break;
@@ -2927,7 +2881,7 @@ static bool free_frozen_page_commit(struct zone *zone,
* returned in an unlocked state.
*/
if (smp_processor_id() != cpu) {
- pcp_spin_unlock(pcp, *UP_flags);
+ pcp_spin_unlock(pcp);
ret = false;
break;
}
@@ -2959,7 +2913,6 @@ static bool free_frozen_page_commit(struct zone *zone,
static void __free_frozen_pages(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
- unsigned long UP_flags;
struct per_cpu_pages *pcp;
struct zone *zone;
unsigned long pfn = page_to_pfn(page);
@@ -2995,12 +2948,12 @@ static void __free_frozen_pages(struct page *page, unsigned int order,
add_page_to_zone_llist(zone, page, order);
return;
}
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
if (!free_frozen_page_commit(zone, pcp, page, migratetype,
- order, fpi_flags, &UP_flags))
+ order, fpi_flags))
return;
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
} else {
free_one_page(zone, page, pfn, order, fpi_flags);
}
@@ -3021,7 +2974,6 @@ void free_frozen_pages_nolock(struct page *page, unsigned int order)
*/
void free_unref_folios(struct folio_batch *folios)
{
- unsigned long UP_flags;
struct per_cpu_pages *pcp = NULL;
struct zone *locked_zone = NULL;
int i, j;
@@ -3064,7 +3016,7 @@ void free_unref_folios(struct folio_batch *folios)
if (zone != locked_zone ||
is_migrate_isolate(migratetype)) {
if (pcp) {
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
locked_zone = NULL;
pcp = NULL;
}
@@ -3083,7 +3035,7 @@ void free_unref_folios(struct folio_batch *folios)
* trylock is necessary as folios may be getting freed
* from IRQ or SoftIRQ context after an IO completion.
*/
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (unlikely(!pcp)) {
free_one_page(zone, &folio->page, pfn,
order, FPI_NONE);
@@ -3101,14 +3053,14 @@ void free_unref_folios(struct folio_batch *folios)
trace_mm_page_free_batched(&folio->page);
if (!free_frozen_page_commit(zone, pcp, &folio->page,
- migratetype, order, FPI_NONE, &UP_flags)) {
+ migratetype, order, FPI_NONE)) {
pcp = NULL;
locked_zone = NULL;
}
}
if (pcp)
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
folio_batch_reinit(folios);
}
@@ -3359,10 +3311,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
- unsigned long UP_flags;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp)
return NULL;
@@ -3374,7 +3325,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
pcp->free_count >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
@@ -5062,7 +5013,6 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
struct page **page_array)
{
struct page *page;
- unsigned long UP_flags;
struct zone *zone;
struct zoneref *z;
struct per_cpu_pages *pcp;
@@ -5156,7 +5106,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto failed;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
- pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (!pcp)
goto failed;
@@ -5175,7 +5125,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
if (unlikely(!page)) {
/* Try and allocate at least one page */
if (!nr_account) {
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
goto failed;
}
break;
@@ -5187,7 +5137,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
page_array[nr_populated++] = page;
}
- pcp_spin_unlock(pcp, UP_flags);
+ pcp_spin_unlock(pcp);
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);
@@ -6144,7 +6094,6 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
{
struct per_cpu_pages *pcp;
struct cpu_cacheinfo *cci;
- unsigned long UP_flags;
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
cci = get_cpu_cacheinfo(cpu);
@@ -6155,12 +6104,12 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
* This can reduce zone lock contention without hurting
* cache-hot pages sharing.
*/
- pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
+ pcp_spin_lock_nopin(pcp);
if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
pcp->flags |= PCPF_FREE_HIGH_BATCH;
else
pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
- pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
+ pcp_spin_unlock_nopin(pcp);
}
void setup_pcp_cacheinfo(unsigned int cpu)
--
2.53.0
end of thread, other threads:[~2026-04-15 18:45 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-02-13 11:57 [RFC] making nested spin_trylock() work on UP? Vlastimil Babka
2026-02-14 6:28 ` Matthew Wilcox
2026-02-14 16:32 ` Linus Torvalds
2026-02-16 10:32 ` Vlastimil Babka
2026-04-15 18:44 ` Harry Yoo (Oracle)