From: Harry Yoo <harry.yoo@oracle.com>
To: Andrew Morton <akpm@linux-foundation.org>,
Vlastimil Babka <vbabka@suse.cz>
Cc: Christoph Lameter <cl@gentwo.org>,
David Rientjes <rientjes@google.com>,
Roman Gushchin <roman.gushchin@linux.dev>,
Johannes Weiner <hannes@cmpxchg.org>,
Shakeel Butt <shakeel.butt@linux.dev>,
Michal Hocko <mhocko@kernel.org>,
Harry Yoo <harry.yoo@oracle.com>, Hao Li <hao.li@linux.dev>,
Alexei Starovoitov <ast@kernel.org>,
Puranjay Mohan <puranjay@kernel.org>,
Andrii Nakryiko <andrii@kernel.org>,
Amery Hung <ameryhung@gmail.com>,
Catalin Marinas <catalin.marinas@arm.com>,
"Paul E . McKenney" <paulmck@kernel.org>,
Frederic Weisbecker <frederic@kernel.org>,
Neeraj Upadhyay <neeraj.upadhyay@kernel.org>,
Joel Fernandes <joelagnelf@nvidia.com>,
Josh Triplett <josh@joshtriplett.org>,
Boqun Feng <boqun.feng@gmail.com>,
Uladzislau Rezki <urezki@gmail.com>,
Steven Rostedt <rostedt@goodmis.org>,
Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
Lai Jiangshan <jiangshanlai@gmail.com>,
Zqiang <qiang.zhang@linux.dev>,
Dave Chinner <david@fromorbit.com>,
Qi Zheng <zhengqi.arch@bytedance.com>,
Muchun Song <muchun.song@linux.dev>,
rcu@vger.kernel.org, linux-mm@kvack.org, bpf@vger.kernel.org
Subject: [RFC PATCH 6/7] mm/slab: introduce kfree_rcu_nolock()
Date: Fri, 6 Feb 2026 18:34:09 +0900
Message-ID: <20260206093410.160622-7-harry.yoo@oracle.com>
In-Reply-To: <20260206093410.160622-1-harry.yoo@oracle.com>
Currently, kfree_rcu() cannot be called in an NMI context. In such a
context, even calling call_rcu() is not legal, forcing users to
implement their own deferred freeing. Make users' lives easier by
introducing a kfree_rcu_nolock() variant.
Unlike kfree_rcu(), kfree_rcu_nolock() supports only the 2-argument
form, because in the worst case, where memory allocation fails, the
caller cannot synchronously wait for the grace period to finish.
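For illustration, a hypothetical caller that needs to free an object
from NMI context would use it like this ('struct foo' and its 'rcu'
field are made up for the example; any field type accepted by
kvfree_call_rcu(), such as struct rcu_head, works):

	/* Illustrative only; not part of this patch. */
	struct foo {
		int data;
		struct rcu_head rcu;
	};

	static void free_foo_nolock(struct foo *p)
	{
		/* Safe even in NMI context; p is freed after a grace period. */
		kfree_rcu_nolock(p, rcu);
	}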
Similar to the kfree_nolock() implementation, try to acquire the
kfree_rcu_cpu spinlock; if that fails, insert the object into a per-CPU
lockless list and defer the free using an irq_work that calls
kvfree_call_rcu() later.
When kmemleak or debugobjects is enabled, always defer freeing, as
those debug features don't support NMI contexts.
When the trylock succeeds, avoid consuming a bnode and skip
run_page_cache_worker() altogether. Instead, insert objects into
struct kfree_rcu_cpu.head, which does not consume additional memory.
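Condensed, the !allow_spin path in kvfree_call_rcu_ptr() boils down to
the following (a simplified sketch of the code in the diff below, not
the exact implementation):

	krcp = krc_this_cpu_lock(&flags, false);  /* raw_spin_trylock() */
	if (krcp) {
		/* Lock held: queue on krcp->head without a bnode. */
		...
	} else {
		/*
		 * Trylock failed (e.g. the NMI interrupted the lock
		 * holder): park the object on the per-CPU lockless list
		 * and let the irq_work retry kvfree_call_rcu() later
		 * with allow_spin == true.
		 */
		guard(preempt)();
		krcp = this_cpu_ptr(&krc);
		if (llist_add((struct llist_node *)head, &krcp->llist_head))
			irq_work_queue(&krcp->irq_work);
	}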
For now, the sheaves layer is bypassed if spinning is not allowed.
Scheduling the delayed monitor work in an NMI context is tricky: use an
irq_work to schedule it, but make it a lazy irq_work to avoid raising a
self-IPI. This means scheduling of the delayed monitor work can be
deferred by up to the length of a time slice.
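Concretely, the two irq_works in struct kfree_rcu_cpu are initialized
differently (taken from the diff below, with comments added here for
clarity):

	static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
		.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
		/* Deferred frees: may raise a self-IPI to run promptly. */
		.irq_work = IRQ_WORK_INIT(defer_kfree_rcu),
		/* Lazy: no self-IPI; processed later, e.g. from the tick. */
		.sched_monitor_irq_work =
			IRQ_WORK_INIT_LAZY(sched_monitor_irq_work),
	};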
Without CONFIG_KVFREE_RCU_BATCHED, all frees in the !allow_spin case are
delayed using irq_work.
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
---
include/linux/rcupdate.h | 23 ++++---
mm/slab_common.c | 140 +++++++++++++++++++++++++++++++++------
2 files changed, 133 insertions(+), 30 deletions(-)
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index db5053a7b0cb..18bb7378b23d 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1092,8 +1092,9 @@ static inline void rcu_read_unlock_migrate(void)
* The BUILD_BUG_ON check must not involve any function calls, hence the
* checks are done in macros here.
*/
-#define kfree_rcu(ptr, rf) kvfree_rcu_arg_2(ptr, rf)
-#define kvfree_rcu(ptr, rf) kvfree_rcu_arg_2(ptr, rf)
+#define kfree_rcu(ptr, rf) kvfree_rcu_arg_2(ptr, rf, true)
+#define kfree_rcu_nolock(ptr, rf) kvfree_rcu_arg_2(ptr, rf, false)
+#define kvfree_rcu(ptr, rf) kvfree_rcu_arg_2(ptr, rf, true)
/**
* kfree_rcu_mightsleep() - kfree an object after a grace period.
@@ -1117,35 +1118,35 @@ static inline void rcu_read_unlock_migrate(void)
#ifdef CONFIG_KVFREE_RCU_BATCHED
-void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr);
-#define kvfree_call_rcu(head, ptr) \
+void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr, bool allow_spin);
+#define kvfree_call_rcu(head, ptr, spin) \
_Generic((head), \
struct rcu_head *: kvfree_call_rcu_ptr, \
struct rcu_ptr *: kvfree_call_rcu_ptr, \
void *: kvfree_call_rcu_ptr \
- )((struct rcu_ptr *)(head), (ptr))
+ )((struct rcu_ptr *)(head), (ptr), spin)
#else
-void kvfree_call_rcu_head(struct rcu_head *head, void *ptr);
+void kvfree_call_rcu_head(struct rcu_head *head, void *ptr, bool allow_spin);
static_assert(sizeof(struct rcu_head) == sizeof(struct rcu_ptr));
-#define kvfree_call_rcu(head, ptr) \
+#define kvfree_call_rcu(head, ptr, spin) \
_Generic((head), \
struct rcu_head *: kvfree_call_rcu_head, \
struct rcu_ptr *: kvfree_call_rcu_head, \
void *: kvfree_call_rcu_head \
- )((struct rcu_head *)(head), (ptr))
+ )((struct rcu_head *)(head), (ptr), spin)
#endif
/*
* The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
* comment of kfree_rcu() for details.
*/
-#define kvfree_rcu_arg_2(ptr, rf) \
+#define kvfree_rcu_arg_2(ptr, rf, spin) \
do { \
typeof (ptr) ___p = (ptr); \
\
if (___p) { \
BUILD_BUG_ON(offsetof(typeof(*(ptr)), rf) >= 4096); \
- kvfree_call_rcu(&((___p)->rf), (void *) (___p)); \
+ kvfree_call_rcu(&((___p)->rf), (void *) (___p), spin); \
} \
} while (0)
@@ -1154,7 +1155,7 @@ do { \
typeof(ptr) ___p = (ptr); \
\
if (___p) \
- kvfree_call_rcu(NULL, (void *) (___p)); \
+ kvfree_call_rcu(NULL, (void *) (___p), true); \
} while (0)
/*
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d232b99a4b52..9d7801e5cb73 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1311,6 +1311,12 @@ struct kfree_rcu_cpu_work {
* the interactions with the slab allocators.
*/
struct kfree_rcu_cpu {
+ // Objects queued on a lockless linked list, not protected by the lock.
+ // This allows freeing objects in NMI context, where trylock may fail.
+ struct llist_head llist_head;
+ struct irq_work irq_work;
+ struct irq_work sched_monitor_irq_work;
+
// Objects queued on a linked list
struct rcu_ptr *head;
unsigned long head_gp_snap;
@@ -1333,12 +1339,61 @@ struct kfree_rcu_cpu {
struct llist_head bkvcache;
int nr_bkv_objs;
};
+#else
+struct kfree_rcu_cpu {
+ struct llist_head llist_head;
+ struct irq_work irq_work;
+};
#endif
+/* Universal implementation regardless of CONFIG_KVFREE_RCU_BATCHED */
+static void defer_kfree_rcu(struct irq_work *work)
+{
+ struct kfree_rcu_cpu *krcp;
+ struct llist_head *head;
+ struct llist_node *llnode, *pos, *t;
+
+ krcp = container_of(work, struct kfree_rcu_cpu, irq_work);
+ head = &krcp->llist_head;
+
+ if (llist_empty(head))
+ return;
+
+ llnode = llist_del_all(head);
+ llist_for_each_safe(pos, t, llnode) {
+ struct slab *slab;
+ void *objp;
+ struct rcu_ptr *rcup = (struct rcu_ptr *)pos;
+
+ slab = virt_to_slab(pos);
+ if (is_vmalloc_addr(pos) || !slab)
+ objp = (void *)PAGE_ALIGN_DOWN((unsigned long)pos);
+ else
+ objp = nearest_obj(slab->slab_cache, slab, pos);
+
+ kvfree_call_rcu(rcup, objp, true);
+ }
+}
+
#ifndef CONFIG_KVFREE_RCU_BATCHED
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
+ .llist_head = LLIST_HEAD_INIT(llist_head),
+ .irq_work = IRQ_WORK_INIT(defer_kfree_rcu),
+};
-void kvfree_call_rcu_head(struct rcu_head *head, void *ptr)
+void kvfree_call_rcu_head(struct rcu_head *head, void *ptr, bool allow_spin)
{
+ if (!allow_spin) {
+ struct kfree_rcu_cpu *krcp;
+
+ guard(preempt)();
+
+ krcp = this_cpu_ptr(&krc);
+ if (llist_add((struct llist_node *)head, &krcp->llist_head))
+ irq_work_queue(&krcp->irq_work);
+ return;
+ }
+
if (head) {
kasan_record_aux_stack(ptr);
call_rcu(head, kvfree_rcu_cb);
@@ -1405,8 +1460,21 @@ struct kvfree_rcu_bulk_data {
#define KVFREE_BULK_MAX_ENTR \
((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
+static void schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp);
+
+static void sched_monitor_irq_work(struct irq_work *work)
+{
+ struct kfree_rcu_cpu *krcp;
+
+ krcp = container_of(work, struct kfree_rcu_cpu, sched_monitor_irq_work);
+ schedule_delayed_monitor_work(krcp);
+}
+
static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
+ .irq_work = IRQ_WORK_INIT(defer_kfree_rcu),
+ .sched_monitor_irq_work =
+ IRQ_WORK_INIT_LAZY(sched_monitor_irq_work),
};
static __always_inline void
@@ -1421,13 +1489,18 @@ debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
}
static inline struct kfree_rcu_cpu *
-krc_this_cpu_lock(unsigned long *flags)
+krc_this_cpu_lock(unsigned long *flags, bool allow_spin)
{
struct kfree_rcu_cpu *krcp;
local_irq_save(*flags); // For safely calling this_cpu_ptr().
krcp = this_cpu_ptr(&krc);
- raw_spin_lock(&krcp->lock);
+ if (allow_spin) {
+ raw_spin_lock(&krcp->lock);
+ } else if (!raw_spin_trylock(&krcp->lock)) {
+ local_irq_restore(*flags);
+ return NULL;
+ }
return krcp;
}
@@ -1841,25 +1914,27 @@ static void fill_page_cache_func(struct work_struct *work)
// Returns true if ptr was successfully recorded, else the caller must
// use a fallback.
static inline bool
-add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
- unsigned long *flags, void *ptr, bool can_alloc)
+add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu *krcp,
+ unsigned long *flags, void *ptr, bool can_alloc, bool allow_spin)
{
struct kvfree_rcu_bulk_data *bnode;
int idx;
- *krcp = krc_this_cpu_lock(flags);
- if (unlikely(!(*krcp)->initialized))
+ if (unlikely(!krcp->initialized))
+ return false;
+
+ if (!allow_spin)
return false;
idx = !!is_vmalloc_addr(ptr);
- bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+ bnode = list_first_entry_or_null(&krcp->bulk_head[idx],
struct kvfree_rcu_bulk_data, list);
/* Check if a new block is required. */
if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
- bnode = get_cached_bnode(*krcp);
+ bnode = get_cached_bnode(krcp);
if (!bnode && can_alloc) {
- krc_this_cpu_unlock(*krcp, *flags);
+ krc_this_cpu_unlock(krcp, *flags);
// __GFP_NORETRY - allows a light-weight direct reclaim
// what is OK from minimizing of fallback hitting point of
@@ -1874,7 +1949,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
// scenarios.
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
- raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
+ raw_spin_lock_irqsave(&krcp->lock, *flags);
}
if (!bnode)
@@ -1882,14 +1957,14 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
// Initialize the new block and attach it.
bnode->nr_records = 0;
- list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
+ list_add(&bnode->list, &krcp->bulk_head[idx]);
}
// Finally insert and update the GP for this page.
bnode->nr_records++;
bnode->records[bnode->nr_records - 1] = ptr;
get_state_synchronize_rcu_full(&bnode->gp_snap);
- atomic_inc(&(*krcp)->bulk_count[idx]);
+ atomic_inc(&krcp->bulk_count[idx]);
return true;
}
@@ -1949,7 +2024,7 @@ void __init kfree_rcu_scheduler_running(void)
* be free'd in workqueue context. This allows us to: batch requests together to
* reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
*/
-void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr)
+void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr, bool allow_spin)
{
unsigned long flags;
struct kfree_rcu_cpu *krcp;
@@ -1965,7 +2040,12 @@ void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr)
if (!head)
might_sleep();
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
+ if (!allow_spin && (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD) ||
+ IS_ENABLED(CONFIG_DEBUG_KMEMLEAK)))
+ goto defer_free;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ (allow_spin && kfree_rcu_sheaf(ptr)))
return;
// Queue the object but don't yet schedule the batch.
@@ -1979,9 +2059,15 @@ void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr)
}
kasan_record_aux_stack(ptr);
- success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
+
+ krcp = krc_this_cpu_lock(&flags, allow_spin);
+ if (!krcp)
+ goto defer_free;
+
+ success = add_ptr_to_bulk_krc_lock(krcp, &flags, ptr, !head, allow_spin);
if (!success) {
- run_page_cache_worker(krcp);
+ if (allow_spin)
+ run_page_cache_worker(krcp);
if (head == NULL)
// Inline if kvfree_rcu(one_arg) call.
@@ -2005,8 +2091,12 @@ void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr)
kmemleak_ignore(ptr);
// Set timer to drain after KFREE_DRAIN_JIFFIES.
- if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
- __schedule_delayed_monitor_work(krcp);
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) {
+ if (allow_spin)
+ __schedule_delayed_monitor_work(krcp);
+ else
+ irq_work_queue(&krcp->sched_monitor_irq_work);
+ }
unlock_return:
krc_this_cpu_unlock(krcp, flags);
@@ -2017,10 +2107,22 @@ void kvfree_call_rcu_ptr(struct rcu_ptr *head, void *ptr)
* CPU can pass the QS state.
*/
if (!success) {
+ VM_WARN_ON_ONCE(!allow_spin);
debug_rcu_head_unqueue((struct rcu_head *) ptr);
synchronize_rcu();
kvfree(ptr);
}
+ return;
+
+defer_free:
+ VM_WARN_ON_ONCE(allow_spin);
+ guard(preempt)();
+
+ krcp = this_cpu_ptr(&krc);
+ if (llist_add((struct llist_node *)head, &krcp->llist_head))
+ irq_work_queue(&krcp->irq_work);
+ return;
+
}
EXPORT_SYMBOL_GPL(kvfree_call_rcu_ptr);
--
2.43.0