* [PATCH v2 bpf] bpf: Replace bpf_map_kmalloc_node() with kmalloc_nolock() to allocate bpf_async_cb structures.
@ 2025-10-15 0:07 Alexei Starovoitov
2025-10-15 1:59 ` Harry Yoo
2025-10-15 10:31 ` patchwork-bot+netdevbpf
0 siblings, 2 replies; 5+ messages in thread
From: Alexei Starovoitov @ 2025-10-15 0:07 UTC (permalink / raw)
To: bpf
Cc: daniel, andrii, martin.lau, shakeel.butt, vbabka, harry.yoo,
yepeilin, linux-mm, kernel-team
From: Alexei Starovoitov <ast@kernel.org>
The following kmemleak splat:
[ 8.105530] kmemleak: Trying to color unknown object at 0xff11000100e918c0 as Black
[ 8.106521] Call Trace:
[ 8.106521] <TASK>
[ 8.106521] dump_stack_lvl+0x4b/0x70
[ 8.106521] kvfree_call_rcu+0xcb/0x3b0
[ 8.106521] ? hrtimer_cancel+0x21/0x40
[ 8.106521] bpf_obj_free_fields+0x193/0x200
[ 8.106521] htab_map_update_elem+0x29c/0x410
[ 8.106521] bpf_prog_cfc8cd0f42c04044_overwrite_cb+0x47/0x4b
[ 8.106521] bpf_prog_8c30cd7c4db2e963_overwrite_timer+0x65/0x86
[ 8.106521] bpf_prog_test_run_syscall+0xe1/0x2a0
happens due to the combination of features and fixes, but mainly due to
commit 6d78b4473cdb ("bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init()")
It's using __GFP_HIGH, which instructs slub/kmemleak internals to skip
kmemleak_alloc_recursive() on allocation, so subsequent kfree_rcu()->
kvfree_call_rcu()->kmemleak_ignore() complains with the above splat.
To fix this imbalance, replace bpf_map_kmalloc_node() with
kmalloc_nolock() and kfree_rcu() with call_rcu() + kfree_nolock() to
make sure that the objects allocated with kmalloc_nolock() are freed
with kfree_nolock() rather than the implicit kfree() that kfree_rcu()
uses internally.
Note, the kmalloc_nolock() happens under bpf_spin_lock_irqsave(), so
it will always fail in PREEMPT_RT. This is not an issue at the moment,
since bpf_timers are disabled in PREEMPT_RT. In the future
bpf_spin_lock will be replaced with state machine similar to
bpf_task_work.
Fixes: 6d78b4473cdb ("bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init()")
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
v1->v2: Fix one missing kfree->kfree_nolock() conversion (caught by BPF AI bot)
include/linux/bpf.h | 4 ++++
kernel/bpf/helpers.c | 25 ++++++++++++++-----------
kernel/bpf/syscall.c | 15 +++++++++++++++
3 files changed, 33 insertions(+), 11 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a98c83346134..d808253f2e94 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2499,6 +2499,8 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
#ifdef CONFIG_MEMCG
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node);
+void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags,
+ int node);
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags);
void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
gfp_t flags);
@@ -2511,6 +2513,8 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
*/
#define bpf_map_kmalloc_node(_map, _size, _flags, _node) \
kmalloc_node(_size, _flags, _node)
+#define bpf_map_kmalloc_nolock(_map, _size, _flags, _node) \
+ kmalloc_nolock(_size, _flags, _node)
#define bpf_map_kzalloc(_map, _size, _flags) \
kzalloc(_size, _flags)
#define bpf_map_kvcalloc(_map, _n, _size, _flags) \
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c9fab9a356df..8eb117c52817 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1215,13 +1215,20 @@ static void bpf_wq_work(struct work_struct *work)
rcu_read_unlock_trace();
}
+static void bpf_async_cb_rcu_free(struct rcu_head *rcu)
+{
+ struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu);
+
+ kfree_nolock(cb);
+}
+
static void bpf_wq_delete_work(struct work_struct *work)
{
struct bpf_work *w = container_of(work, struct bpf_work, delete_work);
cancel_work_sync(&w->work);
- kfree_rcu(w, cb.rcu);
+ call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free);
}
static void bpf_timer_delete_work(struct work_struct *work)
@@ -1230,13 +1237,13 @@ static void bpf_timer_delete_work(struct work_struct *work)
/* Cancel the timer and wait for callback to complete if it was running.
* If hrtimer_cancel() can be safely called it's safe to call
- * kfree_rcu(t) right after for both preallocated and non-preallocated
+ * call_rcu() right after for both preallocated and non-preallocated
* maps. The async->cb = NULL was already done and no code path can see
* address 't' anymore. Timer if armed for existing bpf_hrtimer before
* bpf_timer_cancel_and_free will have been cancelled.
*/
hrtimer_cancel(&t->timer);
- kfree_rcu(t, cb.rcu);
+ call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
}
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
@@ -1270,11 +1277,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
goto out;
}
- /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until
- * kmalloc_nolock() is available, avoid locking issues by using
- * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM).
- */
- cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node);
+ cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node);
if (!cb) {
ret = -ENOMEM;
goto out;
@@ -1315,7 +1318,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
* or pinned in bpffs.
*/
WRITE_ONCE(async->cb, NULL);
- kfree(cb);
+ kfree_nolock(cb);
ret = -EPERM;
}
out:
@@ -1580,7 +1583,7 @@ void bpf_timer_cancel_and_free(void *val)
* timer _before_ calling us, such that failing to cancel it here will
* cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
* Therefore, we _need_ to cancel any outstanding timers before we do
- * kfree_rcu, even though no more timers can be armed.
+ * call_rcu, even though no more timers can be armed.
*
* Moreover, we need to schedule work even if timer does not belong to
* the calling callback_fn, as on two different CPUs, we can end up in a
@@ -1607,7 +1610,7 @@ void bpf_timer_cancel_and_free(void *val)
* completion.
*/
if (hrtimer_try_to_cancel(&t->timer) >= 0)
- kfree_rcu(t, cb.rcu);
+ call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free);
else
queue_work(system_dfl_wq, &t->cb.delete_work);
} else {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2a9456a3e730..8a129746bd6c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -520,6 +520,21 @@ void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
return ptr;
}
+void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags,
+ int node)
+{
+ struct mem_cgroup *memcg, *old_memcg;
+ void *ptr;
+
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
+ ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+
+ return ptr;
+}
+
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
struct mem_cgroup *memcg, *old_memcg;
--
2.47.3
^ permalink raw reply [flat|nested] 5+ messages in thread* Re: [PATCH v2 bpf] bpf: Replace bpf_map_kmalloc_node() with kmalloc_nolock() to allocate bpf_async_cb structures.
2025-10-15 0:07 [PATCH v2 bpf] bpf: Replace bpf_map_kmalloc_node() with kmalloc_nolock() to allocate bpf_async_cb structures Alexei Starovoitov
@ 2025-10-15 1:59 ` Harry Yoo
2025-10-15 2:12 ` Alexei Starovoitov
2025-10-15 10:31 ` patchwork-bot+netdevbpf
1 sibling, 1 reply; 5+ messages in thread
From: Harry Yoo @ 2025-10-15 1:59 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, daniel, andrii, martin.lau, shakeel.butt, vbabka, yepeilin,
linux-mm, kernel-team
On Tue, Oct 14, 2025 at 05:07:00PM -0700, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
> The following kmemleak splat:
> [ 8.105530] kmemleak: Trying to color unknown object at 0xff11000100e918c0 as Black
> [ 8.106521] Call Trace:
> [ 8.106521] <TASK>
> [ 8.106521] dump_stack_lvl+0x4b/0x70
> [ 8.106521] kvfree_call_rcu+0xcb/0x3b0
> [ 8.106521] ? hrtimer_cancel+0x21/0x40
> [ 8.106521] bpf_obj_free_fields+0x193/0x200
> [ 8.106521] htab_map_update_elem+0x29c/0x410
> [ 8.106521] bpf_prog_cfc8cd0f42c04044_overwrite_cb+0x47/0x4b
> [ 8.106521] bpf_prog_8c30cd7c4db2e963_overwrite_timer+0x65/0x86
> [ 8.106521] bpf_prog_test_run_syscall+0xe1/0x2a0
>
> happens due to the combination of features and fixes, but mainly due to
> commit 6d78b4473cdb ("bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init()")
> It's using __GFP_HIGH, which instructs slub/kmemleak internals to skip
> kmemleak_alloc_recursive() on allocation, so subsequent kfree_rcu()->
> kvfree_call_rcu()->kmemleak_ignore() complains with the above splat.
>
> To fix this imbalance, replace bpf_map_kmalloc_node() with
> kmalloc_nolock() and kfree_rcu() with call_rcu() + kfree_nolock() to
> make sure that the objects allocated with kmalloc_nolock() are freed
> with kfree_nolock() rather than the implicit kfree() that kfree_rcu()
> uses internally.
>
> Note, the kmalloc_nolock() happens under bpf_spin_lock_irqsave(), so
> it will always fail in PREEMPT_RT. This is not an issue at the moment,
> since bpf_timers are disabled in PREEMPT_RT. In the future
> bpf_spin_lock will be replaced with state machine similar to
> bpf_task_work.
>
> Fixes: 6d78b4473cdb ("bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init()")
> Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
LGTM,
Acked-by: Harry Yoo <harry.yoo@oracle.com>
So we're losing benefit of batch-processing via kfree_rcu() and
instead using call_rcu(), and I guess it's fine since it's not very
performance critical so we don't have to make kfree_rcu() work with
objects that are allocated via kmalloc_nolock()?
--
Cheers,
Harry / Hyeonggon
^ permalink raw reply [flat|nested] 5+ messages in thread* Re: [PATCH v2 bpf] bpf: Replace bpf_map_kmalloc_node() with kmalloc_nolock() to allocate bpf_async_cb structures.
2025-10-15 1:59 ` Harry Yoo
@ 2025-10-15 2:12 ` Alexei Starovoitov
2025-10-15 10:06 ` Vlastimil Babka
0 siblings, 1 reply; 5+ messages in thread
From: Alexei Starovoitov @ 2025-10-15 2:12 UTC (permalink / raw)
To: Harry Yoo
Cc: bpf, Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau,
Shakeel Butt, Vlastimil Babka, Peilin Ye, linux-mm, Kernel Team
On Tue, Oct 14, 2025 at 6:59 PM Harry Yoo <harry.yoo@oracle.com> wrote:
>
> On Tue, Oct 14, 2025 at 05:07:00PM -0700, Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > The following kmemleak splat:
> > [ 8.105530] kmemleak: Trying to color unknown object at 0xff11000100e918c0 as Black
> > [ 8.106521] Call Trace:
> > [ 8.106521] <TASK>
> > [ 8.106521] dump_stack_lvl+0x4b/0x70
> > [ 8.106521] kvfree_call_rcu+0xcb/0x3b0
> > [ 8.106521] ? hrtimer_cancel+0x21/0x40
> > [ 8.106521] bpf_obj_free_fields+0x193/0x200
> > [ 8.106521] htab_map_update_elem+0x29c/0x410
> > [ 8.106521] bpf_prog_cfc8cd0f42c04044_overwrite_cb+0x47/0x4b
> > [ 8.106521] bpf_prog_8c30cd7c4db2e963_overwrite_timer+0x65/0x86
> > [ 8.106521] bpf_prog_test_run_syscall+0xe1/0x2a0
> >
> > happens due to the combination of features and fixes, but mainly due to
> > commit 6d78b4473cdb ("bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init()")
> > It's using __GFP_HIGH, which instructs slub/kmemleak internals to skip
> > kmemleak_alloc_recursive() on allocation, so subsequent kfree_rcu()->
> > kvfree_call_rcu()->kmemleak_ignore() complains with the above splat.
> >
> > To fix this imbalance, replace bpf_map_kmalloc_node() with
> > kmalloc_nolock() and kfree_rcu() with call_rcu() + kfree_nolock() to
> > make sure that the objects allocated with kmalloc_nolock() are freed
> > with kfree_nolock() rather than the implicit kfree() that kfree_rcu()
> > uses internally.
> >
> > Note, the kmalloc_nolock() happens under bpf_spin_lock_irqsave(), so
> > it will always fail in PREEMPT_RT. This is not an issue at the moment,
> > since bpf_timers are disabled in PREEMPT_RT. In the future
> > bpf_spin_lock will be replaced with state machine similar to
> > bpf_task_work.
> >
> > Fixes: 6d78b4473cdb ("bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init()")
> > Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
>
> LGTM,
> Acked-by: Harry Yoo <harry.yoo@oracle.com>
>
> So we're losing benefit of batch-processing via kfree_rcu() and
> instead using call_rcu(), and I guess it's fine since it's not very
> performance critical
yes. Here freeing is not in critical path.
> so we don't have to make kfree_rcu() work with
> objects that are allocated via kmalloc_nolock()?
Not quite :) It's on the todo list.
Something like kfree_nolock_rcu() is needed, and
also SLAB_TYPESAFE_BY_RCU-like and new SLAB_TYPESAFE_BY_SRCU_FAST flag.
The plan for the upcoming merge window is to delete rcu tasks trace
and replace with srcu_fast. It will clarify next steps.
^ permalink raw reply [flat|nested] 5+ messages in thread* Re: [PATCH v2 bpf] bpf: Replace bpf_map_kmalloc_node() with kmalloc_nolock() to allocate bpf_async_cb structures.
2025-10-15 2:12 ` Alexei Starovoitov
@ 2025-10-15 10:06 ` Vlastimil Babka
0 siblings, 0 replies; 5+ messages in thread
From: Vlastimil Babka @ 2025-10-15 10:06 UTC (permalink / raw)
To: Alexei Starovoitov, Harry Yoo
Cc: bpf, Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau,
Shakeel Butt, Peilin Ye, linux-mm, Kernel Team
On 10/15/25 04:12, Alexei Starovoitov wrote:
> On Tue, Oct 14, 2025 at 6:59 PM Harry Yoo <harry.yoo@oracle.com> wrote:
>>
>> >
>> > Fixes: 6d78b4473cdb ("bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init()")
>> > Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
>> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>> > ---
>>
>> LGTM,
>> Acked-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
>>
>> So we're losing benefit of batch-processing via kfree_rcu() and
>> instead using call_rcu(), and I guess it's fine since it's not very
>> performance critical
>
> yes. Here freeing is not in critical path.
>
>> so we don't have to make kfree_rcu() work with
>> objects that are allocated via kmalloc_nolock()?
>
> Not quite :) It's on the todo list.
> Something like kfree_nolock_rcu() is needed, and
> also SLAB_TYPESAFE_BY_RCU-like and new SLAB_TYPESAFE_BY_SRCU_FAST flag.
> The plan for the upcoming merge window is to delete rcu tasks trace
> and replace with srcu_fast. It will clarify next steps.
I'm a bit afraid that this approach of strictly matching kmalloc_nolock() with
kfree_nolock() is going to be a neverending whack-a-mole. Let's see...
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v2 bpf] bpf: Replace bpf_map_kmalloc_node() with kmalloc_nolock() to allocate bpf_async_cb structures.
2025-10-15 0:07 [PATCH v2 bpf] bpf: Replace bpf_map_kmalloc_node() with kmalloc_nolock() to allocate bpf_async_cb structures Alexei Starovoitov
2025-10-15 1:59 ` Harry Yoo
@ 2025-10-15 10:31 ` patchwork-bot+netdevbpf
1 sibling, 0 replies; 5+ messages in thread
From: patchwork-bot+netdevbpf @ 2025-10-15 10:31 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, daniel, andrii, martin.lau, shakeel.butt, vbabka, harry.yoo,
yepeilin, linux-mm, kernel-team
Hello:
This patch was applied to bpf/bpf.git (master)
by Daniel Borkmann <daniel@iogearbox.net>:
On Tue, 14 Oct 2025 17:07:00 -0700 you wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
> The following kmemleak splat:
> [ 8.105530] kmemleak: Trying to color unknown object at 0xff11000100e918c0 as Black
> [ 8.106521] Call Trace:
> [ 8.106521] <TASK>
> [ 8.106521] dump_stack_lvl+0x4b/0x70
> [ 8.106521] kvfree_call_rcu+0xcb/0x3b0
> [ 8.106521] ? hrtimer_cancel+0x21/0x40
> [ 8.106521] bpf_obj_free_fields+0x193/0x200
> [ 8.106521] htab_map_update_elem+0x29c/0x410
> [ 8.106521] bpf_prog_cfc8cd0f42c04044_overwrite_cb+0x47/0x4b
> [ 8.106521] bpf_prog_8c30cd7c4db2e963_overwrite_timer+0x65/0x86
> [ 8.106521] bpf_prog_test_run_syscall+0xe1/0x2a0
>
> [...]
Here is the summary with links:
- [v2,bpf] bpf: Replace bpf_map_kmalloc_node() with kmalloc_nolock() to allocate bpf_async_cb structures.
https://git.kernel.org/bpf/bpf/c/5fb750e8a9ae
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2025-10-15 10:31 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-15 0:07 [PATCH v2 bpf] bpf: Replace bpf_map_kmalloc_node() with kmalloc_nolock() to allocate bpf_async_cb structures Alexei Starovoitov
2025-10-15 1:59 ` Harry Yoo
2025-10-15 2:12 ` Alexei Starovoitov
2025-10-15 10:06 ` Vlastimil Babka
2025-10-15 10:31 ` patchwork-bot+netdevbpf
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox