* [PATCH 1/3] cgroup: reduce cgroup_file_kn_lock hold time in cgroup_file_notify()
2026-02-28 14:20 [PATCH 0/3] cgroup: improve cgroup_file_notify() scalability Shakeel Butt
@ 2026-02-28 14:20 ` Shakeel Butt
2026-02-28 14:20 ` [PATCH 2/3] cgroup: add lockless fast-path checks to cgroup_file_notify() Shakeel Butt
2026-02-28 14:20 ` [PATCH 3/3] cgroup: replace global cgroup_file_kn_lock with per-cgroup_file lock Shakeel Butt
2 siblings, 0 replies; 10+ messages in thread
From: Shakeel Butt @ 2026-02-28 14:20 UTC (permalink / raw)
To: Tejun Heo
Cc: Johannes Weiner, Michal Koutný,
Roman Gushchin, Kuniyuki Iwashima, Daniel Sedlak,
Meta kernel team, linux-mm, netdev, cgroups, linux-kernel,
Jakub Kicinski
cgroup_file_notify() calls kernfs_notify() while holding the global
cgroup_file_kn_lock. kernfs_notify() does non-trivial work including
wake_up_interruptible() and acquisition of a second global spinlock
(kernfs_notify_lock), inflating the hold time.
Take a kernfs_get() reference under the lock and call kernfs_notify()
after dropping it, following the pattern from cgroup_file_show().
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reported-by: Jakub Kicinski <kuba@kernel.org>
---
kernel/cgroup/cgroup.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index be1d71dda317..33282c7d71e4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4687,6 +4687,7 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
void cgroup_file_notify(struct cgroup_file *cfile)
{
unsigned long flags;
+ struct kernfs_node *kn = NULL;
spin_lock_irqsave(&cgroup_file_kn_lock, flags);
if (cfile->kn) {
@@ -4696,11 +4697,17 @@ void cgroup_file_notify(struct cgroup_file *cfile)
if (time_in_range(jiffies, last, next)) {
timer_reduce(&cfile->notify_timer, next);
} else {
- kernfs_notify(cfile->kn);
+ kn = cfile->kn;
+ kernfs_get(kn);
cfile->notified_at = jiffies;
}
}
spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
+
+ if (kn) {
+ kernfs_notify(kn);
+ kernfs_put(kn);
+ }
}
EXPORT_SYMBOL_GPL(cgroup_file_notify);
--
2.47.3
^ permalink raw reply	[flat|nested] 10+ messages in thread
* [PATCH 2/3] cgroup: add lockless fast-path checks to cgroup_file_notify()
2026-02-28 14:20 [PATCH 0/3] cgroup: improve cgroup_file_notify() scalability Shakeel Butt
2026-02-28 14:20 ` [PATCH 1/3] cgroup: reduce cgroup_file_kn_lock hold time in cgroup_file_notify() Shakeel Butt
@ 2026-02-28 14:20 ` Shakeel Butt
2026-03-02 1:50 ` Chen Ridong
2026-02-28 14:20 ` [PATCH 3/3] cgroup: replace global cgroup_file_kn_lock with per-cgroup_file lock Shakeel Butt
2 siblings, 1 reply; 10+ messages in thread
From: Shakeel Butt @ 2026-02-28 14:20 UTC (permalink / raw)
To: Tejun Heo
Cc: Johannes Weiner, Michal Koutný,
Roman Gushchin, Kuniyuki Iwashima, Daniel Sedlak,
Meta kernel team, linux-mm, netdev, cgroups, linux-kernel,
Jakub Kicinski
Add two lockless checks before acquiring the lock:
1. READ_ONCE(cfile->kn) NULL check to skip torn-down files.
2. READ_ONCE(cfile->notified_at) check to skip when within the
rate-limit window (~10ms).
Both checks have safe error directions -- a stale read can only cause
unnecessary lock acquisition, never a missed notification. Annotate
all write sites with WRITE_ONCE() to pair with the lockless readers.
The trade-off is that trailing timer_reduce() calls during bursts are
skipped, so the deferred notification that delivers the final state
may be lost. This is acceptable for the primary callers like
__memcg_memory_event() where events keep arriving.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reported-by: Jakub Kicinski <kuba@kernel.org>
---
kernel/cgroup/cgroup.c | 21 ++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 33282c7d71e4..5473ebd0f6c1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1749,7 +1749,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
struct cgroup_file *cfile = (void *)css + cft->file_offset;
spin_lock_irq(&cgroup_file_kn_lock);
- cfile->kn = NULL;
+ WRITE_ONCE(cfile->kn, NULL);
spin_unlock_irq(&cgroup_file_kn_lock);
timer_delete_sync(&cfile->notify_timer);
@@ -4430,7 +4430,7 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
spin_lock_irq(&cgroup_file_kn_lock);
- cfile->kn = kn;
+ WRITE_ONCE(cfile->kn, kn);
spin_unlock_irq(&cgroup_file_kn_lock);
}
@@ -4686,20 +4686,27 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
*/
void cgroup_file_notify(struct cgroup_file *cfile)
{
- unsigned long flags;
+ unsigned long flags, last, next;
struct kernfs_node *kn = NULL;
+ if (!READ_ONCE(cfile->kn))
+ return;
+
+ last = READ_ONCE(cfile->notified_at);
+ if (time_before_eq(jiffies, last + CGROUP_FILE_NOTIFY_MIN_INTV))
+ return;
+
spin_lock_irqsave(&cgroup_file_kn_lock, flags);
if (cfile->kn) {
- unsigned long last = cfile->notified_at;
- unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
+ last = cfile->notified_at;
+ next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
- if (time_in_range(jiffies, last, next)) {
+ if (time_before_eq(jiffies, next)) {
timer_reduce(&cfile->notify_timer, next);
} else {
kn = cfile->kn;
kernfs_get(kn);
- cfile->notified_at = jiffies;
+ WRITE_ONCE(cfile->notified_at, jiffies);
}
}
spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
--
2.47.3
^ permalink raw reply	[flat|nested] 10+ messages in thread
* Re: [PATCH 2/3] cgroup: add lockless fast-path checks to cgroup_file_notify()
2026-02-28 14:20 ` [PATCH 2/3] cgroup: add lockless fast-path checks to cgroup_file_notify() Shakeel Butt
@ 2026-03-02 1:50 ` Chen Ridong
2026-03-02 16:14 ` Shakeel Butt
0 siblings, 1 reply; 10+ messages in thread
From: Chen Ridong @ 2026-03-02 1:50 UTC (permalink / raw)
To: Shakeel Butt, Tejun Heo
Cc: Johannes Weiner, Michal Koutný,
Roman Gushchin, Kuniyuki Iwashima, Daniel Sedlak,
Meta kernel team, linux-mm, netdev, cgroups, linux-kernel,
Jakub Kicinski
Hi Shakeel,
Good series to move away from the global lock.
On 2026/2/28 22:20, Shakeel Butt wrote:
> Add two lockless checks before acquiring the lock:
>
> 1. READ_ONCE(cfile->kn) NULL check to skip torn-down files.
> 2. READ_ONCE(cfile->notified_at) check to skip when within the
> rate-limit window (~10ms).
>
> Both checks have safe error directions -- a stale read can only cause
> unnecessary lock acquisition, never a missed notification. Annotate
> all write sites with WRITE_ONCE() to pair with the lockless readers.
>
> The trade-off is that trailing timer_reduce() calls during bursts are
> skipped, so the deferred notification that delivers the final state
> may be lost. This is acceptable for the primary callers like
> __memcg_memory_event() where events keep arriving.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> ---
> kernel/cgroup/cgroup.c | 21 ++++++++++++++-------
> 1 file changed, 14 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 33282c7d71e4..5473ebd0f6c1 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1749,7 +1749,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
> struct cgroup_file *cfile = (void *)css + cft->file_offset;
>
> spin_lock_irq(&cgroup_file_kn_lock);
> - cfile->kn = NULL;
> + WRITE_ONCE(cfile->kn, NULL);
> spin_unlock_irq(&cgroup_file_kn_lock);
>
> timer_delete_sync(&cfile->notify_timer);
> @@ -4430,7 +4430,7 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
> timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
>
> spin_lock_irq(&cgroup_file_kn_lock);
> - cfile->kn = kn;
> + WRITE_ONCE(cfile->kn, kn);
> spin_unlock_irq(&cgroup_file_kn_lock);
> }
>
> @@ -4686,20 +4686,27 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
> */
> void cgroup_file_notify(struct cgroup_file *cfile)
> {
> - unsigned long flags;
> + unsigned long flags, last, next;
> struct kernfs_node *kn = NULL;
>
> + if (!READ_ONCE(cfile->kn))
> + return;
> +
> + last = READ_ONCE(cfile->notified_at);
> + if (time_before_eq(jiffies, last + CGROUP_FILE_NOTIFY_MIN_INTV))
> + return;
> +
Previously, if a notification arrived within the rate-limit window, we would
still call timer_reduce(&cfile->notify_timer, next) to schedule a deferred
notification.
With this change, returning early here bypasses that timer scheduling entirely.
Does this risk missing notifications that would have been delivered by the timer?
> spin_lock_irqsave(&cgroup_file_kn_lock, flags);
> if (cfile->kn) {
> - unsigned long last = cfile->notified_at;
> - unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
> + last = cfile->notified_at;
> + next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
>
> - if (time_in_range(jiffies, last, next)) {
> + if (time_before_eq(jiffies, next)) {
> timer_reduce(&cfile->notify_timer, next);
> } else {
> kn = cfile->kn;
> kernfs_get(kn);
> - cfile->notified_at = jiffies;
> + WRITE_ONCE(cfile->notified_at, jiffies);
> }
> }
> spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
--
Best regards,
Ridong
^ permalink raw reply	[flat|nested] 10+ messages in thread
* Re: [PATCH 2/3] cgroup: add lockless fast-path checks to cgroup_file_notify()
2026-03-02 1:50 ` Chen Ridong
@ 2026-03-02 16:14 ` Shakeel Butt
2026-03-02 17:00 ` Shakeel Butt
2026-03-03 3:08 ` Chen Ridong
0 siblings, 2 replies; 10+ messages in thread
From: Shakeel Butt @ 2026-03-02 16:14 UTC (permalink / raw)
To: Chen Ridong
Cc: Tejun Heo, Johannes Weiner, Michal Koutný,
Roman Gushchin, Kuniyuki Iwashima, Daniel Sedlak,
Meta kernel team, linux-mm, netdev, cgroups, linux-kernel,
Jakub Kicinski
Hi Chen, thanks for taking a look.
On Mon, Mar 02, 2026 at 09:50:53AM +0800, Chen Ridong wrote:
>
> Hi Shakeel,
>
> Good series to move away from the global lock.
>
> On 2026/2/28 22:20, Shakeel Butt wrote:
> > Add two lockless checks before acquiring the lock:
> >
> > 1. READ_ONCE(cfile->kn) NULL check to skip torn-down files.
> > 2. READ_ONCE(cfile->notified_at) check to skip when within the
> > rate-limit window (~10ms).
> >
> > Both checks have safe error directions -- a stale read can only cause
> > unnecessary lock acquisition, never a missed notification. Annotate
> > all write sites with WRITE_ONCE() to pair with the lockless readers.
> >
> > The trade-off is that trailing timer_reduce() calls during bursts are
> > skipped, so the deferred notification that delivers the final state
> > may be lost. This is acceptable for the primary callers like
> > __memcg_memory_event() where events keep arriving.
> >
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> > Reported-by: Jakub Kicinski <kuba@kernel.org>
> > ---
> > kernel/cgroup/cgroup.c | 21 ++++++++++++++-------
> > 1 file changed, 14 insertions(+), 7 deletions(-)
> >
> > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> > index 33282c7d71e4..5473ebd0f6c1 100644
> > --- a/kernel/cgroup/cgroup.c
> > +++ b/kernel/cgroup/cgroup.c
> > @@ -1749,7 +1749,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
> > struct cgroup_file *cfile = (void *)css + cft->file_offset;
> >
> > spin_lock_irq(&cgroup_file_kn_lock);
> > - cfile->kn = NULL;
> > + WRITE_ONCE(cfile->kn, NULL);
> > spin_unlock_irq(&cgroup_file_kn_lock);
> >
> > timer_delete_sync(&cfile->notify_timer);
> > @@ -4430,7 +4430,7 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
> > timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
> >
> > spin_lock_irq(&cgroup_file_kn_lock);
> > - cfile->kn = kn;
> > + WRITE_ONCE(cfile->kn, kn);
> > spin_unlock_irq(&cgroup_file_kn_lock);
> > }
> >
> > @@ -4686,20 +4686,27 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
> > */
> > void cgroup_file_notify(struct cgroup_file *cfile)
> > {
> > - unsigned long flags;
> > + unsigned long flags, last, next;
> > struct kernfs_node *kn = NULL;
> >
> > + if (!READ_ONCE(cfile->kn))
> > + return;
> > +
> > + last = READ_ONCE(cfile->notified_at);
> > + if (time_before_eq(jiffies, last + CGROUP_FILE_NOTIFY_MIN_INTV))
> > + return;
> > +
>
> Previously, if a notification arrived within the rate-limit window, we would
> still call timer_reduce(&cfile->notify_timer, next) to schedule a deferred
> notification.
>
> With this change, returning early here bypasses that timer scheduling entirely.
> Does this risk missing notifications that would have been delivered by the timer?
>
You are indeed right that this can cause missed notifications. After giving some
thought I think the lockless check-and-return can be pretty much simplified to
timer_pending() check. If timer is active, just do nothing and the notification
will be delivered eventually.
I will send the updated version soon. Any comments on the other two patches?
^ permalink raw reply	[flat|nested] 10+ messages in thread
* Re: [PATCH 2/3] cgroup: add lockless fast-path checks to cgroup_file_notify()
2026-03-02 16:14 ` Shakeel Butt
@ 2026-03-02 17:00 ` Shakeel Butt
2026-03-03 3:18 ` Chen Ridong
2026-03-03 3:08 ` Chen Ridong
1 sibling, 1 reply; 10+ messages in thread
From: Shakeel Butt @ 2026-03-02 17:00 UTC (permalink / raw)
To: Chen Ridong
Cc: Tejun Heo, Johannes Weiner, Michal Koutný,
Roman Gushchin, Kuniyuki Iwashima, Daniel Sedlak,
Meta kernel team, linux-mm, netdev, cgroups, linux-kernel,
Jakub Kicinski
On Mon, Mar 02, 2026 at 08:14:05AM -0800, Shakeel Butt wrote:
> Hi Chen, thanks for taking a look.
>
> On Mon, Mar 02, 2026 at 09:50:53AM +0800, Chen Ridong wrote:
> >
[...]
> > > + last = READ_ONCE(cfile->notified_at);
> > > + if (time_before_eq(jiffies, last + CGROUP_FILE_NOTIFY_MIN_INTV))
> > > + return;
> > > +
> >
> > Previously, if a notification arrived within the rate-limit window, we would
> > still call timer_reduce(&cfile->notify_timer, next) to schedule a deferred
> > notification.
> >
> > With this change, returning early here bypasses that timer scheduling entirely.
> > Does this risk missing notifications that would have been delivered by the timer?
> >
>
> You are indeed right that this can cause missed notifications. After giving some
> thought I think the lockless check-and-return can be pretty much simplified to
> timer_pending() check. If timer is active, just do nothing and the notification
> will be delivered eventually.
>
> I will send the updated version soon. Any comments on the other two patches?
>
Something like the following:
From 598199723b50813b015393122796f6775eee02d7 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Sat, 28 Feb 2026 04:01:28 -0800
Subject: [PATCH] cgroup: add lockless fast-path checks to cgroup_file_notify()
Add two lockless checks before acquiring the lock:
1. READ_ONCE(cfile->kn) NULL check to skip torn-down files.
2. timer_pending() check to skip when a deferred notification
timer is already armed.
Both checks have safe error directions -- a stale read can only
cause unnecessary lock acquisition, never a missed notification.
Annotate cfile->kn write sites with WRITE_ONCE() to pair with the
lockless reader.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reported-by: Jakub Kicinski <kuba@kernel.org>
---
kernel/cgroup/cgroup.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 2b298a2cf206..6e816d27ee25 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1749,7 +1749,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
struct cgroup_file *cfile = (void *)css + cft->file_offset;
spin_lock_irq(&cgroup_file_kn_lock);
- cfile->kn = NULL;
+ WRITE_ONCE(cfile->kn, NULL);
spin_unlock_irq(&cgroup_file_kn_lock);
timer_delete_sync(&cfile->notify_timer);
@@ -4430,7 +4430,7 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
spin_lock_irq(&cgroup_file_kn_lock);
- cfile->kn = kn;
+ WRITE_ONCE(cfile->kn, kn);
spin_unlock_irq(&cgroup_file_kn_lock);
}
@@ -4689,6 +4689,12 @@ void cgroup_file_notify(struct cgroup_file *cfile)
unsigned long flags;
struct kernfs_node *kn = NULL;
+ if (!READ_ONCE(cfile->kn))
+ return;
+
+ if (timer_pending(&cfile->notify_timer))
+ return;
+
spin_lock_irqsave(&cgroup_file_kn_lock, flags);
if (cfile->kn) {
unsigned long last = cfile->notified_at;
--
2.47.3
^ permalink raw reply	[flat|nested] 10+ messages in thread
* Re: [PATCH 2/3] cgroup: add lockless fast-path checks to cgroup_file_notify()
2026-03-02 17:00 ` Shakeel Butt
@ 2026-03-03 3:18 ` Chen Ridong
2026-03-03 4:01 ` Shakeel Butt
0 siblings, 1 reply; 10+ messages in thread
From: Chen Ridong @ 2026-03-03 3:18 UTC (permalink / raw)
To: Shakeel Butt
Cc: Tejun Heo, Johannes Weiner, Michal Koutný,
Roman Gushchin, Kuniyuki Iwashima, Daniel Sedlak,
Meta kernel team, linux-mm, netdev, cgroups, linux-kernel,
Jakub Kicinski
On 2026/3/3 1:00, Shakeel Butt wrote:
> On Mon, Mar 02, 2026 at 08:14:05AM -0800, Shakeel Butt wrote:
>> Hi Chen, thanks for taking a look.
>>
>> On Mon, Mar 02, 2026 at 09:50:53AM +0800, Chen Ridong wrote:
>>>
> [...]
>>>> + last = READ_ONCE(cfile->notified_at);
>>>> + if (time_before_eq(jiffies, last + CGROUP_FILE_NOTIFY_MIN_INTV))
>>>> + return;
>>>> +
>>>
>>> Previously, if a notification arrived within the rate-limit window, we would
>>> still call timer_reduce(&cfile->notify_timer, next) to schedule a deferred
>>> notification.
>>>
>>> With this change, returning early here bypasses that timer scheduling entirely.
>>> Does this risk missing notifications that would have been delivered by the timer?
>>>
>>
>> You are indeed right that this can cause missed notifications. After giving some
>> thought I think the lockless check-and-return can be pretty much simplified to
>> timer_pending() check. If timer is active, just do nothing and the notification
>> will be delivered eventually.
>>
>> I will send the updated version soon. Any comments on the other two patches?
>>
>
> Something like the following:
>
> From 598199723b50813b015393122796f6775eee02d7 Mon Sep 17 00:00:00 2001
> From: Shakeel Butt <shakeel.butt@linux.dev>
> Date: Sat, 28 Feb 2026 04:01:28 -0800
> Subject: [PATCH] cgroup: add lockless fast-path checks to cgroup_file_notify()
>
> Add two lockless checks before acquiring the lock:
>
> 1. READ_ONCE(cfile->kn) NULL check to skip torn-down files.
> 2. timer_pending() check to skip when a deferred notification
> timer is already armed.
>
> Both checks have safe error directions -- a stale read can only
> cause unnecessary lock acquisition, never a missed notification.
>
> Annotate cfile->kn write sites with WRITE_ONCE() to pair with the
> lockless reader.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> ---
> kernel/cgroup/cgroup.c | 10 ++++++++--
> 1 file changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 2b298a2cf206..6e816d27ee25 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1749,7 +1749,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
> struct cgroup_file *cfile = (void *)css + cft->file_offset;
>
> spin_lock_irq(&cgroup_file_kn_lock);
> - cfile->kn = NULL;
> + WRITE_ONCE(cfile->kn, NULL);
> spin_unlock_irq(&cgroup_file_kn_lock);
>
> timer_delete_sync(&cfile->notify_timer);
> @@ -4430,7 +4430,7 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
> timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
>
> spin_lock_irq(&cgroup_file_kn_lock);
> - cfile->kn = kn;
> + WRITE_ONCE(cfile->kn, kn);
> spin_unlock_irq(&cgroup_file_kn_lock);
> }
>
> @@ -4689,6 +4689,12 @@ void cgroup_file_notify(struct cgroup_file *cfile)
> unsigned long flags;
> struct kernfs_node *kn = NULL;
>
> + if (!READ_ONCE(cfile->kn))
> + return;
> +
> + if (timer_pending(&cfile->notify_timer))
> + return;
> +
The added timer_pending() check seems problematic. According to the function's
comment, callers must ensure serialization with other timer operations. Here
we're checking timer_pending() locklessly, which means we might:
1. See an inconsistent state if another CPU is concurrently modifying the timer
2. Race with del_timer() or mod_timer() from other contexts
> spin_lock_irqsave(&cgroup_file_kn_lock, flags);
> if (cfile->kn) {
> unsigned long last = cfile->notified_at;
--
Best regards,
Ridong
^ permalink raw reply	[flat|nested] 10+ messages in thread
* Re: [PATCH 2/3] cgroup: add lockless fast-path checks to cgroup_file_notify()
2026-03-03 3:18 ` Chen Ridong
@ 2026-03-03 4:01 ` Shakeel Butt
0 siblings, 0 replies; 10+ messages in thread
From: Shakeel Butt @ 2026-03-03 4:01 UTC (permalink / raw)
To: Chen Ridong
Cc: Tejun Heo, Johannes Weiner, Michal Koutný,
Roman Gushchin, Kuniyuki Iwashima, Daniel Sedlak,
Meta kernel team, linux-mm, netdev, cgroups, linux-kernel,
Jakub Kicinski
On Tue, Mar 03, 2026 at 11:18:17AM +0800, Chen Ridong wrote:
>
>
[...]
> > @@ -4689,6 +4689,12 @@ void cgroup_file_notify(struct cgroup_file *cfile)
> > unsigned long flags;
> > struct kernfs_node *kn = NULL;
> >
> > + if (!READ_ONCE(cfile->kn))
> > + return;
> > +
> > + if (timer_pending(&cfile->notify_timer))
> > + return;
> > +
>
> The added timer_pending() check seems problematic. According to the function's
> comment, callers must ensure serialization with other timer operations. Here
> we're checking timer_pending() locklessly, which means we might:
>
That comment seems outdated. Check the commit 90c018942c2ba ("timer: Use
hlist_unhashed_lockless() in timer_pending()").
> 1. See an inconsistent state if another CPU is concurrently modifying the timer
>
It will not see inconsistent state but it can see stale state which is totally
fine. At worst we will take the lock and recheck but we will never miss the
notifications.
> 2. Race with del_timer() or mod_timer() from other contexts
>
> > spin_lock_irqsave(&cgroup_file_kn_lock, flags);
> > if (cfile->kn) {
> > unsigned long last = cfile->notified_at;
>
> --
> Best regards,
> Ridong
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/3] cgroup: add lockless fast-path checks to cgroup_file_notify()
2026-03-02 16:14 ` Shakeel Butt
2026-03-02 17:00 ` Shakeel Butt
@ 2026-03-03 3:08 ` Chen Ridong
1 sibling, 0 replies; 10+ messages in thread
From: Chen Ridong @ 2026-03-03 3:08 UTC (permalink / raw)
To: Shakeel Butt
Cc: Tejun Heo, Johannes Weiner, Michal Koutný,
Roman Gushchin, Kuniyuki Iwashima, Daniel Sedlak,
Meta kernel team, linux-mm, netdev, cgroups, linux-kernel,
Jakub Kicinski
On 2026/3/3 0:14, Shakeel Butt wrote:
> Hi Chen, thanks for taking a look.
>
> On Mon, Mar 02, 2026 at 09:50:53AM +0800, Chen Ridong wrote:
>>
>> Hi Shakeel,
>>
>> Good series to move away from the global lock.
>>
>> On 2026/2/28 22:20, Shakeel Butt wrote:
>>> Add two lockless checks before acquiring the lock:
>>>
>>> 1. READ_ONCE(cfile->kn) NULL check to skip torn-down files.
>>> 2. READ_ONCE(cfile->notified_at) check to skip when within the
>>> rate-limit window (~10ms).
>>>
>>> Both checks have safe error directions -- a stale read can only cause
>>> unnecessary lock acquisition, never a missed notification. Annotate
>>> all write sites with WRITE_ONCE() to pair with the lockless readers.
>>>
>>> The trade-off is that trailing timer_reduce() calls during bursts are
>>> skipped, so the deferred notification that delivers the final state
>>> may be lost. This is acceptable for the primary callers like
>>> __memcg_memory_event() where events keep arriving.
>>>
>>> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
>>> Reported-by: Jakub Kicinski <kuba@kernel.org>
>>> ---
>>> kernel/cgroup/cgroup.c | 21 ++++++++++++++-------
>>> 1 file changed, 14 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
>>> index 33282c7d71e4..5473ebd0f6c1 100644
>>> --- a/kernel/cgroup/cgroup.c
>>> +++ b/kernel/cgroup/cgroup.c
>>> @@ -1749,7 +1749,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
>>> struct cgroup_file *cfile = (void *)css + cft->file_offset;
>>>
>>> spin_lock_irq(&cgroup_file_kn_lock);
>>> - cfile->kn = NULL;
>>> + WRITE_ONCE(cfile->kn, NULL);
>>> spin_unlock_irq(&cgroup_file_kn_lock);
>>>
>>> timer_delete_sync(&cfile->notify_timer);
>>> @@ -4430,7 +4430,7 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
>>> timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
>>>
>>> spin_lock_irq(&cgroup_file_kn_lock);
>>> - cfile->kn = kn;
>>> + WRITE_ONCE(cfile->kn, kn);
>>> spin_unlock_irq(&cgroup_file_kn_lock);
>>> }
>>>
>>> @@ -4686,20 +4686,27 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
>>> */
>>> void cgroup_file_notify(struct cgroup_file *cfile)
>>> {
>>> - unsigned long flags;
>>> + unsigned long flags, last, next;
>>> struct kernfs_node *kn = NULL;
>>>
>>> + if (!READ_ONCE(cfile->kn))
>>> + return;
>>> +
>>> + last = READ_ONCE(cfile->notified_at);
>>> + if (time_before_eq(jiffies, last + CGROUP_FILE_NOTIFY_MIN_INTV))
>>> + return;
>>> +
>>
>> Previously, if a notification arrived within the rate-limit window, we would
>> still call timer_reduce(&cfile->notify_timer, next) to schedule a deferred
>> notification.
>>
>> With this change, returning early here bypasses that timer scheduling entirely.
>> Does this risk missing notifications that would have been delivered by the timer?
>>
>
> You are indeed right that this can cause missed notifications. After giving some
> thought I think the lockless check-and-return can be pretty much simplified to
> timer_pending() check. If timer is active, just do nothing and the notification
> will be delivered eventually.
>
> I will send the updated version soon. Any comments on the other two patches?
>
The other two patches are fine to me.
--
Best regards,
Ridong
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 3/3] cgroup: replace global cgroup_file_kn_lock with per-cgroup_file lock
2026-02-28 14:20 [PATCH 0/3] cgroup: improve cgroup_file_notify() scalability Shakeel Butt
2026-02-28 14:20 ` [PATCH 1/3] cgroup: reduce cgroup_file_kn_lock hold time in cgroup_file_notify() Shakeel Butt
2026-02-28 14:20 ` [PATCH 2/3] cgroup: add lockless fast-path checks to cgroup_file_notify() Shakeel Butt
@ 2026-02-28 14:20 ` Shakeel Butt
2 siblings, 0 replies; 10+ messages in thread
From: Shakeel Butt @ 2026-02-28 14:20 UTC (permalink / raw)
To: Tejun Heo
Cc: Johannes Weiner, Michal Koutný,
Roman Gushchin, Kuniyuki Iwashima, Daniel Sedlak,
Meta kernel team, linux-mm, netdev, cgroups, linux-kernel,
Jakub Kicinski
Replace the global cgroup_file_kn_lock with a per-cgroup_file spinlock
to eliminate cross-cgroup contention as it is not really protecting
data shared between different cgroups.
The lock is initialized in cgroup_add_file() alongside timer_setup().
No lock acquisition is needed during initialization since the cgroup
directory is being populated under cgroup_mutex and no concurrent
accessors exist at that point.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reported-by: Jakub Kicinski <kuba@kernel.org>
---
include/linux/cgroup-defs.h | 1 +
kernel/cgroup/cgroup.c | 24 ++++++++----------------
2 files changed, 9 insertions(+), 16 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index bb92f5c169ca..ba26b5d05ce3 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -167,6 +167,7 @@ struct cgroup_file {
struct kernfs_node *kn;
unsigned long notified_at;
struct timer_list notify_timer;
+ spinlock_t lock;
};
/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5473ebd0f6c1..b502acad3c5c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -107,12 +107,6 @@ static bool cgroup_debug __read_mostly;
*/
static DEFINE_SPINLOCK(cgroup_idr_lock);
-/*
- * Protects cgroup_file->kn for !self csses. It synchronizes notifications
- * against file removal/re-creation across css hiding.
- */
-static DEFINE_SPINLOCK(cgroup_file_kn_lock);
-
DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
#define cgroup_assert_mutex_or_rcu_locked() \
@@ -1748,9 +1742,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
struct cgroup_file *cfile = (void *)css + cft->file_offset;
- spin_lock_irq(&cgroup_file_kn_lock);
+ spin_lock_irq(&cfile->lock);
WRITE_ONCE(cfile->kn, NULL);
- spin_unlock_irq(&cgroup_file_kn_lock);
+ spin_unlock_irq(&cfile->lock);
timer_delete_sync(&cfile->notify_timer);
}
@@ -4428,10 +4422,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
struct cgroup_file *cfile = (void *)css + cft->file_offset;
timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
-
- spin_lock_irq(&cgroup_file_kn_lock);
- WRITE_ONCE(cfile->kn, kn);
- spin_unlock_irq(&cgroup_file_kn_lock);
+ spin_lock_init(&cfile->lock);
+ cfile->kn = kn;
}
return 0;
@@ -4696,7 +4688,7 @@ void cgroup_file_notify(struct cgroup_file *cfile)
if (time_before_eq(jiffies, last + CGROUP_FILE_NOTIFY_MIN_INTV))
return;
- spin_lock_irqsave(&cgroup_file_kn_lock, flags);
+ spin_lock_irqsave(&cfile->lock, flags);
if (cfile->kn) {
last = cfile->notified_at;
next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
@@ -4709,7 +4701,7 @@ void cgroup_file_notify(struct cgroup_file *cfile)
WRITE_ONCE(cfile->notified_at, jiffies);
}
}
- spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
+ spin_unlock_irqrestore(&cfile->lock, flags);
if (kn) {
kernfs_notify(kn);
@@ -4727,10 +4719,10 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show)
{
struct kernfs_node *kn;
- spin_lock_irq(&cgroup_file_kn_lock);
+ spin_lock_irq(&cfile->lock);
kn = cfile->kn;
kernfs_get(kn);
- spin_unlock_irq(&cgroup_file_kn_lock);
+ spin_unlock_irq(&cfile->lock);
if (kn)
kernfs_show(kn, show);
--
2.47.3
^ permalink raw reply [flat|nested] 10+ messages in thread