From: Shakeel Butt <shakeel.butt@linux.dev>
To: Tejun Heo <tj@kernel.org>,
Andrew Morton <akpm@linux-foundation.org>,
Alexei Starovoitov <ast@kernel.org>
Cc: "Johannes Weiner" <hannes@cmpxchg.org>,
"Michal Hocko" <mhocko@kernel.org>,
"Roman Gushchin" <roman.gushchin@linux.dev>,
"Muchun Song" <muchun.song@linux.dev>,
"Yosry Ahmed" <yosry.ahmed@linux.dev>,
"Michal Koutný" <mkoutny@suse.com>,
"Vlastimil Babka" <vbabka@suse.cz>,
"Sebastian Andrzej Siewior" <bigeasy@linutronix.de>,
"JP Kobryn" <inwardvessel@gmail.com>,
bpf@vger.kernel.org, linux-mm@kvack.org, cgroups@vger.kernel.org,
linux-kernel@vger.kernel.org,
"Meta kernel team" <kernel-team@meta.com>
Subject: [RFC PATCH 3/3] cgroup: make css_rstat_updated nmi safe
Date: Mon, 28 Apr 2025 23:12:09 -0700 [thread overview]
Message-ID: <20250429061211.1295443-4-shakeel.butt@linux.dev> (raw)
In-Reply-To: <20250429061211.1295443-1-shakeel.butt@linux.dev>
To make css_rstat_updated() able to safely run in nmi context, it
cannot spin on locks and instead has to do a trylock on the per-cpu
per-ss raw spinlock. This patch implements the backlog mechanism to
handle the failure in acquiring the per-cpu per-ss raw spinlock.
Each subsystem provides a per-cpu lockless list on which the kernel
stores the css given to css_rstat_updated() on trylock failure. These
lockless lists serve as backlog. On cgroup stats flushing code path, the
kernel first processes all the per-cpu lockless backlog lists of the
given ss and then proceeds to flush the update stat trees.
With css_rstat_updated() being nmi safe, the memcg stats can and will be
converted to be nmi safe to enable nmi safe mem charging.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
kernel/cgroup/rstat.c | 99 +++++++++++++++++++++++++++++++++----------
1 file changed, 76 insertions(+), 23 deletions(-)
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d3092b4c85d7..ac533e46afa9 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -11,6 +11,7 @@
static DEFINE_SPINLOCK(rstat_base_lock);
static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -42,6 +43,13 @@ static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
}
+static struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
+{
+ if (ss)
+ return per_cpu_ptr(ss->lhead, cpu);
+ return per_cpu_ptr(&rstat_backlog_list, cpu);
+}
+
/*
* Helper functions for rstat per CPU locks.
*
@@ -86,6 +94,21 @@ unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
return flags;
}
+static __always_inline
+bool _css_rstat_cpu_trylock(struct cgroup_subsys_state *css, int cpu,
+ unsigned long *flags)
+{
+ struct cgroup *cgrp = css->cgroup;
+ raw_spinlock_t *cpu_lock;
+ bool contended;
+
+ cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
+ contended = !raw_spin_trylock_irqsave(cpu_lock, *flags);
+ if (contended)
+ trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+ return !contended;
+}
+
static __always_inline
void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
unsigned long flags, const bool fast_path)
@@ -102,32 +125,16 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
raw_spin_unlock_irqrestore(cpu_lock, flags);
}
-/**
- * css_rstat_updated - keep track of updated rstat_cpu
- * @css: target cgroup subsystem state
- * @cpu: cpu on which rstat_cpu was updated
- *
- * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * css_rstat_cpu definition for details.
- */
-__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+static void css_add_to_backlog(struct cgroup_subsys_state *css, int cpu)
{
- unsigned long flags;
-
- /*
- * Speculative already-on-list test. This may race leading to
- * temporary inaccuracies, which is fine.
- *
- * Because @parent's updated_children is terminated with @parent
- * instead of NULL, we can tell whether @css is on the list by
- * testing the next pointer for NULL.
- */
- if (data_race(css_rstat_cpu(css, cpu)->updated_next))
- return;
+ struct llist_head *lhead = ss_lhead_cpu(css->ss, cpu);
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
- flags = _css_rstat_cpu_lock(css, cpu, true);
+ llist_add_iff_not_on_list(&rstatc->lnode, lhead);
+}
+static void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+{
/* put @css and all ancestors on the corresponding updated lists */
while (true) {
struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
@@ -153,6 +160,51 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
css = parent;
}
+}
+
+static void css_process_backlog(struct cgroup_subsys *ss, int cpu)
+{
+ struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+ struct llist_node *lnode;
+
+ while ((lnode = llist_del_first_init(lhead))) {
+ struct css_rstat_cpu *rstatc;
+
+ rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+ __css_rstat_updated(rstatc->owner, cpu);
+ }
+}
+
+/**
+ * css_rstat_updated - keep track of updated rstat_cpu
+ * @css: target cgroup subsystem state
+ * @cpu: cpu on which rstat_cpu was updated
+ *
+ * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
+ * rstat_cpu->updated_children list. See the comment on top of
+ * css_rstat_cpu definition for details.
+ */
+__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+{
+ unsigned long flags;
+
+ /*
+ * Speculative already-on-list test. This may race leading to
+ * temporary inaccuracies, which is fine.
+ *
+ * Because @parent's updated_children is terminated with @parent
+ * instead of NULL, we can tell whether @css is on the list by
+ * testing the next pointer for NULL.
+ */
+ if (data_race(css_rstat_cpu(css, cpu)->updated_next))
+ return;
+
+ if (!_css_rstat_cpu_trylock(css, cpu, &flags)) {
+ css_add_to_backlog(css, cpu);
+ return;
+ }
+
+ __css_rstat_updated(css, cpu);
_css_rstat_cpu_unlock(css, cpu, flags, true);
}
@@ -255,6 +307,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
flags = _css_rstat_cpu_lock(root, cpu, false);
+ css_process_backlog(root->ss, cpu);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
goto unlock_ret;
--
2.47.1
next prev parent reply other threads:[~2025-04-29 6:13 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-04-29 6:12 [RFC PATCH 0/3] cgroup: nmi safe css_rstat_updated Shakeel Butt
2025-04-29  6:12 ` [RFC PATCH 1/3] llist: add llist_add_iff_not_on_list() Shakeel Butt
2025-04-30 12:44   ` [RFC PATCH 1/3] llist: add llist_add_iff_not_on_list() Yosry Ahmed
2025-04-29 6:12 ` [RFC PATCH 2/3] cgroup: support to enable nmi-safe css_rstat_updated Shakeel Butt
2025-04-29 6:12 ` Shakeel Butt [this message]
2025-04-30 13:14 ` [RFC PATCH 3/3] cgroup: make css_rstat_updated nmi safe Yosry Ahmed
2025-05-01 22:10 ` Shakeel Butt
2025-05-06 9:41 ` Yosry Ahmed
2025-05-06 19:30 ` Shakeel Butt
2025-05-07 6:52 ` Yosry Ahmed
2025-04-29 6:12 ` [OFFLIST PATCH 1/2] cgroup: use separate rstat trees for each subsystem Shakeel Butt
2025-04-29 6:12 ` [OFFLIST PATCH 2/2] cgroup: use subsystem-specific rstat locks to avoid contention Shakeel Butt
2025-04-29 6:15 ` Shakeel Butt
2025-05-21 22:23 ` Klara Modin
2025-05-21 22:29 ` Tejun Heo
2025-05-21 23:23 ` Shakeel Butt
2025-05-21 23:33 ` Shakeel Butt
2025-05-21 23:47 ` JP Kobryn
2025-05-21 23:50 ` Shakeel Butt
2025-05-21 23:52 ` JP Kobryn
2025-05-21 23:47 ` Shakeel Butt
2025-04-29 6:15 ` [OFFLIST PATCH 1/2] cgroup: use separate rstat trees for each subsystem Shakeel Butt
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250429061211.1295443-4-shakeel.butt@linux.dev \
--to=shakeel.butt@linux.dev \
--cc=akpm@linux-foundation.org \
--cc=ast@kernel.org \
--cc=bigeasy@linutronix.de \
--cc=bpf@vger.kernel.org \
--cc=cgroups@vger.kernel.org \
--cc=hannes@cmpxchg.org \
--cc=inwardvessel@gmail.com \
--cc=kernel-team@meta.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=mkoutny@suse.com \
--cc=muchun.song@linux.dev \
--cc=roman.gushchin@linux.dev \
--cc=tj@kernel.org \
--cc=vbabka@suse.cz \
--cc=yosry.ahmed@linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox