From: JP Kobryn <inwardvessel@gmail.com>
To: tj@kernel.org, shakeel.butt@linux.dev, yosryahmed@google.com,
mhocko@kernel.org, hannes@cmpxchg.org, akpm@linux-foundation.org
Cc: linux-mm@kvack.org, cgroups@vger.kernel.org, kernel-team@meta.com
Subject: Re: [PATCH 3/4 v2] cgroup: separate rstat locks for subsystems
Date: Fri, 28 Feb 2025 09:37:09 -0800 [thread overview]
Message-ID: <084e5bc1-d2cd-4b3d-82ee-7cd83d2462e0@gmail.com> (raw)
In-Reply-To: <20250227215543.49928-4-inwardvessel@gmail.com>
On 2/27/25 1:55 PM, inwardvessel wrote:
> From: JP Kobryn <inwardvessel@gmail.com>
>
> Let the existing locks be dedicated to the base stats and rename them as
> such. Also add new rstat locks for each enabled subsystem. When handling
> cgroup subsystem states, distinguish between formal subsystems (memory,
> io, etc) and the base stats subsystem state (represented by
> cgroup::self) to decide on which locks to take. This change is made to
> prevent contention between subsystems when updating/flushing stats.
>
> Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
> ---
> kernel/cgroup/rstat.c | 93 +++++++++++++++++++++++++++++++++----------
> 1 file changed, 72 insertions(+), 21 deletions(-)
>
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index 88908ef9212d..b3eaefc1fd07 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -9,8 +9,12 @@
>
> #include <trace/events/cgroup.h>
>
> -static DEFINE_SPINLOCK(cgroup_rstat_lock);
> -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
> +static DEFINE_SPINLOCK(cgroup_rstat_base_lock);
> +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock);
> +
> +static spinlock_t cgroup_rstat_subsys_lock[CGROUP_SUBSYS_COUNT];
> +static DEFINE_PER_CPU(raw_spinlock_t,
> + cgroup_rstat_subsys_cpu_lock[CGROUP_SUBSYS_COUNT]);
>
> static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
>
> @@ -20,8 +24,13 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(
> return per_cpu_ptr(css->rstat_cpu, cpu);
> }
>
> +static inline bool is_base_css(struct cgroup_subsys_state *css)
> +{
> + return css->ss == NULL;
> +}
> +
> /*
> - * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
> + * Helper functions for rstat per CPU locks.
> *
> * This makes it easier to diagnose locking issues and contention in
> * production environments. The parameter @fast_path determine the
> @@ -36,12 +45,12 @@ unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
> bool contended;
>
> /*
> - * The _irqsave() is needed because cgroup_rstat_lock is
> - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
> - * this lock with the _irq() suffix only disables interrupts on
> - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
> - * interrupts on both configurations. The _irqsave() ensures
> - * that interrupts are always disabled and later restored.
> + * The _irqsave() is needed because the locks used for flushing are
> + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
> + * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
> + * kernel. The raw_spinlock_t below disables interrupts on both
> + * configurations. The _irqsave() ensures that interrupts are always
> + * disabled and later restored.
> */
> contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
> if (contended) {
> @@ -87,7 +96,7 @@ __bpf_kfunc void cgroup_rstat_updated(
> struct cgroup_subsys_state *css, int cpu)
> {
> struct cgroup *cgrp = css->cgroup;
> - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
> + raw_spinlock_t *cpu_lock;
> unsigned long flags;
>
> /*
> @@ -101,6 +110,12 @@ __bpf_kfunc void cgroup_rstat_updated(
> if (data_race(cgroup_rstat_cpu(css, cpu)->updated_next))
> return;
>
> + if (is_base_css(css))
> + cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
> + else
> + cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
> + css->ss->id;
> +
> flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
>
> /* put @css and all ancestors on the corresponding updated lists */
> @@ -208,11 +223,17 @@ static struct cgroup_subsys_state *cgroup_rstat_updated_list(
> struct cgroup_subsys_state *root, int cpu)
> {
> struct cgroup *cgrp = root->cgroup;
> - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
> struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
> struct cgroup_subsys_state *head = NULL, *parent, *child;
> + raw_spinlock_t *cpu_lock;
> unsigned long flags;
>
> + if (is_base_css(root))
> + cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu);
> + else
> + cpu_lock = per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) +
> + root->ss->id;
> +
> flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, false);
>
> /* Return NULL if this subtree is not on-list */
> @@ -315,7 +336,7 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
> struct cgroup *cgrp = css->cgroup;
> int cpu;
>
> - lockdep_assert_held(&cgroup_rstat_lock);
> + lockdep_assert_held(&lock);
I need to remove the ampersand here — the function now receives `spinlock_t *lock` as a parameter, which is already a pointer, so this should read lockdep_assert_held(lock).
>
> for_each_possible_cpu(cpu) {
> struct cgroup_subsys_state *pos;
> @@ -356,12 +377,18 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css,
> __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
> {
> struct cgroup *cgrp = css->cgroup;
> + spinlock_t *lock;
> +
> + if (is_base_css(css))
> + lock = &cgroup_rstat_base_lock;
> + else
> + lock = &cgroup_rstat_subsys_lock[css->ss->id];
>
> might_sleep();
>
> - __cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
> - cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
> - __cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
> + __cgroup_rstat_lock(lock, cgrp, -1);
> + cgroup_rstat_flush_locked(css, lock);
> + __cgroup_rstat_unlock(lock, cgrp, -1);
> }
>
> /**
> @@ -376,10 +403,16 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css)
> void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
> {
> struct cgroup *cgrp = css->cgroup;
> + spinlock_t *lock;
> +
> + if (is_base_css(css))
> + lock = &cgroup_rstat_base_lock;
> + else
> + lock = &cgroup_rstat_subsys_lock[css->ss->id];
>
> might_sleep();
> - __cgroup_rstat_lock(&cgroup_rstat_lock, cgrp, -1);
> - cgroup_rstat_flush_locked(css, &cgroup_rstat_lock);
> + __cgroup_rstat_lock(lock, cgrp, -1);
> + cgroup_rstat_flush_locked(css, lock);
> }
>
> /**
> @@ -389,7 +422,14 @@ void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css)
> void cgroup_rstat_flush_release(struct cgroup_subsys_state *css)
> {
> struct cgroup *cgrp = css->cgroup;
> - __cgroup_rstat_unlock(&cgroup_rstat_lock, cgrp, -1);
> + spinlock_t *lock;
> +
> + if (is_base_css(css))
> + lock = &cgroup_rstat_base_lock;
> + else
> + lock = &cgroup_rstat_subsys_lock[css->ss->id];
> +
> + __cgroup_rstat_unlock(lock, cgrp, -1);
> }
>
> int cgroup_rstat_init(struct cgroup_subsys_state *css)
> @@ -435,10 +475,21 @@ void cgroup_rstat_exit(struct cgroup_subsys_state *css)
>
> void __init cgroup_rstat_boot(void)
> {
> - int cpu;
> + struct cgroup_subsys *ss;
> + int cpu, ssid;
>
> - for_each_possible_cpu(cpu)
> - raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
> + for_each_subsys(ss, ssid) {
> + spin_lock_init(&cgroup_rstat_subsys_lock[ssid]);
> + }
> +
> + for_each_possible_cpu(cpu) {
> + raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu));
> +
> + for_each_subsys(ss, ssid) {
> + raw_spin_lock_init(
> + per_cpu_ptr(cgroup_rstat_subsys_cpu_lock, cpu) + ssid);
> + }
> + }
> }
>
> /*
next prev parent reply other threads:[~2025-02-28 17:37 UTC|newest]
Thread overview: 39+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-27 21:55 [PATCH 0/4 v2] cgroup: separate rstat trees inwardvessel
2025-02-27 21:55 ` [PATCH 1/4 v2] cgroup: move cgroup_rstat from cgroup to cgroup_subsys_state inwardvessel
2025-02-27 22:43 ` Shakeel Butt
2025-02-28 19:04 ` Yosry Ahmed
2025-03-01 1:06 ` JP Kobryn
2025-03-01 1:25 ` Yosry Ahmed
2025-03-01 1:30 ` JP Kobryn
2025-03-03 18:18 ` Shakeel Butt
2025-03-03 18:21 ` Yosry Ahmed
2025-03-03 15:20 ` Michal Koutný
2025-03-03 19:31 ` JP Kobryn
2025-02-27 21:55 ` [PATCH 2/4 v2] cgroup: rstat lock indirection inwardvessel
2025-03-03 15:21 ` Michal Koutný
2025-02-27 21:55 ` [PATCH 3/4 v2] cgroup: separate rstat locks for subsystems inwardvessel
2025-02-27 22:52 ` Shakeel Butt
2025-02-28 16:07 ` JP Kobryn
2025-02-28 17:37 ` JP Kobryn [this message]
2025-02-28 19:20 ` Yosry Ahmed
2025-03-06 21:47 ` JP Kobryn
2025-03-01 23:00 ` kernel test robot
2025-03-03 15:22 ` Michal Koutný
2025-03-03 18:29 ` Yosry Ahmed
2025-03-03 18:40 ` Shakeel Butt
2025-03-03 19:23 ` JP Kobryn
2025-03-03 19:39 ` Shakeel Butt
2025-03-03 19:50 ` Yosry Ahmed
2025-03-03 20:09 ` Shakeel Butt
2025-03-03 18:49 ` Michal Koutný
2025-03-10 17:59 ` JP Kobryn
2025-03-11 13:49 ` Michal Koutný
2025-03-06 21:36 ` JP Kobryn
2025-03-03 23:49 ` kernel test robot
2025-02-27 21:55 ` [PATCH 4/4 v2] cgroup: separate rstat list pointers from base stats inwardvessel
2025-02-27 23:01 ` Shakeel Butt
2025-02-28 20:33 ` Yosry Ahmed
2025-02-28 18:22 ` [PATCH 0/4 v2] cgroup: separate rstat trees Yosry Ahmed
2025-03-03 15:19 ` Michal Koutný
2025-03-06 1:07 ` JP Kobryn
2025-03-11 13:49 ` Michal Koutný
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=084e5bc1-d2cd-4b3d-82ee-7cd83d2462e0@gmail.com \
--to=inwardvessel@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=cgroups@vger.kernel.org \
--cc=hannes@cmpxchg.org \
--cc=kernel-team@meta.com \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=shakeel.butt@linux.dev \
--cc=tj@kernel.org \
--cc=yosryahmed@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox