From: Qi Zheng <qi.zheng@linux.dev>
To: hannes@cmpxchg.org, hughd@google.com, mhocko@suse.com,
roman.gushchin@linux.dev, shakeel.butt@linux.dev,
muchun.song@linux.dev, david@kernel.org,
lorenzo.stoakes@oracle.com, ziy@nvidia.com, harry.yoo@oracle.com,
yosry.ahmed@linux.dev, imran.f.khan@oracle.com,
kamalesh.babulal@oracle.com, axelrasmussen@google.com,
yuanchu@google.com, weixugc@google.com,
chenridong@huaweicloud.com, mkoutny@suse.com,
akpm@linux-foundation.org, hamzamahfooz@linux.microsoft.com,
apais@linux.microsoft.com, lance.yang@linux.dev, bhe@redhat.com,
usamaarif642@gmail.com
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
cgroups@vger.kernel.org, Qi Zheng <zhengqi.arch@bytedance.com>
Subject: [PATCH v6 31/33] mm: memcontrol: convert objcg to be per-memcg per-node type
Date: Thu, 5 Mar 2026 19:52:49 +0800 [thread overview]
Message-ID: <56c04b1c5d54f75ccdc12896df6c1ca35403ecc3.1772711148.git.zhengqi.arch@bytedance.com> (raw)
In-Reply-To: <cover.1772711148.git.zhengqi.arch@bytedance.com>
From: Qi Zheng <zhengqi.arch@bytedance.com>
Convert objcg to be a per-memcg per-node type, so that when reparenting LRU
folios later, we can hold the lru lock at the node level, thus avoiding
holding too many lru locks at once.
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
---
include/linux/memcontrol.h | 23 +++++------
include/linux/sched.h | 2 +-
mm/memcontrol.c | 79 +++++++++++++++++++++++---------------
3 files changed, 62 insertions(+), 42 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d2748e672fd88..57d86decf2830 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -116,6 +116,16 @@ struct mem_cgroup_per_node {
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter iter;
+ /*
+ * objcg is wiped out as a part of the objcg reparenting process.
+ * orig_objcg preserves a pointer (and a reference) to the original
+ * objcg until the end of life of memcg.
+ */
+ struct obj_cgroup __rcu *objcg;
+ struct obj_cgroup *orig_objcg;
+ /* list of inherited objcgs, protected by objcg_lock */
+ struct list_head objcg_list;
+
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
/* slab stats for nmi context */
atomic_t slab_reclaimable;
@@ -180,6 +190,7 @@ struct obj_cgroup {
struct list_head list; /* protected by objcg_lock */
struct rcu_head rcu;
};
+ bool is_root;
};
/*
@@ -258,15 +269,6 @@ struct mem_cgroup {
seqlock_t socket_pressure_seqlock;
#endif
int kmemcg_id;
- /*
- * memcg->objcg is wiped out as a part of the objcg repaprenting
- * process. memcg->orig_objcg preserves a pointer (and a reference)
- * to the original objcg until the end of live of memcg.
- */
- struct obj_cgroup __rcu *objcg;
- struct obj_cgroup *orig_objcg;
- /* list of inherited objcgs, protected by objcg_lock */
- struct list_head objcg_list;
struct memcg_vmstats_percpu __percpu *vmstats_percpu;
@@ -333,7 +335,6 @@ struct mem_cgroup {
#define MEMCG_CHARGE_BATCH 64U
extern struct mem_cgroup *root_mem_cgroup;
-extern struct obj_cgroup *root_obj_cgroup;
enum page_memcg_data_flags {
/* page->memcg_data is a pointer to an slabobj_ext vector */
@@ -552,7 +553,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg)
{
- return objcg == root_obj_cgroup;
+ return objcg->is_root;
}
static inline bool mem_cgroup_disabled(void)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a7b4a980eb2f0..7b63b7b74f414 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1533,7 +1533,7 @@ struct task_struct {
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
- /* Cache for current->cgroups->memcg->objcg lookups: */
+ /* Cache for current->cgroups->memcg->nodeinfo[nid]->objcg lookups: */
struct obj_cgroup *objcg;
#endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b0519a16f5684..e31c58bc89188 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -84,8 +84,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
EXPORT_SYMBOL(root_mem_cgroup);
-struct obj_cgroup *root_obj_cgroup __read_mostly;
-
/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
@@ -210,18 +208,21 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
}
static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memcg,
- struct mem_cgroup *parent)
+ struct mem_cgroup *parent,
+ int nid)
{
struct obj_cgroup *objcg, *iter;
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct mem_cgroup_per_node *parent_pn = parent->nodeinfo[nid];
- objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+ objcg = rcu_replace_pointer(pn->objcg, NULL, true);
/* 1) Ready to reparent active objcg. */
- list_add(&objcg->list, &memcg->objcg_list);
+ list_add(&objcg->list, &pn->objcg_list);
/* 2) Reparent active objcg and already reparented objcgs to parent. */
- list_for_each_entry(iter, &memcg->objcg_list, list)
+ list_for_each_entry(iter, &pn->objcg_list, list)
WRITE_ONCE(iter->memcg, parent);
/* 3) Move already reparented objcgs to the parent's list */
- list_splice(&memcg->objcg_list, &parent->objcg_list);
+ list_splice(&pn->objcg_list, &parent_pn->objcg_list);
return objcg;
}
@@ -268,14 +269,17 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ int nid;
- reparent_locks(memcg, parent);
+ for_each_node(nid) {
+ reparent_locks(memcg, parent);
- objcg = __memcg_reparent_objcgs(memcg, parent);
+ objcg = __memcg_reparent_objcgs(memcg, parent, nid);
- reparent_unlocks(memcg, parent);
+ reparent_unlocks(memcg, parent);
- percpu_ref_kill(&objcg->refcnt);
+ percpu_ref_kill(&objcg->refcnt);
+ }
}
/*
@@ -2877,8 +2881,10 @@ struct mem_cgroup *mem_cgroup_from_virt(void *p)
static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
{
+ int nid = numa_node_id();
+
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- struct obj_cgroup *objcg = rcu_dereference(memcg->objcg);
+ struct obj_cgroup *objcg = rcu_dereference(memcg->nodeinfo[nid]->objcg);
if (likely(objcg && obj_cgroup_tryget(objcg)))
return objcg;
@@ -2942,6 +2948,7 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
{
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
+ int nid = numa_node_id();
if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
return NULL;
@@ -2958,14 +2965,14 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
* Objcg reference is kept by the task, so it's safe
* to use the objcg by the current task.
*/
- return objcg ? : root_obj_cgroup;
+ return objcg ? : rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
}
memcg = this_cpu_read(int_active_memcg);
if (unlikely(memcg))
goto from_memcg;
- return root_obj_cgroup;
+ return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
from_memcg:
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
@@ -2975,12 +2982,12 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
* away and can be used within the scope without any additional
* protection.
*/
- objcg = rcu_dereference_check(memcg->objcg, 1);
+ objcg = rcu_dereference_check(memcg->nodeinfo[nid]->objcg, 1);
if (likely(objcg))
return objcg;
}
- return root_obj_cgroup;
+ return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
}
struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
@@ -3877,6 +3884,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn->lruvec_stats_percpu)
goto fail;
+ INIT_LIST_HEAD(&pn->objcg_list);
+
lruvec_init(&pn->lruvec);
pn->memcg = memcg;
@@ -3891,10 +3900,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- obj_cgroup_put(memcg->orig_objcg);
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
- for_each_node(node)
- free_mem_cgroup_per_node_info(memcg->nodeinfo[node]);
+ obj_cgroup_put(pn->orig_objcg);
+ free_mem_cgroup_per_node_info(pn);
+ }
memcg1_free_events(memcg);
kfree(memcg->vmstats);
free_percpu(memcg->vmstats_percpu);
@@ -3965,7 +3976,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
#endif
memcg1_memcg_init(memcg);
memcg->kmemcg_id = -1;
- INIT_LIST_HEAD(&memcg->objcg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
@@ -4042,6 +4052,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct obj_cgroup *objcg;
+ int nid;
memcg_online_kmem(memcg);
@@ -4053,17 +4064,19 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (alloc_shrinker_info(memcg))
goto offline_kmem;
- objcg = obj_cgroup_alloc();
- if (!objcg)
- goto free_shrinker;
+ for_each_node(nid) {
+ objcg = obj_cgroup_alloc();
+ if (!objcg)
+ goto free_objcg;
- if (unlikely(mem_cgroup_is_root(memcg)))
- root_obj_cgroup = objcg;
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ objcg->is_root = true;
- objcg->memcg = memcg;
- rcu_assign_pointer(memcg->objcg, objcg);
- obj_cgroup_get(objcg);
- memcg->orig_objcg = objcg;
+ objcg->memcg = memcg;
+ rcu_assign_pointer(memcg->nodeinfo[nid]->objcg, objcg);
+ obj_cgroup_get(objcg);
+ memcg->nodeinfo[nid]->orig_objcg = objcg;
+ }
if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
queue_delayed_work(system_dfl_wq, &stats_flush_dwork,
@@ -4087,7 +4100,13 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL);
return 0;
-free_shrinker:
+free_objcg:
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+
+ if (pn && pn->orig_objcg)
+ obj_cgroup_put(pn->orig_objcg);
+ }
free_shrinker_info(memcg);
offline_kmem:
memcg_offline_kmem(memcg);
--
2.20.1
next prev parent reply other threads:[~2026-03-05 11:59 UTC|newest]
Thread overview: 34+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-05 11:52 [PATCH v6 00/33] Eliminate Dying Memory Cgroup Qi Zheng
2026-03-05 11:52 ` [PATCH v6 01/33] mm: memcontrol: remove dead code of checking parent memory cgroup Qi Zheng
2026-03-05 11:52 ` [PATCH v6 02/33] mm: workingset: use folio_lruvec() in workingset_refault() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 03/33] mm: rename unlock_page_lruvec_irq and its variants Qi Zheng
2026-03-05 11:52 ` [PATCH v6 04/33] mm: vmscan: prepare for the refactoring the move_folios_to_lru() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 05/33] mm: vmscan: refactor move_folios_to_lru() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 06/33] mm: memcontrol: allocate object cgroup for non-kmem case Qi Zheng
2026-03-05 11:52 ` [PATCH v6 07/33] mm: memcontrol: return root object cgroup for root memory cgroup Qi Zheng
2026-03-05 11:52 ` [PATCH v6 08/33] mm: memcontrol: prevent memory cgroup release in get_mem_cgroup_from_folio() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 09/33] buffer: prevent memory cgroup release in folio_alloc_buffers() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 10/33] writeback: prevent memory cgroup release in writeback module Qi Zheng
2026-03-05 11:52 ` [PATCH v6 11/33] mm: memcontrol: prevent memory cgroup release in count_memcg_folio_events() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 12/33] mm: page_io: prevent memory cgroup release in page_io module Qi Zheng
2026-03-05 11:52 ` [PATCH v6 13/33] mm: migrate: prevent memory cgroup release in folio_migrate_mapping() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 14/33] mm: mglru: prevent memory cgroup release in mglru Qi Zheng
2026-03-05 11:52 ` [PATCH v6 15/33] mm: memcontrol: prevent memory cgroup release in mem_cgroup_swap_full() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 16/33] mm: workingset: prevent memory cgroup release in lru_gen_eviction() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 17/33] mm: thp: prevent memory cgroup release in folio_split_queue_lock{_irqsave}() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 18/33] mm: zswap: prevent memory cgroup release in zswap_compress() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 19/33] mm: workingset: prevent lruvec release in workingset_refault() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 20/33] mm: zswap: prevent lruvec release in zswap_folio_swapin() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 21/33] mm: swap: prevent lruvec release in lru_gen_clear_refs() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 22/33] mm: workingset: prevent lruvec release in workingset_activation() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 23/33] mm: do not open-code lruvec lock Qi Zheng
2026-03-05 11:52 ` [PATCH v6 24/33] mm: memcontrol: prepare for reparenting LRU pages for " Qi Zheng
2026-03-05 11:52 ` [PATCH v6 25/33] mm: vmscan: prepare for reparenting traditional LRU folios Qi Zheng
2026-03-05 11:52 ` [PATCH v6 26/33] mm: vmscan: prepare for reparenting MGLRU folios Qi Zheng
2026-03-05 11:52 ` [PATCH v6 27/33] mm: memcontrol: refactor memcg_reparent_objcgs() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 28/33] mm: workingset: use lruvec_lru_size() to get the number of lru pages Qi Zheng
2026-03-05 11:52 ` [PATCH v6 29/33] mm: memcontrol: refactor mod_memcg_state() and mod_memcg_lruvec_state() Qi Zheng
2026-03-05 11:52 ` [PATCH v6 30/33] mm: memcontrol: prepare for reparenting non-hierarchical stats Qi Zheng
2026-03-05 11:52 ` Qi Zheng [this message]
2026-03-05 11:52 ` [PATCH v6 32/33] mm: memcontrol: eliminate the problem of dying memory cgroup for LRU folios Qi Zheng
2026-03-05 11:52 ` [PATCH v6 33/33] mm: lru: add VM_WARN_ON_ONCE_FOLIO to lru maintenance helpers Qi Zheng
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=56c04b1c5d54f75ccdc12896df6c1ca35403ecc3.1772711148.git.zhengqi.arch@bytedance.com \
--to=qi.zheng@linux.dev \
--cc=akpm@linux-foundation.org \
--cc=apais@linux.microsoft.com \
--cc=axelrasmussen@google.com \
--cc=bhe@redhat.com \
--cc=cgroups@vger.kernel.org \
--cc=chenridong@huaweicloud.com \
--cc=david@kernel.org \
--cc=hamzamahfooz@linux.microsoft.com \
--cc=hannes@cmpxchg.org \
--cc=harry.yoo@oracle.com \
--cc=hughd@google.com \
--cc=imran.f.khan@oracle.com \
--cc=kamalesh.babulal@oracle.com \
--cc=lance.yang@linux.dev \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=mhocko@suse.com \
--cc=mkoutny@suse.com \
--cc=muchun.song@linux.dev \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=usamaarif642@gmail.com \
--cc=weixugc@google.com \
--cc=yosry.ahmed@linux.dev \
--cc=yuanchu@google.com \
--cc=zhengqi.arch@bytedance.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox