* [RFC -next] memcg: Optimize creation performance when LRU_GEN is enabled
@ 2025-11-19  8:37 Chen Ridong
From: Chen Ridong @ 2025-11-19  8:37 UTC (permalink / raw)
  To: akpm, david, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
	mhocko, axelrasmussen, yuanchu, weixugc, hannes, zhengqi.arch,
	shakeel.butt
  Cc: linux-mm, linux-kernel, lujialin4, chenridong

From: Chen Ridong <chenridong@huawei.com>

With LRU_GEN=y and LRU_GEN_ENABLED=n, a performance regression occurs
when creating a large number of memory cgroups (memcgs):

	# time mkdir testcg_{1..10000}

	real	0m7.167s
	user	0m0.037s
	sys	0m6.773s

	# time mkdir testcg_{1..20000}

	real	0m27.158s
	user	0m0.079s
	sys	0m26.270s

In contrast, with LRU_GEN=n, creation of the same number of memcgs
performs better:

	# time mkdir testcg_{1..10000}

	real	0m3.386s
	user	0m0.044s
	sys	0m3.009s

	# time mkdir testcg_{1..20000}

	real	0m6.876s
	user	0m0.075s
	sys	0m6.121s

The root cause is that lru_gen node onlining appends each new node with
hlist_nulls_add_tail_rcu(), which walks the entire per-node list to find
the tail. The walk grows with the number of existing memcgs, so creating
N memcgs costs O(N^2) overall, and the cost is paid even when LRU_GEN is
runtime-disabled.
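
For reference, the add-tail helper cannot reach the tail without walking
every node first; the implementation in include/linux/rculist_nulls.h is
roughly:

	static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
						    struct hlist_nulls_head *h)
	{
		struct hlist_nulls_node *i, *last = NULL;

		/* write side: walk the whole chain to find the tail, O(n) */
		for (i = h->first; !is_a_nulls(i); i = i->next)
			last = i;

		if (last) {
			n->next = last->next;
			n->pprev = &last->next;
			rcu_assign_pointer(hlist_nulls_next_rcu(last), n);
		} else {
			hlist_nulls_add_head_rcu(n, h);
		}
	}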

Fix this by caching a tail pointer per generation and bin in struct
lru_gen_memcg. Appending a new node now uses the cached tail directly,
eliminating the full list traversal.
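
With a cached tail the append no longer touches any other node; a
minimal sketch of the idea (the actual helper added below is
memcg_lru_add_tail_locked()):

	tail = memcg_lru->tails[gen][bin];
	if (tail) {
		/* splice the new node in after the cached tail: O(1) */
		WRITE_ONCE(node->next, tail->next);
		WRITE_ONCE(node->pprev, &tail->next);
		rcu_assign_pointer(hlist_nulls_next_rcu(tail), node);
	} else {
		/* bin is empty: the new node becomes both head and tail */
		hlist_nulls_add_head_rcu(node, head);
	}
	memcg_lru->tails[gen][bin] = node;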

After applying this patch, memcg creation performance with LRU_GEN=y
matches the LRU_GEN=n baseline:

	# time mkdir testcg_{1..10000}

	real	0m3.368s
	user	0m0.025s
	sys	0m3.012s

	# time mkdir testcg_{1..20000}

	real	0m6.742s
	user	0m0.085s
	sys	0m5.995s

Signed-off-by: Chen Ridong <chenridong@huawei.com>
---
 include/linux/mmzone.h |  4 +++
 mm/vmscan.c            | 78 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4398e027f450..bdee57b35126 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -513,6 +513,8 @@ struct lru_gen_folio {
 	u8 gen;
 	/* the list segment this lru_gen_folio belongs to */
 	u8 seg;
+	/* the bin index this lru_gen_folio is queued on */
+	u8 bin;
 	/* per-node lru_gen_folio list for global reclaim */
 	struct hlist_nulls_node list;
 };
@@ -610,6 +612,8 @@ struct lru_gen_memcg {
 	unsigned long nr_memcgs[MEMCG_NR_GENS];
 	/* per-node lru_gen_folio list for global reclaim */
 	struct hlist_nulls_head	fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
+	/* cached tails to speed up enqueueing */
+	struct hlist_nulls_node *tails[MEMCG_NR_GENS][MEMCG_NR_BINS];
 	/* protects the above */
 	spinlock_t lock;
 };
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8890f4b58673..6c2665e48f19 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4299,6 +4299,66 @@ enum {
 	MEMCG_LRU_YOUNG,
 };
 
+static void memcg_lru_add_head_locked(struct pglist_data *pgdat,
+				      struct lruvec *lruvec, int gen, int bin)
+{
+	struct lru_gen_memcg *memcg_lru = &pgdat->memcg_lru;
+	struct hlist_nulls_head *head = &memcg_lru->fifo[gen][bin];
+	struct hlist_nulls_node *node = &lruvec->lrugen.list;
+	bool empty = !memcg_lru->tails[gen][bin];
+
+	hlist_nulls_add_head_rcu(node, head);
+	lruvec->lrugen.bin = bin;
+
+	if (empty)
+		memcg_lru->tails[gen][bin] = node;
+}
+
+static void memcg_lru_add_tail_locked(struct pglist_data *pgdat,
+				      struct lruvec *lruvec, int gen, int bin)
+{
+	struct lru_gen_memcg *memcg_lru = &pgdat->memcg_lru;
+	struct hlist_nulls_head *head = &memcg_lru->fifo[gen][bin];
+	struct hlist_nulls_node *node = &lruvec->lrugen.list;
+	struct hlist_nulls_node *tail = memcg_lru->tails[gen][bin];
+
+	if (tail) {
+		WRITE_ONCE(node->next, tail->next);
+		WRITE_ONCE(node->pprev, &tail->next);
+		rcu_assign_pointer(hlist_nulls_next_rcu(tail), node);
+	} else {
+		hlist_nulls_add_head_rcu(node, head);
+	}
+
+	memcg_lru->tails[gen][bin] = node;
+	lruvec->lrugen.bin = bin;
+}
+
+static void memcg_lru_del_locked(struct pglist_data *pgdat, struct lruvec *lruvec,
+				 bool reinit)
+{
+	int gen = lruvec->lrugen.gen;
+	int bin = lruvec->lrugen.bin;
+	struct lru_gen_memcg *memcg_lru = &pgdat->memcg_lru;
+	struct hlist_nulls_head *head = &memcg_lru->fifo[gen][bin];
+	struct hlist_nulls_node *node = &lruvec->lrugen.list;
+	struct hlist_nulls_node *prev = NULL;
+
+	if (hlist_nulls_unhashed(node))
+		return;
+
+	if (memcg_lru->tails[gen][bin] == node) {
+		if (node->pprev != &head->first)
+			prev = container_of(node->pprev, struct hlist_nulls_node, next);
+		memcg_lru->tails[gen][bin] = prev;
+	}
+
+	if (reinit)
+		hlist_nulls_del_init_rcu(node);
+	else
+		hlist_nulls_del_rcu(node);
+}
+
 static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
 {
 	int seg;
@@ -4326,15 +4386,15 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
 	else
 		VM_WARN_ON_ONCE(true);
 
+	memcg_lru_del_locked(pgdat, lruvec, false);
+
 	WRITE_ONCE(lruvec->lrugen.seg, seg);
 	WRITE_ONCE(lruvec->lrugen.gen, new);
 
-	hlist_nulls_del_rcu(&lruvec->lrugen.list);
-
 	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
-		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+		memcg_lru_add_head_locked(pgdat, lruvec, new, bin);
 	else
-		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+		memcg_lru_add_tail_locked(pgdat, lruvec, new, bin);
 
 	pgdat->memcg_lru.nr_memcgs[old]--;
 	pgdat->memcg_lru.nr_memcgs[new]++;
@@ -4365,7 +4425,7 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg)
 
 		lruvec->lrugen.gen = gen;
 
-		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+		memcg_lru_add_tail_locked(pgdat, lruvec, gen, bin);
 		pgdat->memcg_lru.nr_memcgs[gen]++;
 
 		spin_unlock_irq(&pgdat->memcg_lru.lock);
@@ -4399,7 +4459,7 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg)
 
 		gen = lruvec->lrugen.gen;
 
-		hlist_nulls_del_init_rcu(&lruvec->lrugen.list);
+		memcg_lru_del_locked(pgdat, lruvec, true);
 		pgdat->memcg_lru.nr_memcgs[gen]--;
 
 		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
@@ -5664,8 +5724,10 @@ void lru_gen_init_pgdat(struct pglist_data *pgdat)
 	spin_lock_init(&pgdat->memcg_lru.lock);
 
 	for (i = 0; i < MEMCG_NR_GENS; i++) {
-		for (j = 0; j < MEMCG_NR_BINS; j++)
+		for (j = 0; j < MEMCG_NR_BINS; j++) {
 			INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+			pgdat->memcg_lru.tails[i][j] = NULL;
+		}
 	}
 }
 
@@ -5687,6 +5749,8 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 
 	if (mm_state)
 		mm_state->seq = MIN_NR_GENS;
+
+	lrugen->bin = 0;
 }
 
 #ifdef CONFIG_MEMCG
-- 
2.34.1


