* [PATCH] memcg: multi-memcg percpu charge cache
@ 2025-04-16 18:02 Shakeel Butt
2025-04-23 1:10 ` Jakub Kicinski
` (5 more replies)
0 siblings, 6 replies; 17+ messages in thread
From: Shakeel Butt @ 2025-04-16 18:02 UTC (permalink / raw)
To: Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Vlastimil Babka, Jakub Kicinski, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
Memory cgroup accounting is expensive, and to reduce the cost, the kernel
maintains a per-cpu charge cache for a single memcg. So, if a charge
request comes for a different memcg, the kernel will flush the old
memcg's charge cache, charge the new memcg a fixed amount (64 pages),
subtract the requested amount and store the remainder in the per-cpu
charge cache for the new memcg.
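In rough pseudo-code, the pre-patch flow looks like the following. This
is an illustrative sketch only; charge_batch() stands in for the real
try_charge() machinery and is not an actual helper:

	if (stock->cached == memcg && stock->nr_pages >= nr_pages) {
		/* fast path: request served from the local cache */
		stock->nr_pages -= nr_pages;
	} else {
		/* slow path: flush the previously cached memcg ... */
		drain_stock(stock);
		/* ... charge a fixed batch (64 pages) to the new memcg ... */
		charge_batch(memcg, MEMCG_CHARGE_BATCH);
		/* ... and keep the remainder in the per-cpu cache */
		stock->cached = memcg;
		stock->nr_pages = MEMCG_CHARGE_BATCH - nr_pages;
	}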
This mechanism is based on the assumption that the kernel, for locality,
keeps a process on a CPU for a long period of time, so most of the charge
requests from that process will be served by that CPU's local charge
cache.
However, this assumption breaks down for incoming network traffic on a
multi-tenant machine. We are in the process of running multiple
workloads on a single machine, and when such workloads are network
heavy, we see a very high network memory accounting cost. We have
observed multiple CPUs spending almost 100% of their time in
net_rx_action, and almost all of that time is spent in memcg accounting
of the network traffic.
More precisely, net_rx_action is serving packets from multiple workloads
and is observing/serving a mix of packets from these workloads. The
memcg switch of the per-cpu cache is very expensive and we are observing
a lot of memcg switches on the machine. Almost all the time is being
spent on charging the new memcg and flushing the older memcg's cache.
So, for this scenario, we definitely need a per-cpu cache that supports
multiple memcgs.
This patch implements a simple (and dumb) multi-memcg percpu charge
cache. We actually started with a more sophisticated LRU-based approach,
but the dumb one was consistently better than the sophisticated one by
1% to 3%, so we are going with the simple approach.
Some of the design choices are:
1. Fit all cached memcgs in a single cacheline.
2. The cache array can be a mix of empty slots and memcg-charged slots,
so the kernel has to traverse the full array.
3. The cache drain from reclaim will drain all cached memcgs, to keep
things simple.
To evaluate the impact of this optimization, on a 72-CPU machine, we ran
the following workload, where each netperf client runs in a different
cgroup. The next-20250415 kernel is used as the base.
$ netserver -6
$ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K
number of clients | Without patch | With patch
6 | 42584.1 Mbps | 48603.4 Mbps (14.13% improvement)
12 | 30617.1 Mbps | 47919.7 Mbps (56.51% improvement)
18 | 25305.2 Mbps | 45497.3 Mbps (79.79% improvement)
24 | 20104.1 Mbps | 37907.7 Mbps (88.55% improvement)
30 | 14702.4 Mbps | 30746.5 Mbps (109.12% improvement)
36 | 10801.5 Mbps | 26476.3 Mbps (145.11% improvement)
The results show a drastic improvement for network-intensive workloads.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
mm/memcontrol.c | 128 ++++++++++++++++++++++++++++++++++--------------
1 file changed, 91 insertions(+), 37 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1ad326e871c1..0a02ba07561e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1769,10 +1769,11 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
pr_cont(" are going to be killed due to memory.oom.group set\n");
}
+#define NR_MEMCG_STOCK 7
struct memcg_stock_pcp {
local_trylock_t stock_lock;
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
+ uint8_t nr_pages[NR_MEMCG_STOCK];
+ struct mem_cgroup *cached[NR_MEMCG_STOCK];
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
@@ -1809,9 +1810,10 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
gfp_t gfp_mask)
{
struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
+ uint8_t stock_pages;
unsigned long flags;
bool ret = false;
+ int i;
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
@@ -1822,10 +1824,17 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
return ret;
stock = this_cpu_ptr(&memcg_stock);
- stock_pages = READ_ONCE(stock->nr_pages);
- if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
- WRITE_ONCE(stock->nr_pages, stock_pages - nr_pages);
- ret = true;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ if (memcg != READ_ONCE(stock->cached[i]))
+ continue;
+
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
+ if (stock_pages >= nr_pages) {
+ WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
+ ret = true;
+ }
+ break;
}
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
@@ -1843,21 +1852,30 @@ static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
/*
* Returns stocks cached in percpu and reset cached information.
*/
-static void drain_stock(struct memcg_stock_pcp *stock)
+static void drain_stock(struct memcg_stock_pcp *stock, int i)
{
- unsigned int stock_pages = READ_ONCE(stock->nr_pages);
- struct mem_cgroup *old = READ_ONCE(stock->cached);
+ struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
+ uint8_t stock_pages;
if (!old)
return;
+ stock_pages = READ_ONCE(stock->nr_pages[i]);
if (stock_pages) {
memcg_uncharge(old, stock_pages);
- WRITE_ONCE(stock->nr_pages, 0);
+ WRITE_ONCE(stock->nr_pages[i], 0);
}
css_put(&old->css);
- WRITE_ONCE(stock->cached, NULL);
+ WRITE_ONCE(stock->cached[i], NULL);
+}
+
+static void drain_stock_fully(struct memcg_stock_pcp *stock)
+{
+ int i;
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i)
+ drain_stock(stock, i);
}
static void drain_local_stock(struct work_struct *dummy)
@@ -1874,7 +1892,7 @@ static void drain_local_stock(struct work_struct *dummy)
stock = this_cpu_ptr(&memcg_stock);
drain_obj_stock(stock);
- drain_stock(stock);
+ drain_stock_fully(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
@@ -1883,35 +1901,81 @@ static void drain_local_stock(struct work_struct *dummy)
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned int stock_pages;
+ struct mem_cgroup *cached;
+ uint8_t stock_pages;
unsigned long flags;
+ bool evict = true;
+ int i;
VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
- if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+ if (nr_pages > MEMCG_CHARGE_BATCH ||
+ !local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
/*
- * In case of unlikely failure to lock percpu stock_lock
- * uncharge memcg directly.
+ * In case of larger than batch refill or unlikely failure to
+ * lock the percpu stock_lock, uncharge memcg directly.
*/
memcg_uncharge(memcg, nr_pages);
return;
}
stock = this_cpu_ptr(&memcg_stock);
- if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */
- drain_stock(stock);
- css_get(&memcg->css);
- WRITE_ONCE(stock->cached, memcg);
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+again:
+ cached = READ_ONCE(stock->cached[i]);
+ if (!cached) {
+ css_get(&memcg->css);
+ WRITE_ONCE(stock->cached[i], memcg);
+ }
+ if (!cached || memcg == READ_ONCE(stock->cached[i])) {
+ stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
+ WRITE_ONCE(stock->nr_pages[i], stock_pages);
+ if (stock_pages > MEMCG_CHARGE_BATCH)
+ drain_stock(stock, i);
+ evict = false;
+ break;
+ }
}
- stock_pages = READ_ONCE(stock->nr_pages) + nr_pages;
- WRITE_ONCE(stock->nr_pages, stock_pages);
- if (stock_pages > MEMCG_CHARGE_BATCH)
- drain_stock(stock);
+ if (evict) {
+ i = get_random_u32_below(NR_MEMCG_STOCK);
+ drain_stock(stock, i);
+ goto again;
+ }
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}
+static bool is_drain_needed(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ struct mem_cgroup *memcg;
+ bool flush = false;
+ int i;
+
+ rcu_read_lock();
+
+ if (obj_stock_flush_required(stock, root_memcg)) {
+ flush = true;
+ goto out;
+ }
+
+ for (i = 0; i < NR_MEMCG_STOCK; ++i) {
+ memcg = READ_ONCE(stock->cached[i]);
+ if (!memcg)
+ continue;
+
+ if (READ_ONCE(stock->nr_pages[i]) &&
+ mem_cgroup_is_descendant(memcg, root_memcg)) {
+ flush = true;
+ break;
+ }
+ }
+out:
+ rcu_read_unlock();
+ return flush;
+}
+
/*
* Drains all per-CPU charge caches for given root_memcg resp. subtree
* of the hierarchy under it.
@@ -1933,17 +1997,7 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
- struct mem_cgroup *memcg;
- bool flush = false;
-
- rcu_read_lock();
- memcg = READ_ONCE(stock->cached);
- if (memcg && READ_ONCE(stock->nr_pages) &&
- mem_cgroup_is_descendant(memcg, root_memcg))
- flush = true;
- else if (obj_stock_flush_required(stock, root_memcg))
- flush = true;
- rcu_read_unlock();
+ bool flush = is_drain_needed(stock, root_memcg);
if (flush &&
!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
@@ -1969,7 +2023,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
drain_obj_stock(stock);
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
- drain_stock(stock);
+ drain_stock_fully(stock);
return 0;
}
--
2.47.1
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-16 18:02 [PATCH] memcg: multi-memcg percpu charge cache Shakeel Butt
@ 2025-04-23 1:10 ` Jakub Kicinski
2025-04-23 22:16 ` Shakeel Butt
2025-04-23 23:14 ` Shakeel Butt
` (4 subsequent siblings)
5 siblings, 1 reply; 17+ messages in thread
From: Jakub Kicinski @ 2025-04-23 1:10 UTC (permalink / raw)
To: Shakeel Butt
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Roman Gushchin,
Muchun Song, Vlastimil Babka, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
On Wed, 16 Apr 2025 11:02:29 -0700 Shakeel Butt wrote:
> static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> {
> struct memcg_stock_pcp *stock;
> - unsigned int stock_pages;
> + struct mem_cgroup *cached;
> + uint8_t stock_pages;
Is it okay to use uintX_t now?
> unsigned long flags;
> + bool evict = true;
> + int i;
>
> VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
>
> - if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> + if (nr_pages > MEMCG_CHARGE_BATCH ||
> + !local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> /*
> - * In case of unlikely failure to lock percpu stock_lock
> - * uncharge memcg directly.
> + * In case of larger than batch refill or unlikely failure to
> + * lock the percpu stock_lock, uncharge memcg directly.
> */
We're bypassing the cache for > CHARGE_BATCH because the u8 math
may overflow? Could be useful to refocus the comment on the 'why'
> memcg_uncharge(memcg, nr_pages);
> return;
> }
nits notwithstanding:
Acked-by: Jakub Kicinski <kuba@kernel.org>
Thanks!
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-23 1:10 ` Jakub Kicinski
@ 2025-04-23 22:16 ` Shakeel Butt
2025-04-23 22:30 ` Jakub Kicinski
0 siblings, 1 reply; 17+ messages in thread
From: Shakeel Butt @ 2025-04-23 22:16 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Roman Gushchin,
Muchun Song, Vlastimil Babka, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
Hi Jakub,
On Tue, Apr 22, 2025 at 06:10:22PM -0700, Jakub Kicinski wrote:
> On Wed, 16 Apr 2025 11:02:29 -0700 Shakeel Butt wrote:
> > static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> > {
> > struct memcg_stock_pcp *stock;
> > - unsigned int stock_pages;
> > + struct mem_cgroup *cached;
> > + uint8_t stock_pages;
>
> Is it okay to use uintX_t now?
>
> > unsigned long flags;
> > + bool evict = true;
> > + int i;
> >
> > VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
> >
> > - if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> > + if (nr_pages > MEMCG_CHARGE_BATCH ||
> > + !local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> > /*
> > - * In case of unlikely failure to lock percpu stock_lock
> > - * uncharge memcg directly.
> > + * In case of larger than batch refill or unlikely failure to
> > + * lock the percpu stock_lock, uncharge memcg directly.
> > */
>
> We're bypassing the cache for > CHARGE_BATCH because the u8 math
> may overflow? Could be useful to refocus the comment on the 'why'
>
We actually never put more than MEMCG_CHARGE_BATCH in the cache and thus
we can use u8 as the type here. Though we may increase the batch size in
the future, so I should put a BUILD_BUG_ON somewhere here.
> > memcg_uncharge(memcg, nr_pages);
> > return;
> > }
>
> nits notwithstanding:
>
> Acked-by: Jakub Kicinski <kuba@kernel.org>
Thanks a lot for the review.
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-23 22:16 ` Shakeel Butt
@ 2025-04-23 22:30 ` Jakub Kicinski
2025-04-23 22:59 ` Shakeel Butt
0 siblings, 1 reply; 17+ messages in thread
From: Jakub Kicinski @ 2025-04-23 22:30 UTC (permalink / raw)
To: Shakeel Butt
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Roman Gushchin,
Muchun Song, Vlastimil Babka, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
On Wed, 23 Apr 2025 15:16:56 -0700 Shakeel Butt wrote:
> > > - if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> > > + if (nr_pages > MEMCG_CHARGE_BATCH ||
> > > + !local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> > > /*
> > > - * In case of unlikely failure to lock percpu stock_lock
> > > - * uncharge memcg directly.
> > > + * In case of larger than batch refill or unlikely failure to
> > > + * lock the percpu stock_lock, uncharge memcg directly.
> > > */
> >
> > We're bypassing the cache for > CHARGE_BATCH because the u8 math
> > may overflow? Could be useful to refocus the comment on the 'why'
>
> We actually never put more than MEMCG_CHARGE_BATCH in the cache and thus
> we can use u8 as type here. Though we may increase the batch size in
> future, so I should put a BUILD_BUG_ON somewhere here.
No idea if this matters enough to deserve its own commit, but basically
I was wondering if that behavior change is a separate optimization.
Previously we'd check if the cache was for the releasing cgroup and, if
the sum was over BATCH, drain its stock completely. Now we bypass
looking at the cache if nr_pages > BATCH, so the cgroup may retain some
stock.
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-23 22:30 ` Jakub Kicinski
@ 2025-04-23 22:59 ` Shakeel Butt
0 siblings, 0 replies; 17+ messages in thread
From: Shakeel Butt @ 2025-04-23 22:59 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Roman Gushchin,
Muchun Song, Vlastimil Babka, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
On Wed, Apr 23, 2025 at 03:30:46PM -0700, Jakub Kicinski wrote:
> On Wed, 23 Apr 2025 15:16:56 -0700 Shakeel Butt wrote:
> > > > - if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> > > > + if (nr_pages > MEMCG_CHARGE_BATCH ||
> > > > + !local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> > > > /*
> > > > - * In case of unlikely failure to lock percpu stock_lock
> > > > - * uncharge memcg directly.
> > > > + * In case of larger than batch refill or unlikely failure to
> > > > + * lock the percpu stock_lock, uncharge memcg directly.
> > > > */
> > >
> > > We're bypassing the cache for > CHARGE_BATCH because the u8 math
> > > may overflow? Could be useful to refocus the comment on the 'why'
> >
> > We actually never put more than MEMCG_CHARGE_BATCH in the cache and thus
> > we can use u8 as type here. Though we may increase the batch size in
> > future, so I should put a BUILD_BUG_ON somewhere here.
>
> No idea if this matters enough to deserve its own commit but basically
> I was wondering if that behavior change is a separate optimization.
>
> Previously we'd check if the cache was for the releasing cgroup and sum
> was over BATCH - drain its stock completely. Now we bypass looking at
> the cache if nr_pages > BATCH so the cgroup may retain some stock.
Yes, indeed there is a slight behavior change, as you have explained.
The older behavior (fully drain if nr_pages > BATCH) might be due to the
single per-cpu memcg cache limitation and, in my opinion, is problematic
in some scenarios. If you look at commit 5387c90490f7 ("mm/memcg:
improve refill_obj_stock() performance"), a very similar behavior for
the objcg cache was having a performance impact and was optimized by
only allowing the drain for some code paths. With multi-memcg support, I
think we don't need to worry about it. A multi-objcg per-cpu cache is
also on my TODO list.
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-16 18:02 [PATCH] memcg: multi-memcg percpu charge cache Shakeel Butt
2025-04-23 1:10 ` Jakub Kicinski
@ 2025-04-23 23:14 ` Shakeel Butt
2025-04-25 20:18 ` Shakeel Butt
` (3 subsequent siblings)
5 siblings, 0 replies; 17+ messages in thread
From: Shakeel Butt @ 2025-04-23 23:14 UTC (permalink / raw)
To: Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Vlastimil Babka, Jakub Kicinski, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
Hi Andrew,
Can you please squash the following patch into this one?
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Wed, 23 Apr 2025 15:41:18 -0700
Subject: [PATCH] memcg: multi-memcg percpu charge cache - fix
Add BUILD_BUG_ON() for MEMCG_CHARGE_BATCH
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
mm/memcontrol.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 23894e60c3c0..80ff002b0259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1909,6 +1909,13 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
bool evict = true;
int i;
+ /*
+ * For now limit MEMCG_CHARGE_BATCH to 127 or less. In the future, if we
+ * decide to increase it beyond 127, then we will need more careful
+ * handling of nr_pages[] in struct memcg_stock_pcp.
+ */
+ BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);
+
VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
if (nr_pages > MEMCG_CHARGE_BATCH ||
--
2.47.1
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-16 18:02 [PATCH] memcg: multi-memcg percpu charge cache Shakeel Butt
2025-04-23 1:10 ` Jakub Kicinski
2025-04-23 23:14 ` Shakeel Butt
@ 2025-04-25 20:18 ` Shakeel Butt
2025-04-29 9:40 ` Hugh Dickins
2025-04-30 10:05 ` Vlastimil Babka
2025-04-29 12:13 ` Michal Hocko
` (2 subsequent siblings)
5 siblings, 2 replies; 17+ messages in thread
From: Shakeel Butt @ 2025-04-25 20:18 UTC (permalink / raw)
To: Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Vlastimil Babka, Jakub Kicinski, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
Hi Andrew,
Another fix for this patch. Basically a simplification of refill_stock()
and avoiding multiple cached entries for a memcg.
From 6f6f7736799ad8ca5fee48eca7b7038f6c9bb5b9 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Fri, 25 Apr 2025 13:10:43 -0700
Subject: [PATCH] memcg: multi-memcg percpu charge cache - fix 2
Simplify refill_stock() by avoiding the goto and doing the operations
inline, and make sure the given memcg is not cached multiple times.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
mm/memcontrol.c | 27 +++++++++++++++------------
1 file changed, 15 insertions(+), 12 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 997e2da5d2ca..9dfdbb2fcccc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1907,7 +1907,8 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
struct mem_cgroup *cached;
uint8_t stock_pages;
unsigned long flags;
- bool evict = true;
+ bool success = false;
+ int empty_slot = -1;
int i;
/*
@@ -1931,26 +1932,28 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
stock = this_cpu_ptr(&memcg_stock);
for (i = 0; i < NR_MEMCG_STOCK; ++i) {
-again:
cached = READ_ONCE(stock->cached[i]);
- if (!cached) {
- css_get(&memcg->css);
- WRITE_ONCE(stock->cached[i], memcg);
- }
- if (!cached || memcg == READ_ONCE(stock->cached[i])) {
+ if (!cached && empty_slot == -1)
+ empty_slot = i;
+ if (memcg == READ_ONCE(stock->cached[i])) {
stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
WRITE_ONCE(stock->nr_pages[i], stock_pages);
if (stock_pages > MEMCG_CHARGE_BATCH)
drain_stock(stock, i);
- evict = false;
+ success = true;
break;
}
}
- if (evict) {
- i = get_random_u32_below(NR_MEMCG_STOCK);
- drain_stock(stock, i);
- goto again;
+ if (!success) {
+ i = empty_slot;
+ if (i == -1) {
+ i = get_random_u32_below(NR_MEMCG_STOCK);
+ drain_stock(stock, i);
+ }
+ css_get(&memcg->css);
+ WRITE_ONCE(stock->cached[i], memcg);
+ WRITE_ONCE(stock->nr_pages[i], stock_pages);
}
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
--
2.47.1
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-25 20:18 ` Shakeel Butt
@ 2025-04-29 9:40 ` Hugh Dickins
2025-04-29 14:50 ` Shakeel Butt
2025-04-30 10:05 ` Vlastimil Babka
1 sibling, 1 reply; 17+ messages in thread
From: Hugh Dickins @ 2025-04-29 9:40 UTC (permalink / raw)
To: Andrew Morton, Shakeel Butt
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Vlastimil Babka, Jakub Kicinski, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
[PATCH mm-unstable] memcg: multi-memcg percpu charge cache - fix 3
Fix 2 has been giving me lots of memcg OOM kills not seen before:
it's better to stock nr_pages than the uninitialized stock_pages.
Signed-off-by: Hugh Dickins <hughd@google.com>
---
mm/memcontrol.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 178d79e68107..02c6f553dc53 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1952,7 +1952,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
}
css_get(&memcg->css);
WRITE_ONCE(stock->cached[i], memcg);
- WRITE_ONCE(stock->nr_pages[i], stock_pages);
+ WRITE_ONCE(stock->nr_pages[i], nr_pages);
}
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-16 18:02 [PATCH] memcg: multi-memcg percpu charge cache Shakeel Butt
` (2 preceding siblings ...)
2025-04-25 20:18 ` Shakeel Butt
@ 2025-04-29 12:13 ` Michal Hocko
2025-04-29 18:43 ` Shakeel Butt
2025-04-30 9:57 ` Vlastimil Babka
2025-04-30 15:32 ` Shakeel Butt
5 siblings, 1 reply; 17+ messages in thread
From: Michal Hocko @ 2025-04-29 12:13 UTC (permalink / raw)
To: Shakeel Butt
Cc: Andrew Morton, Johannes Weiner, Roman Gushchin, Muchun Song,
Vlastimil Babka, Jakub Kicinski, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
On Wed 16-04-25 11:02:29, Shakeel Butt wrote:
> Memory cgroup accounting is expensive and to reduce the cost, the kernel
> maintains per-cpu charge cache for a single memcg. So, if a charge
> request comes for a different memcg, the kernel will flush the old
> memcg's charge cache and then charge the newer memcg a fixed amount (64
> pages), subtracts the charge request amount and stores the remaining in
> the per-cpu charge cache for the newer memcg.
>
> This mechanism is based on the assumption that the kernel, for locality,
> keep a process on a CPU for long period of time and most of the charge
> requests from that process will be served by that CPU's local charge
> cache.
>
> However this assumption breaks down for incoming network traffic in a
> multi-tenant machine. We are in the process of running multiple
> workloads on a single machine and if such workloads are network heavy,
> we are seeing very high network memory accounting cost. We have observed
> multiple CPUs spending almost 100% of their time in net_rx_action and
> almost all of that time is spent in memcg accounting of the network
> traffic.
>
> More precisely, net_rx_action is serving packets from multiple workloads
> and is observing/serving mix of packets of these workloads. The memcg
> switch of per-cpu cache is very expensive and we are observing a lot of
> memcg switches on the machine. Almost all the time is being spent on
> charging new memcg and flushing older memcg cache. So, definitely we
> need per-cpu cache that support multiple memcgs for this scenario.
>
> This patch implements a simple (and dumb) multiple memcg percpu charge
> cache. Actually we started with more sophisticated LRU based approach but
> the dumb one was always better than the sophisticated one by 1% to 3%,
> so going with the simple approach.
Makes sense to start simple and go for a more sophisticated one (hash
table approach maybe) later when a clear gain can be demonstrated.
> Some of the design choices are:
>
> 1. Fit all caches memcgs in a single cacheline.
Could you be more specific about the reasoning? I suspect it is for the
network receive path you are mentioning above, right?
> 2. The cache array can be mix of empty slots or memcg charged slots, so
> the kernel has to traverse the full array.
> 3. The cache drain from the reclaim will drain all cached memcgs to keep
> things simple.
>
> To evaluate the impact of this optimization, on a 72 CPUs machine, we
> ran the following workload where each netperf client runs in a different
> cgroup. The next-20250415 kernel is used as base.
>
> $ netserver -6
> $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K
>
> number of clients | Without patch | With patch
> 6 | 42584.1 Mbps | 48603.4 Mbps (14.13% improvement)
> 12 | 30617.1 Mbps | 47919.7 Mbps (56.51% improvement)
> 18 | 25305.2 Mbps | 45497.3 Mbps (79.79% improvement)
> 24 | 20104.1 Mbps | 37907.7 Mbps (88.55% improvement)
> 30 | 14702.4 Mbps | 30746.5 Mbps (109.12% improvement)
> 36 | 10801.5 Mbps | 26476.3 Mbps (145.11% improvement)
>
> The results show drastic improvement for network intensive workloads.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Just a minor suggestion below. Other than that looks good to me (with
follow up fixes) in this thread.
Acked-by: Michal Hocko <mhocko@suse.com>
Thanks!
> ---
> mm/memcontrol.c | 128 ++++++++++++++++++++++++++++++++++--------------
> 1 file changed, 91 insertions(+), 37 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1ad326e871c1..0a02ba07561e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1769,10 +1769,11 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
> pr_cont(" are going to be killed due to memory.oom.group set\n");
> }
>
/* Make sure nr_pages and cached fit into a single cache line */
> +#define NR_MEMCG_STOCK 7
> struct memcg_stock_pcp {
> local_trylock_t stock_lock;
> - struct mem_cgroup *cached; /* this never be root cgroup */
> - unsigned int nr_pages;
> + uint8_t nr_pages[NR_MEMCG_STOCK];
> + struct mem_cgroup *cached[NR_MEMCG_STOCK];
>
> struct obj_cgroup *cached_objcg;
> struct pglist_data *cached_pgdat;
[...]
--
Michal Hocko
SUSE Labs
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-29 9:40 ` Hugh Dickins
@ 2025-04-29 14:50 ` Shakeel Butt
0 siblings, 0 replies; 17+ messages in thread
From: Shakeel Butt @ 2025-04-29 14:50 UTC (permalink / raw)
To: Hugh Dickins
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Roman Gushchin,
Muchun Song, Vlastimil Babka, Jakub Kicinski, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
On Tue, Apr 29, 2025 at 2:40 AM Hugh Dickins <hughd@google.com> wrote:
>
> [PATCH mm-unstable] memcg: multi-memcg percpu charge cache - fix 3
>
> Fix 2 has been giving me lots of memcg OOM kills not seen before:
> it's better to stock nr_pages than the uninitialized stock_pages.
>
> Signed-off-by: Hugh Dickins <hughd@google.com>
Thanks a lot Hugh for catching this.
> ---
> mm/memcontrol.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 178d79e68107..02c6f553dc53 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1952,7 +1952,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> }
> css_get(&memcg->css);
> WRITE_ONCE(stock->cached[i], memcg);
> - WRITE_ONCE(stock->nr_pages[i], stock_pages);
> + WRITE_ONCE(stock->nr_pages[i], nr_pages);
> }
>
> local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-29 12:13 ` Michal Hocko
@ 2025-04-29 18:43 ` Shakeel Butt
2025-04-30 6:48 ` Michal Hocko
0 siblings, 1 reply; 17+ messages in thread
From: Shakeel Butt @ 2025-04-29 18:43 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Johannes Weiner, Roman Gushchin, Muchun Song,
Vlastimil Babka, Jakub Kicinski, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
On Tue, Apr 29, 2025 at 02:13:16PM +0200, Michal Hocko wrote:
>
> > Some of the design choices are:
> >
> > 1. Fit all caches memcgs in a single cacheline.
>
> Could you be more specific about the reasoning? I suspect it is for the
> network receive path you are mentioning above, right?
>
Here I meant why I chose NR_MEMCG_STOCK to be 7. Basically, the first
cacheline of the per-cpu stock holds all the cached memcgs, so checking
whether a given memcg is cached should be comparably cheap to the single
cached memcg case. The comment you suggested already mentions this.
However, please note that we may find in the future that 2 cachelines
worth of cached memcgs are better for a wider range of workloads, but
for simplicity let's start with a single cacheline worth of cached
memcgs.
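For illustration, here is a back-of-the-envelope layout sketch of that
first cacheline (assuming 8-byte pointers; the struct name below is made
up, and the real memcg_stock_pcp also carries the percpu stock_lock in
the same line):

	#define NR_MEMCG_STOCK 7

	struct cacheline_sketch {
		unsigned char nr_pages[NR_MEMCG_STOCK];	/* 7 x 1 byte =  7 bytes */
		void *cached[NR_MEMCG_STOCK];		/* 7 x 8 byte = 56 bytes */
	};
	/*
	 * sizeof(struct cacheline_sketch) == 64 on a typical 64-bit build,
	 * i.e. exactly one cacheline (1 byte of padding after nr_pages).
	 */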
[...]
>
> Just a minor suggestion below. Other than that looks good to me (with
> follow up fixes) in this thread.
> Acked-by: Michal Hocko <mhocko@suse.com>
> Thanks!
>
Thanks, I will send a diff for Andrew to squash into the original patch.
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-29 18:43 ` Shakeel Butt
@ 2025-04-30 6:48 ` Michal Hocko
0 siblings, 0 replies; 17+ messages in thread
From: Michal Hocko @ 2025-04-30 6:48 UTC (permalink / raw)
To: Shakeel Butt
Cc: Andrew Morton, Johannes Weiner, Roman Gushchin, Muchun Song,
Vlastimil Babka, Jakub Kicinski, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
On Tue 29-04-25 11:43:29, Shakeel Butt wrote:
> On Tue, Apr 29, 2025 at 02:13:16PM +0200, Michal Hocko wrote:
> >
> > > Some of the design choices are:
> > >
> > > 1. Fit all caches memcgs in a single cacheline.
> >
> > Could you be more specific about the reasoning? I suspect it is for the
> > network receive path you are mentioning above, right?
> >
>
> Here I meant why I chose NR_MEMCG_STOCK to be 7. Basically the first
> cacheline of per-cpu stock has all the cached memcg, so checking if a
> given memcg is cached or not should be comparable cheap as single cached
> memcg. You suggested comment already mentioned this.
>
> However please note that we may find in future that 2 cachelines worth of
> cached memcgs are better for wider audience/workloads but for simplicity
> let's start with single cacheline worth of cached memcgs.
Right, and this is exactly why extended reasoning is really due. We
have introduced magic constants in the past and then struggled to decide
whether they might regress something.
I can imagine that we could extend the caching to be adaptive in the
future and scale with the number of cgroups in some way.
>
> [...]
> >
> > Just a minor suggestion below. Other than that looks good to me (with
> > follow up fixes) in this thread.
> > Acked-by: Michal Hocko <mhocko@suse.com>
> > Thanks!
> >
>
> Thanks, I will send a diff for Andrew to squash it into original patch.
Thanks!
--
Michal Hocko
SUSE Labs
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-16 18:02 [PATCH] memcg: multi-memcg percpu charge cache Shakeel Butt
` (3 preceding siblings ...)
2025-04-29 12:13 ` Michal Hocko
@ 2025-04-30 9:57 ` Vlastimil Babka
2025-04-30 15:05 ` Shakeel Butt
2025-04-30 15:32 ` Shakeel Butt
5 siblings, 1 reply; 17+ messages in thread
From: Vlastimil Babka @ 2025-04-30 9:57 UTC (permalink / raw)
To: Shakeel Butt, Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Jakub Kicinski, Eric Dumazet, Soheil Hassas Yeganeh, linux-mm,
cgroups, netdev, linux-kernel, Meta kernel team
On 4/16/25 20:02, Shakeel Butt wrote:
> Memory cgroup accounting is expensive and to reduce the cost, the kernel
> maintains per-cpu charge cache for a single memcg. So, if a charge
> request comes for a different memcg, the kernel will flush the old
> memcg's charge cache and then charge the newer memcg a fixed amount (64
> pages), subtracts the charge request amount and stores the remaining in
> the per-cpu charge cache for the newer memcg.
>
> This mechanism is based on the assumption that the kernel, for locality,
> keep a process on a CPU for long period of time and most of the charge
> requests from that process will be served by that CPU's local charge
> cache.
>
> However this assumption breaks down for incoming network traffic in a
> multi-tenant machine. We are in the process of running multiple
> workloads on a single machine and if such workloads are network heavy,
> we are seeing very high network memory accounting cost. We have observed
> multiple CPUs spending almost 100% of their time in net_rx_action and
> almost all of that time is spent in memcg accounting of the network
> traffic.
>
> More precisely, net_rx_action is serving packets from multiple workloads
> and is observing/serving mix of packets of these workloads. The memcg
> switch of per-cpu cache is very expensive and we are observing a lot of
> memcg switches on the machine. Almost all the time is being spent on
> charging new memcg and flushing older memcg cache. So, definitely we
> need per-cpu cache that support multiple memcgs for this scenario.
>
> This patch implements a simple (and dumb) multiple memcg percpu charge
> cache. Actually we started with more sophisticated LRU based approach but
> the dumb one was always better than the sophisticated one by 1% to 3%,
> so going with the simple approach.
>
> Some of the design choices are:
>
> 1. Fit all caches memcgs in a single cacheline.
> 2. The cache array can be mix of empty slots or memcg charged slots, so
> the kernel has to traverse the full array.
> 3. The cache drain from the reclaim will drain all cached memcgs to keep
> things simple.
>
> To evaluate the impact of this optimization, on a 72 CPUs machine, we
> ran the following workload where each netperf client runs in a different
> cgroup. The next-20250415 kernel is used as base.
>
> $ netserver -6
> $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K
>
> number of clients | Without patch | With patch
> 6 | 42584.1 Mbps | 48603.4 Mbps (14.13% improvement)
> 12 | 30617.1 Mbps | 47919.7 Mbps (56.51% improvement)
> 18 | 25305.2 Mbps | 45497.3 Mbps (79.79% improvement)
> 24 | 20104.1 Mbps | 37907.7 Mbps (88.55% improvement)
> 30 | 14702.4 Mbps | 30746.5 Mbps (109.12% improvement)
> 36 | 10801.5 Mbps | 26476.3 Mbps (145.11% improvement)
>
> The results show drastic improvement for network intensive workloads.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
See below
> ---
> mm/memcontrol.c | 128 ++++++++++++++++++++++++++++++++++--------------
> 1 file changed, 91 insertions(+), 37 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1ad326e871c1..0a02ba07561e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1769,10 +1769,11 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
> pr_cont(" are going to be killed due to memory.oom.group set\n");
> }
>
> +#define NR_MEMCG_STOCK 7
> struct memcg_stock_pcp {
> local_trylock_t stock_lock;
> - struct mem_cgroup *cached; /* this never be root cgroup */
> - unsigned int nr_pages;
> + uint8_t nr_pages[NR_MEMCG_STOCK];
> + struct mem_cgroup *cached[NR_MEMCG_STOCK];
I have noticed memcg_stock is a DEFINE_PER_CPU and not
DEFINE_PER_CPU_ALIGNED, so I think the intended cacheline usage isn't
guaranteed now.
I actually tried compiling and got this in objdump -t vmlinux:
ffffffff83a26e60 l O .data..percpu 0000000000000088 memcg_stock
AFAICS that's aligned to only 32 bytes (0x60 is 96), not 64.
Changing to _ALIGNED gives me:
ffffffff83a2c5c0 l O .data..percpu 0000000000000088 memcg_stock
0xc0 is 192, a multiple of 64, so it seems to work as intended and is
indeed necessary. So you should change it too while adding the comment.
> struct obj_cgroup *cached_objcg;
> struct pglist_data *cached_pgdat;
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-25 20:18 ` Shakeel Butt
2025-04-29 9:40 ` Hugh Dickins
@ 2025-04-30 10:05 ` Vlastimil Babka
2025-04-30 15:16 ` Shakeel Butt
1 sibling, 1 reply; 17+ messages in thread
From: Vlastimil Babka @ 2025-04-30 10:05 UTC (permalink / raw)
To: Shakeel Butt, Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Jakub Kicinski, Eric Dumazet, Soheil Hassas Yeganeh, linux-mm,
cgroups, netdev, linux-kernel, Meta kernel team, Hugh Dickins
On 4/25/25 22:18, Shakeel Butt wrote:
> Hi Andrew,
>
> Another fix for this patch. Basically simplification of refill_stock and
> avoiding multiple cached entries of a memcg.
>
> From 6f6f7736799ad8ca5fee48eca7b7038f6c9bb5b9 Mon Sep 17 00:00:00 2001
> From: Shakeel Butt <shakeel.butt@linux.dev>
> Date: Fri, 25 Apr 2025 13:10:43 -0700
> Subject: [PATCH] memcg: multi-memcg percpu charge cache - fix 2
>
> Simplify refill_stock by avoiding goto and doing the operations inline
> and make sure the given memcg is not cached multiple times.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
It seems to me you could simplify further based on how the cached/nr_pages
arrays are filled from index 0 upwards, and thus if you see a NULL it means
all higher indices are also NULL. At least I don't think there's ever a
drain_stock() that would "punch a NULL" in the middle? When it's done in
refill_stock() for the random index, it's immediately reused.
Of course, if that invariant were made official and relied upon, it would
need to be documented and care taken not to break it.
But then I think:
- refill_stock() could be further simplified
- loops in consume_stock() and is_drain_needed() could stop on the first
NULL cached[i] encountered (sketched after this list).
WDYT?
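For illustration, a lookup relying on that dense-packing invariant could
bail out early, roughly like this (a sketch only, assuming the invariant
holds; find_cached() is a made-up helper, not code from the patch):

	/* Return the slot index caching @memcg, or -1 if it is not cached. */
	static int find_cached(struct memcg_stock_pcp *stock,
			       struct mem_cgroup *memcg)
	{
		int i;

		for (i = 0; i < NR_MEMCG_STOCK; ++i) {
			struct mem_cgroup *cached = READ_ONCE(stock->cached[i]);

			/* dense packing: a NULL slot means nothing above it */
			if (!cached)
				break;
			if (cached == memcg)
				return i;
		}
		return -1;
	}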
> ---
> mm/memcontrol.c | 27 +++++++++++++++------------
> 1 file changed, 15 insertions(+), 12 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 997e2da5d2ca..9dfdbb2fcccc 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1907,7 +1907,8 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> struct mem_cgroup *cached;
> uint8_t stock_pages;
> unsigned long flags;
> - bool evict = true;
> + bool success = false;
> + int empty_slot = -1;
> int i;
>
> /*
> @@ -1931,26 +1932,28 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
>
> stock = this_cpu_ptr(&memcg_stock);
> for (i = 0; i < NR_MEMCG_STOCK; ++i) {
> -again:
> cached = READ_ONCE(stock->cached[i]);
> - if (!cached) {
> - css_get(&memcg->css);
> - WRITE_ONCE(stock->cached[i], memcg);
> - }
> - if (!cached || memcg == READ_ONCE(stock->cached[i])) {
> + if (!cached && empty_slot == -1)
> + empty_slot = i;
> + if (memcg == READ_ONCE(stock->cached[i])) {
> stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
> WRITE_ONCE(stock->nr_pages[i], stock_pages);
> if (stock_pages > MEMCG_CHARGE_BATCH)
> drain_stock(stock, i);
> - evict = false;
> + success = true;
> break;
> }
> }
>
> - if (evict) {
> - i = get_random_u32_below(NR_MEMCG_STOCK);
> - drain_stock(stock, i);
> - goto again;
> + if (!success) {
> + i = empty_slot;
> + if (i == -1) {
> + i = get_random_u32_below(NR_MEMCG_STOCK);
> + drain_stock(stock, i);
> + }
> + css_get(&memcg->css);
> + WRITE_ONCE(stock->cached[i], memcg);
> + WRITE_ONCE(stock->nr_pages[i], stock_pages);
> }
>
> local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-30 9:57 ` Vlastimil Babka
@ 2025-04-30 15:05 ` Shakeel Butt
0 siblings, 0 replies; 17+ messages in thread
From: Shakeel Butt @ 2025-04-30 15:05 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Roman Gushchin,
Muchun Song, Jakub Kicinski, Eric Dumazet, Soheil Hassas Yeganeh,
linux-mm, cgroups, netdev, linux-kernel, Meta kernel team
On Wed, Apr 30, 2025 at 11:57:13AM +0200, Vlastimil Babka wrote:
> On 4/16/25 20:02, Shakeel Butt wrote:
> > Memory cgroup accounting is expensive and to reduce the cost, the kernel
> > maintains per-cpu charge cache for a single memcg. So, if a charge
> > request comes for a different memcg, the kernel will flush the old
> > memcg's charge cache and then charge the newer memcg a fixed amount (64
> > pages), subtracts the charge request amount and stores the remaining in
> > the per-cpu charge cache for the newer memcg.
> >
> > This mechanism is based on the assumption that the kernel, for locality,
> > keep a process on a CPU for long period of time and most of the charge
> > requests from that process will be served by that CPU's local charge
> > cache.
> >
> > However this assumption breaks down for incoming network traffic in a
> > multi-tenant machine. We are in the process of running multiple
> > workloads on a single machine and if such workloads are network heavy,
> > we are seeing very high network memory accounting cost. We have observed
> > multiple CPUs spending almost 100% of their time in net_rx_action and
> > almost all of that time is spent in memcg accounting of the network
> > traffic.
> >
> > More precisely, net_rx_action is serving packets from multiple workloads
> > and is observing/serving mix of packets of these workloads. The memcg
> > switch of per-cpu cache is very expensive and we are observing a lot of
> > memcg switches on the machine. Almost all the time is being spent on
> > charging new memcg and flushing older memcg cache. So, definitely we
> > need per-cpu cache that support multiple memcgs for this scenario.
> >
> > This patch implements a simple (and dumb) multiple memcg percpu charge
> > cache. Actually we started with more sophisticated LRU based approach but
> > the dumb one was always better than the sophisticated one by 1% to 3%,
> > so going with the simple approach.
> >
> > Some of the design choices are:
> >
> > 1. Fit all caches memcgs in a single cacheline.
> > 2. The cache array can be mix of empty slots or memcg charged slots, so
> > the kernel has to traverse the full array.
> > 3. The cache drain from the reclaim will drain all cached memcgs to keep
> > things simple.
> >
> > To evaluate the impact of this optimization, on a 72 CPUs machine, we
> > ran the following workload where each netperf client runs in a different
> > cgroup. The next-20250415 kernel is used as base.
> >
> > $ netserver -6
> > $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K
> >
> > number of clients | Without patch | With patch
> > 6 | 42584.1 Mbps | 48603.4 Mbps (14.13% improvement)
> > 12 | 30617.1 Mbps | 47919.7 Mbps (56.51% improvement)
> > 18 | 25305.2 Mbps | 45497.3 Mbps (79.79% improvement)
> > 24 | 20104.1 Mbps | 37907.7 Mbps (88.55% improvement)
> > 30 | 14702.4 Mbps | 30746.5 Mbps (109.12% improvement)
> > 36 | 10801.5 Mbps | 26476.3 Mbps (145.11% improvement)
> >
> > The results show drastic improvement for network intensive workloads.
> >
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
>
> See below
>
> > ---
> > mm/memcontrol.c | 128 ++++++++++++++++++++++++++++++++++--------------
> > 1 file changed, 91 insertions(+), 37 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 1ad326e871c1..0a02ba07561e 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -1769,10 +1769,11 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
> > pr_cont(" are going to be killed due to memory.oom.group set\n");
> > }
> >
> > +#define NR_MEMCG_STOCK 7
> > struct memcg_stock_pcp {
> > local_trylock_t stock_lock;
> > - struct mem_cgroup *cached; /* this never be root cgroup */
> > - unsigned int nr_pages;
> > + uint8_t nr_pages[NR_MEMCG_STOCK];
> > + struct mem_cgroup *cached[NR_MEMCG_STOCK];
>
> I have noticed memcg_stock is a DEFINE_PER_CPU and not
> DEFINE_PER_CPU_ALIGNED so I think that the intended cacheline usage isn't
> guaranteed now.
>
> Actually tried compiling and got in objdump -t vmlinux:
>
> ffffffff83a26e60 l O .data..percpu 0000000000000088 memcg_stock
>
> AFAICS that's aligned to 32 bytes only (0x60 is 96) bytes, not 64.
>
> changing to _ALIGNED gives me:
>
> ffffffff83a2c5c0 l O .data..percpu 0000000000000088 memcg_stock
>
> 0xc0 is 192 so multiple of 64, so seems to work as intended and indeed
> necessary. So you should change it too while adding the comment.
>
Wow I didn't notice this at all. Thanks a lot. I will fix this in the
next fix diff.
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-30 10:05 ` Vlastimil Babka
@ 2025-04-30 15:16 ` Shakeel Butt
0 siblings, 0 replies; 17+ messages in thread
From: Shakeel Butt @ 2025-04-30 15:16 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Roman Gushchin,
Muchun Song, Jakub Kicinski, Eric Dumazet, Soheil Hassas Yeganeh,
linux-mm, cgroups, netdev, linux-kernel, Meta kernel team,
Hugh Dickins
On Wed, Apr 30, 2025 at 12:05:48PM +0200, Vlastimil Babka wrote:
> On 4/25/25 22:18, Shakeel Butt wrote:
> > Hi Andrew,
> >
> > Another fix for this patch. Basically simplification of refill_stock and
> > avoiding multiple cached entries of a memcg.
> >
> > From 6f6f7736799ad8ca5fee48eca7b7038f6c9bb5b9 Mon Sep 17 00:00:00 2001
> > From: Shakeel Butt <shakeel.butt@linux.dev>
> > Date: Fri, 25 Apr 2025 13:10:43 -0700
> > Subject: [PATCH] memcg: multi-memcg percpu charge cache - fix 2
> >
> > Simplify refill_stock by avoiding goto and doing the operations inline
> > and make sure the given memcg is not cached multiple times.
> >
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
>
> It seems to me you could simplify further based on how cached/nr_pages
> arrays are filled from 0 to higher index and thus if you see a NULL it means
> all higher indices are also NULL. At least I don't think there's ever a
> drain_stock() that would "punch a NULL" in the middle? When it's done in
> refill_stock() for the random index, it's immediately reused.
>
> Of course if that invariant was made official and relied upon, it would need
> to be documented and care taken not to break it.
>
> But then I think:
> - refill_stock() could be further simplified
> - loops in consume_stop() and is_drain_needed() could stop on first NULL
> cached[i] encountered.
>
> WDYT?
>
Please see below.
> > ---
> > mm/memcontrol.c | 27 +++++++++++++++------------
> > 1 file changed, 15 insertions(+), 12 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 997e2da5d2ca..9dfdbb2fcccc 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -1907,7 +1907,8 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> > struct mem_cgroup *cached;
> > uint8_t stock_pages;
> > unsigned long flags;
> > - bool evict = true;
> > + bool success = false;
> > + int empty_slot = -1;
> > int i;
> >
> > /*
> > @@ -1931,26 +1932,28 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> >
> > stock = this_cpu_ptr(&memcg_stock);
> > for (i = 0; i < NR_MEMCG_STOCK; ++i) {
> > -again:
> > cached = READ_ONCE(stock->cached[i]);
> > - if (!cached) {
> > - css_get(&memcg->css);
> > - WRITE_ONCE(stock->cached[i], memcg);
> > - }
> > - if (!cached || memcg == READ_ONCE(stock->cached[i])) {
> > + if (!cached && empty_slot == -1)
> > + empty_slot = i;
> > + if (memcg == READ_ONCE(stock->cached[i])) {
> > stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
> > WRITE_ONCE(stock->nr_pages[i], stock_pages);
> > if (stock_pages > MEMCG_CHARGE_BATCH)
> > drain_stock(stock, i);
So, this drain_stock() above can punch a NULL hole in the array, but I
think I do see your point. We can fill this hole by moving the last
non-NULL entry here. For now I plan to keep it as is, as I have some
follow-up plans to make this specific drain_stock() conditional on the
caller (somewhat similar to commit 5387c90490f7) and then I will
re-check whether we can eliminate this NULL hole.
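For the record, a hypothetical sketch of that "move the last non-NULL
entry into the hole" idea (not part of the posted patch; compact_stock()
is a made-up helper, and it assumes stock_lock is held, as in the rest
of refill_stock()):

	static void compact_stock(struct memcg_stock_pcp *stock, int hole)
	{
		int last;

		/* Find the highest occupied slot above the hole. */
		for (last = NR_MEMCG_STOCK - 1; last > hole; last--)
			if (READ_ONCE(stock->cached[last]))
				break;

		if (last == hole)
			return;

		/* Move it down so cached[] stays densely packed from index 0. */
		WRITE_ONCE(stock->cached[hole], READ_ONCE(stock->cached[last]));
		WRITE_ONCE(stock->nr_pages[hole], READ_ONCE(stock->nr_pages[last]));
		WRITE_ONCE(stock->cached[last], NULL);
		WRITE_ONCE(stock->nr_pages[last], 0);
	}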
Thanks a lot for the reviews and suggestions.
* Re: [PATCH] memcg: multi-memcg percpu charge cache
2025-04-16 18:02 [PATCH] memcg: multi-memcg percpu charge cache Shakeel Butt
` (4 preceding siblings ...)
2025-04-30 9:57 ` Vlastimil Babka
@ 2025-04-30 15:32 ` Shakeel Butt
5 siblings, 0 replies; 17+ messages in thread
From: Shakeel Butt @ 2025-04-30 15:32 UTC (permalink / raw)
To: Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Vlastimil Babka, Jakub Kicinski, Eric Dumazet,
Soheil Hassas Yeganeh, linux-mm, cgroups, netdev, linux-kernel,
Meta kernel team
Andrew, please find another fix/improvement for this patch below.
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Wed, 30 Apr 2025 08:28:23 -0700
Subject: [PATCH] memcg: multi-memcg percpu charge cache - fix 4
Add the comment suggested by Michal and use DEFINE_PER_CPU_ALIGNED
instead of DEFINE_PER_CPU, as suggested by Vlastimil.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
mm/memcontrol.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5a07e0375254..b877287aeb11 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1775,6 +1775,10 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
pr_cont(" are going to be killed due to memory.oom.group set\n");
}
+/*
+ * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
+ * nr_pages in a single cacheline. This may change in the future.
+ */
#define NR_MEMCG_STOCK 7
struct memcg_stock_pcp {
local_trylock_t stock_lock;
@@ -1791,7 +1795,7 @@ struct memcg_stock_pcp {
unsigned long flags;
#define FLUSHING_CACHED_CHARGE 0
};
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
+static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
.stock_lock = INIT_LOCAL_TRYLOCK(stock_lock),
};
static DEFINE_MUTEX(percpu_charge_mutex);
--
2.47.1