* [PATCH 1/8 RFC] mm/page_counter: introduce per-page_counter stock
2026-04-10 21:06 [PATCH 0/8 RFC] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
@ 2026-04-10 21:06 ` Joshua Hahn
2026-04-10 21:06 ` [PATCH 2/8 RFC] mm/page_counter: use page_counter_stock in page_counter_try_charge Joshua Hahn
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Joshua Hahn @ 2026-04-10 21:06 UTC (permalink / raw)
To: Johannes Weiner
Cc: Michal Hocko, Roman Gushchin, Shakeel Butt, Muchun Song,
Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, cgroups, linux-mm, linux-kernel, kernel-team
In order to avoid expensive hierarchy walks on every memcg charge and
limit check, memcontrol uses per-cpu stocks (memcg_stock_pcp) to cache
pre-charged pages and introduce a fast path to try_charge_memcg.
However, there are a few quirks with the current implementation that
could be improved upon.
First, each memcg_stock_pcp can only cache the charges of 7 memcgs
(defined as NR_MEMCG_STOCK), which means that once a CPU starts handling
the charging of more than 7 memcgs, it randomly selects a victim memcg
to evict and drain from the cpu, which can cause unnecessarily increased
latencies and thrashing as memcgs continually evict each other's stock.
Second, stock is tightly coupled with memcg, which means that all page
counters in a memcg share the same resource. This may simplify some of
the charging logic, but it prevents new page counters from being added
and using a separate stock.
We can address these concerns by pushing the concept of stock down to
the page_counter level, which addresses the random eviction problem by
getting rid of the 7 slot limit, and makes enabling separate stock
caches for other page_counters simpler.
Introduce a generic per-cpu stock directly in struct page_counter.
Stock can optionally be enabled per-page_counter, limiting the overhead
increase for page_counters that do not benefit greatly from caching
charges.
This patch introduces the page_counter_stock struct and its
enable/disable/free functions, but does not use these yet.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
include/linux/page_counter.h | 13 ++++++++
mm/page_counter.c | 60 ++++++++++++++++++++++++++++++++++++
2 files changed, 73 insertions(+)
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index d649b6bbbc871..c7e3ab3356d20 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -5,8 +5,15 @@
#include <linux/atomic.h>
#include <linux/cache.h>
#include <linux/limits.h>
+#include <linux/local_lock.h>
+#include <linux/percpu.h>
#include <asm/page.h>
+struct page_counter_stock {
+ local_trylock_t lock;
+ unsigned long nr_pages;
+};
+
struct page_counter {
/*
* Make sure 'usage' does not share cacheline with any other field in
@@ -41,6 +48,8 @@ struct page_counter {
unsigned long high;
unsigned long max;
struct page_counter *parent;
+ struct page_counter_stock __percpu *stock;
+ unsigned int batch;
} ____cacheline_internodealigned_in_smp;
#if BITS_PER_LONG == 32
@@ -99,6 +108,10 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
counter->watermark = usage;
}
+int page_counter_enable_stock(struct page_counter *counter, unsigned int batch);
+void page_counter_disable_stock(struct page_counter *counter);
+void page_counter_free_stock(struct page_counter *counter);
+
#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
void page_counter_calculate_protection(struct page_counter *root,
struct page_counter *counter,
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 661e0f2a5127a..965021993e161 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -8,6 +8,7 @@
#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
+#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
@@ -289,6 +290,65 @@ int page_counter_memparse(const char *buf, const char *max,
return 0;
}
+int page_counter_enable_stock(struct page_counter *counter, unsigned int batch)
+{
+ struct page_counter_stock __percpu *stock;
+ int cpu;
+
+ stock = alloc_percpu(struct page_counter_stock);
+ if (!stock)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct page_counter_stock *s = per_cpu_ptr(stock, cpu);
+
+ local_trylock_init(&s->lock);
+ }
+ counter->stock = stock;
+ counter->batch = batch;
+
+ return 0;
+}
+
+void page_counter_disable_stock(struct page_counter *counter)
+{
+ unsigned int stock_to_drain = 0;
+ int cpu;
+
+ if (!counter->stock)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct page_counter_stock *stock;
+
+ /*
+ * No need for local lock; this is called during css_offline,
+ * after the cgroup has already been removed.
+ */
+ stock = per_cpu_ptr(counter->stock, cpu);
+ stock_to_drain += stock->nr_pages;
+ }
+
+ if (stock_to_drain) {
+ struct page_counter *c;
+
+ for (c = counter; c; c = c->parent)
+ page_counter_cancel(c, stock_to_drain);
+ }
+
+ /* This prevents future charges from trying to deposit pages */
+ counter->batch = 0;
+}
+
+void page_counter_free_stock(struct page_counter *counter)
+{
+ if (!counter->stock)
+ return;
+
+ free_percpu(counter->stock);
+ counter->stock = NULL;
+}
+
#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
/*
--
2.52.0
^ permalink raw reply [flat|nested] 9+ messages in thread* [PATCH 2/8 RFC] mm/page_counter: use page_counter_stock in page_counter_try_charge
2026-04-10 21:06 [PATCH 0/8 RFC] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
2026-04-10 21:06 ` [PATCH 1/8 RFC] mm/page_counter: introduce per-page_counter stock Joshua Hahn
@ 2026-04-10 21:06 ` Joshua Hahn
2026-04-10 21:06 ` [PATCH 3/8 RFC] mm/page_counter: use page_counter_stock in page_counter_uncharge Joshua Hahn
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Joshua Hahn @ 2026-04-10 21:06 UTC (permalink / raw)
To: Johannes Weiner
Cc: Michal Hocko, Roman Gushchin, Shakeel Butt, Muchun Song,
Andrew Morton, cgroups, linux-mm, linux-kernel, kernel-team
Make page_counter_try_charge() stock-aware. We preserve the same
semantics as the existing stock handling logic in try_charge_memcg:
1. Limit-check against the stock. If there is enough, charge to the
stock (non-hierarchical) and return immediately.
2. Greedily attempt to fulfill the charge request and fill the stock up
at the same time via a hierarchical charge.
3. If we fail with this charge, retry again (once) with the exact number
of pages requested.
4. If we succeed with the greedy attempt, then try to add those extra
pages to the stock. If that fails (trylock), then uncharge those
surplus pages hierarchically.
As of this patch, the page_counter_stock is unused, as it has not been
enabled on any memcg yet. No functional changes intended.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
mm/page_counter.c | 41 ++++++++++++++++++++++++++++++++++++++---
1 file changed, 38 insertions(+), 3 deletions(-)
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 965021993e161..7a921872079b8 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -121,9 +121,24 @@ bool page_counter_try_charge(struct page_counter *counter,
struct page_counter **fail)
{
struct page_counter *c;
+ unsigned long charge = nr_pages;
bool protection = track_protection(counter);
bool track_failcnt = counter->track_failcnt;
+ if (counter->stock && local_trylock(&counter->stock->lock)) {
+ struct page_counter_stock *stock = this_cpu_ptr(counter->stock);
+
+ if (stock->nr_pages >= charge) {
+ stock->nr_pages -= charge;
+ local_unlock(&counter->stock->lock);
+ return true;
+ }
+ local_unlock(&counter->stock->lock);
+ }
+
+ charge = max_t(unsigned long, counter->batch, nr_pages);
+
+retry:
for (c = counter; c; c = c->parent) {
long new;
/*
@@ -140,9 +155,9 @@ bool page_counter_try_charge(struct page_counter *counter,
* we either see the new limit or the setter sees the
* counter has changed and retries.
*/
- new = atomic_long_add_return(nr_pages, &c->usage);
+ new = atomic_long_add_return(charge, &c->usage);
if (new > c->max) {
- atomic_long_sub(nr_pages, &c->usage);
+ atomic_long_sub(charge, &c->usage);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt which is only used
@@ -163,11 +178,31 @@ bool page_counter_try_charge(struct page_counter *counter,
WRITE_ONCE(c->watermark, new);
}
}
+
+ /* charge > nr_pages implies this page_counter has stock enabled */
+ if (charge > nr_pages) {
+ if (local_trylock(&counter->stock->lock)) {
+ struct page_counter_stock *stock;
+
+ stock = this_cpu_ptr(counter->stock);
+ stock->nr_pages += charge - nr_pages;
+ local_unlock(&counter->stock->lock);
+ } else {
+ page_counter_uncharge(counter, charge - nr_pages);
+ }
+ }
+
return true;
failed:
for (c = counter; c != *fail; c = c->parent)
- page_counter_cancel(c, nr_pages);
+ page_counter_cancel(c, charge);
+
+ if (charge > nr_pages) {
+ /* Retry without trying to grab extra pages to refill stock */
+ charge = nr_pages;
+ goto retry;
+ }
return false;
}
--
2.52.0
^ permalink raw reply [flat|nested] 9+ messages in thread* [PATCH 3/8 RFC] mm/page_counter: use page_counter_stock in page_counter_uncharge
2026-04-10 21:06 [PATCH 0/8 RFC] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
2026-04-10 21:06 ` [PATCH 1/8 RFC] mm/page_counter: introduce per-page_counter stock Joshua Hahn
2026-04-10 21:06 ` [PATCH 2/8 RFC] mm/page_counter: use page_counter_stock in page_counter_try_charge Joshua Hahn
@ 2026-04-10 21:06 ` Joshua Hahn
2026-04-10 21:06 ` [PATCH 4/8 RFC] mm/page_counter: introduce stock drain APIs Joshua Hahn
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Joshua Hahn @ 2026-04-10 21:06 UTC (permalink / raw)
To: Johannes Weiner
Cc: Michal Hocko, Roman Gushchin, Shakeel Butt, Muchun Song,
Andrew Morton, cgroups, linux-mm, linux-kernel, kernel-team
Make page_counter_uncharge() stock-aware. We preserve the same semantics
as the existing stock handling logic in try_charge_memcg:
1. Instead of immediately walking the page_counter hierarchy, see if
depositing the charge to the stock puts it over the batch limit.
If not, deposit the charge and return immediately.
2. If we put the stock over the batch limit, walk up the page_counter
hierarchy and uncharge the excess.
Extract the repeated work of hierarchically cancelling page_counter
charges into a helper function as well.
As of this patch, the page_counter_stock is unused, as it has not been
enabled on any memcg yet. No functional changes intended.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
mm/page_counter.c | 36 +++++++++++++++++++++++++++---------
1 file changed, 27 insertions(+), 9 deletions(-)
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 7a921872079b8..7be214034bfad 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -207,6 +207,15 @@ bool page_counter_try_charge(struct page_counter *counter,
return false;
}
+static void page_counter_cancel_hierarchy(struct page_counter *counter,
+ unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ for (c = counter; c; c = c->parent)
+ page_counter_cancel(c, nr_pages);
+}
+
/**
* page_counter_uncharge - hierarchically uncharge pages
* @counter: counter
@@ -214,10 +223,23 @@ bool page_counter_try_charge(struct page_counter *counter,
*/
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
{
- struct page_counter *c;
+ unsigned long charge = nr_pages;
- for (c = counter; c; c = c->parent)
- page_counter_cancel(c, nr_pages);
+ if (counter->stock && local_trylock(&counter->stock->lock)) {
+ struct page_counter_stock *stock = this_cpu_ptr(counter->stock);
+
+ stock->nr_pages += nr_pages;
+ if (stock->nr_pages > counter->batch) {
+ charge = stock->nr_pages - counter->batch;
+ stock->nr_pages = counter->batch;
+ local_unlock(&counter->stock->lock);
+ } else {
+ local_unlock(&counter->stock->lock);
+ return;
+ }
+ }
+
+ page_counter_cancel_hierarchy(counter, charge);
}
/**
@@ -364,12 +386,8 @@ void page_counter_disable_stock(struct page_counter *counter)
stock_to_drain += stock->nr_pages;
}
- if (stock_to_drain) {
- struct page_counter *c;
-
- for (c = counter; c; c = c->parent)
- page_counter_cancel(c, stock_to_drain);
- }
+ if (stock_to_drain)
+ page_counter_cancel_hierarchy(counter, stock_to_drain);
/* This prevents future charges from trying to deposit pages */
counter->batch = 0;
--
2.52.0
^ permalink raw reply [flat|nested] 9+ messages in thread* [PATCH 4/8 RFC] mm/page_counter: introduce stock drain APIs
2026-04-10 21:06 [PATCH 0/8 RFC] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
` (2 preceding siblings ...)
2026-04-10 21:06 ` [PATCH 3/8 RFC] mm/page_counter: use page_counter_stock in page_counter_uncharge Joshua Hahn
@ 2026-04-10 21:06 ` Joshua Hahn
2026-04-10 21:06 ` [PATCH 5/8 RFC] mm/memcontrol: convert memcg to use page_counter_stock Joshua Hahn
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Joshua Hahn @ 2026-04-10 21:06 UTC (permalink / raw)
To: Johannes Weiner
Cc: Michal Hocko, Roman Gushchin, Shakeel Butt, Muchun Song,
Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, cgroups, linux-mm, linux-kernel, kernel-team
Introduce page_counter_drain_stock() and page_counter_drain_cpu()
to replace memcg stock draining functions.
page_counter_drain_stock() runs from drain_all_stock, which is called
when the system is under memory pressure or a cgroup is dying. Because
it is a rare operation, it uses work_on_cpu() to synchronously drain
each online CPU's stock and synchronizes with concurrent charge/uncharge
via local_lock.
page_counter_drain_cpu() handles the CPU hotplug dead path, where the
stock can be accessed directly without locking since the CPU is dead.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
include/linux/page_counter.h | 2 ++
mm/page_counter.c | 51 ++++++++++++++++++++++++++++++++++++
2 files changed, 53 insertions(+)
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index c7e3ab3356d20..c6772531074b5 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -111,6 +111,8 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
int page_counter_enable_stock(struct page_counter *counter, unsigned int batch);
void page_counter_disable_stock(struct page_counter *counter);
void page_counter_free_stock(struct page_counter *counter);
+void page_counter_drain_stock(struct page_counter *counter);
+void page_counter_drain_cpu(struct page_counter *counter, unsigned int cpu);
#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
void page_counter_calculate_protection(struct page_counter *root,
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 7be214034bfad..28c2e6442f7d3 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -12,6 +12,8 @@
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
+#include <linux/cpu.h>
+#include <linux/workqueue.h>
#include <asm/page.h>
static bool track_protection(struct page_counter *c)
@@ -402,6 +404,55 @@ void page_counter_free_stock(struct page_counter *counter)
counter->stock = NULL;
}
+static long page_counter_drain_stock_cpu(void *arg)
+{
+ struct page_counter *counter = arg;
+ struct page_counter_stock *stock;
+ unsigned long nr_pages;
+
+ local_lock(&counter->stock->lock);
+ stock = this_cpu_ptr(counter->stock);
+ nr_pages = stock->nr_pages;
+ stock->nr_pages = 0;
+ local_unlock(&counter->stock->lock);
+
+ if (nr_pages)
+ page_counter_cancel_hierarchy(counter, nr_pages);
+
+ return 0;
+}
+/*
+ * Drain per-cpu stock across all online CPUs. Caller (drain_all_stock) is
+ * already protected by a mutex, all future callers must serialize as well.
+ */
+void page_counter_drain_stock(struct page_counter *counter)
+{
+ int cpu;
+
+ if (!counter->stock)
+ return;
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu)
+ work_on_cpu(cpu, page_counter_drain_stock_cpu, counter);
+ cpus_read_unlock();
+}
+
+void page_counter_drain_cpu(struct page_counter *counter, unsigned int cpu)
+{
+ struct page_counter_stock *stock;
+ unsigned long nr_pages;
+
+ if (!counter->stock)
+ return;
+
+ stock = per_cpu_ptr(counter->stock, cpu);
+ nr_pages = stock->nr_pages;
+ if (nr_pages) {
+ stock->nr_pages = 0;
+ page_counter_cancel_hierarchy(counter, nr_pages);
+ }
+}
#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
/*
--
2.52.0
^ permalink raw reply [flat|nested] 9+ messages in thread* [PATCH 5/8 RFC] mm/memcontrol: convert memcg to use page_counter_stock
2026-04-10 21:06 [PATCH 0/8 RFC] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
` (3 preceding siblings ...)
2026-04-10 21:06 ` [PATCH 4/8 RFC] mm/page_counter: introduce stock drain APIs Joshua Hahn
@ 2026-04-10 21:06 ` Joshua Hahn
2026-04-10 21:07 ` [PATCH 6/8 RFC] mm/memcontrol: optimize memsw stock for cgroup v1 Joshua Hahn
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Joshua Hahn @ 2026-04-10 21:06 UTC (permalink / raw)
To: Johannes Weiner
Cc: Michal Hocko, Roman Gushchin, Shakeel Butt, Muchun Song,
Andrew Morton, cgroups, linux-mm, linux-kernel, kernel-team
Now with all of the memcg_stock handling logic replicated in
page_counter_stock, switch memcg to use the page_counter_stock.
There are a few details that have changed:
First, the old special-casing for the !allow_spinning check to avoid
refilling and flushing of the old stock is removed. This special casing
was important previously, because refilling the stock could do a lot of
extra work by evicting one of 7 random victim memcgs in the percpu
memcg_stock slots.
Now that we no longer randomly evict other memcg stocks, refilling just
adds extra pages to the local cache. While there may be extra work
attempted when trying to refill (rather than just servicing the exact
number of pages requested), this is much less work than the flushing of
other memcgs' stock.
Secondly, stock checking is folded into the memory page_counter. This
means that for cgroupv1 users who use the memsw page_counter, they will
always incur the cost of hierarchically charging for memsw. One possible
workaround for this is to introduce a separate stock for memsw, which
would allow for separate stock checks for both memsw and memory,
restoring the fastpath behavior.
Finally, we can now fail during page_counter_enable_stock(), if there is
not enough memory to allocate a percpu page_counter_stock. This failure
is rare and nonfatal; the system can continue to operate, with the page
counter working without stock and falling back to walking the hierarchy.
Note that obj_stock remains untouched by these changes.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
mm/memcontrol.c | 68 +++++++++++++++++------------------------------
mm/page_counter.c | 5 +---
2 files changed, 25 insertions(+), 48 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c3d98ab41f1f1..27d2edd5a7832 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2238,33 +2238,22 @@ static void schedule_drain_work(int cpu, struct work_struct *work)
*/
void drain_all_stock(struct mem_cgroup *root_memcg)
{
+ struct mem_cgroup *memcg;
int cpu, curcpu;
/* If someone's already draining, avoid adding running more workers. */
if (!mutex_trylock(&percpu_charge_mutex))
return;
- /*
- * Notify other cpus that system-wide "drain" is running
- * We do not care about races with the cpu hotplug because cpu down
- * as well as workers from this path always operate on the local
- * per-cpu data. CPU up doesn't touch memcg_stock at all.
- */
+
+ for_each_mem_cgroup_tree(memcg, root_memcg)
+ page_counter_drain_stock(&memcg->memory);
+
+ /* Drain obj_stock on all online CPUs */
migrate_disable();
curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
- struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);
- if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
- is_memcg_drain_needed(memcg_st, root_memcg) &&
- !test_and_set_bit(FLUSHING_CACHED_CHARGE,
- &memcg_st->flags)) {
- if (cpu == curcpu)
- drain_local_memcg_stock(&memcg_st->work);
- else
- schedule_drain_work(cpu, &memcg_st->work);
- }
-
if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
obj_stock_flush_required(obj_st, root_memcg) &&
!test_and_set_bit(FLUSHING_CACHED_CHARGE,
@@ -2281,9 +2270,13 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
+ struct mem_cgroup *memcg;
+
/* no need for the local lock */
drain_obj_stock(&per_cpu(obj_stock, cpu));
- drain_stock_fully(&per_cpu(memcg_stock, cpu));
+
+ for_each_mem_cgroup(memcg)
+ page_counter_drain_cpu(&memcg->memory, cpu);
return 0;
}
@@ -2558,7 +2551,6 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
- unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
@@ -2571,31 +2563,19 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
bool allow_spinning = gfpflags_allow_spinning(gfp_mask);
retry:
- if (consume_stock(memcg, nr_pages))
- return 0;
-
- if (!allow_spinning)
- /* Avoid the refill and flush of the older stock */
- batch = nr_pages;
-
reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
if (!do_memsw_account() ||
- page_counter_try_charge(&memcg->memsw, batch, &counter)) {
- if (page_counter_try_charge(&memcg->memory, batch, &counter))
+ page_counter_try_charge(&memcg->memsw, nr_pages, &counter)) {
+ if (page_counter_try_charge(&memcg->memory, nr_pages, &counter))
goto done_restock;
if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, batch);
+ page_counter_uncharge(&memcg->memsw, nr_pages);
mem_over_limit = mem_cgroup_from_counter(counter, memory);
} else {
mem_over_limit = mem_cgroup_from_counter(counter, memsw);
reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
}
- if (batch > nr_pages) {
- batch = nr_pages;
- goto retry;
- }
-
/*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
@@ -2692,9 +2672,6 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
return 0;
done_restock:
- if (batch > nr_pages)
- refill_stock(memcg, batch - nr_pages);
-
/*
* If the hierarchy is above the normal consumption range, schedule
* reclaim on returning to userland. We can perform reclaim here
@@ -2731,7 +2708,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
* and distribute reclaim work and delay penalties
* based on how much each task is actually allocating.
*/
- current->memcg_nr_pages_over_high += batch;
+ current->memcg_nr_pages_over_high += nr_pages;
set_notify_resume(current);
break;
}
@@ -3036,7 +3013,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
account_kmem_nmi_safe(memcg, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
if (!mem_cgroup_is_root(memcg))
- refill_stock(memcg, nr_pages);
+ memcg_uncharge(memcg, nr_pages);
css_put(&memcg->css);
}
@@ -3957,6 +3934,8 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ page_counter_free_stock(&memcg->memory);
+ page_counter_free_stock(&memcg->memsw);
lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
__mem_cgroup_free(memcg);
@@ -4130,6 +4109,9 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
refcount_set(&memcg->id.ref, 1);
css_get(css);
+ /* failure is nonfatal, charges fall back to direct hierarchy */
+ page_counter_enable_stock(&memcg->memory, MEMCG_CHARGE_BATCH);
+
/*
* Ensure mem_cgroup_from_private_id() works once we're fully online.
*
@@ -4192,6 +4174,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
lru_gen_offline_memcg(memcg);
drain_all_stock(memcg);
+ page_counter_disable_stock(&memcg->memory);
mem_cgroup_private_id_put(memcg, 1);
}
@@ -5382,7 +5365,7 @@ void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages)
mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
- refill_stock(memcg, nr_pages);
+ page_counter_uncharge(&memcg->memory, nr_pages);
}
void mem_cgroup_flush_workqueue(void)
@@ -5435,12 +5418,9 @@ int __init mem_cgroup_init(void)
memcg_wq = alloc_workqueue("memcg", WQ_PERCPU, 0);
WARN_ON(!memcg_wq);
- for_each_possible_cpu(cpu) {
- INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
- drain_local_memcg_stock);
+ for_each_possible_cpu(cpu)
INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work,
drain_local_obj_stock);
- }
memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 28c2e6442f7d3..51148ca3a5b63 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -421,10 +421,7 @@ static long page_counter_drain_stock_cpu(void *arg)
return 0;
}
-/*
- * Drain per-cpu stock across all online CPUs. Caller (drain_all_stock) is
- * already protected by a mutex, all future callers must serialize as well.
- */
+
void page_counter_drain_stock(struct page_counter *counter)
{
int cpu;
--
2.52.0
^ permalink raw reply [flat|nested] 9+ messages in thread* [PATCH 6/8 RFC] mm/memcontrol: optimize memsw stock for cgroup v1
2026-04-10 21:06 [PATCH 0/8 RFC] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
` (4 preceding siblings ...)
2026-04-10 21:06 ` [PATCH 5/8 RFC] mm/memcontrol: convert memcg to use page_counter_stock Joshua Hahn
@ 2026-04-10 21:07 ` Joshua Hahn
2026-04-10 21:07 ` [PATCH 7/8 RFC] mm/memcontrol: optimize stock usage for cgroup v2 Joshua Hahn
2026-04-10 21:07 ` [PATCH 8/8 RFC] mm/memcontrol: remove unused memcg_stock code Joshua Hahn
7 siblings, 0 replies; 9+ messages in thread
From: Joshua Hahn @ 2026-04-10 21:07 UTC (permalink / raw)
To: Johannes Weiner
Cc: Michal Hocko, Roman Gushchin, Shakeel Butt, Muchun Song,
Andrew Morton, cgroups, linux-mm, linux-kernel, kernel-team
Previously, each memcg had its own stock, which was shared by all page
counters within it. Specifically in try_charge_memcg, the stock limit
check would occur before the memsw and memory page_counters were
charged hierarchically.
Now that the memcg stock was folded into the page_counter level, and we
have replaced try_charge_memcg's stock check against the memory
page_counter's stock, this leaves no fast path available for cgroup v1's
memsw check.
Introduce a new stock for the memsw page_counter, charged and uncharged
independently from the memory page_counter. This provides better caching
on cgroup v1:
The best case scenario is when both the memsw and memory page_counters
can use their cached stock charge; this is the old behavior.
The halfway scenario is when either the memsw or memory page_counter
is within the stock size, but the other isn't. This requires one
hierarchical charge.
The worst case scenario is when both memsw and memory page_counters
are over their limit, and must walk two page_counter hierarchies. This
is the same as the old behavior.
By introducing an independent stock for memsw, we can avoid the worst
case scenario more often and can fail or succeed separately from the
memory page counter.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
mm/memcontrol.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 27d2edd5a7832..6d50f5d667434 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2245,8 +2245,10 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
if (!mutex_trylock(&percpu_charge_mutex))
return;
- for_each_mem_cgroup_tree(memcg, root_memcg)
+ for_each_mem_cgroup_tree(memcg, root_memcg) {
page_counter_drain_stock(&memcg->memory);
+ page_counter_drain_stock(&memcg->memsw);
+ }
/* Drain obj_stock on all online CPUs */
migrate_disable();
@@ -2275,8 +2277,10 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
/* no need for the local lock */
drain_obj_stock(&per_cpu(obj_stock, cpu));
- for_each_mem_cgroup(memcg)
+ for_each_mem_cgroup(memcg) {
page_counter_drain_cpu(&memcg->memory, cpu);
+ page_counter_drain_cpu(&memcg->memsw, cpu);
+ }
return 0;
}
@@ -4111,6 +4115,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
/* failure is nonfatal, charges fall back to direct hierarchy */
page_counter_enable_stock(&memcg->memory, MEMCG_CHARGE_BATCH);
+ if (do_memsw_account())
+ page_counter_enable_stock(&memcg->memsw, MEMCG_CHARGE_BATCH);
/*
* Ensure mem_cgroup_from_private_id() works once we're fully online.
@@ -4175,6 +4181,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
drain_all_stock(memcg);
page_counter_disable_stock(&memcg->memory);
+ page_counter_disable_stock(&memcg->memsw);
mem_cgroup_private_id_put(memcg, 1);
}
--
2.52.0
^ permalink raw reply [flat|nested] 9+ messages in thread* [PATCH 7/8 RFC] mm/memcontrol: optimize stock usage for cgroup v2
2026-04-10 21:06 [PATCH 0/8 RFC] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
` (5 preceding siblings ...)
2026-04-10 21:07 ` [PATCH 6/8 RFC] mm/memcontrol: optimize memsw stock for cgroup v1 Joshua Hahn
@ 2026-04-10 21:07 ` Joshua Hahn
2026-04-10 21:07 ` [PATCH 8/8 RFC] mm/memcontrol: remove unused memcg_stock code Joshua Hahn
7 siblings, 0 replies; 9+ messages in thread
From: Joshua Hahn @ 2026-04-10 21:07 UTC (permalink / raw)
To: Johannes Weiner
Cc: Michal Hocko, Roman Gushchin, Shakeel Butt, Muchun Song,
Andrew Morton, cgroups, linux-mm, linux-kernel, kernel-team
In cgroup v2, tasks can only belong to leaf cgroups, meaning non-leaf
cgroups never receive direct charges. Having stock remain in these
cgroups, therefore, is wasted percpu memory that will never be consumed
unless all of its children are removed.
To avoid leaving unused but accounted charges from remaining in non-leaf
cgroups, drain the stock when leaf cgroups become parents.
There is one caveat, which is concurrent charging and child creation.
When a leaf cgroup becomes a parent at the same time it is still
charging a task, there can be a race condition where the parent's
stock is drained, then refilled by the charge.
Instead of adding expensive synchronization mechanisms, accept the
pages kept captive by parent page_counters which will not be able to use
the stock until all its children are offlined first. It is a rare
race condition, and is also bounded by MEMCG_CHARGE_BATCH = 64 pages.
This optimization is not for cgroup v1, where tasks can be attached to
any cgroup in the hierarchy, meaning stock can be consumed & refilled
for non-leaf cgroups as well.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
mm/memcontrol.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6d50f5d667434..4be1638dde180 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4130,6 +4130,17 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
*/
xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL);
+ /*
+ * On v2, non-leaf memcgs cannot directly be charged. This child's
+ * parent is no longer a leaf, so drain the parent's stock.
+ */
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+
+ if (parent)
+ page_counter_drain_stock(&parent->memory);
+ }
+
return 0;
free_objcg:
for_each_node(nid) {
--
2.52.0
^ permalink raw reply [flat|nested] 9+ messages in thread* [PATCH 8/8 RFC] mm/memcontrol: remove unused memcg_stock code
2026-04-10 21:06 [PATCH 0/8 RFC] mm/memcontrol, page_counter: move stock from mem_cgroup to page_counter Joshua Hahn
` (6 preceding siblings ...)
2026-04-10 21:07 ` [PATCH 7/8 RFC] mm/memcontrol: optimize stock usage for cgroup v2 Joshua Hahn
@ 2026-04-10 21:07 ` Joshua Hahn
7 siblings, 0 replies; 9+ messages in thread
From: Joshua Hahn @ 2026-04-10 21:07 UTC (permalink / raw)
To: Johannes Weiner
Cc: Michal Hocko, Roman Gushchin, Shakeel Butt, Muchun Song,
Andrew Morton, cgroups, linux-mm, linux-kernel, kernel-team
Now that all memcg_stock logic has been moved to page_counter_stock, we
can remove all code related to handling memcg_stock. Note that obj_stock
is untouched and is still needed. FLUSHING_CACHED_CHARGE is preserved
so that it can be used by obj_stock as well.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
mm/memcontrol.c | 183 ------------------------------------------------
1 file changed, 183 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4be1638dde180..7de23ecd7cef6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1989,24 +1989,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
pr_cont(" are going to be killed due to memory.oom.group set\n");
}
-/*
- * The value of NR_MEMCG_STOCK is selected to keep the cached memcgs and their
- * nr_pages in a single cacheline. This may change in future.
- */
-#define NR_MEMCG_STOCK 7
#define FLUSHING_CACHED_CHARGE 0
-struct memcg_stock_pcp {
- local_trylock_t lock;
- uint8_t nr_pages[NR_MEMCG_STOCK];
- struct mem_cgroup *cached[NR_MEMCG_STOCK];
-
- struct work_struct work;
- unsigned long flags;
-};
-
-static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
- .lock = INIT_LOCAL_TRYLOCK(lock),
-};
struct obj_stock_pcp {
local_trylock_t lock;
@@ -2030,47 +2013,6 @@ static void drain_obj_stock(struct obj_stock_pcp *stock);
static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
struct mem_cgroup *root_memcg);
-/**
- * consume_stock: Try to consume stocked charge on this cpu.
- * @memcg: memcg to consume from.
- * @nr_pages: how many pages to charge.
- *
- * Consume the cached charge if enough nr_pages are present otherwise return
- * failure. Also return failure for charge request larger than
- * MEMCG_CHARGE_BATCH or if the local lock is already taken.
- *
- * returns true if successful, false otherwise.
- */
-static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
-{
- struct memcg_stock_pcp *stock;
- uint8_t stock_pages;
- bool ret = false;
- int i;
-
- if (nr_pages > MEMCG_CHARGE_BATCH ||
- !local_trylock(&memcg_stock.lock))
- return ret;
-
- stock = this_cpu_ptr(&memcg_stock);
-
- for (i = 0; i < NR_MEMCG_STOCK; ++i) {
- if (memcg != READ_ONCE(stock->cached[i]))
- continue;
-
- stock_pages = READ_ONCE(stock->nr_pages[i]);
- if (stock_pages >= nr_pages) {
- WRITE_ONCE(stock->nr_pages[i], stock_pages - nr_pages);
- ret = true;
- }
- break;
- }
-
- local_unlock(&memcg_stock.lock);
-
- return ret;
-}
-
static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
page_counter_uncharge(&memcg->memory, nr_pages);
@@ -2078,51 +2020,6 @@ static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
page_counter_uncharge(&memcg->memsw, nr_pages);
}
-/*
- * Returns stocks cached in percpu and reset cached information.
- */
-static void drain_stock(struct memcg_stock_pcp *stock, int i)
-{
- struct mem_cgroup *old = READ_ONCE(stock->cached[i]);
- uint8_t stock_pages;
-
- if (!old)
- return;
-
- stock_pages = READ_ONCE(stock->nr_pages[i]);
- if (stock_pages) {
- memcg_uncharge(old, stock_pages);
- WRITE_ONCE(stock->nr_pages[i], 0);
- }
-
- css_put(&old->css);
- WRITE_ONCE(stock->cached[i], NULL);
-}
-
-static void drain_stock_fully(struct memcg_stock_pcp *stock)
-{
- int i;
-
- for (i = 0; i < NR_MEMCG_STOCK; ++i)
- drain_stock(stock, i);
-}
-
-static void drain_local_memcg_stock(struct work_struct *dummy)
-{
- struct memcg_stock_pcp *stock;
-
- if (WARN_ONCE(!in_task(), "drain in non-task context"))
- return;
-
- local_lock(&memcg_stock.lock);
-
- stock = this_cpu_ptr(&memcg_stock);
- drain_stock_fully(stock);
- clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
-
- local_unlock(&memcg_stock.lock);
-}
-
static void drain_local_obj_stock(struct work_struct *dummy)
{
struct obj_stock_pcp *stock;
@@ -2139,86 +2036,6 @@ static void drain_local_obj_stock(struct work_struct *dummy)
local_unlock(&obj_stock.lock);
}
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
-{
- struct memcg_stock_pcp *stock;
- struct mem_cgroup *cached;
- uint8_t stock_pages;
- bool success = false;
- int empty_slot = -1;
- int i;
-
- /*
- * For now limit MEMCG_CHARGE_BATCH to 127 and less. In future if we
- * decide to increase it more than 127 then we will need more careful
- * handling of nr_pages[] in struct memcg_stock_pcp.
- */
- BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S8_MAX);
-
- VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
-
- if (nr_pages > MEMCG_CHARGE_BATCH ||
- !local_trylock(&memcg_stock.lock)) {
- /*
- * In case of larger than batch refill or unlikely failure to
- * lock the percpu memcg_stock.lock, uncharge memcg directly.
- */
- memcg_uncharge(memcg, nr_pages);
- return;
- }
-
- stock = this_cpu_ptr(&memcg_stock);
- for (i = 0; i < NR_MEMCG_STOCK; ++i) {
- cached = READ_ONCE(stock->cached[i]);
- if (!cached && empty_slot == -1)
- empty_slot = i;
- if (memcg == READ_ONCE(stock->cached[i])) {
- stock_pages = READ_ONCE(stock->nr_pages[i]) + nr_pages;
- WRITE_ONCE(stock->nr_pages[i], stock_pages);
- if (stock_pages > MEMCG_CHARGE_BATCH)
- drain_stock(stock, i);
- success = true;
- break;
- }
- }
-
- if (!success) {
- i = empty_slot;
- if (i == -1) {
- i = get_random_u32_below(NR_MEMCG_STOCK);
- drain_stock(stock, i);
- }
- css_get(&memcg->css);
- WRITE_ONCE(stock->cached[i], memcg);
- WRITE_ONCE(stock->nr_pages[i], nr_pages);
- }
-
- local_unlock(&memcg_stock.lock);
-}
-
-static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
- struct mem_cgroup *root_memcg)
-{
- struct mem_cgroup *memcg;
- bool flush = false;
- int i;
-
- rcu_read_lock();
- for (i = 0; i < NR_MEMCG_STOCK; ++i) {
- memcg = READ_ONCE(stock->cached[i]);
- if (!memcg)
- continue;
-
- if (READ_ONCE(stock->nr_pages[i]) &&
- mem_cgroup_is_descendant(memcg, root_memcg)) {
- flush = true;
- break;
- }
- }
- rcu_read_unlock();
- return flush;
-}
-
static void schedule_drain_work(int cpu, struct work_struct *work)
{
/*
--
2.52.0
^ permalink raw reply [flat|nested] 9+ messages in thread