From: Joshua Hahn <joshua.hahnjy@gmail.com>
To: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	David Hildenbrand <david@kernel.org>,
	Lorenzo Stoakes <lorenzo.stoakes@oracle.com>,
	"Liam R . Howlett" <Liam.Howlett@oracle.com>,
	Vlastimil Babka <vbabka@kernel.org>,
	Mike Rapoport <rppt@kernel.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Michal Hocko <mhocko@suse.com>,
	Roman Gushchin <roman.gushchin@linux.dev>,
	Shakeel Butt <shakeel.butt@linux.dev>,
	Muchun Song <muchun.song@linux.dev>, Tejun Heo <tj@kernel.org>,
	Michal Koutny <mkoutny@suse.com>,
	Axel Rasmussen <axelrasmussen@google.com>,
	Yuanchu Xie <yuanchu@google.com>, Wei Xu <weixugc@google.com>,
	Qi Zheng <zhengqi.arch@bytedance.com>,
	linux-mm@kvack.org, cgroups@vger.kernel.org,
	linux-kernel@vger.kernel.org, kernel-team@meta.com
Subject: [RFC PATCH 5/6] mm/memcontrol, page_counter: Make memory.low tier-aware
Date: Mon, 23 Feb 2026 14:38:28 -0800
Message-ID: <20260223223830.586018-6-joshua.hahnjy@gmail.com>
In-Reply-To: <20260223223830.586018-1-joshua.hahnjy@gmail.com>

On machines serving multiple workloads whose memory is isolated via
the memory cgroup controller, it is currently impossible to enforce a
fair distribution of toptier memory among the workloads: the only
enforceable limits govern total memory footprint, not where that
memory resides.

This makes it difficult to guarantee a consistent performance
baseline, as each workload's performance is heavily influenced by
workload-external factors such as which other workloads are co-located
on the same host, and the order in which the workloads are started.

Extend the existing memory.low protection to be tier-aware in the
charging, enforcement, and protection calculation paths, providing
best-effort protection of a fair proportion of toptier memory for
each workload.
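
To make the enforcement semantics concrete, below is a minimal
userspace model of the tier-aware mem_cgroup_below_low() variant this
patch adds. This is a sketch only: the field names mirror the diff,
but the real check also consults mem_cgroup_unprotected() and reads
through the page_counter hierarchy.

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy model: on a toptier node, a memcg is "below low" while its
 * effective toptier protection (etoptier_low) still covers its
 * toptier usage, so reclaim treats it as soft-protected there.
 */
struct memcg_model {
	unsigned long usage;          /* total pages charged */
	unsigned long elow;           /* effective memory.low */
	unsigned long toptier_usage;  /* pages charged on toptier nodes */
	unsigned long etoptier_low;   /* effective toptier protection */
};

static bool below_low(const struct memcg_model *m, bool toptier)
{
	if (toptier)
		return m->etoptier_low >= m->toptier_usage;
	return m->elow >= m->usage;
}

int main(void)
{
	struct memcg_model m = {
		.usage = 300, .elow = 200,
		.toptier_usage = 100, .etoptier_low = 150,
	};

	/* Over its overall low, but still within its toptier share: */
	printf("global:  %d\n", below_low(&m, false)); /* 0: reclaimable */
	printf("toptier: %d\n", below_low(&m, true));  /* 1: soft-protected */
	return 0;
}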

Updates to protection and charging are performed in the same paths as
their standard memcontrol equivalents. Enforcement of tier-aware
memcg limits, however, is gated behind the tier_aware_memcg sysctl,
so that enabling tier-aware limits at runtime can account for memory
already present in the system.
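
The gate composes with the node's tier exactly as in the shrink_one()
hunk below; a trivial sketch, using the tier_aware_memcg_limits
variable name from the diff (the user-visible knob itself is
introduced in patch 1/6):

#include <stdbool.h>
#include <stdio.h>

static bool tier_aware_memcg_limits; /* runtime knob, default off */

/* Tier-aware low is enforced only when the knob is on AND reclaim is
 * targeting a toptier node; otherwise plain memory.low applies. */
static bool use_toptier_low(bool node_is_toptier)
{
	return tier_aware_memcg_limits && node_is_toptier;
}

int main(void)
{
	printf("%d\n", use_toptier_low(true));  /* 0: knob off */
	tier_aware_memcg_limits = true;
	printf("%d\n", use_toptier_low(true));  /* 1: enforce etoptier_low */
	printf("%d\n", use_toptier_low(false)); /* 0: lower tiers use memory.low */
	return 0;
}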

Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
 include/linux/memcontrol.h   | 15 +++++++++++----
 include/linux/page_counter.h |  7 ++++---
 kernel/cgroup/dmem.c         |  2 +-
 mm/memcontrol.c              | 14 ++++++++++++--
 mm/page_counter.c            | 35 ++++++++++++++++++++++++++++++++++-
 mm/vmscan.c                  | 13 +++++++++----
 6 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 900a36112b62..a998a1e3b8b0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -606,7 +606,9 @@ static inline void mem_cgroup_protection(struct mem_cgroup *root,
 }
 
 void mem_cgroup_calculate_protection(struct mem_cgroup *root,
-				     struct mem_cgroup *memcg);
+				     struct mem_cgroup *memcg, bool toptier);
+
+unsigned long mem_cgroup_toptier_usage(struct mem_cgroup *memcg);
 
 void update_memcg_toptier_capacity(void);
 
@@ -623,11 +625,15 @@ static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
 }
 
 static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
-					struct mem_cgroup *memcg)
+					struct mem_cgroup *memcg, bool toptier)
 {
 	if (mem_cgroup_unprotected(target, memcg))
 		return false;
 
+	if (toptier)
+		return READ_ONCE(memcg->memory.etoptier_low) >=
+				 mem_cgroup_toptier_usage(memcg);
+
 	return READ_ONCE(memcg->memory.elow) >=
 		page_counter_read(&memcg->memory);
 }
@@ -1114,7 +1120,8 @@ static inline void mem_cgroup_protection(struct mem_cgroup *root,
 }
 
 static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
-						   struct mem_cgroup *memcg)
+						   struct mem_cgroup *memcg,
+						   bool toptier)
 {
 }
 
@@ -1128,7 +1135,7 @@ static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
 	return true;
 }
 static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
-					struct mem_cgroup *memcg)
+					struct mem_cgroup *memcg, bool toptier)
 {
 	return false;
 }
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index ada5f1dd75d4..6635ee7b9575 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -120,15 +120,16 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
 #if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
 void page_counter_calculate_protection(struct page_counter *root,
 				       struct page_counter *counter,
-				       bool recursive_protection);
+				       bool recursive_protection, bool toptier);
 void page_counter_update_toptier_capacity(struct page_counter *counter,
 					  const nodemask_t *allowed);
 unsigned long page_counter_toptier_high(struct page_counter *counter);
 unsigned long page_counter_toptier_low(struct page_counter *counter);
 #else
 static inline void page_counter_calculate_protection(struct page_counter *root,
-						     struct page_counter *counter,
-						     bool recursive_protection) {}
+						struct page_counter *counter,
+						bool recursive_protection,
+						bool toptier) {}
 #endif
 
 #endif /* _LINUX_PAGE_COUNTER_H */
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index 1ea6afffa985..536d43c42de8 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -277,7 +277,7 @@ dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
 			continue;
 
 		page_counter_calculate_protection(
-			climit, &found_pool->cnt, true);
+			climit, &found_pool->cnt, true, false);
 
 		if (found_pool == test_pool)
 			break;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 07464f02c529..8aa7ae361a73 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4806,12 +4806,13 @@ struct cgroup_subsys memory_cgrp_subsys = {
  * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
  * @root: the top ancestor of the sub-tree being checked
  * @memcg: the memory cgroup to check
+ * @toptier: whether the caller is in a toptier node
  *
  * WARNING: This function is not stateless! It can only be used as part
  *          of a top-down tree iteration, not for isolated queries.
  */
 void mem_cgroup_calculate_protection(struct mem_cgroup *root,
-				     struct mem_cgroup *memcg)
+				     struct mem_cgroup *memcg, bool toptier)
 {
 	bool recursive_protection =
 		cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT;
@@ -4822,7 +4823,16 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
 	if (!root)
 		root = root_mem_cgroup;
 
-	page_counter_calculate_protection(&root->memory, &memcg->memory, recursive_protection);
+	page_counter_calculate_protection(&root->memory, &memcg->memory,
+					  recursive_protection, toptier);
+}
+
+unsigned long mem_cgroup_toptier_usage(struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled() || !memcg)
+		return 0;
+
+	return atomic_long_read(&memcg->memory.toptier_usage);
 }
 
 void update_memcg_toptier_capacity(void)
diff --git a/mm/page_counter.c b/mm/page_counter.c
index cf21c72bfd4e..79d46a1c4c0c 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -410,12 +410,39 @@ static unsigned long effective_protection(unsigned long usage,
 	return ep;
 }
 
+static void calculate_protection_toptier(struct page_counter *counter,
+					 bool recursive_protection)
+{
+	struct page_counter *parent = counter->parent;
+	unsigned long toptier_low;
+	unsigned long toptier_usage, parent_toptier_usage;
+	unsigned long toptier_protected, old_toptier_protected;
+	long delta;
+
+	toptier_low = page_counter_toptier_low(counter);
+	toptier_usage = atomic_long_read(&counter->toptier_usage);
+	parent_toptier_usage = atomic_long_read(&parent->toptier_usage);
+
+	/* Propagate toptier low usage to parent for sibling distribution */
+	toptier_protected = min(toptier_usage, toptier_low);
+	old_toptier_protected = atomic_long_xchg(&counter->toptier_low_usage,
+						 toptier_protected);
+	delta = toptier_protected - old_toptier_protected;
+	atomic_long_add(delta, &parent->children_toptier_low_usage);
+
+	WRITE_ONCE(counter->etoptier_low,
+		   effective_protection(toptier_usage, parent_toptier_usage,
+		   toptier_low, READ_ONCE(parent->etoptier_low),
+		   atomic_long_read(&parent->children_toptier_low_usage),
+		   recursive_protection));
+}
 
 /**
  * page_counter_calculate_protection - check if memory consumption is in the normal range
  * @root: the top ancestor of the sub-tree being checked
  * @counter: the page_counter to update
  * @recursive_protection: Whether to use memory_recursiveprot behavior.
+ * @toptier: Whether to calculate toptier-proportional protection
  *
  * Calculates elow/emin thresholds for given page_counter.
  *
@@ -424,7 +451,7 @@ static unsigned long effective_protection(unsigned long usage,
  */
 void page_counter_calculate_protection(struct page_counter *root,
 				       struct page_counter *counter,
-				       bool recursive_protection)
+				       bool recursive_protection, bool toptier)
 {
 	unsigned long usage, parent_usage;
 	struct page_counter *parent = counter->parent;
@@ -446,6 +473,9 @@ void page_counter_calculate_protection(struct page_counter *root,
 	if (parent == root) {
 		counter->emin = READ_ONCE(counter->min);
 		counter->elow = READ_ONCE(counter->low);
+		if (toptier)
+			WRITE_ONCE(counter->etoptier_low,
+				   page_counter_toptier_low(counter));
 		return;
 	}
 
@@ -462,6 +492,9 @@ void page_counter_calculate_protection(struct page_counter *root,
 			READ_ONCE(parent->elow),
 			atomic_long_read(&parent->children_low_usage),
 			recursive_protection));
+
+	if (toptier)
+		calculate_protection_toptier(counter, recursive_protection);
 }
 
 void page_counter_update_toptier_capacity(struct page_counter *counter,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6a87ac7be43c..5b4cb030a477 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4144,6 +4144,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 	struct mem_cgroup *memcg;
 	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
 	bool reclaimable = !min_ttl;
+	bool toptier = node_is_toptier(pgdat->node_id);
 
 	VM_WARN_ON_ONCE(!current_is_kswapd());
 
@@ -4153,7 +4154,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
-		mem_cgroup_calculate_protection(NULL, memcg);
+		mem_cgroup_calculate_protection(NULL, memcg, toptier);
 
 		if (!reclaimable)
 			reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
@@ -4905,12 +4906,14 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 	unsigned long reclaimed = sc->nr_reclaimed;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	bool toptier = tier_aware_memcg_limits &&
+		       node_is_toptier(pgdat->node_id);
 
 	/* lru_gen_age_node() called mem_cgroup_calculate_protection() */
 	if (mem_cgroup_below_min(NULL, memcg))
 		return MEMCG_LRU_YOUNG;
 
-	if (mem_cgroup_below_low(NULL, memcg)) {
+	if (mem_cgroup_below_low(NULL, memcg, toptier)) {
 		/* see the comment on MEMCG_NR_GENS */
 		if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
 			return MEMCG_LRU_TAIL;
@@ -5960,6 +5963,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 	};
 	struct mem_cgroup_reclaim_cookie *partial = &reclaim;
 	struct mem_cgroup *memcg;
+	bool toptier = node_is_toptier(pgdat->node_id);
 
 	/*
 	 * In most cases, direct reclaimers can do partial walks
@@ -5987,7 +5991,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 		 */
 		cond_resched();
 
-		mem_cgroup_calculate_protection(target_memcg, memcg);
+		mem_cgroup_calculate_protection(target_memcg, memcg, toptier);
 
 		if (mem_cgroup_below_min(target_memcg, memcg)) {
 			/*
@@ -5995,7 +5999,8 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 			 * If there is no reclaimable memory, OOM.
 			 */
 			continue;
-		} else if (mem_cgroup_below_low(target_memcg, memcg)) {
+		} else if (mem_cgroup_below_low(target_memcg, memcg,
+					tier_aware_memcg_limits && toptier)) {
 			/*
 			 * Soft protection.
 			 * Respect the protection only as long as
-- 
2.47.3
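
For readers following the calculate_protection_toptier() hunk above,
here is a single-threaded toy model of the claim propagation and of
the proportional distribution that effective_protection() falls back
to when siblings collectively claim more than the parent affords.
This is a sketch under simplifying assumptions: the atomics are
collapsed to plain arithmetic, recursive_protection is ignored, and
the hierarchy is reduced to one parent with two children.

#include <stdio.h>

struct counter {
	unsigned long toptier_usage;
	unsigned long toptier_low;
	unsigned long toptier_low_usage;          /* last published claim */
	unsigned long children_toptier_low_usage; /* sum of child claims */
	unsigned long etoptier_low;               /* effective protection */
	struct counter *parent;
};

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* Each child publishes min(usage, low) as its protection claim and
 * keeps the parent's sum of claims in sync, mirroring the
 * atomic_long_xchg()/atomic_long_add() pair in the patch. */
static void publish_claim(struct counter *c)
{
	unsigned long claim = min_ul(c->toptier_usage, c->toptier_low);

	c->parent->children_toptier_low_usage +=
		claim - c->toptier_low_usage;
	c->toptier_low_usage = claim;
}

/* Simplified core of effective_protection(): when siblings claim more
 * than the parent's effective protection, shares are distributed in
 * proportion to each child's own claim. */
static void calc_effective(struct counter *c)
{
	unsigned long claim = c->toptier_low_usage;
	unsigned long avail = c->parent->etoptier_low;
	unsigned long claimed = c->parent->children_toptier_low_usage;

	c->etoptier_low = claimed > avail ?
		claim * avail / claimed : claim;
}

int main(void)
{
	struct counter parent = { .etoptier_low = 100 };
	struct counter a = { .toptier_usage = 80, .toptier_low = 100,
			     .parent = &parent };
	struct counter b = { .toptier_usage = 120, .toptier_low = 50,
			     .parent = &parent };

	publish_claim(&a); /* claims min(80, 100) = 80 */
	publish_claim(&b); /* claims min(120, 50) = 50 */

	calc_effective(&a);
	calc_effective(&b);
	/* 130 claimed against 100 available: 80 -> 61, 50 -> 38 */
	printf("a: %lu, b: %lu\n", a.etoptier_low, b.etoptier_low);
	return 0;
}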



Thread overview: 7+ messages
2026-02-23 22:38 [RFC PATCH 0/6] mm/memcontrol: Make memcg limits tier-aware Joshua Hahn
2026-02-23 22:38 ` [RFC PATCH 1/6] mm/memory-tiers: Introduce tier-aware memcg limit sysfs Joshua Hahn
2026-02-23 22:38 ` [RFC PATCH 2/6] mm/page_counter: Introduce tiered memory awareness to page_counter Joshua Hahn
2026-02-23 22:38 ` [RFC PATCH 3/6] mm/memory-tiers, memcontrol: Introduce toptier capacity updates Joshua Hahn
2026-02-23 22:38 ` [RFC PATCH 4/6] mm/memcontrol: Charge and uncharge from toptier Joshua Hahn
2026-02-23 22:38 ` Joshua Hahn [this message]
2026-02-23 22:38 ` [RFC PATCH 6/6] mm/memcontrol: Make memory.high tier-aware Joshua Hahn
