From: Joshua Hahn <joshua.hahnjy@gmail.com>
To: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>,
David Hildenbrand <david@kernel.org>,
Lorenzo Stoakes <lorenzo.stoakes@oracle.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Michal Hocko <mhocko@suse.com>,
Roman Gushchin <roman.gushchin@linux.dev>,
Shakeel Butt <shakeel.butt@linux.dev>,
Muchun Song <muchun.song@linux.dev>,
Qi Zheng <zhengqi.arch@bytedance.com>,
Axel Rasmussen <axelrasmussen@google.com>,
Yuanchu Xie <yuanchu@google.com>, Wei Xu <weixugc@google.com>,
linux-mm@kvack.org, cgroups@vger.kernel.org,
linux-kernel@vger.kernel.org, kernel-team@meta.com
Subject: [RFC PATCH 6/6] mm/memcontrol: Make memory.high tier-aware
Date: Mon, 23 Feb 2026 14:38:29 -0800
Message-ID: <20260223223830.586018-7-joshua.hahnjy@gmail.com>
In-Reply-To: <20260223223830.586018-1-joshua.hahnjy@gmail.com>
On machines serving multiple workloads whose memory is isolated via the
memory cgroup controller, it is currently impossible to enforce a fair
distribution of toptier memory among the workloads: the only
enforceable limits govern total memory footprint, not where that
memory resides.
This makes it difficult to guarantee a consistent performance baseline,
as each workload's performance is heavily impacted by workload-external
factors such as which other workloads are co-located on the same host,
and the order in which the workloads are started.
Extend the existing memory.high protection to be tier-aware in both
charging and enforcement, to limit toptier-hogging by individual
workloads.
Also, add a new nodemask parameter to try_to_free_mem_cgroup_pages(),
which can be used to selectively reclaim memory from the
intersection of a memcg and a set of memory tiers.
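The node-selection logic that this patch repeats in reclaim_high() and
memory_high_write() can be sketched stand-alone as follows. This is an
illustrative userspace model, not kernel code: the function and enum
names here are hypothetical, standing in for the memcg usage/limit
reads and the nodemask choice made before calling
try_to_free_mem_cgroup_pages().

```c
#include <assert.h>
#include <stdbool.h>

/* Where reclaim should be directed, per the patch's decision tree. */
enum reclaim_target {
	RECLAIM_NONE,		/* both limits respected, nothing to do */
	RECLAIM_TOPTIER_ONLY,	/* pass the toptier nodemask to reclaim */
	RECLAIM_ALL_NODES,	/* pass a NULL nodemask (any node) */
};

static enum reclaim_target pick_reclaim_target(unsigned long usage,
					       unsigned long high,
					       unsigned long toptier_usage,
					       unsigned long toptier_high,
					       bool tier_aware)
{
	bool mem_high_ok = usage <= high;
	bool toptier_high_ok = !(tier_aware &&
				 toptier_usage > toptier_high);

	if (mem_high_ok && toptier_high_ok)
		return RECLAIM_NONE;
	/* Only the toptier budget is exceeded: reclaim just there. */
	if (mem_high_ok && !toptier_high_ok)
		return RECLAIM_TOPTIER_ONLY;
	/* Overall memory.high is breached: reclaim from any node. */
	return RECLAIM_ALL_NODES;
}
```

Note that when memory.high itself is breached, reclaim is unrestricted
even if the toptier limit is also exceeded; the toptier nodemask is
only used in the "toptier-only overage" case.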
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
include/linux/swap.h | 3 +-
mm/memcontrol-v1.c | 6 ++--
mm/memcontrol.c | 85 +++++++++++++++++++++++++++++++++++++-------
mm/vmscan.c | 11 +++---
4 files changed, 84 insertions(+), 21 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0effe3cc50f5..c6037ac7bf6e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -368,7 +368,8 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
unsigned int reclaim_options,
- int *swappiness);
+ int *swappiness,
+ nodemask_t *allowed);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
pg_data_t *pgdat,
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 0b39ba608109..29630c7f3567 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1497,7 +1497,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
}
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
+ memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP,
+ NULL, NULL)) {
ret = -EBUSY;
break;
}
@@ -1529,7 +1530,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- MEMCG_RECLAIM_MAY_SWAP, NULL))
+ MEMCG_RECLAIM_MAY_SWAP,
+ NULL, NULL))
nr_retries--;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8aa7ae361a73..ebd4a1b73c51 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2184,18 +2184,30 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
do {
unsigned long pflags;
-
- if (page_counter_read(&memcg->memory) <=
- READ_ONCE(memcg->memory.high))
+ nodemask_t toptier_nodes, *reclaim_nodes;
+ bool mem_high_ok, toptier_high_ok;
+
+ mt_get_toptier_nodemask(&toptier_nodes, NULL);
+ mem_high_ok = page_counter_read(&memcg->memory) <=
+ READ_ONCE(memcg->memory.high);
+ toptier_high_ok = !(tier_aware_memcg_limits &&
+ mem_cgroup_toptier_usage(memcg) >
+ page_counter_toptier_high(&memcg->memory));
+ if (mem_high_ok && toptier_high_ok)
continue;
+ if (mem_high_ok && !toptier_high_ok)
+ reclaim_nodes = &toptier_nodes;
+ else
+ reclaim_nodes = NULL;
+
memcg_memory_event(memcg, MEMCG_HIGH);
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask,
MEMCG_RECLAIM_MAY_SWAP,
- NULL);
+ NULL, reclaim_nodes);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@@ -2296,6 +2308,24 @@ static u64 mem_find_max_overage(struct mem_cgroup *memcg)
return max_overage;
}
+static u64 toptier_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
+
+ if (!tier_aware_memcg_limits)
+ return 0;
+
+ do {
+ unsigned long usage = mem_cgroup_toptier_usage(memcg);
+ unsigned long high = page_counter_toptier_high(&memcg->memory);
+
+ overage = calculate_overage(usage, high);
+ max_overage = max(overage, max_overage);
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ return max_overage;
+}
static u64 swap_find_max_overage(struct mem_cgroup *memcg)
{
u64 overage, max_overage = 0;
@@ -2401,6 +2431,14 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
penalty_jiffies += calculate_high_delay(memcg, nr_pages,
swap_find_max_overage(memcg));
+ /*
+ * Don't double-penalize for toptier high overage if system-wide
+ * memory.high has already been breached.
+ */
+ if (!penalty_jiffies)
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
+ toptier_find_max_overage(memcg));
+
/*
* Clamp the max delay per usermode return so as to still keep the
* application moving forwards and also permit diagnostics, albeit
@@ -2503,7 +2541,8 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
- gfp_mask, reclaim_options, NULL);
+ gfp_mask, reclaim_options,
+ NULL, NULL);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -2592,23 +2631,26 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
* reclaim, the cost of mismatch is negligible.
*/
do {
- bool mem_high, swap_high;
+ bool mem_high, swap_high, toptier_high = false;
mem_high = page_counter_read(&memcg->memory) >
READ_ONCE(memcg->memory.high);
swap_high = page_counter_read(&memcg->swap) >
READ_ONCE(memcg->swap.high);
+ toptier_high = tier_aware_memcg_limits &&
+ (mem_cgroup_toptier_usage(memcg) >
+ page_counter_toptier_high(&memcg->memory));
/* Don't bother a random interrupted task */
if (!in_task()) {
- if (mem_high) {
+ if (mem_high || toptier_high) {
schedule_work(&memcg->high_work);
break;
}
continue;
}
- if (mem_high || swap_high) {
+ if (mem_high || swap_high || toptier_high) {
/*
* The allocating tasks in this cgroup will need to do
* reclaim or be throttled to prevent further growth
@@ -4476,7 +4518,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
bool drained = false;
- unsigned long high;
+ unsigned long high, toptier_high;
int err;
buf = strstrip(buf);
@@ -4485,15 +4527,22 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
return err;
page_counter_set_high(&memcg->memory, high);
+ toptier_high = page_counter_toptier_high(&memcg->memory);
if (of->file->f_flags & O_NONBLOCK)
goto out;
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
+ unsigned long toptier_pages = mem_cgroup_toptier_usage(memcg);
unsigned long reclaimed;
+ unsigned long to_free;
+ nodemask_t toptier_nodes, *reclaim_nodes;
+ bool mem_high_ok = nr_pages <= high;
+ bool toptier_high_ok = !(tier_aware_memcg_limits &&
+ toptier_pages > toptier_high);
- if (nr_pages <= high)
+ if (mem_high_ok && toptier_high_ok)
break;
if (signal_pending(current))
@@ -4505,8 +4554,17 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
continue;
}
- reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
+ mt_get_toptier_nodemask(&toptier_nodes, NULL);
+ if (mem_high_ok && !toptier_high_ok) {
+ reclaim_nodes = &toptier_nodes;
+ to_free = toptier_pages - toptier_high;
+ } else {
+ reclaim_nodes = NULL;
+ to_free = nr_pages - high;
+ }
+ reclaimed = try_to_free_mem_cgroup_pages(memcg, to_free,
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
+ NULL, reclaim_nodes);
if (!reclaimed && !nr_retries--)
break;
@@ -4558,7 +4616,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
+ NULL, NULL))
nr_reclaims--;
continue;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5b4cb030a477..94498734b4f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6652,7 +6652,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
unsigned int reclaim_options,
- int *swappiness)
+ int *swappiness, nodemask_t *allowed)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
@@ -6668,6 +6668,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+ .nodemask = allowed,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
@@ -6693,7 +6694,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
unsigned int reclaim_options,
- int *swappiness)
+ int *swappiness, nodemask_t *allowed)
{
return 0;
}
@@ -7806,9 +7807,9 @@ int user_proactive_reclaim(char *buf,
reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
MEMCG_RECLAIM_PROACTIVE;
reclaimed = try_to_free_mem_cgroup_pages(memcg,
- batch_size, gfp_mask,
- reclaim_options,
- swappiness == -1 ? NULL : &swappiness);
+ batch_size, gfp_mask, reclaim_options,
+ swappiness == -1 ? NULL : &swappiness,
+ NULL);
} else {
struct scan_control sc = {
.gfp_mask = current_gfp_context(gfp_mask),
--
2.47.3
Thread overview: 7+ messages
2026-02-23 22:38 [RFC PATCH 0/6] mm/memcontrol: Make memcg limits tier-aware Joshua Hahn
2026-02-23 22:38 ` [RFC PATCH 1/6] mm/memory-tiers: Introduce tier-aware memcg limit sysfs Joshua Hahn
2026-02-23 22:38 ` [RFC PATCH 2/6] mm/page_counter: Introduce tiered memory awareness to page_counter Joshua Hahn
2026-02-23 22:38 ` [RFC PATCH 3/6] mm/memory-tiers, memcontrol: Introduce toptier capacity updates Joshua Hahn
2026-02-23 22:38 ` [RFC PATCH 4/6] mm/memcontrol: Charge and uncharge from toptier Joshua Hahn
2026-02-23 22:38 ` [RFC PATCH 5/6] mm/memcontrol, page_counter: Make memory.low tier-aware Joshua Hahn
2026-02-23 22:38 ` [RFC PATCH 6/6] mm/memcontrol: Make memory.high tier-aware Joshua Hahn [this message]