linux-mm.kvack.org archive mirror
* [PATCH] mm/vmscan: respect mems_effective in demote_folio_list()
@ 2025-12-20  6:10 Bing Jiao
  2025-12-20 19:20 ` Andrew Morton
                   ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: Bing Jiao @ 2025-12-20  6:10 UTC (permalink / raw)
  To: linux-mm
  Cc: gourry, Bing Jiao, Waiman Long, Johannes Weiner, Michal Hocko,
	Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton,
	David Hildenbrand, Lorenzo Stoakes, Liam R. Howlett,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Tejun Heo,
	Michal Koutný,
	Qi Zheng, Axel Rasmussen, Yuanchu Xie, Wei Xu, cgroups,
	linux-kernel

Commit 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
introduced the cpuset.mems_effective check and applied it in
can_demote(). However, it did not apply the check in
demote_folio_list(), so pages can still be demoted to nodes that are
explicitly excluded from the task's cpuset.mems. For example, on a
hypothetical system where DRAM node 0 demotes to CXL nodes 2-3 and
cpuset.mems is set to 0-2, demote_folio_list() may still pick node 3
as the demotion target.

To make demotion targets respect cpuset.mems_effective in
demote_folio_list(), implement a new function, get_demotion_targets(),
which returns a preferred demotion target together with the set of all
allowed (fallback) nodes, both filtered against mems_effective, and
convert demote_folio_list() and can_demote() to use it.

Furthermore, update some supporting functions:
  - Add a parameter to next_demotion_node() that returns a copy of
    node_demotion[]->preferred, allowing get_demotion_targets() to
    select the next-best node for demotion.
  - Change the parameter of cpuset_node_allowed() and
    mem_cgroup_node_allowed() from an int nid to a nodemask_t *, so
    the mask can be ANDed directly with mems_effective.

Signed-off-by: Bing Jiao <bingjiao@google.com>
---
 include/linux/cpuset.h       |  5 +--
 include/linux/memcontrol.h   |  6 +--
 include/linux/memory-tiers.h |  6 +--
 kernel/cgroup/cpuset.c       | 16 +++----
 mm/memcontrol.c              |  5 ++-
 mm/memory-tiers.c            |  8 +++-
 mm/vmscan.c                  | 77 +++++++++++++++++++++++++++++-------
 7 files changed, 88 insertions(+), 35 deletions(-)
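
Illustration (not part of the patch): the standalone program below is
a minimal userspace model of the selection order get_demotion_targets()
implements, namely try the preferred node first, then the allowed
subset of its tier, then hop to the next tier, and finally fall back
to any allowed node. The topology (node 0 demotes to nodes 2-3), the
flat unsigned long node masks, and the simplified helpers are all
hypothetical stand-ins for the kernel's node_demotion[] table,
nodemask_t, and node_random().

/*
 * Minimal userspace model of the get_demotion_targets() selection
 * order. Not kernel code: the topology (node 0 demotes to nodes 2-3,
 * which are terminal), the flat bitmask node representation, and the
 * simplified helpers are hypothetical.
 */
#include <stdio.h>

#define NUMA_NO_NODE	-1
#define MAX_NODES	8

/* Stand-in for node_demotion[]->preferred: node 0 prefers nodes 2-3. */
static const unsigned long preferred[MAX_NODES] = {
	[0] = (1UL << 2) | (1UL << 3),
};

static int first_node(unsigned long mask)
{
	for (int n = 0; n < MAX_NODES; n++)
		if (mask & (1UL << n))
			return n;
	return NUMA_NO_NODE;
}

/* Models next_demotion_node(node, &mask): copies out the preferred set. */
static int next_demotion_node(int node, unsigned long *mask)
{
	*mask = preferred[node];
	/* The kernel picks randomly; the first set node is enough here. */
	return first_node(*mask);
}

/* @allowed models node_get_allowed_targets() ANDed with mems_effective. */
static int get_demotion_targets(unsigned long *targets, int node,
				unsigned long allowed)
{
	unsigned long pref_mask;
	int pref = next_demotion_node(node, &pref_mask);

	if (pref == NUMA_NO_NODE || !allowed)
		return NUMA_NO_NODE;
	*targets = allowed;

	while (pref != NUMA_NO_NODE) {
		/* Preferred node itself is allowed: use it. */
		if (allowed & (1UL << pref))
			return pref;
		/* Otherwise, any allowed node in the same preferred tier. */
		if (pref_mask & allowed)
			return first_node(pref_mask & allowed);
		/* Hop to the next tier of preferred nodes. */
		pref = next_demotion_node(pref, &pref_mask);
	}
	/* Models the WARN_ON_ONCE() fallback: any allowed node. */
	return first_node(allowed);
}

int main(void)
{
	/* cpuset.mems_effective excludes node 2: only node 3 is allowed. */
	unsigned long allowed = 1UL << 3;
	unsigned long targets;
	int nid = get_demotion_targets(&targets, 0, allowed);

	printf("target %d, fallback mask 0x%lx\n", nid, targets);
	return 0;
}

With the cpuset excluding node 2, this prints "target 3, fallback mask
0x8": the preferred node is filtered out and the same-tier fallback is
chosen instead. Without this patch, demote_folio_list() could pick
node 2 as target_nid even though the cpuset excludes it.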

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a98d3330385c..27a0b6e9fb9d 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -174,7 +174,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 	task_unlock(current);
 }

-extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
+extern void cpuset_node_allowed(struct cgroup *cgroup, nodemask_t *nodes);
 #else /* !CONFIG_CPUSETS */

 static inline bool cpusets_enabled(void) { return false; }
@@ -301,9 +301,8 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
 	return false;
 }

-static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+static inline void cpuset_node_allowed(struct cgroup *cgroup, nodemask_t *nodes)
 {
-	return true;
 }
 #endif /* !CONFIG_CPUSETS */

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fd400082313a..a87f008b6600 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1740,7 +1740,7 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
 	rcu_read_unlock();
 }

-bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
+void mem_cgroup_node_allowed(struct mem_cgroup *memcg, nodemask_t *nodes);

 void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg);

@@ -1811,9 +1811,9 @@ static inline ino_t page_cgroup_ino(struct page *page)
 	return 0;
 }

-static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+static inline void mem_cgroup_node_allowed(struct mem_cgroup *memcg,
+					   nodemask_t *nodes)
 {
-	return true;
 }

 static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 7a805796fcfd..2706ebfa94b5 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -53,11 +53,11 @@ struct memory_dev_type *mt_find_alloc_memory_type(int adist,
 						  struct list_head *memory_types);
 void mt_put_memory_types(struct list_head *memory_types);
 #ifdef CONFIG_MIGRATION
-int next_demotion_node(int node);
+int next_demotion_node(int node, nodemask_t *mask);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
 bool node_is_toptier(int node);
 #else
-static inline int next_demotion_node(int node)
+static inline int next_demotion_node(int node, nodemask_t *mask)
 {
 	return NUMA_NO_NODE;
 }
@@ -101,7 +101,7 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt

 }

-static inline int next_demotion_node(int node)
+static inline int next_demotion_node(int node, nodemask_t *mask)
 {
 	return NUMA_NO_NODE;
 }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6e6eb09b8db6..2d78cfde5911 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4416,11 +4416,10 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
 	return allowed;
 }

-bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+void cpuset_node_allowed(struct cgroup *cgroup, nodemask_t *nodes)
 {
 	struct cgroup_subsys_state *css;
 	struct cpuset *cs;
-	bool allowed;

 	/*
 	 * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy
@@ -4428,16 +4427,16 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
-	 * so return true to avoid taking a global lock on the empty check.
+	 * so return early to avoid taking a global lock on the empty check.
 	 */
 	if (!cpuset_v2())
-		return true;
+		return;

 	css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
 	if (!css)
-		return true;
+		return;

 	/*
 	 * Normally, accessing effective_mems would require the cpuset_mutex
-	 * or callback_lock - but node_isset is atomic and the reference
-	 * taken via cgroup_get_e_css is sufficient to protect css.
+	 * or callback_lock - but the reference taken via cgroup_get_e_css
+	 * is sufficient to protect css.
 	 *
 	 * Since this interface is intended for use by migration paths, we
 	 * relax locking here to avoid taking global locks - while accepting
@@ -4447,9 +4446,8 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
 	 * cannot make strong isolation guarantees, so this is acceptable.
 	 */
 	cs = container_of(css, struct cpuset, css);
-	allowed = node_isset(nid, cs->effective_mems);
+	nodes_and(*nodes, *nodes, cs->effective_mems);
 	css_put(css);
-	return allowed;
 }

 /**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 75fc22a33b28..a62c75b136ef 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5597,9 +5597,10 @@ subsys_initcall(mem_cgroup_swap_init);

 #endif /* CONFIG_SWAP */

-bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+void mem_cgroup_node_allowed(struct mem_cgroup *memcg, nodemask_t *nodes)
 {
-	return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+	if (memcg)
+		cpuset_node_allowed(memcg->css.cgroup, nodes);
 }

 void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 20aab9c19c5e..ed0ee9c3ae70 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -320,13 +320,14 @@ void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
 /**
  * next_demotion_node() - Get the next node in the demotion path
  * @node: The starting node to lookup the next node
+ * @mask: If non-NULL, receives a copy of the preferred demotion nodemask
  *
  * Return: node id for next memory node in the demotion path hierarchy
  * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
  * @node online or guarantee that it *continues* to be the next demotion
  * target.
  */
-int next_demotion_node(int node)
+int next_demotion_node(int node, nodemask_t *mask)
 {
 	struct demotion_nodes *nd;
 	int target;
@@ -355,7 +356,12 @@ int next_demotion_node(int node)
 	 * last target node. Or introducing per-cpu data to avoid
 	 * caching issue, which seems more complicated. So selecting
 	 * target node randomly seems better until now.
+	 *
+	 * Copy the preferred nodes as a fallback in case the returned
+	 * node does not satisfy constraints such as cpuset.
 	 */
+	if (mask)
+		nodes_copy(*mask, nd->preferred);
 	target = node_random(&nd->preferred);
 	rcu_read_unlock();

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8bdb1629b6eb..2ddbf5584af8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -341,22 +341,71 @@ static void flush_reclaim_state(struct scan_control *sc)
 	}
 }

+/*
+ * Returns a preferred demotion node and fills @targets with all allowed
+ * demotion nodes; returns NUMA_NO_NODE (@targets is undefined) if none.
+ */
+static int get_demotion_targets(nodemask_t *targets, struct pglist_data *pgdat,
+				struct mem_cgroup *memcg)
+{
+	nodemask_t allowed_mask;
+	nodemask_t preferred_mask;
+	int preferred_node;
+
+	if (!pgdat)
+		return NUMA_NO_NODE;
+
+	preferred_node = next_demotion_node(pgdat->node_id, &preferred_mask);
+	if (preferred_node == NUMA_NO_NODE)
+		return NUMA_NO_NODE;
+
+	node_get_allowed_targets(pgdat, &allowed_mask);
+	mem_cgroup_node_allowed(memcg, &allowed_mask);
+	if (nodes_empty(allowed_mask))
+		return NUMA_NO_NODE;
+
+	if (targets)
+		nodes_copy(*targets, allowed_mask);
+
+	do {
+		if (node_isset(preferred_node, allowed_mask))
+			return preferred_node;
+
+		nodes_and(preferred_mask, preferred_mask, allowed_mask);
+		if (!nodes_empty(preferred_mask))
+			return node_random(&preferred_mask);
+
+		/*
+		 * Hop to the next tier of preferred nodes. Even if
+		 * preferred_node is not set in allowed_mask, it can still be
+		 * used to look up the next-best demotion nodes.
+		 */
+		preferred_node = next_demotion_node(preferred_node,
+						    &preferred_mask);
+	} while (preferred_node != NUMA_NO_NODE);
+
+	/*
+	 * Should not be reached: a non-empty allowed_mask guarantees
+	 * there is a target node for demotion. Reaching this point
+	 * suggests an inconsistency in node_demotion[]->preferred,
+	 * where same-tier nodes have different preferred targets,
+	 * e.g. node 0 lists both nodes 2 and 3 as preferred targets,
+	 * but nodes 2 and 3 themselves have different preferred nodes.
+	 */
+	WARN_ON_ONCE(1);
+	return node_random(&allowed_mask);
+}
+
 static bool can_demote(int nid, struct scan_control *sc,
 		       struct mem_cgroup *memcg)
 {
-	int demotion_nid;
-
 	if (!numa_demotion_enabled)
 		return false;
 	if (sc && sc->no_demotion)
 		return false;

-	demotion_nid = next_demotion_node(nid);
-	if (demotion_nid == NUMA_NO_NODE)
-		return false;
-
-	/* If demotion node isn't in the cgroup's mems_allowed, fall back */
-	return mem_cgroup_node_allowed(memcg, demotion_nid);
+	return get_demotion_targets(NULL, NODE_DATA(nid), memcg) !=
+	       NUMA_NO_NODE;
 }

 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -1019,9 +1068,10 @@ static struct folio *alloc_demote_folio(struct folio *src,
  * Folios which are not demoted are left on @demote_folios.
  */
 static unsigned int demote_folio_list(struct list_head *demote_folios,
-				     struct pglist_data *pgdat)
+				      struct pglist_data *pgdat,
+				      struct mem_cgroup *memcg)
 {
-	int target_nid = next_demotion_node(pgdat->node_id);
+	int target_nid;
 	unsigned int nr_succeeded;
 	nodemask_t allowed_mask;

@@ -1033,7 +1083,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
 		 */
 		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
 			__GFP_NOMEMALLOC | GFP_NOWAIT,
-		.nid = target_nid,
 		.nmask = &allowed_mask,
 		.reason = MR_DEMOTION,
 	};
@@ -1041,10 +1090,10 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
 	if (list_empty(demote_folios))
 		return 0;

+	target_nid = get_demotion_targets(&allowed_mask, pgdat, memcg);
 	if (target_nid == NUMA_NO_NODE)
 		return 0;
-
-	node_get_allowed_targets(pgdat, &allowed_mask);
+	mtc.nid = target_nid;

 	/* Demotion ignores all cpuset and mempolicy settings */
 	migrate_pages(demote_folios, alloc_demote_folio, NULL,
@@ -1566,7 +1615,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 	/* 'folio_list' is always empty here */

 	/* Migrate folios selected for demotion */
-	nr_demoted = demote_folio_list(&demote_folios, pgdat);
+	nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg);
 	nr_reclaimed += nr_demoted;
 	stat->nr_demoted += nr_demoted;
 	/* Folios that could not be demoted are still in @demote_folios */
--
2.52.0.351.gbe84eed79e-goog




Thread overview: 22+ messages
2025-12-20  6:10 [PATCH] mm/vmscan: respect mems_effective in demote_folio_list() Bing Jiao
2025-12-20 19:20 ` Andrew Morton
2025-12-22  6:16   ` Bing Jiao
2025-12-21 12:07 ` Gregory Price
2025-12-22  6:28   ` Bing Jiao
2025-12-21 23:36 ` [PATCH v2 0/2] fix demotion targets checks in reclaim/demotion Bing Jiao
2025-12-21 23:36   ` [PATCH v2 1/2] mm/vmscan: respect mems_effective in demote_folio_list() Bing Jiao
2025-12-22  2:38     ` Chen Ridong
2025-12-22 21:56     ` kernel test robot
2025-12-22 22:18     ` kernel test robot
2025-12-21 23:36   ` [PATCH v2 2/2] mm/vmscan: check all allowed targets in can_demote() Bing Jiao
2025-12-22  2:51     ` Chen Ridong
2025-12-22  6:09       ` Bing Jiao
2025-12-22  8:28         ` Chen Ridong
2025-12-23 21:19   ` [PATCH v3] mm/vmscan: fix demotion targets checks in reclaim/demotion Bing Jiao
2025-12-23 21:38     ` Bing Jiao
2025-12-24  1:19     ` Gregory Price
2025-12-26 18:48       ` Bing Jiao
2025-12-24  1:49     ` Chen Ridong
2025-12-26 18:58       ` Bing Jiao
2025-12-26 19:32     ` Waiman Long
2025-12-26 20:24     ` Waiman Long
