From: Bing Jiao <bingjiao@google.com>
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org, stable@vger.kernel.org,
akpm@linux-foundation.org, gourry@gourry.net,
longman@redhat.com, hannes@cmpxchg.org, mhocko@kernel.org,
roman.gushchin@linux.dev, shakeel.butt@linux.dev,
muchun.song@linux.dev, tj@kernel.org, mkoutny@suse.com,
david@kernel.org, zhengqi.arch@bytedance.com,
lorenzo.stoakes@oracle.com, axelrasmussen@google.com,
yuanchu@google.com, weixugc@google.com, cgroups@vger.kernel.org,
Bing Jiao <bingjiao@google.com>
Subject: [PATCH v2 1/2] mm/vmscan: respect mems_effective in demote_folio_list()
Date: Sun, 21 Dec 2025 23:36:34 +0000 [thread overview]
Message-ID: <20251221233635.3761887-2-bingjiao@google.com> (raw)
In-Reply-To: <20251221233635.3761887-1-bingjiao@google.com>
Commit 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
introduces the cpuset.mems_effective check and applies it to
can_demote(). However, it does not apply this check in
demote_folio_list().
This omission leads to situations where pages are demoted to nodes
that are explicitly excluded from the task's cpuset.mems.
The impact is two-fold:
1. Resource Isolation: This bug breaks resource isolation provided
by cpuset.mems. It allows pages to be demoted to nodes that are
dedicated to other tasks or are intended for hot-unplugging.
2. Performance Issue: In multi-tier systems, users use cpuset.mems
to bind tasks to memory tiers with different performance (e.g., avoiding
the slowest tiers for latency-sensitive data). This bug can
cause unexpected latency spikes if pages are demoted to the
farthest nodes.
To address the bug, implement a new function
mem_cgroup_filter_mems_allowed() to filter out nodes that are not
set in mems_effective, and update demote_folio_list() to utilize
this filtering logic. This ensures that demotion targets respect
the task's memory placement constraints.
Fixes: 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
Signed-off-by: Bing Jiao <bingjiao@google.com>
---
include/linux/cpuset.h | 6 ++++++
include/linux/memcontrol.h | 7 +++++++
kernel/cgroup/cpuset.c | 18 ++++++++++++++++++
mm/memcontrol.c | 6 ++++++
mm/vmscan.c | 13 ++++++++++---
5 files changed, 47 insertions(+), 3 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a98d3330385c..0e94548e2d24 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -175,6 +175,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
}
extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
+extern void cpuset_node_filter_allowed(struct cgroup *cgroup, nodemask_t *mask);
#else /* !CONFIG_CPUSETS */
static inline bool cpusets_enabled(void) { return false; }
@@ -305,6 +306,11 @@ static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
{
return true;
}
+
+static inline void cpuset_node_filter_allowed(struct cgroup *cgroup,
+ nodemask_t *mask)
+{
+}
#endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fd400082313a..7cfd71c57caa 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1742,6 +1742,8 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
+void mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg, nodemask_t *mask);
+
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg);
static inline bool memcg_is_dying(struct mem_cgroup *memcg)
@@ -1816,6 +1818,11 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
return true;
}
+static inline void mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg,
+ nodemask_t *mask)
+{
+}
+
static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
{
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6e6eb09b8db6..2925bd6bca91 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4452,6 +4452,24 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
return allowed;
}
+void cpuset_node_filter_allowed(struct cgroup *cgroup, nodemask_t *mask)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *cs;
+
+ if (!cpuset_v2())
+ return;
+
+ css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
+ if (!css)
+ return;
+
+ /* Follows the same assumption in cpuset_node_allowed() */
+ cs = container_of(css, struct cpuset, css);
+ nodes_and(*mask, *mask, cs->effective_mems);
+ css_put(css);
+}
+
/**
* cpuset_spread_node() - On which node to begin search for a page
* @rotor: round robin rotor
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 75fc22a33b28..f414653867de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5602,6 +5602,12 @@ bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
}
+void mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg, nodemask_t *mask)
+{
+ if (memcg)
+ cpuset_node_filter_allowed(memcg->css.cgroup, mask);
+}
+
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 453d654727c1..4d23c491e914 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1018,7 +1018,8 @@ static struct folio *alloc_demote_folio(struct folio *src,
* Folios which are not demoted are left on @demote_folios.
*/
static unsigned int demote_folio_list(struct list_head *demote_folios,
- struct pglist_data *pgdat)
+ struct pglist_data *pgdat,
+ struct mem_cgroup *memcg)
{
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
@@ -1032,7 +1033,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
*/
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
__GFP_NOMEMALLOC | GFP_NOWAIT,
- .nid = target_nid,
.nmask = &allowed_mask,
.reason = MR_DEMOTION,
};
@@ -1044,6 +1044,13 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
return 0;
node_get_allowed_targets(pgdat, &allowed_mask);
+ /* Filter the allowed demotion targets by cpuset.mems.effective */
+ mem_cgroup_filter_mems_allowed(memcg, &allowed_mask);
+ if (nodes_empty(allowed_mask))
+ return 0;
+ if (!node_isset(target_nid, allowed_mask))
+ target_nid = node_random(&allowed_mask);
+ mtc.nid = target_nid;
/* Demotion ignores all cpuset and mempolicy settings */
migrate_pages(demote_folios, alloc_demote_folio, NULL,
@@ -1565,7 +1572,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- nr_demoted = demote_folio_list(&demote_folios, pgdat);
+ nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg);
nr_reclaimed += nr_demoted;
stat->nr_demoted += nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */
--
2.52.0.351.gbe84eed79e-goog
next prev parent reply other threads:[~2025-12-21 23:37 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-12-20 6:10 [PATCH] " Bing Jiao
2025-12-20 19:20 ` Andrew Morton
2025-12-22 6:16 ` Bing Jiao
2025-12-21 12:07 ` Gregory Price
2025-12-22 6:28 ` Bing Jiao
2025-12-21 23:36 ` [PATCH v2 0/2] fix demotion targets checks in reclaim/demotion Bing Jiao
2025-12-21 23:36 ` Bing Jiao [this message]
2025-12-22 2:38 ` [PATCH v2 1/2] mm/vmscan: respect mems_effective in demote_folio_list() Chen Ridong
2025-12-22 21:56 ` kernel test robot
2025-12-22 22:18 ` kernel test robot
2025-12-21 23:36 ` [PATCH v2 2/2] mm/vmscan: check all allowed targets in can_demote() Bing Jiao
2025-12-22 2:51 ` Chen Ridong
2025-12-22 6:09 ` Bing Jiao
2025-12-22 8:28 ` Chen Ridong
2025-12-23 21:19 ` [PATCH v3] mm/vmscan: fix demotion targets checks in reclaim/demotion Bing Jiao
2025-12-23 21:38 ` Bing Jiao
2025-12-24 1:19 ` Gregory Price
2025-12-26 18:48 ` Bing Jiao
2025-12-24 1:49 ` Chen Ridong
2025-12-26 18:58 ` Bing Jiao
2025-12-26 19:32 ` Waiman Long
2025-12-26 20:24 ` Waiman Long
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251221233635.3761887-2-bingjiao@google.com \
--to=bingjiao@google.com \
--cc=akpm@linux-foundation.org \
--cc=axelrasmussen@google.com \
--cc=cgroups@vger.kernel.org \
--cc=david@kernel.org \
--cc=gourry@gourry.net \
--cc=hannes@cmpxchg.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=longman@redhat.com \
--cc=lorenzo.stoakes@oracle.com \
--cc=mhocko@kernel.org \
--cc=mkoutny@suse.com \
--cc=muchun.song@linux.dev \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=stable@vger.kernel.org \
--cc=tj@kernel.org \
--cc=weixugc@google.com \
--cc=yuanchu@google.com \
--cc=zhengqi.arch@bytedance.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox