linux-mm.kvack.org archive mirror
* [PATCH v4 0/2] vmscan: enforce mems_effective during demotion
@ 2025-04-22  1:26 Gregory Price
  2025-04-22  1:26 ` [PATCH v4 1/2] cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed Gregory Price
  2025-04-22  1:26 ` [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
  0 siblings, 2 replies; 15+ messages in thread
From: Gregory Price @ 2025-04-22  1:26 UTC (permalink / raw)
  To: linux-mm
  Cc: cgroups, linux-kernel, kernel-team, longman, hannes, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

Change reclaim to respect cpuset.mems_effective during demotion when
possible. Presently, reclaim explicitly ignores cpuset.mems_effective
when demoting, which may cause the cpuset settings to be violated.

Implement cpuset_node_allowed() to check the cpuset.mems_effective
associated with the mem_cgroup of the lruvec being scanned. This only
applies to cgroup/cpuset v2, as cpuset exists in a different hierarchy
than mem_cgroup in v1.

This requires renaming the existing cpuset_node_allowed() to be
cpuset_current_node_allowed() - which is more descriptive anyway - to
implement the new cpuset_node_allowed() which takes a target cgroup.
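
For reference, a simplified sketch of the check this series adds to the
demotion path, lifted from patch 2/2 with the surrounding reclaim
plumbing elided:

	static bool can_demote(int nid, struct scan_control *sc,
			       struct mem_cgroup *memcg)
	{
		int demotion_nid;

		if (!numa_demotion_enabled)
			return false;
		if (sc && sc->no_demotion)
			return false;

		demotion_nid = next_demotion_node(nid);
		if (demotion_nid == NUMA_NO_NODE)
			return false;

		/* If demotion node isn't in the cgroup's mems_allowed, fall back */
		return mem_cgroup_node_allowed(memcg, demotion_nid);
	}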

v4:
- explicitly check for cpuset v1 instead of checking for empty effective_mems.
  this was the case anyway since cpuset and mem_cgroup are in different
  hierarchies in v1.
- update docs to reflect this
- rcu_read_lock instead of taking a global lock.
- cpuset header fixup when compiled out.

Gregory Price (2):
  cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed
  vmscan,cgroup: apply mems_effective to reclaim

 .../ABI/testing/sysfs-kernel-mm-numa          | 16 +++++---
 include/linux/cpuset.h                        |  9 +++-
 include/linux/memcontrol.h                    |  6 +++
 kernel/cgroup/cpuset.c                        | 30 +++++++++++++-
 mm/memcontrol.c                               |  6 +++
 mm/page_alloc.c                               |  4 +-
 mm/vmscan.c                                   | 41 +++++++++++--------
 7 files changed, 84 insertions(+), 28 deletions(-)

-- 
2.49.0



* [PATCH v4 1/2] cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed
  2025-04-22  1:26 [PATCH v4 0/2] vmscan: enforce mems_effective during demotion Gregory Price
@ 2025-04-22  1:26 ` Gregory Price
  2025-04-22 17:37   ` Johannes Weiner
  2025-04-22  1:26 ` [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
  1 sibling, 1 reply; 15+ messages in thread
From: Gregory Price @ 2025-04-22  1:26 UTC (permalink / raw)
  To: linux-mm
  Cc: cgroups, linux-kernel, kernel-team, longman, hannes, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

Rename cpuset_node_allowed to reflect that the function checks the
current task's cpuset.mems.  This allows us to make a new
cpuset_node_allowed function that checks a target cgroup's cpuset.mems.

Acked-by: Waiman Long <longman@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Gregory Price <gourry@gourry.net>
---
 include/linux/cpuset.h | 4 ++--
 kernel/cgroup/cpuset.c | 4 ++--
 mm/page_alloc.c        | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 835e7b793f6a..893a4c340d48 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -82,11 +82,11 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
-extern bool cpuset_node_allowed(int node, gfp_t gfp_mask);
+extern bool cpuset_current_node_allowed(int node, gfp_t gfp_mask);
 
 static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+	return cpuset_current_node_allowed(zone_to_nid(z), gfp_mask);
 }
 
 static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f910c828973..f8e6a9b642cb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4090,7 +4090,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 }
 
 /*
- * cpuset_node_allowed - Can we allocate on a memory node?
+ * cpuset_current_node_allowed - Can current task allocate on a memory node?
  * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
@@ -4129,7 +4129,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  */
-bool cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
 {
 	struct cpuset *cs;		/* current cpuset ancestors */
 	bool allowed;			/* is allocation in zone z allowed? */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5079b1b04d49..233ce25f8f3d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3461,7 +3461,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 retry:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
-	 * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
+	 * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c.
 	 */
 	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
 	z = ac->preferred_zoneref;
@@ -4148,7 +4148,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
 		/*
 		 * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
 		 * GFP_ATOMIC) rather than fail, see the comment for
-		 * cpuset_node_allowed().
+		 * cpuset_current_node_allowed().
 		 */
 		if (alloc_flags & ALLOC_MIN_RESERVE)
 			alloc_flags &= ~ALLOC_CPUSET;
-- 
2.49.0




* [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim
  2025-04-22  1:26 [PATCH v4 0/2] vmscan: enforce mems_effective during demotion Gregory Price
  2025-04-22  1:26 ` [PATCH v4 1/2] cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed Gregory Price
@ 2025-04-22  1:26 ` Gregory Price
  2025-04-22  2:02   ` Waiman Long
                     ` (4 more replies)
  1 sibling, 5 replies; 15+ messages in thread
From: Gregory Price @ 2025-04-22  1:26 UTC (permalink / raw)
  To: linux-mm
  Cc: cgroups, linux-kernel, kernel-team, longman, hannes, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

It is possible for a reclaimer to cause demotions of an lruvec belonging
to a cgroup with cpuset.mems set to exclude some nodes. Attempt to apply
this limitation based on the lruvec's memcg and prevent demotion.

Notably, this may still allow demotion of shared libraries or any memory
first instantiated in another cgroup. This means cpusets still cannot
guarantee complete isolation when demotion is enabled, and the
docs have been updated to reflect this.

This is useful for isolating workloads on a multi-tenant system from
certain classes of memory more consistently - with the noted exceptions.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Gregory Price <gourry@gourry.net>
---
 .../ABI/testing/sysfs-kernel-mm-numa          | 16 +++++---
 include/linux/cpuset.h                        |  5 +++
 include/linux/memcontrol.h                    |  6 +++
 kernel/cgroup/cpuset.c                        | 26 ++++++++++++
 mm/memcontrol.c                               |  6 +++
 mm/vmscan.c                                   | 41 +++++++++++--------
 6 files changed, 78 insertions(+), 22 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa
index 77e559d4ed80..90e375ff54cb 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-numa
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-numa
@@ -16,9 +16,13 @@ Description:	Enable/disable demoting pages during reclaim
 		Allowing page migration during reclaim enables these
 		systems to migrate pages from fast tiers to slow tiers
 		when the fast tier is under pressure.  This migration
-		is performed before swap.  It may move data to a NUMA
-		node that does not fall into the cpuset of the
-		allocating process which might be construed to violate
-		the guarantees of cpusets.  This should not be enabled
-		on systems which need strict cpuset location
-		guarantees.
+		is performed before swap if an eligible NUMA node is
+		present in cpuset.mems for the cgroup (or if cpuset v1
+		is being used). If cpuset.mems changes at runtime, it
+		may move data to a NUMA node that does not fall into
+		the new cpuset.mems, which might be construed
+		to violate the guarantees of cpusets.  Shared memory,
+		such as libraries, owned by another cgroup may still be
+		demoted and result in memory use on a node not present
+		in cpuset.mems. This should not be enabled on systems
+		which need strict cpuset location guarantees.
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 893a4c340d48..5255e3fdbf62 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -171,6 +171,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 	task_unlock(current);
 }
 
+extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
@@ -282,6 +283,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
 	return false;
 }
 
+static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+	return true;
+}
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 53364526d877..a6c4e3faf721 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1736,6 +1736,8 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
 	rcu_read_unlock();
 }
 
+bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
+
 #else
 static inline bool mem_cgroup_kmem_disabled(void)
 {
@@ -1793,6 +1795,10 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
 {
 }
 
+static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+	return true;
+}
 #endif /* CONFIG_MEMCG */
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f8e6a9b642cb..c52348bfd5db 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4163,6 +4163,32 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
 	return allowed;
 }
 
+bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+	struct cgroup_subsys_state *css;
+	struct cpuset *cs;
+	bool allowed;
+
+	/*
+	 * In v1, mem_cgroup and cpuset are unlikely to be in the same hierarchy
+	 * and mems_allowed is likely to be empty even if we could get to it,
+	 * so return true to avoid taking a global lock on the empty check.
+	 */
+	if (!cpuset_v2())
+		return true;
+
+	css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
+	if (!css)
+		return true;
+
+	cs = container_of(css, struct cpuset, css);
+	rcu_read_lock();
+	allowed = node_isset(nid, cs->effective_mems);
+	rcu_read_unlock();
+	css_put(css);
+	return allowed;
+}
+
 /**
  * cpuset_spread_node() - On which node to begin search for a page
  * @rotor: round robin rotor
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 40c07b8699ae..2f61d0060fd1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
+#include <linux/cpuset.h>
 #include <linux/sched/mm.h>
 #include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
@@ -5437,3 +5438,8 @@ static int __init mem_cgroup_swap_init(void)
 subsys_initcall(mem_cgroup_swap_init);
 
 #endif /* CONFIG_SWAP */
+
+bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+	return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2b2ab386cab5..32a7ce421e42 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -342,16 +342,22 @@ static void flush_reclaim_state(struct scan_control *sc)
 	}
 }
 
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+		       struct mem_cgroup *memcg)
 {
+	int demotion_nid;
+
 	if (!numa_demotion_enabled)
 		return false;
 	if (sc && sc->no_demotion)
 		return false;
-	if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+	demotion_nid = next_demotion_node(nid);
+	if (demotion_nid == NUMA_NO_NODE)
 		return false;
 
-	return true;
+	/* If demotion node isn't in the cgroup's mems_allowed, fall back */
+	return mem_cgroup_node_allowed(memcg, demotion_nid);
 }
 
 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -376,7 +382,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
 	 *
 	 * Can it be reclaimed from this node via demotion?
 	 */
-	return can_demote(nid, sc);
+	return can_demote(nid, sc, memcg);
 }
 
 /*
@@ -1096,7 +1102,8 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
  */
 static unsigned int shrink_folio_list(struct list_head *folio_list,
 		struct pglist_data *pgdat, struct scan_control *sc,
-		struct reclaim_stat *stat, bool ignore_references)
+		struct reclaim_stat *stat, bool ignore_references,
+		struct mem_cgroup *memcg)
 {
 	struct folio_batch free_folios;
 	LIST_HEAD(ret_folios);
@@ -1109,7 +1116,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 	folio_batch_init(&free_folios);
 	memset(stat, 0, sizeof(*stat));
 	cond_resched();
-	do_demote_pass = can_demote(pgdat->node_id, sc);
+	do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
 
 retry:
 	while (!list_empty(folio_list)) {
@@ -1658,7 +1665,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 	 */
 	noreclaim_flag = memalloc_noreclaim_save();
 	nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
-					&stat, true);
+					&stat, true, NULL);
 	memalloc_noreclaim_restore(noreclaim_flag);
 
 	list_splice(&clean_folios, folio_list);
@@ -2031,7 +2038,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	if (nr_taken == 0)
 		return 0;
 
-	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+					 lruvec_memcg(lruvec));
 
 	spin_lock_irq(&lruvec->lru_lock);
 	move_folios_to_lru(lruvec, &folio_list);
@@ -2214,7 +2222,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
 		.no_demotion = 1,
 	};
 
-	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
+	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
 	while (!list_empty(folio_list)) {
 		folio = lru_to_folio(folio_list);
 		list_del(&folio->lru);
@@ -2646,7 +2654,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
  * Anonymous LRU management is a waste if there is
  * ultimately no way to reclaim the memory.
  */
-static bool can_age_anon_pages(struct pglist_data *pgdat,
+static bool can_age_anon_pages(struct lruvec *lruvec,
 			       struct scan_control *sc)
 {
 	/* Aging the anon LRU is valuable if swap is present: */
@@ -2654,7 +2662,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
 		return true;
 
 	/* Also valuable if anon pages can be demoted: */
-	return can_demote(pgdat->node_id, sc);
+	return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
+			  lruvec_memcg(lruvec));
 }
 
 #ifdef CONFIG_LRU_GEN
@@ -2732,7 +2741,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
 	if (!sc->may_swap)
 		return 0;
 
-	if (!can_demote(pgdat->node_id, sc) &&
+	if (!can_demote(pgdat->node_id, sc, memcg) &&
 	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
 		return 0;
 
@@ -4695,7 +4704,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 	if (list_empty(&list))
 		return scanned;
 retry:
-	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
 	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
 	sc->nr_reclaimed += reclaimed;
 	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
@@ -5850,7 +5859,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+	if (can_age_anon_pages(lruvec, sc) &&
 	    inactive_is_low(lruvec, LRU_INACTIVE_ANON))
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 				   sc, LRU_ACTIVE_ANON);
@@ -6681,10 +6690,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 		return;
 	}
 
-	if (!can_age_anon_pages(pgdat, sc))
+	lruvec = mem_cgroup_lruvec(NULL, pgdat);
+	if (!can_age_anon_pages(lruvec, sc))
 		return;
 
-	lruvec = mem_cgroup_lruvec(NULL, pgdat);
 	if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
 		return;
 
-- 
2.49.0




* Re: [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim
  2025-04-22  1:26 ` [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
@ 2025-04-22  2:02   ` Waiman Long
  2025-04-22  4:07     ` Gregory Price
  2025-04-22  4:30   ` [PATCH] cpuset: relax locking on cpuset_node_allowed Gregory Price
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 15+ messages in thread
From: Waiman Long @ 2025-04-22  2:02 UTC (permalink / raw)
  To: Gregory Price, linux-mm
  Cc: cgroups, linux-kernel, kernel-team, hannes, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

On 4/21/25 9:26 PM, Gregory Price wrote:
> It is possible for a reclaimer to cause demotions of an lruvec belonging
> to a cgroup with cpuset.mems set to exclude some nodes. Attempt to apply
> this limitation based on the lruvec's memcg and prevent demotion.
>
> Notably, this may still allow demotion of shared libraries or any memory
> first instantiated in another cgroup. This means cpusets still cannot
> guarantee complete isolation when demotion is enabled, and the
> docs have been updated to reflect this.
>
> This is useful for isolating workloads on a multi-tenant system from
> certain classes of memory more consistently - with the noted exceptions.
>
> Acked-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Gregory Price <gourry@gourry.net>
> ---
>   .../ABI/testing/sysfs-kernel-mm-numa          | 16 +++++---
>   include/linux/cpuset.h                        |  5 +++
>   include/linux/memcontrol.h                    |  6 +++
>   kernel/cgroup/cpuset.c                        | 26 ++++++++++++
>   mm/memcontrol.c                               |  6 +++
>   mm/vmscan.c                                   | 41 +++++++++++--------
>   6 files changed, 78 insertions(+), 22 deletions(-)
>
> diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa
> index 77e559d4ed80..90e375ff54cb 100644
> --- a/Documentation/ABI/testing/sysfs-kernel-mm-numa
> +++ b/Documentation/ABI/testing/sysfs-kernel-mm-numa
> @@ -16,9 +16,13 @@ Description:	Enable/disable demoting pages during reclaim
>   		Allowing page migration during reclaim enables these
>   		systems to migrate pages from fast tiers to slow tiers
>   		when the fast tier is under pressure.  This migration
> -		is performed before swap.  It may move data to a NUMA
> -		node that does not fall into the cpuset of the
> -		allocating process which might be construed to violate
> -		the guarantees of cpusets.  This should not be enabled
> -		on systems which need strict cpuset location
> -		guarantees.
> +		is performed before swap if an eligible NUMA node is
> +		present in cpuset.mems for the cgroup (or if cpuset v1
> +		is being used). If cpuset.mems changes at runtime, it
> +		may move data to a NUMA node that does not fall into
> +		the new cpuset.mems, which might be construed
> +		to violate the guarantees of cpusets.  Shared memory,
> +		such as libraries, owned by another cgroup may still be
> +		demoted and result in memory use on a node not present
> +		in cpuset.mems. This should not be enabled on systems
> +		which need strict cpuset location guarantees.
> diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
> index 893a4c340d48..5255e3fdbf62 100644
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -171,6 +171,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
>   	task_unlock(current);
>   }
>   
> +extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
>   #else /* !CONFIG_CPUSETS */
>   
>   static inline bool cpusets_enabled(void) { return false; }
> @@ -282,6 +283,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
>   	return false;
>   }
>   
> +static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
> +{
> +	return true;
> +}
>   #endif /* !CONFIG_CPUSETS */
>   
>   #endif /* _LINUX_CPUSET_H */
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 53364526d877..a6c4e3faf721 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1736,6 +1736,8 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
>   	rcu_read_unlock();
>   }
>   
> +bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
> +
>   #else
>   static inline bool mem_cgroup_kmem_disabled(void)
>   {
> @@ -1793,6 +1795,10 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
>   {
>   }
>   
> +static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
> +{
> +	return true;
> +}
>   #endif /* CONFIG_MEMCG */
>   
>   #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index f8e6a9b642cb..c52348bfd5db 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -4163,6 +4163,32 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
>   	return allowed;
>   }
>   
> +bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
> +{
> +	struct cgroup_subsys_state *css;
> +	struct cpuset *cs;
> +	bool allowed;
> +
> +	/*
> +	 * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy
> +	 * and mems_allowed is likely to be empty even if we could get to it,
> +	 * so return true to avoid taking a global lock on the empty check.
> +	 */
> +	if (!cpuset_v2())
> +		return true;
> +
> +	css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
> +	if (!css)
> +		return true;
> +
> +	cs = container_of(css, struct cpuset, css);
> +	rcu_read_lock();

Sorry, I missed the fact that cgroup_get_e_css() will take a reference 
to the css and so it won't go away. In that case, rcu_read_lock() isn't 
really needed. However, I do want a comment to say that accessing 
effective_mems should normally require taking either a cpuset_mutex or 
callback_lock, but is skipped in this case to avoid taking a global lock 
in the reclaim path at the expense that the result may be inaccurate in 
some rare cases.

Cheers,
Longman




* Re: [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim
  2025-04-22  2:02   ` Waiman Long
@ 2025-04-22  4:07     ` Gregory Price
  0 siblings, 0 replies; 15+ messages in thread
From: Gregory Price @ 2025-04-22  4:07 UTC (permalink / raw)
  To: Waiman Long
  Cc: linux-mm, cgroups, linux-kernel, kernel-team, hannes, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

On Mon, Apr 21, 2025 at 10:02:22PM -0400, Waiman Long wrote:
> > +bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
> > +{
> > +	struct cgroup_subsys_state *css;
> > +	struct cpuset *cs;
> > +	bool allowed;
> > +
> > +	/*
> > +	 * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy
> > +	 * and mems_allowed is likely to be empty even if we could get to it,
> > +	 * so return true to avoid taking a global lock on the empty check.
> > +	 */
> > +	if (!cpuset_v2())
> > +		return true;
> > +
> > +	css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
> > +	if (!css)
> > +		return true;
> > +
> > +	cs = container_of(css, struct cpuset, css);
> > +	rcu_read_lock();
> 
> Sorry, I missed the fact that cgroup_get_e_css() will take a reference to
> the css and so it won't go away. In that case, rcu_read_lock() isn't really
> needed. However, I do want a comment to say that accessing effective_mems
> should normally requrie taking either a cpuset_mutex or callback_lock, but
> is skipped in this case to avoid taking a global lock in the reclaim path at
> the expense that the result may be inaccurate in some rare cases.
> 

I'll add a differential patch here.

~Gregory



* [PATCH] cpuset: relax locking on cpuset_node_allowed
  2025-04-22  1:26 ` [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
  2025-04-22  2:02   ` Waiman Long
@ 2025-04-22  4:30   ` Gregory Price
  2025-04-22  4:41     ` Shakeel Butt
                       ` (2 more replies)
  2025-04-22  4:41   ` [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim Shakeel Butt
                     ` (2 subsequent siblings)
  4 siblings, 3 replies; 15+ messages in thread
From: Gregory Price @ 2025-04-22  4:30 UTC (permalink / raw)
  To: linux-mm
  Cc: cgroups, linux-kernel, kernel-team, longman, hannes, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

The cgroup_get_e_css reference protects the css->effective_mems, and
calls of this interface would be subject to the same race conditions
associated with a non-atomic access to cs->effective_mems.

So while this interface cannot make strong guarantees of correctness,
it can avoid taking a global lock or rcu_read_lock for performance.

Drop the rcu_read_lock from cpuset_node_allowed.

Suggested-by: Shakeel Butt <shakeel.butt@linux.dev>
Suggested-by: Waiman Long <longman@redhat.com>
Signed-off-by: Gregory Price <gourry@gourry.net>
---
 kernel/cgroup/cpuset.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c52348bfd5db..1dc41758c62c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4181,10 +4181,20 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
 	if (!css)
 		return true;
 
+	/*
+	 * Normally, accessing effective_mems would require the cpuset_mutex
+	 * or RCU read lock - but node_isset is atomic and the reference
+	 * taken via cgroup_get_e_css is sufficient to protect css.
+	 *
+	 * Since this interface is intended for use by migration paths, we
+	 * relax locking here to avoid taking global locks - while accepting
+	 * there may be rare scenarios where the result may be inaccurate.
+	 *
+	 * Reclaim and migration are subject to these same race conditions, and
+	 * cannot make strong isolation guarantees, so this is acceptable.
+	 */
 	cs = container_of(css, struct cpuset, css);
-	rcu_read_lock();
 	allowed = node_isset(nid, cs->effective_mems);
-	rcu_read_unlock();
 	css_put(css);
 	return allowed;
 }
-- 
2.49.0




* Re: [PATCH] cpuset: relax locking on cpuset_node_allowed
  2025-04-22  4:30   ` [PATCH] cpuset: relax locking on cpuset_node_allowed Gregory Price
@ 2025-04-22  4:41     ` Shakeel Butt
  2025-04-22 17:46     ` Johannes Weiner
  2025-04-22 19:57     ` Waiman Long
  2 siblings, 0 replies; 15+ messages in thread
From: Shakeel Butt @ 2025-04-22  4:41 UTC (permalink / raw)
  To: Gregory Price
  Cc: linux-mm, cgroups, linux-kernel, kernel-team, longman, hannes,
	mhocko, roman.gushchin, muchun.song, tj, mkoutny, akpm

On Mon, Apr 21, 2025 at 9:30 PM Gregory Price <gourry@gourry.net> wrote:
>
> The cgroup_get_e_css reference protects the css->effective_mems, and
> calls of this interface would be subject to the same race conditions
> associated with a non-atomic access to cs->effective_mems.
>
> So while this interface cannot make strong guarantees of correctness,
> it can therefore avoid taking a global or rcu_read_lock for performance.
>
> Drop the rcu_read_lock from cpuset_node_allowed.
>
> Suggested-by: Shakeel Butt <shakeel.butt@linux.dev>
> Suggested-by: Waiman Long <longman@redhat.com>
> Signed-off-by: Gregory Price <gourry@gourry.net>

Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>



* Re: [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim
  2025-04-22  1:26 ` [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
  2025-04-22  2:02   ` Waiman Long
  2025-04-22  4:30   ` [PATCH] cpuset: relax locking on cpuset_node_allowed Gregory Price
@ 2025-04-22  4:41   ` Shakeel Butt
  2025-04-22 17:41   ` Johannes Weiner
  2025-04-24 20:22   ` [PATCH v5 0/2] vmscan: enforce mems_effective during demotion Gregory Price
  4 siblings, 0 replies; 15+ messages in thread
From: Shakeel Butt @ 2025-04-22  4:41 UTC (permalink / raw)
  To: Gregory Price
  Cc: linux-mm, cgroups, linux-kernel, kernel-team, longman, hannes,
	mhocko, roman.gushchin, muchun.song, tj, mkoutny, akpm

On Mon, Apr 21, 2025 at 6:26 PM Gregory Price <gourry@gourry.net> wrote:
>
> It is possible for a reclaimer to cause demotions of an lruvec belonging
> to a cgroup with cpuset.mems set to exclude some nodes. Attempt to apply
> this limitation based on the lruvec's memcg and prevent demotion.
>
> Notably, this may still allow demotion of shared libraries or any memory
> first instantiated in another cgroup. This means cpusets still cannot
> guarantee complete isolation when demotion is enabled, and the
> docs have been updated to reflect this.
>
> This is useful for isolating workloads on a multi-tenant system from
> certain classes of memory more consistently - with the noted exceptions.
>
> Acked-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Gregory Price <gourry@gourry.net>

Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>



* Re: [PATCH v4 1/2] cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed
  2025-04-22  1:26 ` [PATCH v4 1/2] cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed Gregory Price
@ 2025-04-22 17:37   ` Johannes Weiner
  0 siblings, 0 replies; 15+ messages in thread
From: Johannes Weiner @ 2025-04-22 17:37 UTC (permalink / raw)
  To: Gregory Price
  Cc: linux-mm, cgroups, linux-kernel, kernel-team, longman, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

On Mon, Apr 21, 2025 at 09:26:14PM -0400, Gregory Price wrote:
> Rename cpuset_node_allowed to reflect that the function checks the
> current task's cpuset.mems.  This allows us to make a new
> cpuset_node_allowed function that checks a target cgroup's cpuset.mems.
> 
> Acked-by: Waiman Long <longman@redhat.com>
> Acked-by: Tejun Heo <tj@kernel.org>
> Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
> Signed-off-by: Gregory Price <gourry@gourry.net>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>



* Re: [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim
  2025-04-22  1:26 ` [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
                     ` (2 preceding siblings ...)
  2025-04-22  4:41   ` [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim Shakeel Butt
@ 2025-04-22 17:41   ` Johannes Weiner
  2025-04-24 20:22   ` [PATCH v5 0/2] vmscan: enforce mems_effective during demotion Gregory Price
  4 siblings, 0 replies; 15+ messages in thread
From: Johannes Weiner @ 2025-04-22 17:41 UTC (permalink / raw)
  To: Gregory Price
  Cc: linux-mm, cgroups, linux-kernel, kernel-team, longman, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

On Mon, Apr 21, 2025 at 09:26:15PM -0400, Gregory Price wrote:
> It is possible for a reclaimer to cause demotions of an lruvec belonging
> to a cgroup with cpuset.mems set to exclude some nodes. Attempt to apply
> this limitation based on the lruvec's memcg and prevent demotion.
> 
> Notably, this may still allow demotion of shared libraries or any memory
> first instantiated in another cgroup. This means cpusets still cannot
> guarantee complete isolation when demotion is enabled, and the
> docs have been updated to reflect this.
> 
> This is useful for isolating workloads on a multi-tenant system from
> certain classes of memory more consistently - with the noted exceptions.
> 
> Acked-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Gregory Price <gourry@gourry.net>

With the rcu lock removal from the follow-up fixlet,

Acked-by: Johannes Weiner <hannes@cmpxchg.org>



* Re: [PATCH] cpuset: relax locking on cpuset_node_allowed
  2025-04-22  4:30   ` [PATCH] cpuset: relax locking on cpuset_node_allowed Gregory Price
  2025-04-22  4:41     ` Shakeel Butt
@ 2025-04-22 17:46     ` Johannes Weiner
  2025-04-22 19:57     ` Waiman Long
  2 siblings, 0 replies; 15+ messages in thread
From: Johannes Weiner @ 2025-04-22 17:46 UTC (permalink / raw)
  To: Gregory Price
  Cc: linux-mm, cgroups, linux-kernel, kernel-team, longman, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

On Tue, Apr 22, 2025 at 12:30:55AM -0400, Gregory Price wrote:
> The cgroup_get_e_css reference protects the css->effective_mems, and
> calls of this interface would be subject to the same race conditions
> associated with a non-atomic access to cs->effective_mems.
> 
> So while this interface cannot make strong guarantees of correctness,
> it can therefore avoid taking a global or rcu_read_lock for performance.
> 
> Drop the rcu_read_lock from cpuset_node_allowed.
> 
> Suggested-by: Shakeel Butt <shakeel.butt@linux.dev>
> Suggested-by: Waiman Long <longman@redhat.com>
> Signed-off-by: Gregory Price <gourry@gourry.net>
> ---
>  kernel/cgroup/cpuset.c | 14 ++++++++++++--
>  1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index c52348bfd5db..1dc41758c62c 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -4181,10 +4181,20 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
>  	if (!css)
>  		return true;
>  
> +	/*
> +	 * Normally, accessing effective_mems would require the cpuset_mutex
> +	 * or RCU read lock - but node_isset is atomic and the reference

              ^^^^^^^^^^^^^

This should be callback_lock. rcu_read_lock() was intended for css
lifetime - which is ensured by cgroup_get_e_css() - not a stable mask.

Otherwise looks good, makes sense to lampshade the lockless access.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>



* Re: [PATCH] cpuset: relax locking on cpuset_node_allowed
  2025-04-22  4:30   ` [PATCH] cpuset: relax locking on cpuset_node_allowed Gregory Price
  2025-04-22  4:41     ` Shakeel Butt
  2025-04-22 17:46     ` Johannes Weiner
@ 2025-04-22 19:57     ` Waiman Long
  2 siblings, 0 replies; 15+ messages in thread
From: Waiman Long @ 2025-04-22 19:57 UTC (permalink / raw)
  To: Gregory Price, linux-mm
  Cc: cgroups, linux-kernel, kernel-team, hannes, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

On 4/22/25 12:30 AM, Gregory Price wrote:
> The cgroup_get_e_css reference protects the css->effective_mems, and
> calls of this interface would be subject to the same race conditions
> associated with a non-atomic access to cs->effective_mems.
>
> So while this interface cannot make strong guarantees of correctness,
> it can therefore avoid taking a global or rcu_read_lock for performance.
>
> Drop the rcu_read_lock from cpuset_node_allowed.
>
> Suggested-by: Shakeel Butt <shakeel.butt@linux.dev>
> Suggested-by: Waiman Long <longman@redhat.com>
> Signed-off-by: Gregory Price <gourry@gourry.net>
> ---
>   kernel/cgroup/cpuset.c | 14 ++++++++++++--
>   1 file changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index c52348bfd5db..1dc41758c62c 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -4181,10 +4181,20 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
>   	if (!css)
>   		return true;
>   
> +	/*
> +	 * Normally, accessing effective_mems would require the cpuset_mutex
> +	 * or RCU read lock - but node_isset is atomic and the reference
> +	 * taken via cgroup_get_e_css is sufficient to protect css.
> +	 *
> +	 * Since this interface is intended for use by migration paths, we
> +	 * relax locking here to avoid taking global locks - while accepting
> +	 * there may be rare scenarios where the result may be inaccurate.
> +	 *
> +	 * Reclaim and migration are subject to these same race conditions, and
> +	 * cannot make strong isolation guarantees, so this is acceptable.
> +	 */
>   	cs = container_of(css, struct cpuset, css);
> -	rcu_read_lock();
>   	allowed = node_isset(nid, cs->effective_mems);
> -	rcu_read_unlock();
>   	css_put(css);
>   	return allowed;
>   }

Except for labeling it as the RCU read lock instead of callback_lock, as 
pointed out by Johannes, the change looks good to me.

Reviewed-by: Waiman Long <longman@redhat.com>




* [PATCH v5 0/2] vmscan: enforce mems_effective during demotion
  2025-04-22  1:26 ` [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
                     ` (3 preceding siblings ...)
  2025-04-22 17:41   ` Johannes Weiner
@ 2025-04-24 20:22   ` Gregory Price
  2025-04-24 20:22     ` [PATCH v5 1/2] cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed Gregory Price
  2025-04-24 20:22     ` [PATCH v5 2/2] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
  4 siblings, 2 replies; 15+ messages in thread
From: Gregory Price @ 2025-04-24 20:22 UTC (permalink / raw)
  To: linux-mm
  Cc: cgroups, linux-kernel, kernel-team, longman, hannes, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

Change reclaim to respect cpuset.mems_effective during demotion when
possible. Presently, reclaim explicitly ignores cpuset.mems_effective
when demoting, which may cause the cpuset settings to be violated.

Implement cpuset_node_allowed() to check the cpuset.mems_effective
associated with the mem_cgroup of the lruvec being scanned. This only
applies to cgroup/cpuset v2, as cpuset exists in a different hierarchy
than mem_cgroup in v1.

This requires renaming the existing cpuset_node_allowed() to be
cpuset_current_node_allowed() - which is more descriptive anyway - to
implement the new cpuset_node_allowed() which takes a target cgroup.
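
For reference, the memcg-to-cpuset bridge at the core of this series, in
condensed form (the full version in patch 2/2 carries a comment explaining
the relaxed locking around effective_mems):

	bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
	{
		return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
	}

	bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
	{
		struct cgroup_subsys_state *css;
		struct cpuset *cs;
		bool allowed;

		/* v1: cpuset and mem_cgroup live in different hierarchies */
		if (!cpuset_v2())
			return true;

		css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
		if (!css)
			return true;

		/* Lockless read of effective_mems; see the comment in the patch */
		cs = container_of(css, struct cpuset, css);
		allowed = node_isset(nid, cs->effective_mems);
		css_put(css);
		return allowed;
	}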

v5:
- squash the 'drop rcu_read_lock' fixlet into the second patch
- changelog fixups

Gregory Price (2):
  cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed
  vmscan,cgroup: apply mems_effective to reclaim

 .../ABI/testing/sysfs-kernel-mm-numa          | 16 +++++---
 include/linux/cpuset.h                        |  9 +++-
 include/linux/memcontrol.h                    |  6 +++
 kernel/cgroup/cpuset.c                        | 40 +++++++++++++++++-
 mm/memcontrol.c                               |  6 +++
 mm/page_alloc.c                               |  4 +-
 mm/vmscan.c                                   | 41 +++++++++++--------
 7 files changed, 94 insertions(+), 28 deletions(-)

-- 
2.49.0



* [PATCH v5 1/2] cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed
  2025-04-24 20:22   ` [PATCH v5 0/2] vmscan: enforce mems_effective during demotion Gregory Price
@ 2025-04-24 20:22     ` Gregory Price
  2025-04-24 20:22     ` [PATCH v5 2/2] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
  1 sibling, 0 replies; 15+ messages in thread
From: Gregory Price @ 2025-04-24 20:22 UTC (permalink / raw)
  To: linux-mm
  Cc: cgroups, linux-kernel, kernel-team, longman, hannes, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

Rename cpuset_node_allowed to reflect that the function checks the
current task's cpuset.mems.  This allows us to make a new
cpuset_node_allowed function that checks a target cgroup's cpuset.mems.

Acked-by: Waiman Long <longman@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Gregory Price <gourry@gourry.net>
---
 include/linux/cpuset.h | 4 ++--
 kernel/cgroup/cpuset.c | 4 ++--
 mm/page_alloc.c        | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 835e7b793f6a..893a4c340d48 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -82,11 +82,11 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
-extern bool cpuset_node_allowed(int node, gfp_t gfp_mask);
+extern bool cpuset_current_node_allowed(int node, gfp_t gfp_mask);
 
 static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+	return cpuset_current_node_allowed(zone_to_nid(z), gfp_mask);
 }
 
 static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f910c828973..f8e6a9b642cb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4090,7 +4090,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 }
 
 /*
- * cpuset_node_allowed - Can we allocate on a memory node?
+ * cpuset_current_node_allowed - Can current task allocate on a memory node?
  * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
@@ -4129,7 +4129,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  */
-bool cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
 {
 	struct cpuset *cs;		/* current cpuset ancestors */
 	bool allowed;			/* is allocation in zone z allowed? */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5079b1b04d49..233ce25f8f3d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3461,7 +3461,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 retry:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
-	 * See also cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
+	 * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c.
 	 */
 	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
 	z = ac->preferred_zoneref;
@@ -4148,7 +4148,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
 		/*
 		 * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
 		 * GFP_ATOMIC) rather than fail, see the comment for
-		 * cpuset_node_allowed().
+		 * cpuset_current_node_allowed().
 		 */
 		if (alloc_flags & ALLOC_MIN_RESERVE)
 			alloc_flags &= ~ALLOC_CPUSET;
-- 
2.49.0




* [PATCH v5 2/2] vmscan,cgroup: apply mems_effective to reclaim
  2025-04-24 20:22   ` [PATCH v5 0/2] vmscan: enforce mems_effective during demotion Gregory Price
  2025-04-24 20:22     ` [PATCH v5 1/2] cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed Gregory Price
@ 2025-04-24 20:22     ` Gregory Price
  1 sibling, 0 replies; 15+ messages in thread
From: Gregory Price @ 2025-04-24 20:22 UTC (permalink / raw)
  To: linux-mm
  Cc: cgroups, linux-kernel, kernel-team, longman, hannes, mhocko,
	roman.gushchin, shakeel.butt, muchun.song, tj, mkoutny, akpm

It is possible for a reclaimer to cause demotions of an lruvec belonging
to a cgroup with cpuset.mems set to exclude some nodes. Attempt to apply
this limitation based on the lruvec's memcg and prevent demotion.

Notably, this may still allow demotion of shared libraries or any memory
first instantiated in another cgroup. This means cpusets still cannot
guarantee complete isolation when demotion is enabled, and the
docs have been updated to reflect this.

This is useful for isolating workloads on a multi-tenant system from
certain classes of memory more consistently - with the noted exceptions.

Note on locking:

The cgroup_get_e_css reference protects the css->effective_mems, and
calls of this interface would be subject to the same race conditions
associated with a non-atomic access to cs->effective_mems.

So while this interface cannot make strong guarantees of correctness,
it can avoid taking a global lock or rcu_read_lock for performance.

Suggested-by: Shakeel Butt <shakeel.butt@linux.dev>
Suggested-by: Waiman Long <longman@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Waiman Long <longman@redhat.com>
Signed-off-by: Gregory Price <gourry@gourry.net>
---
 .../ABI/testing/sysfs-kernel-mm-numa          | 16 +++++---
 include/linux/cpuset.h                        |  5 +++
 include/linux/memcontrol.h                    |  6 +++
 kernel/cgroup/cpuset.c                        | 36 ++++++++++++++++
 mm/memcontrol.c                               |  6 +++
 mm/vmscan.c                                   | 41 +++++++++++--------
 6 files changed, 88 insertions(+), 22 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa
index 77e559d4ed80..90e375ff54cb 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-numa
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-numa
@@ -16,9 +16,13 @@ Description:	Enable/disable demoting pages during reclaim
 		Allowing page migration during reclaim enables these
 		systems to migrate pages from fast tiers to slow tiers
 		when the fast tier is under pressure.  This migration
-		is performed before swap.  It may move data to a NUMA
-		node that does not fall into the cpuset of the
-		allocating process which might be construed to violate
-		the guarantees of cpusets.  This should not be enabled
-		on systems which need strict cpuset location
-		guarantees.
+		is performed before swap if an eligible NUMA node is
+		present in cpuset.mems for the cgroup (or if cpuset v1
+		is being used). If cpuset.mems changes at runtime, it
+		may move data to a NUMA node that does not fall into
+		the new cpuset.mems, which might be construed
+		to violate the guarantees of cpusets.  Shared memory,
+		such as libraries, owned by another cgroup may still be
+		demoted and result in memory use on a node not present
+		in cpuset.mems. This should not be enabled on systems
+		which need strict cpuset location guarantees.
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 893a4c340d48..5255e3fdbf62 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -171,6 +171,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 	task_unlock(current);
 }
 
+extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
@@ -282,6 +283,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
 	return false;
 }
 
+static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+	return true;
+}
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 53364526d877..a6c4e3faf721 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1736,6 +1736,8 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
 	rcu_read_unlock();
 }
 
+bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
+
 #else
 static inline bool mem_cgroup_kmem_disabled(void)
 {
@@ -1793,6 +1795,10 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
 {
 }
 
+static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+	return true;
+}
 #endif /* CONFIG_MEMCG */
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f8e6a9b642cb..7eb71d411dc7 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4163,6 +4163,42 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
 	return allowed;
 }
 
+bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+	struct cgroup_subsys_state *css;
+	struct cpuset *cs;
+	bool allowed;
+
+	/*
+	 * In v1, mem_cgroup and cpuset are unlikely to be in the same hierarchy
+	 * and mems_allowed is likely to be empty even if we could get to it,
+	 * so return true to avoid taking a global lock on the empty check.
+	 */
+	if (!cpuset_v2())
+		return true;
+
+	css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
+	if (!css)
+		return true;
+
+	/*
+	 * Normally, accessing effective_mems would require the cpuset_mutex
+	 * or callback_lock - but node_isset is atomic and the reference
+	 * taken via cgroup_get_e_css is sufficient to protect css.
+	 *
+	 * Since this interface is intended for use by migration paths, we
+	 * relax locking here to avoid taking global locks - while accepting
+	 * there may be rare scenarios where the result may be inaccurate.
+	 *
+	 * Reclaim and migration are subject to these same race conditions, and
+	 * cannot make strong isolation guarantees, so this is acceptable.
+	 */
+	cs = container_of(css, struct cpuset, css);
+	allowed = node_isset(nid, cs->effective_mems);
+	css_put(css);
+	return allowed;
+}
+
 /**
  * cpuset_spread_node() - On which node to begin search for a page
  * @rotor: round robin rotor
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 40c07b8699ae..2f61d0060fd1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
+#include <linux/cpuset.h>
 #include <linux/sched/mm.h>
 #include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
@@ -5437,3 +5438,8 @@ static int __init mem_cgroup_swap_init(void)
 subsys_initcall(mem_cgroup_swap_init);
 
 #endif /* CONFIG_SWAP */
+
+bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+	return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2b2ab386cab5..32a7ce421e42 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -342,16 +342,22 @@ static void flush_reclaim_state(struct scan_control *sc)
 	}
 }
 
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+		       struct mem_cgroup *memcg)
 {
+	int demotion_nid;
+
 	if (!numa_demotion_enabled)
 		return false;
 	if (sc && sc->no_demotion)
 		return false;
-	if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+	demotion_nid = next_demotion_node(nid);
+	if (demotion_nid == NUMA_NO_NODE)
 		return false;
 
-	return true;
+	/* If demotion node isn't in the cgroup's mems_allowed, fall back */
+	return mem_cgroup_node_allowed(memcg, demotion_nid);
 }
 
 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -376,7 +382,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
 	 *
 	 * Can it be reclaimed from this node via demotion?
 	 */
-	return can_demote(nid, sc);
+	return can_demote(nid, sc, memcg);
 }
 
 /*
@@ -1096,7 +1102,8 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
  */
 static unsigned int shrink_folio_list(struct list_head *folio_list,
 		struct pglist_data *pgdat, struct scan_control *sc,
-		struct reclaim_stat *stat, bool ignore_references)
+		struct reclaim_stat *stat, bool ignore_references,
+		struct mem_cgroup *memcg)
 {
 	struct folio_batch free_folios;
 	LIST_HEAD(ret_folios);
@@ -1109,7 +1116,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 	folio_batch_init(&free_folios);
 	memset(stat, 0, sizeof(*stat));
 	cond_resched();
-	do_demote_pass = can_demote(pgdat->node_id, sc);
+	do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
 
 retry:
 	while (!list_empty(folio_list)) {
@@ -1658,7 +1665,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 	 */
 	noreclaim_flag = memalloc_noreclaim_save();
 	nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
-					&stat, true);
+					&stat, true, NULL);
 	memalloc_noreclaim_restore(noreclaim_flag);
 
 	list_splice(&clean_folios, folio_list);
@@ -2031,7 +2038,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	if (nr_taken == 0)
 		return 0;
 
-	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+					 lruvec_memcg(lruvec));
 
 	spin_lock_irq(&lruvec->lru_lock);
 	move_folios_to_lru(lruvec, &folio_list);
@@ -2214,7 +2222,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
 		.no_demotion = 1,
 	};
 
-	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
+	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
 	while (!list_empty(folio_list)) {
 		folio = lru_to_folio(folio_list);
 		list_del(&folio->lru);
@@ -2646,7 +2654,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
  * Anonymous LRU management is a waste if there is
  * ultimately no way to reclaim the memory.
  */
-static bool can_age_anon_pages(struct pglist_data *pgdat,
+static bool can_age_anon_pages(struct lruvec *lruvec,
 			       struct scan_control *sc)
 {
 	/* Aging the anon LRU is valuable if swap is present: */
@@ -2654,7 +2662,8 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
 		return true;
 
 	/* Also valuable if anon pages can be demoted: */
-	return can_demote(pgdat->node_id, sc);
+	return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
+			  lruvec_memcg(lruvec));
 }
 
 #ifdef CONFIG_LRU_GEN
@@ -2732,7 +2741,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
 	if (!sc->may_swap)
 		return 0;
 
-	if (!can_demote(pgdat->node_id, sc) &&
+	if (!can_demote(pgdat->node_id, sc, memcg) &&
 	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
 		return 0;
 
@@ -4695,7 +4704,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 	if (list_empty(&list))
 		return scanned;
 retry:
-	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
 	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
 	sc->nr_reclaimed += reclaimed;
 	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
@@ -5850,7 +5859,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+	if (can_age_anon_pages(lruvec, sc) &&
 	    inactive_is_low(lruvec, LRU_INACTIVE_ANON))
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 				   sc, LRU_ACTIVE_ANON);
@@ -6681,10 +6690,10 @@ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 		return;
 	}
 
-	if (!can_age_anon_pages(pgdat, sc))
+	lruvec = mem_cgroup_lruvec(NULL, pgdat);
+	if (!can_age_anon_pages(lruvec, sc))
 		return;
 
-	lruvec = mem_cgroup_lruvec(NULL, pgdat);
 	if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
 		return;
 
-- 
2.49.0




end of thread

Thread overview: 15+ messages
2025-04-22  1:26 [PATCH v4 0/2] vmscan: enforce mems_effective during demotion Gregory Price
2025-04-22  1:26 ` [PATCH v4 1/2] cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed Gregory Price
2025-04-22 17:37   ` Johannes Weiner
2025-04-22  1:26 ` [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
2025-04-22  2:02   ` Waiman Long
2025-04-22  4:07     ` Gregory Price
2025-04-22  4:30   ` [PATCH] cpuset: relax locking on cpuset_node_allowed Gregory Price
2025-04-22  4:41     ` Shakeel Butt
2025-04-22 17:46     ` Johannes Weiner
2025-04-22 19:57     ` Waiman Long
2025-04-22  4:41   ` [PATCH v4 2/2] vmscan,cgroup: apply mems_effective to reclaim Shakeel Butt
2025-04-22 17:41   ` Johannes Weiner
2025-04-24 20:22   ` [PATCH v5 0/2] vmscan: enforce mems_effective during demotion Gregory Price
2025-04-24 20:22     ` [PATCH v5 1/2] cpuset: rename cpuset_node_allowed to cpuset_current_node_allowed Gregory Price
2025-04-24 20:22     ` [PATCH v5 2/2] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
