linux-mm.kvack.org archive mirror
* [RFC PATCH] vmscan,cgroup: apply mems_effective to reclaim
@ 2025-03-20 21:09 Gregory Price
  2025-04-08 18:17 ` Johannes Weiner
  2025-04-08 18:36 ` Waiman Long
  0 siblings, 2 replies; 3+ messages in thread
From: Gregory Price @ 2025-03-20 21:09 UTC (permalink / raw)
  To: linux-mm
  Cc: cgroups, linux-kernel, kernel-team, longman, tj, hannes, mkoutny, akpm

It is possible for a reclaimer to cause demotions of an lruvec belonging
to a cgroup with cpuset.mems set to exclude some nodes. Attempt to apply
this limitation based on the lruvec's memcg to prevent such demotions.

Notably, this may still allow demotion of shared libraries or any memory
first instantiated in another cgroup. This means cpusets still cannot
guarantee complete isolation when demotion is enabled, and the docs have
been updated to reflect this.


Note: This is a fairly hacked up method that probably overlooks some
      cgroup/cpuset controls or designs. RFCing now for some discussion
      at LSFMM '25.


Signed-off-by: Gregory Price <gourry@gourry.net>
---
 .../ABI/testing/sysfs-kernel-mm-numa          | 14 +++++---
 include/linux/cpuset.h                        |  2 ++
 kernel/cgroup/cpuset.c                        | 10 ++++++
 mm/vmscan.c                                   | 32 ++++++++++++-------
 4 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa
index 77e559d4ed80..27cdcab901f7 100644
--- a/Documentation/ABI/testing/sysfs-kernel-mm-numa
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-numa
@@ -16,9 +16,13 @@ Description:	Enable/disable demoting pages during reclaim
 		Allowing page migration during reclaim enables these
 		systems to migrate pages from fast tiers to slow tiers
 		when the fast tier is under pressure.  This migration
-		is performed before swap.  It may move data to a NUMA
-		node that does not fall into the cpuset of the
-		allocating process which might be construed to violate
-		the guarantees of cpusets.  This should not be enabled
-		on systems which need strict cpuset location
+		is performed before swap if an eligible NUMA node is
+		present in cpuset.mems for the cgroup. If cpuset.mems
+		changes at runtime, demotion may move data to a NUMA
+		node outside the new cpuset.mems, which might be
+		construed to violate the guarantees of cpusets.  Shared
+		memory, such as libraries, owned by another cgroup may
+		still be demoted, resulting in memory use on a node not
+		present in cpuset.mems. This should not be enabled on
+		systems which need strict cpuset location
 		guarantees.
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 835e7b793f6a..d4169f1b1719 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -171,6 +171,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 	task_unlock(current);
 }
 
+bool memcg_mems_allowed(struct mem_cgroup *memcg, int nid);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f910c828973..bb9669cc105d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4296,3 +4296,13 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
 	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
 		   nodemask_pr_args(&task->mems_allowed));
 }
+
+bool memcg_mems_allowed(struct mem_cgroup *memcg, int nid)
+{
+	struct cgroup_subsys_state *css;
+	struct cpuset *cs;
+
+	css = cgroup_get_e_css(memcg->css.cgroup, &cpuset_cgrp_subsys);
+	cs = css ? container_of(css, struct cpuset, css) : NULL;
+	return cs ? node_isset(nid, cs->effective_mems) : true;
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2b2ab386cab5..04152ea1c03d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -342,16 +342,22 @@ static void flush_reclaim_state(struct scan_control *sc)
 	}
 }
 
-static bool can_demote(int nid, struct scan_control *sc)
+static bool can_demote(int nid, struct scan_control *sc,
+		       struct mem_cgroup *memcg)
 {
+	int demotion_nid;
+
 	if (!numa_demotion_enabled)
 		return false;
 	if (sc && sc->no_demotion)
 		return false;
-	if (next_demotion_node(nid) == NUMA_NO_NODE)
+
+	demotion_nid = next_demotion_node(nid);
+	if (demotion_nid == NUMA_NO_NODE)
 		return false;
 
-	return true;
+	/* If the demotion node isn't in mems_allowed, fall back to swap */
+	return memcg ? memcg_mems_allowed(memcg, demotion_nid) : true;
 }
 
 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
@@ -376,7 +382,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
 	 *
 	 * Can it be reclaimed from this node via demotion?
 	 */
-	return can_demote(nid, sc);
+	return can_demote(nid, sc, NULL);
 }
 
 /*
@@ -1096,7 +1102,8 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
  */
 static unsigned int shrink_folio_list(struct list_head *folio_list,
 		struct pglist_data *pgdat, struct scan_control *sc,
-		struct reclaim_stat *stat, bool ignore_references)
+		struct reclaim_stat *stat, bool ignore_references,
+		struct mem_cgroup *memcg)
 {
 	struct folio_batch free_folios;
 	LIST_HEAD(ret_folios);
@@ -1109,7 +1116,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
 	folio_batch_init(&free_folios);
 	memset(stat, 0, sizeof(*stat));
 	cond_resched();
-	do_demote_pass = can_demote(pgdat->node_id, sc);
+	do_demote_pass = can_demote(pgdat->node_id, sc, memcg);
 
 retry:
 	while (!list_empty(folio_list)) {
@@ -1658,7 +1665,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 	 */
 	noreclaim_flag = memalloc_noreclaim_save();
 	nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
-					&stat, true);
+					&stat, true, NULL);
 	memalloc_noreclaim_restore(noreclaim_flag);
 
 	list_splice(&clean_folios, folio_list);
@@ -2031,7 +2038,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	if (nr_taken == 0)
 		return 0;
 
-	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
+	nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
+					 lruvec_memcg(lruvec));
 
 	spin_lock_irq(&lruvec->lru_lock);
 	move_folios_to_lru(lruvec, &folio_list);
@@ -2214,7 +2222,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
 		.no_demotion = 1,
 	};
 
-	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
+	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true, NULL);
 	while (!list_empty(folio_list)) {
 		folio = lru_to_folio(folio_list);
 		list_del(&folio->lru);
@@ -2654,7 +2662,7 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
 		return true;
 
 	/* Also valuable if anon pages can be demoted: */
-	return can_demote(pgdat->node_id, sc);
+	return can_demote(pgdat->node_id, sc, NULL);
 }
 
 #ifdef CONFIG_LRU_GEN
@@ -2732,7 +2740,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
 	if (!sc->may_swap)
 		return 0;
 
-	if (!can_demote(pgdat->node_id, sc) &&
+	if (!can_demote(pgdat->node_id, sc, NULL) &&
 	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
 		return 0;
 
@@ -4695,7 +4703,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 	if (list_empty(&list))
 		return scanned;
 retry:
-	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, NULL);
 	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
 	sc->nr_reclaimed += reclaimed;
 	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
-- 
2.48.1




* Re: [RFC PATCH] vmscan,cgroup: apply mems_effective to reclaim
  2025-03-20 21:09 [RFC PATCH] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
@ 2025-04-08 18:17 ` Johannes Weiner
  2025-04-08 18:36 ` Waiman Long
  1 sibling, 0 replies; 3+ messages in thread
From: Johannes Weiner @ 2025-04-08 18:17 UTC (permalink / raw)
  To: Gregory Price
  Cc: linux-mm, cgroups, linux-kernel, kernel-team, longman, tj, mkoutny, akpm

On Thu, Mar 20, 2025 at 05:09:19PM -0400, Gregory Price wrote:
> @@ -4296,3 +4296,13 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
>  	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
>  		   nodemask_pr_args(&task->mems_allowed));
>  }
> +
> +bool memcg_mems_allowed(struct mem_cgroup *memcg, int nid)

This should probably be

	cgroup_mems_allowed(struct cgroup *, int)

and then have a

	mem_cgroup_mems_allowed(struct mem_cgroup *, int)

that does the e_css translation, with the necessary dummy functions to
work with all CONFIG combinations.
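
Roughly, as a sketch of that layering (neither function exists yet, and
the file placement is just a guess):

	/* kernel/cgroup/cpuset.c: keeps struct cpuset private */
	bool cgroup_mems_allowed(struct cgroup *cgroup, int nid);

	/* mm/memcontrol.c: only the memcg -> cgroup hop */
	bool mem_cgroup_mems_allowed(struct mem_cgroup *memcg, int nid)
	{
		return cgroup_mems_allowed(memcg->css.cgroup, nid);
	}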

> +{
> +	struct cgroup_subsys_state *css;
> +	struct cpuset *cs;
> +
> +	css = cgroup_get_e_css(memcg->css.cgroup, &cpuset_cgrp_subsys);
> +	cs = css ? container_of(css, struct cpuset, css) : NULL;
> +	return cs ? node_isset(nid, cs->effective_mems) : true;

You need a css_put() to drop the ref from cgroup_get_e_css(), but
otherwise accessing css should be safe this way.

AFAICS you need callback_lock to query cs->effective_mems.
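
Putting those together, maybe something like this (untested sketch; I'm
assuming the spin_lock_irqsave() pattern used for callback_lock
elsewhere in cpuset.c is the right one here):

	bool memcg_mems_allowed(struct mem_cgroup *memcg, int nid)
	{
		struct cgroup_subsys_state *css;
		struct cpuset *cs;
		unsigned long flags;
		bool allowed;

		css = cgroup_get_e_css(memcg->css.cgroup, &cpuset_cgrp_subsys);
		if (!css)
			return true;

		cs = container_of(css, struct cpuset, css);
		spin_lock_irqsave(&callback_lock, flags);
		allowed = node_isset(nid, cs->effective_mems);
		spin_unlock_irqrestore(&callback_lock, flags);
		css_put(css);	/* drop the cgroup_get_e_css() ref */

		return allowed;
	}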

> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 2b2ab386cab5..04152ea1c03d 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -342,16 +342,22 @@ static void flush_reclaim_state(struct scan_control *sc)
>  	}
>  }
>  
> -static bool can_demote(int nid, struct scan_control *sc)
> +static bool can_demote(int nid, struct scan_control *sc,
> +		       struct mem_cgroup *memcg)
>  {
> +	int demotion_nid;
> +
>  	if (!numa_demotion_enabled)
>  		return false;
>  	if (sc && sc->no_demotion)
>  		return false;
> -	if (next_demotion_node(nid) == NUMA_NO_NODE)
> +
> +	demotion_nid = next_demotion_node(nid);
> +	if (demotion_nid == NUMA_NO_NODE)
>  		return false;
>  
> -	return true;
> +	/* If the demotion node isn't in mems_allowed, fall back to swap */
> +	return memcg ? memcg_mems_allowed(memcg, demotion_nid) : true;
>  }
>  
>  static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
> @@ -376,7 +382,7 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
>  	 *
>  	 * Can it be reclaimed from this node via demotion?
>  	 */
> -	return can_demote(nid, sc);
> +	return can_demote(nid, sc, NULL);

This has appropriate memcg context from get_scan_count(), use that.
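
can_reclaim_anon_pages() already takes the memcg as its first argument,
so (sketch) this could simply become:

	return can_demote(nid, sc, memcg);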

> @@ -2654,7 +2662,7 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
>  		return true;
>  
>  	/* Also valuable if anon pages can be demoted: */
> -	return can_demote(pgdat->node_id, sc);
> +	return can_demote(pgdat->node_id, sc, NULL);

Make this take an lruvec, then pass lruvec_memcg() to can_demote().

shrink_lruvec() already has the lruvec.

kswapd_age_node() has to do the test from inside the memcg loop, since
demotion and thus aging now very much depends on each cgroup's policy.
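
A sketch of what I mean, untested:

	static bool can_age_anon_pages(struct lruvec *lruvec,
				       struct scan_control *sc)
	{
		/* Aging the anon LRU is valuable if swap is present: */
		if (total_swap_pages > 0)
			return true;

		/* Also valuable if anon pages can be demoted: */
		return can_demote(lruvec_pgdat(lruvec)->node_id, sc,
				  lruvec_memcg(lruvec));
	}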

>  }
>  
>  #ifdef CONFIG_LRU_GEN
> @@ -2732,7 +2740,7 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
>  	if (!sc->may_swap)
>  		return 0;
>  
> -	if (!can_demote(pgdat->node_id, sc) &&
> +	if (!can_demote(pgdat->node_id, sc, NULL) &&
>  	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
>  		return 0;

MGLRU, so grain of salt, but that memcg looks appropriate to pass here.

> @@ -4695,7 +4703,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
>  	if (list_empty(&list))
>  		return scanned;
>  retry:
> -	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
> +	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, NULL);

This also seems to have appropriate lruvec/memcg context.
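
I.e., something like (sketch):

	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false,
				      lruvec_memcg(lruvec));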



* Re: [RFC PATCH] vmscan,cgroup: apply mems_effective to reclaim
  2025-03-20 21:09 [RFC PATCH] vmscan,cgroup: apply mems_effective to reclaim Gregory Price
  2025-04-08 18:17 ` Johannes Weiner
@ 2025-04-08 18:36 ` Waiman Long
  1 sibling, 0 replies; 3+ messages in thread
From: Waiman Long @ 2025-04-08 18:36 UTC (permalink / raw)
  To: Gregory Price, linux-mm
  Cc: cgroups, linux-kernel, kernel-team, tj, hannes, mkoutny, akpm

On 3/20/25 5:09 PM, Gregory Price wrote:
> It is possible for a reclaimer to cause demotions of an lruvec belonging
> to a cgroup with cpuset.mems set to exclude some nodes. Attempt to apply
> this limitation based on the lruvec's memcg to prevent such demotions.
>
> Notably, this may still allow demotion of shared libraries or any memory
> first instantiated in another cgroup. This means cpusets still cannot
> guarantee complete isolation when demotion is enabled, and the docs have
> been updated to reflect this.
>
>
> Note: This is a fairly hacked up method that probably overlooks some
>        cgroup/cpuset controls or designs. RFCing now for some discussion
>        at LSFMM '25.
>
>
> Signed-off-by: Gregory Price <gourry@gourry.net>
> ---
>   .../ABI/testing/sysfs-kernel-mm-numa          | 14 +++++---
>   include/linux/cpuset.h                        |  2 ++
>   kernel/cgroup/cpuset.c                        | 10 ++++++
>   mm/vmscan.c                                   | 32 ++++++++++++-------
>   4 files changed, 41 insertions(+), 17 deletions(-)
>
> diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa
> index 77e559d4ed80..27cdcab901f7 100644
> --- a/Documentation/ABI/testing/sysfs-kernel-mm-numa
> +++ b/Documentation/ABI/testing/sysfs-kernel-mm-numa
> @@ -16,9 +16,13 @@ Description:	Enable/disable demoting pages during reclaim
>   		Allowing page migration during reclaim enables these
>   		systems to migrate pages from fast tiers to slow tiers
>   		when the fast tier is under pressure.  This migration
> -		is performed before swap.  It may move data to a NUMA
> -		node that does not fall into the cpuset of the
> -		allocating process which might be construed to violate
> -		the guarantees of cpusets.  This should not be enabled
> -		on systems which need strict cpuset location
> +		is performed before swap if an eligible NUMA node is
> +		present in cpuset.mems for the cgroup. If cpuset.mems
> +		changes at runtime, demotion may move data to a NUMA
> +		node outside the new cpuset.mems, which might be
> +		construed to violate the guarantees of cpusets.  Shared
> +		memory, such as libraries, owned by another cgroup may
> +		still be demoted, resulting in memory use on a node not
> +		present in cpuset.mems. This should not be enabled on
> +		systems which need strict cpuset location
>   		guarantees.
> diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
> index 835e7b793f6a..d4169f1b1719 100644
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -171,6 +171,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
>   	task_unlock(current);
>   }
>   
> +bool memcg_mems_allowed(struct mem_cgroup *memcg, int nid);
> +
>   #else /* !CONFIG_CPUSETS */
>   
You should also define an inline function for the !CONFIG_CPUSETS case.
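
A minimal stub sketch for that branch:

	static inline bool memcg_mems_allowed(struct mem_cgroup *memcg, int nid)
	{
		return true;
	}
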
>   static inline bool cpusets_enabled(void) { return false; }
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 0f910c828973..bb9669cc105d 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -4296,3 +4296,13 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
>   	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
>   		   nodemask_pr_args(&task->mems_allowed));
>   }
> +
> +bool memcg_mems_allowed(struct mem_cgroup *memcg, int nid)
> +{
> +	struct cgroup_subsys_state *css;
> +	struct cpuset *cs;
> +
> +	css = cgroup_get_e_css(memcg->css.cgroup, &cpuset_cgrp_subsys);
> +	cs = css ? container_of(css, struct cpuset, css) : NULL;
> +	return cs ? node_isset(nid, cs->effective_mems) : true;

As said by Johannes, you will need to take the callback_lock to ensure 
the stability of effective_mems. I also second his suggestion of 
defining a cgroup_mems_allowed() here and doing the memcg to cgroup 
translation outside of cpuset.c.

Cheers,
Longman



