From: Gregory Price <gourry@gourry.net>
To: lsf-pc@lists.linux-foundation.org
Cc: linux-kernel@vger.kernel.org, linux-cxl@vger.kernel.org,
cgroups@vger.kernel.org, linux-mm@kvack.org,
linux-trace-kernel@vger.kernel.org, damon@lists.linux.dev,
kernel-team@meta.com, gregkh@linuxfoundation.org,
rafael@kernel.org, dakr@kernel.org, dave@stgolabs.net,
jonathan.cameron@huawei.com, dave.jiang@intel.com,
alison.schofield@intel.com, vishal.l.verma@intel.com,
ira.weiny@intel.com, dan.j.williams@intel.com,
longman@redhat.com, akpm@linux-foundation.org, david@kernel.org,
lorenzo.stoakes@oracle.com, Liam.Howlett@oracle.com,
vbabka@suse.cz, rppt@kernel.org, surenb@google.com,
mhocko@suse.com, osalvador@suse.de, ziy@nvidia.com,
matthew.brost@intel.com, joshua.hahnjy@gmail.com,
rakie.kim@sk.com, byungchul@sk.com, gourry@gourry.net,
ying.huang@linux.alibaba.com, apopple@nvidia.com,
axelrasmussen@google.com, yuanchu@google.com, weixugc@google.com,
yury.norov@gmail.com, linux@rasmusvillemoes.dk,
mhiramat@kernel.org, mathieu.desnoyers@efficios.com,
tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com,
jackmanb@google.com, sj@kernel.org,
baolin.wang@linux.alibaba.com, npache@redhat.com,
ryan.roberts@arm.com, dev.jain@arm.com, baohua@kernel.org,
lance.yang@linux.dev, muchun.song@linux.dev, xu.xin16@zte.com.cn,
chengming.zhou@linux.dev, jannh@google.com, linmiaohe@huawei.com,
nao.horiguchi@gmail.com, pfalcato@suse.de, rientjes@google.com,
shakeel.butt@linux.dev, riel@surriel.com, harry.yoo@oracle.com,
cl@gentwo.org, roman.gushchin@linux.dev, chrisl@kernel.org,
kasong@tencent.com, shikemeng@huaweicloud.com, nphamcs@gmail.com,
bhe@redhat.com, zhengqi.arch@bytedance.com, terry.bowman@amd.com
Subject: [RFC PATCH v4 14/27] mm/memory-tiers: NP_OPS_DEMOTION - support private node demotion
Date: Sun, 22 Feb 2026 03:48:29 -0500
Message-ID: <20260222084842.1824063-15-gourry@gourry.net>
In-Reply-To: <20260222084842.1824063-1-gourry@gourry.net>
The memory-tier subsystem needs to know which private nodes should
appear as demotion targets.
Add NP_OPS_DEMOTION (BIT(2)):
Node can be added as a demotion target by memory-tiers.
Add demotion backpressure support so private nodes can reject
new demotions cleanly, allowing vmscan to fall back to swap.
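As a rough illustration (not part of this patch), a service that owns a
private node could signal backpressure by flipping the new
migration_blocked field; can_demote() then refuses new demotions via
node_private_migration_blocked() and vmscan falls back to swap. The
helper name below is hypothetical:

	/*
	 * Minimal sketch; 'np' is assumed to be the struct node_private
	 * the service registered for its node in an earlier patch.
	 */
	static void my_service_set_backpressure(struct node_private *np,
						bool blocked)
	{
		/* Paired with READ_ONCE() in node_private_migration_blocked() */
		WRITE_ONCE(np->migration_blocked, blocked);
	}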
In the demotion path, try demotion to private nodes individually,
clearing each private node from the demotion target mask until a
non-private node is found, and then fall back to the remaining mask.
This prevents LRU inversion while still allowing forward progress.
This is the closest match to the current behavior without making
private nodes inaccessible or blocking forward progress. We should
probably rework the demotion logic entirely to fall back less
aggressively and kick kswapd instead - right now we induce LRU
inversions by simply falling back to any node in the demotion list.
Export memory_tier_refresh_demotion() so services can trigger
re-evaluation of demotion targets after changing their flags.
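For example (a hedged sketch, not part of this patch), a service opting
its private node into demotion might set NP_OPS_DEMOTION in its ops and
then ask memory-tiers to re-evaluate. The 'flags' member name is an
assumption based on node_private_has_flag(); see the earlier
node_private patches for the real layout:

	static const struct node_private_ops my_ops = {
		.flags = NP_OPS_MIGRATION | NP_OPS_DEMOTION, /* 'flags' name assumed */
		/* migrate_to() and other callbacks elided */
	};

	/* Fields other than .ops are assumed to be set up by registration. */
	static struct node_private my_np = {
		.ops = &my_ops,
	};

	static int my_service_enable_demotion(int nid)
	{
		int ret = node_private_register(nid, &my_np);

		if (ret)
			return ret;
		/* Have memory-tiers pick the node up as a demotion target. */
		memory_tier_refresh_demotion();
		return 0;
	}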
Signed-off-by: Gregory Price <gourry@gourry.net>
---
include/linux/memory-tiers.h | 9 +++++++
include/linux/node_private.h | 22 +++++++++++++++++
mm/internal.h | 7 ++++++
mm/memory-tiers.c | 46 ++++++++++++++++++++++++++++++++----
mm/page_alloc.c | 12 +++++++---
mm/vmscan.c | 30 ++++++++++++++++++++++-
6 files changed, 117 insertions(+), 9 deletions(-)
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 3e1159f6762c..e1476432e359 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -58,6 +58,7 @@ struct memory_dev_type *mt_get_memory_type(int adist);
int next_demotion_node(int node);
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
bool node_is_toptier(int node);
+void memory_tier_refresh_demotion(void);
#else
static inline int next_demotion_node(int node)
{
@@ -73,6 +74,10 @@ static inline bool node_is_toptier(int node)
{
return true;
}
+
+static inline void memory_tier_refresh_demotion(void)
+{
+}
#endif
#else
@@ -106,6 +111,10 @@ static inline bool node_is_toptier(int node)
return true;
}
+static inline void memory_tier_refresh_demotion(void)
+{
+}
+
static inline int register_mt_adistance_algorithm(struct notifier_block *nb)
{
return 0;
diff --git a/include/linux/node_private.h b/include/linux/node_private.h
index e9b58afa366b..e254e36056cd 100644
--- a/include/linux/node_private.h
+++ b/include/linux/node_private.h
@@ -88,6 +88,8 @@ struct node_private_ops {
#define NP_OPS_MIGRATION BIT(0)
/* Allow mempolicy-directed allocation and mbind migration to this node */
#define NP_OPS_MEMPOLICY BIT(1)
+/* Node participates as a demotion target in memory-tiers */
+#define NP_OPS_DEMOTION BIT(2)
/**
* struct node_private - Per-node container for N_MEMORY_PRIVATE nodes
@@ -101,12 +103,14 @@ struct node_private_ops {
* callbacks that may sleep; 0 = fully released)
* @released: Signaled when refcount drops to 0; unregister waits on this
* @ops: Service callbacks and exclusion flags (NULL until service registers)
+ * @migration_blocked: Service signals migrations should pause
*/
struct node_private {
void *owner;
refcount_t refcount;
struct completion released;
const struct node_private_ops *ops;
+ bool migration_blocked;
};
#ifdef CONFIG_NUMA
@@ -306,6 +310,19 @@ static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes)
}
return eligible;
}
+
+static inline bool node_private_migration_blocked(int nid)
+{
+ struct node_private *np;
+ bool blocked;
+
+ rcu_read_lock();
+ np = rcu_dereference(NODE_DATA(nid)->node_private);
+ blocked = np && READ_ONCE(np->migration_blocked);
+ rcu_read_unlock();
+
+ return blocked;
+}
#endif /* CONFIG_MEMORY_HOTPLUG */
#else /* !CONFIG_NUMA */
@@ -404,6 +421,11 @@ static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes)
return false;
}
+static inline bool node_private_migration_blocked(int nid)
+{
+ return false;
+}
+
static inline int node_private_register(int nid, struct node_private *np)
{
return -ENODEV;
diff --git a/mm/internal.h b/mm/internal.h
index 6ab4679fe943..5950e20d4023 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1206,6 +1206,8 @@ extern int node_reclaim_mode;
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
extern int find_next_best_node(int node, nodemask_t *used_node_mask);
+extern int find_next_best_node_in(int node, nodemask_t *used_node_mask,
+ const nodemask_t *candidates);
extern bool numa_zone_alloc_allowed(int alloc_flags, struct zone *zone,
gfp_t gfp_mask);
#else
@@ -1220,6 +1222,11 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
{
return NUMA_NO_NODE;
}
+static inline int find_next_best_node_in(int node, nodemask_t *used_node_mask,
+ const nodemask_t *candidates)
+{
+ return NUMA_NO_NODE;
+}
static inline bool numa_zone_alloc_allowed(int alloc_flags, struct zone *zone,
gfp_t gfp_mask)
{
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 9c742e18e48f..434190fdc078 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -3,6 +3,7 @@
#include <linux/lockdep.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
+#include <linux/node_private.h>
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include <linux/notifier.h>
@@ -380,6 +381,8 @@ static void disable_all_demotion_targets(void)
if (memtier)
memtier->lower_tier_mask = NODE_MASK_NONE;
}
+ for_each_node_state(node, N_MEMORY_PRIVATE)
+ node_demotion[node].preferred = NODE_MASK_NONE;
/*
* Ensure that the "disable" is visible across the system.
* Readers will see either a combination of before+disable
@@ -421,6 +424,7 @@ static void establish_demotion_targets(void)
int target = NUMA_NO_NODE, node;
int distance, best_distance;
nodemask_t tier_nodes, lower_tier;
+ nodemask_t all_memory;
lockdep_assert_held_once(&memory_tier_lock);
@@ -429,6 +433,13 @@ static void establish_demotion_targets(void)
disable_all_demotion_targets();
+ /* Include private nodes that have opted in to demotion. */
+ all_memory = node_states[N_MEMORY];
+ for_each_node_state(node, N_MEMORY_PRIVATE) {
+ if (node_private_has_flag(node, NP_OPS_DEMOTION))
+ node_set(node, all_memory);
+ }
+
for_each_node_state(node, N_MEMORY) {
best_distance = -1;
nd = &node_demotion[node];
@@ -442,12 +453,12 @@ static void establish_demotion_targets(void)
memtier = list_next_entry(memtier, list);
tier_nodes = get_memtier_nodemask(memtier);
/*
- * find_next_best_node, use 'used' nodemask as a skip list.
+ * find_next_best_node_in, use 'used' nodemask as a skip list.
* Add all memory nodes except the selected memory tier
* nodelist to skip list so that we find the best node from the
* memtier nodelist.
*/
- nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
+ nodes_andnot(tier_nodes, all_memory, tier_nodes);
/*
* Find all the nodes in the memory tier node list of same best distance.
@@ -455,7 +466,8 @@ static void establish_demotion_targets(void)
* in the preferred mask when allocating pages during demotion.
*/
do {
- target = find_next_best_node(node, &tier_nodes);
+ target = find_next_best_node_in(node, &tier_nodes,
+ &all_memory);
if (target == NUMA_NO_NODE)
break;
@@ -495,7 +507,7 @@ static void establish_demotion_targets(void)
* allocation to a set of nodes that is closer the above selected
* preferred node.
*/
- lower_tier = node_states[N_MEMORY];
+ lower_tier = all_memory;
list_for_each_entry(memtier, &memory_tiers, list) {
/*
* Keep removing current tier from lower_tier nodes,
@@ -542,7 +554,7 @@ static struct memory_tier *set_node_memory_tier(int node)
lockdep_assert_held_once(&memory_tier_lock);
- if (!node_state(node, N_MEMORY))
+ if (!node_state(node, N_MEMORY) && !node_state(node, N_MEMORY_PRIVATE))
return ERR_PTR(-EINVAL);
mt_calc_adistance(node, &adist);
@@ -865,6 +877,30 @@ int mt_calc_adistance(int node, int *adist)
}
EXPORT_SYMBOL_GPL(mt_calc_adistance);
+/**
+ * memory_tier_refresh_demotion() - Re-establish demotion targets
+ *
+ * Called by services after registering or unregistering ops->migrate_to on
+ * a private node, so that establish_demotion_targets() picks up the change.
+ */
+void memory_tier_refresh_demotion(void)
+{
+ int nid;
+
+ mutex_lock(&memory_tier_lock);
+ /*
+ * Ensure private nodes are registered with a tier, otherwise
+ * they won't show up in any node's demotion targets nodemask.
+ */
+ for_each_node_state(nid, N_MEMORY_PRIVATE) {
+ if (!__node_get_memory_tier(nid))
+ set_node_memory_tier(nid);
+ }
+ establish_demotion_targets();
+ mutex_unlock(&memory_tier_lock);
+}
+EXPORT_SYMBOL_GPL(memory_tier_refresh_demotion);
+
static int __meminit memtier_hotplug_callback(struct notifier_block *self,
unsigned long action, void *_arg)
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ec6c1f8e85d8..e272dfdc6b00 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5589,7 +5589,8 @@ static int node_load[MAX_NUMNODES];
*
* Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
-int find_next_best_node(int node, nodemask_t *used_node_mask)
+int find_next_best_node_in(int node, nodemask_t *used_node_mask,
+ const nodemask_t *candidates)
{
int n, val;
int min_val = INT_MAX;
@@ -5599,12 +5600,12 @@ int find_next_best_node(int node, nodemask_t *used_node_mask)
* Use the local node if we haven't already, but for memoryless local
* node, we should skip it and fall back to other nodes.
*/
- if (!node_isset(node, *used_node_mask) && node_state(node, N_MEMORY)) {
+ if (!node_isset(node, *used_node_mask) && node_isset(node, *candidates)) {
node_set(node, *used_node_mask);
return node;
}
- for_each_node_state(n, N_MEMORY) {
+ for_each_node_mask(n, *candidates) {
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
@@ -5636,6 +5637,11 @@ int find_next_best_node(int node, nodemask_t *used_node_mask)
return best_node;
}
+int find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+ return find_next_best_node_in(node, used_node_mask,
+ &node_states[N_MEMORY]);
+}
/*
* Build zonelists ordered by node and zones within node.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6113be4d3519..0f534428ea88 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -58,6 +58,7 @@
#include <linux/random.h>
#include <linux/mmu_notifier.h>
#include <linux/parser.h>
+#include <linux/node_private.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -355,6 +356,10 @@ static bool can_demote(int nid, struct scan_control *sc,
if (demotion_nid == NUMA_NO_NODE)
return false;
+ /* Don't demote when the target's service signals backpressure */
+ if (node_private_migration_blocked(demotion_nid))
+ return false;
+
/* If demotion node isn't in the cgroup's mems_allowed, fall back */
return mem_cgroup_node_allowed(memcg, demotion_nid);
}
@@ -1022,8 +1027,10 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
struct pglist_data *pgdat)
{
int target_nid = next_demotion_node(pgdat->node_id);
- unsigned int nr_succeeded;
+ int first_nid = target_nid;
+ unsigned int nr_succeeded = 0;
nodemask_t allowed_mask;
+ int ret;
struct migration_target_control mtc = {
/*
@@ -1046,6 +1053,27 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
node_get_allowed_targets(pgdat, &allowed_mask);
+ /* Try private node targets until we find non-private node */
+ while (node_state(target_nid, N_MEMORY_PRIVATE)) {
+ unsigned int nr = 0;
+
+ ret = node_private_migrate_to(demote_folios, target_nid,
+ MIGRATE_ASYNC, MR_DEMOTION,
+ &nr);
+ nr_succeeded += nr;
+ if (ret == 0 || list_empty(demote_folios))
+ return nr_succeeded;
+
+ target_nid = next_node_in(target_nid, allowed_mask);
+ if (target_nid == first_nid)
+ return nr_succeeded;
+ if (!node_state(target_nid, N_MEMORY_PRIVATE))
+ break;
+ }
+
+ /* target_nid is a non-private node; use standard migration */
+ mtc.nid = target_nid;
+
/* Demotion ignores all cpuset and mempolicy settings */
migrate_pages(demote_folios, alloc_demote_folio, NULL,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
--
2.53.0