From: Gregory Price <gourry@gourry.net>
To: lsf-pc@lists.linux-foundation.org
Cc: linux-kernel@vger.kernel.org, linux-cxl@vger.kernel.org,
cgroups@vger.kernel.org, linux-mm@kvack.org,
linux-trace-kernel@vger.kernel.org, damon@lists.linux.dev,
kernel-team@meta.com, gregkh@linuxfoundation.org,
rafael@kernel.org, dakr@kernel.org, dave@stgolabs.net,
jonathan.cameron@huawei.com, dave.jiang@intel.com,
alison.schofield@intel.com, vishal.l.verma@intel.com,
ira.weiny@intel.com, dan.j.williams@intel.com,
longman@redhat.com, akpm@linux-foundation.org, david@kernel.org,
lorenzo.stoakes@oracle.com, Liam.Howlett@oracle.com,
vbabka@suse.cz, rppt@kernel.org, surenb@google.com,
mhocko@suse.com, osalvador@suse.de, ziy@nvidia.com,
matthew.brost@intel.com, joshua.hahnjy@gmail.com,
rakie.kim@sk.com, byungchul@sk.com, gourry@gourry.net,
ying.huang@linux.alibaba.com, apopple@nvidia.com,
axelrasmussen@google.com, yuanchu@google.com, weixugc@google.com,
yury.norov@gmail.com, linux@rasmusvillemoes.dk,
mhiramat@kernel.org, mathieu.desnoyers@efficios.com,
tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com,
jackmanb@google.com, sj@kernel.org,
baolin.wang@linux.alibaba.com, npache@redhat.com,
ryan.roberts@arm.com, dev.jain@arm.com, baohua@kernel.org,
lance.yang@linux.dev, muchun.song@linux.dev, xu.xin16@zte.com.cn,
chengming.zhou@linux.dev, jannh@google.com, linmiaohe@huawei.com,
nao.horiguchi@gmail.com, pfalcato@suse.de, rientjes@google.com,
shakeel.butt@linux.dev, riel@surriel.com, harry.yoo@oracle.com,
cl@gentwo.org, roman.gushchin@linux.dev, chrisl@kernel.org,
kasong@tencent.com, shikemeng@huaweicloud.com, nphamcs@gmail.com,
bhe@redhat.com, zhengqi.arch@bytedance.com, terry.bowman@amd.com
Subject: [RFC PATCH v4 13/27] mm/mempolicy: NP_OPS_MEMPOLICY - support private node mempolicy
Date: Sun, 22 Feb 2026 03:48:28 -0500
Message-ID: <20260222084842.1824063-14-gourry@gourry.net>
In-Reply-To: <20260222084842.1824063-1-gourry@gourry.net>
Some private nodes want to let userland allocate from them directly via
set_mempolicy() and mbind() - but do not want to appear in the fallback
lists as normal allocable system memory.
Add an NP_OPS_MEMPOLICY flag, which requires NP_OPS_MIGRATION (since
mbind() can drive migrations). Private nodes are only allowed in a
policy nodemask if every private node in the mask supports
NP_OPS_MEMPOLICY; this prevents __GFP_PRIVATE from unlocking nodes
without NP_OPS_MEMPOLICY support. Add __GFP_PRIVATE to the mempolicy
migration sites so that moves to opted-in private nodes succeed.
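For illustration, a minimal sketch of how a private-node driver could
opt in. The helper and callback names (my_enable_mempolicy,
my_migrate_to, my_folio_migrate) are hypothetical; only
node_private_set_ops() and the node_private_ops fields come from this
series:

	#include <linux/node_private.h>

	/* Hypothetical driver helper: opt node 'nid' into mempolicy use. */
	static int my_enable_mempolicy(int nid)
	{
		static const struct node_private_ops my_ops = {
			/* NP_OPS_MEMPOLICY is rejected without NP_OPS_MIGRATION */
			.flags         = NP_OPS_MIGRATION | NP_OPS_MEMPOLICY,
			.migrate_to    = my_migrate_to,    /* hypothetical callback */
			.folio_migrate = my_folio_migrate, /* hypothetical callback */
		};

		/* Returns -EINVAL if the flag/callback requirements are not met. */
		return node_private_set_ops(nid, &my_ops);
	}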
Update the sysfs "has_memory" attribute to include N_MEMORY_PRIVATE
nodes with NP_OPS_MEMPOLICY set, allowing existing numactl userland
tools to work without modification.
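A hedged userland sketch follows (node id 2 is an assumption - it must
be a private node that advertises NP_OPS_MEMPOLICY and therefore shows
up in /sys/devices/system/node/has_memory); it uses the set_mempolicy()
wrapper from libnuma's <numaif.h>:

	#include <numaif.h>	/* set_mempolicy(), MPOL_BIND; link with -lnuma */
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		int nid = 2;			/* assumed private node id */
		unsigned long nodemask = 1UL << nid;
		char *buf;

		/*
		 * Bind this thread's future allocations to the private node.
		 * With this series, this fails with EINVAL if the node does
		 * not have NP_OPS_MEMPOLICY set.
		 */
		if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8)) {
			perror("set_mempolicy");
			return EXIT_FAILURE;
		}

		buf = malloc(1 << 20);
		if (buf)
			memset(buf, 0, 1 << 20);	/* fault pages in on the private node */
		return EXIT_SUCCESS;
	}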
Signed-off-by: Gregory Price <gourry@gourry.net>
---
drivers/base/node.c | 22 +++++++++++++-
include/linux/node_private.h | 40 +++++++++++++++++++++++++
include/uapi/linux/mempolicy.h | 1 +
mm/mempolicy.c | 54 ++++++++++++++++++++++++++++++----
mm/page_alloc.c | 5 ++++
5 files changed, 116 insertions(+), 6 deletions(-)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index e587f5781135..c08b5a948779 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -953,6 +953,10 @@ int node_private_set_ops(int nid, const struct node_private_ops *ops)
(!ops->migrate_to || !ops->folio_migrate))
return -EINVAL;
+ if ((ops->flags & NP_OPS_MEMPOLICY) &&
+ !(ops->flags & NP_OPS_MIGRATION))
+ return -EINVAL;
+
mutex_lock(&node_private_lock);
np = rcu_dereference_protected(NODE_DATA(nid)->node_private,
lockdep_is_held(&node_private_lock));
@@ -1145,6 +1149,21 @@ static ssize_t show_node_state(struct device *dev,
nodemask_pr_args(&node_states[na->state]));
}
+/* has_memory includes N_MEMORY + N_MEMORY_PRIVATE that support mempolicy. */
+static ssize_t show_has_memory(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ nodemask_t mask = node_states[N_MEMORY];
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY_PRIVATE) {
+ if (node_private_has_flag(nid, NP_OPS_MEMPOLICY))
+ node_set(nid, mask);
+ }
+
+ return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&mask));
+}
+
#define _NODE_ATTR(name, state) \
{ __ATTR(name, 0444, show_node_state, NULL), state }
@@ -1155,7 +1174,8 @@ static struct node_attr node_state_attr[] = {
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
#endif
- [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
+ [N_MEMORY] = { __ATTR(has_memory, 0444, show_has_memory, NULL),
+ N_MEMORY },
[N_MEMORY_PRIVATE] = _NODE_ATTR(has_private_memory, N_MEMORY_PRIVATE),
[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
[N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator,
diff --git a/include/linux/node_private.h b/include/linux/node_private.h
index 0c5be1ee6e60..e9b58afa366b 100644
--- a/include/linux/node_private.h
+++ b/include/linux/node_private.h
@@ -86,6 +86,8 @@ struct node_private_ops {
/* Allow user/kernel migration; requires migrate_to and folio_migrate */
#define NP_OPS_MIGRATION BIT(0)
+/* Allow mempolicy-directed allocation and mbind migration to this node */
+#define NP_OPS_MEMPOLICY BIT(1)
/**
* struct node_private - Per-node container for N_MEMORY_PRIVATE nodes
@@ -276,6 +278,34 @@ static inline int node_private_migrate_to(struct list_head *folios, int nid,
return ret;
}
+
+static inline bool node_mpol_eligible(int nid)
+{
+ bool ret;
+
+ if (!node_state(nid, N_MEMORY_PRIVATE))
+ return node_state(nid, N_MEMORY);
+
+ rcu_read_lock();
+ ret = node_private_has_flag(nid, NP_OPS_MEMPOLICY);
+ rcu_read_unlock();
+ return ret;
+}
+
+static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes)
+{
+ int nid;
+ bool eligible = false;
+
+ for_each_node_mask(nid, *nodes) {
+ if (!node_state(nid, N_MEMORY_PRIVATE))
+ continue;
+ if (!node_mpol_eligible(nid))
+ return false;
+ eligible = true;
+ }
+ return eligible;
+}
#endif /* CONFIG_MEMORY_HOTPLUG */
#else /* !CONFIG_NUMA */
@@ -364,6 +394,16 @@ static inline int node_private_migrate_to(struct list_head *folios, int nid,
return -ENODEV;
}
+static inline bool node_mpol_eligible(int nid)
+{
+ return false;
+}
+
+static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes)
+{
+ return false;
+}
+
static inline int node_private_register(int nid, struct node_private *np)
{
return -ENODEV;
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 8fbbe613611a..b606eae983c8 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -64,6 +64,7 @@ enum {
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
+#define MPOL_F_PRIVATE (1 << 5) /* policy targets private node; use __GFP_PRIVATE */
/*
* Enabling zone reclaim means the page allocator will attempt to fulfill
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2b0f9762d171..8ac014950e88 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -406,8 +406,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
- int ret;
-
/*
* Default (pol==NULL) resp. local memory policies are not a
* subject of any remapping. They also do not need any special
@@ -416,9 +414,12 @@ static int mpol_set_nodemask(struct mempolicy *pol,
if (!pol || pol->mode == MPOL_LOCAL)
return 0;
- /* Check N_MEMORY */
+ /* Check N_MEMORY and N_MEMORY_PRIVATE */
nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_MEMORY]);
+ nodes_and(nsc->mask2, cpuset_current_mems_allowed,
+ node_states[N_MEMORY_PRIVATE]);
+ nodes_or(nsc->mask1, nsc->mask1, nsc->mask2);
VM_BUG_ON(!nodes);
@@ -432,8 +433,13 @@ static int mpol_set_nodemask(struct mempolicy *pol,
else
pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
- ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
- return ret;
+ /* All private nodes in the mask must have NP_OPS_MEMPOLICY. */
+ if (nodes_private_mpol_allowed(&nsc->mask2))
+ pol->flags |= MPOL_F_PRIVATE;
+ else if (nodes_intersects(nsc->mask2, node_states[N_MEMORY_PRIVATE]))
+ return -EINVAL;
+
+ return mpol_ops[pol->mode].create(pol, &nsc->mask2);
}
/*
@@ -500,6 +506,7 @@ static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
nodemask_t tmp;
+ int nid;
if (pol->flags & MPOL_F_STATIC_NODES)
nodes_and(tmp, pol->w.user_nodemask, *nodes);
@@ -514,6 +521,21 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
if (nodes_empty(tmp))
tmp = *nodes;
+ /*
+ * Drop private nodes that don't have mempolicy support.
+ * cpusets guarantees at least one N_MEMORY node in effective_mems
+ * and mems_allowed, so dropping private nodes here is safe.
+ */
+ for_each_node_mask(nid, tmp) {
+ if (node_state(nid, N_MEMORY_PRIVATE) &&
+ !node_private_has_flag(nid, NP_OPS_MEMPOLICY))
+ node_clear(nid, tmp);
+ }
+ if (nodes_intersects(tmp, node_states[N_MEMORY_PRIVATE]))
+ pol->flags |= MPOL_F_PRIVATE;
+ else
+ pol->flags &= ~MPOL_F_PRIVATE;
+
pol->nodes = tmp;
}
@@ -661,6 +683,9 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
}
if (!queue_folio_required(folio, qp))
return;
+ if (folio_is_private_node(folio) &&
+ !folio_private_flags(folio, NP_OPS_MIGRATION))
+ return;
if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
!vma_migratable(walk->vma) ||
!migrate_folio_add(folio, qp->pagelist, qp->flags))
@@ -717,6 +742,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
+ if (folio_is_private_node(folio) &&
+ !folio_private_flags(folio, NP_OPS_MIGRATION))
+ continue;
if (folio_test_large(folio) && max_nr != 1)
nr = folio_pte_batch(folio, pte, ptent, max_nr);
/*
@@ -1451,6 +1479,9 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src,
else
gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
+ if (pol->flags & MPOL_F_PRIVATE)
+ gfp |= __GFP_PRIVATE;
+
return folio_alloc_mpol(gfp, order, pol, ilx, nid);
}
#else
@@ -2280,6 +2311,15 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
nodemask = &pol->nodes;
if (pol->home_node != NUMA_NO_NODE)
*nid = pol->home_node;
+ else if ((pol->flags & MPOL_F_PRIVATE) &&
+ !node_isset(*nid, pol->nodes)) {
+ /*
+ * Private nodes are not in N_MEMORY nodes' zonelists.
+ * When the preferred nid (usually numa_node_id()) can't
+ * reach the policy nodes, start from a policy node.
+ */
+ *nid = first_node(pol->nodes);
+ }
/*
* __GFP_THISNODE shouldn't even be used with the bind policy
* because we might easily break the expectation to stay on the
@@ -2533,6 +2573,10 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
gfp |= __GFP_NOWARN;
pol = get_vma_policy(vma, addr, order, &ilx);
+
+ if (pol->flags & MPOL_F_PRIVATE)
+ gfp |= __GFP_PRIVATE;
+
folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
mpol_cond_put(pol);
return folio;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5a1b35421d78..ec6c1f8e85d8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3849,8 +3849,13 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
* if another process has NUMA bindings and is causing
* kswapd wakeups on only some nodes. Avoid accidental
* "node_reclaim_mode"-like behavior in this case.
+ *
+ * Never skip nodes without kswapd (some private nodes);
+ * skipping them would make some mempolicies silently fall
+ * back to DRAM even when the node is eligible.
*/
if (skip_kswapd_nodes &&
+ zone->zone_pgdat->kswapd &&
!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) {
skipped_kswapd_nodes = true;
continue;
--
2.53.0
Thread overview: 28+ messages
2026-02-22 8:48 [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM) Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 01/27] numa: introduce N_MEMORY_PRIVATE node state Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 02/27] mm,cpuset: gate allocations from N_MEMORY_PRIVATE behind __GFP_PRIVATE Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 03/27] mm/page_alloc: add numa_zone_allowed() and wire it up Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 04/27] mm/page_alloc: Add private node handling to build_zonelists Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 05/27] mm: introduce folio_is_private_managed() unified predicate Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 06/27] mm/mlock: skip mlock for managed-memory folios Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 07/27] mm/madvise: skip madvise " Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 08/27] mm/ksm: skip KSM " Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 09/27] mm/khugepaged: skip private node folios when trying to collapse Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 10/27] mm/swap: add free_folio callback for folio release cleanup Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 11/27] mm/huge_memory.c: add private node folio split notification callback Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 12/27] mm/migrate: NP_OPS_MIGRATION - support private node user migration Gregory Price
2026-02-22 8:48 ` Gregory Price [this message]
2026-02-22 8:48 ` [RFC PATCH v4 14/27] mm/memory-tiers: NP_OPS_DEMOTION - support private node demotion Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 15/27] mm/mprotect: NP_OPS_PROTECT_WRITE - gate PTE/PMD write-upgrades Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 16/27] mm: NP_OPS_RECLAIM - private node reclaim participation Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 17/27] mm/oom: NP_OPS_OOM_ELIGIBLE - private node OOM participation Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 18/27] mm/memory: NP_OPS_NUMA_BALANCING - private node NUMA balancing Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 19/27] mm/compaction: NP_OPS_COMPACTION - private node compaction support Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 20/27] mm/gup: NP_OPS_LONGTERM_PIN - private node longterm pin support Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 21/27] mm/memory-failure: add memory_failure callback to node_private_ops Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 22/27] mm/memory_hotplug: add add_private_memory_driver_managed() Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 23/27] mm/cram: add compressed ram memory management subsystem Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 24/27] cxl/core: Add cxl_sysram region type Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 25/27] cxl/core: Add private node support to cxl_sysram Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 26/27] cxl: add cxl_mempolicy sample PCI driver Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 27/27] cxl: add cxl_compression " Gregory Price