From: Gregory Price <gourry@gourry.net>
To: lsf-pc@lists.linux-foundation.org
Cc: linux-kernel@vger.kernel.org, linux-cxl@vger.kernel.org,
	cgroups@vger.kernel.org, linux-mm@kvack.org,
	linux-trace-kernel@vger.kernel.org, damon@lists.linux.dev,
	kernel-team@meta.com, gregkh@linuxfoundation.org,
	rafael@kernel.org, dakr@kernel.org, dave@stgolabs.net,
	jonathan.cameron@huawei.com, dave.jiang@intel.com,
	alison.schofield@intel.com, vishal.l.verma@intel.com,
	ira.weiny@intel.com, dan.j.williams@intel.com,
	longman@redhat.com, akpm@linux-foundation.org, david@kernel.org,
	lorenzo.stoakes@oracle.com, Liam.Howlett@oracle.com,
	vbabka@suse.cz, rppt@kernel.org, surenb@google.com,
	mhocko@suse.com, osalvador@suse.de, ziy@nvidia.com,
	matthew.brost@intel.com, joshua.hahnjy@gmail.com,
	rakie.kim@sk.com, byungchul@sk.com, gourry@gourry.net,
	ying.huang@linux.alibaba.com, apopple@nvidia.com,
	axelrasmussen@google.com, yuanchu@google.com, weixugc@google.com,
	yury.norov@gmail.com, linux@rasmusvillemoes.dk,
	mhiramat@kernel.org, mathieu.desnoyers@efficios.com,
	tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com,
	jackmanb@google.com, sj@kernel.org,
	baolin.wang@linux.alibaba.com, npache@redhat.com,
	ryan.roberts@arm.com, dev.jain@arm.com, baohua@kernel.org,
	lance.yang@linux.dev, muchun.song@linux.dev, xu.xin16@zte.com.cn,
	chengming.zhou@linux.dev, jannh@google.com, linmiaohe@huawei.com,
	nao.horiguchi@gmail.com, pfalcato@suse.de, rientjes@google.com,
	shakeel.butt@linux.dev, riel@surriel.com, harry.yoo@oracle.com,
	cl@gentwo.org, roman.gushchin@linux.dev, chrisl@kernel.org,
	kasong@tencent.com, shikemeng@huaweicloud.com, nphamcs@gmail.com,
	bhe@redhat.com, zhengqi.arch@bytedance.com, terry.bowman@amd.com
Subject: [RFC PATCH v4 13/27] mm/mempolicy: NP_OPS_MEMPOLICY - support private node mempolicy
Date: Sun, 22 Feb 2026 03:48:28 -0500	[thread overview]
Message-ID: <20260222084842.1824063-14-gourry@gourry.net> (raw)
In-Reply-To: <20260222084842.1824063-1-gourry@gourry.net>

Some private nodes want userland to be able to allocate from them
directly via set_mempolicy() and mbind(), but do not want to appear
as normal allocable system memory in the zonelist fallbacks.
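
A minimal userland sketch of the intended usage (illustrative only:
the node number, the MPOL_BIND mode, and libnuma's <numaif.h>
wrappers are assumptions, not part of this patch):

	/*
	 * Bind an anonymous mapping to private node 2.  With this patch,
	 * mbind() fails with EINVAL unless node 2's driver has set
	 * NP_OPS_MEMPOLICY (build/link with -lnuma).
	 */
	#include <numaif.h>
	#include <sys/mman.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned long mask = 1UL << 2;	/* nodemask containing only node 2 */
		size_t len = 2UL << 20;		/* 2MiB */
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		if (mbind(p, len, MPOL_BIND, &mask, sizeof(mask) * 8 + 1,
			  MPOL_MF_MOVE))
			perror("mbind");
		return 0;
	}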

Add an NP_OPS_MEMPOLICY flag that requires NP_OPS_MIGRATION (since
mbind() can drive migrations).  Only allow private nodes in a policy
nodemask if every private node in the mask supports NP_OPS_MEMPOLICY;
this prevents __GFP_PRIVATE from unlocking nodes that have not opted
in.
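
A driver-side sketch of opting in (the callback names and surrounding
driver code are placeholders; only the flag relationship reflects this
patch):

	static const struct node_private_ops my_ops = {
		/* NP_OPS_MEMPOLICY is rejected unless NP_OPS_MIGRATION is also set */
		.flags		= NP_OPS_MIGRATION | NP_OPS_MEMPOLICY,
		.migrate_to	= my_migrate_to,	/* required by NP_OPS_MIGRATION */
		.folio_migrate	= my_folio_migrate,	/* required by NP_OPS_MIGRATION */
	};

	/* ... during private node setup ... */
	ret = node_private_set_ops(nid, &my_ops);	/* -EINVAL on inconsistent flags */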

Add __GFP_PRIVATE at mempolicy allocation and migration sites so that
allocations and moves targeting opted-in private nodes succeed.

Update the sysfs "has_memory" attribute to include N_MEMORY_PRIVATE
nodes with NP_OPS_MEMPOLICY set, allowing existing numactl userland
tools to work without modification.
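For example, on a hypothetical system where nodes 0-1 are ordinary
DRAM and node 2 is a private node whose driver set NP_OPS_MEMPOLICY,
"has_memory" now reads "0-2" while "has_private_memory" still reads
"2".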

Signed-off-by: Gregory Price <gourry@gourry.net>
---
 drivers/base/node.c            | 22 +++++++++++++-
 include/linux/node_private.h   | 40 +++++++++++++++++++++++++
 include/uapi/linux/mempolicy.h |  1 +
 mm/mempolicy.c                 | 54 ++++++++++++++++++++++++++++++----
 mm/page_alloc.c                |  5 ++++
 5 files changed, 116 insertions(+), 6 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index e587f5781135..c08b5a948779 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -953,6 +953,10 @@ int node_private_set_ops(int nid, const struct node_private_ops *ops)
 	    (!ops->migrate_to || !ops->folio_migrate))
 		return -EINVAL;
 
+	if ((ops->flags & NP_OPS_MEMPOLICY) &&
+	    !(ops->flags & NP_OPS_MIGRATION))
+		return -EINVAL;
+
 	mutex_lock(&node_private_lock);
 	np = rcu_dereference_protected(NODE_DATA(nid)->node_private,
 				       lockdep_is_held(&node_private_lock));
@@ -1145,6 +1149,21 @@ static ssize_t show_node_state(struct device *dev,
 			  nodemask_pr_args(&node_states[na->state]));
 }
 
+/* has_memory includes N_MEMORY plus N_MEMORY_PRIVATE nodes that support mempolicy. */
+static ssize_t show_has_memory(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	nodemask_t mask = node_states[N_MEMORY];
+	int nid;
+
+	for_each_node_state(nid, N_MEMORY_PRIVATE) {
+		if (node_private_has_flag(nid, NP_OPS_MEMPOLICY))
+			node_set(nid, mask);
+	}
+
+	return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&mask));
+}
+
 #define _NODE_ATTR(name, state) \
 	{ __ATTR(name, 0444, show_node_state, NULL), state }
 
@@ -1155,7 +1174,8 @@ static struct node_attr node_state_attr[] = {
 #ifdef CONFIG_HIGHMEM
 	[N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
 #endif
-	[N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
+	[N_MEMORY] = { __ATTR(has_memory, 0444, show_has_memory, NULL),
+		       N_MEMORY },
 	[N_MEMORY_PRIVATE] = _NODE_ATTR(has_private_memory, N_MEMORY_PRIVATE),
 	[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
 	[N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator,
diff --git a/include/linux/node_private.h b/include/linux/node_private.h
index 0c5be1ee6e60..e9b58afa366b 100644
--- a/include/linux/node_private.h
+++ b/include/linux/node_private.h
@@ -86,6 +86,8 @@ struct node_private_ops {
 
 /* Allow user/kernel migration; requires migrate_to and folio_migrate */
 #define NP_OPS_MIGRATION		BIT(0)
+/* Allow mempolicy-directed allocation and mbind migration to this node */
+#define NP_OPS_MEMPOLICY		BIT(1)
 
 /**
  * struct node_private - Per-node container for N_MEMORY_PRIVATE nodes
@@ -276,6 +278,34 @@ static inline int node_private_migrate_to(struct list_head *folios, int nid,
 
 	return ret;
 }
+
+static inline bool node_mpol_eligible(int nid)
+{
+	bool ret;
+
+	if (!node_state(nid, N_MEMORY_PRIVATE))
+		return node_state(nid, N_MEMORY);
+
+	rcu_read_lock();
+	ret = node_private_has_flag(nid, NP_OPS_MEMPOLICY);
+	rcu_read_unlock();
+	return ret;
+}
+
+static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes)
+{
+	int nid;
+	bool eligible = false;
+
+	for_each_node_mask(nid, *nodes) {
+		if (!node_state(nid, N_MEMORY_PRIVATE))
+			continue;
+		if (!node_mpol_eligible(nid))
+			return false;
+		eligible = true;
+	}
+	return eligible;
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #else /* !CONFIG_NUMA */
@@ -364,6 +394,16 @@ static inline int node_private_migrate_to(struct list_head *folios, int nid,
 	return -ENODEV;
 }
 
+static inline bool node_mpol_eligible(int nid)
+{
+	return false;
+}
+
+static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes)
+{
+	return false;
+}
+
 static inline int node_private_register(int nid, struct node_private *np)
 {
 	return -ENODEV;
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 8fbbe613611a..b606eae983c8 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -64,6 +64,7 @@ enum {
 #define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
 #define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */
 #define MPOL_F_MORON	(1 << 4) /* Migrate On protnone Reference On Node */
+#define MPOL_F_PRIVATE	(1 << 5) /* policy targets private node; use __GFP_PRIVATE */
 
 /*
  * Enabling zone reclaim means the page allocator will attempt to fulfill
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2b0f9762d171..8ac014950e88 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -406,8 +406,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 static int mpol_set_nodemask(struct mempolicy *pol,
 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 {
-	int ret;
-
 	/*
 	 * Default (pol==NULL) resp. local memory policies are not a
 	 * subject of any remapping. They also do not need any special
@@ -416,9 +414,12 @@ static int mpol_set_nodemask(struct mempolicy *pol,
 	if (!pol || pol->mode == MPOL_LOCAL)
 		return 0;
 
-	/* Check N_MEMORY */
+	/* Check N_MEMORY and N_MEMORY_PRIVATE */
 	nodes_and(nsc->mask1,
 		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
+	nodes_and(nsc->mask2, cpuset_current_mems_allowed,
+		  node_states[N_MEMORY_PRIVATE]);
+	nodes_or(nsc->mask1, nsc->mask1, nsc->mask2);
 
 	VM_BUG_ON(!nodes);
 
@@ -432,8 +433,13 @@ static int mpol_set_nodemask(struct mempolicy *pol,
 	else
 		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
 
-	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
-	return ret;
+	/* All private nodes in the mask must have NP_OPS_MEMPOLICY. */
+	if (nodes_private_mpol_allowed(&nsc->mask2))
+		pol->flags |= MPOL_F_PRIVATE;
+	else if (nodes_intersects(nsc->mask2, node_states[N_MEMORY_PRIVATE]))
+		return -EINVAL;
+
+	return mpol_ops[pol->mode].create(pol, &nsc->mask2);
 }
 
 /*
@@ -500,6 +506,7 @@ static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 {
 	nodemask_t tmp;
+	int nid;
 
 	if (pol->flags & MPOL_F_STATIC_NODES)
 		nodes_and(tmp, pol->w.user_nodemask, *nodes);
@@ -514,6 +521,21 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 	if (nodes_empty(tmp))
 		tmp = *nodes;
 
+	/*
+	 * Drop private nodes that don't have mempolicy support.
+	 * cpusets guarantees at least one N_MEMORY node in effective_mems
+	 * and mems_allowed, so dropping private nodes here is safe.
+	 */
+	for_each_node_mask(nid, tmp) {
+		if (node_state(nid, N_MEMORY_PRIVATE) &&
+		    !node_private_has_flag(nid, NP_OPS_MEMPOLICY))
+			node_clear(nid, tmp);
+	}
+	if (nodes_intersects(tmp, node_states[N_MEMORY_PRIVATE]))
+		pol->flags |= MPOL_F_PRIVATE;
+	else
+		pol->flags &= ~MPOL_F_PRIVATE;
+
 	pol->nodes = tmp;
 }
 
@@ -661,6 +683,9 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
 	}
 	if (!queue_folio_required(folio, qp))
 		return;
+	if (folio_is_private_node(folio) &&
+	    !folio_private_flags(folio, NP_OPS_MIGRATION))
+		return;
 	if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
 	    !vma_migratable(walk->vma) ||
 	    !migrate_folio_add(folio, qp->pagelist, qp->flags))
@@ -717,6 +742,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
 		folio = vm_normal_folio(vma, addr, ptent);
 		if (!folio || folio_is_zone_device(folio))
 			continue;
+		if (folio_is_private_node(folio) &&
+		    !folio_private_flags(folio, NP_OPS_MIGRATION))
+			continue;
 		if (folio_test_large(folio) && max_nr != 1)
 			nr = folio_pte_batch(folio, pte, ptent, max_nr);
 		/*
@@ -1451,6 +1479,9 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src,
 	else
 		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
 
+	if (pol->flags & MPOL_F_PRIVATE)
+		gfp |= __GFP_PRIVATE;
+
 	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
 }
 #else
@@ -2280,6 +2311,15 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
 			nodemask = &pol->nodes;
 		if (pol->home_node != NUMA_NO_NODE)
 			*nid = pol->home_node;
+		else if ((pol->flags & MPOL_F_PRIVATE) &&
+			 !node_isset(*nid, pol->nodes)) {
+			/*
+			 * Private nodes are not in N_MEMORY nodes' zonelists.
+			 * When the preferred nid (usually numa_node_id()) can't
+			 * reach the policy nodes, start from a policy node.
+			 */
+			*nid = first_node(pol->nodes);
+		}
 		/*
 		 * __GFP_THISNODE shouldn't even be used with the bind policy
 		 * because we might easily break the expectation to stay on the
@@ -2533,6 +2573,10 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
 		gfp |= __GFP_NOWARN;
 
 	pol = get_vma_policy(vma, addr, order, &ilx);
+
+	if (pol->flags & MPOL_F_PRIVATE)
+		gfp |= __GFP_PRIVATE;
+
 	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
 	mpol_cond_put(pol);
 	return folio;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5a1b35421d78..ec6c1f8e85d8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3849,8 +3849,13 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 		 * if another process has NUMA bindings and is causing
 		 * kswapd wakeups on only some nodes. Avoid accidental
 		 * "node_reclaim_mode"-like behavior in this case.
+		 *
+		 * Nodes without kswapd (some private nodes) must never be
+		 * skipped here: skipping them would make some mempolicies
+		 * silently fall back to DRAM even though the node is eligible.
 		 */
 		if (skip_kswapd_nodes &&
+		    zone->zone_pgdat->kswapd &&
 		    !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) {
 			skipped_kswapd_nodes = true;
 			continue;
-- 
2.53.0



