From: Gregory Price <gourry@gourry.net>
To: linux-mm@kvack.org, cgroups@vger.kernel.org, linux-cxl@vger.kernel.org
Cc: linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-fsdevel@vger.kernel.org, kernel-team@meta.com,
longman@redhat.com, tj@kernel.org, hannes@cmpxchg.org,
mkoutny@suse.com, corbet@lwn.net, gregkh@linuxfoundation.org,
rafael@kernel.org, dakr@kernel.org, dave@stgolabs.net,
jonathan.cameron@huawei.com, dave.jiang@intel.com,
alison.schofield@intel.com, vishal.l.verma@intel.com,
ira.weiny@intel.com, dan.j.williams@intel.com,
akpm@linux-foundation.org, vbabka@suse.cz, surenb@google.com,
mhocko@suse.com, jackmanb@google.com, ziy@nvidia.com,
david@kernel.org, lorenzo.stoakes@oracle.com,
Liam.Howlett@oracle.com, rppt@kernel.org,
axelrasmussen@google.com, yuanchu@google.com, weixugc@google.com,
yury.norov@gmail.com, linux@rasmusvillemoes.dk,
rientjes@google.com, shakeel.butt@linux.dev, chrisl@kernel.org,
kasong@tencent.com, shikemeng@huaweicloud.com, nphamcs@gmail.com,
bhe@redhat.com, baohua@kernel.org, yosry.ahmed@linux.dev,
chengming.zhou@linux.dev, roman.gushchin@linux.dev,
muchun.song@linux.dev, osalvador@suse.de,
matthew.brost@intel.com, joshua.hahnjy@gmail.com,
rakie.kim@sk.com, byungchul@sk.com, gourry@gourry.net,
ying.huang@linux.alibaba.com, apopple@nvidia.com, cl@gentwo.org,
harry.yoo@oracle.com, zhengqi.arch@bytedance.com,
Balbir Singh <bsingharora@gmail.com>
Subject: [RFC PATCH v3 1/8] numa,memory_hotplug: create N_PRIVATE (Private Nodes)
Date: Thu, 8 Jan 2026 15:37:48 -0500
Message-ID: <20260108203755.1163107-2-gourry@gourry.net>
In-Reply-To: <20260108203755.1163107-1-gourry@gourry.net>
N_MEMORY nodes are intended to contain general System RAM. Today, some
device drivers hotplug their memory (marked Specific Purpose or Reserved)
to get access to mm/ services, but don't intend it for general consumption.
This creates reliability issues as there are no isolation guarantees.
Create N_PRIVATE for memory nodes whose memory is not intended for
general consumption. This state is mutually exclusive with N_MEMORY.
This will allow existing service code (like page_alloc.c) to manage
N_PRIVATE nodes without exposing N_MEMORY users to that memory.
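Because the two states are mutually exclusive, existing iterators over
N_MEMORY naturally skip private nodes. A minimal sketch of the intended
split (hypothetical helper names, for illustration only):

	int nid;

	/* General-purpose RAM: unchanged behavior for existing users */
	for_each_node_state(nid, N_MEMORY)
		add_to_general_zonelists(nid);	/* hypothetical */

	/* Device-managed memory: only opted-in services see these nodes */
	for_each_node_state(nid, N_PRIVATE)
		add_to_private_pool(nid);	/* hypothetical */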
Add `node_register_private()` for device drivers to call to mark a node
as private prior to hotplugging memory. This fails if the node already
has N_MEMORY set, regardless of its online state.
Private nodes must have a memory type so that conflicting registrations
by multiple drivers trying to online private memory onto the same node
are detected and rejected.
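For illustration, a driver might wire this up roughly as follows. This
is a sketch only: the driver name and validation helper are hypothetical,
and it assumes the memory is added via add_memory_driver_managed():

	static int mydev_page_allocated(struct page *page, void *data)
	{
		/* hypothetical check that the device still backs this page */
		return mydev_page_ok(data, page_to_phys(page)) ? 0 : -EIO;
	}

	static struct private_node_ops mydev_ops = {
		.memtype	= NODE_MEM_ACCELERATOR,
		.page_allocated	= mydev_page_allocated,	/* may be NULL */
		.page_freed	= NULL,		/* no free-side tracking */
	};

	static int mydev_add_memory(void *dev, int nid,
				    resource_size_t start, resource_size_t size)
	{
		int rc;

		mydev_ops.res_start = start;
		mydev_ops.res_end = start + size - 1;	/* inclusive */
		mydev_ops.data = dev;

		/* mark the node private before any of its memory onlines */
		rc = node_register_private(nid, &mydev_ops);
		if (rc)
			return rc;

		rc = add_memory_driver_managed(nid, start, size,
					       "System RAM (mydev)",
					       MHP_MERGE_RESOURCE);
		if (rc)
			node_unregister_private(nid, &mydev_ops);
		return rc;
	}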
Suggested-by: David Hildenbrand <david@kernel.org>
Suggested-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Gregory Price <gourry@gourry.net>
---
drivers/base/node.c | 199 +++++++++++++++++++++++++++++++++++++++
include/linux/node.h | 60 ++++++++++++
include/linux/nodemask.h | 1 +
mm/memory_hotplug.c | 2 +-
4 files changed, 261 insertions(+), 1 deletion(-)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 00cf4532f121..b503782ea109 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -861,6 +861,193 @@ void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn,
(void *)&nid, register_mem_block_under_node_hotplug);
return;
}
+
+static enum private_memtype *private_nodes;
+/* Per-node list of private node operations callbacks */
+static struct list_head private_node_ops_list[MAX_NUMNODES];
+static DEFINE_MUTEX(private_node_ops_lock);
+static bool private_node_ops_initialized;
+
+/*
+ * Note: private_node_ops_list is initialized in node_dev_init() before
+ * any calls to node_register_private() can occur.
+ */
+
+/**
+ * node_register_private - Mark a node as private and register ops
+ * @nid: Node identifier
+ * @ops: Callback operations structure (required, but callbacks may be NULL)
+ *
+ * Mark a node as private and register the given ops structure. The ops
+ * structure must have res_start and res_end set to the physical address
+ * range covered by this registration, and memtype set to the private
+ * memory type. Multiple registrations for the same node are allowed as
+ * long as they have the same memtype.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int node_register_private(int nid, struct private_node_ops *ops)
+{
+ int rc = 0;
+ enum private_memtype ctype;
+ enum private_memtype type;
+
+ if (!ops)
+ return -EINVAL;
+
+ type = ops->memtype;
+
+ if (!node_possible(nid) || !private_nodes ||
+     type <= NODE_MEM_NOTYPE || type >= NODE_MAX_MEMTYPE)
+ return -EINVAL;
+
+ /* Validate resource bounds */
+ if (ops->res_start > ops->res_end)
+ return -EINVAL;
+
+ mutex_lock(&private_node_ops_lock);
+
+ /* hotplug lock must be held while checking online/node state */
+ mem_hotplug_begin();
+
+ /*
+ * N_PRIVATE and N_MEMORY are mutually exclusive. Fail if the node
+ * already has N_MEMORY set, regardless of online state.
+ */
+ if (node_state(nid, N_MEMORY)) {
+ rc = -EBUSY;
+ goto out;
+ }
+
+ ctype = private_nodes[nid];
+ if (ctype > NODE_MEM_NOTYPE && ctype != type) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* Initialize the ops list entry and add to the node's list */
+ INIT_LIST_HEAD(&ops->list);
+ list_add_tail_rcu(&ops->list, &private_node_ops_list[nid]);
+
+ private_nodes[nid] = type;
+ node_set_state(nid, N_PRIVATE);
+out:
+ mem_hotplug_done();
+ mutex_unlock(&private_node_ops_lock);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(node_register_private);
+
+/**
+ * node_unregister_private - Unregister ops and potentially unmark node as private
+ * @nid: Node identifier
+ * @ops: Callback operations structure to remove
+ *
+ * Remove the given ops structure from the node's ops list. If this is
+ * the last ops structure for the node and the node is offline, the
+ * node is unmarked as private.
+ */
+void node_unregister_private(int nid, struct private_node_ops *ops)
+{
+ if (!node_possible(nid) || !private_nodes || !ops)
+ return;
+
+ mutex_lock(&private_node_ops_lock);
+ mem_hotplug_begin();
+
+ list_del_rcu(&ops->list);
+ /* If list is now empty, clear private state */
+ if (list_empty(&private_node_ops_list[nid])) {
+ private_nodes[nid] = NODE_MEM_NOTYPE;
+ node_clear_state(nid, N_PRIVATE);
+ }
+
+ mem_hotplug_done();
+ mutex_unlock(&private_node_ops_lock);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(node_unregister_private);
+
+/**
+ * node_private_allocated - Validate a page allocation from a private node
+ * @page: The allocated page
+ *
+ * Find the ops structure whose region contains the page's physical address
+ * and call its page_allocated callback if one is registered.
+ *
+ * Return:
+ * 0 if the callback succeeds or no callback is registered for this region
+ * -ENXIO if the page is not found in any registered region
+ * Other negative error code if the callback indicates the page is not safe
+ */
+int node_private_allocated(struct page *page)
+{
+ struct private_node_ops *ops;
+ phys_addr_t page_phys;
+ int nid = page_to_nid(page);
+ int ret = -ENXIO;
+
+ if (nid >= MAX_NUMNODES || !node_possible(nid))
+ return -ENXIO;
+
+ if (!private_node_ops_initialized)
+ return -ENXIO;
+
+ page_phys = page_to_phys(page);
+
+ /*
+ * Use RCU to safely traverse the list without holding locks.
+ * Writers use list_add_tail_rcu/list_del_rcu with synchronize_rcu()
+ * to ensure safe concurrent access.
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &private_node_ops_list[nid], list) {
+ if (page_phys >= ops->res_start && page_phys <= ops->res_end) {
+ if (ops->page_allocated)
+ ret = ops->page_allocated(page, ops->data);
+ else
+ ret = 0;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(node_private_allocated);
+
+/**
+ * node_private_freed - Notify that a page from a private node is being freed
+ * @page: The page being freed
+ *
+ * Find the ops structure whose region contains the page's physical address
+ * and call its page_freed callback if one is registered.
+ */
+void node_private_freed(struct page *page)
+{
+ struct private_node_ops *ops;
+ phys_addr_t page_phys;
+ int nid = page_to_nid(page);
+
+ if (nid >= MAX_NUMNODES || !node_possible(nid))
+ return;
+
+ if (!private_node_ops_initialized)
+ return;
+
+ page_phys = page_to_phys(page);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &private_node_ops_list[nid], list) {
+ if (page_phys >= ops->res_start && page_phys <= ops->res_end) {
+ if (ops->page_freed)
+ ops->page_freed(page, ops->data);
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(node_private_freed);
+
#endif /* CONFIG_MEMORY_HOTPLUG */
/**
@@ -959,6 +1146,7 @@ static struct node_attr node_state_attr[] = {
[N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
#endif
[N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
+ [N_PRIVATE] = _NODE_ATTR(has_private_memory, N_PRIVATE),
[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
[N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator,
N_GENERIC_INITIATOR),
@@ -972,6 +1160,7 @@ static struct attribute *node_state_attrs[] = {
&node_state_attr[N_HIGH_MEMORY].attr.attr,
#endif
&node_state_attr[N_MEMORY].attr.attr,
+ &node_state_attr[N_PRIVATE].attr.attr,
&node_state_attr[N_CPU].attr.attr,
&node_state_attr[N_GENERIC_INITIATOR].attr.attr,
NULL
@@ -1007,5 +1196,15 @@ void __init node_dev_init(void)
panic("%s() failed to add node: %d\n", __func__, ret);
}
+ private_nodes = kcalloc(MAX_NUMNODES, sizeof(*private_nodes),
+ GFP_KERNEL);
+ if (!private_nodes)
+ pr_warn("Failed to allocate private_nodes, private node support disabled\n");
+
+ /* Initialize private node ops lists */
+ for (i = 0; i < MAX_NUMNODES; i++)
+ INIT_LIST_HEAD(&private_node_ops_list[i]);
+ private_node_ops_initialized = true;
+
register_memory_blocks_under_nodes();
}
diff --git a/include/linux/node.h b/include/linux/node.h
index 0269b064ba65..53a9fb63b60e 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -62,6 +62,47 @@ enum cache_mode {
NODE_CACHE_ADDR_MODE_EXTENDED_LINEAR,
};
+enum private_memtype {
+ NODE_MEM_NOTYPE,
+ NODE_MEM_ZSWAP,
+ NODE_MEM_COMPRESSED,
+ NODE_MEM_ACCELERATOR,
+ NODE_MEM_DEMOTE_ONLY,
+ NODE_MAX_MEMTYPE,
+};
+
+/**
+ * struct private_node_ops - Callbacks for private node operations
+ * @list: List node for per-node ops list
+ * @res_start: Start physical address of the memory region
+ * @res_end: End physical address of the memory region (inclusive)
+ * @memtype: Private node memory type for this region
+ * @page_allocated: Called after a page is allocated from this region
+ * to validate that the page is safe to use. Returns 0
+ * on success, negative error code on failure. If this
+ * returns an error, the caller should free the page
+ * and try another node. May be NULL if no validation
+ * is needed.
+ * @page_freed: Called when a page from this region is being freed.
+ * Allows the driver to update its internal tracking.
+ * May be NULL if no notification is needed.
+ * @data: Driver-private data passed to callbacks
+ *
+ * Multiple drivers may register ops for a single private node. Each
+ * registration covers a specific physical memory region. When a page
+ * is allocated, the appropriate ops structure is found by matching
+ * the page's physical address against the registered regions.
+ */
+struct private_node_ops {
+ struct list_head list;
+ resource_size_t res_start;
+ resource_size_t res_end;
+ enum private_memtype memtype;
+ int (*page_allocated)(struct page *page, void *data);
+ void (*page_freed)(struct page *page, void *data);
+ void *data;
+};
+
/**
* struct node_cache_attrs - system memory caching attributes
*
@@ -121,6 +162,10 @@ extern struct node *node_devices[];
#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA)
void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn,
unsigned long end_pfn);
+int node_register_private(int nid, struct private_node_ops *ops);
+void node_unregister_private(int nid, struct private_node_ops *ops);
+int node_private_allocated(struct page *page);
+void node_private_freed(struct page *page);
#else
static inline void register_memory_blocks_under_node_hotplug(int nid,
unsigned long start_pfn,
@@ -130,6 +175,21 @@ static inline void register_memory_blocks_under_node_hotplug(int nid,
static inline void register_memory_blocks_under_nodes(void)
{
}
+static inline int node_register_private(int nid, struct private_node_ops *ops)
+{
+ return -ENODEV;
+}
+static inline void node_unregister_private(int nid,
+ struct private_node_ops *ops)
+{
+}
+static inline int node_private_allocated(struct page *page)
+{
+ return -ENXIO;
+}
+static inline void node_private_freed(struct page *page)
+{
+}
#endif
struct node_notify {
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index bd38648c998d..dac250c6f1a9 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -391,6 +391,7 @@ enum node_states {
N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
N_MEMORY, /* The node has memory(regular, high, movable) */
+ N_PRIVATE, /* The node's memory is private */
N_CPU, /* The node has one or more cpus */
N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */
NR_NODE_STATES
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 389989a28abe..57463fcb4021 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1207,7 +1207,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
online_pages_range(pfn, nr_pages);
adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
- if (node_arg.nid >= 0)
+ if (node_arg.nid >= 0 && !node_state(nid, N_PRIVATE))
node_set_state(nid, N_MEMORY);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
--
2.52.0