From: Gregory Price <gourry@gourry.net>
To: lsf-pc@lists.linux-foundation.org
Cc: linux-kernel@vger.kernel.org, linux-cxl@vger.kernel.org,
cgroups@vger.kernel.org, linux-mm@kvack.org,
linux-trace-kernel@vger.kernel.org, damon@lists.linux.dev,
kernel-team@meta.com, gregkh@linuxfoundation.org,
rafael@kernel.org, dakr@kernel.org, dave@stgolabs.net,
jonathan.cameron@huawei.com, dave.jiang@intel.com,
alison.schofield@intel.com, vishal.l.verma@intel.com,
ira.weiny@intel.com, dan.j.williams@intel.com,
longman@redhat.com, akpm@linux-foundation.org, david@kernel.org,
lorenzo.stoakes@oracle.com, Liam.Howlett@oracle.com,
vbabka@suse.cz, rppt@kernel.org, surenb@google.com,
mhocko@suse.com, osalvador@suse.de, ziy@nvidia.com,
matthew.brost@intel.com, joshua.hahnjy@gmail.com,
rakie.kim@sk.com, byungchul@sk.com, gourry@gourry.net,
ying.huang@linux.alibaba.com, apopple@nvidia.com,
axelrasmussen@google.com, yuanchu@google.com, weixugc@google.com,
yury.norov@gmail.com, linux@rasmusvillemoes.dk,
mhiramat@kernel.org, mathieu.desnoyers@efficios.com,
tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com,
jackmanb@google.com, sj@kernel.org,
baolin.wang@linux.alibaba.com, npache@redhat.com,
ryan.roberts@arm.com, dev.jain@arm.com, baohua@kernel.org,
lance.yang@linux.dev, muchun.song@linux.dev, xu.xin16@zte.com.cn,
chengming.zhou@linux.dev, jannh@google.com, linmiaohe@huawei.com,
nao.horiguchi@gmail.com, pfalcato@suse.de, rientjes@google.com,
shakeel.butt@linux.dev, riel@surriel.com, harry.yoo@oracle.com,
cl@gentwo.org, roman.gushchin@linux.dev, chrisl@kernel.org,
kasong@tencent.com, shikemeng@huaweicloud.com, nphamcs@gmail.com,
bhe@redhat.com, zhengqi.arch@bytedance.com, terry.bowman@amd.com
Subject: [RFC PATCH v4 22/27] mm/memory_hotplug: add add_private_memory_driver_managed()
Date: Sun, 22 Feb 2026 03:48:37 -0500 [thread overview]
Message-ID: <20260222084842.1824063-23-gourry@gourry.net> (raw)
In-Reply-To: <20260222084842.1824063-1-gourry@gourry.net>
Add a new function for drivers to hotplug memory as N_MEMORY_PRIVATE.
This function combines node_private_region_register() with
__add_memory_driver_managed() to ensure proper ordering:
1. Register the private region first (sets private node context)
2. Then hotplug the memory (sets N_MEMORY_PRIVATE)
3. On failure, unregister the private region to avoid leaving the
node in an inconsistent state.
When the last of the memory is removed, hotplug also removes the private
node context. If migration is not supported and the node is still
online, fire a warning (this likely indicates a bug in the driver).
Signed-off-by: Gregory Price <gourry@gourry.net>
---
include/linux/memory_hotplug.h | 11 +++
include/linux/mmzone.h | 12 ++++
mm/memory_hotplug.c | 122 ++++++++++++++++++++++++++++++---
3 files changed, 135 insertions(+), 10 deletions(-)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 1f19f08552ea..e5abade9450a 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -293,6 +293,7 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
extern int remove_memory(u64 start, u64 size);
extern void __remove_memory(u64 start, u64 size);
extern int offline_and_remove_memory(u64 start, u64 size);
+extern int offline_and_remove_private_memory(int nid, u64 start, u64 size);
#else
static inline void try_offline_node(int nid) {}
@@ -309,6 +310,12 @@ static inline int remove_memory(u64 start, u64 size)
}
static inline void __remove_memory(u64 start, u64 size) {}
+
+static inline int offline_and_remove_private_memory(int nid, u64 start,
+ u64 size)
+{
+ return -EOPNOTSUPP;
+}
#endif /* CONFIG_MEMORY_HOTREMOVE */
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -326,6 +333,10 @@ int __add_memory_driver_managed(int nid, u64 start, u64 size,
extern int add_memory_driver_managed(int nid, u64 start, u64 size,
const char *resource_name,
mhp_t mhp_flags);
+int add_private_memory_driver_managed(int nid, u64 start, u64 size,
+ const char *resource_name,
+ mhp_t mhp_flags, enum mmop online_type,
+ struct node_private *np);
extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages,
struct vmem_altmap *altmap, int migratetype,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 992eb1c5a2c6..cc532b67ad3f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1524,6 +1524,18 @@ typedef struct pglist_data {
#endif
} pg_data_t;
+#ifdef CONFIG_NUMA
+static inline bool pgdat_is_private(pg_data_t *pgdat)
+{
+ return pgdat->private;
+}
+#else
+static inline bool pgdat_is_private(pg_data_t *pgdat)
+{
+ return false;
+}
+#endif
+
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d2dc527bd5b0..9d72f44a30dc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -36,6 +36,7 @@
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/node.h>
+#include <linux/node_private.h>
#include <asm/tlbflush.h>
@@ -1173,8 +1174,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
true);
- if (!node_state(nid, N_MEMORY)) {
- /* Adding memory to the node for the first time */
+ if (!node_state(nid, N_MEMORY) && !node_state(nid, N_MEMORY_PRIVATE)) {
node_arg.nid = nid;
ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg);
ret = notifier_to_errno(ret);
@@ -1208,8 +1208,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
online_pages_range(pfn, nr_pages);
adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
- if (node_arg.nid >= 0)
- node_set_state(nid, N_MEMORY);
+ if (node_arg.nid >= 0) {
+ if (pgdat_is_private(NODE_DATA(nid)))
+ node_set_state(nid, N_MEMORY_PRIVATE);
+ else
+ node_set_state(nid, N_MEMORY);
+ }
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
@@ -1227,8 +1231,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
/* reinitialise watermarks and update pcp limits */
init_per_zone_wmark_min();
- kswapd_run(nid);
- kcompactd_run(nid);
+ /*
+ * Don't start reclaim/compaction daemons for private nodes.
+ * Private node services will decide whether to start these services.
+ */
+ if (!pgdat_is_private(NODE_DATA(nid))) {
+ kswapd_run(nid);
+ kcompactd_run(nid);
+ }
if (node_arg.nid >= 0)
/* First memory added successfully. Notify consumers. */
@@ -1722,6 +1732,54 @@ int add_memory_driver_managed(int nid, u64 start, u64 size,
}
EXPORT_SYMBOL_GPL(add_memory_driver_managed);
+/**
+ * add_private_memory_driver_managed - add driver-managed N_MEMORY_PRIVATE memory
+ * @nid: NUMA node ID (or memory group ID when MHP_NID_IS_MGID is set)
+ * @start: Start physical address
+ * @size: Size in bytes
+ * @resource_name: "System RAM ($DRIVER)" format
+ * @mhp_flags: Memory hotplug flags
+ * @online_type: MMOP_* online type
+ * @np: Driver-owned node_private structure (owner, refcount)
+ *
+ * Registers node_private first, then hotplugs the memory.
+ *
+ * On failure, unregisters the node_private.
+ */
+int add_private_memory_driver_managed(int nid, u64 start, u64 size,
+ const char *resource_name,
+ mhp_t mhp_flags, enum mmop online_type,
+ struct node_private *np)
+{
+ struct memory_group *group;
+ int real_nid = nid;
+ int rc;
+
+ if (!np)
+ return -EINVAL;
+
+ if (mhp_flags & MHP_NID_IS_MGID) {
+ group = memory_group_find_by_id(nid);
+ if (!group)
+ return -EINVAL;
+ real_nid = group->nid;
+ }
+
+ rc = node_private_register(real_nid, np);
+ if (rc)
+ return rc;
+
+ rc = __add_memory_driver_managed(nid, start, size, resource_name,
+ mhp_flags, online_type);
+ if (rc) {
+ node_private_unregister(real_nid);
+ return rc;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(add_private_memory_driver_managed);
+
/*
* Platforms should define arch_get_mappable_range() that provides
* maximum possible addressable physical memory range for which the
@@ -1872,6 +1930,15 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
goto put_folio;
}
+ /* Private nodes w/o migration must ensure folios are offline */
+ if (folio_is_private_node(folio) &&
+ !folio_private_flags(folio, NP_OPS_MIGRATION)) {
+ WARN_ONCE(1, "hot-unplug on non-migratable node %d pfn %lx\n",
+ folio_nid(folio), pfn);
+ pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
+ goto put_folio;
+ }
+
if (!isolate_folio_to_list(folio, &source)) {
if (__ratelimit(&migrate_rs)) {
pr_warn("failed to isolate pfn %lx\n",
@@ -2014,8 +2081,8 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
/*
* Check whether the node will have no present pages after we offline
- * 'nr_pages' more. If so, we know that the node will become empty, and
- * so we will clear N_MEMORY for it.
+ * 'nr_pages' more. If so, send pre-notification for last memory removal.
+ * We will clear N_MEMORY(_PRIVATE) if this is the case.
*/
if (nr_pages >= pgdat->node_present_pages) {
node_arg.nid = node;
@@ -2108,8 +2175,12 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* Make sure to mark the node as memory-less before rebuilding the zone
* list. Otherwise this node would still appear in the fallback lists.
*/
- if (node_arg.nid >= 0)
- node_clear_state(node, N_MEMORY);
+ if (node_arg.nid >= 0) {
+ if (node_state(node, N_MEMORY))
+ node_clear_state(node, N_MEMORY);
+ else if (node_state(node, N_MEMORY_PRIVATE))
+ node_clear_state(node, N_MEMORY_PRIVATE);
+ }
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
build_all_zonelists(NULL);
@@ -2461,4 +2532,35 @@ int offline_and_remove_memory(u64 start, u64 size)
return rc;
}
EXPORT_SYMBOL_GPL(offline_and_remove_memory);
+
+/**
+ * offline_and_remove_private_memory - offline, remove, and unregister private memory
+ * @nid: NUMA node ID of the private memory
+ * @start: Start physical address
+ * @size: Size in bytes
+ *
+ * Counterpart to add_private_memory_driver_managed(). Offlines and removes
+ * the memory range, then attempts to unregister the node_private.
+ *
+ * offline_and_remove_memory() clears N_MEMORY_PRIVATE when the last block
+ * is offlined, which allows node_private_unregister() to clear the
+ * pgdat->node_private pointer. If other private memory ranges remain on
+ * the node, node_private_unregister() returns -EBUSY (N_MEMORY_PRIVATE
+ * is still set) and the node_private remains registered.
+ *
+ * Return: 0 on full success (memory removed and node_private unregistered),
+ * -EBUSY if memory was removed but node still has other private memory,
+ * other negative error code if offline/remove failed.
+ */
+int offline_and_remove_private_memory(int nid, u64 start, u64 size)
+{
+ int rc;
+
+ rc = offline_and_remove_memory(start, size);
+ if (rc)
+ return rc;
+
+ return node_private_unregister(nid);
+}
+EXPORT_SYMBOL_GPL(offline_and_remove_private_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
--
2.53.0
next prev parent reply other threads:[~2026-02-22 8:50 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-22 8:48 [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM) Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 01/27] numa: introduce N_MEMORY_PRIVATE node state Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 02/27] mm,cpuset: gate allocations from N_MEMORY_PRIVATE behind __GFP_PRIVATE Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 03/27] mm/page_alloc: add numa_zone_allowed() and wire it up Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 04/27] mm/page_alloc: Add private node handling to build_zonelists Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 05/27] mm: introduce folio_is_private_managed() unified predicate Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 06/27] mm/mlock: skip mlock for managed-memory folios Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 07/27] mm/madvise: skip madvise " Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 08/27] mm/ksm: skip KSM " Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 09/27] mm/khugepaged: skip private node folios when trying to collapse Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 10/27] mm/swap: add free_folio callback for folio release cleanup Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 11/27] mm/huge_memory.c: add private node folio split notification callback Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 12/27] mm/migrate: NP_OPS_MIGRATION - support private node user migration Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 13/27] mm/mempolicy: NP_OPS_MEMPOLICY - support private node mempolicy Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 14/27] mm/memory-tiers: NP_OPS_DEMOTION - support private node demotion Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 15/27] mm/mprotect: NP_OPS_PROTECT_WRITE - gate PTE/PMD write-upgrades Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 16/27] mm: NP_OPS_RECLAIM - private node reclaim participation Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 17/27] mm/oom: NP_OPS_OOM_ELIGIBLE - private node OOM participation Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 18/27] mm/memory: NP_OPS_NUMA_BALANCING - private node NUMA balancing Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 19/27] mm/compaction: NP_OPS_COMPACTION - private node compaction support Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 20/27] mm/gup: NP_OPS_LONGTERM_PIN - private node longterm pin support Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 21/27] mm/memory-failure: add memory_failure callback to node_private_ops Gregory Price
2026-02-22 8:48 ` Gregory Price [this message]
2026-02-22 8:48 ` [RFC PATCH v4 23/27] mm/cram: add compressed ram memory management subsystem Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 24/27] cxl/core: Add cxl_sysram region type Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 25/27] cxl/core: Add private node support to cxl_sysram Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 26/27] cxl: add cxl_mempolicy sample PCI driver Gregory Price
2026-02-22 8:48 ` [RFC PATCH v4 27/27] cxl: add cxl_compression " Gregory Price
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260222084842.1824063-23-gourry@gourry.net \
--to=gourry@gourry.net \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=alison.schofield@intel.com \
--cc=apopple@nvidia.com \
--cc=axelrasmussen@google.com \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=bhe@redhat.com \
--cc=byungchul@sk.com \
--cc=cgroups@vger.kernel.org \
--cc=chengming.zhou@linux.dev \
--cc=chrisl@kernel.org \
--cc=cl@gentwo.org \
--cc=dakr@kernel.org \
--cc=damon@lists.linux.dev \
--cc=dan.j.williams@intel.com \
--cc=dave.jiang@intel.com \
--cc=dave@stgolabs.net \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=gregkh@linuxfoundation.org \
--cc=hannes@cmpxchg.org \
--cc=harry.yoo@oracle.com \
--cc=ira.weiny@intel.com \
--cc=jackmanb@google.com \
--cc=jannh@google.com \
--cc=jonathan.cameron@huawei.com \
--cc=joshua.hahnjy@gmail.com \
--cc=kasong@tencent.com \
--cc=kernel-team@meta.com \
--cc=lance.yang@linux.dev \
--cc=linmiaohe@huawei.com \
--cc=linux-cxl@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-trace-kernel@vger.kernel.org \
--cc=linux@rasmusvillemoes.dk \
--cc=longman@redhat.com \
--cc=lorenzo.stoakes@oracle.com \
--cc=lsf-pc@lists.linux-foundation.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=matthew.brost@intel.com \
--cc=mhiramat@kernel.org \
--cc=mhocko@suse.com \
--cc=mkoutny@suse.com \
--cc=muchun.song@linux.dev \
--cc=nao.horiguchi@gmail.com \
--cc=npache@redhat.com \
--cc=nphamcs@gmail.com \
--cc=osalvador@suse.de \
--cc=pfalcato@suse.de \
--cc=rafael@kernel.org \
--cc=rakie.kim@sk.com \
--cc=riel@surriel.com \
--cc=rientjes@google.com \
--cc=roman.gushchin@linux.dev \
--cc=rppt@kernel.org \
--cc=ryan.roberts@arm.com \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=sj@kernel.org \
--cc=surenb@google.com \
--cc=terry.bowman@amd.com \
--cc=tj@kernel.org \
--cc=vbabka@suse.cz \
--cc=vishal.l.verma@intel.com \
--cc=weixugc@google.com \
--cc=xu.xin16@zte.com.cn \
--cc=ying.huang@linux.alibaba.com \
--cc=yuanchu@google.com \
--cc=yury.norov@gmail.com \
--cc=zhengqi.arch@bytedance.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox