* [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
@ 2026-01-05 20:36 Gregory Price
2026-01-06 15:05 ` Michal Hocko
2026-01-06 15:24 ` David Hildenbrand (Red Hat)
0 siblings, 2 replies; 20+ messages in thread
From: Gregory Price @ 2026-01-05 20:36 UTC (permalink / raw)
To: linux-mm
Cc: linux-kernel, kernel-team, david, osalvador, gregkh, rafael,
dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
mhocko, hare
It was reported (LPC 2025) that userland services which monitor memory
blocks can cause hot-unplug to fail permanently.
This can occur when drivers attempt to hot-remove memory in two phases
(offline, remove), while a userland service detects the memory offline
and re-onlines the memory into a zone which may prevent removal.
This patch allows a driver to specify that a given memory block is
intended as ZONE_MOVABLE memory only (i.e. the system should try to
protect its hot-unpluggability). This is done via an MHP flag and a new
"movable_only" bool in `struct memory_block`.
Attempts to online a memory block with movable_only=true with any value
other than MMOP_ONLINE_MOVABLE will fail with -EINVAL.
It is hard to catch all possible ways to implement the offline/remove
process, so a race condition can clearly still occur if the
userland service onlines the memory back into ZONE_MOVABLE, but that at
least will not prevent the removal of the block at a later time.
Suggested-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Gregory Price <gourry@gourry.net>
---
drivers/base/memory.c | 15 +++++++++++----
include/linux/memory.h | 4 +++-
include/linux/memory_hotplug.h | 13 +++++++++++++
mm/memory_hotplug.c | 12 +++++++++---
4 files changed, 36 insertions(+), 8 deletions(-)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 6d84a02cfa5d..59512e4b8d62 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -374,6 +374,8 @@ static int memory_block_change_state(struct memory_block *mem,
if (to_state == MEM_OFFLINE)
mem->state = MEM_GOING_OFFLINE;
+ else if (mem->movable_only && to_state != MMOP_ONLINE_MOVABLE)
+ return -EINVAL;
ret = memory_block_action(mem, to_state);
mem->state = ret ? from_state_req : to_state;
@@ -811,7 +813,8 @@ void memory_block_add_nid_early(struct memory_block *mem, int nid)
static int add_memory_block(unsigned long block_id, int nid, unsigned long state,
struct vmem_altmap *altmap,
- struct memory_group *group)
+ struct memory_group *group,
+ bool movable_only)
{
struct memory_block *mem;
int ret = 0;
@@ -829,6 +832,7 @@ static int add_memory_block(unsigned long block_id, int nid, unsigned long state
mem->state = state;
mem->nid = nid;
mem->altmap = altmap;
+ mem->movable_only = movable_only;
INIT_LIST_HEAD(&mem->group_next);
#ifndef CONFIG_NUMA
@@ -880,7 +884,8 @@ static void remove_memory_block(struct memory_block *memory)
*/
int create_memory_block_devices(unsigned long start, unsigned long size,
int nid, struct vmem_altmap *altmap,
- struct memory_group *group)
+ struct memory_group *group,
+ bool movable_only)
{
const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
@@ -893,7 +898,8 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
return -EINVAL;
for (block_id = start_block_id; block_id != end_block_id; block_id++) {
- ret = add_memory_block(block_id, nid, MEM_OFFLINE, altmap, group);
+ ret = add_memory_block(block_id, nid, MEM_OFFLINE, altmap, group,
+ movable_only);
if (ret)
break;
}
@@ -998,7 +1004,8 @@ void __init memory_dev_init(void)
continue;
block_id = memory_block_id(nr);
- ret = add_memory_block(block_id, NUMA_NO_NODE, MEM_ONLINE, NULL, NULL);
+ ret = add_memory_block(block_id, NUMA_NO_NODE, MEM_ONLINE, NULL, NULL,
+ false);
if (ret) {
panic("%s() failed to add memory block: %d\n",
__func__, ret);
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 43d378038ce2..bab24f796d3d 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -80,6 +80,7 @@ struct memory_block {
struct vmem_altmap *altmap;
struct memory_group *group; /* group (if any) for this block */
struct list_head group_next; /* next block inside memory group */
+ bool movable_only; /* If set, only ZONE_MOVABLE is valid */
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
atomic_long_t nr_hwpoison;
#endif
@@ -160,7 +161,8 @@ extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
int create_memory_block_devices(unsigned long start, unsigned long size,
int nid, struct vmem_altmap *altmap,
- struct memory_group *group);
+ struct memory_group *group,
+ bool movable_only);
void remove_memory_block_devices(unsigned long start, unsigned long size);
extern void memory_dev_init(void);
extern int memory_notify(unsigned long val, void *v);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 23f038a16231..ca51ef2ad0cf 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -75,6 +75,19 @@ typedef int __bitwise mhp_t;
*/
#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3))
+/*
+ * Restrict hotplugged memory blocks to ZONE_MOVABLE only.
+ *
+ * During offlining of hotplugged memory which was originally onlined
+ * as ZONE_MOVABLE, userland services may detect blocks going offline
+ * and automatically re-online them into ZONE_NORMAL or lower. When
+ * this happens it may become permanently incapable of being removed.
+ *
+ * Allow driver-managed memory sources to restrict memory blocks to
+ * ZONE_MOVABLE only, so that the truly degenerate case can be mitigated.
+ */
+#define MHP_MOVABLE_ONLY ((__force mhp_t)BIT(4))
+
/*
* Extended parameters for memory hotplug:
* altmap: alternative allocator for memmap array (optional)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 81ba5b019926..1a184bfd87f6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1346,7 +1346,9 @@ static int check_hotplug_memory_range(u64 start, u64 size)
static int online_memory_block(struct memory_block *mem, void *arg)
{
- mem->online_type = mhp_get_default_online_type();
+ mem->online_type = mem->movable_only ?
+ MMOP_ONLINE_MOVABLE :
+ mhp_get_default_online_type();
return device_online(&mem->dev);
}
@@ -1449,6 +1451,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
unsigned long memblock_size = memory_block_size_bytes();
u64 cur_start;
int ret;
+ bool movable_only = mhp_flags & MHP_MOVABLE_ONLY;
for (cur_start = start; cur_start < start + size;
cur_start += memblock_size) {
@@ -1478,7 +1481,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
/* create memory block devices after memory was added */
ret = create_memory_block_devices(cur_start, memblock_size, nid,
- params.altmap, group);
+ params.altmap, group,
+ movable_only);
if (ret) {
arch_remove_memory(cur_start, memblock_size, NULL);
kfree(params.altmap);
@@ -1506,6 +1510,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
struct memory_group *group = NULL;
u64 start, size;
bool new_node = false;
+ bool movable_only = mhp_flags & MHP_MOVABLE_ONLY;
int ret;
start = res->start;
@@ -1564,7 +1569,8 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
goto error;
/* create memory block devices after memory was added */
- ret = create_memory_block_devices(start, size, nid, NULL, group);
+ ret = create_memory_block_devices(start, size, nid, NULL, group,
+ movable_only);
if (ret) {
arch_remove_memory(start, size, params.altmap);
goto error;
--
2.52.0
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-05 20:36 [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable Gregory Price
@ 2026-01-06 15:05 ` Michal Hocko
2026-01-06 16:53 ` Gregory Price
2026-01-06 15:24 ` David Hildenbrand (Red Hat)
1 sibling, 1 reply; 20+ messages in thread
From: Michal Hocko @ 2026-01-06 15:05 UTC (permalink / raw)
To: Gregory Price
Cc: linux-mm, linux-kernel, kernel-team, david, osalvador, gregkh,
rafael, dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, hare
On Mon 05-01-26 15:36:11, Gregory Price wrote:
> It was reported (LPC 2025) that userland services which monitor memory
> blocks can cause hot-unplug to fail permanently.
>
> This can occur when drivers attempt to hot-remove memory in two phases
> (offline, remove), while a userland service detects the memory offline
> and re-onlines the memory into a zone which may prevent removal.
Are there more details about this?
> This patch allows a driver to specify that a given memory block is
> intended as ZONE_MOVABLE memory only (i.e. the system should try to
> protect its hot-unpluggability). This is done via an MHP flag and a new
> "movable_only" bool in `struct memory_block`.
>
> Attempts to online a memory block with movable_only=true with any value
> other than MMOP_ONLINE_MOVABLE will fail with -EINVAL.
>
> It is hard to catch all possible ways to implement offline/remove
> process, so a race condition here can clearly still occur if the
> userland service onlines the memory back into ZONE_MOVABLE, but it at
> least will not prevent the removal of a block at a later time.
Irrespective of the userspace note above (which seems like a policy that
should probably be re-evaluated or allow for a better fine tuning) I can
see some sense in drivers having a better control of which zones (kernel
vs. movable) can their managed memory fall into.
That being said, rather than movable_only, should we have a mask of
online types supported for the mem block?
> Suggested-by: Hannes Reinecke <hare@suse.de>
> Signed-off-by: Gregory Price <gourry@gourry.net>
> ---
> drivers/base/memory.c | 15 +++++++++++----
> include/linux/memory.h | 4 +++-
> include/linux/memory_hotplug.h | 13 +++++++++++++
> mm/memory_hotplug.c | 12 +++++++++---
> 4 files changed, 36 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 6d84a02cfa5d..59512e4b8d62 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -374,6 +374,8 @@ static int memory_block_change_state(struct memory_block *mem,
>
> if (to_state == MEM_OFFLINE)
> mem->state = MEM_GOING_OFFLINE;
> + else if (mem->movable_only && to_state != MMOP_ONLINE_MOVABLE)
> + return -EINVAL;
>
> ret = memory_block_action(mem, to_state);
> mem->state = ret ? from_state_req : to_state;
> @@ -811,7 +813,8 @@ void memory_block_add_nid_early(struct memory_block *mem, int nid)
>
> static int add_memory_block(unsigned long block_id, int nid, unsigned long state,
> struct vmem_altmap *altmap,
> - struct memory_group *group)
> + struct memory_group *group,
> + bool movable_only)
> {
> struct memory_block *mem;
> int ret = 0;
> @@ -829,6 +832,7 @@ static int add_memory_block(unsigned long block_id, int nid, unsigned long state
> mem->state = state;
> mem->nid = nid;
> mem->altmap = altmap;
> + mem->movable_only = movable_only;
> INIT_LIST_HEAD(&mem->group_next);
>
> #ifndef CONFIG_NUMA
> @@ -880,7 +884,8 @@ static void remove_memory_block(struct memory_block *memory)
> */
> int create_memory_block_devices(unsigned long start, unsigned long size,
> int nid, struct vmem_altmap *altmap,
> - struct memory_group *group)
> + struct memory_group *group,
> + bool movable_only)
> {
> const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
> unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
> @@ -893,7 +898,8 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
> return -EINVAL;
>
> for (block_id = start_block_id; block_id != end_block_id; block_id++) {
> - ret = add_memory_block(block_id, nid, MEM_OFFLINE, altmap, group);
> + ret = add_memory_block(block_id, nid, MEM_OFFLINE, altmap, group,
> + movable_only);
> if (ret)
> break;
> }
> @@ -998,7 +1004,8 @@ void __init memory_dev_init(void)
> continue;
>
> block_id = memory_block_id(nr);
> - ret = add_memory_block(block_id, NUMA_NO_NODE, MEM_ONLINE, NULL, NULL);
> + ret = add_memory_block(block_id, NUMA_NO_NODE, MEM_ONLINE, NULL, NULL,
> + false);
> if (ret) {
> panic("%s() failed to add memory block: %d\n",
> __func__, ret);
> diff --git a/include/linux/memory.h b/include/linux/memory.h
> index 43d378038ce2..bab24f796d3d 100644
> --- a/include/linux/memory.h
> +++ b/include/linux/memory.h
> @@ -80,6 +80,7 @@ struct memory_block {
> struct vmem_altmap *altmap;
> struct memory_group *group; /* group (if any) for this block */
> struct list_head group_next; /* next block inside memory group */
> + bool movable_only; /* If set, only ZONE_MOVABLE is valid */
> #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
> atomic_long_t nr_hwpoison;
> #endif
> @@ -160,7 +161,8 @@ extern int register_memory_notifier(struct notifier_block *nb);
> extern void unregister_memory_notifier(struct notifier_block *nb);
> int create_memory_block_devices(unsigned long start, unsigned long size,
> int nid, struct vmem_altmap *altmap,
> - struct memory_group *group);
> + struct memory_group *group,
> + bool movable_only);
> void remove_memory_block_devices(unsigned long start, unsigned long size);
> extern void memory_dev_init(void);
> extern int memory_notify(unsigned long val, void *v);
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 23f038a16231..ca51ef2ad0cf 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -75,6 +75,19 @@ typedef int __bitwise mhp_t;
> */
> #define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3))
>
> +/*
> + * Restrict hotplugged memory blocks to ZONE_MOVABLE only.
> + *
> + * During offlining of hotplugged memory which was originally onlined
> + * as ZONE_MOVABLE, userland services may detect blocks going offline
> + * and automatically re-online them into ZONE_NORMAL or lower. When
> + * this happens it may become permanently incapable of being removed.
> + *
> + * Allow driver-managed memory sources to restrict memory blocks to
> + * ZONE_MOVABLE only, so that the truly degenerate case can be mitigated.
> + */
> +#define MHP_MOVABLE_ONLY ((__force mhp_t)BIT(4))
> +
> /*
> * Extended parameters for memory hotplug:
> * altmap: alternative allocator for memmap array (optional)
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 81ba5b019926..1a184bfd87f6 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1346,7 +1346,9 @@ static int check_hotplug_memory_range(u64 start, u64 size)
>
> static int online_memory_block(struct memory_block *mem, void *arg)
> {
> - mem->online_type = mhp_get_default_online_type();
> + mem->online_type = mem->movable_only ?
> + MMOP_ONLINE_MOVABLE :
> + mhp_get_default_online_type();
> return device_online(&mem->dev);
> }
>
> @@ -1449,6 +1451,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
> unsigned long memblock_size = memory_block_size_bytes();
> u64 cur_start;
> int ret;
> + bool movable_only = mhp_flags & MHP_MOVABLE_ONLY;
>
> for (cur_start = start; cur_start < start + size;
> cur_start += memblock_size) {
> @@ -1478,7 +1481,8 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
>
> /* create memory block devices after memory was added */
> ret = create_memory_block_devices(cur_start, memblock_size, nid,
> - params.altmap, group);
> + params.altmap, group,
> + movable_only);
> if (ret) {
> arch_remove_memory(cur_start, memblock_size, NULL);
> kfree(params.altmap);
> @@ -1506,6 +1510,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
> struct memory_group *group = NULL;
> u64 start, size;
> bool new_node = false;
> + bool movable_only = mhp_flags & MHP_MOVABLE_ONLY;
> int ret;
>
> start = res->start;
> @@ -1564,7 +1569,8 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
> goto error;
>
> /* create memory block devices after memory was added */
> - ret = create_memory_block_devices(start, size, nid, NULL, group);
> + ret = create_memory_block_devices(start, size, nid, NULL, group,
> + movable_only);
> if (ret) {
> arch_remove_memory(start, size, params.altmap);
> goto error;
> --
> 2.52.0
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-05 20:36 [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable Gregory Price
2026-01-06 15:05 ` Michal Hocko
@ 2026-01-06 15:24 ` David Hildenbrand (Red Hat)
2026-01-06 16:58 ` Gregory Price
1 sibling, 1 reply; 20+ messages in thread
From: David Hildenbrand (Red Hat) @ 2026-01-06 15:24 UTC (permalink / raw)
To: Gregory Price, linux-mm
Cc: linux-kernel, kernel-team, osalvador, gregkh, rafael, dakr, akpm,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko,
hare
> +/*
> + * Restrict hotplugged memory blocks to ZONE_MOVABLE only.
> + *
> + * During offlining of hotplugged memory which was originally onlined
> + * as ZONE_MOVABLE, userland services may detect blocks going offline
> + * and automatically re-online them into ZONE_NORMAL or lower. When
> + * this happens it may become permanently incapable of being removed.
If it's really only that, we could also look into simply making a
re-online without a specific mode ("online") to use the previous mode.
We could glue that to the "contig-zones" policy only, to not affect
"auto-movable".
That is, remember the zone to which it was previously onlined, and then
simply re-online to that one.
--
Cheers
David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 15:05 ` Michal Hocko
@ 2026-01-06 16:53 ` Gregory Price
2026-01-06 19:49 ` Michal Hocko
0 siblings, 1 reply; 20+ messages in thread
From: Gregory Price @ 2026-01-06 16:53 UTC (permalink / raw)
To: Michal Hocko
Cc: linux-mm, linux-kernel, kernel-team, david, osalvador, gregkh,
rafael, dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, hare
On Tue, Jan 06, 2026 at 04:05:48PM +0100, Michal Hocko wrote:
> On Mon 05-01-26 15:36:11, Gregory Price wrote:
> > It was reported (LPC 2025) that userland services which monitor memory
> > blocks can cause hot-unplug to fail permanently.
> >
> > This can occur when drivers attempt to hot-remove memory in two phases
> > (offline, remove), while a userland service detects the memory offline
> > and re-onlines the memory into a zone which may prevent removal.
>
> Are there more details about this?
The details are with Hannes, I was just recapping what was described in
his devmem talk at LPC ("To online or not online").
>
> > This patch allows a driver to specify that a given memory block is
> > intended as ZONE_MOVABLE memory only (i.e. the system should try to
> > protect its hot-unpluggability). This is done via an MHP flag and a new
> > "movable_only" bool in `struct memory_block`.
> >
> > Attempts to online a memory block with movable_only=true with any value
> > other than MMOP_ONLINE_MOVABLE will fail with -EINVAL.
> >
> > It is hard to catch all possible ways to implement offline/remove
> > process, so a race condition here can clearly still occur if the
> > userland service onlines the memory back into ZONE_MOVABLE, but it at
> > least will not prevent the removal of a block at a later time.
>
> Irrespective of the userspace note above (which seems like a policy that
> should probably be re-evaluated or allow for a better fine tuning) I can
> see some sense in drivers having a better control of which zones (kernel
> vs. movable) can their managed memory fall into.
Hannes pointed out that this is some default policy on one or more
distributions, which is quite annoying. Obviously a kernel change to
fight against user-policy is not great, but trying to prevent
hotplug-intended memory from being onlined in hotplug-unfriendly zones
seemed like a pretty straightforward improvement.
>
> That being said, rather than movable_only, should we have a mask of
> online types supported for the mem block?
>
I briefly considered this. I went with this for RFC-v1 since it's
fairly simple and because movable is really the only zone with hotplug
guarantees (any other zone makes no hotplug guarantees).
It's also a significantly more complex change for questionable value,
but if people see this as the way to go I'll happily pivot to that.
~Gregory
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 15:24 ` David Hildenbrand (Red Hat)
@ 2026-01-06 16:58 ` Gregory Price
2026-01-06 17:52 ` David Hildenbrand (Red Hat)
0 siblings, 1 reply; 20+ messages in thread
From: Gregory Price @ 2026-01-06 16:58 UTC (permalink / raw)
To: David Hildenbrand (Red Hat)
Cc: linux-mm, linux-kernel, kernel-team, osalvador, gregkh, rafael,
dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
mhocko, hare
On Tue, Jan 06, 2026 at 04:24:21PM +0100, David Hildenbrand (Red Hat) wrote:
> > +/*
> > + * Restrict hotplugged memory blocks to ZONE_MOVABLE only.
> > + *
> > + * During offlining of hotplugged memory which was originally onlined
> > + * as ZONE_MOVABLE, userland services may detect blocks going offline
> > + * and automatically re-online them into ZONE_NORMAL or lower. When
> > + * this happens it may become permanently incapable of being removed.
>
> If it's really only that, we could also look into simply making a re-online
> without a specific mode ("online") to use the previous mode.
>
> We could glue that to the "contig-zones" policy only, to not affect
> "auto-movable".
>
> That is, remember the zone to which it was previously onlined, and then
> simply re-online to that one.
>
I know we do this in memory_hotplug.c to rollback to prior state.
I did notice in... I think it was either memory.c or hotplug.c... that
we end up setting mem->online_type=MMOP_OFFLINE after completing an online
operation. That seemed confusing and maybe we can use that to store the
current state.
I'm not against this idea, but it also makes the sysfs a little more
confusing (`echo online` now does different things based on prior
state). I preferred just failing if the block wasn't compatible with
the zone (maybe making it more clear with a dmesg print?)
Anyway, let me know what your preference is, happy to pivot however.
Hopefully Hannes can add additional feedback and guidance here.
~Gregory
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 16:58 ` Gregory Price
@ 2026-01-06 17:52 ` David Hildenbrand (Red Hat)
2026-01-06 18:06 ` Gregory Price
0 siblings, 1 reply; 20+ messages in thread
From: David Hildenbrand (Red Hat) @ 2026-01-06 17:52 UTC (permalink / raw)
To: Gregory Price
Cc: linux-mm, linux-kernel, kernel-team, osalvador, gregkh, rafael,
dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
mhocko, hare
On 1/6/26 17:58, Gregory Price wrote:
> On Tue, Jan 06, 2026 at 04:24:21PM +0100, David Hildenbrand (Red Hat) wrote:
>>> +/*
>>> + * Restrict hotplugged memory blocks to ZONE_MOVABLE only.
>>> + *
>>> + * During offlining of hotplugged memory which was originally onlined
>>> + * as ZONE_MOVABLE, userland services may detect blocks going offline
>>> + * and automatically re-online them into ZONE_NORMAL or lower. When
>>> + * this happens it may become permanently incapable of being removed.
>>
>> If it's really only that, we could also look into simply making a re-online
>> without a specific mode ("online") to use the previous mode.
>>
>> We could glue that to the "contig-zones" policy only, to not affect
>> "auto-movable".
>>
>> That is, remember the zone to which it was previously onlined, and then
>> simply re-online to that one.
>>
>
> I know we do this in memory_hotplug.c to rollback to prior state.
>
> I did notice in... i think it was either memory.c or hotplug.c... that
> we end up setting mem->online_type=MMOP_OFFLINE after completing an online
> operation. That seemed confusing and maybe we can use that to store the
> current state.
>
> I'm not against this idea, but it also makes the sysfs a little more
> confusing (`echo online` now does different things based on prior
> state).
Right, but only for the contig-zones policy.
But maybe you really want the default for such memory to be "movable"
even when not onlined beforehand? So I am not sure if the description of
the problem here is accurate.
Isn't one problem also udev racing with ndctl?
> I preferred just failing if the block wasn't compatible with
> the zone (maybe making it more clear with a dmesg print?)
The thing is that this block is compatible with the zone, no?
In a system where you would never want to offline that memory, why
should we stop someone from onlining it to a kernel zone? I'm sure
someone with a weird use case will show up later that will complain
about this.
But the patch is missing details on who would actually set
MHP_MOVABLE_ONLY. A user should be posted alongside the core change.
>
> Anyway, let me know what your preference is, happy to pivot however.
Restricting memory to be movable-only to handle a user-space problem as
described here sounds like the wrong approach to me. You really want the
default of such memory to be "movable".
Almost like an optimized "auto-movable" policy :)
Or a new policy that will respect a provided default (MHP_DEFAULT_MOVABLE).
--
Cheers
David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 17:52 ` David Hildenbrand (Red Hat)
@ 2026-01-06 18:06 ` Gregory Price
2026-01-06 18:38 ` David Hildenbrand (Red Hat)
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Gregory Price @ 2026-01-06 18:06 UTC (permalink / raw)
To: David Hildenbrand (Red Hat), hare
Cc: linux-mm, linux-kernel, kernel-team, osalvador, gregkh, rafael,
dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
mhocko
On Tue, Jan 06, 2026 at 06:52:11PM +0100, David Hildenbrand (Red Hat) wrote:
> On 1/6/26 17:58, Gregory Price wrote:
> > On Tue, Jan 06, 2026 at 04:24:21PM +0100, David Hildenbrand (Red Hat) wrote:
> >
> > I'm not against this idea, but it also makes the sysfs a little more
> > confusing (`echo online` now does different things based on prior
> > state).
>
> Right, but only for the contig-zones policy.
>
> But maybe you really want the default for such memory to be "movable" even
> when not onlined beforehand? So I am not sure if the description of the
> problem here is accurate.
>
> Isn't one problem also udev racing with ndctl?
>
Yeah, there's a bunch of races; for the specific ones mentioned by Hannes
I need to go back and re-listen to the talk.
> > I preferred just failing if the block wasn't compatible with
> > the zone (maybe making it more clear with a dmesg print?)
>
> The thing is that this block is compatible with the zone, no?
>
> In a system where you would never want to offline that memory, why should we
> stop someone from onlining it to a kernel zone? I'm sure someone with a
> weird use case will show up later that will complain about this.
>
Presumably you wouldn't be setting the MHP flag that prevents the blocks
from being onlined in a kernel zone then - in which case this all just
works as intended today.
> But the patch is missing details on who would actually set MHP_MOVABLE_ONLY.
> A user should be posted alongside the core change.
>
This is fair and probably the obvious immediate user would be a dax
device with some kind of `dax0.0/protect_unplug` feature set.
(With a better name obviously).
I will defer to Hannes on his specific use case, but I could see the
CXL-DCD (Dynamic Capacity) set wanting something like this.
> >
> > Anyway, let me know what your preference is, happy to pivot however.
>
> Restricting memory to be movable-only to handle a user-space problem as
> described here sounds like the wrong approach to me. You really want the
> default of such memory to be "movable".
>
> Almost like an optimized "auto-movable" policy :)
>
> Or a new policy that will respect a provided default (MHP_DEFAULT_MOVABLE).
>
Fair, I'll revisit this once Hannes gets a chance to chime in.
This was effective at getting the discussion started though :P
~Gregory
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 18:06 ` Gregory Price
@ 2026-01-06 18:38 ` David Hildenbrand (Red Hat)
2026-01-06 19:59 ` Gregory Price
2026-01-08 7:21 ` Hannes Reinecke
2026-01-08 7:22 ` Hannes Reinecke
2 siblings, 1 reply; 20+ messages in thread
From: David Hildenbrand (Red Hat) @ 2026-01-06 18:38 UTC (permalink / raw)
To: Gregory Price, hare
Cc: linux-mm, linux-kernel, kernel-team, osalvador, gregkh, rafael,
dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
mhocko
On 1/6/26 19:06, Gregory Price wrote:
> On Tue, Jan 06, 2026 at 06:52:11PM +0100, David Hildenbrand (Red Hat) wrote:
>> On 1/6/26 17:58, Gregory Price wrote:
>>> On Tue, Jan 06, 2026 at 04:24:21PM +0100, David Hildenbrand (Red Hat) wrote:
>>>
>>> I'm not against this idea, but it also makes the sysfs a little more
>>> confusing (`echo online` now does different things based on prior
>>> state).
>>
>> Right, but only for the contig-zones policy.
>>
>> But maybe you really want the default for such memory to be "movable" even
>> when not onlined beforehand? So I am not sure if the description of the
>> problem here is accurate.
>>
>> Isn't one problem also udev racing with ndctl?
>>
>
> Yeah there's a bunch of races, the specific ones mentioned by Hannes i
> need to go back and re-listen to the talk.
>
>>> I preferred just failing if the block wasn't compatible with
>>> the zone (maybe making it more clear with a dmesg print?)
>>
>> The thing is that this block is compatible with the zone, no?
>>
>> In a system where you would never want to offline that memory, why should we
>> stop someone from onlining it to a kernel zone? I'm sure someone with a
>> weird use case will show up later that will complain about this.
>>
>
> Presumably you wouldn't be setting the MHP flag that prevents the blocks
> from being onlined in a kernel zone then - in which case this all just
> works as intended today.
>
>> But the patch is missing details on who would actually set MHP_MOVABLE_ONLY.
>> A user should be posted alongside the core change.
>>
>
> This is fair and probably the obvious immediate user would be a dax
> device with some kind of `dax0.0/protect_unplug` feature set.
> (With a better name obviously).
>
> I will defer to Hannes on his specific use case, but I could see the
> CXL-DCD (Dynamic Capacity) set wanting something like this.
>
>>>
>>> Anyway, let me know what your preference is, happy to pivot however.
>>
>> Restricting memory to be movable-only to handle a user-space problem as
>> described here sounds like the wrong approach to me. You really want the
>> default of such memory to be "movable".
>>
>> Almost like an optimized "auto-movable" policy :)
>>
>> Or a new policy that will respect a provided default (MHP_DEFAULT_MOVABLE).
>>
>
> Fair, I'll revisit this once Hannes gets a chance to chime in.
>
> This was effective at getting the discussion started though :P
Hehe, yes.
Another thing to look into would be to provide a way for ndctl to just
add+online the memory in one shot, without having to go back to walking
memory blocks to online them etc.
After all, ndctl knows exactly what it wants, as configured by user space.
Something like "dax0.0/online_mode" (or however we can make it clearer
that this is for the system ram mode), which would default to "offline"
(what we have right now).
When set to "online_movable", we'd online the memory to online_movable
right during add_memory(). So no races with udev and no manual onlining
necessary.
One could also envision a mechanism for ndctl to
offline_and_remove_memory() memory, instead of manually offlining it, to
then race with somebody else wanting to reonline it.
--
Cheers
David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 16:53 ` Gregory Price
@ 2026-01-06 19:49 ` Michal Hocko
2026-01-07 12:47 ` Hannes Reinecke
2026-01-07 15:09 ` David Hildenbrand (Red Hat)
0 siblings, 2 replies; 20+ messages in thread
From: Michal Hocko @ 2026-01-06 19:49 UTC (permalink / raw)
To: Gregory Price
Cc: linux-mm, linux-kernel, kernel-team, david, osalvador, gregkh,
rafael, dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, hare
On Tue 06-01-26 11:53:30, Gregory Price wrote:
> On Tue, Jan 06, 2026 at 04:05:48PM +0100, Michal Hocko wrote:
> > On Mon 05-01-26 15:36:11, Gregory Price wrote:
> > > It was reported (LPC 2025) that userland services which monitor memory
> > > blocks can cause hot-unplug to fail permanently.
> > >
> > > This can occur when drivers attempt to hot-remove memory in two phases
> > > (offline, remove), while a userland service detects the memory offline
> > > and re-onlines the memory into a zone which may prevent removal.
> >
> > Are there more details about this?
>
> The details are with Hannes, I was just recapping what was described in
> his devmem talk at LPC ("To online or not online").
I know of policies to online newly added memory blocks but I am not
aware of policies to re-online something that has been made offline.
> > That being said, rather than movable_only, should we have a mask of
> > online types supported for the mem block?
> >
>
> I briefly considered this. I went with this for RFC-v1 since it's
> fairly simple and because movable is really the only zone with hotplug
> guarantees (any other zone makes no hotplug guarantees).
>
> It's also significantly more complex of a change for questionable value,
> but if people see this as the way to go i'll happily pivot to that.
Sure, I wouldn't push for more complexity just for the sake of a
theoretical extensibility. And I have to admit I haven't tried a quick
PoC to see how complex this could grow. I was hoping this could get into
a simple mask for online types with default MMOP_ONLINE_KERNEL|MMOP_ONLINE_MOVABLE
and special cases just choosing one of the two and zone_for_pfn_range
checking for the compatibility with the requested online type. But I do
appreciate there might be some obstacles on the way to achieve that.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 18:38 ` David Hildenbrand (Red Hat)
@ 2026-01-06 19:59 ` Gregory Price
2026-01-06 20:22 ` David Hildenbrand (Red Hat)
0 siblings, 1 reply; 20+ messages in thread
From: Gregory Price @ 2026-01-06 19:59 UTC (permalink / raw)
To: David Hildenbrand (Red Hat)
Cc: hare, linux-mm, linux-kernel, kernel-team, osalvador, gregkh,
rafael, dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, mhocko
On Tue, Jan 06, 2026 at 07:38:54PM +0100, David Hildenbrand (Red Hat) wrote:
> On 1/6/26 19:06, Gregory Price wrote:
> > On Tue, Jan 06, 2026 at 06:52:11PM +0100, David Hildenbrand (Red Hat) wrote:
> > > On 1/6/26 17:58, Gregory Price wrote:
> >
> > Fair, I'll revisit this once Hannes gets a chance to chime in.
> >
> > This was effective at getting the discussion started though :P
>
> Hehe, yes.
>
> Another thing to look into would be to provide a way for ndctl to just
> add+online the memory in one shot, without having to go back to walking
> memory blocks to online them etc.
>
I think it's the opposite: offline+remove needing to be done in one step
while holding the hotplug lock. Right now, I think you have to do
something like
daxctl offline-memory ...
daxctl destroy ...
You can't destroy and have it offline the memory for you in one go IIRC.
Maybe that's the ACTUAL fix here?
~Gregory
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 19:59 ` Gregory Price
@ 2026-01-06 20:22 ` David Hildenbrand (Red Hat)
2026-01-08 7:31 ` Hannes Reinecke
0 siblings, 1 reply; 20+ messages in thread
From: David Hildenbrand (Red Hat) @ 2026-01-06 20:22 UTC (permalink / raw)
To: Gregory Price
Cc: hare, linux-mm, linux-kernel, kernel-team, osalvador, gregkh,
rafael, dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, mhocko
On 1/6/26 20:59, Gregory Price wrote:
> On Tue, Jan 06, 2026 at 07:38:54PM +0100, David Hildenbrand (Red Hat) wrote:
>> On 1/6/26 19:06, Gregory Price wrote:
>>> On Tue, Jan 06, 2026 at 06:52:11PM +0100, David Hildenbrand (Red Hat) wrote:
>>>> On 1/6/26 17:58, Gregory Price wrote:
>>>
>>> Fair, I'll revisit this once Hannes gets a chance to chime in.
>>>
>>> This was effective at getting the discussion started though :P
>>
>> Hehe, yes.
>>
>> Another thing to look into would be to provide a way for ndctl to just
>> add+online the memory in one shot, without having to go back to walking
>> memory blocks to online them etc.
>>
>
> I think it's the opposite: offline+remove needing to be done in one step
> while holding the hotplug lock. Right now, I think you have to do
> something like
That's what I note below, yes.
For the udev vs. ndctl race to be handled in a
good way you need add+online to be done in one operation.
>
> daxctl offline-memory ...
> daxctl destroy ...
>
> You can't destroy and have it offline the memory for you in one go IIRC.
As noted below, we have offline_and_remove_memory().
I added the comment:
/*
* Try to offline and remove memory. Might take a long time to finish in case
* memory is still in use. Primarily useful for memory devices that logically
* unplugged all memory (so it's no longer in use) and want to offline + remove
* that memory.
*/
Nothing speaks against letting dax use that, but the tricky part is that
offlining might take forever, so one has to be prepared to handle that
(and letting user space cancel the operation).
And for dax devices that consist of multiple ranges, it can be "fun" having
some regions removed and others not.
Something to think about :)
--
Cheers
David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 19:49 ` Michal Hocko
@ 2026-01-07 12:47 ` Hannes Reinecke
2026-01-07 17:17 ` Michal Hocko
2026-01-07 15:09 ` David Hildenbrand (Red Hat)
1 sibling, 1 reply; 20+ messages in thread
From: Hannes Reinecke @ 2026-01-07 12:47 UTC (permalink / raw)
To: Michal Hocko, Gregory Price
Cc: linux-mm, linux-kernel, kernel-team, david, osalvador, gregkh,
rafael, dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb
On 1/6/26 20:49, Michal Hocko wrote:
> On Tue 06-01-26 11:53:30, Gregory Price wrote:
>> On Tue, Jan 06, 2026 at 04:05:48PM +0100, Michal Hocko wrote:
>>> On Mon 05-01-26 15:36:11, Gregory Price wrote:
>>>> It was reported (LPC 2025) that userland services which monitor memory
>>>> blocks can cause hot-unplug to fail permanently.
>>>>
>>>> This can occur when drivers attempt to hot-remove memory in two phases
>>>> (offline, remove), while a userland service detects the memory offline
>>>> and re-onlines the memory into a zone which may prevent removal.
>>>
>>> Are there more details about this?
>>
>> The details are with Hannes, I was just recapping what was described in
>> his devmem talk at LPC ("To online or not online").
>
> I know of policies to online newly added memory blocks but I am not
> aware of policies to re-online something that has been made offline.
>
It's not a policy per-se, but rather a udev rule (which one could
argue _is_ a policy, mind). There is a rather long-running SLES bug
around this if you are interested...
But in either case: we cannot prevent the user from writing arbitrary
udev rules. But we should make sure that the result of udev actions
makes sense for the system.
>>> That being said, rather than movable_only, should we have a mask of
>>> online types supported for the mem block?
>>>
>>
>> I briefly considered this. I went with this for RFC-v1 since it's
>> fairly simple and because movable is really the only zone with hotplug
>> guarantees (any other zone makes no hotplug guarantees).
>>
>> It's also significantly more complex of a change for questionable value,
>> but if people see this as the way to go i'll happily pivot to that.
>
> Sure, I wouldn't push for more complexity just for the sake of a
> theoretical extensibility. And I have to admit I haven't tried a quick
> PoC to see how complex this could grow. I was hoping this could get into
> a simple mask for online types with default MMOP_ONLINE_KERNEL|MMOP_ONLINE_MOVABLE
> and special cases just choosing one of the two and zone_for_pfn_range
> checking for the compatibility with the requested online type. But I do
> appreciate there might be some obstacles on the way to achieve that.
Yes, and really it's only ZONE_MOVABLE for which such a treatment
makes sense currently. Once we have other zone types we might need
to re-evaluate that. But for now I guess we're fine with a simple flag.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 19:49 ` Michal Hocko
2026-01-07 12:47 ` Hannes Reinecke
@ 2026-01-07 15:09 ` David Hildenbrand (Red Hat)
2026-01-07 16:00 ` Gregory Price
2026-01-07 17:19 ` Michal Hocko
1 sibling, 2 replies; 20+ messages in thread
From: David Hildenbrand (Red Hat) @ 2026-01-07 15:09 UTC (permalink / raw)
To: Michal Hocko, Gregory Price
Cc: linux-mm, linux-kernel, kernel-team, osalvador, gregkh, rafael,
dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
hare
On 1/6/26 20:49, Michal Hocko wrote:
> On Tue 06-01-26 11:53:30, Gregory Price wrote:
>> On Tue, Jan 06, 2026 at 04:05:48PM +0100, Michal Hocko wrote:
>>> On Mon 05-01-26 15:36:11, Gregory Price wrote:
>>>> It was reported (LPC 2025) that userland services which monitor memory
>>>> blocks can cause hot-unplug to fail permanently.
>>>>
>>>> This can occur when drivers attempt to hot-remove memory in two phases
>>>> (offline, remove), while a userland service detects the memory offline
>>>> and re-onlines the memory into a zone which may prevent removal.
>>>
>>> Are there more details about this?
>>
>> The details are with Hannes, I was just recapping what was described in
>> his devmem talk at LPC ("To online or not online").
>
> I know of policies to online newly added memory blocks but I am not
> aware of policies to re-online something that has been made offline.
>
>>> That being said, rather than movable_only, should we have a mask of
>>> online types supported for the mem block?
>>>
>>
>> I briefly considered this. I went with this for RFC-v1 since it's
>> fairly simple and because movable is really the only zone with hotplug
>> guarantees (any other zone makes no hotplug guarantees).
>>
>> It's also significantly more complex of a change for questionable value,
>> but if people see this as the way to go i'll happily pivot to that.
>
> Sure, I wouldn't push for more complexity just for the sake of a
> theoretical extensibility. And I have to admit I haven't tried a quick
> PoC to see how complex this could grow. I was hoping this could get into
> a simple mask for online types with default MMOP_ONLINE_KERNEL|MMOP_ONLINE_MOVABLE
> and special cases just choosing one of the two and zone_for_pfn_range
> checking for the compatibility with the requested online type. But I do
> appreciate there might be some obstacles on the way to achieve that.
If we want to go down that path of failing onlining, we could likely do
without any core-MM changes: dax can simply register a memory notifier
and fail MEM_GOING_ONLINE of its memory with -EINVAL when it sees
!ZONE_MOVABLE.
That works, because online_pages() does the move_pfn_range_to_zone()
before calling MEM_GOING_ONLINE.
--
Cheers
David
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-07 15:09 ` David Hildenbrand (Red Hat)
@ 2026-01-07 16:00 ` Gregory Price
2026-01-07 17:19 ` Michal Hocko
1 sibling, 0 replies; 20+ messages in thread
From: Gregory Price @ 2026-01-07 16:00 UTC (permalink / raw)
To: David Hildenbrand (Red Hat)
Cc: Michal Hocko, linux-mm, linux-kernel, kernel-team, osalvador,
gregkh, rafael, dakr, akpm, lorenzo.stoakes, Liam.Howlett,
vbabka, rppt, surenb, hare
On Wed, Jan 07, 2026 at 04:09:34PM +0100, David Hildenbrand (Red Hat) wrote:
> > Sure, I wouldn't push for more complexity just for the sake of a
> > theoretical extensibility. And I have to admit I haven't tried a quick
> > PoC to see how complex this could grow. I was hoping this could get into
> > a simple mask for online types with default MMOP_ONLINE_KERNEL|MMOP_ONLINE_MOVABLE
> > and special cases just choosing one of the two and zone_for_pfn_range
> > checking for the compatibility with the requested online type. But I do
> > appreciate there might be some obstacles on the way to achieve that.
>
> If we want to go down that path of failing onlining, we could likely do
> without any core-MM changes: dax can simply register a memory notifier and
> fail MEM_GOING_ONLINE of its memory with -EINVAL when it sees !ZONE_MOVABLE.
>
> That works, because online_pages() does the move_pfn_range_to_zone() before
> calling MEM_GOING_ONLINE.
>
This would be clean, and we could add a switch in dax-kmem for something
like hotunplug=true which limits zone eligibility to ZONE_MOVABLE.
I can look at this next week or so.
~Gregory
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-07 12:47 ` Hannes Reinecke
@ 2026-01-07 17:17 ` Michal Hocko
0 siblings, 0 replies; 20+ messages in thread
From: Michal Hocko @ 2026-01-07 17:17 UTC (permalink / raw)
To: Hannes Reinecke
Cc: Gregory Price, linux-mm, linux-kernel, kernel-team, david,
osalvador, gregkh, rafael, dakr, akpm, lorenzo.stoakes,
Liam.Howlett, vbabka, rppt, surenb
On Wed 07-01-26 13:47:41, Hannes Reinecke wrote:
> On 1/6/26 20:49, Michal Hocko wrote:
> > On Tue 06-01-26 11:53:30, Gregory Price wrote:
> > > On Tue, Jan 06, 2026 at 04:05:48PM +0100, Michal Hocko wrote:
> > > > On Mon 05-01-26 15:36:11, Gregory Price wrote:
> > > > > It was reported (LPC 2025) that userland services which monitor memory
> > > > > blocks can cause hot-unplug to fail permanently.
> > > > >
> > > > > This can occur when drivers attempt to hot-remove memory in two phases
> > > > > (offline, remove), while a userland service detects the memory offline
> > > > > and re-onlines the memory into a zone which may prevent removal.
> > > >
> > > > Are there more details about this?
> > >
> > > The details are with Hannes, I was just recapping what was described in
> > > his devmem talk at LPC ("To online or not online").
> >
> > I know of policies to online newly added memory blocks but I am not
> > aware of policies to re-online something that has been made offline.
> It's not a policy per-se, but rather a udev rule (which one could
> argue _is_ a policy, mind). There is a rather long-running SLES bug
> around this if you are interested...
I am aware of udev rules which automatically online memory that is hot
added. But I am not aware of any rules to re-online memory that has been
offlined. The former makes some sense while the latter makes very little
sense to me.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-07 15:09 ` David Hildenbrand (Red Hat)
2026-01-07 16:00 ` Gregory Price
@ 2026-01-07 17:19 ` Michal Hocko
1 sibling, 0 replies; 20+ messages in thread
From: Michal Hocko @ 2026-01-07 17:19 UTC (permalink / raw)
To: David Hildenbrand (Red Hat)
Cc: Gregory Price, linux-mm, linux-kernel, kernel-team, osalvador,
gregkh, rafael, dakr, akpm, lorenzo.stoakes, Liam.Howlett,
vbabka, rppt, surenb, hare
On Wed 07-01-26 16:09:34, David Hildenbrand wrote:
> On 1/6/26 20:49, Michal Hocko wrote:
> > On Tue 06-01-26 11:53:30, Gregory Price wrote:
> > > On Tue, Jan 06, 2026 at 04:05:48PM +0100, Michal Hocko wrote:
> > > > On Mon 05-01-26 15:36:11, Gregory Price wrote:
> > > > > It was reported (LPC 2025) that userland services which monitor memory
> > > > > blocks can cause hot-unplug to fail permanently.
> > > > >
> > > > > This can occur when drivers attempt to hot-remove memory in two phases
> > > > > (offline, remove), while a userland service detects the memory offline
> > > > > and re-onlines the memory into a zone which may prevent removal.
> > > >
> > > > Are there more details about this?
> > >
> > > The details are with Hannes, I was just recapping what was described in
> > > his devmem talk at LPC ("To online or not online").
> >
> > I know of policies to online newly added memory blocks but I am not
> > aware of policies to re-online something that has been made offline.
> > > > That being said, rather than movable_only, should we have a mask of
> > > > online types supported for the mem block?
> > > >
> > >
> > > I briefly considered this. I went with this for RFC-v1 since it's
> > > fairly simple and because movable is really the only zone with hotplug
> > > guarantees (any other zone makes no hotplug guarantees).
> > >
> > > It's also significantly more complex of a change for questionable value,
> > > but if people see this as the way to go i'll happily pivot to that.
> >
> > Sure, I wouldn't push for more complexity just for the sake of a
> > > theoretical extensibility. And I have to admit I haven't tried a quick
> > PoC to see how complex this could grow. I was hoping this could get into
> > a simple mask for online types with default MMOP_ONLINE_KERNEL|MMOP_ONLINE_MOVABLE
> > and special cases just choosing one of the two and zone_for_pfn_range
> > checking for the compatibility with the requested online type. But I do
> > appreciate there might be some obstacles on the way to achieve that.
>
> If we want to go down that path of failing onlining, we could likely do
> without any core-MM changes: dax can simply register a memory notifier and
> fail MEM_GOING_ONLINE of its memory with -EINVAL when it sees !ZONE_MOVABLE.
>
> That works, because online_pages() does the move_pfn_range_to_zone() before
> calling MEM_GOING_ONLINE.
Yes, that makes sense as well and it seems a rather elegant way to go
about that.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 18:06 ` Gregory Price
2026-01-06 18:38 ` David Hildenbrand (Red Hat)
@ 2026-01-08 7:21 ` Hannes Reinecke
2026-01-08 7:22 ` Hannes Reinecke
2 siblings, 0 replies; 20+ messages in thread
From: Hannes Reinecke @ 2026-01-08 7:21 UTC (permalink / raw)
To: Gregory Price, David Hildenbrand (Red Hat)
Cc: linux-mm, linux-kernel, kernel-team, osalvador, gregkh, rafael,
dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
mhocko
On 1/6/26 19:06, Gregory Price wrote:
> On Tue, Jan 06, 2026 at 06:52:11PM +0100, David Hildenbrand (Red Hat) wrote:
>> On 1/6/26 17:58, Gregory Price wrote:
>>> On Tue, Jan 06, 2026 at 04:24:21PM +0100, David Hildenbrand (Red Hat) wrote:
>>>
>>> I'm not against this idea, but it also makes the sysfs a little more
>>> confusing (`echo online` now does different things based on prior
>>> state).
>>
>> Right, but only for the contig-zones policy.
>>
>> But maybe you really want the default for such memory to be "movable" even
>> when not onlined beforehand? So I am not sure if the description of the
>> problem here is accurate.
>>
>> Isn't one problem also udev racing with ndctl?
>>
>
> Yeah there's a bunch of races, the specific ones mentioned by Hannes i
> need to go back and re-listen to the talk.
>
udev racing with ndctl is a general problem, and not specific to
zone movable. We definitely should be looking into that.
>>> I preferred just failing if the block wasn't compatible with
>>> the zone (maybe making it more clear with a dmesg print?)
>>
>> The thing is that this block is compatible with the zone, no?
>>
>> In a system where you would never want to offline that memory, why should we
>> stop someone from onlining it to a kernel zone? I'm sure someone with a
>> weird use case will show up later that will complain about this.
>>
>
> Presumably you wouldn't be setting the MHP flag that prevents the blocks
> from being onlined in a kernel zone then - in which case this all just
> works as intended today.
>
This is _not_ about never wanting to offline memory.
Quite the contrary, actually: we assume that CXL memory is
hotpluggable, and as such online and offline will happen.
What we need to ensure, though, is that all memory on CXL
always lands in zone movable.
>> But the patch is missing details on who would actually set MHP_MOVABLE_ONLY.
>> A user should be posted alongside the core change.
>>
>
> This is fair and probably the obvious immediate user would be a dax
> device with some kind of `dax0.0/protect_unplug` feature set.
> (With a better name obviously).
>
> I will defer to Hannes on his specific use case, but I could see the
> CXL-DCD (Dynamic Capacity) set wanting something like this.
>
The specific use-case is CXL: all memory on CXL should be onlined
to zone movable. The mechanism invoking that (be it udev or ndctl)
is secondary.
>>>
>>> Anyway, let me know what your preference is, happy to pivot however.
>>
>> Restricting memory to be movable-only to handle a user-space problem as
>> described here sounds like the wrong approach to me. You really want the
>> default of such memory to be "movable".
>>
>> Almost like an optimized "auto-movable" policy :)
>>
>> Or a new policy that will respect a provided default (MHP_DEFAULT_MOVABLE).
>>
>
> Fair, I'll revisit this once Hannes gets a chance to chime in.
>
> This was effective at getting the discussion started though :P
>
Oh, definitely :-)
Thanks for getting started on this.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 18:06 ` Gregory Price
2026-01-06 18:38 ` David Hildenbrand (Red Hat)
2026-01-08 7:21 ` Hannes Reinecke
@ 2026-01-08 7:22 ` Hannes Reinecke
2 siblings, 0 replies; 20+ messages in thread
From: Hannes Reinecke @ 2026-01-08 7:22 UTC (permalink / raw)
To: Gregory Price, David Hildenbrand (Red Hat)
Cc: linux-mm, linux-kernel, kernel-team, osalvador, gregkh, rafael,
dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
mhocko
On 1/6/26 19:06, Gregory Price wrote:
> On Tue, Jan 06, 2026 at 06:52:11PM +0100, David Hildenbrand (Red Hat) wrote:
>> On 1/6/26 17:58, Gregory Price wrote:
>>> On Tue, Jan 06, 2026 at 04:24:21PM +0100, David Hildenbrand (Red Hat) wrote:
>>>
>>> I'm not against this idea, but it also makes the sysfs a little more
>>> confusing (`echo online` now does different things based on prior
>>> state).
>>
>> Right, but only for the contig-zones policy.
>>
>> But maybe you really want the default for such memory to be "movable" even
>> when not onlined beforehand? So I am not sure if the description of the
>> problem here is accurate.
>>
>> Isn't one problem also udev racing with ndctl?
>>
>
> Yeah there's a bunch of races, the specific ones mentioned by Hannes i
> need to go back and re-listen to the talk.
>
udev racing with ndctl is a general problem, and not specific to
zone movable. We definitely should be looking into that.
>>> I preferred just failing if the block wasn't compatible with
>>> the zone (maybe making it more clear with a dmesg print?)
>>
>> The thing is that this block is compatible with the zone, no?
>>
>> In a system where you would never want to offline that memory, why should we
>> stop someone from onlining it to a kernel zone? I'm sure someone with a
>> weird use case will show up later that will complain about this.
>>
>
> Presumably you wouldn't be setting the MHP flag that prevents the blocks
> from being onlined in a kernel zone then - in which case this all just
> works as intended today.
>
This is _not_ about never wanting to offline memory.
Quite the contrary, actually: we assume that CXL memory is
hotpluggable, and as such online and offline will happen.
What we need to ensure, though, is that all memory on CXL
always lands in zone movable.
>> But the patch is missing details on who would actually set MHP_MOVABLE_ONLY.
>> A user should be posted alongside the core change.
>>
>
> This is fair and probably the obvious immediate user would be a dax
> device with some kind of `dax0.0/protect_unplug` feature set.
> (With a better name obviously).
>
> I will defer to Hannes on his specific use case, but I could see the
> CXL-DCD (Dynamic Capacity) set wanting something like this.
>
The specific use-case is CXL: all memory on CXL should be onlined
to zone movable. The mechanism invoking that (be it udev or ndctl)
is secondary.
>>>
>>> Anyway, let me know what your preference is, happy to pivot however.
>>
>> Restricting memory to be movable-only to handle a user-space problem as
>> described here sounds like the wrong approach to me. You really want the
>> default of such memory to be "movable".
>>
>> Almost like an optimized "auto-movable" policy :)
>>
>> Or a new policy that will respect a provided default (MHP_DEFAULT_MOVABLE).
>>
>
> Fair, I'll revisit this once Hannes gets a chance to chime in.
>
> This was effective at getting the discussion started though :P
>
Oh, definitely :-)
Thanks for getting started on this.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-06 20:22 ` David Hildenbrand (Red Hat)
@ 2026-01-08 7:31 ` Hannes Reinecke
2026-01-08 14:16 ` David Hildenbrand (Red Hat)
0 siblings, 1 reply; 20+ messages in thread
From: Hannes Reinecke @ 2026-01-08 7:31 UTC (permalink / raw)
To: David Hildenbrand (Red Hat), Gregory Price
Cc: linux-mm, linux-kernel, kernel-team, osalvador, gregkh, rafael,
dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
mhocko
On 1/6/26 21:22, David Hildenbrand (Red Hat) wrote:
> On 1/6/26 20:59, Gregory Price wrote:
>> On Tue, Jan 06, 2026 at 07:38:54PM +0100, David Hildenbrand (Red Hat)
>> wrote:
>>> On 1/6/26 19:06, Gregory Price wrote:
>>>> On Tue, Jan 06, 2026 at 06:52:11PM +0100, David Hildenbrand (Red
>>>> Hat) wrote:
>>>>> On 1/6/26 17:58, Gregory Price wrote:
>>>>
>>>> Fair, I'll revisit this once Hannes gets a chance to chime in.
>>>>
>>>> This was effective at getting the discussion started though :P
>>>
>>> Hehe, yes.
>>>
>>> Another thing to look into would be to provide a way for ndctl to just
>>> add+online the memory in one shot, without having to go back to walking
>>> memory blocks to online them etc.
>>>
>>
>> I think it's the opposite: offline+remove needing to be done in one step
>> while holding the hotplug lock. Right now, I think you have to do
>> something like
>
> That's what I note below, yes.
>
> For the udev vs. ndctl race to be handled in a
> good way you need add+online be done in one operation.
>
>>
>> daxctl offline-memory ...
>> daxctl destroy ...
>>
>> You can't destroy and have it offline the memory for you in one go IIRC.
>
> As noted below, we have offline_and_remove_memory().
>
> I added the comment:
>
> /*
> * Try to offline and remove memory. Might take a long time to finish
> in case
> * memory is still in use. Primarily useful for memory devices that
> logically
> * unplugged all memory (so it's no longer in use) and want to offline
> + remove
> * that memory.
> */
>
> Nothing speaks against letting dax use that, but the tricky part is that
> offlining might take forever, so one has to be prepared to handle that
> (and letting user space cancel the operation).
>
> And for dax devices that consist of multiple ranges, it can be "fun" having
> some regions removed and others not.
>
> Something to think about :)
>
We had this discussion at LPC. The current interface of having to
individually offline every single memory block is not very
user-friendly. While it provides the best possible granularity, it
really only makes sense for virtual environments where you _can_
hotplug individual blocks.
For hardware-based scenarios memory will always be removed in
larger entities (eg the CXL device), and it's always an 'all-or-nothing'
scenario; you cannot remove individual memory blocks on a CXL device.
So there the memory block abstraction makes less sense, and it
would be good to have a single 'knob' to remove the entire CXL
device and all memory blocks on it.
Sure, it might take some time, but one doesn't need to worry about
restoring the original state if the operation on one block fails.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable
2026-01-08 7:31 ` Hannes Reinecke
@ 2026-01-08 14:16 ` David Hildenbrand (Red Hat)
0 siblings, 0 replies; 20+ messages in thread
From: David Hildenbrand (Red Hat) @ 2026-01-08 14:16 UTC (permalink / raw)
To: Hannes Reinecke, Gregory Price
Cc: linux-mm, linux-kernel, kernel-team, osalvador, gregkh, rafael,
dakr, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb,
mhocko
On 1/8/26 08:31, Hannes Reinecke wrote:
> On 1/6/26 21:22, David Hildenbrand (Red Hat) wrote:
>> On 1/6/26 20:59, Gregory Price wrote:
>>> On Tue, Jan 06, 2026 at 07:38:54PM +0100, David Hildenbrand (Red Hat)
>>> wrote:
>>>> On 1/6/26 19:06, Gregory Price wrote:
>>>>> On Tue, Jan 06, 2026 at 06:52:11PM +0100, David Hildenbrand (Red
>>>>> Hat) wrote:
>>>>>> On 1/6/26 17:58, Gregory Price wrote:
>>>>>
>>>>> Fair, I'll revisit this once Hannes gets a chance to chime in.
>>>>>
>>>>> This was effective at getting the discussion started though :P
>>>>
>>>> Hehe, yes.
>>>>
>>>> Another thing to look into would be to provide a way for ndctl to just
>>>> add+online the memory in one shot, without having to go back to walking
>>>> memory blocks to online them etc.
>>>>
>>>
>>> I think it's the opposite: offline+remove needing to be done in one step
>>> while holding the hotplug lock. Right now, I think you have to do
>>> something like
>>
>> That's what I note below, yes.
>>
>> For the udev vs. ndctl race to be handled in a
>> good way you need add+online to be done in one operation.
>>
>>>
>>> daxctl offline-memory ...
>>> daxctl destroy ...
>>>
>>> You can't destroy and have it offline the memory for you in one go IIRC.
>>
>> As noted below, we have offline_and_remove_memory().
>>
>> I added the comment:
>>
>> /*
>> * Try to offline and remove memory. Might take a long time to finish in case
>> * memory is still in use. Primarily useful for memory devices that logically
>> * unplugged all memory (so it's no longer in use) and want to offline + remove
>> * that memory.
>> */
>>
>> Nothing speaks against letting dax use that, but the tricky part is that
>> offlining might take forever, so one has to be prepared to handle that
>> (and letting user space cancel the operation).
>>
>> And for dax devices that consist of multiple ranges, it can be "fun" having
>> some regions removed and others not.
>>
>> Something to think about :)
>>
> We had this discussion at LPC. The current interface of having to
> individually offline every single memory block is not very
> user-friendly. While it provides the best possible granularity, it
> really only makes sense for virtual environments where you _can_
> hotplug individual blocks.
Yes.
> For hardware-based scenarios memory will always be removed in
> larger entities (eg the CXL device), and it's always an 'all-or-nothing'
> scenario; you cannot remove individual memory blocks on a CXL device.
> So there the memory block abstraction makes less sense, and it
> would be good to have a single 'knob' to remove the entire CXL
> device and all memory blocks on it.
> Sure, it might take some time, but one doesn't need to worry about
> restoring the original state if the operation on one block fails.
That's not what I was getting at:
offline_and_remove_memory() can be called on large regions, and it
properly handles whether we have to back out because some offlining failed.
The issue arises once dax would have to call offline_and_remove_memory()
multiple times, on non-contiguous areas. Of course, we could handle that
by providing an interface that consumes multiple memory ranges.
For the DAX use case, I think we'd really want a way to just use
* add_and_online_memory() [does not exist yet, but ppc does something
similar]
* offline_and_remove_memory()
And not have user space worry about onlining/offlining of
memory at all.
Of course, that will require some new plumbing for ndctl to make use of
this functionality.
--
Cheers
David
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2026-01-08 14:16 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-05 20:36 [RFC PATCH] memory,memory_hotplug: allow restricting memory blocks to zone movable Gregory Price
2026-01-06 15:05 ` Michal Hocko
2026-01-06 16:53 ` Gregory Price
2026-01-06 19:49 ` Michal Hocko
2026-01-07 12:47 ` Hannes Reinecke
2026-01-07 17:17 ` Michal Hocko
2026-01-07 15:09 ` David Hildenbrand (Red Hat)
2026-01-07 16:00 ` Gregory Price
2026-01-07 17:19 ` Michal Hocko
2026-01-06 15:24 ` David Hildenbrand (Red Hat)
2026-01-06 16:58 ` Gregory Price
2026-01-06 17:52 ` David Hildenbrand (Red Hat)
2026-01-06 18:06 ` Gregory Price
2026-01-06 18:38 ` David Hildenbrand (Red Hat)
2026-01-06 19:59 ` Gregory Price
2026-01-06 20:22 ` David Hildenbrand (Red Hat)
2026-01-08 7:31 ` Hannes Reinecke
2026-01-08 14:16 ` David Hildenbrand (Red Hat)
2026-01-08 7:21 ` Hannes Reinecke
2026-01-08 7:22 ` Hannes Reinecke
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox