* [PATCH v4 1/4] kho: check if kho is finalized in __kho_preserve_order()
2025-09-17 17:40 [PATCH v4 0/4] kho: add support for preserving vmalloc allocations Mike Rapoport
@ 2025-09-17 17:40 ` Mike Rapoport
2025-09-18 10:12 ` Pratyush Yadav
2025-09-17 17:40 ` [PATCH v4 2/4] kho: replace kho_preserve_phys() with kho_preserve_pages() Mike Rapoport
` (2 subsequent siblings)
3 siblings, 1 reply; 12+ messages in thread
From: Mike Rapoport @ 2025-09-17 17:40 UTC (permalink / raw)
To: Andrew Morton
Cc: Alexander Graf, Baoquan He, Changyuan Lyu, Chris Li,
Jason Gunthorpe, Mike Rapoport, Pasha Tatashin, Pratyush Yadav,
kexec, linux-mm, linux-kernel
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Instead of checking if kho is finalized in each caller of
__kho_preserve_order(), do it in the core function itself.
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
kernel/kexec_handover.c | 55 +++++++++++++++++++----------------------
1 file changed, 26 insertions(+), 29 deletions(-)
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 8079fc4b9189..f421acc58c1f 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -91,6 +91,29 @@ struct kho_serialization {
struct khoser_mem_chunk *preserved_mem_map;
};
+struct kho_out {
+ struct blocking_notifier_head chain_head;
+
+ struct dentry *dir;
+
+ struct mutex lock; /* protects KHO FDT finalization */
+
+ struct kho_serialization ser;
+ bool finalized;
+};
+
+static struct kho_out kho_out = {
+ .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
+ .lock = __MUTEX_INITIALIZER(kho_out.lock),
+ .ser = {
+ .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
+ .track = {
+ .orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
+ },
+ },
+ .finalized = false,
+};
+
static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
{
void *elm, *res;
@@ -149,6 +172,9 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
might_sleep();
+ if (kho_out.finalized)
+ return -EBUSY;
+
physxa = xa_load(&track->orders, order);
if (!physxa) {
int err;
@@ -640,29 +666,6 @@ int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
}
EXPORT_SYMBOL_GPL(kho_add_subtree);
-struct kho_out {
- struct blocking_notifier_head chain_head;
-
- struct dentry *dir;
-
- struct mutex lock; /* protects KHO FDT finalization */
-
- struct kho_serialization ser;
- bool finalized;
-};
-
-static struct kho_out kho_out = {
- .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
- .lock = __MUTEX_INITIALIZER(kho_out.lock),
- .ser = {
- .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
- .track = {
- .orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
- },
- },
- .finalized = false,
-};
-
int register_kho_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&kho_out.chain_head, nb);
@@ -690,9 +693,6 @@ int kho_preserve_folio(struct folio *folio)
const unsigned int order = folio_order(folio);
struct kho_mem_track *track = &kho_out.ser.track;
- if (kho_out.finalized)
- return -EBUSY;
-
return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
@@ -716,9 +716,6 @@ int kho_preserve_phys(phys_addr_t phys, size_t size)
int err = 0;
struct kho_mem_track *track = &kho_out.ser.track;
- if (kho_out.finalized)
- return -EBUSY;
-
if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
return -EINVAL;
--
2.50.1
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [PATCH v4 1/4] kho: check if kho is finalized in __kho_preserve_order()
2025-09-17 17:40 ` [PATCH v4 1/4] kho: check if kho is finalized in __kho_preserve_order() Mike Rapoport
@ 2025-09-18 10:12 ` Pratyush Yadav
0 siblings, 0 replies; 12+ messages in thread
From: Pratyush Yadav @ 2025-09-18 10:12 UTC (permalink / raw)
To: Mike Rapoport
Cc: Andrew Morton, Alexander Graf, Baoquan He, Changyuan Lyu,
Chris Li, Jason Gunthorpe, Pasha Tatashin, Pratyush Yadav, kexec,
linux-mm, linux-kernel
On Wed, Sep 17 2025, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> Instead of checking if kho is finalized in each caller of
> __kho_preserve_order(), do it in the core function itself.
>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
[...]
--
Regards,
Pratyush Yadav
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v4 2/4] kho: replace kho_preserve_phys() with kho_preserve_pages()
2025-09-17 17:40 [PATCH v4 0/4] kho: add support for preserving vmalloc allocations Mike Rapoport
2025-09-17 17:40 ` [PATCH v4 1/4] kho: check if kho is finalized in __kho_preserve_order() Mike Rapoport
@ 2025-09-17 17:40 ` Mike Rapoport
2025-09-18 10:32 ` Pratyush Yadav
2025-09-17 17:40 ` [PATCH v4 3/4] kho: add support for preserving vmalloc allocations Mike Rapoport
2025-09-17 17:40 ` [PATCH v4 4/4] lib/test_kho: use kho_preserve_vmalloc instead of storing addresses in fdt Mike Rapoport
3 siblings, 1 reply; 12+ messages in thread
From: Mike Rapoport @ 2025-09-17 17:40 UTC (permalink / raw)
To: Andrew Morton
Cc: Alexander Graf, Baoquan He, Changyuan Lyu, Chris Li,
Jason Gunthorpe, Mike Rapoport, Pasha Tatashin, Pratyush Yadav,
kexec, linux-mm, linux-kernel
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Replace kho_preserve_phys() with kho_preserve_pages() to make it clear
that KHO operates on pages rather than on a random physical address.
kho_preserve_pages() will also be used in the upcoming support for
vmalloc preservation.
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
include/linux/kexec_handover.h | 5 +++--
kernel/kexec_handover.c | 25 +++++++++++--------------
mm/memblock.c | 4 +++-
3 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index 348844cffb13..cc5c49b0612b 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -18,6 +18,7 @@ enum kho_event {
struct folio;
struct notifier_block;
+struct page;
#define DECLARE_KHOSER_PTR(name, type) \
union { \
@@ -42,7 +43,7 @@ struct kho_serialization;
bool kho_is_enabled(void);
int kho_preserve_folio(struct folio *folio);
-int kho_preserve_phys(phys_addr_t phys, size_t size);
+int kho_preserve_pages(struct page *page, unsigned int nr_pages);
struct folio *kho_restore_folio(phys_addr_t phys);
int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
@@ -65,7 +66,7 @@ static inline int kho_preserve_folio(struct folio *folio)
return -EOPNOTSUPP;
}
-static inline int kho_preserve_phys(phys_addr_t phys, size_t size)
+static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages)
{
return -EOPNOTSUPP;
}
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index f421acc58c1f..3ad59c5f9eaa 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -698,26 +698,23 @@ int kho_preserve_folio(struct folio *folio)
EXPORT_SYMBOL_GPL(kho_preserve_folio);
/**
- * kho_preserve_phys - preserve a physically contiguous range across kexec.
- * @phys: physical address of the range.
- * @size: size of the range.
+ * kho_preserve_pages - preserve contiguous pages across kexec
+ * @page: first page in the list.
+ * @nr_pages: number of pages.
*
- * Instructs KHO to preserve the memory range from @phys to @phys + @size
- * across kexec.
+ * Preserve a contiguous list of order 0 pages. Must be restored using
+ * kho_restore_page() on each order 0 page.
*
* Return: 0 on success, error code on failure
*/
-int kho_preserve_phys(phys_addr_t phys, size_t size)
+int kho_preserve_pages(struct page *page, unsigned int nr_pages)
{
- unsigned long pfn = PHYS_PFN(phys);
+ struct kho_mem_track *track = &kho_out.ser.track;
+ const unsigned long start_pfn = page_to_pfn(page);
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn = start_pfn;
unsigned long failed_pfn = 0;
- const unsigned long start_pfn = pfn;
- const unsigned long end_pfn = PHYS_PFN(phys + size);
int err = 0;
- struct kho_mem_track *track = &kho_out.ser.track;
-
- if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
- return -EINVAL;
while (pfn < end_pfn) {
const unsigned int order =
@@ -737,7 +734,7 @@ int kho_preserve_phys(phys_addr_t phys, size_t size)
return err;
}
-EXPORT_SYMBOL_GPL(kho_preserve_phys);
+EXPORT_SYMBOL_GPL(kho_preserve_pages);
/* Handling for debug/kho/out */
diff --git a/mm/memblock.c b/mm/memblock.c
index 117d963e677c..6ec3eaa4e8d1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2516,8 +2516,10 @@ static int reserve_mem_kho_finalize(struct kho_serialization *ser)
for (i = 0; i < reserved_mem_count; i++) {
struct reserve_mem_table *map = &reserved_mem_table[i];
+ struct page *page = phys_to_page(map->start);
+ unsigned int nr_pages = map->size >> PAGE_SHIFT;
- err |= kho_preserve_phys(map->start, map->size);
+ err |= kho_preserve_pages(page, nr_pages);
}
err |= kho_preserve_folio(page_folio(kho_fdt));
--
2.50.1
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [PATCH v4 2/4] kho: replace kho_preserve_phys() with kho_preserve_pages()
2025-09-17 17:40 ` [PATCH v4 2/4] kho: replace kho_preserve_phys() with kho_preserve_pages() Mike Rapoport
@ 2025-09-18 10:32 ` Pratyush Yadav
2025-09-18 11:04 ` Mike Rapoport
0 siblings, 1 reply; 12+ messages in thread
From: Pratyush Yadav @ 2025-09-18 10:32 UTC (permalink / raw)
To: Mike Rapoport
Cc: Andrew Morton, Alexander Graf, Baoquan He, Changyuan Lyu,
Chris Li, Jason Gunthorpe, Pasha Tatashin, Pratyush Yadav, kexec,
linux-mm, linux-kernel
Hi Mike,
On Wed, Sep 17 2025, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> to make it clear that KHO operates on pages rather than on a random
> physical address.
>
> The kho_preserve_pages() will be also used in upcoming support for
> vmalloc preservation.
>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
> include/linux/kexec_handover.h | 5 +++--
> kernel/kexec_handover.c | 25 +++++++++++--------------
> mm/memblock.c | 4 +++-
> 3 files changed, 17 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
> index 348844cffb13..cc5c49b0612b 100644
> --- a/include/linux/kexec_handover.h
> +++ b/include/linux/kexec_handover.h
> @@ -18,6 +18,7 @@ enum kho_event {
>
> struct folio;
> struct notifier_block;
> +struct page;
>
> #define DECLARE_KHOSER_PTR(name, type) \
> union { \
> @@ -42,7 +43,7 @@ struct kho_serialization;
> bool kho_is_enabled(void);
>
> int kho_preserve_folio(struct folio *folio);
> -int kho_preserve_phys(phys_addr_t phys, size_t size);
> +int kho_preserve_pages(struct page *page, unsigned int nr_pages);
> struct folio *kho_restore_folio(phys_addr_t phys);
> int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
> int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
> @@ -65,7 +66,7 @@ static inline int kho_preserve_folio(struct folio *folio)
> return -EOPNOTSUPP;
> }
>
> -static inline int kho_preserve_phys(phys_addr_t phys, size_t size)
> +static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages)
> {
> return -EOPNOTSUPP;
> }
> diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
> index f421acc58c1f..3ad59c5f9eaa 100644
> --- a/kernel/kexec_handover.c
> +++ b/kernel/kexec_handover.c
> @@ -698,26 +698,23 @@ int kho_preserve_folio(struct folio *folio)
> EXPORT_SYMBOL_GPL(kho_preserve_folio);
>
> /**
> - * kho_preserve_phys - preserve a physically contiguous range across kexec.
> - * @phys: physical address of the range.
> - * @size: size of the range.
> + * kho_preserve_pages - preserve contiguous pages across kexec
> + * @page: first page in the list.
> + * @nr_pages: number of pages.
> *
> - * Instructs KHO to preserve the memory range from @phys to @phys + @size
> - * across kexec.
> + * Preserve a contiguous list of order 0 pages. Must be restored using
> + * kho_restore_page() on each order 0 page.
This is not true. The pages are preserved with the maximum order
possible.
while (pfn < end_pfn) {
const unsigned int order =
min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
err = __kho_preserve_order(track, pfn, order);
[...]
So four 0-order pages will be preserved as one 2-order page. Restoring
them as four 0-order pages is wrong. And my proposed patch for checking
the magic [0] will uncover this exact bug.
I think you should either change the logic to always preserve at order
0, or maybe add a kho_restore_pages() that replicates the same order
calculation.
[0] https://lore.kernel.org/lkml/20250917125725.665-2-pratyush@kernel.org/
> *
> * Return: 0 on success, error code on failure
> */
> -int kho_preserve_phys(phys_addr_t phys, size_t size)
> +int kho_preserve_pages(struct page *page, unsigned int nr_pages)
> {
> - unsigned long pfn = PHYS_PFN(phys);
> + struct kho_mem_track *track = &kho_out.ser.track;
> + const unsigned long start_pfn = page_to_pfn(page);
> + const unsigned long end_pfn = start_pfn + nr_pages;
> + unsigned long pfn = start_pfn;
> unsigned long failed_pfn = 0;
> - const unsigned long start_pfn = pfn;
> - const unsigned long end_pfn = PHYS_PFN(phys + size);
> int err = 0;
> - struct kho_mem_track *track = &kho_out.ser.track;
> -
> - if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
> - return -EINVAL;
>
> while (pfn < end_pfn) {
> const unsigned int order =
> @@ -737,7 +734,7 @@ int kho_preserve_phys(phys_addr_t phys, size_t size)
>
> return err;
> }
> -EXPORT_SYMBOL_GPL(kho_preserve_phys);
> +EXPORT_SYMBOL_GPL(kho_preserve_pages);
>
> /* Handling for debug/kho/out */
>
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 117d963e677c..6ec3eaa4e8d1 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -2516,8 +2516,10 @@ static int reserve_mem_kho_finalize(struct kho_serialization *ser)
>
> for (i = 0; i < reserved_mem_count; i++) {
> struct reserve_mem_table *map = &reserved_mem_table[i];
> + struct page *page = phys_to_page(map->start);
> + unsigned int nr_pages = map->size >> PAGE_SHIFT;
>
> - err |= kho_preserve_phys(map->start, map->size);
> + err |= kho_preserve_pages(page, nr_pages);
Unrelated to this patch, but since there is no
kho_restore_{phys,pages}(), won't the reserve_mem memory end up with
uninitialized struct pages, since preserved pages are
memblock_reserved_mark_noinit()?
That would also be a case for kho_restore_pages() I suppose?
--
Regards,
Pratyush Yadav
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [PATCH v4 2/4] kho: replace kho_preserve_phys() with kho_preserve_pages()
2025-09-18 10:32 ` Pratyush Yadav
@ 2025-09-18 11:04 ` Mike Rapoport
0 siblings, 0 replies; 12+ messages in thread
From: Mike Rapoport @ 2025-09-18 11:04 UTC (permalink / raw)
To: Pratyush Yadav
Cc: Andrew Morton, Alexander Graf, Baoquan He, Changyuan Lyu,
Chris Li, Jason Gunthorpe, Pasha Tatashin, kexec, linux-mm,
linux-kernel
Hi Pratyush,
On Thu, Sep 18, 2025 at 12:32:08PM +0200, Pratyush Yadav wrote:
> Hi Mike,
>
> On Wed, Sep 17 2025, Mike Rapoport wrote:
>
> > /**
> > - * kho_preserve_phys - preserve a physically contiguous range across kexec.
> > - * @phys: physical address of the range.
> > - * @size: size of the range.
> > + * kho_preserve_pages - preserve contiguous pages across kexec
> > + * @page: first page in the list.
> > + * @nr_pages: number of pages.
> > *
> > - * Instructs KHO to preserve the memory range from @phys to @phys + @size
> > - * across kexec.
> > + * Preserve a contiguous list of order 0 pages. Must be restored using
> > + * kho_restore_page() on each order 0 page.
>
> This is not true. The pages are preserved with the maximum order
> possible.
>
> while (pfn < end_pfn) {
> const unsigned int order =
> min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
>
> err = __kho_preserve_order(track, pfn, order);
> [...]
>
> So four 0-order pages will be preserved as one 2-order page. Restoring
> them as four 0-order pages is wrong. And my proposed patch for checking
> the magic [0] will uncover this exact bug.
>
> I think you should either change the logic to always preserve at order
> 0, or maybe add a kho_restore_pages() that replicates the same order
> calculation.
Heh, it seems I shot myself in the foot when I suggested to move the sanity
checks to kho_restore_page() :-D
We surely don't want to preserve contiguous chunks of order-0 pages as
order 0, so kho_restore_pages() it is.
> [0] https://lore.kernel.org/lkml/20250917125725.665-2-pratyush@kernel.org/
>
> > diff --git a/mm/memblock.c b/mm/memblock.c
> > index 117d963e677c..6ec3eaa4e8d1 100644
> > --- a/mm/memblock.c
> > +++ b/mm/memblock.c
> > @@ -2516,8 +2516,10 @@ static int reserve_mem_kho_finalize(struct kho_serialization *ser)
> >
> > for (i = 0; i < reserved_mem_count; i++) {
> > struct reserve_mem_table *map = &reserved_mem_table[i];
> > + struct page *page = phys_to_page(map->start);
> > + unsigned int nr_pages = map->size >> PAGE_SHIFT;
> >
> > - err |= kho_preserve_phys(map->start, map->size);
> > + err |= kho_preserve_pages(page, nr_pages);
>
> Unrelated to this patch, but since there is no
> kho_restore_{phys,pages}(), won't the reserve_mem memory end up with
> uninitialized struct pages, since preserved pages are
> memblock_reserved_mark_noinit()?
True, this is something we need to fix.
> That would also be a case for kho_restore_pages() I suppose?
Yes, just need to find the right place to stick it.
We cannot call kho_restore_pages() in reserve_mem_kho_revive() because at
that point there's still no memory map.
--
Sincerely yours,
Mike.
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v4 3/4] kho: add support for preserving vmalloc allocations
2025-09-17 17:40 [PATCH v4 0/4] kho: add support for preserving vmalloc allocations Mike Rapoport
2025-09-17 17:40 ` [PATCH v4 1/4] kho: check if kho is finalized in __kho_preserve_order() Mike Rapoport
2025-09-17 17:40 ` [PATCH v4 2/4] kho: replace kho_preserve_phys() with kho_preserve_pages() Mike Rapoport
@ 2025-09-17 17:40 ` Mike Rapoport
2025-09-17 21:15 ` Andrew Morton
2025-09-18 10:33 ` Pratyush Yadav
2025-09-17 17:40 ` [PATCH v4 4/4] lib/test_kho: use kho_preserve_vmalloc instead of storing addresses in fdt Mike Rapoport
3 siblings, 2 replies; 12+ messages in thread
From: Mike Rapoport @ 2025-09-17 17:40 UTC (permalink / raw)
To: Andrew Morton
Cc: Alexander Graf, Baoquan He, Changyuan Lyu, Chris Li,
Jason Gunthorpe, Mike Rapoport, Pasha Tatashin, Pratyush Yadav,
kexec, linux-mm, linux-kernel
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
A vmalloc allocation is preserved using a binary structure similar to
the global KHO memory tracker. It's a linked list of pages where each
page is an array of physical addresses of pages in the vmalloc area.
kho_preserve_vmalloc() hands out the physical address of the head page
to the caller. This address is used as the argument to
kho_restore_vmalloc() to restore the mapping in the vmalloc address
space and populate it with the preserved pages.
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
include/linux/kexec_handover.h | 21 +++
kernel/kexec_handover.c | 244 +++++++++++++++++++++++++++++++++
2 files changed, 265 insertions(+)
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index cc5c49b0612b..1cb515d8257a 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -39,12 +39,22 @@ struct page;
struct kho_serialization;
+struct kho_vmalloc_chunk;
+struct kho_vmalloc {
+ DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *);
+ unsigned int total_pages;
+ unsigned short flags;
+ unsigned short order;
+};
+
#ifdef CONFIG_KEXEC_HANDOVER
bool kho_is_enabled(void);
int kho_preserve_folio(struct folio *folio);
int kho_preserve_pages(struct page *page, unsigned int nr_pages);
+int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation);
struct folio *kho_restore_folio(phys_addr_t phys);
+void *kho_restore_vmalloc(const struct kho_vmalloc *preservation);
int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
@@ -71,11 +81,22 @@ static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages)
return -EOPNOTSUPP;
}
+static inline int kho_preserve_vmalloc(void *ptr,
+ struct kho_vmalloc *preservation)
+{
+ return -EOPNOTSUPP;
+}
+
static inline struct folio *kho_restore_folio(phys_addr_t phys)
{
return NULL;
}
+static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
+{
+ return NULL;
+}
+
static inline int kho_add_subtree(struct kho_serialization *ser,
const char *name, void *fdt)
{
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 3ad59c5f9eaa..d670caf6d07f 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -18,6 +18,7 @@
#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/page-isolation.h>
+#include <linux/vmalloc.h>
#include <asm/early_ioremap.h>
@@ -736,6 +737,249 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages)
}
EXPORT_SYMBOL_GPL(kho_preserve_pages);
+struct kho_vmalloc_hdr {
+ DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *);
+};
+
+#define KHO_VMALLOC_SIZE \
+ ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \
+ sizeof(phys_addr_t))
+
+struct kho_vmalloc_chunk {
+ struct kho_vmalloc_hdr hdr;
+ phys_addr_t phys[KHO_VMALLOC_SIZE];
+};
+
+static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE);
+
+/* vmalloc flags KHO supports */
+#define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP)
+
+/* KHO internal flags for vmalloc preservations */
+#define KHO_VMALLOC_ALLOC 0x0001
+#define KHO_VMALLOC_HUGE_VMAP 0x0002
+
+static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags)
+{
+ unsigned short kho_flags = 0;
+
+ if (vm_flags & VM_ALLOC)
+ kho_flags |= KHO_VMALLOC_ALLOC;
+ if (vm_flags & VM_ALLOW_HUGE_VMAP)
+ kho_flags |= KHO_VMALLOC_HUGE_VMAP;
+
+ return kho_flags;
+}
+
+static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags)
+{
+ unsigned int vm_flags = 0;
+
+ if (kho_flags & KHO_VMALLOC_ALLOC)
+ vm_flags |= VM_ALLOC;
+ if (kho_flags & KHO_VMALLOC_HUGE_VMAP)
+ vm_flags |= VM_ALLOW_HUGE_VMAP;
+
+ return vm_flags;
+}
+
+static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur)
+{
+ struct kho_vmalloc_chunk *chunk;
+ int err;
+
+ chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ err = kho_preserve_pages(virt_to_page(chunk), 1);
+ if (err)
+ goto err_free;
+ if (cur)
+ KHOSER_STORE_PTR(cur->hdr.next, chunk);
+ return chunk;
+
+err_free:
+ free_page((unsigned long)chunk);
+ return NULL;
+}
+
+static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk)
+{
+ struct kho_mem_track *track = &kho_out.ser.track;
+ unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
+
+ __kho_unpreserve(track, pfn, pfn + 1);
+ /* A full chunk has no zero terminator: also stop at the array end. */
+ for (unsigned int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
+ pfn = PHYS_PFN(chunk->phys[i]);
+ __kho_unpreserve(track, pfn, pfn + 1);
+ }
+}
+
+static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc)
+{
+ struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first);
+
+ while (chunk) {
+ struct kho_vmalloc_chunk *tmp = chunk;
+
+ kho_vmalloc_unpreserve_chunk(chunk);
+ /* Chunks are whole pages: load the link, then free_page(). */
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ free_page((unsigned long)tmp);
+ }
+}
+
+/**
+ * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec
+ * @ptr: pointer to the area in vmalloc address space
+ * @preservation: placeholder for preservation metadata
+ *
+ * Instructs KHO to preserve the area in vmalloc address space at @ptr. The
+ * physical pages mapped at @ptr will be preserved and on successful return
+ * @preservation will hold the physical address of a structure that describes
+ * the preservation.
+ *
+ * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably
+ * restored on the same node
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation)
+{
+ struct kho_vmalloc_chunk *chunk;
+ struct vm_struct *vm = find_vm_area(ptr);
+ unsigned int order, flags, nr_contig_pages;
+ unsigned int idx = 0;
+ int err;
+
+ if (!vm)
+ return -EINVAL;
+
+ if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
+ return -EOPNOTSUPP;
+
+ flags = vmalloc_flags_to_kho(vm->flags);
+ order = get_vm_area_page_order(vm);
+
+ chunk = new_vmalloc_chunk(NULL);
+ if (!chunk)
+ return -ENOMEM;
+ KHOSER_STORE_PTR(preservation->first, chunk);
+
+ nr_contig_pages = (1 << order);
+ for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) {
+ err = kho_preserve_pages(vm->pages[i], nr_contig_pages);
+ if (err)
+ goto err_free;
+ /* Only the first page of each contiguous run is recorded. */
+ chunk->phys[idx++] = page_to_phys(vm->pages[i]);
+ if (idx == ARRAY_SIZE(chunk->phys)) {
+ chunk = new_vmalloc_chunk(chunk);
+ if (!chunk) {
+ err = -ENOMEM; /* err is 0 at this point */
+ goto err_free;
+ }
+ idx = 0;
+ }
+ }
+
+ preservation->total_pages = vm->nr_pages;
+ preservation->flags = flags;
+ preservation->order = order;
+
+ return 0;
+
+err_free:
+ kho_vmalloc_free_chunks(preservation);
+ return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_vmalloc);
+
+/**
+ * kho_restore_vmalloc - recreates and populates an area in vmalloc address
+ * space from the preserved memory.
+ * @preservation: preservation metadata.
+ *
+ * Recreates an area in vmalloc address space and populates it with memory that
+ * was preserved using kho_preserve_vmalloc().
+ *
+ * Return: pointer to the area in the vmalloc address space, NULL on failure.
+ */
+void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
+{
+ struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first);
+ unsigned int align, order, shift, vm_flags;
+ unsigned int idx = 0, nr;
+ unsigned long addr, size;
+ struct vm_struct *area;
+ struct page **pages;
+ int err;
+
+ vm_flags = kho_flags_to_vmalloc(preservation->flags);
+ if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS)
+ return NULL;
+
+ nr = preservation->total_pages;
+ pages = kvmalloc_array(nr, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return NULL;
+ order = preservation->order;
+ shift = PAGE_SHIFT + order;
+ align = 1 << shift;
+ /* A full chunk has no zero terminator, so also bound i by the array. */
+ while (chunk) {
+ struct page *page;
+
+ for (unsigned int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
+ phys_addr_t phys = chunk->phys[i];
+
+ if (idx + (1 << order) > nr)
+ goto err_free_pages_array;
+
+ for (int j = 0; j < (1 << order); j++) {
+ page = phys_to_page(phys);
+ kho_restore_page(page, 0);
+ pages[idx++] = page;
+ phys += PAGE_SIZE;
+ }
+ }
+
+ page = virt_to_page(chunk);
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ kho_restore_page(page, 0);
+ __free_page(page);
+ }
+
+ if (idx != nr)
+ goto err_free_pages_array;
+
+ area = __get_vm_area_node(nr * PAGE_SIZE, align, shift, vm_flags,
+ VMALLOC_START, VMALLOC_END, NUMA_NO_NODE,
+ GFP_KERNEL, __builtin_return_address(0));
+ if (!area)
+ goto err_free_pages_array;
+
+ addr = (unsigned long)area->addr;
+ size = get_vm_area_size(area);
+ err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift);
+ if (err)
+ goto err_free_vm_area;
+
+ area->pages = pages;
+ area->nr_pages = nr;
+
+ return area->addr;
+
+err_free_vm_area:
+ free_vm_area(area);
+err_free_pages_array:
+ kvfree(pages);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(kho_restore_vmalloc);
+
/* Handling for debug/kho/out */
static struct dentry *debugfs_root;
--
2.50.1
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [PATCH v4 3/4] kho: add support for preserving vmalloc allocations
2025-09-17 17:40 ` [PATCH v4 3/4] kho: add support for preserving vmalloc allocations Mike Rapoport
@ 2025-09-17 21:15 ` Andrew Morton
2025-09-17 21:21 ` Jason Gunthorpe
2025-09-18 10:33 ` Pratyush Yadav
1 sibling, 1 reply; 12+ messages in thread
From: Andrew Morton @ 2025-09-17 21:15 UTC (permalink / raw)
To: Mike Rapoport
Cc: Alexander Graf, Baoquan He, Changyuan Lyu, Chris Li,
Jason Gunthorpe, Pasha Tatashin, Pratyush Yadav, kexec, linux-mm,
linux-kernel
On Wed, 17 Sep 2025 20:40:32 +0300 Mike Rapoport <rppt@kernel.org> wrote:
> A vmalloc allocation is preserved using binary structure similar to
> global KHO memory tracker. It's a linked list of pages where each page
> is an array of physical address of pages in vmalloc area.
>
> kho_preserve_vmalloc() hands out the physical address of the head page
> to the caller. This address is used as the argument to
> kho_vmalloc_restore() to restore the mapping in the vmalloc address
> space and populate it with the preserved pages.
>
> ...
>
> --- a/include/linux/kexec_handover.h
> +++ b/include/linux/kexec_handover.h
> @@ -39,12 +39,22 @@ struct page;
>
> struct kho_serialization;
>
> +struct kho_vmalloc_chunk;
> +struct kho_vmalloc {
> + DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *);
offtopic nit: DECLARE_KHOSER_PTR() *defines* a union named "first". It
doesn't declare one. A better name for this would have been DEFINE_...
And the world would be a better place if those three macros had a bit
of documentation ;)
The code looks nice though.
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [PATCH v4 3/4] kho: add support for preserving vmalloc allocations
2025-09-17 21:15 ` Andrew Morton
@ 2025-09-17 21:21 ` Jason Gunthorpe
0 siblings, 0 replies; 12+ messages in thread
From: Jason Gunthorpe @ 2025-09-17 21:21 UTC (permalink / raw)
To: Andrew Morton
Cc: Mike Rapoport, Alexander Graf, Baoquan He, Changyuan Lyu,
Chris Li, Pasha Tatashin, Pratyush Yadav, kexec, linux-mm,
linux-kernel
On Wed, Sep 17, 2025 at 02:15:28PM -0700, Andrew Morton wrote:
> On Wed, 17 Sep 2025 20:40:32 +0300 Mike Rapoport <rppt@kernel.org> wrote:
> > +struct kho_vmalloc_chunk;
> > +struct kho_vmalloc {
> > + DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *);
>
> offtopic nit: DECLARE_KHOSER_PTR() *defines* a union named "first". It
> doesn't declare one. A better name for this would have been DEFINE_...
It declares a *member* of the struct, in the same manner as the usual
DECLARE_* macros do.. Yes that member is an anonymous union that is
also created by the macro, but the main point is to add a member to
the struct.
Jason
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 3/4] kho: add support for preserving vmalloc allocations
2025-09-17 17:40 ` [PATCH v4 3/4] kho: add support for preserving vmalloc allocations Mike Rapoport
2025-09-17 21:15 ` Andrew Morton
@ 2025-09-18 10:33 ` Pratyush Yadav
1 sibling, 0 replies; 12+ messages in thread
From: Pratyush Yadav @ 2025-09-18 10:33 UTC (permalink / raw)
To: Mike Rapoport
Cc: Andrew Morton, Alexander Graf, Baoquan He, Changyuan Lyu,
Chris Li, Jason Gunthorpe, Pasha Tatashin, Pratyush Yadav, kexec,
linux-mm, linux-kernel
On Wed, Sep 17 2025, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> A vmalloc allocation is preserved using binary structure similar to
> global KHO memory tracker. It's a linked list of pages where each page
> is an array of physical address of pages in vmalloc area.
>
> kho_preserve_vmalloc() hands out the physical address of the head page
> to the caller. This address is used as the argument to
> kho_vmalloc_restore() to restore the mapping in the vmalloc address
> space and populate it with the preserved pages.
>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
[...]
--
Regards,
Pratyush Yadav
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v4 4/4] lib/test_kho: use kho_preserve_vmalloc instead of storing addresses in fdt
2025-09-17 17:40 [PATCH v4 0/4] kho: add support for preserving vmalloc allocations Mike Rapoport
` (2 preceding siblings ...)
2025-09-17 17:40 ` [PATCH v4 3/4] kho: add support for preserving vmalloc allocations Mike Rapoport
@ 2025-09-17 17:40 ` Mike Rapoport
2025-09-18 10:36 ` Pratyush Yadav
3 siblings, 1 reply; 12+ messages in thread
From: Mike Rapoport @ 2025-09-17 17:40 UTC (permalink / raw)
To: Andrew Morton
Cc: Alexander Graf, Baoquan He, Changyuan Lyu, Chris Li,
Jason Gunthorpe, Mike Rapoport, Pasha Tatashin, Pratyush Yadav,
kexec, linux-mm, linux-kernel
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
KHO test stores physical addresses of the preserved folios directly in
fdt.
Use kho_preserve_vmalloc() to preserve the array of addresses instead,
and kho_restore_vmalloc() to retrieve it after kexec.
This makes the test more scalable on the one hand and adds test coverage
for kho_preserve_vmalloc() on the other.
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
lib/test_kho.c | 41 +++++++++++++++++++++++++++++------------
1 file changed, 29 insertions(+), 12 deletions(-)
diff --git a/lib/test_kho.c b/lib/test_kho.c
index fe8504e3407b..60cd899ea745 100644
--- a/lib/test_kho.c
+++ b/lib/test_kho.c
@@ -32,6 +32,7 @@ module_param(max_mem, long, 0644);
struct kho_test_state {
unsigned int nr_folios;
struct folio **folios;
+ phys_addr_t *folios_info;
struct folio *fdt;
__wsum csum;
};
@@ -67,18 +68,15 @@ static struct notifier_block kho_test_nb = {
static int kho_test_save_data(struct kho_test_state *state, void *fdt)
{
- phys_addr_t *folios_info;
+ phys_addr_t *folios_info __free(kvfree) = NULL;
+ struct kho_vmalloc folios_info_phys;
int err = 0;
- err |= fdt_begin_node(fdt, "data");
- err |= fdt_property(fdt, "nr_folios", &state->nr_folios,
- sizeof(state->nr_folios));
- err |= fdt_property_placeholder(fdt, "folios_info",
- state->nr_folios * sizeof(*folios_info),
- (void **)&folios_info);
- err |= fdt_property(fdt, "csum", &state->csum, sizeof(state->csum));
- err |= fdt_end_node(fdt);
+ folios_info = vmalloc_array(state->nr_folios, sizeof(*folios_info));
+ if (!folios_info)
+ return -ENOMEM;
+ err = kho_preserve_vmalloc(folios_info, &folios_info_phys);
if (err)
return err;
@@ -93,6 +91,17 @@ static int kho_test_save_data(struct kho_test_state *state, void *fdt)
break;
}
+ err |= fdt_begin_node(fdt, "data");
+ err |= fdt_property(fdt, "nr_folios", &state->nr_folios,
+ sizeof(state->nr_folios));
+ err |= fdt_property(fdt, "folios_info", &folios_info_phys,
+ sizeof(folios_info_phys));
+ err |= fdt_property(fdt, "csum", &state->csum, sizeof(state->csum));
+ err |= fdt_end_node(fdt);
+
+ if (!err)
+ state->folios_info = no_free_ptr(folios_info);
+
return err;
}
@@ -209,8 +218,9 @@ static int kho_test_save(void)
static int kho_test_restore_data(const void *fdt, int node)
{
+ const struct kho_vmalloc *folios_info_phys;
const unsigned int *nr_folios;
- const phys_addr_t *folios_info;
+ phys_addr_t *folios_info;
const __wsum *old_csum;
__wsum csum = 0;
int len;
@@ -225,8 +235,12 @@ static int kho_test_restore_data(const void *fdt, int node)
if (!old_csum || len != sizeof(*old_csum))
return -EINVAL;
- folios_info = fdt_getprop(fdt, node, "folios_info", &len);
- if (!folios_info || len != sizeof(*folios_info) * *nr_folios)
+ folios_info_phys = fdt_getprop(fdt, node, "folios_info", &len);
+ if (!folios_info_phys || len != sizeof(*folios_info_phys))
+ return -EINVAL;
+
+ folios_info = kho_restore_vmalloc(folios_info_phys);
+ if (!folios_info)
return -EINVAL;
for (int i = 0; i < *nr_folios; i++) {
@@ -246,6 +260,8 @@ static int kho_test_restore_data(const void *fdt, int node)
folio_put(folio);
}
+ vfree(folios_info);
+
if (csum != *old_csum)
return -EINVAL;
@@ -304,6 +320,7 @@ static void kho_test_cleanup(void)
folio_put(kho_test_state.folios[i]);
kvfree(kho_test_state.folios);
+ vfree(kho_test_state.folios_info);
folio_put(kho_test_state.fdt);
}
--
2.50.1
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 4/4] lib/test_kho: use kho_preserve_vmalloc instead of storing addresses in fdt
2025-09-17 17:40 ` [PATCH v4 4/4] lib/test_kho: use kho_preserve_vmalloc instead of storing addresses in fdt Mike Rapoport
@ 2025-09-18 10:36 ` Pratyush Yadav
0 siblings, 0 replies; 12+ messages in thread
From: Pratyush Yadav @ 2025-09-18 10:36 UTC (permalink / raw)
To: Mike Rapoport
Cc: Andrew Morton, Alexander Graf, Baoquan He, Changyuan Lyu,
Chris Li, Jason Gunthorpe, Pasha Tatashin, Pratyush Yadav, kexec,
linux-mm, linux-kernel
On Wed, Sep 17 2025, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> KHO test stores physical addresses of the preserved folios directly in
> fdt.
> Use kho_preserve_vmalloc() instead of it and kho_restore_vmalloc() to
> retrieve the addresses after kexec.
>
> This makes the test more scalable from one side and adds tests coverage
> for kho_preserve_vmalloc() from the other.
>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
[...]
--
Regards,
Pratyush Yadav
^ permalink raw reply [flat|nested] 12+ messages in thread