* [RFC PATCH v2 1/7] kexec_file: Add fdt modification callback support
2023-09-25 21:27 [RFC PATCH v2 0/7] Introduce persistent memory pool Stanislav Kinsburskii
@ 2023-09-25 21:27 ` Stanislav Kinsburskii
2023-09-25 21:27 ` [RFC PATCH v2 2/7] x86: kexec: Transfer existing fdt to the new kernel Stanislav Kinsburskii
` (5 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Stanislav Kinsburskii @ 2023-09-25 21:27 UTC (permalink / raw)
To: tglx, mingo, bp, dave.hansen, x86, hpa, ebiederm, akpm,
stanislav.kinsburskii, corbet, linux-kernel, kexec, linux-mm,
kys, jgowans, wei.liu, arnd, gregkh, graf, pbonzini
From: Stanislav Kinsburskii <stanislav.kinsburskii@gmail.com>
Introduce primitives to:
- Register and unregister callbacks for flattened device tree (fdt)
modifications.
- Invoke all registered callbacks.
- Check for any registered callbacks.
These enhancements enable the use of a device tree to store kernel bits.
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
include/linux/kexec.h | 7 +++++++
kernel/kexec_file.c | 24 ++++++++++++++++++++++++
2 files changed, 31 insertions(+)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 22b5cd24f581..c9c70551796d 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -282,6 +282,13 @@ arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section,
return -ENOEXEC;
}
#endif
+
+struct notifier_block;
+extern int register_kexec_fdt_notifier(struct notifier_block *nb);
+extern int unregister_kexec_fdt_notifier(struct notifier_block *nb);
+extern bool kexec_fdt_notify_list_empty(void);
+extern int kexec_fdt_notify(void *fdt);
+
#endif /* CONFIG_KEXEC_FILE */
#ifdef CONFIG_KEXEC_ELF
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 881ba0d1714c..f9245d5e4459 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -43,6 +43,30 @@ static int kexec_calculate_store_digests(struct kimage *image);
/* Maximum size in bytes for kernel/initrd files. */
#define KEXEC_FILE_SIZE_MAX min_t(s64, 4LL << 30, SSIZE_MAX)
+static BLOCKING_NOTIFIER_HEAD(kexec_fdt_notify_list);
+
+/* Tell whether the kexec fdt notifier chain currently has no entries. */
+bool kexec_fdt_notify_list_empty(void)
+{
+	return !kexec_fdt_notify_list.head;
+}
+
+/*
+ * Run every registered kexec fdt notifier, passing @fdt as the callback data.
+ *
+ * blocking_notifier_call_chain() hands back a NOTIFY_* code rather than an
+ * errno, so a plain NOTIFY_OK would look like a failure to callers that test
+ * the result against zero. Translate it: 0 on success, or the negative errno
+ * a callback encoded with notifier_from_errno().
+ */
+int kexec_fdt_notify(void *fdt)
+{
+	int ret;
+
+	ret = blocking_notifier_call_chain(&kexec_fdt_notify_list, 0, fdt);
+
+	return notifier_to_errno(ret);
+}
+
+/*
+ * Add @nb to the kexec fdt notifier chain. Callbacks on this chain are
+ * invoked by kexec_fdt_notify() with action 0 and the fdt pointer as data.
+ * Returns the result of blocking_notifier_chain_register().
+ */
+int register_kexec_fdt_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&kexec_fdt_notify_list, nb);
+}
+EXPORT_SYMBOL(register_kexec_fdt_notifier);
+
+/*
+ * Remove @nb from the kexec fdt notifier chain.
+ * Returns the result of blocking_notifier_chain_unregister().
+ */
+int unregister_kexec_fdt_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&kexec_fdt_notify_list, nb);
+}
+EXPORT_SYMBOL(unregister_kexec_fdt_notifier);
+
/*
* Currently this is the only default function that is exported as some
* architectures need it to do additional handlings.
^ permalink raw reply [flat|nested] 8+ messages in thread* [RFC PATCH v2 2/7] x86: kexec: Transfer existing fdt to the new kernel
2023-09-25 21:27 [RFC PATCH v2 0/7] Introduce persistent memory pool Stanislav Kinsburskii
2023-09-25 21:27 ` [RFC PATCH v2 1/7] kexec_file: Add fdt modification callback support Stanislav Kinsburskii
@ 2023-09-25 21:27 ` Stanislav Kinsburskii
2023-09-25 21:28 ` [RFC PATCH v2 3/7] x86: kexec: Enable fdt modification in callbacks Stanislav Kinsburskii
` (4 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Stanislav Kinsburskii @ 2023-09-25 21:27 UTC (permalink / raw)
To: tglx, mingo, bp, dave.hansen, x86, hpa, ebiederm, akpm,
stanislav.kinsburskii, corbet, linux-kernel, kexec, linux-mm,
kys, jgowans, wei.liu, arnd, gregkh, graf, pbonzini
From: Stanislav Kinsburskii <stanislav.kinsburskii@gmail.com>
Enable passing of the Flattened Device Tree (fdt) over kexec for x86
architecture, as outlined in Documentation/x86/booting-dt.rst.
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
arch/x86/Kconfig | 8 +++++
arch/x86/kernel/kexec-bzimage64.c | 58 +++++++++++++++++++++++++++++++++++++
2 files changed, 66 insertions(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e36261b4ea14..efb472e267ec 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2070,6 +2070,14 @@ config KEXEC_FILE
for kernel and initramfs as opposed to list of segments as
accepted by previous system call.
+config KEXEC_FILE_FDT
+ bool "Pass fdt over kexec"
+ depends on KEXEC_FILE && X86_64
+ depends on OF_FLATTREE
+ help
+ This option enables passing existent Flattened Device Tree to the new
+ kernel when kexec is invoked by the file based system call.
+
config ARCH_HAS_KEXEC_PURGATORY
def_bool KEXEC_FILE
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index a61c12c01270..ab9ae02c9a5f 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -18,6 +18,8 @@
#include <linux/mm.h>
#include <linux/efi.h>
#include <linux/random.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
#include <asm/bootparam.h>
#include <asm/setup.h>
@@ -381,7 +383,59 @@ static int bzImage64_probe(const char *buf, unsigned long len)
return ret;
}
+#ifdef CONFIG_KEXEC_FILE_FDT
+static void *fdt_get_runtime(void)
+{
+ return initial_boot_params;
+}
+
+static int kexec_setup_fdt(struct kexec_buf *kbuf, struct boot_params *params)
+{
+	void *fdt;
+	struct setup_data *sd;
+	unsigned long fdt_load_addr, fdt_sz;
+	int ret;
+
+	/*
+	 * Wrap the current fdt into a SETUP_DTB setup_data entry, place it in
+	 * its own kexec buffer and link it in front of the setup_data chain of
+	 * the next kernel.
+	 *
+	 * Returns 0 on success or when there is no fdt to pass, -ENOMEM when
+	 * the staging buffer cannot be allocated, or the error reported by
+	 * kexec_add_buffer().
+	 */
+	fdt = fdt_get_runtime();
+	if (!fdt)
+		return 0;
+
+	fdt_sz = fdt_totalsize(fdt);
+
+	kbuf->bufsz = kbuf->memsz = sizeof(struct setup_data) + fdt_sz;
+
+	sd = kzalloc(kbuf->bufsz, GFP_KERNEL);
+	if (!sd)
+		return -ENOMEM;
+
+	kbuf->buffer = sd;
+	kbuf->buf_align = PAGE_SIZE;
+	kbuf->buf_min = MIN_INITRD_LOAD_ADDR;
+	kbuf->mem = KEXEC_BUF_MEM_UNKNOWN;
+	ret = kexec_add_buffer(kbuf);
+	if (ret) {
+		/* The buffer was not accepted, so nobody else will free it */
+		kfree(sd);
+		return ret;
+	}
+
+	/*
+	 * NOTE(review): on success nothing in this function frees sd;
+	 * presumably it has to stay alive until the image is loaded - confirm
+	 * who releases it afterwards.
+	 */
+	fdt_load_addr = kbuf->mem;
+	pr_debug("Loaded fdt at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+		 fdt_load_addr, kbuf->bufsz, kbuf->memsz);
+
+	sd->type = SETUP_DTB;
+	sd->len = fdt_sz;
+	memcpy(sd->data, fdt, fdt_sz);
+
+	/* Prepend: the new entry points at the previous head of the list */
+	sd->next = params->hdr.setup_data;
+	params->hdr.setup_data = fdt_load_addr;
+
+	return 0;
+}
+#else
+/* CONFIG_KEXEC_FILE_FDT is disabled: no fdt is passed, always report success. */
+static int kexec_setup_fdt(struct kexec_buf *kbuf, struct boot_params *params)
+{
+ return 0;
+}
+#endif
static void *bzImage64_load(struct kimage *image, char *kernel,
unsigned long kernel_len, char *initrd,
unsigned long initrd_len, char *cmdline,
@@ -561,6 +615,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
if (ret)
goto out_free_params;
+ ret = kexec_setup_fdt(&kbuf, params);
+ if (ret)
+ goto out_free_params;
+
/* Allocate loader specific data */
ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL);
if (!ldata) {
^ permalink raw reply [flat|nested] 8+ messages in thread* [RFC PATCH v2 3/7] x86: kexec: Enable fdt modification in callbacks
2023-09-25 21:27 [RFC PATCH v2 0/7] Introduce persistent memory pool Stanislav Kinsburskii
2023-09-25 21:27 ` [RFC PATCH v2 1/7] kexec_file: Add fdt modification callback support Stanislav Kinsburskii
2023-09-25 21:27 ` [RFC PATCH v2 2/7] x86: kexec: Transfer existing fdt to the new kernel Stanislav Kinsburskii
@ 2023-09-25 21:28 ` Stanislav Kinsburskii
2023-09-25 21:28 ` [RFC PATCH v2 4/7] pmpool: Introduce persistent memory pool Stanislav Kinsburskii
` (3 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Stanislav Kinsburskii @ 2023-09-25 21:28 UTC (permalink / raw)
To: tglx, mingo, bp, dave.hansen, x86, hpa, ebiederm, akpm,
stanislav.kinsburskii, corbet, linux-kernel, kexec, linux-mm,
kys, jgowans, wei.liu, arnd, gregkh, graf, pbonzini
From: Stanislav Kinsburskii <stanislav.kinsburskii@gmail.com>
This option allows kernel subsystems to modify (or create, if necessary)
the Flattened Device Tree (fdt) using registered callbacks and then pass
the modified version to the new kernel.
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
arch/x86/Kconfig | 8 +++++++
arch/x86/kernel/kexec-bzimage64.c | 41 ++++++++++++++++++++++++++++++++++++-
2 files changed, 48 insertions(+), 1 deletion(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index efb472e267ec..90da51fbb8f8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2078,6 +2078,14 @@ config KEXEC_FILE_FDT
This option enables passing existent Flattened Device Tree to the new
kernel when kexec is invoked by the file based system call.
+config KEXEC_FILE_FDT_CALLBACK
+ bool "Enable kexec fdt modification support"
+ depends on KEXEC_FILE_FDT
+ select LIBFDT
+ help
+ This option enables Flattened Device Tree modification (and creation
+ if needed) by kernel subsystems that registered a callback for it.
+
config ARCH_HAS_KEXEC_PURGATORY
def_bool KEXEC_FILE
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index ab9ae02c9a5f..3c6df28d3637 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -384,11 +384,50 @@ static int bzImage64_probe(const char *buf, unsigned long len)
return ret;
}
#ifdef CONFIG_KEXEC_FILE_FDT
+#ifdef CONFIG_KEXEC_FILE_FDT_CALLBACK
+/*
+ * Produce the fdt that is handed to the next kernel: a 2M scratch copy of
+ * initial_boot_params (or a freshly created empty tree when there is none),
+ * passed through the registered kexec fdt notifiers and then packed.
+ *
+ * Returns NULL when there is neither a boot fdt nor any notifier, and on
+ * any allocation, fdt or notifier failure.
+ */
+static void *fdt_get_runtime(void)
+{
+	const size_t buf_size = SZ_2M;
+	void *buf;
+	int err;
+
+	/* No boot fdt and nobody who would populate a new one */
+	if (!initial_boot_params && kexec_fdt_notify_list_empty())
+		return NULL;
+
+	buf = kzalloc(buf_size, GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	err = initial_boot_params ?
+		fdt_open_into(initial_boot_params, buf, buf_size) :
+		fdt_create_empty_tree(buf, buf_size);
+	if (err != 0) {
+		pr_err("failed to get fdt\n");
+		kfree(buf);
+		return NULL;
+	}
+
+	err = kexec_fdt_notify(buf);
+	if (err) {
+		pr_err("fdt notification failed\n");
+		kfree(buf);
+		return NULL;
+	}
+
+	fdt_pack(buf);
+
+	return buf;
+}
+#else
static void *fdt_get_runtime(void)
{
return initial_boot_params;
}
-
+#endif
static int kexec_setup_fdt(struct kexec_buf *kbuf, struct boot_params *params)
{
void *fdt;
^ permalink raw reply [flat|nested] 8+ messages in thread* [RFC PATCH v2 4/7] pmpool: Introduce persistent memory pool
2023-09-25 21:27 [RFC PATCH v2 0/7] Introduce persistent memory pool Stanislav Kinsburskii
` (2 preceding siblings ...)
2023-09-25 21:28 ` [RFC PATCH v2 3/7] x86: kexec: Enable fdt modification in callbacks Stanislav Kinsburskii
@ 2023-09-25 21:28 ` Stanislav Kinsburskii
2023-09-25 21:28 ` [RFC PATCH v2 5/7] pmpool: Update device tree on kexec Stanislav Kinsburskii
` (2 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Stanislav Kinsburskii @ 2023-09-25 21:28 UTC (permalink / raw)
To: tglx, mingo, bp, dave.hansen, x86, hpa, ebiederm, akpm,
stanislav.kinsburskii, corbet, linux-kernel, kexec, linux-mm,
kys, jgowans, wei.liu, arnd, gregkh, graf, pbonzini
From: Stanislav Kinsburskii <stanislav.kinsburskii@gmail.com>
This patch introduces a memory allocator specifically tailored for
persistent memory within the kernel. The allocator maintains
kernel-specific states like DMA passthrough device states, IOMMU state, and
more across kexec.
The current implementation provides a foundation for custom solutions that
may be developed in the future. Although the design is kept concise and
straightforward to encourage discussion and feedback, it remains fully
functional.
The persistent memory pool builds upon the Contiguous Memory Allocator
(CMA) and ensures CMA state persistency across kexec by incorporating the
CMA bitmap into the memory region.
Potential applications include:
1. Enabling various in-kernel entities to allocate persistent pages from
a unified memory pool, obviating the need for reserving multiple
regions.
2. For in-kernel components that need the allocation address to be
retained on kernel kexec, this address can be exposed to user space
and subsequently passed through the command line.
3. Distinct subsystems or drivers can set aside their region, allocating
a segment for their persistent memory pool, suitable for uses such as
file systems, key-value stores, and other applications.
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
include/linux/pmpool.h | 22 +++++++++++
mm/Kconfig | 8 ++++
mm/Makefile | 1
mm/pmpool.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 131 insertions(+)
create mode 100644 include/linux/pmpool.h
create mode 100644 mm/pmpool.c
diff --git a/include/linux/pmpool.h b/include/linux/pmpool.h
new file mode 100644
index 000000000000..b41f16fa9660
--- /dev/null
+++ b/include/linux/pmpool.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PMPOOL_H
+#define _PMPOOL_H
+
+struct page;
+
+#if defined(CONFIG_PMPOOL)
+/* Allocate @count units from the persistent memory pool; NULL on failure. */
+struct page *pmpool_alloc(unsigned long count);
+/* Return @count units starting at @pages to the pool; false on failure. */
+bool pmpool_release(struct page *pages, unsigned long count);
+#else
+/* CONFIG_PMPOOL is disabled: allocation always fails with NULL. */
+static inline struct page *pmpool_alloc(unsigned long count)
+{
+ return NULL;
+}
+/* CONFIG_PMPOOL is disabled: there is nothing to release, report false. */
+static inline bool pmpool_release(struct page *pages, unsigned long count)
+{
+ return false;
+}
+#endif
+
+#endif /* _PMPOOL_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 09130434e30d..e7c10094fb10 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -922,6 +922,14 @@ config CMA_AREAS
If unsure, leave the default value "7" in UMA and "19" in NUMA.
+config PMPOOL
+ bool "Persistent memory pool support"
+ select CMA
+ help
+ This option adds support for CMA-based persistent memory pool
+ feature, which provides pages allocation and freeing from a set of
+ persistent memory ranges, deposited to the memory pool.
+
config MEM_SOFT_DIRTY
bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
diff --git a/mm/Makefile b/mm/Makefile
index 678530a07326..8d3579e58c2c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -139,3 +139,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
+obj-$(CONFIG_PMPOOL) += pmpool.o
diff --git a/mm/pmpool.c b/mm/pmpool.c
new file mode 100644
index 000000000000..12a8cac75558
--- /dev/null
+++ b/mm/pmpool.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "pmpool: " fmt
+
+#include <linux/bitmap.h>
+#include <linux/cma.h>
+#include <linux/io.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/pmpool.h>
+
+#include "cma.h"
+
+/* A persistent memory pool; currently just a handle on its backing CMA area. */
+struct pmpool {
+ struct cma *cma;
+};
+
+/* The single default pool; stays NULL until parse_pmpool_opt() sets it up. */
+static struct pmpool *default_pmpool;
+
+/*
+ * Hand @count pages starting at @pages back to the CMA area of the default
+ * pool. Reports false when no default pool exists, otherwise whatever
+ * cma_release() reports.
+ */
+bool pmpool_release(struct page *pages, unsigned long count)
+{
+	struct pmpool *pool = default_pmpool;
+
+	return pool ? cma_release(pool->cma, pages, count) : false;
+}
+
+/*
+ * Take @count pages from the CMA area of the default pool. Yields NULL when
+ * no default pool exists, otherwise the result of cma_alloc().
+ */
+struct page *pmpool_alloc(unsigned long count)
+{
+	struct pmpool *pool = default_pmpool;
+
+	return pool ? cma_alloc(pool->cma, count, 0, true) : NULL;
+}
+
+/*
+ * Replace the CMA allocation bitmap with one that lives in the first pages
+ * of the CMA region itself, cleared except for the bits covering the pages
+ * the bitmap now occupies.
+ */
+static void pmpool_fixup_cma(struct cma *cma)
+{
+	unsigned long bitmap_bytes;
+
+	bitmap_free(cma->bitmap);
+	cma->bitmap = phys_to_virt(PFN_PHYS(cma->base_pfn));
+
+	/*
+	 * BITS_TO_LONGS() counts longs, while memset() and the page rounding
+	 * below need bytes; without the conversion only a fraction of the
+	 * bitmap is cleared and too few pages are marked as used.
+	 */
+	bitmap_bytes = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long);
+	memset(cma->bitmap, 0, bitmap_bytes);
+	/* Assumes one bitmap bit per page (order_per_bit == 0) - TODO confirm */
+	bitmap_set(cma->bitmap, 0, PAGE_ALIGN(bitmap_bytes) >> PAGE_SHIFT);
+
+	pr_info("CMA bitmap moved to %#llx\n", virt_to_phys(cma->bitmap));
+}
+
+static int __init default_pmpool_fixup_cma(void)
+{
+ if (!default_pmpool)
+ return 0;
+
+ pmpool_fixup_cma(default_pmpool->cma);
+ return 0;
+}
+postcore_initcall(default_pmpool_fixup_cma);
+
+static int __init parse_pmpool_opt(char *str)
+{
+	static struct pmpool pmpool;
+	phys_addr_t base, size;
+	int err;
+
+	/*
+	 * Early parameter handler, format is pmpool=<base>,<size>.
+	 *
+	 * Reserves the range in memblock, turns it into a CMA area and
+	 * publishes it as the default pool. Malformed input is rejected with
+	 * -EINVAL; setup failures are logged and 0 is returned, leaving the
+	 * default pool unset.
+	 */
+	if (!str)
+		return -EINVAL;
+
+	base = memparse(str, &str);
+	if (*str != ',') {
+		pr_err("invalid format, expected pmpool=<base>,<size>\n");
+		return -EINVAL;
+	}
+
+	size = memparse(str + 1, NULL);
+	if (!size) {
+		pr_err("invalid pool size\n");
+		return -EINVAL;
+	}
+
+	err = memblock_is_region_reserved(base, size);
+	if (err) {
+		pr_err("memory block overlaps with another one: %d\n", err);
+		return 0;
+	}
+
+	err = memblock_reserve(base, size);
+	if (err) {
+		pr_err("failed to reserve memory block: %d\n", err);
+		return 0;
+	}
+
+	err = cma_init_reserved_mem(base, size, 0, "pmpool", &pmpool.cma);
+	if (err) {
+		pr_err("failed to initialize CMA: %d\n", err);
+		goto free_memblock;
+	}
+
+	pr_info("default memory pool is created: %#llx-%#llx\n",
+		base, base + size);
+
+	default_pmpool = &pmpool;
+
+	return 0;
+
+free_memblock:
+	memblock_phys_free(base, size);
+	return 0;
+}
+early_param("pmpool", parse_pmpool_opt);
^ permalink raw reply [flat|nested] 8+ messages in thread* [RFC PATCH v2 5/7] pmpool: Update device tree on kexec
2023-09-25 21:27 [RFC PATCH v2 0/7] Introduce persistent memory pool Stanislav Kinsburskii
` (3 preceding siblings ...)
2023-09-25 21:28 ` [RFC PATCH v2 4/7] pmpool: Introduce persistent memory pool Stanislav Kinsburskii
@ 2023-09-25 21:28 ` Stanislav Kinsburskii
2023-09-25 21:28 ` [RFC PATCH v2 6/7] pmpool: Restore state from device tree post-kexec Stanislav Kinsburskii
2023-09-25 21:28 ` [RFC PATCH v2 7/7] Drivers: hv: Allocate persistent pages for root partition Stanislav Kinsburskii
6 siblings, 0 replies; 8+ messages in thread
From: Stanislav Kinsburskii @ 2023-09-25 21:28 UTC (permalink / raw)
To: tglx, mingo, bp, dave.hansen, x86, hpa, ebiederm, akpm,
stanislav.kinsburskii, corbet, linux-kernel, kexec, linux-mm,
kys, jgowans, wei.liu, arnd, gregkh, graf, pbonzini
From: Stanislav Kinsburskii <stanislav.kinsburskii@gmail.com>
Introduce a pmpool kexec fdt notifier that enables pmpool to pass its
metadata, including the bitmap address, to the new kernel during kexec.
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
mm/Kconfig | 1 +
mm/pmpool.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index e7c10094fb10..1eefdd4c82ba 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -925,6 +925,7 @@ config CMA_AREAS
config PMPOOL
bool "Persistent memory pool support"
select CMA
+ select LIBFDT
help
This option adds support for CMA-based persistent memory pool
feature, which provides pages allocation and freeing from a set of
diff --git a/mm/pmpool.c b/mm/pmpool.c
index 12a8cac75558..f2173db782d6 100644
--- a/mm/pmpool.c
+++ b/mm/pmpool.c
@@ -6,6 +6,7 @@
#include <linux/cma.h>
#include <linux/io.h>
#include <linux/kexec.h>
+#include <linux/libfdt.h>
#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/pmpool.h>
@@ -58,6 +59,59 @@ static int __init default_pmpool_fixup_cma(void)
}
postcore_initcall(default_pmpool_fixup_cma);
+/*
+ * kexec fdt notifier callback: describe the default pool under
+ * /chosen/default_pmpool with the "compatible", "bitmap", "size" and "base"
+ * properties, creating /chosen when it does not exist.
+ *
+ * @data is the fdt being prepared; @nb and @val are unused.
+ *
+ * Best effort: always returns NOTIFY_DONE. On failure the error is logged
+ * and a partially filled node is removed again, so the next kernel never
+ * sees a pool description with missing properties.
+ */
+static int pmpool_fdt_update(struct notifier_block *nb, unsigned long val,
+			     void *data)
+{
+	void *fdt = data;
+	struct cma *cma;
+	int chosen, node, status;
+
+	if (!fdt || !default_pmpool)
+		goto err;
+	cma = default_pmpool->cma;
+
+	chosen = fdt_subnode_offset(fdt, 0, "chosen");
+	if (chosen < 0) {
+		chosen = fdt_add_subnode(fdt, 0, "chosen");
+		if (chosen < 0)
+			goto err;
+	}
+
+	node = fdt_add_subnode(fdt, chosen, "default_pmpool");
+	/* Node is already there (presumably inherited from the boot fdt) */
+	if (node == -FDT_ERR_EXISTS)
+		return NOTIFY_DONE;
+	if (node < 0)
+		goto err;
+
+	status = fdt_setprop(fdt, node, "compatible",
+			     "pmpool", sizeof("pmpool"));
+	if (status)
+		goto err_del_node;
+
+	status = fdt_setprop_u64(fdt, node, "bitmap",
+				 virt_to_phys(cma->bitmap));
+	if (status)
+		goto err_del_node;
+
+	status = fdt_setprop_u64(fdt, node, "size",
+				 cma->count << PAGE_SHIFT);
+	if (status)
+		goto err_del_node;
+
+	status = fdt_setprop_u64(fdt, node, "base",
+				 cma->base_pfn << PAGE_SHIFT);
+	if (status)
+		goto err_del_node;
+
+	return NOTIFY_DONE;
+
+err_del_node:
+	/* Do not hand a half-described pool over to the next kernel */
+	fdt_del_node(fdt, node);
+err:
+	pr_err("failed to update fdt\n");
+	return NOTIFY_DONE;
+}
+
+/* Notifier block that hooks pmpool_fdt_update() into the kexec fdt chain. */
+static struct notifier_block pmpool_kexec_fdt_nb = {
+ .notifier_call = pmpool_fdt_update,
+};
+
static int __init parse_pmpool_opt(char *str)
{
static struct pmpool pmpool;
@@ -80,10 +134,16 @@ static int __init parse_pmpool_opt(char *str)
return 0;
}
+ err = register_kexec_fdt_notifier(&pmpool_kexec_fdt_nb);
+ if (err) {
+ pr_err("failed to register kexec fdt notifier: %d\n", err);
+ goto free_memblock;
+ }
+
err = cma_init_reserved_mem(base, size, 0, "pmpool", &pmpool.cma);
if (err) {
pr_err("failed to initialize CMA: %d\n", err);
- goto free_memblock;
+ goto notifier_unregister;
}
pr_info("default memory pool is created: %#llx-%#llx\n",
@@ -93,6 +153,8 @@ static int __init parse_pmpool_opt(char *str)
return 0;
+notifier_unregister:
+ unregister_kexec_fdt_notifier(&pmpool_kexec_fdt_nb);
free_memblock:
memblock_phys_free(base, size);
return 0;
^ permalink raw reply [flat|nested] 8+ messages in thread* [RFC PATCH v2 6/7] pmpool: Restore state from device tree post-kexec
2023-09-25 21:27 [RFC PATCH v2 0/7] Introduce persistent memory pool Stanislav Kinsburskii
` (4 preceding siblings ...)
2023-09-25 21:28 ` [RFC PATCH v2 5/7] pmpool: Update device tree on kexec Stanislav Kinsburskii
@ 2023-09-25 21:28 ` Stanislav Kinsburskii
2023-09-25 21:28 ` [RFC PATCH v2 7/7] Drivers: hv: Allocate persistent pages for root partition Stanislav Kinsburskii
6 siblings, 0 replies; 8+ messages in thread
From: Stanislav Kinsburskii @ 2023-09-25 21:28 UTC (permalink / raw)
To: tglx, mingo, bp, dave.hansen, x86, hpa, ebiederm, akpm,
stanislav.kinsburskii, corbet, linux-kernel, kexec, linux-mm,
kys, jgowans, wei.liu, arnd, gregkh, graf, pbonzini
From: Stanislav Kinsburskii <stanislav.kinsburskii@gmail.com>
Retrieve the pmpool bitmap from metadata in the fdt passed over kexec,
bypassing the need for reinitialization. This ensures the seamless transfer
of the pmpool state across kexec.
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
mm/pmpool.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/mm/pmpool.c b/mm/pmpool.c
index f2173db782d6..6c1a28fd3493 100644
--- a/mm/pmpool.c
+++ b/mm/pmpool.c
@@ -9,6 +9,7 @@
#include <linux/libfdt.h>
#include <linux/memblock.h>
#include <linux/mm.h>
+#include <linux/of.h>
#include <linux/pmpool.h>
#include "cma.h"
@@ -49,11 +50,56 @@ static void pmpool_fixup_cma(struct cma *cma)
pr_info("CMA bitmap moved to %#llx\n", virt_to_phys(cma->bitmap));
}
+/*
+ * Re-attach @cma to the allocation bitmap described by the "pmpool"
+ * compatible device tree node.
+ *
+ * The node's "base" and "size" must match @cma exactly; only then is the
+ * current bitmap freed and replaced by the one at the "bitmap" physical
+ * address.
+ *
+ * Returns 0 on success, -ENOENT when no such node exists and -EINVAL when a
+ * property is missing or does not match.
+ */
+static int pmpool_fdt_restore(struct cma *cma)
+{
+	struct device_node *dn;
+	u64 val;
+	int ret = -EINVAL;
+
+	dn = of_find_compatible_node(NULL, NULL, "pmpool");
+	if (!dn)
+		return -ENOENT;
+
+	if (of_property_read_u64(dn, "base", &val)) {
+		pr_err("invalid fdt: no base\n");
+		goto out_put;
+	}
+	if (val != PFN_PHYS(cma->base_pfn)) {
+		pr_err("fdt base doesn't match: %#llx != %#llx\n",
+		       val, PFN_PHYS(cma->base_pfn));
+		goto out_put;
+	}
+
+	if (of_property_read_u64(dn, "size", &val)) {
+		pr_err("invalid fdt: no size\n");
+		goto out_put;
+	}
+	if (val != (cma->count << PAGE_SHIFT)) {
+		pr_err("fdt size doesn't match: %#llx != %#lx\n",
+		       val, cma->count << PAGE_SHIFT);
+		goto out_put;
+	}
+
+	if (of_property_read_u64(dn, "bitmap", &val)) {
+		pr_err("invalid fdt: no bitmap\n");
+		goto out_put;
+	}
+
+	pr_info("CMA bitmap restored to %#llx\n", val);
+
+	bitmap_free(cma->bitmap);
+	cma->bitmap = phys_to_virt(val);
+	ret = 0;
+
+out_put:
+	/* Drop the reference taken by of_find_compatible_node() */
+	of_node_put(dn);
+	return ret;
+}
+
static int __init default_pmpool_fixup_cma(void)
{
if (!default_pmpool)
return 0;
+ if (!pmpool_fdt_restore(default_pmpool->cma))
+ return 0;
+
pmpool_fixup_cma(default_pmpool->cma);
return 0;
}
^ permalink raw reply [flat|nested] 8+ messages in thread* [RFC PATCH v2 7/7] Drivers: hv: Allocate persistent pages for root partition
2023-09-25 21:27 [RFC PATCH v2 0/7] Introduce persistent memory pool Stanislav Kinsburskii
` (5 preceding siblings ...)
2023-09-25 21:28 ` [RFC PATCH v2 6/7] pmpool: Restore state from device tree post-kexec Stanislav Kinsburskii
@ 2023-09-25 21:28 ` Stanislav Kinsburskii
6 siblings, 0 replies; 8+ messages in thread
From: Stanislav Kinsburskii @ 2023-09-25 21:28 UTC (permalink / raw)
To: tglx, mingo, bp, dave.hansen, x86, hpa, ebiederm, akpm,
stanislav.kinsburskii, corbet, linux-kernel, kexec, linux-mm,
kys, jgowans, wei.liu, arnd, gregkh, graf, pbonzini
Deposited pages are owned by the hypervisor. Accessing them can trigger a
kernel panic due to a general protection fault.
This patch ensures that pages for the root partition are allocated from the
persistent memory pool. This allocation guarantees stability post-kexec,
protecting hypervisor-deposited pages from unintended reuse by the new
kernel.
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/hv_common.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 335aec5ec504..a81c5613e745 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -426,7 +426,10 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
order = 31 - __builtin_clz(num_pages);
while (1) {
- pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
+			/* Pages for the current partition come from pmpool */
+			if (partition_id == hv_current_partition_id)
+				pages[i] = pmpool_alloc(1 << order);
+			else
+				pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
if (pages[i])
break;
if (!order) {
@@ -471,8 +474,12 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
err_free_allocations:
for (i = 0; i < num_allocations; ++i) {
base_pfn = page_to_pfn(pages[i]);
- for (j = 0; j < counts[i]; ++j)
- __free_page(pfn_to_page(base_pfn + j));
+		if (partition_id == hv_current_partition_id) {
+			/* One call gives back the whole range: not per page */
+			pmpool_release(pages[i], counts[i]);
+			continue;
+		}
+		for (j = 0; j < counts[i]; ++j)
+			__free_page(pfn_to_page(base_pfn + j));
}
free_buf:
^ permalink raw reply [flat|nested] 8+ messages in thread