* [PATCH v6 01/30] mm: Introduce kpkeys
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
@ 2026-02-27 17:54 ` Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 02/30] set_memory: Introduce set_memory_pkey() stub Kevin Brodsky
` (28 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:54 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
kpkeys is a simple framework to enable the use of protection keys
(pkeys) to harden the kernel itself. This patch introduces the basic
API in <linux/kpkeys.h>: a couple of functions to set and restore
the pkey register and macros to define guard objects.
kpkeys introduces a new concept on top of pkeys: the kpkeys level.
Each level is associated with a set of permissions for the pkeys
managed by the kpkeys framework. kpkeys_set_level(lvl) sets those
permissions according to lvl, and returns the original pkey
register, to be later restored by kpkeys_restore_pkey_reg(). To
start with, only KPKEYS_LVL_DEFAULT is available, which is meant
to grant RW access to KPKEYS_PKEY_DEFAULT (i.e. all memory since
this is the only available pkey for now).
Because each architecture implementing pkeys uses a different
representation for the pkey register, and may reserve certain pkeys
for specific uses, support for kpkeys must be explicitly indicated
by selecting ARCH_HAS_KPKEYS and defining the following functions in
<asm/kpkeys.h>, in addition to the macros provided in
<asm-generic/kpkeys.h>:
- arch_kpkeys_set_level()
- arch_kpkeys_restore_pkey_reg()
- arch_kpkeys_enabled()
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
include/asm-generic/kpkeys.h | 17 ++++++
include/linux/kpkeys.h | 113 +++++++++++++++++++++++++++++++++++
mm/Kconfig | 2 +
3 files changed, 132 insertions(+)
create mode 100644 include/asm-generic/kpkeys.h
create mode 100644 include/linux/kpkeys.h
diff --git a/include/asm-generic/kpkeys.h b/include/asm-generic/kpkeys.h
new file mode 100644
index 000000000000..ab819f157d6a
--- /dev/null
+++ b/include/asm-generic/kpkeys.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_GENERIC_KPKEYS_H
+#define __ASM_GENERIC_KPKEYS_H
+
+#ifndef KPKEYS_PKEY_DEFAULT
+#define KPKEYS_PKEY_DEFAULT 0
+#endif
+
+/*
+ * Represents a pkey register value that cannot be used, typically disabling
+ * access to all keys.
+ */
+#ifndef KPKEYS_PKEY_REG_INVAL
+#define KPKEYS_PKEY_REG_INVAL 0
+#endif
+
+#endif /* __ASM_GENERIC_KPKEYS_H */
diff --git a/include/linux/kpkeys.h b/include/linux/kpkeys.h
new file mode 100644
index 000000000000..faa6e2615798
--- /dev/null
+++ b/include/linux/kpkeys.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _LINUX_KPKEYS_H
+#define _LINUX_KPKEYS_H
+
+#include <linux/bug.h>
+#include <linux/cleanup.h>
+
+#define KPKEYS_LVL_DEFAULT 0
+
+#define KPKEYS_LVL_MIN KPKEYS_LVL_DEFAULT
+#define KPKEYS_LVL_MAX KPKEYS_LVL_DEFAULT
+
+#define __KPKEYS_GUARD(name, set_level, restore_pkey_reg, set_arg, ...) \
+ __DEFINE_CLASS_IS_CONDITIONAL(name, false); \
+ DEFINE_CLASS(name, u64, \
+ restore_pkey_reg, set_level, set_arg); \
+ static inline void *class_##name##_lock_ptr(u64 *_T) \
+ { return _T; }
+
+/**
+ * KPKEYS_GUARD_NOOP() - define a guard type that does nothing
+ * @name: the name of the guard type
+ * @cond_arg: an argument specification (optional)
+ *
+ * Define a guard type that does nothing, useful to match a real guard type
+ * that is defined under an #ifdef. @cond_arg may optionally be passed to match
+ * a guard defined using KPKEYS_GUARD_COND().
+ */
+#define KPKEYS_GUARD_NOOP(name, ...) \
+ __KPKEYS_GUARD(name, 0, (void)_T, ##__VA_ARGS__, void)
+
+#ifdef CONFIG_ARCH_HAS_KPKEYS
+
+#include <asm/kpkeys.h>
+
+/**
+ * KPKEYS_GUARD_COND() - define a guard type that conditionally switches to
+ * a given kpkeys level
+ * @name: the name of the guard type
+ * @level: the kpkeys level to switch to
+ * @cond: an expression that is evaluated as condition
+ * @cond_arg: an argument specification for the condition (optional)
+ *
+ * Define a guard type that switches to @level if @cond evaluates to true, and
+ * does nothing otherwise. @cond_arg may be specified to give access to a
+ * caller-defined argument to @cond.
+ */
+#define KPKEYS_GUARD_COND(name, level, cond, ...) \
+ __KPKEYS_GUARD(name, \
+ cond ? kpkeys_set_level(level) \
+ : KPKEYS_PKEY_REG_INVAL, \
+ kpkeys_restore_pkey_reg(_T), \
+ ##__VA_ARGS__, void)
+
+/**
+ * KPKEYS_GUARD() - define a guard type that switches to a given kpkeys level
+ * if kpkeys are enabled
+ * @name: the name of the guard type
+ * @level: the kpkeys level to switch to
+ *
+ * Define a guard type that switches to @level if the system supports kpkeys.
+ */
+#define KPKEYS_GUARD(name, level) \
+ KPKEYS_GUARD_COND(name, level, arch_kpkeys_enabled())
+
+/**
+ * kpkeys_set_level() - switch kpkeys level
+ * @level: the level to switch to
+ *
+ * Switches the kpkeys level to the specified value. @level must be a
+ * compile-time constant. The arch-specific pkey register will be updated
+ * accordingly, and the original value returned.
+ *
+ * Return: the original pkey register value if the register was written to, or
+ * KPKEYS_PKEY_REG_INVAL otherwise (no write to the register was
+ * required).
+ */
+static __always_inline u64 kpkeys_set_level(int level)
+{
+ BUILD_BUG_ON_MSG(!__builtin_constant_p(level),
+ "kpkeys_set_level() only takes constant levels");
+ BUILD_BUG_ON_MSG(level < KPKEYS_LVL_MIN || level > KPKEYS_LVL_MAX,
+ "Invalid level passed to kpkeys_set_level()");
+
+ return arch_kpkeys_set_level(level);
+}
+
+/**
+ * kpkeys_restore_pkey_reg() - restores a pkey register value
+ * @pkey_reg: the pkey register value to restore
+ *
+ * This function is meant to be passed the value returned by kpkeys_set_level(),
+ * in order to restore the pkey register to its original value (thus restoring
+ * the original kpkeys level).
+ */
+static __always_inline void kpkeys_restore_pkey_reg(u64 pkey_reg)
+{
+ if (pkey_reg != KPKEYS_PKEY_REG_INVAL)
+ arch_kpkeys_restore_pkey_reg(pkey_reg);
+}
+
+#else /* CONFIG_ARCH_HAS_KPKEYS */
+
+#include <asm-generic/kpkeys.h>
+
+static inline bool arch_kpkeys_enabled(void)
+{
+ return false;
+}
+
+#endif /* CONFIG_ARCH_HAS_KPKEYS */
+
+#endif /* _LINUX_KPKEYS_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index ebd8ea353687..2baedee59bb2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1243,6 +1243,8 @@ config ARCH_USES_HIGH_VMA_FLAGS
bool
config ARCH_HAS_PKEYS
bool
+config ARCH_HAS_KPKEYS
+ bool
config ARCH_USES_PG_ARCH_2
bool
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 02/30] set_memory: Introduce set_memory_pkey() stub
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 01/30] mm: Introduce kpkeys Kevin Brodsky
@ 2026-02-27 17:54 ` Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 03/30] arm64: mm: Enable overlays for all EL1 indirect permissions Kevin Brodsky
` (27 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:54 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
Introduce a new function, set_memory_pkey(), which sets the
protection key (pkey) of pages in the specified linear mapping
range. Architectures implementing kernel pkeys (kpkeys) must
provide a suitable implementation; an empty stub is added as
fallback.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
include/linux/set_memory.h | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index 3030d9245f5a..7b3a8bfde3c6 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -84,4 +84,11 @@ static inline int set_memory_decrypted(unsigned long addr, int numpages)
}
#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */
+#ifndef CONFIG_ARCH_HAS_KPKEYS
+static inline int set_memory_pkey(unsigned long addr, int numpages, int pkey)
+{
+ return 0;
+}
+#endif
+
#endif /* _LINUX_SET_MEMORY_H_ */
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 03/30] arm64: mm: Enable overlays for all EL1 indirect permissions
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 01/30] mm: Introduce kpkeys Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 02/30] set_memory: Introduce set_memory_pkey() stub Kevin Brodsky
@ 2026-02-27 17:54 ` Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 04/30] arm64: Introduce por_elx_set_pkey_perms() helper Kevin Brodsky
` (26 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:54 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
In preparation for using POE inside the kernel, enable "Overlay
applied" for all stage 1 base permissions in PIR_EL1. This ensures
that the permissions set in POR_EL1 affect all kernel mappings.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/include/asm/pgtable-prot.h | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index d27e8872fe3c..849fec1328ae 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -180,13 +180,13 @@ static inline bool __pure lpa2_is_enabled(void)
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \
PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \
- PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \
- PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \
- PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R) | \
- PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW) | \
- PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \
- PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \
- PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \
- PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL), PIE_RW))
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_RO), PIE_R_O) | \
+ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL), PIE_RW_O))
#endif /* __ASM_PGTABLE_PROT_H */
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 04/30] arm64: Introduce por_elx_set_pkey_perms() helper
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (2 preceding siblings ...)
2026-02-27 17:54 ` [PATCH v6 03/30] arm64: mm: Enable overlays for all EL1 indirect permissions Kevin Brodsky
@ 2026-02-27 17:54 ` Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 05/30] arm64: Implement asm/kpkeys.h using POE Kevin Brodsky
` (25 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:54 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
Introduce a helper that sets the permissions of a given pkey
(POIndex) in the POR_ELx format, and make use of it in
arch_set_user_pkey_access().
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/include/asm/por.h | 7 +++++++
arch/arm64/mm/mmu.c | 26 ++++++++++----------------
2 files changed, 17 insertions(+), 16 deletions(-)
diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h
index d913d5b529e4..bffb4d2b1246 100644
--- a/arch/arm64/include/asm/por.h
+++ b/arch/arm64/include/asm/por.h
@@ -31,4 +31,11 @@ static inline bool por_elx_allows_exec(u64 por, u8 pkey)
return perm & POE_X;
}
+static inline u64 por_elx_set_pkey_perms(u64 por, u8 pkey, u64 perms)
+{
+ u64 shift = POR_ELx_PERM_SHIFT(pkey);
+
+ return (por & ~(POE_MASK << shift)) | (perms << shift);
+}
+
#endif /* _ASM_ARM64_POR_H */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index a6a00accf4f9..09ca62f77a84 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -2208,8 +2208,8 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
#ifdef CONFIG_ARCH_HAS_PKEYS
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
{
- u64 new_por;
- u64 old_por;
+ u64 new_perms;
+ u64 por;
if (!system_supports_poe())
return -ENOSPC;
@@ -2223,25 +2223,19 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i
return -EINVAL;
/* Set the bits we need in POR: */
- new_por = POE_RWX;
+ new_perms = POE_RWX;
if (init_val & PKEY_DISABLE_WRITE)
- new_por &= ~POE_W;
+ new_perms &= ~POE_W;
if (init_val & PKEY_DISABLE_ACCESS)
- new_por &= ~POE_RW;
+ new_perms &= ~POE_RW;
if (init_val & PKEY_DISABLE_READ)
- new_por &= ~POE_R;
+ new_perms &= ~POE_R;
if (init_val & PKEY_DISABLE_EXECUTE)
- new_por &= ~POE_X;
+ new_perms &= ~POE_X;
- /* Shift the bits in to the correct place in POR for pkey: */
- new_por = POR_ELx_PERM_PREP(pkey, new_por);
-
- /* Get old POR and mask off any old bits in place: */
- old_por = read_sysreg_s(SYS_POR_EL0);
- old_por &= ~(POE_MASK << POR_ELx_PERM_SHIFT(pkey));
-
- /* Write old part along with new part: */
- write_sysreg_s(old_por | new_por, SYS_POR_EL0);
+ por = read_sysreg_s(SYS_POR_EL0);
+ por = por_elx_set_pkey_perms(por, pkey, new_perms);
+ write_sysreg_s(por, SYS_POR_EL0);
return 0;
}
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 05/30] arm64: Implement asm/kpkeys.h using POE
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (3 preceding siblings ...)
2026-02-27 17:54 ` [PATCH v6 04/30] arm64: Introduce por_elx_set_pkey_perms() helper Kevin Brodsky
@ 2026-02-27 17:54 ` Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 06/30] arm64: set_memory: Implement set_memory_pkey() Kevin Brodsky
` (24 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:54 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
Implement the kpkeys interface if CONFIG_ARM64_POE is enabled.
The permissions for KPKEYS_PKEY_DEFAULT (pkey 0) are set to RWX as
this pkey is also used for code mappings.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/include/asm/kpkeys.h | 49 +++++++++++++++++++++++++++++++++
1 file changed, 49 insertions(+)
create mode 100644 arch/arm64/include/asm/kpkeys.h
diff --git a/arch/arm64/include/asm/kpkeys.h b/arch/arm64/include/asm/kpkeys.h
new file mode 100644
index 000000000000..3b0ab5e7dd22
--- /dev/null
+++ b/arch/arm64/include/asm/kpkeys.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_KPKEYS_H
+#define __ASM_KPKEYS_H
+
+#include <asm/barrier.h>
+#include <asm/cpufeature.h>
+#include <asm/por.h>
+
+#include <asm-generic/kpkeys.h>
+
+static inline bool arch_kpkeys_enabled(void)
+{
+ return system_supports_poe();
+}
+
+#ifdef CONFIG_ARM64_POE
+
+static inline u64 por_set_kpkeys_level(u64 por, int level)
+{
+ por = por_elx_set_pkey_perms(por, KPKEYS_PKEY_DEFAULT, POE_RWX);
+
+ return por;
+}
+
+static __always_inline void __kpkeys_set_pkey_reg_nosync(u64 pkey_reg)
+{
+ write_sysreg_s(pkey_reg, SYS_POR_EL1);
+}
+
+static __always_inline int arch_kpkeys_set_level(int level)
+{
+ u64 prev_por = read_sysreg_s(SYS_POR_EL1);
+ u64 new_por = por_set_kpkeys_level(prev_por, level);
+
+ __kpkeys_set_pkey_reg_nosync(new_por);
+ isb();
+
+ return prev_por;
+}
+
+static __always_inline void arch_kpkeys_restore_pkey_reg(u64 pkey_reg)
+{
+ __kpkeys_set_pkey_reg_nosync(pkey_reg);
+ isb();
+}
+
+#endif /* CONFIG_ARM64_POE */
+
+#endif /* __ASM_KPKEYS_H */
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 06/30] arm64: set_memory: Implement set_memory_pkey()
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (4 preceding siblings ...)
2026-02-27 17:54 ` [PATCH v6 05/30] arm64: Implement asm/kpkeys.h using POE Kevin Brodsky
@ 2026-02-27 17:54 ` Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 07/30] arm64: Reset POR_EL1 on exception entry Kevin Brodsky
` (23 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:54 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
Implement set_memory_pkey() using POE.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/include/asm/set_memory.h | 4 ++++
arch/arm64/mm/pageattr.c | 26 ++++++++++++++++++++++++++
2 files changed, 30 insertions(+)
diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h
index 90f61b17275e..b6cd6de34abf 100644
--- a/arch/arm64/include/asm/set_memory.h
+++ b/arch/arm64/include/asm/set_memory.h
@@ -19,4 +19,8 @@ bool kernel_page_present(struct page *page);
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
+#ifdef CONFIG_ARCH_HAS_KPKEYS
+int set_memory_pkey(unsigned long addr, int numpages, int pkey);
+#endif
+
#endif /* _ASM_ARM64_SET_MEMORY_H */
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 358d1dc9a576..d2a7e104a5c2 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -9,6 +9,8 @@
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <linux/pagewalk.h>
+#include <linux/pkeys.h>
+#include <linux/kpkeys.h>
#include <asm/cacheflush.h>
#include <asm/pgtable-prot.h>
@@ -359,6 +361,30 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
return set_memory_valid(addr, nr, valid);
}
+#ifdef CONFIG_ARCH_HAS_KPKEYS
+int set_memory_pkey(unsigned long addr, int numpages, int pkey)
+{
+ unsigned long set_prot = 0;
+
+ if (!arch_kpkeys_enabled())
+ return 0;
+
+ if (!__is_lm_address(addr))
+ return -EINVAL;
+
+ if (pkey >= arch_max_pkey())
+ return -EINVAL;
+
+ set_prot |= pkey & BIT(0) ? PTE_PO_IDX_0 : 0;
+ set_prot |= pkey & BIT(1) ? PTE_PO_IDX_1 : 0;
+ set_prot |= pkey & BIT(2) ? PTE_PO_IDX_2 : 0;
+
+ return __change_memory_common(addr, PAGE_SIZE * numpages,
+ __pgprot(set_prot),
+ __pgprot(PTE_PO_IDX_MASK));
+}
+#endif
+
#ifdef CONFIG_DEBUG_PAGEALLOC
/*
* This is - apart from the return value - doing the same
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 07/30] arm64: Reset POR_EL1 on exception entry
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (5 preceding siblings ...)
2026-02-27 17:54 ` [PATCH v6 06/30] arm64: set_memory: Implement set_memory_pkey() Kevin Brodsky
@ 2026-02-27 17:54 ` Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 08/30] arm64: Context-switch POR_EL1 Kevin Brodsky
` (22 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:54 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
POR_EL1 will be modified, through the kpkeys framework, in order to
grant temporary RW access to certain keys. If an exception occurs
in the middle of a "critical section" where POR_EL1 is set to a
privileged value, it is preferable to reset it to its default value
upon taking the exception to minimise the amount of code running at
higher kpkeys level.
Implement the reset of POR_EL1 on exception entry, storing the
original value in a new pt_regs field and restoring it on exception
return. To avoid an expensive ISB, the register is only
reset if the interrupted value isn't the default. No check is made
on the return path as an ISB occurs anyway as part of ERET.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/include/asm/kpkeys.h | 10 ++++++++++
arch/arm64/include/asm/por.h | 4 ++++
arch/arm64/include/asm/ptrace.h | 4 ++++
arch/arm64/kernel/asm-offsets.c | 3 +++
arch/arm64/kernel/entry.S | 24 +++++++++++++++++++++++-
5 files changed, 44 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/kpkeys.h b/arch/arm64/include/asm/kpkeys.h
index 3b0ab5e7dd22..79ae33388088 100644
--- a/arch/arm64/include/asm/kpkeys.h
+++ b/arch/arm64/include/asm/kpkeys.h
@@ -8,6 +8,14 @@
#include <asm-generic/kpkeys.h>
+/*
+ * Equivalent to por_set_kpkeys_level(0, KPKEYS_LVL_DEFAULT), but can also be
+ * used in assembly.
+ */
+#define POR_EL1_INIT POR_ELx_PERM_PREP(KPKEYS_PKEY_DEFAULT, POE_RWX)
+
+#ifndef __ASSEMBLY__
+
static inline bool arch_kpkeys_enabled(void)
{
return system_supports_poe();
@@ -46,4 +54,6 @@ static __always_inline void arch_kpkeys_restore_pkey_reg(u64 pkey_reg)
#endif /* CONFIG_ARM64_POE */
+#endif /* __ASSEMBLY__ */
+
#endif /* __ASM_KPKEYS_H */
diff --git a/arch/arm64/include/asm/por.h b/arch/arm64/include/asm/por.h
index bffb4d2b1246..58dce4b8021b 100644
--- a/arch/arm64/include/asm/por.h
+++ b/arch/arm64/include/asm/por.h
@@ -10,6 +10,8 @@
#define POR_EL0_INIT POR_ELx_PERM_PREP(0, POE_RWX)
+#ifndef __ASSEMBLY__
+
static inline bool por_elx_allows_read(u64 por, u8 pkey)
{
u8 perm = POR_ELx_PERM_GET(pkey, por);
@@ -38,4 +40,6 @@ static inline u64 por_elx_set_pkey_perms(u64 por, u8 pkey, u64 perms)
return (por & ~(POE_MASK << shift)) | (perms << shift);
}
+#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_ARM64_POR_H */
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 39582511ad72..1a258617ab89 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -166,6 +166,10 @@ struct pt_regs {
u64 orig_x0;
s32 syscallno;
u32 pmr;
+#ifdef CONFIG_ARM64_POE
+ u64 por_el1;
+ u64 __unused;
+#endif
u64 sdei_ttbr1;
struct frame_record_meta stackframe;
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index b6367ff3a49c..30b4d0636f58 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -76,6 +76,9 @@ int main(void)
DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno));
DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1));
DEFINE(S_PMR, offsetof(struct pt_regs, pmr));
+#ifdef CONFIG_ARM64_POE
+ DEFINE(S_POR_EL1, offsetof(struct pt_regs, por_el1));
+#endif
DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe));
DEFINE(S_STACKFRAME_TYPE, offsetof(struct pt_regs, stackframe.type));
DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs));
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index f8018b5c1f9a..0dd6f7fbb669 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -20,6 +20,7 @@
#include <asm/errno.h>
#include <asm/esr.h>
#include <asm/irq.h>
+#include <asm/kpkeys.h>
#include <asm/memory.h>
#include <asm/mmu.h>
#include <asm/processor.h>
@@ -277,6 +278,19 @@ alternative_else_nop_endif
.else
add x21, sp, #PT_REGS_SIZE
get_current_task tsk
+#ifdef CONFIG_ARM64_POE
+alternative_if_not ARM64_HAS_S1POE
+ b 1f
+alternative_else_nop_endif
+ mrs_s x0, SYS_POR_EL1
+ str x0, [sp, #S_POR_EL1]
+ mov x1, #POR_EL1_INIT
+ cmp x0, x1
+ b.eq 1f
+ msr_s SYS_POR_EL1, x1
+ isb
+1:
+#endif /* CONFIG_ARM64_POE */
.endif /* \el == 0 */
mrs x22, elr_el1
mrs x23, spsr_el1
@@ -407,7 +421,15 @@ alternative_else_nop_endif
mte_set_user_gcr tsk, x0, x1
apply_ssbd 0, x0, x1
- .endif
+ .else
+#ifdef CONFIG_ARM64_POE
+alternative_if ARM64_HAS_S1POE
+ ldr x0, [sp, #S_POR_EL1]
+ msr_s SYS_POR_EL1, x0
+ /* No explicit ISB; we rely on ERET */
+alternative_else_nop_endif
+#endif /* CONFIG_ARM64_POE */
+ .endif /* \el == 0 */
msr elr_el1, x21 // set up the return data
msr spsr_el1, x22
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 08/30] arm64: Context-switch POR_EL1
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (6 preceding siblings ...)
2026-02-27 17:54 ` [PATCH v6 07/30] arm64: Reset POR_EL1 on exception entry Kevin Brodsky
@ 2026-02-27 17:54 ` Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 09/30] arm64: Initialize POR_EL1 register on cpu_resume() Kevin Brodsky
` (21 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:54 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
POR_EL1 is about to be used by the kpkeys framework, modifying it
for (typically small) sections of code. If an exception occurs
during that window and scheduling occurs, we must ensure that
POR_EL1 is context-switched as needed (saving the old value and
restoring the new one). An ISB is needed to ensure the write takes
effect, so we skip it if the new value is the same as the old, like
for POR_EL0.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/include/asm/processor.h | 1 +
arch/arm64/kernel/process.c | 9 +++++++++
2 files changed, 10 insertions(+)
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index e30c4c8e3a7a..6095322343fc 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -192,6 +192,7 @@ struct thread_struct {
u64 svcr;
u64 tpidr2_el0;
u64 por_el0;
+ u64 por_el1;
#ifdef CONFIG_ARM64_GCS
unsigned int gcs_el0_mode;
unsigned int gcs_el0_locked;
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 489554931231..618fb5ce1c1d 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -429,6 +429,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
ptrauth_thread_init_kernel(p);
+ if (system_supports_poe())
+ p->thread.por_el1 = read_sysreg_s(SYS_POR_EL1);
+
if (likely(!args->fn)) {
*childregs = *current_pt_regs();
childregs->regs[0] = 0;
@@ -679,6 +682,12 @@ static void permission_overlay_switch(struct task_struct *next)
* of POR_EL0.
*/
}
+
+ current->thread.por_el1 = read_sysreg_s(SYS_POR_EL1);
+ if (current->thread.por_el1 != next->thread.por_el1) {
+ write_sysreg_s(next->thread.por_el1, SYS_POR_EL1);
+ isb();
+ }
}
/*
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 09/30] arm64: Initialize POR_EL1 register on cpu_resume()
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (7 preceding siblings ...)
2026-02-27 17:54 ` [PATCH v6 08/30] arm64: Context-switch POR_EL1 Kevin Brodsky
@ 2026-02-27 17:54 ` Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 10/30] arm64: Enable kpkeys Kevin Brodsky
` (20 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:54 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
From: Yeoreum Yun <yeoreum.yun@arm.com>
The POR_EL1 register is reset to an unknown value after cpu_suspend().
Since POR_EL1 always holds POR_EL1_INIT when entering cpu_suspend(),
initialize POR_EL1 with POR_EL1_INIT before cpu_do_resume().
Signed-off-by: Yeoreum Yun <yeoreum.yun@arm.com>
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/kernel/sleep.S | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index f093cdf71be1..e0a6ad85cd24 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -3,6 +3,7 @@
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include <asm/assembler.h>
+#include <asm/kpkeys.h>
#include <asm/smp.h>
.text
@@ -134,6 +135,17 @@ SYM_FUNC_START(_cpu_resume)
/* load sp from context */
ldr x2, [x0, #CPU_CTX_SP]
mov sp, x2
+
+#ifdef CONFIG_ARM64_POE
+alternative_if_not ARM64_HAS_S1POE
+ b .Lskip_por_set
+alternative_else_nop_endif
+ mov_q x2, POR_EL1_INIT
+ msr_s SYS_POR_EL1, x2
+ /* isb can be skipped since cpu_do_resume() will do it. */
+.Lskip_por_set:
+#endif /* CONFIG_ARM64_POE */
+
/*
* cpu_do_resume expects x0 to contain context address pointer
*/
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 10/30] arm64: Enable kpkeys
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (8 preceding siblings ...)
2026-02-27 17:54 ` [PATCH v6 09/30] arm64: Initialize POR_EL1 register on cpu_resume() Kevin Brodsky
@ 2026-02-27 17:54 ` Kevin Brodsky
2026-02-27 17:54 ` [PATCH v6 11/30] memblock: Move INIT_MEMBLOCK_* macros to header Kevin Brodsky
` (19 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:54 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
This is the final step to enable kpkeys on arm64. We enable
POE at EL1 by setting TCR2_EL1.POE, and initialise POR_EL1 to the
default value, enabling access to the default pkey/POIndex (0).
An ISB is added so that POE restrictions are enforced immediately.
Having done this, we can now select ARCH_HAS_KPKEYS if ARM64_POE is
enabled.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/cpufeature.c | 5 ++++-
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 38dba5f7e4d2..732d4dbbab20 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2164,6 +2164,7 @@ config ARM64_POE
def_bool y
select ARCH_USES_HIGH_VMA_FLAGS
select ARCH_HAS_PKEYS
+ select ARCH_HAS_KPKEYS
help
The Permission Overlay Extension is used to implement Memory
Protection Keys. Memory Protection Keys provides a mechanism for
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index c31f8e17732a..3b01a70b617b 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -76,6 +76,7 @@
#include <linux/kasan.h>
#include <linux/percpu.h>
#include <linux/sched/isolation.h>
+#include <linux/kpkeys.h>
#include <asm/cpu.h>
#include <asm/cpufeature.h>
@@ -2447,8 +2448,10 @@ static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused)
#ifdef CONFIG_ARM64_POE
static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused)
{
- sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1_E0POE);
+ write_sysreg_s(POR_EL1_INIT, SYS_POR_EL1);
+ sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1_E0POE | TCR2_EL1_POE);
sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_E0POE);
+ isb();
}
#endif
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread

* [PATCH v6 11/30] memblock: Move INIT_MEMBLOCK_* macros to header
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (9 preceding siblings ...)
2026-02-27 17:54 ` [PATCH v6 10/30] arm64: Enable kpkeys Kevin Brodsky
@ 2026-02-27 17:54 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 12/30] set_memory: Introduce arch_has_pte_only_direct_map() Kevin Brodsky
` (18 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:54 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
The upcoming page table allocator for the kpkeys_hardened_pgtables
feature will need to know the maximum number of memblock regions.
Move the corresponding macros to <linux/memblock.h> to allow that.
INIT_MEMBLOCK_{RESERVED,MEMORY}_REGIONS may be overridden, but this
should be fine as only arm64 and loongarch currently do that and the
relevant header is already (indirectly) included by
<linux/memblock.h>.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
include/linux/memblock.h | 11 +++++++++++
mm/memblock.c | 11 -----------
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 6ec5e9ac0699..79f3ca8ff268 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -24,6 +24,17 @@ extern unsigned long max_pfn;
*/
extern unsigned long long max_possible_pfn;
+#define INIT_MEMBLOCK_REGIONS 128
+#define INIT_PHYSMEM_REGIONS 4
+
+#ifndef INIT_MEMBLOCK_RESERVED_REGIONS
+#define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS
+#endif
+
+#ifndef INIT_MEMBLOCK_MEMORY_REGIONS
+#define INIT_MEMBLOCK_MEMORY_REGIONS INIT_MEMBLOCK_REGIONS
+#endif
+
/**
* enum memblock_flags - definition of memory region attributes
* @MEMBLOCK_NONE: no special request
diff --git a/mm/memblock.c b/mm/memblock.c
index b3ddfdec7a80..954eb76aa0c2 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -29,17 +29,6 @@
#include "internal.h"
-#define INIT_MEMBLOCK_REGIONS 128
-#define INIT_PHYSMEM_REGIONS 4
-
-#ifndef INIT_MEMBLOCK_RESERVED_REGIONS
-# define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS
-#endif
-
-#ifndef INIT_MEMBLOCK_MEMORY_REGIONS
-#define INIT_MEMBLOCK_MEMORY_REGIONS INIT_MEMBLOCK_REGIONS
-#endif
-
/**
* DOC: memblock overview
*
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread

* [PATCH v6 12/30] set_memory: Introduce arch_has_pte_only_direct_map()
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (10 preceding siblings ...)
2026-02-27 17:54 ` [PATCH v6 11/30] memblock: Move INIT_MEMBLOCK_* macros to header Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 13/30] mm: kpkeys: Introduce kpkeys_hardened_pgtables feature Kevin Brodsky
` (17 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
Introduce a helper that returns whether the linear map is fully
PTE-mapped (i.e. no block mapping is used). This is a runtime
decision for some architectures, hence the need for a new arch
helper.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
include/linux/set_memory.h | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index 7b3a8bfde3c6..c6df55422da2 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -58,6 +58,19 @@ static inline bool can_set_direct_map(void)
#endif
#endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */
+#ifndef arch_has_pte_only_direct_map
+/*
+ * The default assumption is that the direct map might be created using block
+ * mappings (PMD or higher). An architecture may override this if the direct
+ * map is fully PTE-mapped.
+ */
+static inline bool arch_has_pte_only_direct_map(void)
+{
+ return false;
+}
+#define arch_has_pte_only_direct_map arch_has_pte_only_direct_map
+#endif
+
#ifdef CONFIG_X86_64
int set_mce_nospec(unsigned long pfn);
int clear_mce_nospec(unsigned long pfn);
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread

* [PATCH v6 13/30] mm: kpkeys: Introduce kpkeys_hardened_pgtables feature
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (11 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 12/30] set_memory: Introduce arch_has_pte_only_direct_map() Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 14/30] mm: kpkeys: Introduce block-based page table allocator Kevin Brodsky
` (16 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
kpkeys_hardened_pgtables is a hardening feature based on kpkeys. It
aims to prevent the corruption of page tables by: 1. mapping all
page table pages, both kernel and user, with a privileged pkey
(KPKEYS_PKEY_PGTABLES), and 2. granting write access to that pkey
only when running at a higher kpkeys level (KPKEYS_LVL_PGTABLES).
This patch introduces basic infrastructure; the implementation of
both aspects will follow.
The feature is exposed as CONFIG_KPKEYS_HARDENED_PGTABLES; it
requires explicit architecture opt-in by selecting
ARCH_HAS_KPKEYS_HARDENED_PGTABLES, since much of the page table
handling is arch-specific.
Because this feature relies on kpkeys being available and enabled,
and modifies attributes of the linear map, it must be inactive on
boot. kpkeys_hardened_pgtables_init() enables it by toggling a
static key; this function must be called by supported architectures
in mem_init(), before any call to pagetable_alloc() is made.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
include/asm-generic/kpkeys.h | 4 ++++
include/linux/kpkeys.h | 30 +++++++++++++++++++++++++++++-
mm/Kconfig | 3 +++
mm/Makefile | 1 +
mm/kpkeys_hardened_pgtables.c | 13 +++++++++++++
security/Kconfig.hardening | 12 ++++++++++++
6 files changed, 62 insertions(+), 1 deletion(-)
create mode 100644 mm/kpkeys_hardened_pgtables.c
diff --git a/include/asm-generic/kpkeys.h b/include/asm-generic/kpkeys.h
index ab819f157d6a..cec92334a9f3 100644
--- a/include/asm-generic/kpkeys.h
+++ b/include/asm-generic/kpkeys.h
@@ -2,6 +2,10 @@
#ifndef __ASM_GENERIC_KPKEYS_H
#define __ASM_GENERIC_KPKEYS_H
+#ifndef KPKEYS_PKEY_PGTABLES
+#define KPKEYS_PKEY_PGTABLES 1
+#endif
+
#ifndef KPKEYS_PKEY_DEFAULT
#define KPKEYS_PKEY_DEFAULT 0
#endif
diff --git a/include/linux/kpkeys.h b/include/linux/kpkeys.h
index faa6e2615798..49af2ec76923 100644
--- a/include/linux/kpkeys.h
+++ b/include/linux/kpkeys.h
@@ -4,11 +4,13 @@
#include <linux/bug.h>
#include <linux/cleanup.h>
+#include <linux/jump_label.h>
#define KPKEYS_LVL_DEFAULT 0
+#define KPKEYS_LVL_PGTABLES 1
#define KPKEYS_LVL_MIN KPKEYS_LVL_DEFAULT
-#define KPKEYS_LVL_MAX KPKEYS_LVL_DEFAULT
+#define KPKEYS_LVL_MAX KPKEYS_LVL_PGTABLES
#define __KPKEYS_GUARD(name, set_level, restore_pkey_reg, set_arg, ...) \
__DEFINE_CLASS_IS_CONDITIONAL(name, false); \
@@ -110,4 +112,30 @@ static inline bool arch_kpkeys_enabled(void)
#endif /* CONFIG_ARCH_HAS_KPKEYS */
+#ifdef CONFIG_KPKEYS_HARDENED_PGTABLES
+
+DECLARE_STATIC_KEY_FALSE(kpkeys_hardened_pgtables_key);
+
+static inline bool kpkeys_hardened_pgtables_enabled(void)
+{
+ return static_branch_unlikely(&kpkeys_hardened_pgtables_key);
+}
+
+/*
+ * Should be called from mem_init(): as soon as the buddy allocator becomes
+ * available and before any call to pagetable_alloc().
+ */
+void kpkeys_hardened_pgtables_init(void);
+
+#else /* CONFIG_KPKEYS_HARDENED_PGTABLES */
+
+static inline bool kpkeys_hardened_pgtables_enabled(void)
+{
+ return false;
+}
+
+static inline void kpkeys_hardened_pgtables_init(void) {}
+
+#endif /* CONFIG_KPKEYS_HARDENED_PGTABLES */
+
#endif /* _LINUX_KPKEYS_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 2baedee59bb2..2f87ee69d16e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1245,6 +1245,9 @@ config ARCH_HAS_PKEYS
bool
config ARCH_HAS_KPKEYS
bool
+# ARCH_HAS_KPKEYS must be selected when selecting this option
+config ARCH_HAS_KPKEYS_HARDENED_PGTABLES
+ bool
config ARCH_USES_PG_ARCH_2
bool
diff --git a/mm/Makefile b/mm/Makefile
index 8ad2ab08244e..7603e6051afa 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -150,3 +150,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_EXECMEM) += execmem.o
obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
obj-$(CONFIG_LAZY_MMU_MODE_KUNIT_TEST) += tests/lazy_mmu_mode_kunit.o
+obj-$(CONFIG_KPKEYS_HARDENED_PGTABLES) += kpkeys_hardened_pgtables.o
diff --git a/mm/kpkeys_hardened_pgtables.c b/mm/kpkeys_hardened_pgtables.c
new file mode 100644
index 000000000000..9e4771263ad2
--- /dev/null
+++ b/mm/kpkeys_hardened_pgtables.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kpkeys.h>
+#include <linux/mm.h>
+
+__ro_after_init DEFINE_STATIC_KEY_FALSE(kpkeys_hardened_pgtables_key);
+
+void __init kpkeys_hardened_pgtables_init(void)
+{
+ if (!arch_kpkeys_enabled())
+ return;
+
+ static_branch_enable(&kpkeys_hardened_pgtables_key);
+}
diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index 86f8768c63d4..fdaf977d4626 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -275,6 +275,18 @@ config BUG_ON_DATA_CORRUPTION
If unsure, say N.
+config KPKEYS_HARDENED_PGTABLES
+ bool "Harden page tables using kernel pkeys"
+ depends on ARCH_HAS_KPKEYS_HARDENED_PGTABLES
+ help
+ This option makes all page tables mostly read-only by
+ allocating them with a non-default protection key (pkey) and
+ only enabling write access to that pkey in routines that are
+ expected to write to page table entries.
+
+ This option has no effect if the system does not support
+ kernel pkeys.
+
endmenu
config CC_HAS_RANDSTRUCT
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread

* [PATCH v6 14/30] mm: kpkeys: Introduce block-based page table allocator
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (12 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 13/30] mm: kpkeys: Introduce kpkeys_hardened_pgtables feature Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 15/30] mm: kpkeys: Handle splitting of linear map Kevin Brodsky
` (15 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
If the kpkeys_hardened_pgtables feature is enabled, page table pages
(PTPs) should be protected by modifying the linear mapping to map
them with a privileged pkey (KPKEYS_PKEY_PGTABLES). This patch
introduces a new page allocator for that purpose:
* kpkeys_pgtable_alloc() allocates a new PTP and sets the linear
mapping to KPKEYS_PKEY_PGTABLES for that page
* kpkeys_pgtable_free() frees such a PTP and restores the linear
mapping to the default pkey
If the linear map is fully PTE-mapped (as per
arch_has_pte_only_direct_map()), there is no need to introduce extra
state - the functions above directly call into the buddy allocator
and set the pkey for the given page.
Such an approach is however insufficient when block mappings are
used, because setting the pkey for individual pages is likely to
result in blocks getting split up: for a PMD block, (1 << PMD_ORDER)
PTEs are created, and the pkey is then set in the appropriate PTE.
This is doubly expensive: 1. a new PTE page must be allocated and
the entries updated accordingly; and 2. TLB pressure increases due
to the additional entries. Worse still, 1. is likely to be
recursive, as allocating a new PTE page with the appropriate pkey
could in turn require splitting a PMD.
This patch introduces a simple allocator for this specific use-case.
All PTPs are allocated from a global pool mapped with
KPKEYS_PKEY_PGTABLES; the pool is refilled with whole blocks if
possible (setting the pkey at PMD level). This greatly reduces the
number of blocks getting split - splitting should only occur under
memory pressure.
Important limitations:
* Special handling is required when refilling the cache, since
set_memory_pkey() may split a PUD/PMD block, requiring a new
PMD/PTE page to be allocated. This will be addressed in subsequent
patches.
* Cached pages should be reclaimable under memory pressure. This
will also be handled in a later patch.
* Only PTP allocations of order 0 and a small set of GFP flags
(PBA_GFP_OPT_MASK) are supported. That should be good enough to
cover the architectures that support pkeys (arm64, x86, powerpc).
* Pages are zeroed on alloc if requested (__GFP_ZERO). There is no
support for init_on_free, PAGE_POISONING and other debug features.
Also noteworthy: spin_lock_bh is used as PTPs may be freed in
softirq context (RCU).
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
include/linux/kpkeys.h | 10 ++
include/linux/mm.h | 14 +-
mm/kpkeys_hardened_pgtables.c | 307 ++++++++++++++++++++++++++++++++++
3 files changed, 329 insertions(+), 2 deletions(-)
diff --git a/include/linux/kpkeys.h b/include/linux/kpkeys.h
index 49af2ec76923..303ddef6752c 100644
--- a/include/linux/kpkeys.h
+++ b/include/linux/kpkeys.h
@@ -121,6 +121,9 @@ static inline bool kpkeys_hardened_pgtables_enabled(void)
return static_branch_unlikely(&kpkeys_hardened_pgtables_key);
}
+struct page *kpkeys_pgtable_alloc(gfp_t gfp);
+void kpkeys_pgtable_free(struct page *page);
+
/*
* Should be called from mem_init(): as soon as the buddy allocator becomes
* available and before any call to pagetable_alloc().
@@ -134,6 +137,13 @@ static inline bool kpkeys_hardened_pgtables_enabled(void)
return false;
}
+static inline struct page *kpkeys_pgtable_alloc(gfp_t gfp)
+{
+ return NULL;
+}
+
+static inline void kpkeys_pgtable_free(struct page *page) {}
+
static inline void kpkeys_hardened_pgtables_init(void) {}
#endif /* CONFIG_KPKEYS_HARDENED_PGTABLES */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5be3d8a8f806..c3eab0228608 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -38,6 +38,7 @@
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/iommu-debug-pagealloc.h>
+#include <linux/kpkeys.h>
struct mempolicy;
struct anon_vma;
@@ -3401,7 +3402,12 @@ static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc)
*/
static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
{
- struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);
+ struct page *page;
+
+ if (kpkeys_hardened_pgtables_enabled() && !WARN_ON_ONCE(order != 0))
+ page = kpkeys_pgtable_alloc(gfp);
+ else
+ page = alloc_pages_noprof(gfp | __GFP_COMP, order);
return page_ptdesc(page);
}
@@ -3410,8 +3416,12 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde
static inline void __pagetable_free(struct ptdesc *pt)
{
struct page *page = ptdesc_page(pt);
+ unsigned int order = compound_order(page);
- __free_pages(page, compound_order(page));
+ if (kpkeys_hardened_pgtables_enabled() && order == 0)
+ kpkeys_pgtable_free(page);
+ else
+ __free_pages(page, order);
}
#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
diff --git a/mm/kpkeys_hardened_pgtables.c b/mm/kpkeys_hardened_pgtables.c
index 9e4771263ad2..da5695da518d 100644
--- a/mm/kpkeys_hardened_pgtables.c
+++ b/mm/kpkeys_hardened_pgtables.c
@@ -1,13 +1,320 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
#include <linux/kpkeys.h>
+#include <linux/memcontrol.h>
#include <linux/mm.h>
+#include <linux/set_memory.h>
__ro_after_init DEFINE_STATIC_KEY_FALSE(kpkeys_hardened_pgtables_key);
+static int set_pkey_pgtable(struct page *page, unsigned int nr_pages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+ int ret;
+
+ ret = set_memory_pkey(addr, nr_pages, KPKEYS_PKEY_PGTABLES);
+
+ WARN_ON(ret);
+ return ret;
+}
+
+static int set_pkey_default(struct page *page, unsigned int nr_pages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+ int ret;
+
+ ret = set_memory_pkey(addr, nr_pages, KPKEYS_PKEY_DEFAULT);
+
+ WARN_ON(ret);
+ return ret;
+}
+
+/* pkeys block allocator (PBA) - implemented below */
+static bool pba_enabled(void);
+static struct page *pba_pgtable_alloc(gfp_t gfp);
+static void pba_pgtable_free(struct page *page);
+static void pba_init(void);
+
+/* Trivial allocator in case the linear map is PTE-mapped (no block mapping) */
+static struct page *noblock_pgtable_alloc(gfp_t gfp)
+{
+ struct page *page;
+ int ret;
+
+ page = alloc_pages_noprof(gfp, 0);
+ if (!page)
+ return page;
+
+ ret = set_pkey_pgtable(page, 1);
+ if (ret) {
+ __free_page(page);
+ return NULL;
+ }
+
+ return page;
+}
+
+static void noblock_pgtable_free(struct page *page)
+{
+ set_pkey_default(page, 1);
+ __free_page(page);
+}
+
+/* Public interface */
+struct page *kpkeys_pgtable_alloc(gfp_t gfp)
+{
+ if (pba_enabled())
+ return pba_pgtable_alloc(gfp);
+ else
+ return noblock_pgtable_alloc(gfp);
+}
+
+void kpkeys_pgtable_free(struct page *page)
+{
+ if (pba_enabled())
+ pba_pgtable_free(page);
+ else
+ noblock_pgtable_free(page);
+}
+
void __init kpkeys_hardened_pgtables_init(void)
{
if (!arch_kpkeys_enabled())
return;
+ pba_init();
static_branch_enable(&kpkeys_hardened_pgtables_key);
}
+
+/*
+ * pkeys block allocator (PBA): dedicated page table allocator for block-mapped
+ * linear map. Block splitting is minimised by prioritising the allocation and
+ * freeing of full blocks.
+ */
+#define PBA_GFP_ALLOC GFP_KERNEL
+#define PBA_GFP_OPT_MASK (__GFP_ZERO | __GFP_ACCOUNT)
+
+#define BLOCK_ORDER PMD_ORDER
+
+/*
+ * Refilling the cache is done by attempting allocation in decreasing orders
+ * (higher orders may not be available due to memory pressure). The specific
+ * orders are tweaked based on the page size.
+ *
+ * - A whole block (PMD_ORDER) is the preferred size. A lower order is used
+ * for page sizes above 16K to avoid reserving too much memory for page
+ * tables (a PMD block is 512 MB for 64K pages on arm64).
+ *
+ * - The next order corresponds to the contpte size on arm64, which helps to
+ * reduce TLB pressure. Other architectures may prefer other values.
+ *
+ * - The last order *must* be 2 (4 pages) to guarantee that __refill_pages()
+ * actually increases the number of cached pages - up to 2 cached pages
+ * may be used up by set_memory_pkey() for splitting the linear map.
+ */
+static const unsigned int refill_orders[] =
+#if PAGE_SHIFT <= 12
+ { BLOCK_ORDER, 4, 2 } /* 4K pages */
+#elif PAGE_SHIFT <= 14
+ { BLOCK_ORDER, 7, 2 } /* 16K pages */
+#else
+ { 9, 5, 2 } /* 64K pages */
+#endif
+;
+
+struct pkeys_block_allocator {
+ struct list_head cached_list;
+ unsigned long nr_cached;
+ spinlock_t lock;
+};
+
+static struct pkeys_block_allocator pkeys_block_allocator = {
+ .cached_list = LIST_HEAD_INIT(pkeys_block_allocator.cached_list),
+ .nr_cached = 0,
+ .lock = __SPIN_LOCK_UNLOCKED(pkeys_block_allocator.lock),
+};
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(pba_enabled_key);
+
+static bool pba_enabled(void)
+{
+ return static_branch_likely(&pba_enabled_key);
+}
+
+static void cached_list_add_pages(struct page *page, unsigned int nr_pages)
+{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+
+ for (unsigned int i = 0; i < nr_pages; i++)
+ list_add(&page[i].lru, &pba->cached_list);
+
+ pba->nr_cached += nr_pages;
+}
+
+static void cached_list_del_page(struct page *page)
+{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+
+ list_del(&page->lru);
+ pba->nr_cached--;
+}
+
+static void __refill_pages_add_to_cache(struct page *page, unsigned int order,
+ bool alloc_one)
+{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+ unsigned int nr_pages = 1 << order;
+
+ if (alloc_one) {
+ page++;
+ nr_pages--;
+ }
+
+ guard(spinlock_bh)(&pba->lock);
+
+ cached_list_add_pages(page, nr_pages);
+}
+
+static struct page *__refill_pages(bool alloc_one)
+{
+ struct page *page;
+ unsigned int order;
+ int ret;
+
+ for (int i = 0; i < ARRAY_SIZE(refill_orders); ++i) {
+ order = refill_orders[i];
+ page = alloc_pages_noprof(PBA_GFP_ALLOC, order);
+ if (page)
+ break;
+ }
+
+ if (!page)
+ return NULL;
+
+ pr_debug("%s: order=%d, pfn=%lx\n", __func__, order, page_to_pfn(page));
+
+ ret = set_pkey_pgtable(page, 1 << order);
+
+ if (ret) {
+ __free_pages(page, order);
+ return NULL;
+ }
+
+ /* Each page is going to be allocated individually */
+ split_page(page, order);
+
+ __refill_pages_add_to_cache(page, order, alloc_one);
+
+ return page;
+}
+
+static struct page *refill_pages_and_alloc_one(void)
+{
+ return __refill_pages(true);
+}
+
+static bool cached_page_available(void)
+{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+
+ return pba->nr_cached > 0;
+}
+
+static struct page *get_cached_page(gfp_t gfp)
+{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+ struct page *page;
+
+ guard(spinlock_bh)(&pba->lock);
+
+ if (!cached_page_available())
+ return NULL;
+
+ page = list_first_entry_or_null(&pba->cached_list, struct page, lru);
+ if (WARN_ON(!page))
+ return NULL;
+
+ cached_list_del_page(page);
+ return page;
+}
+
+static void check_gfp(gfp_t gfp)
+{
+ VM_WARN_ON_ONCE((gfp & PBA_GFP_ALLOC) != PBA_GFP_ALLOC);
+
+ gfp &= ~(PBA_GFP_ALLOC | PBA_GFP_OPT_MASK);
+
+ VM_WARN_ONCE(gfp, "Unexpected gfp: %pGg\n", &gfp);
+}
+
+static int prepare_page(struct page *page, gfp_t gfp)
+{
+ if (gfp & __GFP_ACCOUNT) {
+ int ret = memcg_kmem_charge_page(page, gfp, 0);
+
+ if (unlikely(ret))
+ return ret;
+ }
+
+ /*
+ * __refill_pages() only guarantees that page_private is zeroed for the
+ * head page, so it is safer to zero it every time we allocate a new
+ * page.
+ */
+ set_page_private(page, 0);
+
+ if (gfp & __GFP_ZERO) {
+ u64 saved_pkey_reg;
+
+ /*
+ * The page is mapped with KPKEYS_PKEY_PGTABLES so we need
+ * to switch to the corresponding kpkeys level to write to it.
+ */
+ saved_pkey_reg = kpkeys_set_level(KPKEYS_LVL_PGTABLES);
+ clear_highpage(page);
+ kpkeys_restore_pkey_reg(saved_pkey_reg);
+ }
+
+ return 0;
+}
+
+static struct page *pba_pgtable_alloc(gfp_t gfp)
+{
+ struct page *page;
+
+ check_gfp(gfp);
+
+ page = get_cached_page(gfp);
+
+ if (!page)
+ page = refill_pages_and_alloc_one();
+ WARN_ON(!page);
+
+ if (page && prepare_page(page, gfp)) {
+ kpkeys_pgtable_free(page);
+ return NULL;
+ }
+
+ return page;
+}
+
+static void pba_pgtable_free(struct page *page)
+{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+
+ memcg_kmem_uncharge_page(page, 0);
+
+ guard(spinlock_bh)(&pba->lock);
+
+ cached_list_add_pages(page, 1);
+}
+
+static void __init pba_init(void)
+{
+ if (arch_has_pte_only_direct_map())
+ return;
+
+ static_branch_enable(&pba_enabled_key);
+}
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread

* [PATCH v6 15/30] mm: kpkeys: Handle splitting of linear map
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (13 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 14/30] mm: kpkeys: Introduce block-based page table allocator Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 16/30] mm: kpkeys: Defer early call to set_memory_pkey() Kevin Brodsky
` (14 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
When block mappings are used for the linear map, the kpkeys page
table allocator attempts to cache whole blocks to reduce splitting.
However, splitting cannot be fully avoided, if only because
allocating a (PMD) block may require splitting a PUD.
This requires special handling because we cannot recursively split
the linear map: to ensure that all page table pages (PTPs) are
mapped with the privileged pkey at all times, we need a reserve of
PTPs that can be used to split the linear map (inserted at PMD
and/or PTE level). This reserve is made up of 2 groups of 2 pages
(PMD + PTE):
1. 2 pages for set_memory_pkey() while refilling the allocator's
page cache. A mutex is used to guarantee that only one such
splitting happens at a time. These 2 pages are always available,
and are replenished by the refill operation itself (which yields
at least 4 pages: order >= 2).
2. 2 pages for any other splitting operation (e.g. set_memory_pkey()
in another context or set_memory_ro()). In this case we need to
explicitly replenish the reserve before attempting the operation;
a new API is introduced for that purpose:
* kpkeys_prepare_direct_map_split() performs a refill if the
reserve needs to be replenished. It should be called by the
relevant architecture code and doesn't require locking.
* kpkeys_ready_for_direct_map_split() returns whether splitting
can be performed. This should be called once the linear map lock
has been acquired. If false, the lock should be released and
another refill attempted.
The first group needs to be populated on startup before the
kpkeys_hardened_pgtables feature is enabled; this is done by filling
up the page cache in pba_init().
The page reserve is accessed by passing a new flag
__GFP_PGTABLE_SPLIT. This is probably overkill for such a narrow
use-case, but it avoids invasive changes to the pagetable_alloc()
logic.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
Adding a GFP flag was just the easy thing to do - alternative
suggestions welcome (new variant of pagetable_alloc()?)
Relying on the owner of alloc_mutex to decide which reserve to use isn't
very pretty, but there doesn't seem to be a simpler solution here.
---
include/linux/gfp_types.h | 3 +
include/linux/kpkeys.h | 13 +++++
mm/kpkeys_hardened_pgtables.c | 100 ++++++++++++++++++++++++++++++++--
3 files changed, 112 insertions(+), 4 deletions(-)
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 814bb2892f99..34e882c9253d 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -56,6 +56,7 @@ enum {
___GFP_NOLOCKDEP_BIT,
#endif
___GFP_NO_OBJ_EXT_BIT,
+ ___GFP_PGTABLE_SPLIT_BIT,
___GFP_LAST_BIT
};
@@ -97,6 +98,7 @@ enum {
#define ___GFP_NOLOCKDEP 0
#endif
#define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT)
+#define ___GFP_PGTABLE_SPLIT BIT(___GFP_PGTABLE_SPLIT_BIT)
/*
* Physical address zone modifiers (see linux/mmzone.h - low four bits)
@@ -146,6 +148,7 @@ enum {
#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)
#define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT)
#define __GFP_NO_OBJ_EXT ((__force gfp_t)___GFP_NO_OBJ_EXT)
+#define __GFP_PGTABLE_SPLIT ((__force gfp_t)___GFP_PGTABLE_SPLIT)
/**
* DOC: Watermark modifiers
diff --git a/include/linux/kpkeys.h b/include/linux/kpkeys.h
index 303ddef6752c..983f55655dde 100644
--- a/include/linux/kpkeys.h
+++ b/include/linux/kpkeys.h
@@ -124,6 +124,9 @@ static inline bool kpkeys_hardened_pgtables_enabled(void)
struct page *kpkeys_pgtable_alloc(gfp_t gfp);
void kpkeys_pgtable_free(struct page *page);
+int kpkeys_prepare_direct_map_split(void);
+bool kpkeys_ready_for_direct_map_split(void);
+
/*
* Should be called from mem_init(): as soon as the buddy allocator becomes
* available and before any call to pagetable_alloc().
@@ -142,6 +145,16 @@ static inline struct page *kpkeys_pgtable_alloc(gfp_t gfp)
return NULL;
}
+static inline int kpkeys_prepare_direct_map_split(void)
+{
+ return 0;
+}
+
+static inline bool kpkeys_ready_for_direct_map_split(void)
+{
+ return true;
+}
+
static inline void kpkeys_pgtable_free(struct page *page) {}
static inline void kpkeys_hardened_pgtables_init(void) {}
diff --git a/mm/kpkeys_hardened_pgtables.c b/mm/kpkeys_hardened_pgtables.c
index da5695da518d..5b1231e1422a 100644
--- a/mm/kpkeys_hardened_pgtables.c
+++ b/mm/kpkeys_hardened_pgtables.c
@@ -5,6 +5,7 @@
#include <linux/kpkeys.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
+#include <linux/mutex.h>
#include <linux/set_memory.h>
__ro_after_init DEFINE_STATIC_KEY_FALSE(kpkeys_hardened_pgtables_key);
@@ -35,6 +36,8 @@ static int set_pkey_default(struct page *page, unsigned int nr_pages)
static bool pba_enabled(void);
static struct page *pba_pgtable_alloc(gfp_t gfp);
static void pba_pgtable_free(struct page *page);
+static int pba_prepare_direct_map_split(void);
+static bool pba_ready_for_direct_map_split(void);
static void pba_init(void);
/* Trivial allocator in case the linear map is PTE-mapped (no block mapping) */
@@ -79,6 +82,22 @@ void kpkeys_pgtable_free(struct page *page)
noblock_pgtable_free(page);
}
+int kpkeys_prepare_direct_map_split(void)
+{
+ if (pba_enabled())
+ return pba_prepare_direct_map_split();
+
+ return 0;
+}
+
+bool kpkeys_ready_for_direct_map_split(void)
+{
+ if (pba_enabled())
+ return pba_ready_for_direct_map_split();
+
+ return true;
+}
+
void __init kpkeys_hardened_pgtables_init(void)
{
if (!arch_kpkeys_enabled())
@@ -94,7 +113,24 @@ void __init kpkeys_hardened_pgtables_init(void)
* freeing of full blocks.
*/
#define PBA_GFP_ALLOC GFP_KERNEL
-#define PBA_GFP_OPT_MASK (__GFP_ZERO | __GFP_ACCOUNT)
+#define PBA_GFP_OPT_MASK (__GFP_ZERO | __GFP_ACCOUNT | __GFP_PGTABLE_SPLIT)
+
+/*
+ * Pages need to be reserved for splitting the linear map; __GFP_PGTABLE_SPLIT
+ * must be passed to access these pages. 4 pages are reserved:
+ *
+ * - 2 in case a PMD and/or PTE page needs to be allocated if set_memory_pkey()
+ * splits the linear map while refilling our own page cache (see
+ * __refill_pages()). These 2 pages must always be available as we cannot
+ * refill recursively. They are protected by alloc_mutex and are guaranteed to
+ * be replenished when refilling is complete and we release the mutex.
+ *
+ * - 2 for splitting the linear map for any other purpose (e.g. calling
+ * set_memory_pkey() or set_memory_ro() on an arbitrary range). These pages
+ * are replenished before the split is attempted, see
+ * kpkeys_prepare_direct_map_split().
+ */
+#define PBA_NR_RESERVED_PAGES 4
#define BLOCK_ORDER PMD_ORDER
@@ -128,12 +164,14 @@ struct pkeys_block_allocator {
struct list_head cached_list;
unsigned long nr_cached;
spinlock_t lock;
+ struct mutex alloc_mutex;
};
static struct pkeys_block_allocator pkeys_block_allocator = {
.cached_list = LIST_HEAD_INIT(pkeys_block_allocator.cached_list),
.nr_cached = 0,
.lock = __SPIN_LOCK_UNLOCKED(pkeys_block_allocator.lock),
+ .alloc_mutex = __MUTEX_INITIALIZER(pkeys_block_allocator.alloc_mutex)
};
static __ro_after_init DEFINE_STATIC_KEY_FALSE(pba_enabled_key);
@@ -143,6 +181,13 @@ static bool pba_enabled(void)
return static_branch_likely(&pba_enabled_key);
}
+static bool alloc_mutex_locked(void)
+{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+
+ return mutex_get_owner(&pba->alloc_mutex) == (unsigned long)current;
+}
+
static void cached_list_add_pages(struct page *page, unsigned int nr_pages)
{
struct pkeys_block_allocator *pba = &pkeys_block_allocator;
@@ -179,6 +224,7 @@ static void __refill_pages_add_to_cache(struct page *page, unsigned int order,
static struct page *__refill_pages(bool alloc_one)
{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
struct page *page;
unsigned int order;
int ret;
@@ -195,6 +241,8 @@ static struct page *__refill_pages(bool alloc_one)
pr_debug("%s: order=%d, pfn=%lx\n", __func__, order, page_to_pfn(page));
+ guard(mutex)(&pba->alloc_mutex);
+
ret = set_pkey_pgtable(page, 1 << order);
if (ret) {
@@ -210,16 +258,27 @@ static struct page *__refill_pages(bool alloc_one)
return page;
}
+static int refill_pages(void)
+{
+ return __refill_pages(false) ? 0 : -ENOMEM;
+}
+
static struct page *refill_pages_and_alloc_one(void)
{
return __refill_pages(true);
}
-static bool cached_page_available(void)
+static bool cached_page_available(gfp_t gfp)
{
struct pkeys_block_allocator *pba = &pkeys_block_allocator;
- return pba->nr_cached > 0;
+ if (gfp & __GFP_PGTABLE_SPLIT) {
+ pr_debug("%s: split pgtable (nr_cached: %lu, in_alloc: %d)\n",
+ __func__, pba->nr_cached, alloc_mutex_locked());
+ return true;
+ }
+
+ return pba->nr_cached > PBA_NR_RESERVED_PAGES;
}
static struct page *get_cached_page(gfp_t gfp)
@@ -229,7 +288,7 @@ static struct page *get_cached_page(gfp_t gfp)
guard(spinlock_bh)(&pba->lock);
- if (!cached_page_available())
+ if (!cached_page_available(gfp))
return NULL;
page = list_first_entry_or_null(&pba->cached_list, struct page, lru);
@@ -311,10 +370,43 @@ static void pba_pgtable_free(struct page *page)
cached_list_add_pages(page, 1);
}
+static int pba_prepare_direct_map_split(void)
+{
+ if (pba_ready_for_direct_map_split())
+ return 0;
+
+ /* Ensure we have at least PBA_NR_RESERVED_PAGES available */
+ return refill_pages();
+}
+
+static bool pba_ready_for_direct_map_split(void)
+{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+
+ /*
+ * For a regular split, we must ensure the reserve is fully replenished
+ * before splitting (which may consume 2 pages out of 4).
+ *
+ * When refilling our cache, alloc_mutex is locked and we must use
+ * pages from the reserve (remaining 2 pages).
+ */
+ return READ_ONCE(pba->nr_cached) >= PBA_NR_RESERVED_PAGES ||
+ alloc_mutex_locked();
+}
+
static void __init pba_init(void)
{
+ int ret;
+
if (arch_has_pte_only_direct_map())
return;
static_branch_enable(&pba_enabled_key);
+
+ /*
+ * Refill the cache so that the reserve pages are available for
+ * splitting next time we need to refill.
+ */
+ ret = refill_pages();
+ WARN_ON(ret);
}
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 16/30] mm: kpkeys: Defer early call to set_memory_pkey()
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (14 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 15/30] mm: kpkeys: Handle splitting of linear map Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 17/30] mm: kpkeys: Add shrinker for block pgtable allocator Kevin Brodsky
` (13 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
The kpkeys_hardened_pgtables feature requires all page table pages
to be mapped with a non-default pkey. When the linear map
uses large block mappings, setting the pkey for an arbitrary range
may require splitting an existing block.
The kpkeys page table allocator attempts to reduce such splitting,
but it cannot avoid it altogether. This is problematic during early
boot on some systems (arm64 with BBML2-noabort), because the linear
map may not be split until feature detection has completed on all
CPUs. This occurs after the buddy allocator becomes available, and
pagetable_alloc() is called multiple times by that point.
To address this, defer the first call to set_memory_pkey()
(triggered by the refill in pba_init()) until a point where it is
safe to do so. A late initialisation function is introduced to that
effect.
Only one such early region may be registered; further refills in
that early window will trigger a warning and leave the memory
unprotected. The underlying assumption is that there are relatively
few calls to pagetable_alloc() before
kpkeys_hardened_pgtables_init_late() is called. This seems to be the
case at least on arm64; the main user is vmalloc() while allocating
per-CPU IRQ stacks, and even with the largest possible NR_CPUS this
would not require allocating more than 16 PTE pages.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
This patch is rather unpleasant (especially the arbitrary limit of pages
that can be deferred), but it seems difficult to avoid on arm64 as we
must wait to know whether all CPUs support BBML2-noabort before relying
on it to split blocks.
The case where the boot CPU supports BBML2-noabort but some other
doesn't is not explicitly supported. In that case, the linear map will
end up being PTE-mapped, but we will still use the block allocator for
page tables. This may be suboptimal, but it remains functionally
correct.
---
include/linux/kpkeys.h | 8 +++++
mm/kpkeys_hardened_pgtables.c | 58 +++++++++++++++++++++++++++++++++--
2 files changed, 64 insertions(+), 2 deletions(-)
diff --git a/include/linux/kpkeys.h b/include/linux/kpkeys.h
index 983f55655dde..8cfeb6e5af56 100644
--- a/include/linux/kpkeys.h
+++ b/include/linux/kpkeys.h
@@ -133,6 +133,12 @@ bool kpkeys_ready_for_direct_map_split(void);
*/
void kpkeys_hardened_pgtables_init(void);
+/*
+ * Should be called by architecture code as soon as it is safe to modify the
+ * pkey of arbitrary linear map ranges.
+ */
+void kpkeys_hardened_pgtables_init_late(void);
+
#else /* CONFIG_KPKEYS_HARDENED_PGTABLES */
static inline bool kpkeys_hardened_pgtables_enabled(void)
@@ -159,6 +165,8 @@ static inline void kpkeys_pgtable_free(struct page *page) {}
static inline void kpkeys_hardened_pgtables_init(void) {}
+static inline void kpkeys_hardened_pgtables_init_late(void) {}
+
#endif /* CONFIG_KPKEYS_HARDENED_PGTABLES */
#endif /* _LINUX_KPKEYS_H */
diff --git a/mm/kpkeys_hardened_pgtables.c b/mm/kpkeys_hardened_pgtables.c
index 5b1231e1422a..223a0bb02df0 100644
--- a/mm/kpkeys_hardened_pgtables.c
+++ b/mm/kpkeys_hardened_pgtables.c
@@ -39,6 +39,7 @@ static void pba_pgtable_free(struct page *page);
static int pba_prepare_direct_map_split(void);
static bool pba_ready_for_direct_map_split(void);
static void pba_init(void);
+static void pba_init_late(void);
/* Trivial allocator in case the linear map is PTE-mapped (no block mapping) */
static struct page *noblock_pgtable_alloc(gfp_t gfp)
@@ -107,6 +108,15 @@ void __init kpkeys_hardened_pgtables_init(void)
static_branch_enable(&kpkeys_hardened_pgtables_key);
}
+void __init kpkeys_hardened_pgtables_init_late(void)
+{
+ if (!arch_kpkeys_enabled())
+ return;
+
+ if (pba_enabled())
+ pba_init_late();
+}
+
/*
* pkeys block allocator (PBA): dedicated page table allocator for block-mapped
* linear map. Block splitting is minimised by prioritising the allocation and
@@ -174,7 +184,13 @@ static struct pkeys_block_allocator pkeys_block_allocator = {
.alloc_mutex = __MUTEX_INITIALIZER(pkeys_block_allocator.alloc_mutex)
};
+static struct {
+ struct page *head_page;
+ unsigned int order;
+} pba_early_region __initdata;
+
static __ro_after_init DEFINE_STATIC_KEY_FALSE(pba_enabled_key);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(pba_can_set_pkey);
static bool pba_enabled(void)
{
@@ -188,6 +204,28 @@ static bool alloc_mutex_locked(void)
return mutex_get_owner(&pba->alloc_mutex) == (unsigned long)current;
}
+/*
+ * __ref is used as this is called from __refill_pages() which is not __init.
+ * The call to pba_init_late() guarantees this is not called after boot has
+ * completed.
+ */
+static void __ref register_early_region(struct page *head_page,
+ unsigned int order)
+{
+ /*
+ * Only one region is expected to be registered. Any further region
+ * is left untracked (i.e. unprotected).
+ */
+ if (WARN_ON(pba_early_region.head_page))
+ return;
+
+ pr_debug("%s: order=%d, pfn=%lx\n", __func__, order,
+ page_to_pfn(head_page));
+
+ pba_early_region.head_page = head_page;
+ pba_early_region.order = order;
+}
+
static void cached_list_add_pages(struct page *page, unsigned int nr_pages)
{
struct pkeys_block_allocator *pba = &pkeys_block_allocator;
@@ -227,7 +265,7 @@ static struct page *__refill_pages(bool alloc_one)
struct pkeys_block_allocator *pba = &pkeys_block_allocator;
struct page *page;
unsigned int order;
- int ret;
+ int ret = 0;
for (int i = 0; i < ARRAY_SIZE(refill_orders); ++i) {
order = refill_orders[i];
@@ -243,7 +281,10 @@ static struct page *__refill_pages(bool alloc_one)
guard(mutex)(&pba->alloc_mutex);
- ret = set_pkey_pgtable(page, 1 << order);
+ if (static_branch_likely(&pba_can_set_pkey))
+ ret = set_pkey_pgtable(page, 1 << order);
+ else
+ register_early_region(page, order);
if (ret) {
__free_pages(page, order);
@@ -406,7 +447,20 @@ static void __init pba_init(void)
/*
* Refill the cache so that the reserve pages are available for
* splitting next time we need to refill.
+ *
+ * We cannot split the linear map at this stage, so the allocated
+ * region will be registered as early region (pba_early_region) and
+ * its pkey set later.
*/
ret = refill_pages();
WARN_ON(ret);
}
+
+static void __init pba_init_late(void)
+{
+ static_branch_enable(&pba_can_set_pkey);
+
+ if (pba_early_region.head_page)
+ set_pkey_pgtable(pba_early_region.head_page,
+ 1 << pba_early_region.order);
+}
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 17/30] mm: kpkeys: Add shrinker for block pgtable allocator
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (15 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 16/30] mm: kpkeys: Defer early call to set_memory_pkey() Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 18/30] mm: kpkeys: Introduce early page table allocator Kevin Brodsky
` (12 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
The newly introduced kpkeys block allocator does not return freed
page table pages (PTPs) to the buddy allocator, but instead caches
them, so that:
1. Future allocation requests can be quickly serviced, without
calling alloc_pages() or set_memory_pkey().
2. Blocks are not needlessly split, since releasing a single page to
the buddy allocator requires resetting the pkey for just that
page, splitting the PMD into PTEs.
We cannot however let this cache grow indefinitely. This patch
introduces a shrinker that allows reclaiming those cached pages.
Like the rest of the allocator, the primary objective is to minimise
the splitting of blocks. Each shrinker pass (call to
pba_shrink_scan()) attempts to release all free pages within a given
block, or cached pages that do not lie inside a block managed by the
allocator.
In order to choose which block to shrink, we need to know how many
free pages a block contains. The approach taken here is to store
that value in the head page of each managed block and update it
whenever a page is allocated or freed. Other pages in the block are
also marked with a flag. This simplifies the shrinker pass, and the
associated overhead should be minimal.
We then scan all cached pages and find the "emptiest" block, i.e.
containing the most free pages. If a block is completely empty, then
we release it right away as that can be done without splitting.
Otherwise, we pay the price of splitting the block if we consider it
has enough free pages (and there are not enough non-block free
pages).
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
Much of patch is up for debate, with various thresholds that would
deserve to be tuned. Tracking blocks seems like a good idea to reduce
fragmentation, but it's unclear how much that helps in a real-life
scenario. Feedback welcome!
---
mm/kpkeys_hardened_pgtables.c | 287 ++++++++++++++++++++++++++++++++++
1 file changed, 287 insertions(+)
diff --git a/mm/kpkeys_hardened_pgtables.c b/mm/kpkeys_hardened_pgtables.c
index 223a0bb02df0..dcc5e6da7c85 100644
--- a/mm/kpkeys_hardened_pgtables.c
+++ b/mm/kpkeys_hardened_pgtables.c
@@ -143,6 +143,7 @@ void __init kpkeys_hardened_pgtables_init_late(void)
#define PBA_NR_RESERVED_PAGES 4
#define BLOCK_ORDER PMD_ORDER
+#define BLOCK_NR_PAGES (1ul << (BLOCK_ORDER))
/*
* Refilling the cache is done by attempting allocation in decreasing orders
@@ -226,6 +227,68 @@ static void __ref register_early_region(struct page *head_page,
pba_early_region.order = order;
}
+/*
+ * Private per-page allocator data. It needs to be preserved when a page table
+ * page is allocated, so we cannot use page->private, which overlaps with
+ * struct ptdesc::ptl. page->mapping is unused in struct ptdesc so we store it
+ * there instead.
+ */
+struct pba_page_data {
+ bool in_block;
+ u32 block_nr_free; /* Only used for the head page of a block */
+};
+
+static struct pba_page_data *page_pba_data(struct page *page)
+{
+ BUILD_BUG_ON(sizeof(struct pba_page_data) > sizeof(page->mapping));
+
+ return (struct pba_page_data *)&page->mapping;
+}
+
+static void mark_block_cached(struct page *head_page, struct page *cached_pages,
+ unsigned int nr_cached_pages)
+{
+ page_pba_data(head_page)->in_block = true;
+ page_pba_data(head_page)->block_nr_free = nr_cached_pages;
+
+ for (unsigned int i = 0; i < nr_cached_pages; i++)
+ page_pba_data(&cached_pages[i])->in_block = true;
+}
+
+static void mark_block_noncached(struct page *head_page)
+{
+ for (unsigned int i = 0; i < BLOCK_NR_PAGES; i++)
+ head_page[i].mapping = NULL;
+}
+
+static struct page *block_head_page(struct page *page)
+{
+ unsigned long page_pfn;
+
+ if (!page_pba_data(page)->in_block)
+ return NULL;
+
+ page_pfn = page_to_pfn(page);
+
+ return pfn_to_page(ALIGN_DOWN(page_pfn, BLOCK_NR_PAGES));
+}
+
+static void inc_block_nr_free(struct page *page)
+{
+ struct page *head_page = block_head_page(page);
+
+ if (head_page)
+ page_pba_data(head_page)->block_nr_free++;
+}
+
+static void dec_block_nr_free(struct page *page)
+{
+ struct page *head_page = block_head_page(page);
+
+ if (head_page)
+ page_pba_data(head_page)->block_nr_free--;
+}
+
static void cached_list_add_pages(struct page *page, unsigned int nr_pages)
{
struct pkeys_block_allocator *pba = &pkeys_block_allocator;
@@ -248,6 +311,7 @@ static void __refill_pages_add_to_cache(struct page *page, unsigned int order,
bool alloc_one)
{
struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+ struct page *head_page = page;
unsigned int nr_pages = 1 << order;
if (alloc_one) {
@@ -255,6 +319,9 @@ static void __refill_pages_add_to_cache(struct page *page, unsigned int order,
nr_pages--;
}
+ if (order == BLOCK_ORDER)
+ mark_block_cached(head_page, page, nr_pages);
+
guard(spinlock_bh)(&pba->lock);
cached_list_add_pages(page, nr_pages);
@@ -309,6 +376,56 @@ static struct page *refill_pages_and_alloc_one(void)
return __refill_pages(true);
}
+static unsigned long release_page_list(struct list_head *page_list)
+{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+ unsigned long nr_freed = 0;
+ struct page *page, *tmp;
+
+ /* _safe is required because __free_page() overwrites page->lru */
+ list_for_each_entry_safe(page, tmp, page_list, lru) {
+ int ret = 0;
+
+ ret = set_pkey_default(page, 1);
+
+ if (ret) {
+ guard(spinlock_bh)(&pba->lock);
+ cached_list_add_pages(page, 1);
+ break;
+ }
+
+ __free_page(page);
+ nr_freed++;
+ }
+
+ return nr_freed;
+}
+
+static unsigned long release_whole_block(struct list_head *page_list,
+ struct page *block_head)
+{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+ unsigned long nr_freed = 0;
+ struct page *page, *tmp;
+ int ret;
+
+ /* Reset the pkey for the full block to avoid splitting the linear map */
+ ret = set_pkey_default(block_head, BLOCK_NR_PAGES);
+
+ if (ret) {
+ guard(spinlock_bh)(&pba->lock);
+ cached_list_add_pages(block_head, BLOCK_NR_PAGES);
+ return 0;
+ }
+
+ list_for_each_entry_safe(page, tmp, page_list, lru) {
+ __free_page(page);
+ nr_freed++;
+ }
+
+ return nr_freed;
+}
+
static bool cached_page_available(gfp_t gfp)
{
struct pkeys_block_allocator *pba = &pkeys_block_allocator;
@@ -337,6 +454,7 @@ static struct page *get_cached_page(gfp_t gfp)
return NULL;
cached_list_del_page(page);
+ dec_block_nr_free(page);
return page;
}
@@ -409,6 +527,7 @@ static void pba_pgtable_free(struct page *page)
guard(spinlock_bh)(&pba->lock);
cached_list_add_pages(page, 1);
+ inc_block_nr_free(page);
}
static int pba_prepare_direct_map_split(void)
@@ -464,3 +583,171 @@ static void __init pba_init_late(void)
set_pkey_pgtable(pba_early_region.head_page,
1 << pba_early_region.order);
}
+
+/* Shrinker */
+
+/* Keep some pages around to avoid shrinking causing a refill right away */
+#define PBA_UNSHRINKABLE_PAGES 16
+/* Don't shrink a block that is almost full to avoid excessive splitting */
+#define PBA_SHRINK_BLOCK_MIN_PAGES (BLOCK_NR_PAGES / 8)
+
+static unsigned long count_shrinkable_pages(void)
+{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+ unsigned long nr_cached = READ_ONCE(pba->nr_cached);
+
+ return nr_cached > PBA_UNSHRINKABLE_PAGES ?
+ nr_cached - PBA_UNSHRINKABLE_PAGES : 0;
+}
+
+static unsigned long pba_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+
+ return count_shrinkable_pages() ?: SHRINK_EMPTY;
+}
+
+static bool block_worth_shrinking(unsigned long nr_pages_target_block,
+ unsigned long nr_pages_nonblock,
+ struct shrink_control *sc)
+{
+ /*
+ * Avoid partially shrinking a block (which means splitting it) if
+ * we can reclaim enough/more non-block pages instead, or if we would
+ * reclaim only few pages (below PBA_SHRINK_BLOCK_MIN_PAGES)
+ */
+ return nr_pages_nonblock < nr_pages_target_block &&
+ nr_pages_nonblock < sc->nr_to_scan &&
+ nr_pages_target_block >= PBA_SHRINK_BLOCK_MIN_PAGES;
+}
+
+static unsigned long pba_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct pkeys_block_allocator *pba = &pkeys_block_allocator;
+ LIST_HEAD(pages_to_free);
+ struct page *page, *tmp;
+ unsigned long nr_pages_nonblock = 0, nr_pages_target_block = 0;
+ unsigned long nr_pages_uncached = 0, nr_freed = 0;
+ unsigned long nr_pages_shrinkable;
+ struct page *target_block = NULL;
+
+ sc->nr_scanned = 0;
+
+ pr_debug("%s: nr_to_scan = %lu, nr_cached = %lu\n",
+ __func__, sc->nr_to_scan, pba->nr_cached);
+
+ spin_lock_bh(&pba->lock);
+ nr_pages_shrinkable = count_shrinkable_pages();
+
+ /*
+ * Count pages that don't belong to any block, and find the block
+ * with the highest number of free pages
+ */
+ list_for_each_entry(page, &pba->cached_list, lru) {
+ struct page *block = block_head_page(page);
+ unsigned long block_nr_free;
+
+ if (!block) {
+ nr_pages_nonblock++;
+ continue;
+ }
+
+ block_nr_free = page_pba_data(block)->block_nr_free;
+
+ if (block_nr_free > nr_pages_target_block) {
+ target_block = block;
+ nr_pages_target_block = block_nr_free;
+ }
+
+ /* We will free this block, so no need to continue scanning */
+ if (nr_pages_target_block == BLOCK_NR_PAGES)
+ break;
+ }
+
+ if (nr_pages_target_block == BLOCK_NR_PAGES) {
+ /*
+ * If a whole block is empty, take the opportunity to free it
+ * completely (regardless of the requested nr_to_scan) to avoid
+ * splitting the linear map. If nr_pages_shrinkable is too low,
+ * we bail out as we would have to split the block to shrink it
+ * partially (and there is nothing else we can shrink).
+ */
+ if (nr_pages_shrinkable < BLOCK_NR_PAGES) {
+ spin_unlock_bh(&pba->lock);
+ pr_debug("%s: cannot free empty block, bailing out\n",
+ __func__);
+ goto out;
+ }
+
+ sc->nr_to_scan = BLOCK_NR_PAGES;
+ } else if (block_worth_shrinking(nr_pages_target_block,
+ nr_pages_nonblock, sc)) {
+ /* Shrink block (partially) */
+ sc->nr_to_scan = min(sc->nr_to_scan, nr_pages_target_block);
+ } else {
+ /* Free non-block pages */
+ sc->nr_to_scan = min(sc->nr_to_scan, nr_pages_nonblock);
+ target_block = NULL;
+ }
+
+ list_for_each_entry_safe(page, tmp, &pba->cached_list, lru) {
+ struct page *block = block_head_page(page);
+
+ if (!(nr_pages_uncached < sc->nr_to_scan &&
+ nr_pages_uncached < nr_pages_shrinkable))
+ break;
+
+ if (block == target_block) {
+ list_move(&page->lru, &pages_to_free);
+ nr_pages_uncached++;
+ }
+ }
+
+ pba->nr_cached -= nr_pages_uncached;
+ sc->nr_scanned = nr_pages_uncached;
+
+ if (target_block)
+ mark_block_noncached(target_block);
+ spin_unlock_bh(&pba->lock);
+
+ if (target_block)
+ pr_debug("%s: freeing block (pfn = %lx, %lu/%lu free pages)\n",
+ __func__, page_to_pfn(target_block),
+ nr_pages_target_block, BLOCK_NR_PAGES);
+ else
+ pr_debug("%s: freeing non-block (%lu free pages)\n",
+ __func__, nr_pages_nonblock);
+
+ if (nr_pages_target_block == BLOCK_NR_PAGES) {
+ VM_WARN_ON(nr_pages_uncached != BLOCK_NR_PAGES);
+ nr_freed = release_whole_block(&pages_to_free, target_block);
+ } else {
+ nr_freed = release_page_list(&pages_to_free);
+ }
+
+ pr_debug("%s: freed %lu pages, nr_cached = %lu\n", __func__,
+ nr_freed, pba->nr_cached);
+out:
+ return nr_freed ?: SHRINK_STOP;
+}
+
+static int __init pba_init_shrinker(void)
+{
+ struct shrinker *shrinker;
+
+ if (!pba_enabled())
+ return 0;
+
+ shrinker = shrinker_alloc(0, "kpkeys-pgtable-block");
+ if (!shrinker)
+ return -ENOMEM;
+
+ shrinker->count_objects = pba_shrink_count;
+ shrinker->scan_objects = pba_shrink_scan;
+ shrinker->seeks = 0;
+ shrinker->batch = BLOCK_NR_PAGES;
+ shrinker_register(shrinker);
+ return 0;
+}
+late_initcall(pba_init_shrinker);
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 18/30] mm: kpkeys: Introduce early page table allocator
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (16 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 17/30] mm: kpkeys: Add shrinker for block pgtable allocator Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 19/30] mm: kpkeys: Introduce hook for protecting static page tables Kevin Brodsky
` (11 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
The kpkeys_hardened_pgtables feature aims to protect all page table
pages (PTPs) by mapping them with a privileged pkey. This is primarily
handled by kpkeys_pgtable_alloc(), called from pagetable_alloc().
However, this does not cover PTPs allocated early, before the
buddy allocator is available. These PTPs are allocated by architecture
code, either 1. from static pools or 2. using the memblock allocator,
and should also be protected.
This patch addresses the second category: PTPs allocated via memblock.
Such PTPs are notably used to create the linear map. Protecting them as
soon as they are allocated would require modifying the linear map while
it is being created, which seems at best difficult. Instead, a simple
allocator is introduced, refilling a cache with memblock and keeping
track of all allocated ranges to set their pkey once it is safe to do
so. PTPs allocated at that stage are not freed, so there is no need
to manage a free list.
The refill size/alignment is the same as for the pkeys block allocator.
For systems that use large block mappings, the same rationale applies
(reducing fragmentation of the linear map). This is also used for other
systems, as this reduces the number of calls to memblock, without much
downside.
The number of PTPs required to create the linear map is proportional to
the amount of available memory, which means it may be large. At that
stage the memblock allocator may however only track a limited number of
regions, and we size our own tracking array (full_ranges) accordingly.
The array may be quite large as a result (16KB on arm64), but it is
discarded once boot has completed (__initdata).
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
The full_ranges array will end up mostly empty on most systems, but
relying on INIT_MEMBLOCK_MEMORY_REGIONS seemed to be the only way to
guarantee that we can track all ranges regardless of the size and layout
of physical memory.
An alternative would be to rebuild the ranges by walking the kernel page
tables in init_late(), but that's arguably at least as complex
(requiring stop_machine()).
---
include/linux/kpkeys.h | 7 ++
mm/kpkeys_hardened_pgtables.c | 165 ++++++++++++++++++++++++++++++++++
2 files changed, 172 insertions(+)
diff --git a/include/linux/kpkeys.h b/include/linux/kpkeys.h
index 8cfeb6e5af56..73b456ecec65 100644
--- a/include/linux/kpkeys.h
+++ b/include/linux/kpkeys.h
@@ -139,6 +139,8 @@ void kpkeys_hardened_pgtables_init(void);
*/
void kpkeys_hardened_pgtables_init_late(void);
+phys_addr_t kpkeys_physmem_pgtable_alloc(void);
+
#else /* CONFIG_KPKEYS_HARDENED_PGTABLES */
static inline bool kpkeys_hardened_pgtables_enabled(void)
@@ -167,6 +169,11 @@ static inline void kpkeys_hardened_pgtables_init(void) {}
static inline void kpkeys_hardened_pgtables_init_late(void) {}
+static inline phys_addr_t kpkeys_physmem_pgtable_alloc(void)
+{
+ return 0;
+}
+
#endif /* CONFIG_KPKEYS_HARDENED_PGTABLES */
#endif /* _LINUX_KPKEYS_H */
diff --git a/mm/kpkeys_hardened_pgtables.c b/mm/kpkeys_hardened_pgtables.c
index dcc5e6da7c85..1b649812f474 100644
--- a/mm/kpkeys_hardened_pgtables.c
+++ b/mm/kpkeys_hardened_pgtables.c
@@ -3,6 +3,7 @@
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/kpkeys.h>
+#include <linux/memblock.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
@@ -41,6 +42,9 @@ static bool pba_ready_for_direct_map_split(void);
static void pba_init(void);
static void pba_init_late(void);
+/* pkeys physmem allocator (PPA) - implemented below */
+static void ppa_finalize(void);
+
/* Trivial allocator in case the linear map is PTE-mapped (no block mapping) */
static struct page *noblock_pgtable_alloc(gfp_t gfp)
{
@@ -113,8 +117,14 @@ void __init kpkeys_hardened_pgtables_init_late(void)
if (!arch_kpkeys_enabled())
return;
+ /*
+ * Called first to avoid relying on pba_early_region for splitting
+ * the linear map in the subsequent calls.
+ */
if (pba_enabled())
pba_init_late();
+
+ ppa_finalize();
}
/*
@@ -751,3 +761,158 @@ static int __init pba_init_shrinker(void)
return 0;
}
late_initcall(pba_init_shrinker);
+
+/*
+ * pkeys physmem allocator (PPA): block-based allocator for very early page
+ * tables (especially for creating the linear map), based on memblock. Blocks
+ * are tracked so that their pkey can be set once it is safe to do so.
+ */
+
+/*
+ * We may have to track many ranges when allocating page tables for the linear
+ * map, as their number grows with the amount of available memory. Assuming that
+ * memblock returns contiguous blocks whenever possible, the number of ranges
+ * to track cannot however exceed the number of regions that memblock itself
+ * tracks. memblock_allow_resize() hasn't been called yet at that point, so
+ * that limit is the size of the statically allocated array.
+ */
+#define PHYSMEM_MAX_RANGES INIT_MEMBLOCK_MEMORY_REGIONS
+
+/*
+ * We allocate ranges with the same size and alignment as the maximum refill
+ * size for the regular block allocator, with the same rationale (minimising
+ * splitting and optimising TLB usage).
+ */
+#define PHYSMEM_REFILL_SIZE (PAGE_SIZE << refill_orders[0])
+
+struct physmem_range {
+ phys_addr_t addr;
+ phys_addr_t size;
+};
+
+struct pkeys_physmem_allocator {
+ struct physmem_range free_range;
+
+ struct physmem_range full_ranges[PHYSMEM_MAX_RANGES];
+ unsigned int nr_full_ranges;
+};
+
+static struct pkeys_physmem_allocator pkeys_physmem_allocator __initdata;
+
+static int __init set_pkey_pgtable_phys(phys_addr_t pa, phys_addr_t size)
+{
+ unsigned long addr = (unsigned long)__va(pa);
+ int ret;
+
+ ret = set_memory_pkey(addr, size / PAGE_SIZE, KPKEYS_PKEY_PGTABLES);
+ pr_debug("%s: addr=%pa, size=%pa\n", __func__, &addr, &size);
+
+ WARN_ON(ret);
+ return ret;
+}
+
+static bool __init ppa_try_extend_last_range(phys_addr_t addr, phys_addr_t size)
+{
+ struct pkeys_physmem_allocator *ppa = &pkeys_physmem_allocator;
+ struct physmem_range *range;
+
+ if (!ppa->nr_full_ranges)
+ return false;
+
+ range = &ppa->full_ranges[ppa->nr_full_ranges - 1];
+
+ /* Merge the new range into the last range if they are contiguous */
+ if (addr == range->addr + range->size) {
+ range->size += size;
+ return true;
+ } else if (addr + size == range->addr) {
+ range->addr -= size;
+ range->size += size;
+ return true;
+ }
+
+ return false;
+}
+
+static void __init ppa_register_full_range(phys_addr_t addr)
+{
+ struct pkeys_physmem_allocator *ppa = &pkeys_physmem_allocator;
+ struct physmem_range *range;
+
+ if (!addr)
+ return;
+
+ if (ppa_try_extend_last_range(addr, PHYSMEM_REFILL_SIZE))
+ return;
+
+ /* Could not extend the last range, create a new one */
+ if (WARN_ON(ppa->nr_full_ranges >= PHYSMEM_MAX_RANGES))
+ return;
+
+ range = &ppa->full_ranges[ppa->nr_full_ranges++];
+ range->addr = addr;
+ range->size = PHYSMEM_REFILL_SIZE;
+}
+
+static void __init ppa_refill(void)
+{
+ struct pkeys_physmem_allocator *ppa = &pkeys_physmem_allocator;
+ phys_addr_t size = PHYSMEM_REFILL_SIZE;
+ phys_addr_t addr;
+
+ /*
+ * There should be plenty of contiguous physical memory available this
+ * early during boot, so there should be no need for fallback sizes.
+ */
+ addr = memblock_phys_alloc_range(size, size, 0,
+ MEMBLOCK_ALLOC_NOLEAKTRACE);
+ WARN_ON(!addr);
+
+ pr_debug("%s: addr=%pa\n", __func__, &addr);
+
+ ppa->free_range.addr = addr;
+ ppa->free_range.size = (addr ? size : 0);
+}
+
+static void __init ppa_finalize(void)
+{
+ struct pkeys_physmem_allocator *ppa = &pkeys_physmem_allocator;
+
+ if (ppa->free_range.addr) {
+ struct physmem_range *free_range = &ppa->free_range;
+
+ /* Protect the range that was allocated, and free the rest */
+ set_pkey_pgtable_phys(free_range->addr + free_range->size,
+ PHYSMEM_REFILL_SIZE - free_range->size);
+
+ if (free_range->size)
+ memblock_free_late(free_range->addr, free_range->size);
+
+ free_range->addr = 0;
+ free_range->size = 0;
+ }
+
+ for (unsigned int i = 0; i < ppa->nr_full_ranges; i++) {
+ struct physmem_range *range = &ppa->full_ranges[i];
+
+ set_pkey_pgtable_phys(range->addr, range->size);
+ }
+}
+
+phys_addr_t __init kpkeys_physmem_pgtable_alloc(void)
+{
+ struct pkeys_physmem_allocator *ppa = &pkeys_physmem_allocator;
+
+ if (!ppa->free_range.size) {
+ ppa_register_full_range(ppa->free_range.addr);
+ ppa_refill();
+ }
+
+ if (!ppa->free_range.addr)
+ /* Refilling failed - allocate untracked memory */
+ return memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
+ MEMBLOCK_ALLOC_NOLEAKTRACE);
+
+ ppa->free_range.size -= PAGE_SIZE;
+ return ppa->free_range.addr + ppa->free_range.size;
+}
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 19/30] mm: kpkeys: Introduce hook for protecting static page tables
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (17 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 18/30] mm: kpkeys: Introduce early page table allocator Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 20/30] arm64: cpufeature: Add helper to directly probe CPU for POE support Kevin Brodsky
` (10 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
The kpkeys_hardened_pgtables infrastructure introduced so far allows
compatible architectures to protect all page table pages (PTPs)
allocated at runtime (first via memblock, then the buddy allocator).
Some PTPs are however required even earlier, before any allocator is
available. This is typically needed for mapping the kernel image
itself.
These PTPs are at least as sensitive as those allocated later on,
and should be protected by mapping them with the privileged pkey.
Exactly how these pages are obtained is entirely arch-specific, so
we introduce a hook to let architectures that implement
kpkeys_hardened_pgtables do the right thing.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
include/linux/kpkeys.h | 4 ++++
mm/kpkeys_hardened_pgtables.c | 1 +
2 files changed, 5 insertions(+)
diff --git a/include/linux/kpkeys.h b/include/linux/kpkeys.h
index 73b456ecec65..cf2f7735ce03 100644
--- a/include/linux/kpkeys.h
+++ b/include/linux/kpkeys.h
@@ -141,6 +141,10 @@ void kpkeys_hardened_pgtables_init_late(void);
phys_addr_t kpkeys_physmem_pgtable_alloc(void);
+#ifndef arch_kpkeys_protect_static_pgtables
+static inline void arch_kpkeys_protect_static_pgtables(void) {}
+#endif
+
#else /* CONFIG_KPKEYS_HARDENED_PGTABLES */
static inline bool kpkeys_hardened_pgtables_enabled(void)
diff --git a/mm/kpkeys_hardened_pgtables.c b/mm/kpkeys_hardened_pgtables.c
index 1b649812f474..cc1dc44335c3 100644
--- a/mm/kpkeys_hardened_pgtables.c
+++ b/mm/kpkeys_hardened_pgtables.c
@@ -125,6 +125,7 @@ void __init kpkeys_hardened_pgtables_init_late(void)
pba_init_late();
ppa_finalize();
+ arch_kpkeys_protect_static_pgtables();
}
/*
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 20/30] arm64: cpufeature: Add helper to directly probe CPU for POE support
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (18 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 19/30] mm: kpkeys: Introduce hook for protecting static page tables Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 21/30] arm64: set_memory: Implement arch_has_pte_only_direct_map() Kevin Brodsky
` (9 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
In order to support the kpkeys_hardened_pgtables feature, we will
need to decide how to allocate early page tables, before boot CPU
features have been detected. To that end, add a new helper to check
that the boot CPU supports POE (meaning POE will eventually be
enabled).
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/include/asm/cpufeature.h | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 4de51f8d92cb..8722e9e62702 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -1078,6 +1078,18 @@ static inline bool cpu_has_lpa2(void)
#endif
}
+static inline bool cpu_has_poe(void)
+{
+ u64 mmfr3;
+
+ if (!IS_ENABLED(CONFIG_ARM64_POE))
+ return false;
+
+ mmfr3 = read_sysreg_s(SYS_ID_AA64MMFR3_EL1);
+ return cpuid_feature_extract_unsigned_field(mmfr3,
+ ID_AA64MMFR3_EL1_S1POE_SHIFT);
+}
+
#endif /* __ASSEMBLER__ */
#endif
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 21/30] arm64: set_memory: Implement arch_has_pte_only_direct_map()
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (19 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 20/30] arm64: cpufeature: Add helper to directly probe CPU for POE support Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 22/30] arm64: kpkeys: Support KPKEYS_LVL_PGTABLES Kevin Brodsky
` (8 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
Implement the new arch helper, which is equivalent to
force_pte_mapping().
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/include/asm/set_memory.h | 3 +++
arch/arm64/mm/mmu.c | 5 +++++
2 files changed, 8 insertions(+)
diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h
index b6cd6de34abf..2bf6e0b917e6 100644
--- a/arch/arm64/include/asm/set_memory.h
+++ b/arch/arm64/include/asm/set_memory.h
@@ -9,6 +9,9 @@
bool can_set_direct_map(void);
#define can_set_direct_map can_set_direct_map
+bool arch_has_pte_only_direct_map(void);
+#define arch_has_pte_only_direct_map arch_has_pte_only_direct_map
+
int set_memory_valid(unsigned long addr, int numpages, int enable);
int set_direct_map_invalid_noflush(struct page *page);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 09ca62f77a84..a8e982ac5079 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1181,6 +1181,11 @@ static void __init map_mem(pgd_t *pgdp)
arm64_kfence_map_pool(early_kfence_pool, pgdp);
}
+bool arch_has_pte_only_direct_map(void)
+{
+ return force_pte_mapping();
+}
+
void mark_rodata_ro(void)
{
unsigned long section_size;
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 22/30] arm64: kpkeys: Support KPKEYS_LVL_PGTABLES
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (20 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 21/30] arm64: set_memory: Implement arch_has_pte_only_direct_map() Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 23/30] arm64: kpkeys: Ensure the linear map can be modified Kevin Brodsky
` (7 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
Enable RW access to KPKEYS_PKEY_PGTABLES (used to map page table
pages) if switching to KPKEYS_LVL_PGTABLES, otherwise only grant RO
access.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/include/asm/kpkeys.h | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/kpkeys.h b/arch/arm64/include/asm/kpkeys.h
index 79ae33388088..64d6e22740ec 100644
--- a/arch/arm64/include/asm/kpkeys.h
+++ b/arch/arm64/include/asm/kpkeys.h
@@ -12,7 +12,8 @@
* Equivalent to por_set_kpkeys_level(0, KPKEYS_LVL_DEFAULT), but can also be
* used in assembly.
*/
-#define POR_EL1_INIT POR_ELx_PERM_PREP(KPKEYS_PKEY_DEFAULT, POE_RWX)
+#define POR_EL1_INIT (POR_ELx_PERM_PREP(KPKEYS_PKEY_DEFAULT, POE_RWX) | \
+ POR_ELx_PERM_PREP(KPKEYS_PKEY_PGTABLES, POE_R))
#ifndef __ASSEMBLY__
@@ -26,6 +27,8 @@ static inline bool arch_kpkeys_enabled(void)
static inline u64 por_set_kpkeys_level(u64 por, int level)
{
por = por_elx_set_pkey_perms(por, KPKEYS_PKEY_DEFAULT, POE_RWX);
+ por = por_elx_set_pkey_perms(por, KPKEYS_PKEY_PGTABLES,
+ level == KPKEYS_LVL_PGTABLES ? POE_RW : POE_R);
return por;
}
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 23/30] arm64: kpkeys: Ensure the linear map can be modified
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (21 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 22/30] arm64: kpkeys: Support KPKEYS_LVL_PGTABLES Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 20:28 ` kernel test robot
2026-02-27 17:55 ` [PATCH v6 24/30] arm64: kpkeys: Handle splitting of linear map Kevin Brodsky
` (6 subsequent siblings)
29 siblings, 1 reply; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
When the kpkeys_hardened_pgtables feature is enabled, we need to be
able to modify attributes (specifically the pkey/POIndex) in the
linear map at page granularity.
Add the appropriate check to can_set_direct_map() and
force_pte_mapping(), on the same principle as rodata_full and other
features.
These functions can be called very early, before POE is actually
detected. Introduce a helper that returns whether the hardening
feature is/will be enabled, by checking whether POE is supported
by the CPU if it hasn't been detected yet.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/include/asm/kpkeys.h | 18 ++++++++++++++++++
arch/arm64/mm/mmu.c | 3 ++-
arch/arm64/mm/pageattr.c | 3 ++-
3 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/include/asm/kpkeys.h b/arch/arm64/include/asm/kpkeys.h
index 64d6e22740ec..eeebbdfe239a 100644
--- a/arch/arm64/include/asm/kpkeys.h
+++ b/arch/arm64/include/asm/kpkeys.h
@@ -57,6 +57,24 @@ static __always_inline void arch_kpkeys_restore_pkey_reg(u64 pkey_reg)
#endif /* CONFIG_ARM64_POE */
+#ifdef CONFIG_KPKEYS_HARDENED_PGTABLES
+
+static inline bool arm64_supports_kpkeys_hardened_pgtables(void)
+{
+ /* POE is a boot feature */
+ return boot_capabilities_finalized() ?
+ system_supports_poe() : cpu_has_poe();
+}
+
+#else /* CONFIG_KPKEYS_HARDENED_PGTABLES */
+
+static inline bool arm64_supports_kpkeys_hardened_pgtables(void)
+{
+ return false;
+}
+
+#endif /* CONFIG_KPKEYS_HARDENED_PGTABLES */
+
#endif /* __ASSEMBLY__ */
#endif /* __ASM_KPKEYS_H */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index a8e982ac5079..ea1cb1875257 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -764,7 +764,8 @@ static inline bool force_pte_mapping(void)
return true;
if (bbml2)
return false;
- return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world();
+ return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world() ||
+ arm64_supports_kpkeys_hardened_pgtables();
}
static DEFINE_MUTEX(pgtable_split_lock);
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index d2a7e104a5c2..05e57387c0b5 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -96,7 +96,8 @@ bool can_set_direct_map(void)
* Realms need to make pages shared/protected at page granularity.
*/
return rodata_full || debug_pagealloc_enabled() ||
- arm64_kfence_can_set_direct_map() || is_realm_world();
+ arm64_kfence_can_set_direct_map() || is_realm_world() ||
+ arm64_supports_kpkeys_hardened_pgtables();
}
static int update_range_prot(unsigned long start, unsigned long size,
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* Re: [PATCH v6 23/30] arm64: kpkeys: Ensure the linear map can be modified
2026-02-27 17:55 ` [PATCH v6 23/30] arm64: kpkeys: Ensure the linear map can be modified Kevin Brodsky
@ 2026-02-27 20:28 ` kernel test robot
0 siblings, 0 replies; 32+ messages in thread
From: kernel test robot @ 2026-02-27 20:28 UTC (permalink / raw)
To: Kevin Brodsky, linux-hardening
Cc: oe-kbuild-all, linux-kernel, Kevin Brodsky, Andrew Morton,
Linux Memory Management List, Andy Lutomirski, Catalin Marinas,
Dave Hansen, David Hildenbrand, Ira Weiny, Jann Horn, Jeff Xu,
Joey Gouly, Kees Cook, Linus Walleij, Lorenzo Stoakes,
Marc Zyngier, Mark Brown, Matthew Wilcox, Maxwell Bland,
Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi
Hi Kevin,
kernel test robot noticed the following build errors:
[auto build test ERROR on 6de23f81a5e08be8fbf5e8d7e9febc72a5b5f27f]
url: https://github.com/intel-lab-lkp/linux/commits/Kevin-Brodsky/mm-Introduce-kpkeys/20260228-020115
base: 6de23f81a5e08be8fbf5e8d7e9febc72a5b5f27f
patch link: https://lore.kernel.org/r/20260227175518.3728055-24-kevin.brodsky%40arm.com
patch subject: [PATCH v6 23/30] arm64: kpkeys: Ensure the linear map can be modified
config: arm64-allnoconfig (https://download.01.org/0day-ci/archive/20260228/202602280415.zn69IEgu-lkp@intel.com/config)
compiler: aarch64-linux-gcc (GCC) 15.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260228/202602280415.zn69IEgu-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202602280415.zn69IEgu-lkp@intel.com/
All errors (new ones prefixed by >>):
arch/arm64/mm/mmu.c: In function 'force_pte_mapping':
>> arch/arm64/mm/mmu.c:768:17: error: implicit declaration of function 'arm64_supports_kpkeys_hardened_pgtables' [-Wimplicit-function-declaration]
768 | arm64_supports_kpkeys_hardened_pgtables();
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--
arch/arm64/mm/pageattr.c: In function 'can_set_direct_map':
>> arch/arm64/mm/pageattr.c:100:17: error: implicit declaration of function 'arm64_supports_kpkeys_hardened_pgtables' [-Wimplicit-function-declaration]
100 | arm64_supports_kpkeys_hardened_pgtables();
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
vim +/arm64_supports_kpkeys_hardened_pgtables +768 arch/arm64/mm/mmu.c
757
758 static inline bool force_pte_mapping(void)
759 {
760 const bool bbml2 = system_capabilities_finalized() ?
761 system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
762
763 if (debug_pagealloc_enabled())
764 return true;
765 if (bbml2)
766 return false;
767 return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world() ||
> 768 arm64_supports_kpkeys_hardened_pgtables();
769 }
770
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 32+ messages in thread
* [PATCH v6 24/30] arm64: kpkeys: Handle splitting of linear map
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (22 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 23/30] arm64: kpkeys: Ensure the linear map can be modified Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 25/30] arm64: kpkeys: Protect early page tables Kevin Brodsky
` (5 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
When the kpkeys_hardened_pgtables feature is enabled, special care
is required when allocating page table pages while splitting the
linear map.
Indicate that such pages are being allocated by passing
__GFP_PGTABLE_SPLIT and use the appropriate interface to prepare the
kpkeys_hardened_pgtables allocator in
split_kernel_leaf_mapping().
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/mm/mmu.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ea1cb1875257..2cee0b7f8a56 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -707,7 +707,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
if (!pud_present(pud))
goto out;
if (pud_leaf(pud)) {
- ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
+ ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL | __GFP_PGTABLE_SPLIT, true);
if (ret)
goto out;
}
@@ -732,7 +732,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
*/
if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
goto out;
- ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
+ ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL | __GFP_PGTABLE_SPLIT, true);
if (ret)
goto out;
}
@@ -800,7 +800,18 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end))
return -EINVAL;
+kpkeys_retry:
+ ret = kpkeys_prepare_direct_map_split();
+ if (ret)
+ return ret;
+
mutex_lock(&pgtable_split_lock);
+
+ if (!kpkeys_ready_for_direct_map_split()) {
+ mutex_unlock(&pgtable_split_lock);
+ goto kpkeys_retry;
+ }
+
lazy_mmu_mode_enable();
/*
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 25/30] arm64: kpkeys: Protect early page tables
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (23 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 24/30] arm64: kpkeys: Handle splitting of linear map Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 26/30] arm64: kpkeys: Protect init_pg_dir Kevin Brodsky
` (4 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
Use the dedicated kpkeys allocator for early page tables (used to create
the linear map) when the kpkeys_hardened_pgtables feature is enabled.
CPU features have not been detected at this stage so we use the
early helper arm64_supports_kpkeys_hardened_pgtables(). This is not
a concern as kpkeys_physmem_pgtable_alloc() does not itself use POE
or set_memory_pkey().
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/mm/mmu.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 2cee0b7f8a56..7072d5ac0579 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -116,8 +116,11 @@ static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type)
{
phys_addr_t phys;
- phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
- MEMBLOCK_ALLOC_NOLEAKTRACE);
+ if (arm64_supports_kpkeys_hardened_pgtables())
+ phys = kpkeys_physmem_pgtable_alloc();
+ else
+ phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
+ MEMBLOCK_ALLOC_NOLEAKTRACE);
if (!phys)
panic("Failed to allocate page table page\n");
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 26/30] arm64: kpkeys: Protect init_pg_dir
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (24 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 25/30] arm64: kpkeys: Protect early page tables Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 27/30] arm64: kpkeys: Guard page table writes Kevin Brodsky
` (3 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
When kpkeys_hardened_pgtables is enabled, protect the page tables
that map the kernel image by setting the appropriate pkey for the
linear mapping of those pages.
Most other static page tables (e.g. swapper_pg_dir) should be
read-only both in the kernel image mapping and the linear mapping,
so there is no need to change their pkey.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
This patch may not be comprehensive - there are multiple static pools
used for various page directories.
---
arch/arm64/include/asm/kpkeys.h | 3 +++
arch/arm64/mm/mmu.c | 13 +++++++++++++
2 files changed, 16 insertions(+)
diff --git a/arch/arm64/include/asm/kpkeys.h b/arch/arm64/include/asm/kpkeys.h
index eeebbdfe239a..2d8bb1e25e3b 100644
--- a/arch/arm64/include/asm/kpkeys.h
+++ b/arch/arm64/include/asm/kpkeys.h
@@ -66,6 +66,9 @@ static inline bool arm64_supports_kpkeys_hardened_pgtables(void)
system_supports_poe() : cpu_has_poe();
}
+#define arch_kpkeys_protect_static_pgtables arch_kpkeys_protect_static_pgtables
+void arch_kpkeys_protect_static_pgtables(void);
+
#else /* CONFIG_KPKEYS_HARDENED_PGTABLES */
static inline bool arm64_supports_kpkeys_hardened_pgtables(void)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 7072d5ac0579..1e2cf0166c1d 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1048,6 +1048,19 @@ void __init mark_linear_text_alias_ro(void)
PAGE_KERNEL_RO);
}
+#ifdef CONFIG_KPKEYS_HARDENED_PGTABLES
+void __init arch_kpkeys_protect_static_pgtables(void)
+{
+ extern char __pi_init_pg_dir[], __pi_init_pg_end[];
+ unsigned long addr = (unsigned long)lm_alias(__pi_init_pg_dir);
+ unsigned long size = __pi_init_pg_end - __pi_init_pg_dir;
+ int ret;
+
+ ret = set_memory_pkey(addr, size / PAGE_SIZE, KPKEYS_PKEY_PGTABLES);
+ WARN_ON(ret);
+}
+#endif /* CONFIG_KPKEYS_HARDENED_PGTABLES */
+
#ifdef CONFIG_KFENCE
bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 27/30] arm64: kpkeys: Guard page table writes
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (25 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 26/30] arm64: kpkeys: Protect init_pg_dir Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 28/30] arm64: kpkeys: Batch KPKEYS_LVL_PGTABLES switches Kevin Brodsky
` (2 subsequent siblings)
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
When CONFIG_KPKEYS_HARDENED_PGTABLES is enabled, page tables (both
user and kernel) are mapped with a privileged pkey in the linear
mapping. As a result, they can only be written at an elevated kpkeys
level.
Introduce a kpkeys guard that sets POR_EL1 appropriately to allow
writing to page tables, and use this guard wherever necessary. The
scope is kept as small as possible, so that POR_EL1 is quickly reset
to its default value. Where atomics are involved, the guard's scope
encompasses the whole loop to avoid switching POR_EL1 unnecessarily.
This patch is a no-op if CONFIG_KPKEYS_HARDENED_PGTABLES is disabled
(default).
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/include/asm/pgtable.h | 22 +++++++++++++++++++++-
arch/arm64/mm/fault.c | 2 ++
2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b3e58735c49b..8c85e23223da 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -39,6 +39,14 @@
#include <linux/mm_types.h>
#include <linux/sched.h>
#include <linux/page_table_check.h>
+#include <linux/kpkeys.h>
+
+#ifdef CONFIG_KPKEYS_HARDENED_PGTABLES
+KPKEYS_GUARD_COND(kpkeys_hardened_pgtables, KPKEYS_LVL_PGTABLES,
+ kpkeys_hardened_pgtables_enabled())
+#else
+KPKEYS_GUARD_NOOP(kpkeys_hardened_pgtables)
+#endif
static inline void emit_pte_barriers(void)
{
@@ -363,6 +371,7 @@ static inline pte_t pte_clear_uffd_wp(pte_t pte)
static inline void __set_pte_nosync(pte_t *ptep, pte_t pte)
{
+ guard(kpkeys_hardened_pgtables)();
WRITE_ONCE(*ptep, pte);
}
@@ -830,6 +839,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
}
#endif /* __PAGETABLE_PMD_FOLDED */
+ guard(kpkeys_hardened_pgtables)();
WRITE_ONCE(*pmdp, pmd);
if (pmd_valid(pmd))
@@ -890,6 +900,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
return;
}
+ guard(kpkeys_hardened_pgtables)();
WRITE_ONCE(*pudp, pud);
if (pud_valid(pud))
@@ -971,6 +982,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
return;
}
+ guard(kpkeys_hardened_pgtables)();
WRITE_ONCE(*p4dp, p4d);
queue_pte_barriers();
}
@@ -1099,6 +1111,7 @@ static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
return;
}
+ guard(kpkeys_hardened_pgtables)();
WRITE_ONCE(*pgdp, pgd);
queue_pte_barriers();
}
@@ -1295,6 +1308,7 @@ static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma,
{
pte_t old_pte, pte;
+ guard(kpkeys_hardened_pgtables)();
pte = __ptep_get(ptep);
do {
old_pte = pte;
@@ -1343,7 +1357,10 @@ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm,
pte_t *ptep,
unsigned long pgsize)
{
- pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0));
+ pte_t pte;
+
+ scoped_guard(kpkeys_hardened_pgtables)
+ pte = __pte(xchg_relaxed(&pte_val(*ptep), 0));
switch (pgsize) {
case PAGE_SIZE:
@@ -1416,6 +1433,7 @@ static inline void ___ptep_set_wrprotect(struct mm_struct *mm,
{
pte_t old_pte;
+ guard(kpkeys_hardened_pgtables)();
do {
old_pte = pte;
pte = pte_wrprotect(pte);
@@ -1449,6 +1467,7 @@ static inline void __clear_young_dirty_pte(struct vm_area_struct *vma,
{
pte_t old_pte;
+ guard(kpkeys_hardened_pgtables)();
do {
old_pte = pte;
@@ -1496,6 +1515,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
+ guard(kpkeys_hardened_pgtables)();
return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd)));
}
#endif
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index be9dab2c7d6a..7e230499f7fe 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -214,6 +214,8 @@ int __ptep_set_access_flags(struct vm_area_struct *vma,
if (pte_same(pte, entry))
return 0;
+ guard(kpkeys_hardened_pgtables)();
+
/* only preserve the access flags and write permission */
pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 28/30] arm64: kpkeys: Batch KPKEYS_LVL_PGTABLES switches
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (26 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 27/30] arm64: kpkeys: Guard page table writes Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 29/30] arm64: kpkeys: Enable kpkeys_hardened_pgtables support Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 30/30] mm: Add basic tests for kpkeys_hardened_pgtables Kevin Brodsky
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
The kpkeys_hardened_pgtables feature currently switches the kpkeys
level in every helper that writes to page tables, such as set_pte(). With
kpkeys implemented using POE, this entails a pair of ISBs whenever
such a helper is called.
A simple way to reduce this overhead is to make use of the lazy MMU
mode. We amend the kpkeys_hardened_pgtables guard so that no level
switch (i.e. POR_EL1 update) is issued while the lazy MMU mode is
active. Instead, we switch to KPKEYS_LVL_PGTABLES when entering the
lazy MMU mode, and restore the previous level when exiting it.
Restoring the previous kpkeys level requires storing the original
value of POR_EL1 somewhere. This is a full 64-bit value so we cannot
simply use a TIF flag. There is no straightforward way to reuse
current->thread.por_el1 for that purpose - this is where the current
value of POR_EL1 is stored on a context switch, i.e. the value
corresponding to KPKEYS_LVL_PGTABLES inside a lazy_mmu section.
Instead, we add a new member to thread_struct to hold that value
temporarily. This isn't optimal as that member is unused outside of
lazy MMU sections, but it is the simplest option. Nesting of
sections is not a concern as arch_{enter,leave}_lazy_mmu_mode() are
not called in inner sections (nor do we need to do anything there).
A further optimisation this patch makes is to merge the ISBs when
exiting lazy_mmu mode. That is, if an ISB is going to be issued by
emit_pte_barriers() because kernel pgtables were modified in the
lazy MMU section, we skip the ISB after restoring POR_EL1. This is
done by checking TIF_LAZY_MMU_PENDING and ensuring that POR_EL1 is
restored before emit_pte_barriers() is called.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/include/asm/pgtable.h | 50 +++++++++++++++++++++++++++---
arch/arm64/include/asm/processor.h | 1 +
2 files changed, 47 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 8c85e23223da..556de0a4537e 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -43,10 +43,44 @@
#ifdef CONFIG_KPKEYS_HARDENED_PGTABLES
KPKEYS_GUARD_COND(kpkeys_hardened_pgtables, KPKEYS_LVL_PGTABLES,
- kpkeys_hardened_pgtables_enabled())
-#else
+ kpkeys_hardened_pgtables_enabled() &&
+ !is_lazy_mmu_mode_active())
+
+static void kpkeys_lazy_mmu_enter(void)
+{
+ if (!kpkeys_hardened_pgtables_enabled())
+ return;
+
+ current->thread.por_el1_lazy_mmu = kpkeys_set_level(KPKEYS_LVL_PGTABLES);
+}
+
+static void kpkeys_lazy_mmu_exit(void)
+{
+ u64 saved_por_el1;
+
+ if (!kpkeys_hardened_pgtables_enabled())
+ return;
+
+ saved_por_el1 = current->thread.por_el1_lazy_mmu;
+
+ /*
+ * We skip any barrier if TIF_LAZY_MMU_PENDING is set:
+ * emit_pte_barriers() will issue an ISB just after this function
+ * returns.
+ */
+ if (test_thread_flag(TIF_LAZY_MMU_PENDING))
+ __kpkeys_set_pkey_reg_nosync(saved_por_el1);
+ else
+ arch_kpkeys_restore_pkey_reg(saved_por_el1);
+}
+#else /* CONFIG_KPKEYS_HARDENED_PGTABLES */
KPKEYS_GUARD_NOOP(kpkeys_hardened_pgtables)
-#endif
+
+static void kpkeys_lazy_mmu_enter(void) {}
+static void kpkeys_lazy_mmu_exit(void) {}
+#endif /* CONFIG_KPKEYS_HARDENED_PGTABLES */
+
+
static inline void emit_pte_barriers(void)
{
@@ -79,7 +113,10 @@ static inline void queue_pte_barriers(void)
}
}
-static inline void arch_enter_lazy_mmu_mode(void) {}
+static inline void arch_enter_lazy_mmu_mode(void)
+{
+ kpkeys_lazy_mmu_enter();
+}
static inline void arch_flush_lazy_mmu_mode(void)
{
@@ -89,6 +126,11 @@ static inline void arch_flush_lazy_mmu_mode(void)
static inline void arch_leave_lazy_mmu_mode(void)
{
+ /*
+ * The ordering should be preserved to allow kpkeys_lazy_mmu_exit()
+ * to skip any barrier when TIF_LAZY_MMU_PENDING is set.
+ */
+ kpkeys_lazy_mmu_exit();
arch_flush_lazy_mmu_mode();
}
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 6095322343fc..c3a86ddce637 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -193,6 +193,7 @@ struct thread_struct {
u64 tpidr2_el0;
u64 por_el0;
u64 por_el1;
+ u64 por_el1_lazy_mmu;
#ifdef CONFIG_ARM64_GCS
unsigned int gcs_el0_mode;
unsigned int gcs_el0_locked;
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 29/30] arm64: kpkeys: Enable kpkeys_hardened_pgtables support
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (27 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 28/30] arm64: kpkeys: Batch KPKEYS_LVL_PGTABLES switches Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
2026-02-27 17:55 ` [PATCH v6 30/30] mm: Add basic tests for kpkeys_hardened_pgtables Kevin Brodsky
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
The kpkeys_hardened_pgtables feature needs to be initialised in two
stages:
1. As soon as the buddy allocator becomes available. The canonical
place to handle this is mem_init().
2. As soon as the linear map can be split. With BBML2-noabort, this
requires CPU detection to be completed on all CPUs.
The earliest point is therefore after setup_system_features() has
been called.
With that done, all the bits are in place and we can advertise
support for kpkeys_hardened_pgtables by selecting
ARCH_HAS_KPKEYS_HARDENED_PGTABLES if ARM64_POE is enabled.
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/smp.c | 2 ++
arch/arm64/mm/mmu.c | 5 +++++
3 files changed, 8 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 732d4dbbab20..2faf082cc1d0 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2165,6 +2165,7 @@ config ARM64_POE
select ARCH_USES_HIGH_VMA_FLAGS
select ARCH_HAS_PKEYS
select ARCH_HAS_KPKEYS
+ select ARCH_HAS_KPKEYS_HARDENED_PGTABLES
help
The Permission Overlay Extension is used to implement Memory
Protection Keys. Memory Protection Keys provides a mechanism for
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 1aa324104afb..b2efff7a82ef 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -35,6 +35,7 @@
#include <linux/kgdb.h>
#include <linux/kvm_host.h>
#include <linux/nmi.h>
+#include <linux/kpkeys.h>
#include <asm/alternative.h>
#include <asm/atomic.h>
@@ -441,6 +442,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
hyp_mode_check();
setup_system_features();
setup_user_features();
+ kpkeys_hardened_pgtables_init_late();
mark_linear_text_alias_ro();
}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 1e2cf0166c1d..1a96c186c4a3 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -2273,3 +2273,8 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long i
return 0;
}
#endif
+
+void __init mem_init(void)
+{
+ kpkeys_hardened_pgtables_init();
+}
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread* [PATCH v6 30/30] mm: Add basic tests for kpkeys_hardened_pgtables
2026-02-27 17:54 [PATCH v6 00/30] pkeys-based page table hardening Kevin Brodsky
` (28 preceding siblings ...)
2026-02-27 17:55 ` [PATCH v6 29/30] arm64: kpkeys: Enable kpkeys_hardened_pgtables support Kevin Brodsky
@ 2026-02-27 17:55 ` Kevin Brodsky
29 siblings, 0 replies; 32+ messages in thread
From: Kevin Brodsky @ 2026-02-27 17:55 UTC (permalink / raw)
To: linux-hardening
Cc: linux-kernel, Kevin Brodsky, Andrew Morton, Andy Lutomirski,
Catalin Marinas, Dave Hansen, David Hildenbrand, Ira Weiny,
Jann Horn, Jeff Xu, Joey Gouly, Kees Cook, Linus Walleij,
Lorenzo Stoakes, Marc Zyngier, Mark Brown, Matthew Wilcox,
Maxwell Bland, Mike Rapoport (IBM),
Peter Zijlstra, Pierre Langlois, Quentin Perret, Rick Edgecombe,
Ryan Roberts, Thomas Gleixner, Vlastimil Babka, Will Deacon,
Yang Shi, Yeoreum Yun, linux-arm-kernel, linux-mm, x86
Add basic tests for the kpkeys_hardened_pgtables feature: try to
perform direct writes to kernel and user page table entries and
ensure they fail.
Multiple cases are considered for kernel page tables, as early page
tables are allocated and/or protected in a different way.
The tests are builtin (cannot be built as a module) because they
refer to multiple symbols that are not exported (e.g.
copy_to_kernel_nofault()).
Signed-off-by: Kevin Brodsky <kevin.brodsky@arm.com>
---
mm/Makefile | 1 +
mm/tests/kpkeys_hardened_pgtables_kunit.c | 202 ++++++++++++++++++++++
security/Kconfig.hardening | 12 ++
3 files changed, 215 insertions(+)
create mode 100644 mm/tests/kpkeys_hardened_pgtables_kunit.c
diff --git a/mm/Makefile b/mm/Makefile
index 7603e6051afa..9ebdbaa696b2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -151,3 +151,4 @@ obj-$(CONFIG_EXECMEM) += execmem.o
obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
obj-$(CONFIG_LAZY_MMU_MODE_KUNIT_TEST) += tests/lazy_mmu_mode_kunit.o
obj-$(CONFIG_KPKEYS_HARDENED_PGTABLES) += kpkeys_hardened_pgtables.o
+obj-$(CONFIG_KPKEYS_HARDENED_PGTABLES_KUNIT_TEST) += tests/kpkeys_hardened_pgtables_kunit.o
diff --git a/mm/tests/kpkeys_hardened_pgtables_kunit.c b/mm/tests/kpkeys_hardened_pgtables_kunit.c
new file mode 100644
index 000000000000..fa11fcd7abde
--- /dev/null
+++ b/mm/tests/kpkeys_hardened_pgtables_kunit.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+#include <linux/mman.h>
+#include <linux/pgtable.h>
+#include <linux/set_memory.h>
+#include <linux/vmalloc.h>
+
+#ifdef CONFIG_ARM64
+#include <asm/stacktrace.h>
+#endif
+
+static void free_page_wrapper(void *ctx)
+{
+ __free_page((struct page *)ctx);
+}
+
+KUNIT_DEFINE_ACTION_WRAPPER(vfree_wrapper, vfree, const void *);
+
+static pud_t *pud_off_k(unsigned long va)
+{
+ return pud_offset(p4d_offset(pgd_offset_k(va), va), va);
+}
+
+static pte_t *get_kernel_pte(unsigned long addr)
+{
+ pmd_t *pmdp = pmd_off_k(addr);
+
+ if (!pmdp || pmd_leaf(*pmdp))
+ return NULL;
+
+ return pte_offset_kernel(pmdp, addr);
+}
+
+#define write_pgtable(type, ptr) do { \
+ type##_t val; \
+ int ret; \
+ \
+ pr_debug("%s: writing to "#type" at %px\n", __func__, (ptr)); \
+ \
+ val = type##p_get(ptr); \
+ ret = copy_to_kernel_nofault(ptr, &val, sizeof(val)); \
+ KUNIT_EXPECT_EQ_MSG(test, ret, -EFAULT, \
+ "Direct "#type" write wasn't prevented"); \
+} while (0)
+
+/*
+ * Try to write linear map page tables, at every level. This is worthwhile
+ * because those page table pages are obtained from different allocators:
+ *
+ * - Static memory (part of the kernel image) for PGD
+ * - memblock for PUD and possibly PMD/PTE
+ * - pagetable_alloc() (buddy allocator) for PMD/PTE if large block mappings are
+ * used and the linear map is split after being created
+ */
+static void write_direct_map_pgtables(struct kunit *test)
+{
+ struct page *page;
+ unsigned long addr;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ int ret;
+
+ if (!arch_kpkeys_enabled())
+ kunit_skip(test, "kpkeys are not supported");
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+ ret = kunit_add_action_or_reset(test, free_page_wrapper, page);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ /* Ensure page is PTE-mapped (splitting the linear map if necessary) */
+ ret = set_direct_map_invalid_noflush(page);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ ret = set_direct_map_default_noflush(page);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ addr = (unsigned long)page_address(page);
+
+ pgdp = pgd_offset_k(addr);
+ KUNIT_ASSERT_NOT_NULL_MSG(test, pgdp, "Failed to get PGD");
+ /*
+ * swapper_pg_dir is still writable at this stage, so don't check it.
+ * It is not protected by kpkeys_hardened_pgtables because it should be
+ * made read-only by mark_rodata_ro(). However since these
+ * KUnit tests are builtin, they are run before mark_rodata_ro() is
+ * called.
+ */
+
+ p4dp = p4d_offset(pgdp, addr);
+ KUNIT_ASSERT_NOT_NULL_MSG(test, p4dp, "Failed to get P4D");
+ /* Not checked; same rationale as PGD in case P4D is folded */
+
+ pudp = pud_offset(p4dp, addr);
+ KUNIT_ASSERT_NOT_NULL_MSG(test, pudp, "Failed to get PUD");
+ write_pgtable(pud, pudp);
+
+ pmdp = pmd_offset(pudp, addr);
+ KUNIT_ASSERT_NOT_NULL_MSG(test, pmdp, "Failed to get PMD");
+ write_pgtable(pmd, pmdp);
+
+ ptep = pte_offset_kernel(pmdp, addr);
+ KUNIT_ASSERT_NOT_NULL_MSG(test, ptep, "Failed to get PTE");
+ write_pgtable(pte, ptep);
+}
+
+/* Worth checking since the kernel image is mapped with static page tables */
+static void write_kernel_image_pud(struct kunit *test)
+{
+ pud_t *pudp;
+
+ if (!arch_kpkeys_enabled())
+ kunit_skip(test, "kpkeys are not supported");
+
+ /* The kernel is probably block-mapped, check the PUD to be safe */
+ pudp = pud_off_k((unsigned long)&init_mm);
+ KUNIT_ASSERT_NOT_NULL_MSG(test, pudp, "Failed to get PUD");
+
+ write_pgtable(pud, pudp);
+}
+
+static void write_kernel_vmalloc_pte(struct kunit *test)
+{
+ void *mem;
+ pte_t *ptep;
+ int ret;
+
+ if (!arch_kpkeys_enabled())
+ kunit_skip(test, "kpkeys are not supported");
+
+ mem = vmalloc(PAGE_SIZE);
+ KUNIT_ASSERT_NOT_NULL(test, mem);
+ ret = kunit_add_action_or_reset(test, vfree_wrapper, mem);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ /* vmalloc() without VM_ALLOW_HUGE_VMAP is PTE-mapped */
+ ptep = get_kernel_pte((unsigned long)mem);
+ KUNIT_ASSERT_NOT_NULL_MSG(test, ptep, "Failed to get PTE");
+
+ write_pgtable(pte, ptep);
+}
+
+#ifdef CONFIG_ARM64
+static void write_early_kernel_vmap_pte(struct kunit *test)
+{
+ pte_t *ptep;
+
+ if (!arch_kpkeys_enabled())
+ kunit_skip(test, "kpkeys are not supported");
+
+ /*
+ * When block mappings are used, the IRQ stacks are allocated before
+ * set_memory_pkey() is available - the pkey is set later by
+ * kpkeys_hardened_pgtables_init_late()
+ */
+ ptep = get_kernel_pte((unsigned long)raw_cpu_read(irq_stack_ptr));
+ KUNIT_ASSERT_NOT_NULL_MSG(test, ptep, "Failed to get PTE");
+
+ write_pgtable(pte, ptep);
+}
+#endif
+
+static void write_user_pmd(struct kunit *test)
+{
+ pmd_t *pmdp;
+ unsigned long uaddr;
+
+ if (!arch_kpkeys_enabled())
+ kunit_skip(test, "kpkeys are not supported");
+
+ uaddr = kunit_vm_mmap(test, NULL, 0, PAGE_SIZE, PROT_READ,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, 0);
+ KUNIT_ASSERT_NE_MSG(test, uaddr, 0, "Could not create userspace mm");
+
+ /* We passed MAP_POPULATE so a PMD should already be allocated */
+ pmdp = pmd_off(current->mm, uaddr);
+ KUNIT_ASSERT_NOT_NULL_MSG(test, pmdp, "Failed to get PMD");
+
+ write_pgtable(pmd, pmdp);
+}
+
+static struct kunit_case kpkeys_hardened_pgtables_test_cases[] = {
+ KUNIT_CASE(write_direct_map_pgtables),
+ KUNIT_CASE(write_kernel_image_pud),
+ KUNIT_CASE(write_kernel_vmalloc_pte),
+#ifdef CONFIG_ARM64
+ KUNIT_CASE(write_early_kernel_vmap_pte),
+#endif
+ KUNIT_CASE(write_user_pmd),
+ {}
+};
+
+static struct kunit_suite kpkeys_hardened_pgtables_test_suite = {
+ .name = "kpkeys_hardened_pgtables",
+ .test_cases = kpkeys_hardened_pgtables_test_cases,
+};
+kunit_test_suite(kpkeys_hardened_pgtables_test_suite);
+
+MODULE_DESCRIPTION("Tests for the kpkeys_hardened_pgtables feature");
+MODULE_LICENSE("GPL");
diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index fdaf977d4626..48789f93e933 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -287,6 +287,18 @@ config KPKEYS_HARDENED_PGTABLES
This option has no effect if the system does not support
kernel pkeys.
+config KPKEYS_HARDENED_PGTABLES_KUNIT_TEST
+ bool "KUnit tests for kpkeys_hardened_pgtables" if !KUNIT_ALL_TESTS
+ depends on KPKEYS_HARDENED_PGTABLES
+ depends on KUNIT=y
+ default KUNIT_ALL_TESTS
+ help
+ Enable this option to check that the kpkeys_hardened_pgtables feature
+ functions as intended, i.e. prevents arbitrary writes to user and
+ kernel page tables.
+
+ If unsure, say N.
+
endmenu
config CC_HAS_RANDSTRUCT
--
2.51.2
^ permalink raw reply [flat|nested] 32+ messages in thread