* [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
@ 2026-01-14 13:45 ` Kalyazin, Nikita
2026-01-15 10:54 ` Huacai Chen
` (3 more replies)
2026-01-14 13:45 ` [PATCH v9 02/13] mm/gup: drop secretmem optimization from gup_fast_folio_allowed Kalyazin, Nikita
` (11 subsequent siblings)
12 siblings, 4 replies; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:45 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Nikita Kalyazin <kalyazin@amazon.com>
These allow guest_memfd to remove its memory from the direct map.
Only implement them for architectures that have a direct map.
In folio_zap_direct_map(), flush TLB on architectures where
set_direct_map_valid_noflush() does not flush it internally.
The new helpers need to be accessible to KVM on architectures that
support guest_memfd (x86 and arm64). Since arm64 does not support
building KVM as a module, only export them on x86.
Direct map removal gives guest_memfd the same protection that
memfd_secret does, such as hardening against Spectre-like attacks
through in-kernel gadgets.
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
arch/arm64/include/asm/set_memory.h | 2 ++
arch/arm64/mm/pageattr.c | 12 ++++++++++++
arch/loongarch/include/asm/set_memory.h | 2 ++
arch/loongarch/mm/pageattr.c | 16 ++++++++++++++++
arch/riscv/include/asm/set_memory.h | 2 ++
arch/riscv/mm/pageattr.c | 16 ++++++++++++++++
arch/s390/include/asm/set_memory.h | 2 ++
arch/s390/mm/pageattr.c | 18 ++++++++++++++++++
arch/x86/include/asm/set_memory.h | 2 ++
arch/x86/mm/pat/set_memory.c | 20 ++++++++++++++++++++
include/linux/set_memory.h | 10 ++++++++++
11 files changed, 102 insertions(+)
diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h
index 90f61b17275e..d949f1deb701 100644
--- a/arch/arm64/include/asm/set_memory.h
+++ b/arch/arm64/include/asm/set_memory.h
@@ -14,6 +14,8 @@ int set_memory_valid(unsigned long addr, int numpages, int enable);
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
+int folio_zap_direct_map(struct folio *folio);
+int folio_restore_direct_map(struct folio *folio);
bool kernel_page_present(struct page *page);
int set_memory_encrypted(unsigned long addr, int numpages);
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index f0e784b963e6..a94eff324dda 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -357,6 +357,18 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
return set_memory_valid(addr, nr, valid);
}
+int folio_zap_direct_map(struct folio *folio)
+{
+ return set_direct_map_valid_noflush(folio_page(folio, 0),
+ folio_nr_pages(folio), false);
+}
+
+int folio_restore_direct_map(struct folio *folio)
+{
+ return set_direct_map_valid_noflush(folio_page(folio, 0),
+ folio_nr_pages(folio), true);
+}
+
#ifdef CONFIG_DEBUG_PAGEALLOC
/*
* This is - apart from the return value - doing the same
diff --git a/arch/loongarch/include/asm/set_memory.h b/arch/loongarch/include/asm/set_memory.h
index 55dfaefd02c8..9bc80ac420a9 100644
--- a/arch/loongarch/include/asm/set_memory.h
+++ b/arch/loongarch/include/asm/set_memory.h
@@ -18,5 +18,7 @@ bool kernel_page_present(struct page *page);
int set_direct_map_default_noflush(struct page *page);
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
+int folio_zap_direct_map(struct folio *folio);
+int folio_restore_direct_map(struct folio *folio);
#endif /* _ASM_LOONGARCH_SET_MEMORY_H */
diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c
index f5e910b68229..14bd322dd112 100644
--- a/arch/loongarch/mm/pageattr.c
+++ b/arch/loongarch/mm/pageattr.c
@@ -236,3 +236,19 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
return __set_memory(addr, 1, set, clear);
}
+
+int folio_zap_direct_map(struct folio *folio)
+{
+ int ret;
+
+ ret = set_direct_map_valid_noflush(folio_page(folio, 0),
+ folio_nr_pages(folio), false);
+
+ return ret;
+}
+
+int folio_restore_direct_map(struct folio *folio)
+{
+ return set_direct_map_valid_noflush(folio_page(folio, 0),
+ folio_nr_pages(folio), true);
+}
diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
index 87389e93325a..16557b70c830 100644
--- a/arch/riscv/include/asm/set_memory.h
+++ b/arch/riscv/include/asm/set_memory.h
@@ -43,6 +43,8 @@ static inline int set_kernel_memory(char *startp, char *endp,
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
+int folio_zap_direct_map(struct folio *folio);
+int folio_restore_direct_map(struct folio *folio);
bool kernel_page_present(struct page *page);
#endif /* __ASSEMBLER__ */
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
index 3f76db3d2769..2c218868114b 100644
--- a/arch/riscv/mm/pageattr.c
+++ b/arch/riscv/mm/pageattr.c
@@ -401,6 +401,22 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
return __set_memory((unsigned long)page_address(page), nr, set, clear);
}
+int folio_zap_direct_map(struct folio *folio)
+{
+ int ret;
+
+ ret = set_direct_map_valid_noflush(folio_page(folio, 0),
+ folio_nr_pages(folio), false);
+
+ return ret;
+}
+
+int folio_restore_direct_map(struct folio *folio)
+{
+ return set_direct_map_valid_noflush(folio_page(folio, 0),
+ folio_nr_pages(folio), true);
+}
+
#ifdef CONFIG_DEBUG_PAGEALLOC
static int debug_pagealloc_set_page(pte_t *pte, unsigned long addr, void *data)
{
diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h
index 94092f4ae764..fc73652e5715 100644
--- a/arch/s390/include/asm/set_memory.h
+++ b/arch/s390/include/asm/set_memory.h
@@ -63,6 +63,8 @@ __SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K)
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
+int folio_zap_direct_map(struct folio *folio);
+int folio_restore_direct_map(struct folio *folio);
bool kernel_page_present(struct page *page);
#endif
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index d3ce04a4b248..df4a487b484d 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -412,6 +412,24 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
return __set_memory((unsigned long)page_to_virt(page), nr, flags);
}
+int folio_zap_direct_map(struct folio *folio)
+{
+ unsigned long addr = (unsigned long)folio_address(folio);
+ int ret;
+
+ ret = set_direct_map_valid_noflush(folio_page(folio, 0),
+ folio_nr_pages(folio), false);
+ flush_tlb_kernel_range(addr, addr + folio_size(folio));
+
+ return ret;
+}
+
+int folio_restore_direct_map(struct folio *folio)
+{
+ return set_direct_map_valid_noflush(folio_page(folio, 0),
+ folio_nr_pages(folio), true);
+}
+
bool kernel_page_present(struct page *page)
{
unsigned long addr;
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 61f56cdaccb5..7208af609121 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -90,6 +90,8 @@ int set_pages_rw(struct page *page, int numpages);
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
+int folio_zap_direct_map(struct folio *folio);
+int folio_restore_direct_map(struct folio *folio);
bool kernel_page_present(struct page *page);
extern int kernel_set_to_readonly;
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 6c6eb486f7a6..3f0fc30eb320 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2656,6 +2656,26 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
return __set_pages_np(page, nr);
}
+int folio_zap_direct_map(struct folio *folio)
+{
+ unsigned long addr = (unsigned long)folio_address(folio);
+ int ret;
+
+ ret = set_direct_map_valid_noflush(folio_page(folio, 0),
+ folio_nr_pages(folio), false);
+ flush_tlb_kernel_range(addr, addr + folio_size(folio));
+
+ return ret;
+}
+EXPORT_SYMBOL_FOR_MODULES(folio_zap_direct_map, "kvm");
+
+int folio_restore_direct_map(struct folio *folio)
+{
+ return set_direct_map_valid_noflush(folio_page(folio, 0),
+ folio_nr_pages(folio), true);
+}
+EXPORT_SYMBOL_FOR_MODULES(folio_restore_direct_map, "kvm");
+
#ifdef CONFIG_DEBUG_PAGEALLOC
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index 3030d9245f5a..8d1c8a7f7d79 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -40,6 +40,16 @@ static inline int set_direct_map_valid_noflush(struct page *page,
return 0;
}
+static inline int folio_zap_direct_map(struct folio *folio)
+{
+ return 0;
+}
+
+static inline int folio_restore_direct_map(struct folio *folio)
+{
+ return 0;
+}
+
static inline bool kernel_page_present(struct page *page)
{
return true;
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread

* Re: [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers
2026-01-14 13:45 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Kalyazin, Nikita
@ 2026-01-15 10:54 ` Huacai Chen
2026-01-15 11:03 ` [PATCH v9 01/13] set_memory: add folio_{zap, restore}_direct_map helpers Nikita Kalyazin
2026-01-15 12:12 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Heiko Carstens
` (2 subsequent siblings)
3 siblings, 1 reply; 62+ messages in thread
From: Huacai Chen @ 2026-01-15 10:54 UTC (permalink / raw)
To: Kalyazin, Nikita
Cc: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch, pbonzini, corbet, maz,
oupton, joey.gouly, suzuki.poulose, yuzenghui, catalin.marinas,
will, seanjc, tglx, mingo, bp, dave.hansen, x86, hpa, luto,
peterz, willy, akpm, david, lorenzo.stoakes, Liam.Howlett,
vbabka, rppt, surenb, mhocko, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa, jgg, jhubbard, peterx, jannh, pfalcato, shuah,
riel, ryan.roberts, jgross, yu-cheng.yu, kas, coxu,
kevin.brodsky, ackerleytng, maobibo, prsampat, mlevitsk,
jmattson, jthoughton, agordeev, alex, aou, borntraeger, dev.jain,
gor, hca, Jonathan.Cameron, palmer, pjw, shijie, svens, thuth,
wyihan, yang, vannapurve, jackmanb, aneesh.kumar, patrick.roy,
Thomson, Jack, Itazuri, Takahiro, Manwaring, Derek, Cali, Marco
Hi, Nikita,
On Wed, Jan 14, 2026 at 9:45 PM Kalyazin, Nikita <kalyazin@amazon.co.uk> wrote:
>
> From: Nikita Kalyazin <kalyazin@amazon.com>
>
> These allow guest_memfd to remove its memory from the direct map.
> Only implement them for architectures that have direct map.
> In folio_zap_direct_map(), flush TLB on architectures where
> set_direct_map_valid_noflush() does not flush it internally.
>
> The new helpers need to be accessible to KVM on architectures that
> support guest_memfd (x86 and arm64). Since arm64 does not support
> building KVM as a module, only export them on x86.
>
> Direct map removal gives guest_memfd the same protection that
> memfd_secret does, such as hardening against Spectre-like attacks
> through in-kernel gadgets.
>
> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
> ---
> arch/arm64/include/asm/set_memory.h | 2 ++
> arch/arm64/mm/pageattr.c | 12 ++++++++++++
> arch/loongarch/include/asm/set_memory.h | 2 ++
> arch/loongarch/mm/pageattr.c | 16 ++++++++++++++++
> arch/riscv/include/asm/set_memory.h | 2 ++
> arch/riscv/mm/pageattr.c | 16 ++++++++++++++++
> arch/s390/include/asm/set_memory.h | 2 ++
> arch/s390/mm/pageattr.c | 18 ++++++++++++++++++
> arch/x86/include/asm/set_memory.h | 2 ++
> arch/x86/mm/pat/set_memory.c | 20 ++++++++++++++++++++
> include/linux/set_memory.h | 10 ++++++++++
> 11 files changed, 102 insertions(+)
>
> diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h
> index 90f61b17275e..d949f1deb701 100644
> --- a/arch/arm64/include/asm/set_memory.h
> +++ b/arch/arm64/include/asm/set_memory.h
> @@ -14,6 +14,8 @@ int set_memory_valid(unsigned long addr, int numpages, int enable);
> int set_direct_map_invalid_noflush(struct page *page);
> int set_direct_map_default_noflush(struct page *page);
> int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
> +int folio_zap_direct_map(struct folio *folio);
> +int folio_restore_direct_map(struct folio *folio);
> bool kernel_page_present(struct page *page);
>
> int set_memory_encrypted(unsigned long addr, int numpages);
> diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
> index f0e784b963e6..a94eff324dda 100644
> --- a/arch/arm64/mm/pageattr.c
> +++ b/arch/arm64/mm/pageattr.c
> @@ -357,6 +357,18 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
> return set_memory_valid(addr, nr, valid);
> }
>
> +int folio_zap_direct_map(struct folio *folio)
> +{
> + return set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), false);
> +}
> +
> +int folio_restore_direct_map(struct folio *folio)
> +{
> + return set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), true);
> +}
> +
> #ifdef CONFIG_DEBUG_PAGEALLOC
> /*
> * This is - apart from the return value - doing the same
> diff --git a/arch/loongarch/include/asm/set_memory.h b/arch/loongarch/include/asm/set_memory.h
> index 55dfaefd02c8..9bc80ac420a9 100644
> --- a/arch/loongarch/include/asm/set_memory.h
> +++ b/arch/loongarch/include/asm/set_memory.h
> @@ -18,5 +18,7 @@ bool kernel_page_present(struct page *page);
> int set_direct_map_default_noflush(struct page *page);
> int set_direct_map_invalid_noflush(struct page *page);
> int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
> +int folio_zap_direct_map(struct folio *folio);
> +int folio_restore_direct_map(struct folio *folio);
>
> #endif /* _ASM_LOONGARCH_SET_MEMORY_H */
> diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c
> index f5e910b68229..14bd322dd112 100644
> --- a/arch/loongarch/mm/pageattr.c
> +++ b/arch/loongarch/mm/pageattr.c
> @@ -236,3 +236,19 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
>
> return __set_memory(addr, 1, set, clear);
> }
> +
> +int folio_zap_direct_map(struct folio *folio)
> +{
> + int ret;
> +
> + ret = set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), false);
> +
> + return ret;
Why not use a single statement which is the same as the ARM64 version?
The RISCV version has the same problem.
Huacai
> +}
> +
> +int folio_restore_direct_map(struct folio *folio)
> +{
> + return set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), true);
> +}
> diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
> index 87389e93325a..16557b70c830 100644
> --- a/arch/riscv/include/asm/set_memory.h
> +++ b/arch/riscv/include/asm/set_memory.h
> @@ -43,6 +43,8 @@ static inline int set_kernel_memory(char *startp, char *endp,
> int set_direct_map_invalid_noflush(struct page *page);
> int set_direct_map_default_noflush(struct page *page);
> int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
> +int folio_zap_direct_map(struct folio *folio);
> +int folio_restore_direct_map(struct folio *folio);
> bool kernel_page_present(struct page *page);
>
> #endif /* __ASSEMBLER__ */
> diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
> index 3f76db3d2769..2c218868114b 100644
> --- a/arch/riscv/mm/pageattr.c
> +++ b/arch/riscv/mm/pageattr.c
> @@ -401,6 +401,22 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
> return __set_memory((unsigned long)page_address(page), nr, set, clear);
> }
>
> +int folio_zap_direct_map(struct folio *folio)
> +{
> + int ret;
> +
> + ret = set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), false);
> +
> + return ret;
> +}
> +
> +int folio_restore_direct_map(struct folio *folio)
> +{
> + return set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), true);
> +}
> +
> #ifdef CONFIG_DEBUG_PAGEALLOC
> static int debug_pagealloc_set_page(pte_t *pte, unsigned long addr, void *data)
> {
> diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h
> index 94092f4ae764..fc73652e5715 100644
> --- a/arch/s390/include/asm/set_memory.h
> +++ b/arch/s390/include/asm/set_memory.h
> @@ -63,6 +63,8 @@ __SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K)
> int set_direct_map_invalid_noflush(struct page *page);
> int set_direct_map_default_noflush(struct page *page);
> int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
> +int folio_zap_direct_map(struct folio *folio);
> +int folio_restore_direct_map(struct folio *folio);
> bool kernel_page_present(struct page *page);
>
> #endif
> diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
> index d3ce04a4b248..df4a487b484d 100644
> --- a/arch/s390/mm/pageattr.c
> +++ b/arch/s390/mm/pageattr.c
> @@ -412,6 +412,24 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
> return __set_memory((unsigned long)page_to_virt(page), nr, flags);
> }
>
> +int folio_zap_direct_map(struct folio *folio)
> +{
> + unsigned long addr = (unsigned long)folio_address(folio);
> + int ret;
> +
> + ret = set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), false);
> + flush_tlb_kernel_range(addr, addr + folio_size(folio));
> +
> + return ret;
> +}
> +
> +int folio_restore_direct_map(struct folio *folio)
> +{
> + return set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), true);
> +}
> +
> bool kernel_page_present(struct page *page)
> {
> unsigned long addr;
> diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
> index 61f56cdaccb5..7208af609121 100644
> --- a/arch/x86/include/asm/set_memory.h
> +++ b/arch/x86/include/asm/set_memory.h
> @@ -90,6 +90,8 @@ int set_pages_rw(struct page *page, int numpages);
> int set_direct_map_invalid_noflush(struct page *page);
> int set_direct_map_default_noflush(struct page *page);
> int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
> +int folio_zap_direct_map(struct folio *folio);
> +int folio_restore_direct_map(struct folio *folio);
> bool kernel_page_present(struct page *page);
>
> extern int kernel_set_to_readonly;
> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
> index 6c6eb486f7a6..3f0fc30eb320 100644
> --- a/arch/x86/mm/pat/set_memory.c
> +++ b/arch/x86/mm/pat/set_memory.c
> @@ -2656,6 +2656,26 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
> return __set_pages_np(page, nr);
> }
>
> +int folio_zap_direct_map(struct folio *folio)
> +{
> + unsigned long addr = (unsigned long)folio_address(folio);
> + int ret;
> +
> + ret = set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), false);
> + flush_tlb_kernel_range(addr, addr + folio_size(folio));
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_FOR_MODULES(folio_zap_direct_map, "kvm");
> +
> +int folio_restore_direct_map(struct folio *folio)
> +{
> + return set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), true);
> +}
> +EXPORT_SYMBOL_FOR_MODULES(folio_restore_direct_map, "kvm");
> +
> #ifdef CONFIG_DEBUG_PAGEALLOC
> void __kernel_map_pages(struct page *page, int numpages, int enable)
> {
> diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
> index 3030d9245f5a..8d1c8a7f7d79 100644
> --- a/include/linux/set_memory.h
> +++ b/include/linux/set_memory.h
> @@ -40,6 +40,16 @@ static inline int set_direct_map_valid_noflush(struct page *page,
> return 0;
> }
>
> +static inline int folio_zap_direct_map(struct folio *folio)
> +{
> + return 0;
> +}
> +
> +static inline int folio_restore_direct_map(struct folio *folio)
> +{
> + return 0;
> +}
> +
> static inline bool kernel_page_present(struct page *page)
> {
> return true;
> --
> 2.50.1
>
>
^ permalink raw reply [flat|nested] 62+ messages in thread

* Re: [PATCH v9 01/13] set_memory: add folio_{zap, restore}_direct_map helpers
2026-01-15 10:54 ` Huacai Chen
@ 2026-01-15 11:03 ` Nikita Kalyazin
0 siblings, 0 replies; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-15 11:03 UTC (permalink / raw)
To: Huacai Chen, Kalyazin, Nikita
Cc: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch, pbonzini, corbet, maz,
oupton, joey.gouly, suzuki.poulose, yuzenghui, catalin.marinas,
will, seanjc, tglx, mingo, bp, dave.hansen, x86, hpa, luto,
peterz, willy, akpm, david, lorenzo.stoakes, Liam.Howlett,
vbabka, rppt, surenb, mhocko, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa, jgg, jhubbard, peterx, jannh, pfalcato, shuah,
riel, ryan.roberts, jgross, yu-cheng.yu, kas, coxu,
kevin.brodsky, ackerleytng, maobibo, prsampat, mlevitsk,
jmattson, jthoughton, agordeev, alex, aou, borntraeger, dev.jain,
gor, hca, Jonathan.Cameron, palmer, pjw, shijie, svens, thuth,
wyihan, yang, vannapurve, jackmanb, aneesh.kumar, patrick.roy,
Thomson, Jack, Itazuri, Takahiro, Manwaring, Derek, Cali, Marco
On 15/01/2026 10:54, Huacai Chen wrote:
> Hi, Nikita,
Hi Huacai,
>
> On Wed, Jan 14, 2026 at 9:45 PM Kalyazin, Nikita <kalyazin@amazon.co.uk> wrote:
>>
>> From: Nikita Kalyazin <kalyazin@amazon.com>
>>
>> These allow guest_memfd to remove its memory from the direct map.
>> Only implement them for architectures that have direct map.
>> In folio_zap_direct_map(), flush TLB on architectures where
>> set_direct_map_valid_noflush() does not flush it internally.
>>
>> The new helpers need to be accessible to KVM on architectures that
>> support guest_memfd (x86 and arm64). Since arm64 does not support
>> building KVM as a module, only export them on x86.
>>
>> Direct map removal gives guest_memfd the same protection that
>> memfd_secret does, such as hardening against Spectre-like attacks
>> through in-kernel gadgets.
>>
>> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
>> ---
>> arch/arm64/include/asm/set_memory.h | 2 ++
>> arch/arm64/mm/pageattr.c | 12 ++++++++++++
>> arch/loongarch/include/asm/set_memory.h | 2 ++
>> arch/loongarch/mm/pageattr.c | 16 ++++++++++++++++
>> arch/riscv/include/asm/set_memory.h | 2 ++
>> arch/riscv/mm/pageattr.c | 16 ++++++++++++++++
>> arch/s390/include/asm/set_memory.h | 2 ++
>> arch/s390/mm/pageattr.c | 18 ++++++++++++++++++
>> arch/x86/include/asm/set_memory.h | 2 ++
>> arch/x86/mm/pat/set_memory.c | 20 ++++++++++++++++++++
>> include/linux/set_memory.h | 10 ++++++++++
>> 11 files changed, 102 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h
>> index 90f61b17275e..d949f1deb701 100644
>> --- a/arch/arm64/include/asm/set_memory.h
>> +++ b/arch/arm64/include/asm/set_memory.h
>> @@ -14,6 +14,8 @@ int set_memory_valid(unsigned long addr, int numpages, int enable);
>> int set_direct_map_invalid_noflush(struct page *page);
>> int set_direct_map_default_noflush(struct page *page);
>> int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
>> +int folio_zap_direct_map(struct folio *folio);
>> +int folio_restore_direct_map(struct folio *folio);
>> bool kernel_page_present(struct page *page);
>>
>> int set_memory_encrypted(unsigned long addr, int numpages);
>> diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
>> index f0e784b963e6..a94eff324dda 100644
>> --- a/arch/arm64/mm/pageattr.c
>> +++ b/arch/arm64/mm/pageattr.c
>> @@ -357,6 +357,18 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
>> return set_memory_valid(addr, nr, valid);
>> }
>>
>> +int folio_zap_direct_map(struct folio *folio)
>> +{
>> + return set_direct_map_valid_noflush(folio_page(folio, 0),
>> + folio_nr_pages(folio), false);
>> +}
>> +
>> +int folio_restore_direct_map(struct folio *folio)
>> +{
>> + return set_direct_map_valid_noflush(folio_page(folio, 0),
>> + folio_nr_pages(folio), true);
>> +}
>> +
>> #ifdef CONFIG_DEBUG_PAGEALLOC
>> /*
>> * This is - apart from the return value - doing the same
>> diff --git a/arch/loongarch/include/asm/set_memory.h b/arch/loongarch/include/asm/set_memory.h
>> index 55dfaefd02c8..9bc80ac420a9 100644
>> --- a/arch/loongarch/include/asm/set_memory.h
>> +++ b/arch/loongarch/include/asm/set_memory.h
>> @@ -18,5 +18,7 @@ bool kernel_page_present(struct page *page);
>> int set_direct_map_default_noflush(struct page *page);
>> int set_direct_map_invalid_noflush(struct page *page);
>> int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
>> +int folio_zap_direct_map(struct folio *folio);
>> +int folio_restore_direct_map(struct folio *folio);
>>
>> #endif /* _ASM_LOONGARCH_SET_MEMORY_H */
>> diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c
>> index f5e910b68229..14bd322dd112 100644
>> --- a/arch/loongarch/mm/pageattr.c
>> +++ b/arch/loongarch/mm/pageattr.c
>> @@ -236,3 +236,19 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
>>
>> return __set_memory(addr, 1, set, clear);
>> }
>> +
>> +int folio_zap_direct_map(struct folio *folio)
>> +{
>> + int ret;
>> +
>> + ret = set_direct_map_valid_noflush(folio_page(folio, 0),
>> + folio_nr_pages(folio), false);
>> +
>> + return ret;
> Why not use a single statement which is the same as the ARM64 version?
> The RISCV version has the same problem.
No reason for them to be different. Will update in the next version.
Thank you!
>
> Huacai
>
>> +}
>> +
>> +int folio_restore_direct_map(struct folio *folio)
>> +{
>> + return set_direct_map_valid_noflush(folio_page(folio, 0),
>> + folio_nr_pages(folio), true);
>> +}
>> diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h
>> index 87389e93325a..16557b70c830 100644
>> --- a/arch/riscv/include/asm/set_memory.h
>> +++ b/arch/riscv/include/asm/set_memory.h
>> @@ -43,6 +43,8 @@ static inline int set_kernel_memory(char *startp, char *endp,
>> int set_direct_map_invalid_noflush(struct page *page);
>> int set_direct_map_default_noflush(struct page *page);
>> int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
>> +int folio_zap_direct_map(struct folio *folio);
>> +int folio_restore_direct_map(struct folio *folio);
>> bool kernel_page_present(struct page *page);
>>
>> #endif /* __ASSEMBLER__ */
>> diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
>> index 3f76db3d2769..2c218868114b 100644
>> --- a/arch/riscv/mm/pageattr.c
>> +++ b/arch/riscv/mm/pageattr.c
>> @@ -401,6 +401,22 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
>> return __set_memory((unsigned long)page_address(page), nr, set, clear);
>> }
>>
>> +int folio_zap_direct_map(struct folio *folio)
>> +{
>> + int ret;
>> +
>> + ret = set_direct_map_valid_noflush(folio_page(folio, 0),
>> + folio_nr_pages(folio), false);
>> +
>> + return ret;
>> +}
>> +
>> +int folio_restore_direct_map(struct folio *folio)
>> +{
>> + return set_direct_map_valid_noflush(folio_page(folio, 0),
>> + folio_nr_pages(folio), true);
>> +}
>> +
>> #ifdef CONFIG_DEBUG_PAGEALLOC
>> static int debug_pagealloc_set_page(pte_t *pte, unsigned long addr, void *data)
>> {
>> diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h
>> index 94092f4ae764..fc73652e5715 100644
>> --- a/arch/s390/include/asm/set_memory.h
>> +++ b/arch/s390/include/asm/set_memory.h
>> @@ -63,6 +63,8 @@ __SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K)
>> int set_direct_map_invalid_noflush(struct page *page);
>> int set_direct_map_default_noflush(struct page *page);
>> int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
>> +int folio_zap_direct_map(struct folio *folio);
>> +int folio_restore_direct_map(struct folio *folio);
>> bool kernel_page_present(struct page *page);
>>
>> #endif
>> diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
>> index d3ce04a4b248..df4a487b484d 100644
>> --- a/arch/s390/mm/pageattr.c
>> +++ b/arch/s390/mm/pageattr.c
>> @@ -412,6 +412,24 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
>> return __set_memory((unsigned long)page_to_virt(page), nr, flags);
>> }
>>
>> +int folio_zap_direct_map(struct folio *folio)
>> +{
>> + unsigned long addr = (unsigned long)folio_address(folio);
>> + int ret;
>> +
>> + ret = set_direct_map_valid_noflush(folio_page(folio, 0),
>> + folio_nr_pages(folio), false);
>> + flush_tlb_kernel_range(addr, addr + folio_size(folio));
>> +
>> + return ret;
>> +}
>> +
>> +int folio_restore_direct_map(struct folio *folio)
>> +{
>> + return set_direct_map_valid_noflush(folio_page(folio, 0),
>> + folio_nr_pages(folio), true);
>> +}
>> +
>> bool kernel_page_present(struct page *page)
>> {
>> unsigned long addr;
>> diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
>> index 61f56cdaccb5..7208af609121 100644
>> --- a/arch/x86/include/asm/set_memory.h
>> +++ b/arch/x86/include/asm/set_memory.h
>> @@ -90,6 +90,8 @@ int set_pages_rw(struct page *page, int numpages);
>> int set_direct_map_invalid_noflush(struct page *page);
>> int set_direct_map_default_noflush(struct page *page);
>> int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
>> +int folio_zap_direct_map(struct folio *folio);
>> +int folio_restore_direct_map(struct folio *folio);
>> bool kernel_page_present(struct page *page);
>>
>> extern int kernel_set_to_readonly;
>> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
>> index 6c6eb486f7a6..3f0fc30eb320 100644
>> --- a/arch/x86/mm/pat/set_memory.c
>> +++ b/arch/x86/mm/pat/set_memory.c
>> @@ -2656,6 +2656,26 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
>> return __set_pages_np(page, nr);
>> }
>>
>> +int folio_zap_direct_map(struct folio *folio)
>> +{
>> + unsigned long addr = (unsigned long)folio_address(folio);
>> + int ret;
>> +
>> + ret = set_direct_map_valid_noflush(folio_page(folio, 0),
>> + folio_nr_pages(folio), false);
>> + flush_tlb_kernel_range(addr, addr + folio_size(folio));
>> +
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_FOR_MODULES(folio_zap_direct_map, "kvm");
>> +
>> +int folio_restore_direct_map(struct folio *folio)
>> +{
>> + return set_direct_map_valid_noflush(folio_page(folio, 0),
>> + folio_nr_pages(folio), true);
>> +}
>> +EXPORT_SYMBOL_FOR_MODULES(folio_restore_direct_map, "kvm");
>> +
>> #ifdef CONFIG_DEBUG_PAGEALLOC
>> void __kernel_map_pages(struct page *page, int numpages, int enable)
>> {
>> diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
>> index 3030d9245f5a..8d1c8a7f7d79 100644
>> --- a/include/linux/set_memory.h
>> +++ b/include/linux/set_memory.h
>> @@ -40,6 +40,16 @@ static inline int set_direct_map_valid_noflush(struct page *page,
>> return 0;
>> }
>>
>> +static inline int folio_zap_direct_map(struct folio *folio)
>> +{
>> + return 0;
>> +}
>> +
>> +static inline int folio_restore_direct_map(struct folio *folio)
>> +{
>> + return 0;
>> +}
>> +
>> static inline bool kernel_page_present(struct page *page)
>> {
>> return true;
>> --
>> 2.50.1
>>
>>
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers
2026-01-14 13:45 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Kalyazin, Nikita
2026-01-15 10:54 ` Huacai Chen
@ 2026-01-15 12:12 ` Heiko Carstens
2026-01-15 15:25 ` [PATCH v9 01/13] set_memory: add folio_{zap, restore}_direct_map helpers Nikita Kalyazin
2026-01-15 15:55 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Matthew Wilcox
2026-01-15 21:07 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Ackerley Tng
3 siblings, 1 reply; 62+ messages in thread
From: Heiko Carstens @ 2026-01-15 12:12 UTC (permalink / raw)
To: Kalyazin, Nikita
Cc: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch, pbonzini, corbet, maz,
oupton, joey.gouly, suzuki.poulose, yuzenghui, catalin.marinas,
will, seanjc, tglx, mingo, bp, dave.hansen, x86, hpa, luto,
peterz, willy, akpm, david, lorenzo.stoakes, Liam.Howlett,
vbabka, rppt, surenb, mhocko, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa, jgg, jhubbard, peterx, jannh, pfalcato, shuah,
riel, ryan.roberts, jgross, yu-cheng.yu, kas, coxu,
kevin.brodsky, ackerleytng, maobibo, prsampat, mlevitsk,
jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, Jonathan.Cameron, palmer, pjw, shijie,
svens, thuth, wyihan, yang, vannapurve, jackmanb, aneesh.kumar,
patrick.roy, Thomson, Jack, Itazuri, Takahiro, Manwaring, Derek,
Cali, Marco
On Wed, Jan 14, 2026 at 01:45:23PM +0000, Kalyazin, Nikita wrote:
> From: Nikita Kalyazin <kalyazin@amazon.com>
>
> These allow guest_memfd to remove its memory from the direct map.
> Only implement them for architectures that have direct map.
> In folio_zap_direct_map(), flush TLB on architectures where
> set_direct_map_valid_noflush() does not flush it internally.
...
> diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
> index d3ce04a4b248..df4a487b484d 100644
> --- a/arch/s390/mm/pageattr.c
> +++ b/arch/s390/mm/pageattr.c
> @@ -412,6 +412,24 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
> return __set_memory((unsigned long)page_to_virt(page), nr, flags);
> }
>
> +int folio_zap_direct_map(struct folio *folio)
> +{
> + unsigned long addr = (unsigned long)folio_address(folio);
> + int ret;
> +
> + ret = set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), false);
> + flush_tlb_kernel_range(addr, addr + folio_size(folio));
> +
> + return ret;
> +}
The instructions used in the s390 implementation of
set_direct_map_valid_noflush() do flush TLB entries.
The extra flush_tlb_kernel_range() is not required.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers
2026-01-15 12:12 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Heiko Carstens
@ 2026-01-15 15:25 ` Nikita Kalyazin
0 siblings, 0 replies; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-15 15:25 UTC (permalink / raw)
To: Heiko Carstens, Kalyazin, Nikita
Cc: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch, pbonzini, corbet, maz,
oupton, joey.gouly, suzuki.poulose, yuzenghui, catalin.marinas,
will, seanjc, tglx, mingo, bp, dave.hansen, x86, hpa, luto,
peterz, willy, akpm, david, lorenzo.stoakes, Liam.Howlett,
vbabka, rppt, surenb, mhocko, ast, daniel, andrii, martin.lau,
eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
haoluo, jolsa, jgg, jhubbard, peterx, jannh, pfalcato, shuah,
riel, ryan.roberts, jgross, yu-cheng.yu, kas, coxu,
kevin.brodsky, ackerleytng, maobibo, prsampat, mlevitsk,
jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, Jonathan.Cameron, palmer, pjw, shijie,
svens, thuth, wyihan, yang, vannapurve, jackmanb, aneesh.kumar,
patrick.roy, Thomson, Jack, Itazuri, Takahiro, Manwaring, Derek,
Cali, Marco
On 15/01/2026 12:12, Heiko Carstens wrote:
> On Wed, Jan 14, 2026 at 01:45:23PM +0000, Kalyazin, Nikita wrote:
>> From: Nikita Kalyazin <kalyazin@amazon.com>
>>
>> These allow guest_memfd to remove its memory from the direct map.
>> Only implement them for architectures that have direct map.
>> In folio_zap_direct_map(), flush TLB on architectures where
>> set_direct_map_valid_noflush() does not flush it internally.
>
> ...
>
>> diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
>> index d3ce04a4b248..df4a487b484d 100644
>> --- a/arch/s390/mm/pageattr.c
>> +++ b/arch/s390/mm/pageattr.c
>> @@ -412,6 +412,24 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
>> return __set_memory((unsigned long)page_to_virt(page), nr, flags);
>> }
>>
>> +int folio_zap_direct_map(struct folio *folio)
>> +{
>> + unsigned long addr = (unsigned long)folio_address(folio);
>> + int ret;
>> +
>> + ret = set_direct_map_valid_noflush(folio_page(folio, 0),
>> + folio_nr_pages(folio), false);
>> + flush_tlb_kernel_range(addr, addr + folio_size(folio));
>> +
>> + return ret;
>> +}
>
> The instructions used in the s390 implementation of
> set_direct_map_valid_noflush() do flush TLB entries.
> The extra flush_tlb_kernel_range() is not required.
Thanks, Heiko. Will update in the next version.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers
2026-01-14 13:45 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Kalyazin, Nikita
2026-01-15 10:54 ` Huacai Chen
2026-01-15 12:12 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Heiko Carstens
@ 2026-01-15 15:55 ` Matthew Wilcox
2026-01-15 17:45 ` [PATCH v9 01/13] set_memory: add folio_{zap, restore}_direct_map helpers Nikita Kalyazin
2026-01-15 21:07 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Ackerley Tng
3 siblings, 1 reply; 62+ messages in thread
From: Matthew Wilcox @ 2026-01-15 15:55 UTC (permalink / raw)
To: Kalyazin, Nikita
Cc: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch, pbonzini, corbet, maz,
oupton, joey.gouly, suzuki.poulose, yuzenghui, catalin.marinas,
will, seanjc, tglx, mingo, bp, dave.hansen, x86, hpa, luto,
peterz, akpm, david, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, mhocko, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg,
jhubbard, peterx, jannh, pfalcato, shuah, riel, ryan.roberts,
jgross, yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng,
maobibo, prsampat, mlevitsk, jmattson, jthoughton, agordeev,
alex, aou, borntraeger, chenhuacai, dev.jain, gor, hca,
Jonathan.Cameron, palmer, pjw, shijie, svens, thuth, wyihan,
yang, vannapurve, jackmanb, aneesh.kumar, patrick.roy, Thomson,
Jack, Itazuri, Takahiro, Manwaring, Derek, Cali, Marco
On Wed, Jan 14, 2026 at 01:45:23PM +0000, Kalyazin, Nikita wrote:
> +int folio_zap_direct_map(struct folio *folio)
> +{
> + return set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), false);
> +}
The implementation isn't the greatest. None of the implementations
of set_direct_map_valid_noflush() actually do anything with the struct
page; they all call page_address() or page_to_virt() (fundamentally the
same thing). So converting folio->page->address is a bit inefficient.
It feels like we should change set_direct_map_valid_noflush() to take a
const void * and pass either page_address() or folio_address(), depending
whether the caller has a page or a folio. What do you think?
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 01/13] set_memory: add folio_{zap, restore}_direct_map helpers
2026-01-15 15:55 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Matthew Wilcox
@ 2026-01-15 17:45 ` Nikita Kalyazin
2026-01-15 20:05 ` David Hildenbrand (Red Hat)
0 siblings, 1 reply; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-15 17:45 UTC (permalink / raw)
To: Matthew Wilcox, Kalyazin, Nikita
Cc: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch, pbonzini, corbet, maz,
oupton, joey.gouly, suzuki.poulose, yuzenghui, catalin.marinas,
will, seanjc, tglx, mingo, bp, dave.hansen, x86, hpa, luto,
peterz, akpm, david, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, mhocko, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg,
jhubbard, peterx, jannh, pfalcato, shuah, riel, ryan.roberts,
jgross, yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng,
maobibo, prsampat, mlevitsk, jmattson, jthoughton, agordeev,
alex, aou, borntraeger, chenhuacai, dev.jain, gor, hca,
Jonathan.Cameron, palmer, pjw, shijie, svens, thuth, wyihan,
yang, vannapurve, jackmanb, aneesh.kumar, patrick.roy, Thomson,
Jack, Itazuri, Takahiro, Manwaring, Derek, Cali, Marco
On 15/01/2026 15:55, Matthew Wilcox wrote:
> On Wed, Jan 14, 2026 at 01:45:23PM +0000, Kalyazin, Nikita wrote:
>> +int folio_zap_direct_map(struct folio *folio)
>> +{
>> + return set_direct_map_valid_noflush(folio_page(folio, 0),
>> + folio_nr_pages(folio), false);
>> +}
>
> The implementation isn't the greatest. None of the implementations
> of set_direct_map_valid_noflush() actually do anything with the struct
> page; they all call page_address() or page_to_virt() (fundamentally the
> same thing). So converting folio->page->address is a bit inefficient.
>
> It feels like we should change set_direct_map_valid_noflush() to take a
> const void * and pass either page_address() or folio_address(), depending
> whether the caller has a page or a folio. What do you think?
I have nothing against that. execmem_set_direct_map_valid() appears to
be the only other user of set_direct_map_valid_noflush() so it isn't
going to be a broad change.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 01/13] set_memory: add folio_{zap, restore}_direct_map helpers
2026-01-15 17:45 ` [PATCH v9 01/13] set_memory: add folio_{zap, restore}_direct_map helpers Nikita Kalyazin
@ 2026-01-15 20:05 ` David Hildenbrand (Red Hat)
0 siblings, 0 replies; 62+ messages in thread
From: David Hildenbrand (Red Hat) @ 2026-01-15 20:05 UTC (permalink / raw)
To: kalyazin, Matthew Wilcox, Kalyazin, Nikita
Cc: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch, pbonzini, corbet, maz,
oupton, joey.gouly, suzuki.poulose, yuzenghui, catalin.marinas,
will, seanjc, tglx, mingo, bp, dave.hansen, x86, hpa, luto,
peterz, akpm, lorenzo.stoakes, Liam.Howlett, vbabka, rppt,
surenb, mhocko, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg,
jhubbard, peterx, jannh, pfalcato, shuah, riel, ryan.roberts,
jgross, yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng,
maobibo, prsampat, mlevitsk, jmattson, jthoughton, agordeev,
alex, aou, borntraeger, chenhuacai, dev.jain, gor, hca,
Jonathan.Cameron, palmer, pjw, shijie, svens, thuth, wyihan,
yang, vannapurve, jackmanb, aneesh.kumar, patrick.roy, Thomson,
Jack, Itazuri, Takahiro, Manwaring, Derek, Cali, Marco
On 1/15/26 18:45, Nikita Kalyazin wrote:
>
>
> On 15/01/2026 15:55, Matthew Wilcox wrote:
>> On Wed, Jan 14, 2026 at 01:45:23PM +0000, Kalyazin, Nikita wrote:
>>> +int folio_zap_direct_map(struct folio *folio)
>>> +{
>>> + return set_direct_map_valid_noflush(folio_page(folio, 0),
>>> + folio_nr_pages(folio), false);
>>> +}
>>
>> The implementation isn't the greatest. None of the implementations
>> of set_direct_map_valid_noflush() actually do anything with the struct
>> page; they all call page_address() or page_to_virt() (fundamentally the
>> same thing). So converting folio->page->address is a bit inefficient.
>>
>> It feels like we should change set_direct_map_valid_noflush() to take a
>> const void * and pass either page_address() or folio_address(), depending
>> whether the caller has a page or a folio. What do you think?
>
> I have nothing against that. execmem_set_direct_map_valid() appears to
> be the only other user of set_direct_map_valid_noflush() so it isn't
> going to be a broad change.
Makes perfect sense to me :)
--
Cheers
David
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers
2026-01-14 13:45 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Kalyazin, Nikita
` (2 preceding siblings ...)
2026-01-15 15:55 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Matthew Wilcox
@ 2026-01-15 21:07 ` Ackerley Tng
3 siblings, 0 replies; 62+ messages in thread
From: Ackerley Tng @ 2026-01-15 21:07 UTC (permalink / raw)
To: Kalyazin, Nikita, kvm, linux-doc, linux-kernel, linux-arm-kernel,
kvmarm, linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
"Kalyazin, Nikita" <kalyazin@amazon.co.uk> writes:
> From: Nikita Kalyazin <kalyazin@amazon.com>
>
> These allow guest_memfd to remove its memory from the direct map.
> Only implement them for architectures that have direct map.
> In folio_zap_direct_map(), flush TLB on architectures where
> set_direct_map_valid_noflush() does not flush it internally.
>
> The new helpers need to be accessible to KVM on architectures that
> support guest_memfd (x86 and arm64). Since arm64 does not support
> building KVM as a module, only export them on x86.
>
> Direct map removal gives guest_memfd the same protection that
> memfd_secret does, such as hardening against Spectre-like attacks
> through in-kernel gadgets.
>
> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
> ---
> arch/arm64/include/asm/set_memory.h | 2 ++
> arch/arm64/mm/pageattr.c | 12 ++++++++++++
> arch/loongarch/include/asm/set_memory.h | 2 ++
> arch/loongarch/mm/pageattr.c | 16 ++++++++++++++++
> arch/riscv/include/asm/set_memory.h | 2 ++
> arch/riscv/mm/pageattr.c | 16 ++++++++++++++++
> arch/s390/include/asm/set_memory.h | 2 ++
> arch/s390/mm/pageattr.c | 18 ++++++++++++++++++
> arch/x86/include/asm/set_memory.h | 2 ++
> arch/x86/mm/pat/set_memory.c | 20 ++++++++++++++++++++
> include/linux/set_memory.h | 10 ++++++++++
> 11 files changed, 102 insertions(+)
>
> diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h
> index 90f61b17275e..d949f1deb701 100644
> --- a/arch/arm64/include/asm/set_memory.h
> +++ b/arch/arm64/include/asm/set_memory.h
> @@ -14,6 +14,8 @@ int set_memory_valid(unsigned long addr, int numpages, int enable);
> int set_direct_map_invalid_noflush(struct page *page);
> int set_direct_map_default_noflush(struct page *page);
> int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid);
> +int folio_zap_direct_map(struct folio *folio);
> +int folio_restore_direct_map(struct folio *folio);
> bool kernel_page_present(struct page *page);
>
> int set_memory_encrypted(unsigned long addr, int numpages);
> diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
> index f0e784b963e6..a94eff324dda 100644
> --- a/arch/arm64/mm/pageattr.c
> +++ b/arch/arm64/mm/pageattr.c
> @@ -357,6 +357,18 @@ int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
> return set_memory_valid(addr, nr, valid);
> }
>
> +int folio_zap_direct_map(struct folio *folio)
> +{
> + return set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), false);
> +}
> +
> +int folio_restore_direct_map(struct folio *folio)
> +{
> + return set_direct_map_valid_noflush(folio_page(folio, 0),
> + folio_nr_pages(folio), true);
> +}
> +
Was going to suggest a _noflush suffix to these functions, but saw
Aneesh's comment that these functions actually do flush_tlb_kernel [1]
[1] https://lore.kernel.org/all/yq5ajz07czvz.fsf@kernel.org/
Reviewed-by: Ackerley Tng <ackerleytng@google.com>
> #ifdef CONFIG_DEBUG_PAGEALLOC
> /*
> * This is - apart from the return value - doing the same
>
> [...snip...]
>
^ permalink raw reply [flat|nested] 62+ messages in thread
* [PATCH v9 02/13] mm/gup: drop secretmem optimization from gup_fast_folio_allowed
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
2026-01-14 13:45 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Kalyazin, Nikita
@ 2026-01-14 13:45 ` Kalyazin, Nikita
2026-01-15 20:04 ` David Hildenbrand (Red Hat)
2026-01-15 21:40 ` Ackerley Tng
2026-01-14 13:45 ` [PATCH v9 03/13] mm: introduce AS_NO_DIRECT_MAP Kalyazin, Nikita
` (10 subsequent siblings)
12 siblings, 2 replies; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:45 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Patrick Roy <patrick.roy@linux.dev>
This drops an optimization in gup_fast_folio_allowed() where
secretmem_mapping() was only called if CONFIG_SECRETMEM=y. secretmem is
enabled by default since commit b758fe6df50d ("mm/secretmem: make it on
by default"), so the secretmem check did not actually end up elided in
most cases anymore anyway.
This is in preparation of the generalization of handling mappings where
direct map entries of folios are set to not present. Currently,
mappings that match this description are secretmem mappings
(memfd_secret()). Later, some guest_memfd configurations will also fall
into this category.
Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
mm/gup.c | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)
diff --git a/mm/gup.c b/mm/gup.c
index 95d948c8e86c..9cad53acbc99 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2739,7 +2739,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
{
bool reject_file_backed = false;
struct address_space *mapping;
- bool check_secretmem = false;
unsigned long mapping_flags;
/*
@@ -2751,14 +2750,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
reject_file_backed = true;
/* We hold a folio reference, so we can safely access folio fields. */
-
- /* secretmem folios are always order-0 folios. */
- if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
- check_secretmem = true;
-
- if (!reject_file_backed && !check_secretmem)
- return true;
-
if (WARN_ON_ONCE(folio_test_slab(folio)))
return false;
@@ -2800,7 +2791,7 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
* At this point, we know the mapping is non-null and points to an
* address_space object.
*/
- if (check_secretmem && secretmem_mapping(mapping))
+ if (secretmem_mapping(mapping))
return false;
/* The only remaining allowed file system is shmem. */
return !reject_file_backed || shmem_mapping(mapping);
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 02/13] mm/gup: drop secretmem optimization from gup_fast_folio_allowed
2026-01-14 13:45 ` [PATCH v9 02/13] mm/gup: drop secretmem optimization from gup_fast_folio_allowed Kalyazin, Nikita
@ 2026-01-15 20:04 ` David Hildenbrand (Red Hat)
2026-01-15 21:40 ` Ackerley Tng
1 sibling, 0 replies; 62+ messages in thread
From: David Hildenbrand (Red Hat) @ 2026-01-15 20:04 UTC (permalink / raw)
To: Kalyazin, Nikita, kvm, linux-doc, linux-kernel, linux-arm-kernel,
kvmarm, linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco
On 1/14/26 14:45, Kalyazin, Nikita wrote:
> From: Patrick Roy <patrick.roy@linux.dev>
>
> This drops an optimization in gup_fast_folio_allowed() where
> secretmem_mapping() was only called if CONFIG_SECRETMEM=y. secretmem is
> enabled by default since commit b758fe6df50d ("mm/secretmem: make it on
> by default"), so the secretmem check did not actually end up elided in
> most cases anymore anyway.
>
> This is in preparation of the generalization of handling mappings where
> direct map entries of folios are set to not present. Currently,
> mappings that match this description are secretmem mappings
> (memfd_secret()). Later, some guest_memfd configurations will also fall
> into this category.
>
> Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
--
Cheers
David
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 02/13] mm/gup: drop secretmem optimization from gup_fast_folio_allowed
2026-01-14 13:45 ` [PATCH v9 02/13] mm/gup: drop secretmem optimization from gup_fast_folio_allowed Kalyazin, Nikita
2026-01-15 20:04 ` David Hildenbrand (Red Hat)
@ 2026-01-15 21:40 ` Ackerley Tng
2026-01-16 14:55 ` Nikita Kalyazin
1 sibling, 1 reply; 62+ messages in thread
From: Ackerley Tng @ 2026-01-15 21:40 UTC (permalink / raw)
To: Kalyazin, Nikita, kvm, linux-doc, linux-kernel, linux-arm-kernel,
kvmarm, linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
"Kalyazin, Nikita" <kalyazin@amazon.co.uk> writes:
> From: Patrick Roy <patrick.roy@linux.dev>
>
> This drops an optimization in gup_fast_folio_allowed() where
> secretmem_mapping() was only called if CONFIG_SECRETMEM=y. secretmem is
> enabled by default since commit b758fe6df50d ("mm/secretmem: make it on
> by default"), so the secretmem check did not actually end up elided in
> most cases anymore anyway.
>
> This is in preparation of the generalization of handling mappings where
> direct map entries of folios are set to not present. Currently,
> mappings that match this description are secretmem mappings
> (memfd_secret()). Later, some guest_memfd configurations will also fall
> into this category.
>
> Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
> ---
> mm/gup.c | 11 +----------
> 1 file changed, 1 insertion(+), 10 deletions(-)
>
> diff --git a/mm/gup.c b/mm/gup.c
> index 95d948c8e86c..9cad53acbc99 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -2739,7 +2739,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
> {
> bool reject_file_backed = false;
> struct address_space *mapping;
> - bool check_secretmem = false;
> unsigned long mapping_flags;
>
> /*
> @@ -2751,14 +2750,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
Copying some lines the diff didn't contain:
/*
* If we aren't pinning then no problematic write can occur. A long term
* pin is the most egregious case so this is the one we disallow.
*/
if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) ==
(FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
If we're pinning, can we already return true here? IIUC this function
is passed a folio that is file-backed, and the check if (!mapping) is
just there to catch the case where the mapping got truncated.
Or should we wait for the check where the mapping got truncated? If so,
then maybe we can move this "are we pinning" check to after this check
and remove the reject_file_backed variable?
/*
* The mapping may have been truncated, in any case we cannot determine
* if this mapping is safe - fall back to slow path to determine how to
* proceed.
*/
if (!mapping)
return false;
> reject_file_backed = true;
>
> /* We hold a folio reference, so we can safely access folio fields. */
> -
> - /* secretmem folios are always order-0 folios. */
> - if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
> - check_secretmem = true;
> -
> - if (!reject_file_backed && !check_secretmem)
> - return true;
> -
> if (WARN_ON_ONCE(folio_test_slab(folio)))
> return false;
>
> @@ -2800,7 +2791,7 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
> * At this point, we know the mapping is non-null and points to an
> * address_space object.
> */
> - if (check_secretmem && secretmem_mapping(mapping))
> + if (secretmem_mapping(mapping))
> return false;
> /* The only remaining allowed file system is shmem. */
> return !reject_file_backed || shmem_mapping(mapping);
> --
> 2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 02/13] mm/gup: drop secretmem optimization from gup_fast_folio_allowed
2026-01-15 21:40 ` Ackerley Tng
@ 2026-01-16 14:55 ` Nikita Kalyazin
2026-01-22 0:20 ` Ackerley Tng
0 siblings, 1 reply; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-16 14:55 UTC (permalink / raw)
To: Ackerley Tng, Kalyazin, Nikita, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm, linux-fsdevel, linux-mm, bpf,
linux-kselftest, kernel, linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
On 15/01/2026 21:40, Ackerley Tng wrote:
> "Kalyazin, Nikita" <kalyazin@amazon.co.uk> writes:
>
>> From: Patrick Roy <patrick.roy@linux.dev>
>>
>> This drops an optimization in gup_fast_folio_allowed() where
>> secretmem_mapping() was only called if CONFIG_SECRETMEM=y. secretmem is
>> enabled by default since commit b758fe6df50d ("mm/secretmem: make it on
>> by default"), so the secretmem check did not actually end up elided in
>> most cases anymore anyway.
>>
>> This is in preparation of the generalization of handling mappings where
>> direct map entries of folios are set to not present. Currently,
>> mappings that match this description are secretmem mappings
>> (memfd_secret()). Later, some guest_memfd configurations will also fall
>> into this category.
>>
>> Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
>> Acked-by: Vlastimil Babka <vbabka@suse.cz>
>> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
>> ---
>> mm/gup.c | 11 +----------
>> 1 file changed, 1 insertion(+), 10 deletions(-)
>>
>> diff --git a/mm/gup.c b/mm/gup.c
>> index 95d948c8e86c..9cad53acbc99 100644
>> --- a/mm/gup.c
>> +++ b/mm/gup.c
>> @@ -2739,7 +2739,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
>> {
>> bool reject_file_backed = false;
>> struct address_space *mapping;
>> - bool check_secretmem = false;
>> unsigned long mapping_flags;
>>
>> /*
>> @@ -2751,14 +2750,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
>
> Copying some lines the diff didn't contain:
>
> /*
> * If we aren't pinning then no problematic write can occur. A long term
> * pin is the most egregious case so this is the one we disallow.
> */
> if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) ==
> (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
>
> If we're pinning, can we already return true here? IIUC this function
> is passed a folio that is file-backed, and the check if (!mapping) is
> just there to catch the case where the mapping got truncated.
I have to admit that I am not comfortable with removing this check,
unless someone says it's certainly alright.
>
> Or should we wait for the check where the mapping got truncated? If so,
> then maybe we can move this "are we pinning" check to after this check
> and remove the reject_file_backed variable?
I can indeed move the pinning check to the end to remove the variable.
I'd do it in a separate patch.
>
> /*
> * The mapping may have been truncated, in any case we cannot determine
> * if this mapping is safe - fall back to slow path to determine how to
> * proceed.
> */
> if (!mapping)
> return false;
>
>
>> reject_file_backed = true;
>>
>> /* We hold a folio reference, so we can safely access folio fields. */
>> -
>> - /* secretmem folios are always order-0 folios. */
>> - if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
>> - check_secretmem = true;
>> -
>> - if (!reject_file_backed && !check_secretmem)
>> - return true;
>> -
>> if (WARN_ON_ONCE(folio_test_slab(folio)))
>> return false;
>>
>> @@ -2800,7 +2791,7 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
>> * At this point, we know the mapping is non-null and points to an
>> * address_space object.
>> */
>> - if (check_secretmem && secretmem_mapping(mapping))
>> + if (secretmem_mapping(mapping))
>> return false;
>> /* The only remaining allowed file system is shmem. */
>> return !reject_file_backed || shmem_mapping(mapping);
>> --
>> 2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 02/13] mm/gup: drop secretmem optimization from gup_fast_folio_allowed
2026-01-16 14:55 ` Nikita Kalyazin
@ 2026-01-22 0:20 ` Ackerley Tng
0 siblings, 0 replies; 62+ messages in thread
From: Ackerley Tng @ 2026-01-22 0:20 UTC (permalink / raw)
To: kalyazin, Kalyazin, Nikita, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm, linux-fsdevel, linux-mm, bpf,
linux-kselftest, kernel, linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
Nikita Kalyazin <kalyazin@amazon.com> writes:
> On 15/01/2026 21:40, Ackerley Tng wrote:
>> "Kalyazin, Nikita" <kalyazin@amazon.co.uk> writes:
>>
>>> From: Patrick Roy <patrick.roy@linux.dev>
>>>
>>> This drops an optimization in gup_fast_folio_allowed() where
>>> secretmem_mapping() was only called if CONFIG_SECRETMEM=y. secretmem is
>>> enabled by default since commit b758fe6df50d ("mm/secretmem: make it on
>>> by default"), so the secretmem check did not actually end up elided in
>>> most cases anymore anyway.
>>>
>>> This is in preparation of the generalization of handling mappings where
>>> direct map entries of folios are set to not present. Currently,
>>> mappings that match this description are secretmem mappings
>>> (memfd_secret()). Later, some guest_memfd configurations will also fall
>>> into this category.
>>>
>>> Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
>>> Acked-by: Vlastimil Babka <vbabka@suse.cz>
>>> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
>>> ---
>>> mm/gup.c | 11 +----------
>>> 1 file changed, 1 insertion(+), 10 deletions(-)
>>>
>>> diff --git a/mm/gup.c b/mm/gup.c
>>> index 95d948c8e86c..9cad53acbc99 100644
>>> --- a/mm/gup.c
>>> +++ b/mm/gup.c
>>> @@ -2739,7 +2739,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
>>> {
>>> bool reject_file_backed = false;
>>> struct address_space *mapping;
>>> - bool check_secretmem = false;
>>> unsigned long mapping_flags;
>>>
>>> /*
>>> @@ -2751,14 +2750,6 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
>>
>> Copying some lines the diff didn't contain:
>>
>> /*
>> * If we aren't pinning then no problematic write can occur. A long term
>> * pin is the most egregious case so this is the one we disallow.
>> */
>> if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) ==
>> (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
>>
>> If we're pinning, can we already return true here? IIUC this function
>> is passed a folio that is file-backed, and the check if (!mapping) is
>> just there to catch the case where the mapping got truncated.
>
> I have to admit that I am not comfortable with removing this check,
> unless someone says it's certainly alright.
>
Perhaps David can help here, David last changed this in
f002882ca369aba3eece5006f3346ccf75ede7c5 (mm: merge folio_is_secretmem()
and folio_fast_pin_allowed() into gup_fast_folio_allowed()) from return
true to check_secretmem = true :)
>>
>> Or should we wait for the check where the mapping got truncated? If so,
>> then maybe we can move this "are we pinning" check to after this check
>> and remove the reject_file_backed variable?
>
> I can indeed move the pinning check to the end to remove the variable.
> I'd do it in a separate patch.
>
>>
>> /*
>> * The mapping may have been truncated, in any case we cannot determine
>> * if this mapping is safe - fall back to slow path to determine how to
>> * proceed.
>> */
>> if (!mapping)
>> return false;
>>
>>
>>> reject_file_backed = true;
>>>
>>> /* We hold a folio reference, so we can safely access folio fields. */
>>> -
>>> - /* secretmem folios are always order-0 folios. */
>>> - if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
>>> - check_secretmem = true;
>>> -
>>> - if (!reject_file_backed && !check_secretmem)
>>> - return true;
>>> -
>>> if (WARN_ON_ONCE(folio_test_slab(folio)))
>>> return false;
>>>
>>> @@ -2800,7 +2791,7 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
>>> * At this point, we know the mapping is non-null and points to an
>>> * address_space object.
>>> */
>>> - if (check_secretmem && secretmem_mapping(mapping))
>>> + if (secretmem_mapping(mapping))
>>> return false;
>>> /* The only remaining allowed file system is shmem. */
>>> return !reject_file_backed || shmem_mapping(mapping);
>>> --
>>> 2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread
* [PATCH v9 03/13] mm: introduce AS_NO_DIRECT_MAP
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
2026-01-14 13:45 ` [PATCH v9 01/13] set_memory: add folio_{zap,restore}_direct_map helpers Kalyazin, Nikita
2026-01-14 13:45 ` [PATCH v9 02/13] mm/gup: drop secretmem optimization from gup_fast_folio_allowed Kalyazin, Nikita
@ 2026-01-14 13:45 ` Kalyazin, Nikita
2026-01-15 21:42 ` Ackerley Tng
2026-01-14 13:45 ` [PATCH v9 04/13] KVM: guest_memfd: Add stub for kvm_arch_gmem_invalidate Kalyazin, Nikita
` (9 subsequent siblings)
12 siblings, 1 reply; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:45 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Patrick Roy <patrick.roy@linux.dev>
Add AS_NO_DIRECT_MAP for mappings where direct map entries of folios are
set to not present. Currently, mappings that match this description are
secretmem mappings (memfd_secret()). Later, some guest_memfd
configurations will also fall into this category.
Reject this new type of mappings in all locations that currently reject
secretmem mappings, on the assumption that if secretmem mappings are
rejected somewhere, it is precisely because of an inability to deal with
folios without direct map entries, and then make memfd_secret() use
AS_NO_DIRECT_MAP on its address_space to drop its special
vma_is_secretmem()/secretmem_mapping() checks.
Use a new flag instead of overloading AS_INACCESSIBLE (which is already
set by guest_memfd) because not all guest_memfd mappings will end up
being direct map removed (e.g. in pKVM setups, parts of guest_memfd that
can be mapped to userspace should also be GUP-able, and generally not
have restrictions on who can access it).
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
include/linux/pagemap.h | 16 ++++++++++++++++
include/linux/secretmem.h | 18 ------------------
lib/buildid.c | 4 ++--
mm/gup.c | 10 +++++-----
mm/mlock.c | 2 +-
mm/secretmem.c | 8 ++------
6 files changed, 26 insertions(+), 32 deletions(-)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 31a848485ad9..6ce7301d474a 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -210,6 +210,7 @@ enum mapping_flags {
AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9,
AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't
account usage to user cgroups */
+ AS_NO_DIRECT_MAP = 11, /* Folios in the mapping are not in the direct map */
/* Bits 16-25 are used for FOLIO_ORDER */
AS_FOLIO_ORDER_BITS = 5,
AS_FOLIO_ORDER_MIN = 16,
@@ -345,6 +346,21 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres
return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
}
+static inline void mapping_set_no_direct_map(struct address_space *mapping)
+{
+ set_bit(AS_NO_DIRECT_MAP, &mapping->flags);
+}
+
+static inline bool mapping_no_direct_map(const struct address_space *mapping)
+{
+ return test_bit(AS_NO_DIRECT_MAP, &mapping->flags);
+}
+
+static inline bool vma_has_no_direct_map(const struct vm_area_struct *vma)
+{
+ return vma->vm_file && mapping_no_direct_map(vma->vm_file->f_mapping);
+}
+
static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
{
return mapping->gfp_mask;
diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
index e918f96881f5..0ae1fb057b3d 100644
--- a/include/linux/secretmem.h
+++ b/include/linux/secretmem.h
@@ -4,28 +4,10 @@
#ifdef CONFIG_SECRETMEM
-extern const struct address_space_operations secretmem_aops;
-
-static inline bool secretmem_mapping(struct address_space *mapping)
-{
- return mapping->a_ops == &secretmem_aops;
-}
-
-bool vma_is_secretmem(struct vm_area_struct *vma);
bool secretmem_active(void);
#else
-static inline bool vma_is_secretmem(struct vm_area_struct *vma)
-{
- return false;
-}
-
-static inline bool secretmem_mapping(struct address_space *mapping)
-{
- return false;
-}
-
static inline bool secretmem_active(void)
{
return false;
diff --git a/lib/buildid.c b/lib/buildid.c
index aaf61dfc0919..b78fe5797e9c 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -46,8 +46,8 @@ static int freader_get_folio(struct freader *r, loff_t file_off)
freader_put_folio(r);
- /* reject secretmem folios created with memfd_secret() */
- if (secretmem_mapping(r->file->f_mapping))
+ /* reject folios without direct map entries (e.g. from memfd_secret() or guest_memfd()) */
+ if (mapping_no_direct_map(r->file->f_mapping))
return -EFAULT;
r->folio = filemap_get_folio(r->file->f_mapping, file_off >> PAGE_SHIFT);
diff --git a/mm/gup.c b/mm/gup.c
index 9cad53acbc99..11461a54b3ae 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -11,7 +11,6 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
-#include <linux/secretmem.h>
#include <linux/sched/signal.h>
#include <linux/rwsem.h>
@@ -1216,7 +1215,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
return -EOPNOTSUPP;
- if (vma_is_secretmem(vma))
+ if (vma_has_no_direct_map(vma))
return -EFAULT;
if (write) {
@@ -2724,7 +2723,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* This call assumes the caller has pinned the folio, that the lowest page table
* level still points to this folio, and that interrupts have been disabled.
*
- * GUP-fast must reject all secretmem folios.
+ * GUP-fast must reject all folios without direct map entries (such as secretmem).
*
* Writing to pinned file-backed dirty tracked folios is inherently problematic
* (see comment describing the writable_file_mapping_allowed() function). We
@@ -2753,7 +2752,7 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
if (WARN_ON_ONCE(folio_test_slab(folio)))
return false;
- /* hugetlb neither requires dirty-tracking nor can be secretmem. */
+ /* hugetlb neither requires dirty-tracking nor can be without direct map. */
if (folio_test_hugetlb(folio))
return true;
@@ -2791,8 +2790,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
* At this point, we know the mapping is non-null and points to an
* address_space object.
*/
- if (secretmem_mapping(mapping))
+ if (mapping_no_direct_map(mapping))
return false;
+
/* The only remaining allowed file system is shmem. */
return !reject_file_backed || shmem_mapping(mapping);
}
diff --git a/mm/mlock.c b/mm/mlock.c
index 2f699c3497a5..a6f4b3df4f3f 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -474,7 +474,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
- vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE))
+ vma_is_dax(vma) || vma_has_no_direct_map(vma) || (oldflags & VM_DROPPABLE))
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;
diff --git a/mm/secretmem.c b/mm/secretmem.c
index edf111e0a1bb..560cdbe1fe5d 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -134,11 +134,6 @@ static int secretmem_mmap_prepare(struct vm_area_desc *desc)
return 0;
}
-bool vma_is_secretmem(struct vm_area_struct *vma)
-{
- return vma->vm_ops == &secretmem_vm_ops;
-}
-
static const struct file_operations secretmem_fops = {
.release = secretmem_release,
.mmap_prepare = secretmem_mmap_prepare,
@@ -156,7 +151,7 @@ static void secretmem_free_folio(struct folio *folio)
folio_zero_segment(folio, 0, folio_size(folio));
}
-const struct address_space_operations secretmem_aops = {
+static const struct address_space_operations secretmem_aops = {
.dirty_folio = noop_dirty_folio,
.free_folio = secretmem_free_folio,
.migrate_folio = secretmem_migrate_folio,
@@ -205,6 +200,7 @@ static struct file *secretmem_file_create(unsigned long flags)
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
+ mapping_set_no_direct_map(inode->i_mapping);
inode->i_op = &secretmem_iops;
inode->i_mapping->a_ops = &secretmem_aops;
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* Re: [PATCH v9 03/13] mm: introduce AS_NO_DIRECT_MAP
2026-01-14 13:45 ` [PATCH v9 03/13] mm: introduce AS_NO_DIRECT_MAP Kalyazin, Nikita
@ 2026-01-15 21:42 ` Ackerley Tng
0 siblings, 0 replies; 62+ messages in thread
From: Ackerley Tng @ 2026-01-15 21:42 UTC (permalink / raw)
To: Kalyazin, Nikita, kvm, linux-doc, linux-kernel, linux-arm-kernel,
kvmarm, linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
"Kalyazin, Nikita" <kalyazin@amazon.co.uk> writes:
> From: Patrick Roy <patrick.roy@linux.dev>
>
> Add AS_NO_DIRECT_MAP for mappings where direct map entries of folios are
> set to not present. Currently, mappings that match this description are
> secretmem mappings (memfd_secret()). Later, some guest_memfd
> configurations will also fall into this category.
>
> Reject this new type of mappings in all locations that currently reject
> secretmem mappings, on the assumption that if secretmem mappings are
> rejected somewhere, it is precisely because of an inability to deal with
> folios without direct map entries, and then make memfd_secret() use
> AS_NO_DIRECT_MAP on its address_space to drop its special
> vma_is_secretmem()/secretmem_mapping() checks.
>
> Use a new flag instead of overloading AS_INACCESSIBLE (which is already
> set by guest_memfd) because not all guest_memfd mappings will end up
> being direct map removed (e.g. in pKVM setups, parts of guest_memfd that
> can be mapped to userspace should also be GUP-able, and generally not
> have restrictions on who can access it).
>
> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
> Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
> ---
> include/linux/pagemap.h | 16 ++++++++++++++++
> include/linux/secretmem.h | 18 ------------------
> lib/buildid.c | 4 ++--
> mm/gup.c | 10 +++++-----
> mm/mlock.c | 2 +-
> mm/secretmem.c | 8 ++------
> 6 files changed, 26 insertions(+), 32 deletions(-)
>
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 31a848485ad9..6ce7301d474a 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -210,6 +210,7 @@ enum mapping_flags {
> AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9,
> AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't
> account usage to user cgroups */
> + AS_NO_DIRECT_MAP = 11, /* Folios in the mapping are not in the direct map */
> /* Bits 16-25 are used for FOLIO_ORDER */
> AS_FOLIO_ORDER_BITS = 5,
> AS_FOLIO_ORDER_MIN = 16,
> @@ -345,6 +346,21 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres
> return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
> }
>
> +static inline void mapping_set_no_direct_map(struct address_space *mapping)
> +{
> + set_bit(AS_NO_DIRECT_MAP, &mapping->flags);
> +}
> +
> +static inline bool mapping_no_direct_map(const struct address_space *mapping)
> +{
> + return test_bit(AS_NO_DIRECT_MAP, &mapping->flags);
> +}
> +
> +static inline bool vma_has_no_direct_map(const struct vm_area_struct *vma)
> +{
> + return vma->vm_file && mapping_no_direct_map(vma->vm_file->f_mapping);
> +}
> +
> static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
> {
> return mapping->gfp_mask;
> diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
> index e918f96881f5..0ae1fb057b3d 100644
> --- a/include/linux/secretmem.h
> +++ b/include/linux/secretmem.h
> @@ -4,28 +4,10 @@
>
> #ifdef CONFIG_SECRETMEM
>
> -extern const struct address_space_operations secretmem_aops;
> -
> -static inline bool secretmem_mapping(struct address_space *mapping)
> -{
> - return mapping->a_ops == &secretmem_aops;
> -}
> -
> -bool vma_is_secretmem(struct vm_area_struct *vma);
> bool secretmem_active(void);
>
> #else
>
> -static inline bool vma_is_secretmem(struct vm_area_struct *vma)
> -{
> - return false;
> -}
> -
> -static inline bool secretmem_mapping(struct address_space *mapping)
> -{
> - return false;
> -}
> -
> static inline bool secretmem_active(void)
> {
> return false;
> diff --git a/lib/buildid.c b/lib/buildid.c
> index aaf61dfc0919..b78fe5797e9c 100644
> --- a/lib/buildid.c
> +++ b/lib/buildid.c
> @@ -46,8 +46,8 @@ static int freader_get_folio(struct freader *r, loff_t file_off)
>
> freader_put_folio(r);
>
> - /* reject secretmem folios created with memfd_secret() */
> - if (secretmem_mapping(r->file->f_mapping))
> + /* reject folios without direct map entries (e.g. from memfd_secret() or guest_memfd()) */
> + if (mapping_no_direct_map(r->file->f_mapping))
> return -EFAULT;
>
> r->folio = filemap_get_folio(r->file->f_mapping, file_off >> PAGE_SHIFT);
> diff --git a/mm/gup.c b/mm/gup.c
> index 9cad53acbc99..11461a54b3ae 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -11,7 +11,6 @@
> #include <linux/rmap.h>
> #include <linux/swap.h>
> #include <linux/swapops.h>
> -#include <linux/secretmem.h>
>
> #include <linux/sched/signal.h>
> #include <linux/rwsem.h>
> @@ -1216,7 +1215,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
> if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
> return -EOPNOTSUPP;
>
> - if (vma_is_secretmem(vma))
> + if (vma_has_no_direct_map(vma))
> return -EFAULT;
>
> if (write) {
> @@ -2724,7 +2723,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
> * This call assumes the caller has pinned the folio, that the lowest page table
> * level still points to this folio, and that interrupts have been disabled.
> *
> - * GUP-fast must reject all secretmem folios.
> + * GUP-fast must reject all folios without direct map entries (such as secretmem).
> *
> * Writing to pinned file-backed dirty tracked folios is inherently problematic
> * (see comment describing the writable_file_mapping_allowed() function). We
> @@ -2753,7 +2752,7 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
> if (WARN_ON_ONCE(folio_test_slab(folio)))
> return false;
>
> - /* hugetlb neither requires dirty-tracking nor can be secretmem. */
> + /* hugetlb neither requires dirty-tracking nor can be without direct map. */
> if (folio_test_hugetlb(folio))
> return true;
>
> @@ -2791,8 +2790,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
> * At this point, we know the mapping is non-null and points to an
> * address_space object.
> */
> - if (secretmem_mapping(mapping))
> + if (mapping_no_direct_map(mapping))
> return false;
> +
> /* The only remaining allowed file system is shmem. */
> return !reject_file_backed || shmem_mapping(mapping);
> }
> diff --git a/mm/mlock.c b/mm/mlock.c
> index 2f699c3497a5..a6f4b3df4f3f 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -474,7 +474,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
>
> if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
> is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
> - vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE))
> + vma_is_dax(vma) || vma_has_no_direct_map(vma) || (oldflags & VM_DROPPABLE))
> /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
> goto out;
>
> diff --git a/mm/secretmem.c b/mm/secretmem.c
> index edf111e0a1bb..560cdbe1fe5d 100644
> --- a/mm/secretmem.c
> +++ b/mm/secretmem.c
> @@ -134,11 +134,6 @@ static int secretmem_mmap_prepare(struct vm_area_desc *desc)
> return 0;
> }
>
> -bool vma_is_secretmem(struct vm_area_struct *vma)
> -{
> - return vma->vm_ops == &secretmem_vm_ops;
> -}
> -
> static const struct file_operations secretmem_fops = {
> .release = secretmem_release,
> .mmap_prepare = secretmem_mmap_prepare,
> @@ -156,7 +151,7 @@ static void secretmem_free_folio(struct folio *folio)
> folio_zero_segment(folio, 0, folio_size(folio));
> }
>
> -const struct address_space_operations secretmem_aops = {
> +static const struct address_space_operations secretmem_aops = {
> .dirty_folio = noop_dirty_folio,
> .free_folio = secretmem_free_folio,
> .migrate_folio = secretmem_migrate_folio,
> @@ -205,6 +200,7 @@ static struct file *secretmem_file_create(unsigned long flags)
>
> mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> mapping_set_unevictable(inode->i_mapping);
> + mapping_set_no_direct_map(inode->i_mapping);
>
> inode->i_op = &secretmem_iops;
> inode->i_mapping->a_ops = &secretmem_aops;
> --
> 2.50.1
Thanks also for the cleanups!
Reviewed-by: Ackerley Tng <ackerleytng@google.com>
^ permalink raw reply [flat|nested] 62+ messages in thread
* [PATCH v9 04/13] KVM: guest_memfd: Add stub for kvm_arch_gmem_invalidate
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
` (2 preceding siblings ...)
2026-01-14 13:45 ` [PATCH v9 03/13] mm: introduce AS_NO_DIRECT_MAP Kalyazin, Nikita
@ 2026-01-14 13:45 ` Kalyazin, Nikita
2026-01-15 21:47 ` Ackerley Tng
2026-01-14 13:46 ` [PATCH v9 05/13] KVM: x86: define kvm_arch_gmem_supports_no_direct_map() Kalyazin, Nikita
` (8 subsequent siblings)
12 siblings, 1 reply; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:45 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Patrick Roy <patrick.roy@linux.dev>
Add a no-op stub for kvm_arch_gmem_invalidate if
CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE=n. This allows defining
kvm_gmem_free_folio without ifdef-ery, which allows more cleanly using
guest_memfd's free_folio callback for non-arch-invalidation related
code.
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
include/linux/kvm_host.h | 2 ++
virt/kvm/guest_memfd.c | 4 ----
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d93f75b05ae2..27796a09d29b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2589,6 +2589,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
+#else
+static inline void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) { }
#endif
#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index fdaea3422c30..92e7f8c1f303 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -527,7 +527,6 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
return MF_DELAYED;
}
-#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
struct page *page = folio_page(folio, 0);
@@ -536,15 +535,12 @@ static void kvm_gmem_free_folio(struct folio *folio)
kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
-#endif
static const struct address_space_operations kvm_gmem_aops = {
.dirty_folio = noop_dirty_folio,
.migrate_folio = kvm_gmem_migrate_folio,
.error_remove_folio = kvm_gmem_error_folio,
-#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
.free_folio = kvm_gmem_free_folio,
-#endif
};
static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* Re: [PATCH v9 04/13] KVM: guest_memfd: Add stub for kvm_arch_gmem_invalidate
2026-01-14 13:45 ` [PATCH v9 04/13] KVM: guest_memfd: Add stub for kvm_arch_gmem_invalidate Kalyazin, Nikita
@ 2026-01-15 21:47 ` Ackerley Tng
0 siblings, 0 replies; 62+ messages in thread
From: Ackerley Tng @ 2026-01-15 21:47 UTC (permalink / raw)
To: Kalyazin, Nikita, kvm, linux-doc, linux-kernel, linux-arm-kernel,
kvmarm, linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
"Kalyazin, Nikita" <kalyazin@amazon.co.uk> writes:
> From: Patrick Roy <patrick.roy@linux.dev>
>
> Add a no-op stub for kvm_arch_gmem_invalidate if
> CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE=n. This allows defining
> kvm_gmem_free_folio without ifdef-ery, which allows more cleanly using
> guest_memfd's free_folio callback for non-arch-invalidation related
> code.
>
> Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
> Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
> ---
> include/linux/kvm_host.h | 2 ++
> virt/kvm/guest_memfd.c | 4 ----
> 2 files changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index d93f75b05ae2..27796a09d29b 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2589,6 +2589,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages
>
> #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
> +#else
> +static inline void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) { }
> #endif
>
> #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index fdaea3422c30..92e7f8c1f303 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -527,7 +527,6 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
> return MF_DELAYED;
> }
>
> -#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> static void kvm_gmem_free_folio(struct folio *folio)
> {
> struct page *page = folio_page(folio, 0);
> @@ -536,15 +535,12 @@ static void kvm_gmem_free_folio(struct folio *folio)
>
> kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
> }
> -#endif
>
> static const struct address_space_operations kvm_gmem_aops = {
> .dirty_folio = noop_dirty_folio,
> .migrate_folio = kvm_gmem_migrate_folio,
> .error_remove_folio = kvm_gmem_error_folio,
> -#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> .free_folio = kvm_gmem_free_folio,
> -#endif
> };
>
> static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
> --
> 2.50.1
Like this change, thanks!
Reviewed-by: Ackerley Tng <ackerleytng@google.com>
^ permalink raw reply [flat|nested] 62+ messages in thread
* [PATCH v9 05/13] KVM: x86: define kvm_arch_gmem_supports_no_direct_map()
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
` (3 preceding siblings ...)
2026-01-14 13:45 ` [PATCH v9 04/13] KVM: guest_memfd: Add stub for kvm_arch_gmem_invalidate Kalyazin, Nikita
@ 2026-01-14 13:46 ` Kalyazin, Nikita
2026-01-15 21:48 ` Ackerley Tng
2026-01-14 13:46 ` [PATCH v9 06/13] KVM: arm64: " Kalyazin, Nikita
` (7 subsequent siblings)
12 siblings, 1 reply; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:46 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Patrick Roy <patrick.roy@linux.dev>
x86 supports GUEST_MEMFD_FLAG_NO_DIRECT_MAP whenever direct map
modifications are possible (which is always the case).
Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
arch/x86/include/asm/kvm_host.h | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5a3bfa293e8b..68bd29a52f24 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -28,6 +28,7 @@
#include <linux/sched/vhost_task.h>
#include <linux/call_once.h>
#include <linux/atomic.h>
+#include <linux/set_memory.h>
#include <asm/apic.h>
#include <asm/pvclock-abi.h>
@@ -2481,4 +2482,12 @@ static inline bool kvm_arch_has_irq_bypass(void)
return enable_device_posted_irqs;
}
+#ifdef CONFIG_KVM_GUEST_MEMFD
+static inline bool kvm_arch_gmem_supports_no_direct_map(void)
+{
+ return can_set_direct_map();
+}
+#define kvm_arch_gmem_supports_no_direct_map kvm_arch_gmem_supports_no_direct_map
+#endif /* CONFIG_KVM_GUEST_MEMFD */
+
#endif /* _ASM_X86_KVM_HOST_H */
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* Re: [PATCH v9 05/13] KVM: x86: define kvm_arch_gmem_supports_no_direct_map()
2026-01-14 13:46 ` [PATCH v9 05/13] KVM: x86: define kvm_arch_gmem_supports_no_direct_map() Kalyazin, Nikita
@ 2026-01-15 21:48 ` Ackerley Tng
0 siblings, 0 replies; 62+ messages in thread
From: Ackerley Tng @ 2026-01-15 21:48 UTC (permalink / raw)
To: Kalyazin, Nikita, kvm, linux-doc, linux-kernel, linux-arm-kernel,
kvmarm, linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
"Kalyazin, Nikita" <kalyazin@amazon.co.uk> writes:
> From: Patrick Roy <patrick.roy@linux.dev>
>
> x86 supports GUEST_MEMFD_FLAG_NO_DIRECT_MAP whenever direct map
> modifications are possible (which is always the case).
>
> Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
> ---
> arch/x86/include/asm/kvm_host.h | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 5a3bfa293e8b..68bd29a52f24 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -28,6 +28,7 @@
> #include <linux/sched/vhost_task.h>
> #include <linux/call_once.h>
> #include <linux/atomic.h>
> +#include <linux/set_memory.h>
>
> #include <asm/apic.h>
> #include <asm/pvclock-abi.h>
> @@ -2481,4 +2482,12 @@ static inline bool kvm_arch_has_irq_bypass(void)
> return enable_device_posted_irqs;
> }
>
> +#ifdef CONFIG_KVM_GUEST_MEMFD
> +static inline bool kvm_arch_gmem_supports_no_direct_map(void)
> +{
> + return can_set_direct_map();
> +}
> +#define kvm_arch_gmem_supports_no_direct_map kvm_arch_gmem_supports_no_direct_map
> +#endif /* CONFIG_KVM_GUEST_MEMFD */
> +
> #endif /* _ASM_X86_KVM_HOST_H */
> --
> 2.50.1
Reviewed-by: Ackerley Tng <ackerleytng@google.com>
^ permalink raw reply [flat|nested] 62+ messages in thread
* [PATCH v9 06/13] KVM: arm64: define kvm_arch_gmem_supports_no_direct_map()
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
` (4 preceding siblings ...)
2026-01-14 13:46 ` [PATCH v9 05/13] KVM: x86: define kvm_arch_gmem_supports_no_direct_map() Kalyazin, Nikita
@ 2026-01-14 13:46 ` Kalyazin, Nikita
2026-01-14 13:46 ` [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map Kalyazin, Nikita
` (6 subsequent siblings)
12 siblings, 0 replies; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:46 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Patrick Roy <patrick.roy@linux.dev>
Support for GUEST_MEMFD_FLAG_NO_DIRECT_MAP on arm64 depends on 1) direct
map manipulations at 4k granularity being possible, and 2) FEAT_S2FWB.
1) is met whenever the direct map is set up at 4k granularity (e.g. not
with huge/gigantic pages) at boottime, as due to ARM's
break-before-make semantics, breaking huge mappings into 4k mappings in
the direct map is not possible (BBM would require temporary invalidation
of the entire huge mapping, even if only a 4k subrange should be zapped,
which will probably crash the kernel). However, the current default for
rodata_full is true, which forces a 4k direct map.
2) is required to allow KVM to elide cache coherency operations when
installing stage 2 page tables, which require the direct map entry
for the newly mapped memory to be present (which it will not be,
as guest_memfd would have removed direct map entries in
kvm_gmem_get_pfn()).
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
arch/arm64/include/asm/kvm_host.h | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index ac7f970c7883..d431ca7d4fc9 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -19,6 +19,7 @@
#include <linux/maple_tree.h>
#include <linux/percpu.h>
#include <linux/psci.h>
+#include <linux/set_memory.h>
#include <asm/arch_gicv3.h>
#include <asm/barrier.h>
#include <asm/cpufeature.h>
@@ -1654,5 +1655,17 @@ static __always_inline enum fgt_group_id __fgt_reg_to_group_id(enum vcpu_sysreg
\
p; \
})
+#ifdef CONFIG_KVM_GUEST_MEMFD
+static inline bool kvm_arch_gmem_supports_no_direct_map(void)
+{
+ /*
+ * Without FWB, direct map access is needed in kvm_pgtable_stage2_map(),
+ * as it calls dcache_clean_inval_poc().
+ */
+ return can_set_direct_map() && cpus_have_final_cap(ARM64_HAS_STAGE2_FWB);
+}
+#define kvm_arch_gmem_supports_no_direct_map kvm_arch_gmem_supports_no_direct_map
+#endif /* CONFIG_KVM_GUEST_MEMFD */
+
#endif /* __ARM64_KVM_HOST_H__ */
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
` (5 preceding siblings ...)
2026-01-14 13:46 ` [PATCH v9 06/13] KVM: arm64: " Kalyazin, Nikita
@ 2026-01-14 13:46 ` Kalyazin, Nikita
2026-01-15 20:00 ` Ackerley Tng
` (2 more replies)
2026-01-14 13:46 ` [PATCH v9 08/13] KVM: selftests: load elf via bounce buffer Kalyazin, Nikita
` (5 subsequent siblings)
12 siblings, 3 replies; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:46 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Patrick Roy <patrick.roy@linux.dev>
Add GUEST_MEMFD_FLAG_NO_DIRECT_MAP flag for KVM_CREATE_GUEST_MEMFD()
ioctl. When set, guest_memfd folios will be removed from the direct map
after preparation, with direct map entries only restored when the folios
are freed.
To ensure these folios do not end up in places where the kernel cannot
deal with them, set AS_NO_DIRECT_MAP on the guest_memfd's struct
address_space if GUEST_MEMFD_FLAG_NO_DIRECT_MAP is requested.
Note that this flag causes removal of direct map entries for all
guest_memfd folios independent of whether they are "shared" or "private"
(although current guest_memfd only supports either all folios in the
"shared" state, or all folios in the "private" state if
GUEST_MEMFD_FLAG_MMAP is not set). The usecase for removing direct map
entries of also the shared parts of guest_memfd is a special type of
non-CoCo VM where host userspace is trusted to have access to all of
guest memory, but where Spectre-style transient execution attacks
through the host kernel's direct map should still be mitigated. In this
setup, KVM retains access to guest memory via userspace mappings of
guest_memfd, which are reflected back into KVM's memslots via
userspace_addr. This is needed for things like MMIO emulation on x86_64
to work.
Direct map entries are zapped right before guest or userspace mappings
of gmem folios are set up, e.g. in kvm_gmem_fault_user_mapping() or
kvm_gmem_get_pfn() [called from the KVM MMU code]. The only place where
a gmem folio can be allocated without being mapped anywhere is
kvm_gmem_populate(), where handling potential failures of direct map
removal is not possible (by the time direct map removal is attempted,
the folio is already marked as prepared, meaning attempting to re-try
kvm_gmem_populate() would just result in -EEXIST without fixing up the
direct map state). These folios are then removed from the direct map
upon kvm_gmem_get_pfn(), e.g. when they are mapped into the guest later.
Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
Documentation/virt/kvm/api.rst | 22 ++++++++------
include/linux/kvm_host.h | 12 ++++++++
include/uapi/linux/kvm.h | 1 +
virt/kvm/guest_memfd.c | 54 ++++++++++++++++++++++++++++++++++
4 files changed, 80 insertions(+), 9 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 01a3abef8abb..c5f54f1370c8 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6440,15 +6440,19 @@ a single guest_memfd file, but the bound ranges must not overlap).
The capability KVM_CAP_GUEST_MEMFD_FLAGS enumerates the `flags` that can be
specified via KVM_CREATE_GUEST_MEMFD. Currently defined flags:
- ============================ ================================================
- GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file
- descriptor.
- GUEST_MEMFD_FLAG_INIT_SHARED Make all memory in the file shared during
- KVM_CREATE_GUEST_MEMFD (memory files created
- without INIT_SHARED will be marked private).
- Shared memory can be faulted into host userspace
- page tables. Private memory cannot.
- ============================ ================================================
+ ============================== ================================================
+ GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file
+ descriptor.
+ GUEST_MEMFD_FLAG_INIT_SHARED Make all memory in the file shared during
+ KVM_CREATE_GUEST_MEMFD (memory files created
+ without INIT_SHARED will be marked private).
+ Shared memory can be faulted into host userspace
+ page tables. Private memory cannot.
+ GUEST_MEMFD_FLAG_NO_DIRECT_MAP The guest_memfd instance will behave similarly
+ to memfd_secret, and unmaps the memory backing
+ it from the kernel's address space before
+ being passed off to userspace or the guest.
+ ============================== ================================================
When the KVM MMU performs a PFN lookup to service a guest fault and the backing
guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 27796a09d29b..d4d5306075bf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -738,10 +738,22 @@ static inline u64 kvm_gmem_get_supported_flags(struct kvm *kvm)
if (!kvm || kvm_arch_supports_gmem_init_shared(kvm))
flags |= GUEST_MEMFD_FLAG_INIT_SHARED;
+ if (kvm_arch_gmem_supports_no_direct_map())
+ flags |= GUEST_MEMFD_FLAG_NO_DIRECT_MAP;
+
return flags;
}
#endif
+#ifdef CONFIG_KVM_GUEST_MEMFD
+#ifndef kvm_arch_gmem_supports_no_direct_map
+static inline bool kvm_arch_gmem_supports_no_direct_map(void)
+{
+ return false;
+}
+#endif
+#endif /* CONFIG_KVM_GUEST_MEMFD */
+
#ifndef kvm_arch_has_readonly_mem
static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm)
{
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index dddb781b0507..60341e1ba1be 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1612,6 +1612,7 @@ struct kvm_memory_attributes {
#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0)
#define GUEST_MEMFD_FLAG_INIT_SHARED (1ULL << 1)
+#define GUEST_MEMFD_FLAG_NO_DIRECT_MAP (1ULL << 2)
struct kvm_create_guest_memfd {
__u64 size;
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 92e7f8c1f303..43f64c11467a 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -7,6 +7,9 @@
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
+#include <linux/set_memory.h>
+
+#include <asm/tlbflush.h>
#include "kvm_mm.h"
@@ -76,6 +79,43 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
return 0;
}
+#define KVM_GMEM_FOLIO_NO_DIRECT_MAP BIT(0)
+
+static bool kvm_gmem_folio_no_direct_map(struct folio *folio)
+{
+ return ((u64) folio->private) & KVM_GMEM_FOLIO_NO_DIRECT_MAP;
+}
+
+static int kvm_gmem_folio_zap_direct_map(struct folio *folio)
+{
+ u64 gmem_flags = GMEM_I(folio_inode(folio))->flags;
+ int r = 0;
+
+ if (kvm_gmem_folio_no_direct_map(folio) || !(gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP))
+ goto out;
+
+ folio->private = (void *)((u64)folio->private | KVM_GMEM_FOLIO_NO_DIRECT_MAP);
+ r = folio_zap_direct_map(folio);
+
+out:
+ return r;
+}
+
+static void kvm_gmem_folio_restore_direct_map(struct folio *folio)
+{
+ /*
+ * Direct map restoration cannot fail, as the only error condition
+ * for direct map manipulation is failure to allocate page tables
+ * when splitting huge pages, but this split would have already
+ * happened in folio_zap_direct_map() in kvm_gmem_folio_zap_direct_map().
+ * Thus folio_restore_direct_map() here only updates prot bits.
+ */
+ if (kvm_gmem_folio_no_direct_map(folio)) {
+ WARN_ON_ONCE(folio_restore_direct_map(folio));
+ folio->private = (void *)((u64)folio->private & ~KVM_GMEM_FOLIO_NO_DIRECT_MAP);
+ }
+}
+
static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
folio_mark_uptodate(folio);
@@ -398,6 +438,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
struct inode *inode = file_inode(vmf->vma->vm_file);
struct folio *folio;
vm_fault_t ret = VM_FAULT_LOCKED;
+ int err;
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
@@ -423,6 +464,12 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
kvm_gmem_mark_prepared(folio);
}
+ err = kvm_gmem_folio_zap_direct_map(folio);
+ if (err) {
+ ret = vmf_error(err);
+ goto out_folio;
+ }
+
vmf->page = folio_file_page(folio, vmf->pgoff);
out_folio:
@@ -533,6 +580,8 @@ static void kvm_gmem_free_folio(struct folio *folio)
kvm_pfn_t pfn = page_to_pfn(page);
int order = folio_order(folio);
+ kvm_gmem_folio_restore_direct_map(folio);
+
kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
@@ -596,6 +645,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
/* Unmovable mappings are supposed to be marked unevictable as well. */
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+ if (flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)
+ mapping_set_no_direct_map(inode->i_mapping);
+
GMEM_I(inode)->flags = flags;
file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
@@ -807,6 +859,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
if (!is_prepared)
r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
+ kvm_gmem_folio_zap_direct_map(folio);
+
folio_unlock(folio);
if (!r)
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-14 13:46 ` [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map Kalyazin, Nikita
@ 2026-01-15 20:00 ` Ackerley Tng
2026-01-16 14:56 ` Nikita Kalyazin
2026-01-15 23:04 ` Edgecombe, Rick P
2026-01-16 0:00 ` Edgecombe, Rick P
2 siblings, 1 reply; 62+ messages in thread
From: Ackerley Tng @ 2026-01-15 20:00 UTC (permalink / raw)
To: Kalyazin, Nikita, kvm, linux-doc, linux-kernel, linux-arm-kernel,
kvmarm, linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
"Kalyazin, Nikita" <kalyazin@amazon.co.uk> writes:
> From: Patrick Roy <patrick.roy@linux.dev>
>
> Add GUEST_MEMFD_FLAG_NO_DIRECT_MAP flag for KVM_CREATE_GUEST_MEMFD()
> ioctl. When set, guest_memfd folios will be removed from the direct map
> after preparation, with direct map entries only restored when the folios
> are freed.
>
> To ensure these folios do not end up in places where the kernel cannot
> deal with them, set AS_NO_DIRECT_MAP on the guest_memfd's struct
> address_space if GUEST_MEMFD_FLAG_NO_DIRECT_MAP is requested.
>
> Note that this flag causes removal of direct map entries for all
> guest_memfd folios independent of whether they are "shared" or "private"
> (although current guest_memfd only supports either all folios in the
> "shared" state, or all folios in the "private" state if
> GUEST_MEMFD_FLAG_MMAP is not set). The usecase for removing direct map
> entries of also the shared parts of guest_memfd are a special type of
> non-CoCo VM where, host userspace is trusted to have access to all of
> guest memory, but where Spectre-style transient execution attacks
> through the host kernel's direct map should still be mitigated. In this
> setup, KVM retains access to guest memory via userspace mappings of
> guest_memfd, which are reflected back into KVM's memslots via
> userspace_addr. This is needed for things like MMIO emulation on x86_64
> to work.
>
> Direct map entries are zapped right before guest or userspace mappings
> of gmem folios are set up, e.g. in kvm_gmem_fault_user_mapping() or
> kvm_gmem_get_pfn() [called from the KVM MMU code]. The only place where
> a gmem folio can be allocated without being mapped anywhere is
> kvm_gmem_populate(), where handling potential failures of direct map
> removal is not possible (by the time direct map removal is attempted,
> the folio is already marked as prepared, meaning attempting to re-try
> kvm_gmem_populate() would just result in -EEXIST without fixing up the
> direct map state). These folios are then removed form the direct map
> upon kvm_gmem_get_pfn(), e.g. when they are mapped into the guest later.
>
> Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
> ---
> Documentation/virt/kvm/api.rst | 22 ++++++++------
> include/linux/kvm_host.h | 12 ++++++++
> include/uapi/linux/kvm.h | 1 +
> virt/kvm/guest_memfd.c | 54 ++++++++++++++++++++++++++++++++++
> 4 files changed, 80 insertions(+), 9 deletions(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 01a3abef8abb..c5f54f1370c8 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6440,15 +6440,19 @@ a single guest_memfd file, but the bound ranges must not overlap).
> The capability KVM_CAP_GUEST_MEMFD_FLAGS enumerates the `flags` that can be
> specified via KVM_CREATE_GUEST_MEMFD. Currently defined flags:
>
> - ============================ ================================================
> - GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file
> - descriptor.
> - GUEST_MEMFD_FLAG_INIT_SHARED Make all memory in the file shared during
> - KVM_CREATE_GUEST_MEMFD (memory files created
> - without INIT_SHARED will be marked private).
> - Shared memory can be faulted into host userspace
> - page tables. Private memory cannot.
> - ============================ ================================================
> + ============================== ================================================
> + GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file
> + descriptor.
> + GUEST_MEMFD_FLAG_INIT_SHARED Make all memory in the file shared during
> + KVM_CREATE_GUEST_MEMFD (memory files created
> + without INIT_SHARED will be marked private).
> + Shared memory can be faulted into host userspace
> + page tables. Private memory cannot.
> + GUEST_MEMFD_FLAG_NO_DIRECT_MAP The guest_memfd instance will behave similarly
> + to memfd_secret, and unmaps the memory backing
Perhaps the reference to memfd_secret can be dropped to avoid anyone
assuming further similarities between guest_memfd and memfd_secret. This
could just say that "The guest_memfd instance will unmap the memory
backing it from the kernel's address space...".
> + it from the kernel's address space before
> + being passed off to userspace or the guest.
> + ============================== ================================================
>
> When the KVM MMU performs a PFN lookup to service a guest fault and the backing
> guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 27796a09d29b..d4d5306075bf 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -738,10 +738,22 @@ static inline u64 kvm_gmem_get_supported_flags(struct kvm *kvm)
> if (!kvm || kvm_arch_supports_gmem_init_shared(kvm))
> flags |= GUEST_MEMFD_FLAG_INIT_SHARED;
>
> + if (kvm_arch_gmem_supports_no_direct_map())
> + flags |= GUEST_MEMFD_FLAG_NO_DIRECT_MAP;
> +
> return flags;
> }
> #endif
>
> +#ifdef CONFIG_KVM_GUEST_MEMFD
> +#ifndef kvm_arch_gmem_supports_no_direct_map
> +static inline bool kvm_arch_gmem_supports_no_direct_map(void)
> +{
> + return false;
> +}
> +#endif
> +#endif /* CONFIG_KVM_GUEST_MEMFD */
> +
> #ifndef kvm_arch_has_readonly_mem
> static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm)
> {
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index dddb781b0507..60341e1ba1be 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1612,6 +1612,7 @@ struct kvm_memory_attributes {
> #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
> #define GUEST_MEMFD_FLAG_MMAP (1ULL << 0)
> #define GUEST_MEMFD_FLAG_INIT_SHARED (1ULL << 1)
> +#define GUEST_MEMFD_FLAG_NO_DIRECT_MAP (1ULL << 2)
>
> struct kvm_create_guest_memfd {
> __u64 size;
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 92e7f8c1f303..43f64c11467a 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -7,6 +7,9 @@
> #include <linux/mempolicy.h>
> #include <linux/pseudo_fs.h>
> #include <linux/pagemap.h>
> +#include <linux/set_memory.h>
> +
> +#include <asm/tlbflush.h>
>
> #include "kvm_mm.h"
>
> @@ -76,6 +79,43 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
> return 0;
> }
>
> +#define KVM_GMEM_FOLIO_NO_DIRECT_MAP BIT(0)
> +
> +static bool kvm_gmem_folio_no_direct_map(struct folio *folio)
> +{
> + return ((u64) folio->private) & KVM_GMEM_FOLIO_NO_DIRECT_MAP;
Nit: I think there shouldn't be a space between (u64) and what's being casted.
> +}
> +
> +static int kvm_gmem_folio_zap_direct_map(struct folio *folio)
> +{
> + u64 gmem_flags = GMEM_I(folio_inode(folio))->flags;
> + int r = 0;
> +
> + if (kvm_gmem_folio_no_direct_map(folio) || !(gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP))
> + goto out;
> +
> + folio->private = (void *)((u64)folio->private | KVM_GMEM_FOLIO_NO_DIRECT_MAP);
> + r = folio_zap_direct_map(folio);
> +
> +out:
> + return r;
> +}
> +
> +static void kvm_gmem_folio_restore_direct_map(struct folio *folio)
> +{
> + /*
> + * Direct map restoration cannot fail, as the only error condition
> + * for direct map manipulation is failure to allocate page tables
> + * when splitting huge pages, but this split would have already
> + * happened in folio_zap_direct_map() in kvm_gmem_folio_zap_direct_map().
> + * Thus folio_restore_direct_map() here only updates prot bits.
> + */
Thanks for this comment :)
> + if (kvm_gmem_folio_no_direct_map(folio)) {
> + WARN_ON_ONCE(folio_restore_direct_map(folio));
> + folio->private = (void *)((u64)folio->private & ~KVM_GMEM_FOLIO_NO_DIRECT_MAP);
> + }
> +}
> +
> static inline void kvm_gmem_mark_prepared(struct folio *folio)
> {
> folio_mark_uptodate(folio);
> @@ -398,6 +438,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
> struct inode *inode = file_inode(vmf->vma->vm_file);
> struct folio *folio;
> vm_fault_t ret = VM_FAULT_LOCKED;
> + int err;
>
> if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
> return VM_FAULT_SIGBUS;
> @@ -423,6 +464,12 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
> kvm_gmem_mark_prepared(folio);
> }
>
> + err = kvm_gmem_folio_zap_direct_map(folio);
Perhaps the check for gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP should
be done here before making the call to kvm_gmem_folio_zap_direct_map()
to make it more obvious that zapping is conditional.
Perhaps also add a check for kvm_arch_gmem_supports_no_direct_map() so
this call can be completely removed by the compiler if it wasn't
compiled in.
The kvm_gmem_folio_no_direct_map() check should probably remain in
kvm_gmem_folio_zap_direct_map() since that's a "if already zapped, don't
zap again" check.
> + if (err) {
> + ret = vmf_error(err);
> + goto out_folio;
> + }
> +
> vmf->page = folio_file_page(folio, vmf->pgoff);
>
> out_folio:
> @@ -533,6 +580,8 @@ static void kvm_gmem_free_folio(struct folio *folio)
> kvm_pfn_t pfn = page_to_pfn(page);
> int order = folio_order(folio);
>
> + kvm_gmem_folio_restore_direct_map(folio);
> +
I can't decide if the kvm_gmem_folio_no_direct_map(folio) should be in
the caller or within kvm_gmem_folio_restore_direct_map(), since this
time it's a folio-specific property being checked.
Perhaps also add a check for kvm_arch_gmem_supports_no_direct_map() so
this call can be completely removed by the compiler if it wasn't
compiled in. IIUC whether the check is added in the caller or within
kvm_gmem_folio_restore_direct_map() the call can still be elided.
> kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
> }
>
> @@ -596,6 +645,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> /* Unmovable mappings are supposed to be marked unevictable as well. */
> WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>
> + if (flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)
> + mapping_set_no_direct_map(inode->i_mapping);
> +
> GMEM_I(inode)->flags = flags;
>
> file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
> @@ -807,6 +859,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> if (!is_prepared)
> r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
>
> + kvm_gmem_folio_zap_direct_map(folio);
> +
Is there a reason why errors are not handled when faulting private memory?
> folio_unlock(folio);
>
> if (!r)
> --
> 2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-15 20:00 ` Ackerley Tng
@ 2026-01-16 14:56 ` Nikita Kalyazin
2026-01-22 16:34 ` Ackerley Tng
0 siblings, 1 reply; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-16 14:56 UTC (permalink / raw)
To: Ackerley Tng, Kalyazin, Nikita, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm, linux-fsdevel, linux-mm, bpf,
linux-kselftest, kernel, linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
On 15/01/2026 20:00, Ackerley Tng wrote:
> "Kalyazin, Nikita" <kalyazin@amazon.co.uk> writes:
>
>> From: Patrick Roy <patrick.roy@linux.dev>
>>
>> Add GUEST_MEMFD_FLAG_NO_DIRECT_MAP flag for KVM_CREATE_GUEST_MEMFD()
>> ioctl. When set, guest_memfd folios will be removed from the direct map
>> after preparation, with direct map entries only restored when the folios
>> are freed.
>>
>> To ensure these folios do not end up in places where the kernel cannot
>> deal with them, set AS_NO_DIRECT_MAP on the guest_memfd's struct
>> address_space if GUEST_MEMFD_FLAG_NO_DIRECT_MAP is requested.
>>
>> Note that this flag causes removal of direct map entries for all
>> guest_memfd folios independent of whether they are "shared" or "private"
>> (although current guest_memfd only supports either all folios in the
>> "shared" state, or all folios in the "private" state if
>> GUEST_MEMFD_FLAG_MMAP is not set). The usecase for removing direct map
>> entries of also the shared parts of guest_memfd are a special type of
>> non-CoCo VM where host userspace is trusted to have access to all of
>> guest memory, but where Spectre-style transient execution attacks
>> through the host kernel's direct map should still be mitigated. In this
>> setup, KVM retains access to guest memory via userspace mappings of
>> guest_memfd, which are reflected back into KVM's memslots via
>> userspace_addr. This is needed for things like MMIO emulation on x86_64
>> to work.
>>
>> Direct map entries are zapped right before guest or userspace mappings
>> of gmem folios are set up, e.g. in kvm_gmem_fault_user_mapping() or
>> kvm_gmem_get_pfn() [called from the KVM MMU code]. The only place where
>> a gmem folio can be allocated without being mapped anywhere is
>> kvm_gmem_populate(), where handling potential failures of direct map
>> removal is not possible (by the time direct map removal is attempted,
>> the folio is already marked as prepared, meaning attempting to re-try
>> kvm_gmem_populate() would just result in -EEXIST without fixing up the
>> direct map state). These folios are then removed form the direct map
>> upon kvm_gmem_get_pfn(), e.g. when they are mapped into the guest later.
>>
>> Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
>> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
>> ---
>> Documentation/virt/kvm/api.rst | 22 ++++++++------
>> include/linux/kvm_host.h | 12 ++++++++
>> include/uapi/linux/kvm.h | 1 +
>> virt/kvm/guest_memfd.c | 54 ++++++++++++++++++++++++++++++++++
>> 4 files changed, 80 insertions(+), 9 deletions(-)
>>
>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>> index 01a3abef8abb..c5f54f1370c8 100644
>> --- a/Documentation/virt/kvm/api.rst
>> +++ b/Documentation/virt/kvm/api.rst
>> @@ -6440,15 +6440,19 @@ a single guest_memfd file, but the bound ranges must not overlap).
>> The capability KVM_CAP_GUEST_MEMFD_FLAGS enumerates the `flags` that can be
>> specified via KVM_CREATE_GUEST_MEMFD. Currently defined flags:
>>
>> - ============================ ================================================
>> - GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file
>> - descriptor.
>> - GUEST_MEMFD_FLAG_INIT_SHARED Make all memory in the file shared during
>> - KVM_CREATE_GUEST_MEMFD (memory files created
>> - without INIT_SHARED will be marked private).
>> - Shared memory can be faulted into host userspace
>> - page tables. Private memory cannot.
>> - ============================ ================================================
>> + ============================== ================================================
>> + GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file
>> + descriptor.
>> + GUEST_MEMFD_FLAG_INIT_SHARED Make all memory in the file shared during
>> + KVM_CREATE_GUEST_MEMFD (memory files created
>> + without INIT_SHARED will be marked private).
>> + Shared memory can be faulted into host userspace
>> + page tables. Private memory cannot.
>> + GUEST_MEMFD_FLAG_NO_DIRECT_MAP The guest_memfd instance will behave similarly
>> + to memfd_secret, and unmaps the memory backing
>
> Perhaps the reference to memfd_secret can be dropped to avoid anyone
> assuming further similarities between guest_memfd and memfd_secret. This
> could just say that "The guest_memfd instance will unmap the memory
> backing it from the kernel's address space...".
Agree, it may lead to confusion down the line, thanks.
>
>> + it from the kernel's address space before
>> + being passed off to userspace or the guest.
>> + ============================== ================================================
>>
>> When the KVM MMU performs a PFN lookup to service a guest fault and the backing
>> guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 27796a09d29b..d4d5306075bf 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -738,10 +738,22 @@ static inline u64 kvm_gmem_get_supported_flags(struct kvm *kvm)
>> if (!kvm || kvm_arch_supports_gmem_init_shared(kvm))
>> flags |= GUEST_MEMFD_FLAG_INIT_SHARED;
>>
>> + if (kvm_arch_gmem_supports_no_direct_map())
>> + flags |= GUEST_MEMFD_FLAG_NO_DIRECT_MAP;
>> +
>> return flags;
>> }
>> #endif
>>
>> +#ifdef CONFIG_KVM_GUEST_MEMFD
>> +#ifndef kvm_arch_gmem_supports_no_direct_map
>> +static inline bool kvm_arch_gmem_supports_no_direct_map(void)
>> +{
>> + return false;
>> +}
>> +#endif
>> +#endif /* CONFIG_KVM_GUEST_MEMFD */
>> +
>> #ifndef kvm_arch_has_readonly_mem
>> static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm)
>> {
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index dddb781b0507..60341e1ba1be 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -1612,6 +1612,7 @@ struct kvm_memory_attributes {
>> #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
>> #define GUEST_MEMFD_FLAG_MMAP (1ULL << 0)
>> #define GUEST_MEMFD_FLAG_INIT_SHARED (1ULL << 1)
>> +#define GUEST_MEMFD_FLAG_NO_DIRECT_MAP (1ULL << 2)
>>
>> struct kvm_create_guest_memfd {
>> __u64 size;
>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> index 92e7f8c1f303..43f64c11467a 100644
>> --- a/virt/kvm/guest_memfd.c
>> +++ b/virt/kvm/guest_memfd.c
>> @@ -7,6 +7,9 @@
>> #include <linux/mempolicy.h>
>> #include <linux/pseudo_fs.h>
>> #include <linux/pagemap.h>
>> +#include <linux/set_memory.h>
>> +
>> +#include <asm/tlbflush.h>
>>
>> #include "kvm_mm.h"
>>
>> @@ -76,6 +79,43 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
>> return 0;
>> }
>>
>> +#define KVM_GMEM_FOLIO_NO_DIRECT_MAP BIT(0)
>> +
>> +static bool kvm_gmem_folio_no_direct_map(struct folio *folio)
>> +{
>> + return ((u64) folio->private) & KVM_GMEM_FOLIO_NO_DIRECT_MAP;
>
> Nit: I think there shouldn't be a space between (u64) and what's being casted.
True, will remove.
>
>> +}
>> +
>> +static int kvm_gmem_folio_zap_direct_map(struct folio *folio)
>> +{
>> + u64 gmem_flags = GMEM_I(folio_inode(folio))->flags;
>> + int r = 0;
>> +
>> + if (kvm_gmem_folio_no_direct_map(folio) || !(gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP))
>> + goto out;
>> +
>> + folio->private = (void *)((u64)folio->private | KVM_GMEM_FOLIO_NO_DIRECT_MAP);
>> + r = folio_zap_direct_map(folio);
>> +
>> +out:
>> + return r;
>> +}
>> +
>> +static void kvm_gmem_folio_restore_direct_map(struct folio *folio)
>> +{
>> + /*
>> + * Direct map restoration cannot fail, as the only error condition
>> + * for direct map manipulation is failure to allocate page tables
>> + * when splitting huge pages, but this split would have already
>> + * happened in folio_zap_direct_map() in kvm_gmem_folio_zap_direct_map().
>> + * Thus folio_restore_direct_map() here only updates prot bits.
>> + */
>
> Thanks for this comment :)
Thanks to Patrick :)
>
>> + if (kvm_gmem_folio_no_direct_map(folio)) {
>> + WARN_ON_ONCE(folio_restore_direct_map(folio));
>> + folio->private = (void *)((u64)folio->private & ~KVM_GMEM_FOLIO_NO_DIRECT_MAP);
>> + }
>> +}
>> +
>> static inline void kvm_gmem_mark_prepared(struct folio *folio)
>> {
>> folio_mark_uptodate(folio);
>> @@ -398,6 +438,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
>> struct inode *inode = file_inode(vmf->vma->vm_file);
>> struct folio *folio;
>> vm_fault_t ret = VM_FAULT_LOCKED;
>> + int err;
>>
>> if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
>> return VM_FAULT_SIGBUS;
>> @@ -423,6 +464,12 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
>> kvm_gmem_mark_prepared(folio);
>> }
>>
>> + err = kvm_gmem_folio_zap_direct_map(folio);
>
> Perhaps the check for gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP should
> be done here before making the call to kvm_gmem_folio_zap_direct_map()
> to make it more obvious that zapping is conditional.
Makes sense to me.
>
> Perhaps also add a check for kvm_arch_gmem_supports_no_direct_map() so
> this call can be completely removed by the compiler if it wasn't
> compiled in.
But if it is compiled in, we will be paying the cost of the call on
every page fault? Eg on arm64, it will call the following:
bool can_set_direct_map(void)
{
...
return rodata_full || debug_pagealloc_enabled() ||
arm64_kfence_can_set_direct_map() || is_realm_world();
}
>
> The kvm_gmem_folio_no_direct_map() check should probably remain in
> kvm_gmem_folio_zap_direct_map() since that's a "if already zapped, don't
> zap again" check.
>
>> + if (err) {
>> + ret = vmf_error(err);
>> + goto out_folio;
>> + }
>> +
>> vmf->page = folio_file_page(folio, vmf->pgoff);
>>
>> out_folio:
>> @@ -533,6 +580,8 @@ static void kvm_gmem_free_folio(struct folio *folio)
>> kvm_pfn_t pfn = page_to_pfn(page);
>> int order = folio_order(folio);
>>
>> + kvm_gmem_folio_restore_direct_map(folio);
>> +
>
> I can't decide if the kvm_gmem_folio_no_direct_map(folio) should be in
> the caller or within kvm_gmem_folio_restore_direct_map(), since this
> time it's a folio-specific property being checked.
I'm tempted to keep it similar to the kvm_gmem_folio_zap_direct_map()
case. How does the fact it's a folio-specific property change your
reasoning?
>
> Perhaps also add a check for kvm_arch_gmem_supports_no_direct_map() so
> this call can be completely removed by the compiler if it wasn't
> compiled in. IIUC whether the check is added in the caller or within
> kvm_gmem_folio_restore_direct_map() the call can still be elided.
Same concern as the above about kvm_gmem_folio_zap_direct_map(), ie the
performance of the case where kvm_arch_gmem_supports_no_direct_map() exists.
>
>> kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
>> }
>>
>> @@ -596,6 +645,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>> /* Unmovable mappings are supposed to be marked unevictable as well. */
>> WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>>
>> + if (flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)
>> + mapping_set_no_direct_map(inode->i_mapping);
>> +
>> GMEM_I(inode)->flags = flags;
>>
>> file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
>> @@ -807,6 +859,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>> if (!is_prepared)
>> r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
>>
>> + kvm_gmem_folio_zap_direct_map(folio);
>> +
>
> Is there a reason why errors are not handled when faulting private memory?
No, I can't see a reason. Will add a check, thanks.
>
>> folio_unlock(folio);
>>
>> if (!r)
>> --
>> 2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 14:56 ` Nikita Kalyazin
@ 2026-01-22 16:34 ` Ackerley Tng
2026-01-22 18:04 ` Nikita Kalyazin
0 siblings, 1 reply; 62+ messages in thread
From: Ackerley Tng @ 2026-01-22 16:34 UTC (permalink / raw)
To: kalyazin, Kalyazin, Nikita, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm, linux-fsdevel, linux-mm, bpf,
linux-kselftest, kernel, linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
Nikita Kalyazin <kalyazin@amazon.com> writes:
Was preparing the reply but couldn't get to it before the
meeting. Here's what was also discussed at the guest_memfd biweekly on
2026-01-22:
>
> [...snip...]
>
>>> @@ -423,6 +464,12 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
>>> kvm_gmem_mark_prepared(folio);
>>> }
>>>
>>> + err = kvm_gmem_folio_zap_direct_map(folio);
>>
>> Perhaps the check for gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP should
>> be done here before making the call to kvm_gmem_folio_zap_direct_map()
>> to make it more obvious that zapping is conditional.
>
> Makes sense to me.
>
>>
>> Perhaps also add a check for kvm_arch_gmem_supports_no_direct_map() so
>> this call can be completely removed by the compiler if it wasn't
>> compiled in.
>
> But if it is compiled in, we will be paying the cost of the call on
> every page fault? Eg on arm64, it will call the following:
>
> bool can_set_direct_map(void)
> {
>
> ...
>
> return rodata_full || debug_pagealloc_enabled() ||
> arm64_kfence_can_set_direct_map() || is_realm_world();
> }
>
You're right that this could end up paying the cost on every page
fault. Please ignore this request!
>>
>> The kvm_gmem_folio_no_direct_map() check should probably remain in
>> kvm_gmem_folio_zap_direct_map() since that's a "if already zapped, don't
>> zap again" check.
>>
>>> + if (err) {
>>> + ret = vmf_error(err);
>>> + goto out_folio;
>>> + }
>>> +
>>> vmf->page = folio_file_page(folio, vmf->pgoff);
>>>
>>> out_folio:
>>> @@ -533,6 +580,8 @@ static void kvm_gmem_free_folio(struct folio *folio)
>>> kvm_pfn_t pfn = page_to_pfn(page);
>>> int order = folio_order(folio);
>>>
>>> + kvm_gmem_folio_restore_direct_map(folio);
>>> +
>>
>> I can't decide if the kvm_gmem_folio_no_direct_map(folio) should be in
>> the caller or within kvm_gmem_folio_restore_direct_map(), since this
>> time it's a folio-specific property being checked.
>
> I'm tempted to keep it similar to the kvm_gmem_folio_zap_direct_map()
> case. How does the fact it's a folio-specific property change your
> reasoning?
>
This is good too:
if (kvm_gmem_folio_no_direct_map(folio))
kvm_gmem_folio_restore_direct_map(folio)
>>
>> Perhaps also add a check for kvm_arch_gmem_supports_no_direct_map() so
>> this call can be completely removed by the compiler if it wasn't
>> compiled in. IIUC whether the check is added in the caller or within
>> kvm_gmem_folio_restore_direct_map() the call can still be elided.
>
> Same concern as the above about kvm_gmem_folio_zap_direct_map(), ie the
> performance of the case where kvm_arch_gmem_supports_no_direct_map() exists.
>
Please ignore this request!
>>
>>> kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
>>> }
>>>
>>> @@ -596,6 +645,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>>> /* Unmovable mappings are supposed to be marked unevictable as well. */
>>> WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>>>
>>> + if (flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)
>>> + mapping_set_no_direct_map(inode->i_mapping);
>>> +
>>> GMEM_I(inode)->flags = flags;
>>>
>>> file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
>>> @@ -807,6 +859,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>>> if (!is_prepared)
>>> r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
>>>
>>> + kvm_gmem_folio_zap_direct_map(folio);
>>> +
>>
>> Is there a reason why errors are not handled when faulting private memory?
>
> No, I can't see a reason. Will add a check, thanks.
>
>>
>>> folio_unlock(folio);
>>>
>>> if (!r)
>>> --
>>> 2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-22 16:34 ` Ackerley Tng
@ 2026-01-22 18:04 ` Nikita Kalyazin
2026-01-22 20:30 ` Ackerley Tng
0 siblings, 1 reply; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-22 18:04 UTC (permalink / raw)
To: Ackerley Tng, Kalyazin, Nikita, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm, linux-fsdevel, linux-mm, bpf,
linux-kselftest, kernel, linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
On 22/01/2026 16:34, Ackerley Tng wrote:
> Nikita Kalyazin <kalyazin@amazon.com> writes:
>
> Was preparing the reply but couldn't get to it before the
> meeting. Here's what was also discussed at the guest_memfd biweekly on
> 2026-01-22:
>
>>
>> [...snip...]
>>
>>>> @@ -423,6 +464,12 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
>>>> kvm_gmem_mark_prepared(folio);
>>>> }
>>>>
>>>> + err = kvm_gmem_folio_zap_direct_map(folio);
>>>
>>> Perhaps the check for gmem_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP should
>>> be done here before making the call to kvm_gmem_folio_zap_direct_map()
>>> to make it more obvious that zapping is conditional.
>>
>> Makes sense to me.
>>
>>>
>>> Perhaps also add a check for kvm_arch_gmem_supports_no_direct_map() so
>>> this call can be completely removed by the compiler if it wasn't
>>> compiled in.
>>
>> But if it is compiled in, we will be paying the cost of the call on
>> every page fault? Eg on arm64, it will call the following:
>>
>> bool can_set_direct_map(void)
>> {
>>
>> ...
>>
>> return rodata_full || debug_pagealloc_enabled() ||
>> arm64_kfence_can_set_direct_map() || is_realm_world();
>> }
>>
>
> You're right that this could end up paying the cost on every page
> fault. Please ignore this request!
>
>>>
>>> The kvm_gmem_folio_no_direct_map() check should probably remain in
>>> kvm_gmem_folio_zap_direct_map() since that's a "if already zapped, don't
>>> zap again" check.
>>>
>>>> + if (err) {
>>>> + ret = vmf_error(err);
>>>> + goto out_folio;
>>>> + }
>>>> +
>>>> vmf->page = folio_file_page(folio, vmf->pgoff);
>>>>
>>>> out_folio:
>>>> @@ -533,6 +580,8 @@ static void kvm_gmem_free_folio(struct folio *folio)
>>>> kvm_pfn_t pfn = page_to_pfn(page);
>>>> int order = folio_order(folio);
>>>>
>>>> + kvm_gmem_folio_restore_direct_map(folio);
>>>> +
>>>
>>> I can't decide if the kvm_gmem_folio_no_direct_map(folio) should be in
>>> the caller or within kvm_gmem_folio_restore_direct_map(), since this
>>> time it's a folio-specific property being checked.
>>
>> I'm tempted to keep it similar to the kvm_gmem_folio_zap_direct_map()
>> case. How does the fact it's a folio-specific property change your
>> reasoning?
>>
>
> This is good too:
>
> if (kvm_gmem_folio_no_direct_map(folio))
> kvm_gmem_folio_restore_direct_map(folio)
It turns out we can't do that because folio->mapping is gone by the time
filemap_free_folio() is called so we can't inspect the flags. Are you
ok with only having this check when zapping (but not when restoring)?
Do you think we should add a comment saying it's conditional here?
>
>>>
>>> Perhaps also add a check for kvm_arch_gmem_supports_no_direct_map() so
>>> this call can be completely removed by the compiler if it wasn't
>>> compiled in. IIUC whether the check is added in the caller or within
>>> kvm_gmem_folio_restore_direct_map() the call can still be elided.
>>
>> Same concern as the above about kvm_gmem_folio_zap_direct_map(), ie the
>> performance of the case where kvm_arch_gmem_supports_no_direct_map() exists.
>>
>
> Please ignore this request!
>
>>>
>>>> kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
>>>> }
>>>>
>>>> @@ -596,6 +645,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>>>> /* Unmovable mappings are supposed to be marked unevictable as well. */
>>>> WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>>>>
>>>> + if (flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP)
>>>> + mapping_set_no_direct_map(inode->i_mapping);
>>>> +
>>>> GMEM_I(inode)->flags = flags;
>>>>
>>>> file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
>>>> @@ -807,6 +859,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>>>> if (!is_prepared)
>>>> r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
>>>>
>>>> + kvm_gmem_folio_zap_direct_map(folio);
>>>> +
>>>
>>> Is there a reason why errors are not handled when faulting private memory?
>>
>> No, I can't see a reason. Will add a check, thanks.
>>
>>>
>>>> folio_unlock(folio);
>>>>
>>>> if (!r)
>>>> --
>>>> 2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-22 18:04 ` Nikita Kalyazin
@ 2026-01-22 20:30 ` Ackerley Tng
2026-01-22 20:40 ` Nikita Kalyazin
0 siblings, 1 reply; 62+ messages in thread
From: Ackerley Tng @ 2026-01-22 20:30 UTC (permalink / raw)
To: kalyazin, Kalyazin, Nikita, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm, linux-fsdevel, linux-mm, bpf,
linux-kselftest, kernel, linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
Nikita Kalyazin <kalyazin@amazon.com> writes:
>
> [...snip...]
>
>>>>> @@ -533,6 +580,8 @@ static void kvm_gmem_free_folio(struct folio *folio)
>>>>> kvm_pfn_t pfn = page_to_pfn(page);
>>>>> int order = folio_order(folio);
>>>>>
>>>>> + kvm_gmem_folio_restore_direct_map(folio);
>>>>> +
>>>>
>>>> I can't decide if the kvm_gmem_folio_no_direct_map(folio) should be in
>>>> the caller or within kvm_gmem_folio_restore_direct_map(), since this
>>>> time it's a folio-specific property being checked.
>>>
>>> I'm tempted to keep it similar to the kvm_gmem_folio_zap_direct_map()
>>> case. How does the fact it's a folio-speicific property change your
>>> reasoning?
>>>
>>
>> This is good too:
>>
>> if (kvm_gmem_folio_no_direct_map(folio))
>> kvm_gmem_folio_restore_direct_map(folio)
>
> It turns out we can't do that because folio->mapping is gone by the time
> filemap_free_folio() is called so we can't inspect the flags. Are you
> ok with only having this check when zapping (but not when restoring)?
> Do you think we should add a comment saying it's conditional here?
>
I thought kvm_gmem_folio_no_direct_map() only reads folio->private,
which I think should still be there at the point of
filemap_free_folio().
>>
>> [...snip...]
>>
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-22 20:30 ` Ackerley Tng
@ 2026-01-22 20:40 ` Nikita Kalyazin
0 siblings, 0 replies; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-22 20:40 UTC (permalink / raw)
To: Ackerley Tng, Kalyazin, Nikita, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm, linux-fsdevel, linux-mm, bpf,
linux-kselftest, kernel, linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
On 22/01/2026 20:30, Ackerley Tng wrote:
> Nikita Kalyazin <kalyazin@amazon.com> writes:
>
>>
>> [...snip...]
>>
>>>>>> @@ -533,6 +580,8 @@ static void kvm_gmem_free_folio(struct folio *folio)
>>>>>> kvm_pfn_t pfn = page_to_pfn(page);
>>>>>> int order = folio_order(folio);
>>>>>>
>>>>>> + kvm_gmem_folio_restore_direct_map(folio);
>>>>>> +
>>>>>
>>>>> I can't decide if the kvm_gmem_folio_no_direct_map(folio) should be in
>>>>> the caller or within kvm_gmem_folio_restore_direct_map(), since this
>>>>> time it's a folio-specific property being checked.
>>>>
>>>> I'm tempted to keep it similar to the kvm_gmem_folio_zap_direct_map()
>>>> case. How does the fact it's a folio-specific property change your
>>>> reasoning?
>>>>
>>>
>>> This is good too:
>>>
>>> if (kvm_gmem_folio_no_direct_map(folio))
>>> kvm_gmem_folio_restore_direct_map(folio)
>>
>> It turns out we can't do that because folio->mapping is gone by the time
>> filemap_free_folio() is called so we can't inspect the flags. Are you
>> ok with only having this check when zapping (but not when restoring)?
>> Do you think we should add a comment saying it's conditional here?
>>
>
> I thought kvm_gmem_folio_no_direct_map() only reads folio->private,
> which I think should still be there at the point of
> filemap_free_folio().
Oh, I misread your last reply. What you're proposing would work indeed.
>
>>>
>>> [...snip...]
>>>
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-14 13:46 ` [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map Kalyazin, Nikita
2026-01-15 20:00 ` Ackerley Tng
@ 2026-01-15 23:04 ` Edgecombe, Rick P
2026-01-16 15:02 ` Nikita Kalyazin
2026-01-16 17:30 ` Vishal Annapurve
2026-01-16 0:00 ` Edgecombe, Rick P
2 siblings, 2 replies; 62+ messages in thread
From: Edgecombe, Rick P @ 2026-01-15 23:04 UTC (permalink / raw)
To: linux-riscv, kalyazin, kernel, linux-kselftest, linux-mm,
linux-fsdevel, linux-s390, kvmarm, linux-kernel,
linux-arm-kernel, kvm, bpf, linux-doc, loongarch
Cc: david, palmer, catalin.marinas, svens, jgross, surenb, riel,
pfalcato, peterx, x86, rppt, thuth, maz, dave.hansen, ast,
vbabka, Annapurve, Vishal, borntraeger, alex, pjw, tglx, willy,
hca, wyihan, ryan.roberts, jolsa, yang, jmattson, luto,
aneesh.kumar, haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk,
jgg, hpa, song, oupton, peterz, maobibo, lorenzo.stoakes,
Liam.Howlett, jthoughton, martin.lau, jhubbard, Yu, Yu-cheng,
Jonathan.Cameron, eddyz87, yonghong.song, chenhuacai, shuah,
prsampat, kevin.brodsky, shijie, suzuki.poulose, itazur,
pbonzini, yuzenghui, dev.jain, gor, jackabt, daniel, agordeev,
andrii, mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, ackerleytng, jannh, john.fastabend, kas,
will, seanjc
On Wed, 2026-01-14 at 13:46 +0000, Kalyazin, Nikita wrote:
> Add GUEST_MEMFD_FLAG_NO_DIRECT_MAP flag for KVM_CREATE_GUEST_MEMFD()
> ioctl. When set, guest_memfd folios will be removed from the direct map
> after preparation, with direct map entries only restored when the folios
> are freed.
>
> To ensure these folios do not end up in places where the kernel cannot
> deal with them, set AS_NO_DIRECT_MAP on the guest_memfd's struct
> address_space if GUEST_MEMFD_FLAG_NO_DIRECT_MAP is requested.
>
> Note that this flag causes removal of direct map entries for all
> guest_memfd folios independent of whether they are "shared" or "private"
> (although current guest_memfd only supports either all folios in the
> "shared" state, or all folios in the "private" state if
> GUEST_MEMFD_FLAG_MMAP is not set). The usecase for removing direct map
> entries of also the shared parts of guest_memfd are a special type of
> non-CoCo VM where host userspace is trusted to have access to all of
> guest memory, but where Spectre-style transient execution attacks
> through the host kernel's direct map should still be mitigated. In this
> setup, KVM retains access to guest memory via userspace mappings of
> guest_memfd, which are reflected back into KVM's memslots via
> userspace_addr. This is needed for things like MMIO emulation on x86_64
> to work.
TDX does some clearing at the direct map mapping for pages that comes from gmem,
using a special instruction. It also does some clflushing at the direct map
address for these pages. So I think we need to make sure TDs don't pull from
gmem fds with this flag.
Not that there would be any expected use of the flag for TDs, but it could cause
a crash.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-15 23:04 ` Edgecombe, Rick P
@ 2026-01-16 15:02 ` Nikita Kalyazin
2026-01-16 15:35 ` Edgecombe, Rick P
2026-01-16 17:30 ` Vishal Annapurve
1 sibling, 1 reply; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-16 15:02 UTC (permalink / raw)
To: Edgecombe, Rick P, linux-riscv, kalyazin, kernel,
linux-kselftest, linux-mm, linux-fsdevel, linux-s390, kvmarm,
linux-kernel, linux-arm-kernel, kvm, bpf, linux-doc, loongarch
Cc: david, palmer, catalin.marinas, svens, jgross, surenb, riel,
pfalcato, peterx, x86, rppt, thuth, maz, dave.hansen, ast,
vbabka, Annapurve, Vishal, borntraeger, alex, pjw, tglx, willy,
hca, wyihan, ryan.roberts, jolsa, yang, jmattson, luto,
aneesh.kumar, haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk,
jgg, hpa, song, oupton, peterz, maobibo, lorenzo.stoakes,
Liam.Howlett, jthoughton, martin.lau, jhubbard, Yu, Yu-cheng,
Jonathan.Cameron, eddyz87, yonghong.song, chenhuacai, shuah,
prsampat, kevin.brodsky, shijie, suzuki.poulose, itazur,
pbonzini, yuzenghui, dev.jain, gor, jackabt, daniel, agordeev,
andrii, mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, ackerleytng, jannh, john.fastabend, kas,
will, seanjc
On 15/01/2026 23:04, Edgecombe, Rick P wrote:
> On Wed, 2026-01-14 at 13:46 +0000, Kalyazin, Nikita wrote:
>> Add GUEST_MEMFD_FLAG_NO_DIRECT_MAP flag for KVM_CREATE_GUEST_MEMFD()
>> ioctl. When set, guest_memfd folios will be removed from the direct map
>> after preparation, with direct map entries only restored when the folios
>> are freed.
>>
>> To ensure these folios do not end up in places where the kernel cannot
>> deal with them, set AS_NO_DIRECT_MAP on the guest_memfd's struct
>> address_space if GUEST_MEMFD_FLAG_NO_DIRECT_MAP is requested.
>>
>> Note that this flag causes removal of direct map entries for all
>> guest_memfd folios independent of whether they are "shared" or "private"
>> (although current guest_memfd only supports either all folios in the
>> "shared" state, or all folios in the "private" state if
>> GUEST_MEMFD_FLAG_MMAP is not set). The usecase for removing direct map
>> entries of also the shared parts of guest_memfd are a special type of
>> non-CoCo VM where, host userspace is trusted to have access to all of
>> guest memory, but where Spectre-style transient execution attacks
>> through the host kernel's direct map should still be mitigated. In this
>> setup, KVM retains access to guest memory via userspace mappings of
>> guest_memfd, which are reflected back into KVM's memslots via
>> userspace_addr. This is needed for things like MMIO emulation on x86_64
>> to work.
>
> TDX does some clearing at the direct map mapping for pages that comes from gmem,
> using a special instruction. It also does some clflushing at the direct map
> address for these pages. So I think we need to make sure TDs don't pull from
> gmem fds with this flag.
Would you be able to give a pointer on how we can do that? I'm not very
familiar with the TDX code.
>
> Not that there would be any expected use of the flag for TDs, but it could cause
> a crash.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 15:02 ` Nikita Kalyazin
@ 2026-01-16 15:35 ` Edgecombe, Rick P
2026-01-16 15:41 ` Sean Christopherson
0 siblings, 1 reply; 62+ messages in thread
From: Edgecombe, Rick P @ 2026-01-16 15:35 UTC (permalink / raw)
To: kalyazin, kalyazin, linux-riscv, linux-s390, linux-mm,
linux-fsdevel, linux-kselftest, kernel, kvmarm, linux-arm-kernel,
linux-kernel, kvm, bpf, loongarch, linux-doc
Cc: david, svens, catalin.marinas, palmer, jgross, surenb, vbabka,
riel, pfalcato, x86, rppt, thuth, borntraeger, maz, peterx, ast,
Annapurve, Vishal, pjw, alex, dave.hansen, tglx, hca, willy,
wyihan, ryan.roberts, yang, jolsa, jmattson, luto, aneesh.kumar,
haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk, jgg, hpa,
song, Liam.Howlett, maobibo, peterz, oupton, lorenzo.stoakes,
jhubbard, martin.lau, jthoughton, Jonathan.Cameron, Yu, Yu-cheng,
eddyz87, yonghong.song, chenhuacai, shuah, prsampat,
kevin.brodsky, shijie, itazur, suzuki.poulose, pbonzini,
dev.jain, yuzenghui, gor, jackabt, daniel, agordeev, andrii,
mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, ackerleytng, jannh, john.fastabend, kas,
will, seanjc
On Fri, 2026-01-16 at 15:02 +0000, Nikita Kalyazin wrote:
> > TDX does some clearing at the direct map mapping for pages that
> > comes from gmem, using a special instruction. It also does some
> > clflushing at the direct map address for these pages. So I think we
> > need to make sure TDs don't pull from gmem fds with this flag.
>
> Would you be able to give a pointer on how we can do that? I'm not
> very familiar with the TDX code.
Uhh, that is a good question. Let me think.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 15:35 ` Edgecombe, Rick P
@ 2026-01-16 15:41 ` Sean Christopherson
2026-01-16 17:32 ` Nikita Kalyazin
2026-01-16 17:51 ` Edgecombe, Rick P
0 siblings, 2 replies; 62+ messages in thread
From: Sean Christopherson @ 2026-01-16 15:41 UTC (permalink / raw)
To: Rick P Edgecombe
Cc: kalyazin, kalyazin, linux-riscv, linux-s390, linux-mm,
linux-fsdevel, linux-kselftest, kernel, kvmarm, linux-arm-kernel,
linux-kernel, kvm, bpf, loongarch, linux-doc, david, svens,
catalin.marinas, palmer, jgross, surenb, vbabka, riel, pfalcato,
x86, rppt, thuth, borntraeger, maz, peterx, ast,
Vishal Annapurve, pjw, alex, dave.hansen, tglx, hca, willy,
wyihan, ryan.roberts, yang, jolsa, jmattson, luto, aneesh.kumar,
haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk, jgg, hpa,
song, Liam.Howlett, maobibo, peterz, oupton, lorenzo.stoakes,
jhubbard, martin.lau, jthoughton, Jonathan.Cameron, Yu, Yu-cheng,
eddyz87, yonghong.song, chenhuacai, shuah, prsampat,
kevin.brodsky, shijie, itazur, suzuki.poulose, pbonzini,
dev.jain, yuzenghui, gor, jackabt, daniel, agordeev, andrii,
mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, ackerleytng, jannh, john.fastabend, kas,
will
On Fri, Jan 16, 2026, Rick P Edgecombe wrote:
> On Fri, 2026-01-16 at 15:02 +0000, Nikita Kalyazin wrote:
> > > TDX does some clearing at the direct map mapping for pages that
> > > comes from gmem, using a special instruction. It also does some
> > > clflushing at the direct map address for these pages. So I think we
> > > need to make sure TDs don't pull from gmem fds with this flag.
> >
> > Would you be able to give a pointer on how we can do that? I'm not
> > very familiar with the TDX code.
>
> Uhh, that is a good question. Let me think.
Pass @kvm to kvm_arch_gmem_supports_no_direct_map() and then return %false if
it's a TDX VM.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 15:41 ` Sean Christopherson
@ 2026-01-16 17:32 ` Nikita Kalyazin
2026-01-16 17:51 ` Edgecombe, Rick P
1 sibling, 0 replies; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-16 17:32 UTC (permalink / raw)
To: Sean Christopherson, Rick P Edgecombe
Cc: kalyazin, linux-riscv, linux-s390, linux-mm, linux-fsdevel,
linux-kselftest, kernel, kvmarm, linux-arm-kernel, linux-kernel,
kvm, bpf, loongarch, linux-doc, david, svens, catalin.marinas,
palmer, jgross, surenb, vbabka, riel, pfalcato, x86, rppt, thuth,
borntraeger, maz, peterx, ast, Vishal Annapurve, pjw, alex,
dave.hansen, tglx, hca, willy, wyihan, ryan.roberts, yang, jolsa,
jmattson, luto, aneesh.kumar, haoluo, patrick.roy, akpm, coxu,
mhocko, mlevitsk, jgg, hpa, song, Liam.Howlett, maobibo, peterz,
oupton, lorenzo.stoakes, jhubbard, martin.lau, jthoughton,
Jonathan.Cameron, Yu, Yu-cheng, eddyz87, yonghong.song,
chenhuacai, shuah, prsampat, kevin.brodsky, shijie, itazur,
suzuki.poulose, pbonzini, dev.jain, yuzenghui, gor, jackabt,
daniel, agordeev, andrii, mingo, aou, joey.gouly, derekmn,
xmarcalx, kpsingh, sdf, jackmanb, bp, corbet, ackerleytng, jannh,
john.fastabend, kas, will
On 16/01/2026 15:41, Sean Christopherson wrote:
> On Fri, Jan 16, 2026, Rick P Edgecombe wrote:
>> On Fri, 2026-01-16 at 15:02 +0000, Nikita Kalyazin wrote:
>>>> TDX does some clearing at the direct map mapping for pages that
>>>> comes from gmem, using a special instruction. It also does some
>>>> clflushing at the direct map address for these pages. So I think we
>>>> need to make sure TDs don't pull from gmem fds with this flag.
>>>
>>> Would you be able to give a pointer on how we can do that? I'm not
>>> very familiar with the TDX code.
>>
>> Uhh, that is a good question. Let me think.
>
> Pass @kvm to kvm_arch_gmem_supports_no_direct_map() and then return %false if
> it's a TDX VM.
Sounds good to me, thanks.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 15:41 ` Sean Christopherson
2026-01-16 17:32 ` Nikita Kalyazin
@ 2026-01-16 17:51 ` Edgecombe, Rick P
1 sibling, 0 replies; 62+ messages in thread
From: Edgecombe, Rick P @ 2026-01-16 17:51 UTC (permalink / raw)
To: seanjc
Cc: david, kvm, catalin.marinas, palmer, jgross, bpf, surenb, riel,
pfalcato, peterx, x86, rppt, thuth, borntraeger, maz, svens, ast,
vbabka, Annapurve, Vishal, pjw, alex, dave.hansen, tglx, hca,
willy, wyihan, ryan.roberts, jolsa, yang, jmattson, luto,
aneesh.kumar, haoluo, patrick.roy, linux-kernel, akpm, coxu,
mhocko, mlevitsk, linux-kselftest, jgg, loongarch, song,
jhubbard, peterz, kernel, oupton, lorenzo.stoakes, Liam.Howlett,
maobibo, martin.lau, jthoughton, Yu, Yu-cheng, kvmarm,
Jonathan.Cameron, eddyz87, hpa, yonghong.song, linux-doc, shuah,
chenhuacai, prsampat, kevin.brodsky, shijie, itazur,
suzuki.poulose, pbonzini, kalyazin, dev.jain, gor, yuzenghui,
daniel, jackabt, agordeev, andrii, mingo, linux-riscv, aou,
joey.gouly, derekmn, xmarcalx, linux-s390, kpsingh, kalyazin,
linux-arm-kernel, sdf, jackmanb, bp, corbet, linux-fsdevel,
ackerleytng, jannh, john.fastabend, kas, linux-mm, will
On Fri, 2026-01-16 at 07:41 -0800, Sean Christopherson wrote:
> Pass @kvm to kvm_arch_gmem_supports_no_direct_map() and then return
> %false if it's a TDX VM.
Thanks!
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-15 23:04 ` Edgecombe, Rick P
2026-01-16 15:02 ` Nikita Kalyazin
@ 2026-01-16 17:30 ` Vishal Annapurve
2026-01-16 17:51 ` Edgecombe, Rick P
1 sibling, 1 reply; 62+ messages in thread
From: Vishal Annapurve @ 2026-01-16 17:30 UTC (permalink / raw)
To: Edgecombe, Rick P
Cc: linux-riscv, kalyazin, kernel, linux-kselftest, linux-mm,
linux-fsdevel, linux-s390, kvmarm, linux-kernel,
linux-arm-kernel, kvm, bpf, linux-doc, loongarch, david, palmer,
catalin.marinas, svens, jgross, surenb, riel, pfalcato, peterx,
x86, rppt, thuth, maz, dave.hansen, ast, vbabka, borntraeger,
alex, pjw, tglx, willy, hca, wyihan, ryan.roberts, jolsa, yang,
jmattson, luto, aneesh.kumar, haoluo, patrick.roy, akpm, coxu,
mhocko, mlevitsk, jgg, hpa, song, oupton, peterz, maobibo,
lorenzo.stoakes, Liam.Howlett, jthoughton, martin.lau, jhubbard,
Yu, Yu-cheng, Jonathan.Cameron, eddyz87, yonghong.song,
chenhuacai, shuah, prsampat, kevin.brodsky, shijie,
suzuki.poulose, itazur, pbonzini, yuzenghui, dev.jain, gor,
jackabt, daniel, agordeev, andrii, mingo, aou, joey.gouly,
derekmn, xmarcalx, kpsingh, sdf, jackmanb, bp, corbet,
ackerleytng, jannh, john.fastabend, kas, will, seanjc
On Thu, Jan 15, 2026 at 3:04 PM Edgecombe, Rick P
<rick.p.edgecombe@intel.com> wrote:
>
> On Wed, 2026-01-14 at 13:46 +0000, Kalyazin, Nikita wrote:
> > Add GUEST_MEMFD_FLAG_NO_DIRECT_MAP flag for KVM_CREATE_GUEST_MEMFD()
> > ioctl. When set, guest_memfd folios will be removed from the direct map
> > after preparation, with direct map entries only restored when the folios
> > are freed.
> >
> > To ensure these folios do not end up in places where the kernel cannot
> > deal with them, set AS_NO_DIRECT_MAP on the guest_memfd's struct
> > address_space if GUEST_MEMFD_FLAG_NO_DIRECT_MAP is requested.
> >
> > Note that this flag causes removal of direct map entries for all
> > guest_memfd folios independent of whether they are "shared" or "private"
> > (although current guest_memfd only supports either all folios in the
> > "shared" state, or all folios in the "private" state if
> > GUEST_MEMFD_FLAG_MMAP is not set). The usecase for removing direct map
> > entries of also the shared parts of guest_memfd are a special type of
> > non-CoCo VM where, host userspace is trusted to have access to all of
> > guest memory, but where Spectre-style transient execution attacks
> > through the host kernel's direct map should still be mitigated. In this
> > setup, KVM retains access to guest memory via userspace mappings of
> > guest_memfd, which are reflected back into KVM's memslots via
> > userspace_addr. This is needed for things like MMIO emulation on x86_64
> > to work.
>
> TDX does some clearing at the direct map mapping for pages that comes from gmem,
> using a special instruction. It also does some clflushing at the direct map
> address for these pages. So I think we need to make sure TDs don't pull from
> gmem fds with this flag.
Disabling this feature for TDX VMs for now seems ok. I assume TDX code
can establish temporary mappings to the physical memory and therefore
doesn't necessarily have to rely on direct map.
Is it safe to say that we can remove direct map for guest memory for
TDX VMs (and ideally other CC VMs as well) in future as needed?
>
> Not that there would be any expected use of the flag for TDs, but it could cause
> a crash.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 17:30 ` Vishal Annapurve
@ 2026-01-16 17:51 ` Edgecombe, Rick P
2026-01-22 16:44 ` Ackerley Tng
0 siblings, 1 reply; 62+ messages in thread
From: Edgecombe, Rick P @ 2026-01-16 17:51 UTC (permalink / raw)
To: Annapurve, Vishal
Cc: david, kvm, catalin.marinas, palmer, jgross, bpf, surenb, riel,
pfalcato, peterx, x86, rppt, thuth, borntraeger, maz, svens, ast,
vbabka, pjw, alex, dave.hansen, tglx, hca, willy, wyihan,
ryan.roberts, yang, jolsa, jmattson, aneesh.kumar, luto, haoluo,
patrick.roy, linux-kernel, akpm, coxu, mhocko, mlevitsk,
linux-kselftest, jgg, loongarch, song, oupton, jhubbard, kernel,
hpa, lorenzo.stoakes, Liam.Howlett, martin.lau, jthoughton, Yu,
Yu-cheng, maobibo, kvmarm, Jonathan.Cameron, peterz, eddyz87,
yonghong.song, linux-doc, shuah, chenhuacai, prsampat,
kevin.brodsky, shijie, suzuki.poulose, itazur, pbonzini,
yuzenghui, dev.jain, gor, jackabt, daniel, agordeev, andrii,
mingo, linux-riscv, aou, joey.gouly, derekmn, xmarcalx,
linux-s390, kpsingh, kalyazin, linux-arm-kernel, sdf, jackmanb,
bp, corbet, linux-fsdevel, ackerleytng, jannh, john.fastabend,
kas, linux-mm, will, seanjc
On Fri, 2026-01-16 at 09:30 -0800, Vishal Annapurve wrote:
> > TDX does some clearing at the direct map mapping for pages that
> > comes from gmem, using a special instruction. It also does some
> > clflushing at the direct map address for these pages. So I think we
> > need to make sure TDs don't pull from gmem fds with this flag.
>
> Disabling this feature for TDX VMs for now seems ok. I assume TDX
> code can establish temporary mappings to the physical memory and
> therefore doesn't necessarily have to rely on direct map.
Can, as in, can be changed to? It doesn't now, because the direct map
is reliable today.
>
> Is it safe to say that we can remove direct map for guest memory for
> TDX VMs (and ideally other CC VMs as well) in future as needed?
Linux code doesn't need to read the cipher text of course, but it does
need to help with memory cleaning on the errata systems. Doing a new
mapping for each page getting reclaimed would add cost to the shutdown
path.
Then there is the clflush. It is not actually required for the most
part. There is a TDX flag to check to see if you need to do it, so we
could probably remove the direct map accesses for some systems and
avoid temporary mappings.
So long term, I don't see a problem. For the old systems it would have
extra cost of temporary mappings at shutdown, but I would have imagined
direct map removal would have been costly too.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 17:51 ` Edgecombe, Rick P
@ 2026-01-22 16:44 ` Ackerley Tng
2026-01-22 17:35 ` Edgecombe, Rick P
0 siblings, 1 reply; 62+ messages in thread
From: Ackerley Tng @ 2026-01-22 16:44 UTC (permalink / raw)
To: Edgecombe, Rick P, Annapurve, Vishal
Cc: david, kvm, catalin.marinas, palmer, jgross, bpf, surenb, riel,
pfalcato, peterx, x86, rppt, thuth, borntraeger, maz, svens, ast,
vbabka, pjw, alex, dave.hansen, tglx, hca, willy, wyihan,
ryan.roberts, yang, jolsa, jmattson, aneesh.kumar, luto, haoluo,
patrick.roy, linux-kernel, akpm, coxu, mhocko, mlevitsk,
linux-kselftest, jgg, loongarch, song, oupton, jhubbard, kernel,
hpa, lorenzo.stoakes, Liam.Howlett, martin.lau, jthoughton, Yu,
Yu-cheng, maobibo, kvmarm, Jonathan.Cameron, peterz, eddyz87,
yonghong.song, linux-doc, shuah, chenhuacai, prsampat,
kevin.brodsky, shijie, suzuki.poulose, itazur, pbonzini,
yuzenghui, dev.jain, gor, jackabt, daniel, agordeev, andrii,
mingo, linux-riscv, aou, joey.gouly, derekmn, xmarcalx,
linux-s390, kpsingh, kalyazin, linux-arm-kernel, sdf, jackmanb,
bp, corbet, linux-fsdevel, jannh, john.fastabend, kas, linux-mm,
will, seanjc
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> writes:
> On Fri, 2026-01-16 at 09:30 -0800, Vishal Annapurve wrote:
>> > TDX does some clearing at the direct map mapping for pages that
>> > comes from gmem, using a special instruction. It also does some
>> > clflushing at the direct map address for these pages. So I think we
>> > need to make sure TDs don't pull from gmem fds with this flag.
>>
>> Disabling this feature for TDX VMs for now seems ok. I assume TDX
>> code can establish temporary mappings to the physical memory and
>> therefore doesn't necessarily have to rely on direct map.
>
> Can, as in, can be changed to? It doesn't now, because the direct map
> is reliable today.
>
>>
>> Is it safe to say that we can remove direct map for guest memory for
>> TDX VMs (and ideally other CC VMs as well) in future as needed?
>
> Linux code doesn't need to read the cipher text of course, but it does
> need to help with memory cleaning on the errata systems. Doing a new
> mapping for each page getting reclaimed would add cost to the shutdown
> path.
>
Can we disable direct map removal for errata systems using TDX only,
instead of all TDX?
If it's complicated to figure that out, we can disable direct map
removal for TDX for now and figure that out later.
> Then there is the clfush. It is not actually required for the most
> part. There is a TDX flag to check to see if you need to do it, so we
> could probably remove the direct map accesses for some systems and
> avoid temporary mappings.
>
> So long term, I don't see a problem. For the old systems it would have
> extra cost of temporary mappings at shutdown, but I would have imagined
> direct map removal would have been costly too.
Is there a way to check if the code is running on the errata system and
set up the temporary mappings only for those?
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-22 16:44 ` Ackerley Tng
@ 2026-01-22 17:35 ` Edgecombe, Rick P
2026-01-22 22:47 ` Ackerley Tng
0 siblings, 1 reply; 62+ messages in thread
From: Edgecombe, Rick P @ 2026-01-22 17:35 UTC (permalink / raw)
To: ackerleytng, Annapurve, Vishal
Cc: david, kvm, catalin.marinas, svens, jgross, bpf, surenb, vbabka,
riel, pfalcato, x86, rppt, thuth, borntraeger, maz, palmer, ast,
pjw, alex, dave.hansen, tglx, hca, willy, wyihan, ryan.roberts,
yang, jolsa, jmattson, luto, aneesh.kumar, haoluo, patrick.roy,
peterx, linux-kernel, akpm, coxu, mhocko, linux-kselftest,
mlevitsk, jgg, loongarch, song, Liam.Howlett, oupton, kernel,
lorenzo.stoakes, peterz, Jonathan.Cameron, martin.lau,
jthoughton, jhubbard, Yu, Yu-cheng, kvmarm, eddyz87, hpa,
yonghong.song, linux-doc, shuah, chenhuacai, prsampat,
kevin.brodsky, maobibo, shijie, suzuki.poulose, itazur, pbonzini,
yuzenghui, gor, dev.jain, daniel, jackabt, agordeev, andrii,
mingo, linux-riscv, aou, joey.gouly, derekmn, xmarcalx,
linux-s390, kpsingh, kalyazin, linux-arm-kernel, sdf, jackmanb,
bp, corbet, linux-fsdevel, jannh, john.fastabend, kas, linux-mm,
will, seanjc
On Thu, 2026-01-22 at 08:44 -0800, Ackerley Tng wrote:
>
> Can we disable direct map removal for errata systems using TDX only,
> instead of all TDX?
>
> If it's complicated to figure that out, we can disable direct map
> removal for TDX for now and figure that out later.
In theory, but it still would require changes to TDX code since it does
the clflush unconditionally today. To know whether clflush is needed
(it's a different thing to the errata), you need to check a TDX module
flag. (CLFLUSH_BEFORE_ALLOC)
Gosh, you know what, I should double check that we don't need the
clflush from the vm shutdown optimization. It should be a different
thing, but we gave scrutiny to the whole Linux flow when we did
that. So I'd have to double check nothing relied on it. We can follow
up here.
>
> > Then there is the clfush. It is not actually required for the most
> > part. There is a TDX flag to check to see if you need to do it, so
> > we could probably remove the direct map accesses for some systems
> > and avoid temporary mappings.
> >
> > So long term, I don't see a problem. For the old systems it would
> > have extra cost of temporary mappings at shutdown, but I would have
> > imagined direct map removal would have been costly too.
>
> Is there a way to check if the code is running on the errata system
> and set up the temporary mappings only for those?
The TDX code today doesn't do any remapping because the direct map is
reliably present. There isn't a flag or anything to just do the
remapping automatically. We would have to do some vmalloc mapping or
temporary_mm or something.
Can you explain what the use case is for unmapping encrypted TDX
private memory from the host direct map?
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-22 17:35 ` Edgecombe, Rick P
@ 2026-01-22 22:47 ` Ackerley Tng
2026-01-23 0:01 ` Edgecombe, Rick P
0 siblings, 1 reply; 62+ messages in thread
From: Ackerley Tng @ 2026-01-22 22:47 UTC (permalink / raw)
To: Edgecombe, Rick P, Annapurve, Vishal
Cc: david, kvm, catalin.marinas, svens, jgross, bpf, surenb, vbabka,
riel, pfalcato, x86, rppt, thuth, borntraeger, maz, palmer, ast,
pjw, alex, dave.hansen, tglx, hca, willy, wyihan, ryan.roberts,
yang, jolsa, jmattson, luto, aneesh.kumar, haoluo, patrick.roy,
peterx, linux-kernel, akpm, coxu, mhocko, linux-kselftest,
mlevitsk, jgg, loongarch, song, Liam.Howlett, oupton, kernel,
lorenzo.stoakes, peterz, Jonathan.Cameron, martin.lau,
jthoughton, jhubbard, Yu, Yu-cheng, kvmarm, eddyz87, hpa,
yonghong.song, linux-doc, shuah, chenhuacai, prsampat,
kevin.brodsky, maobibo, shijie, suzuki.poulose, itazur, pbonzini,
yuzenghui, gor, dev.jain, daniel, jackabt, agordeev, andrii,
mingo, linux-riscv, aou, joey.gouly, derekmn, xmarcalx,
linux-s390, kpsingh, kalyazin, linux-arm-kernel, sdf, jackmanb,
bp, corbet, linux-fsdevel, jannh, john.fastabend, kas, linux-mm,
will, seanjc
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> writes:
> On Thu, 2026-01-22 at 08:44 -0800, Ackerley Tng wrote:
>>
>> Can we disable direct map removal for errata systems using TDX only,
>> instead of all TDX?
>>
>> If it's complicated to figure that out, we can disable direct map
>> removal for TDX for now and figure that out later.
>
> In theory, but it still would require changes to TDX code since it does
> the clflush unconditionally today. To know whether clflush is needed
> (it's a different thing to the errata), you need to check a TDX module
> flag. (CLFLUSH_BEFORE_ALLOC)
>
> Gosh, you know what, I should double check that we don't need the
> clflush from the vm shutdown optimization. It should be a different
> thing, but for we gave scrutiny to the whole Linux flow when we did
> that. So I'd have to double check nothing relied on it. We can follow
> up here.
>
>>
>> > Then there is the clfush. It is not actually required for the most
>> > part. There is a TDX flag to check to see if you need to do it, so
>> > we could probably remove the direct map accesses for some systems
>> > and avoid temporary mappings.
>> >
>> > So long term, I don't see a problem. For the old systems it would
>> > have extra cost of temporary mappings at shutdown, but I would have
>> > imagined direct map removal would have been costly too.
>>
>> Is there a way to check if the code is running on the errata system
>> and set up the temporary mappings only for those?
>
> The TDX code today doesn't do any remapping because the direct map is
> reliably present. There isn't a flag or anything to just do the
> remapping automatically. We would have to do some vmalloc mapping or
> temporary_mm or something.
>
> Can you explain what the use case is for unmapping encrypted TDX
> private memory from the host direct map?
There's no use case I can think of for unmapping TDX private memory from
the host direct map, but Sean's suggestion
https://lore.kernel.org/all/aWpcDrGVLrZOqdcg@google.com/ won't even let
shared guest_memfd memory be unmapped from the direct map for TDX VMs.
Actually, does TDX's clflush that assumes presence in the direct map
apply only for private pages, or all pages?
If TDX's clflush only happens for private pages, then we could restore
private pages to the direct map, and then we'd be safe even for TDX?
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-22 22:47 ` Ackerley Tng
@ 2026-01-23 0:01 ` Edgecombe, Rick P
2026-01-28 0:29 ` Ackerley Tng
0 siblings, 1 reply; 62+ messages in thread
From: Edgecombe, Rick P @ 2026-01-23 0:01 UTC (permalink / raw)
To: ackerleytng, Annapurve, Vishal
Cc: david, kvm, catalin.marinas, svens, jgross, bpf, surenb, vbabka,
riel, pfalcato, x86, rppt, thuth, borntraeger, maz, palmer, ast,
peterx, alex, pjw, dave.hansen, tglx, hca, willy, wyihan,
ryan.roberts, jolsa, yang, jmattson, aneesh.kumar, luto, haoluo,
patrick.roy, linux-kernel, akpm, coxu, mhocko, linux-kselftest,
mlevitsk, jgg, loongarch, song, oupton, Liam.Howlett, kernel,
Jonathan.Cameron, lorenzo.stoakes, jhubbard, jthoughton,
martin.lau, Yu, Yu-cheng, peterz, kvmarm, eddyz87, hpa,
yonghong.song, linux-doc, shuah, chenhuacai, prsampat,
kevin.brodsky, maobibo, shijie, suzuki.poulose, itazur, pbonzini,
yuzenghui, gor, dev.jain, daniel, jackabt, agordeev, andrii,
mingo, linux-riscv, aou, joey.gouly, derekmn, xmarcalx,
linux-s390, kpsingh, kalyazin, linux-arm-kernel, sdf, jackmanb,
bp, corbet, linux-fsdevel, jannh, john.fastabend, kas, linux-mm,
will, seanjc
On Thu, 2026-01-22 at 14:47 -0800, Ackerley Tng wrote:
>
> There's no use case I can think of for unmapping TDX private memory
> from the host direct map, but Sean's suggestion
> https://lore.kernel.org/all/aWpcDrGVLrZOqdcg@google.com/ won't even
> let shared guest_memfd memory be unmapped from the direct map for TDX
> VMs.
Ah!
>
> Actually, does TDX's clflush that assumes presence in the direct map
> apply only for private pages, or all pages?
>
> If TDX's clflush only happens for private pages, then we could
> restore private pages to the direct map, and then we'd be safe even
> for TDX?
Yes, just private pages need the special treatment. But it will be much
simpler to start with just blocking the option for TDX. A shared pages
only mode could come later.
In general I think we should try to break things up like this when we
can. Kernel code is not set in stone, only ABI. I think it will lead to
overall faster upstreaming, because the series' can be simpler.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-23 0:01 ` Edgecombe, Rick P
@ 2026-01-28 0:29 ` Ackerley Tng
0 siblings, 0 replies; 62+ messages in thread
From: Ackerley Tng @ 2026-01-28 0:29 UTC (permalink / raw)
To: Edgecombe, Rick P, Annapurve, Vishal
Cc: david, kvm, catalin.marinas, svens, jgross, bpf, surenb, vbabka,
riel, pfalcato, x86, rppt, thuth, borntraeger, maz, palmer, ast,
peterx, alex, pjw, dave.hansen, tglx, hca, willy, wyihan,
ryan.roberts, jolsa, yang, jmattson, aneesh.kumar, luto, haoluo,
patrick.roy, linux-kernel, akpm, coxu, mhocko, linux-kselftest,
mlevitsk, jgg, loongarch, song, oupton, Liam.Howlett, kernel,
Jonathan.Cameron, lorenzo.stoakes, jhubbard, jthoughton,
martin.lau, Yu, Yu-cheng, peterz, kvmarm, eddyz87, hpa,
yonghong.song, linux-doc, shuah, chenhuacai, prsampat,
kevin.brodsky, maobibo, shijie, suzuki.poulose, itazur, pbonzini,
yuzenghui, gor, dev.jain, daniel, jackabt, agordeev, andrii,
mingo, linux-riscv, aou, joey.gouly, derekmn, xmarcalx,
linux-s390, kpsingh, kalyazin, linux-arm-kernel, sdf, jackmanb,
bp, corbet, linux-fsdevel, jannh, john.fastabend, kas, linux-mm,
will, seanjc
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> writes:
> On Thu, 2026-01-22 at 14:47 -0800, Ackerley Tng wrote:
>>
>> There's no use case I can think of for unmapping TDX private memory
>> from the host direct map, but Sean's suggestion
>> https://lore.kernel.org/all/aWpcDrGVLrZOqdcg@google.com/ won't even
>> let shared guest_memfd memory be unmapped from the direct map for TDX
>> VMs.
>
> Ah!
>
>>
>> Actually, does TDX's clflush that assumes presence in the direct map
>> apply only for private pages, or all pages?
>>
>> If TDX's clflush only happens for private pages, then we could
>> restore private pages to the direct map, and then we'd be safe even
>> for TDX?
>
> Yes, just private pages need the special treatment. But it will be much
> simpler to start with just blocking the option for TDX. A shared pages
> only mode could come later.
>
> In general I think we should try to break things up like this when we
> can. Kernel code is not set in stone, only ABI. I think it will lead to
> overall faster upstreaming, because the series' can be simpler.
I agree on splitting the feature up :), agree that simpler series are
better.
Perhaps just for my understanding,
+ shared pages => not in direct map => no TDX clflush
+ private pages => always in direct map => TDX performs clflush
(I could put pages back into the direct map while doing shared to
private conversions).
Is everything good then? Or does TDX code not apply the special
treatment, as in clflush only for private pages, as of now?
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-14 13:46 ` [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map Kalyazin, Nikita
2026-01-15 20:00 ` Ackerley Tng
2026-01-15 23:04 ` Edgecombe, Rick P
@ 2026-01-16 0:00 ` Edgecombe, Rick P
2026-01-16 15:00 ` Nikita Kalyazin
2 siblings, 1 reply; 62+ messages in thread
From: Edgecombe, Rick P @ 2026-01-16 0:00 UTC (permalink / raw)
To: linux-riscv, kalyazin, kernel, linux-kselftest, linux-mm,
linux-fsdevel, linux-s390, kvmarm, linux-kernel,
linux-arm-kernel, kvm, bpf, linux-doc, loongarch
Cc: david, palmer, catalin.marinas, svens, jgross, surenb, riel,
pfalcato, peterx, x86, rppt, thuth, maz, dave.hansen, ast,
vbabka, Annapurve, Vishal, borntraeger, alex, pjw, tglx, willy,
hca, wyihan, ryan.roberts, jolsa, yang, jmattson, luto,
aneesh.kumar, haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk,
jgg, hpa, song, oupton, peterz, maobibo, lorenzo.stoakes,
Liam.Howlett, jthoughton, martin.lau, jhubbard, Yu, Yu-cheng,
Jonathan.Cameron, eddyz87, yonghong.song, chenhuacai, shuah,
prsampat, kevin.brodsky, shijie, suzuki.poulose, itazur,
pbonzini, yuzenghui, dev.jain, gor, jackabt, daniel, agordeev,
andrii, mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, ackerleytng, jannh, john.fastabend, kas,
will, seanjc
On Wed, 2026-01-14 at 13:46 +0000, Kalyazin, Nikita wrote:
> +static void kvm_gmem_folio_restore_direct_map(struct folio *folio)
> +{
> + /*
> + * Direct map restoration cannot fail, as the only error condition
> + * for direct map manipulation is failure to allocate page tables
> + * when splitting huge pages, but this split would have already
> + * happened in folio_zap_direct_map() in kvm_gmem_folio_zap_direct_map().
> + * Thus folio_restore_direct_map() here only updates prot bits.
> + */
> + if (kvm_gmem_folio_no_direct_map(folio)) {
> + WARN_ON_ONCE(folio_restore_direct_map(folio));
> + folio->private = (void *)((u64)folio->private & ~KVM_GMEM_FOLIO_NO_DIRECT_MAP);
> + }
> +}
> +
Does this assume the folio would not have been split after it was zapped? As in,
if it was zapped at 2MB granularity (no 4KB direct map split required) but then
restored at 4KB (split required)? Or it gets merged somehow before this?
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 0:00 ` Edgecombe, Rick P
@ 2026-01-16 15:00 ` Nikita Kalyazin
2026-01-16 15:34 ` Edgecombe, Rick P
2026-01-22 18:37 ` Ackerley Tng
0 siblings, 2 replies; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-16 15:00 UTC (permalink / raw)
To: Edgecombe, Rick P, linux-riscv, kalyazin, kernel,
linux-kselftest, linux-mm, linux-fsdevel, linux-s390, kvmarm,
linux-kernel, linux-arm-kernel, kvm, bpf, linux-doc, loongarch
Cc: david, palmer, catalin.marinas, svens, jgross, surenb, riel,
pfalcato, peterx, x86, rppt, thuth, maz, dave.hansen, ast,
vbabka, Annapurve, Vishal, borntraeger, alex, pjw, tglx, willy,
hca, wyihan, ryan.roberts, jolsa, yang, jmattson, luto,
aneesh.kumar, haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk,
jgg, hpa, song, oupton, peterz, maobibo, lorenzo.stoakes,
Liam.Howlett, jthoughton, martin.lau, jhubbard, Yu, Yu-cheng,
Jonathan.Cameron, eddyz87, yonghong.song, chenhuacai, shuah,
prsampat, kevin.brodsky, shijie, suzuki.poulose, itazur,
pbonzini, yuzenghui, dev.jain, gor, jackabt, daniel, agordeev,
andrii, mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, ackerleytng, jannh, john.fastabend, kas,
will, seanjc
On 16/01/2026 00:00, Edgecombe, Rick P wrote:
> On Wed, 2026-01-14 at 13:46 +0000, Kalyazin, Nikita wrote:
>> +static void kvm_gmem_folio_restore_direct_map(struct folio *folio)
>> +{
>> + /*
>> + * Direct map restoration cannot fail, as the only error condition
>> + * for direct map manipulation is failure to allocate page tables
>> + * when splitting huge pages, but this split would have already
>> + * happened in folio_zap_direct_map() in kvm_gmem_folio_zap_direct_map().
>> + * Thus folio_restore_direct_map() here only updates prot bits.
>> + */
>> + if (kvm_gmem_folio_no_direct_map(folio)) {
>> + WARN_ON_ONCE(folio_restore_direct_map(folio));
>> + folio->private = (void *)((u64)folio->private & ~KVM_GMEM_FOLIO_NO_DIRECT_MAP);
>> + }
>> +}
>> +
>
> Does this assume the folio would not have been split after it was zapped? As in,
> if it was zapped at 2MB granularity (no 4KB direct map split required) but then
> restored at 4KB (split required)? Or it gets merged somehow before this?
AFAIK it can't be zapped at 2MB granularity as the zapping code will
inevitably cause splitting because guest_memfd faults occur at the base
page granularity as of now.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 15:00 ` Nikita Kalyazin
@ 2026-01-16 15:34 ` Edgecombe, Rick P
2026-01-16 17:28 ` Nikita Kalyazin
2026-01-22 18:37 ` Ackerley Tng
1 sibling, 1 reply; 62+ messages in thread
From: Edgecombe, Rick P @ 2026-01-16 15:34 UTC (permalink / raw)
To: kalyazin, kalyazin, linux-riscv, linux-s390, linux-mm,
linux-fsdevel, linux-kselftest, kernel, kvmarm, linux-arm-kernel,
linux-kernel, kvm, bpf, loongarch, linux-doc
Cc: david, svens, catalin.marinas, palmer, jgross, surenb, vbabka,
riel, pfalcato, x86, rppt, thuth, borntraeger, maz, peterx, ast,
Annapurve, Vishal, pjw, alex, dave.hansen, tglx, hca, willy,
wyihan, ryan.roberts, yang, jolsa, jmattson, luto, aneesh.kumar,
haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk, jgg, hpa,
song, Liam.Howlett, maobibo, peterz, oupton, lorenzo.stoakes,
jhubbard, martin.lau, jthoughton, Jonathan.Cameron, Yu, Yu-cheng,
eddyz87, yonghong.song, chenhuacai, shuah, prsampat,
kevin.brodsky, shijie, itazur, suzuki.poulose, pbonzini,
dev.jain, yuzenghui, gor, jackabt, daniel, agordeev, andrii,
mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, ackerleytng, jannh, john.fastabend, kas,
will, seanjc
On Fri, 2026-01-16 at 15:00 +0000, Nikita Kalyazin wrote:
> > Does this assume the folio would not have been split after it was
> > zapped? As in, if it was zapped at 2MB granularity (no 4KB direct
> > map split required) but then restored at 4KB (split required)? Or
> > it gets merged somehow before this?
>
> AFAIK it can't be zapped at 2MB granularity as the zapping code will
> inevitably cause splitting because guest_memfd faults occur at the
> base page granularity as of now.
Ah, right since there are no huge pages currently. Then the huge page
series will need to keep this in mind and figure out some solution.
Probably worth a comment on that assumption to help anyone that changes
it.
I imagine this feature is really targeted towards machines running a
bunch of untrusted VMs, so cloud hypervisors really. In that case the
direct map will probably be carved up pretty quick. Did you consider
just breaking the full direct map to 4k at the start when it's in use?
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 15:34 ` Edgecombe, Rick P
@ 2026-01-16 17:28 ` Nikita Kalyazin
2026-01-16 17:36 ` Edgecombe, Rick P
0 siblings, 1 reply; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-16 17:28 UTC (permalink / raw)
To: Edgecombe, Rick P, kalyazin, linux-riscv, linux-s390, linux-mm,
linux-fsdevel, linux-kselftest, kernel, kvmarm, linux-arm-kernel,
linux-kernel, kvm, bpf, loongarch, linux-doc
Cc: david, svens, catalin.marinas, palmer, jgross, surenb, vbabka,
riel, pfalcato, x86, rppt, thuth, borntraeger, maz, peterx, ast,
Annapurve, Vishal, pjw, alex, dave.hansen, tglx, hca, willy,
wyihan, ryan.roberts, yang, jolsa, jmattson, luto, aneesh.kumar,
haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk, jgg, hpa,
song, Liam.Howlett, maobibo, peterz, oupton, lorenzo.stoakes,
jhubbard, martin.lau, jthoughton, Jonathan.Cameron, Yu, Yu-cheng,
eddyz87, yonghong.song, chenhuacai, shuah, prsampat,
kevin.brodsky, shijie, itazur, suzuki.poulose, pbonzini,
dev.jain, yuzenghui, gor, jackabt, daniel, agordeev, andrii,
mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, ackerleytng, jannh, john.fastabend, kas,
will, seanjc
On 16/01/2026 15:34, Edgecombe, Rick P wrote:
> On Fri, 2026-01-16 at 15:00 +0000, Nikita Kalyazin wrote:
>>> Does this assume the folio would not have been split after it was
>>> zapped? As in, if it was zapped at 2MB granularity (no 4KB direct
>>> map split required) but then restored at 4KB (split required)? Or
>>> it gets merged somehow before this?
>>
>> AFAIK it can't be zapped at 2MB granularity as the zapping code will
>> inevitably cause splitting because guest_memfd faults occur at the
>> base page granularity as of now.
>
> Ah, right since there are no huge pages currently. Then the huge page
> series will need to keep this in mind and figure out some solution.
> Probably worth a comment on that assumption to help anyone that changes
> it.
Makes sense. I'll leave a comment.
>
> I imagine this feature is really targeted towards machines running a
> bunch of untrusted VMs, so cloud hypervisors really. In that case the
> direct map will probably be carved up pretty quick. Did you consider
> just breaking the full direct map to 4k at the start when it's in use?
That's an interesting point, I haven't thought about it from this
perspective. We should run some tests internally to see if it'd help.
This will likely change with support for huge pages coming in though.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 17:28 ` Nikita Kalyazin
@ 2026-01-16 17:36 ` Edgecombe, Rick P
2026-01-16 17:51 ` Nikita Kalyazin
0 siblings, 1 reply; 62+ messages in thread
From: Edgecombe, Rick P @ 2026-01-16 17:36 UTC (permalink / raw)
To: kalyazin, kernel, linux-riscv, linux-s390, linux-mm,
linux-fsdevel, linux-kselftest, kalyazin, kvmarm,
linux-arm-kernel, linux-kernel, kvm, bpf, loongarch, linux-doc
Cc: david, svens, catalin.marinas, palmer, jgross, surenb, vbabka,
riel, pfalcato, x86, rppt, thuth, borntraeger, maz, peterx, ast,
Annapurve, Vishal, pjw, alex, dave.hansen, tglx, hca, willy,
wyihan, ryan.roberts, jolsa, yang, jmattson, aneesh.kumar, luto,
haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk, jgg, hpa,
song, oupton, maobibo, peterz, Liam.Howlett, lorenzo.stoakes,
jhubbard, martin.lau, jthoughton, Jonathan.Cameron, Yu, Yu-cheng,
eddyz87, yonghong.song, chenhuacai, shuah, prsampat,
kevin.brodsky, shijie, itazur, suzuki.poulose, pbonzini,
dev.jain, yuzenghui, gor, jackabt, daniel, agordeev, andrii,
mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, ackerleytng, jannh, john.fastabend, kas,
will, seanjc
On Fri, 2026-01-16 at 17:28 +0000, Nikita Kalyazin wrote:
> >
> > I imagine this feature is really targeted towards machines running
> > a bunch of untrusted VMs, so cloud hypervisors really. In that case
> > the direct map will probably be carved up pretty quick. Did you
> > consider just breaking the full direct map to 4k at the start when
> > it's in use?
>
> That's an interesting point, I haven't thought about it from this
> perspective. We should run some tests internally to see if it'd
> help. This will likely change with support for huge pages coming in
> though.
The thing is, those no_flush() helpers actually still flush if they
need to split a page. Plus if they need to clear out lazy vmalloc
aliases it could be another flush. There are probably a lot of
opportunities to reduce flushing even beyond pre-split.
Just curious... as far as performance, have you tested this on a big
multi-socket system, where that flushing will hurt more? It's something
that has always been a fear for these direct map unmapping solutions.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 17:36 ` Edgecombe, Rick P
@ 2026-01-16 17:51 ` Nikita Kalyazin
2026-01-16 18:10 ` Edgecombe, Rick P
0 siblings, 1 reply; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-16 17:51 UTC (permalink / raw)
To: Edgecombe, Rick P, kernel, linux-riscv, linux-s390, linux-mm,
linux-fsdevel, linux-kselftest, kalyazin, kvmarm,
linux-arm-kernel, linux-kernel, kvm, bpf, loongarch, linux-doc
Cc: david, svens, catalin.marinas, palmer, jgross, surenb, vbabka,
riel, pfalcato, x86, rppt, thuth, borntraeger, maz, peterx, ast,
Annapurve, Vishal, pjw, alex, dave.hansen, tglx, hca, willy,
wyihan, ryan.roberts, jolsa, yang, jmattson, aneesh.kumar, luto,
haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk, jgg, hpa,
song, oupton, maobibo, peterz, Liam.Howlett, lorenzo.stoakes,
jhubbard, martin.lau, jthoughton, Jonathan.Cameron, Yu, Yu-cheng,
eddyz87, yonghong.song, chenhuacai, shuah, prsampat,
kevin.brodsky, shijie, itazur, suzuki.poulose, pbonzini,
dev.jain, yuzenghui, gor, jackabt, daniel, agordeev, andrii,
mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, ackerleytng, jannh, john.fastabend, kas,
will, seanjc
On 16/01/2026 17:36, Edgecombe, Rick P wrote:
> On Fri, 2026-01-16 at 17:28 +0000, Nikita Kalyazin wrote:
>>>
>>> I imagine this feature is really targeted towards machines running
>>> a bunch of untrusted VMs, so cloud hypervisors really. In that case
>>> the direct map will probably be carved up pretty quick. Did you
>>> consider just breaking the full direct map to 4k at the start when
>>> it's in use?
>>
>> That's an interesting point, I haven't thought about it from this
>> perspective. We should run some tests internally to see if it'd
>> help. This will likely change with support for huge pages coming in
>> though.
>
> The thing is, those no_flush() helpers actually still flush if they
> need to split a page. Plus if they need to clear out lazy vmalloc
> aliases it could be another flush. There are probably a lot of
> opportunities to reduce flushing even beyond pre-split.
>
> Just curious... as far as performance, have you tested this on a big
> multi-socket system, where that flushing will hurt more? It's something
> that has always been a fear for these directmap unmapping solutions
Yes, this is a problem that we'd like to address. We have been
discussing it in [1]. The effect of flushing on memory population that
we see on x86 is a 5-7x slowdown. We are thinking of making use of the
no-direct-map memory allocator that Brendan is working on [2].
[1]
https://lore.kernel.org/lkml/d1b58114-9b88-4535-b28c-09d9cc1ff3be@amazon.com
[2] https://lore.kernel.org/kvm/DDVS9ITBCE2Z.RSTLCU79EX8G@google.com
>
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 17:51 ` Nikita Kalyazin
@ 2026-01-16 18:10 ` Edgecombe, Rick P
2026-01-16 18:16 ` Nikita Kalyazin
0 siblings, 1 reply; 62+ messages in thread
From: Edgecombe, Rick P @ 2026-01-16 18:10 UTC (permalink / raw)
To: kalyazin, kalyazin, linux-riscv, linux-s390, linux-mm,
linux-fsdevel, linux-kselftest, kernel, kvmarm, linux-arm-kernel,
linux-kernel, kvm, bpf, loongarch, linux-doc
Cc: david, svens, catalin.marinas, palmer, jgross, surenb, vbabka,
riel, pfalcato, x86, rppt, thuth, borntraeger, maz, peterx, ast,
Annapurve, Vishal, pjw, alex, dave.hansen, tglx, hca, willy,
wyihan, ryan.roberts, yang, jolsa, jmattson, luto, aneesh.kumar,
haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk, jgg, hpa,
song, Liam.Howlett, maobibo, peterz, oupton, lorenzo.stoakes,
jhubbard, martin.lau, jthoughton, Jonathan.Cameron, Yu, Yu-cheng,
eddyz87, yonghong.song, chenhuacai, shuah, prsampat,
kevin.brodsky, shijie, itazur, suzuki.poulose, pbonzini,
dev.jain, yuzenghui, gor, jackabt, daniel, agordeev, andrii,
mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, ackerleytng, jannh, john.fastabend, kas,
will, seanjc
On Fri, 2026-01-16 at 17:51 +0000, Nikita Kalyazin wrote:
> Yes, this is a problem that we'd like to address. We have been
> discussing it in [1]. The effect of flushing on memory population
> that we see on x86 is 5-7x elongation. We are thinking of making use
> of the no-direct-map memory allocator that Brendan is working on [2].
Ah, makes sense.
Do you plan to merge this before the performance problems are
addressed? I guess this series focuses on safety and functionality
first.
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 18:10 ` Edgecombe, Rick P
@ 2026-01-16 18:16 ` Nikita Kalyazin
0 siblings, 0 replies; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-16 18:16 UTC (permalink / raw)
To: Edgecombe, Rick P, kalyazin, linux-riscv, linux-s390, linux-mm,
linux-fsdevel, linux-kselftest, kernel, kvmarm, linux-arm-kernel,
linux-kernel, kvm, bpf, loongarch, linux-doc
Cc: david, svens, catalin.marinas, palmer, jgross, surenb, vbabka,
riel, pfalcato, x86, rppt, thuth, borntraeger, maz, peterx, ast,
Annapurve, Vishal, pjw, alex, dave.hansen, tglx, hca, willy,
wyihan, ryan.roberts, yang, jolsa, jmattson, luto, aneesh.kumar,
haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk, jgg, hpa,
song, Liam.Howlett, maobibo, peterz, oupton, lorenzo.stoakes,
jhubbard, martin.lau, jthoughton, Jonathan.Cameron, Yu, Yu-cheng,
eddyz87, yonghong.song, chenhuacai, shuah, prsampat,
kevin.brodsky, shijie, itazur, suzuki.poulose, pbonzini,
dev.jain, yuzenghui, gor, jackabt, daniel, agordeev, andrii,
mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, ackerleytng, jannh, john.fastabend, kas,
will, seanjc
On 16/01/2026 18:10, Edgecombe, Rick P wrote:
> On Fri, 2026-01-16 at 17:51 +0000, Nikita Kalyazin wrote:
>> Yes, this is a problem that we'd like to address. We have been
>> discussing it in [1]. The effect of flushing on memory population
>> that we see on x86 is 5-7x elongation. We are thinking of making use
>> of the no-direct-map memory allocator that Brendan is working on [2].
>
> Ah, makes sense.
>
> Do you plan to merge this before the performance problems are
> addressed? I guess this series focuses on safety and functionality
> first.
Yes, we'd like to merge the functional part first and then optimise it
in a further series.
>
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-16 15:00 ` Nikita Kalyazin
2026-01-16 15:34 ` Edgecombe, Rick P
@ 2026-01-22 18:37 ` Ackerley Tng
2026-01-22 18:47 ` Nikita Kalyazin
2026-01-26 16:56 ` Nikita Kalyazin
1 sibling, 2 replies; 62+ messages in thread
From: Ackerley Tng @ 2026-01-22 18:37 UTC (permalink / raw)
To: kalyazin, Edgecombe, Rick P, linux-riscv, kalyazin, kernel,
linux-kselftest, linux-mm, linux-fsdevel, linux-s390, kvmarm,
linux-kernel, linux-arm-kernel, kvm, bpf, linux-doc, loongarch
Cc: david, palmer, catalin.marinas, svens, jgross, surenb, riel,
pfalcato, peterx, x86, rppt, thuth, maz, dave.hansen, ast,
vbabka, Annapurve, Vishal, borntraeger, alex, pjw, tglx, willy,
hca, wyihan, ryan.roberts, jolsa, yang, jmattson, luto,
aneesh.kumar, haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk,
jgg, hpa, song, oupton, peterz, maobibo, lorenzo.stoakes,
Liam.Howlett, jthoughton, martin.lau, jhubbard, Yu, Yu-cheng,
Jonathan.Cameron, eddyz87, yonghong.song, chenhuacai, shuah,
prsampat, kevin.brodsky, shijie, suzuki.poulose, itazur,
pbonzini, yuzenghui, dev.jain, gor, jackabt, daniel, agordeev,
andrii, mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, jannh, john.fastabend, kas, will, seanjc
Nikita Kalyazin <kalyazin@amazon.com> writes:
> On 16/01/2026 00:00, Edgecombe, Rick P wrote:
>> On Wed, 2026-01-14 at 13:46 +0000, Kalyazin, Nikita wrote:
>>> +static void kvm_gmem_folio_restore_direct_map(struct folio *folio)
>>> +{
>>> + /*
>>> + * Direct map restoration cannot fail, as the only error condition
>>> + * for direct map manipulation is failure to allocate page tables
>>> + * when splitting huge pages, but this split would have already
>>> + * happened in folio_zap_direct_map() in kvm_gmem_folio_zap_direct_map().
Do you know if folio_restore_direct_map() will also end up merging page
table entries to a higher level?
>>> + * Thus folio_restore_direct_map() here only updates prot bits.
>>> + */
>>> + if (kvm_gmem_folio_no_direct_map(folio)) {
>>> + WARN_ON_ONCE(folio_restore_direct_map(folio));
>>> + folio->private = (void *)((u64)folio->private & ~KVM_GMEM_FOLIO_NO_DIRECT_MAP);
>>> + }
>>> +}
>>> +
>>
>> Does this assume the folio would not have been split after it was zapped? As in,
>> if it was zapped at 2MB granularity (no 4KB direct map split required) but then
>> restored at 4KB (split required)? Or it gets merged somehow before this?
I agree with the rest of the discussion that this will probably land
before huge page support, so I will have to figure out the intersection
of the two later.
>
> AFAIK it can't be zapped at 2MB granularity as the zapping code will
> inevitably cause splitting because guest_memfd faults occur at the base
> page granularity as of now.
Here's what I'm thinking for now:
[HugeTLB, no conversions]
With initial HugeTLB support (no conversions), host userspace
guest_memfd faults will be:
+ For guest_memfd with PUD-sized pages
+ At PUD level or PTE level
+ For guest_memfd with PMD-sized pages
+ At PMD level or PTE level
Since this guest_memfd doesn't support conversions, the folio is never
split/merged, so the direct map is restored at whatever level it was
zapped. I think this works out well.
[HugeTLB + conversions]
For a guest_memfd with HugeTLB support and conversions, host userspace
guest_memfd faults will always be at PTE level, so the direct map will
be split and the faulted pages have the direct map zapped in 4K chunks
as they are faulted.
On conversion back to private, put those back into the direct map
(putting aside whether to merge the direct map PTEs for now).
Unfortunately there's no unmapping callback for guest_memfd to use, so
perhaps the principle should be to put the folios back into the direct
map ASAP - at unmapping if guest_memfd is doing the unmapping, otherwise
at freeing time?
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-22 18:37 ` Ackerley Tng
@ 2026-01-22 18:47 ` Nikita Kalyazin
2026-01-26 16:56 ` Nikita Kalyazin
1 sibling, 0 replies; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-22 18:47 UTC (permalink / raw)
To: Ackerley Tng, Edgecombe, Rick P, linux-riscv, kalyazin, kernel,
linux-kselftest, linux-mm, linux-fsdevel, linux-s390, kvmarm,
linux-kernel, linux-arm-kernel, kvm, bpf, linux-doc, loongarch
Cc: david, palmer, catalin.marinas, svens, jgross, surenb, riel,
pfalcato, peterx, x86, rppt, thuth, maz, dave.hansen, ast,
vbabka, Annapurve, Vishal, borntraeger, alex, pjw, tglx, willy,
hca, wyihan, ryan.roberts, jolsa, yang, jmattson, luto,
aneesh.kumar, haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk,
jgg, hpa, song, oupton, peterz, maobibo, lorenzo.stoakes,
Liam.Howlett, jthoughton, martin.lau, jhubbard, Yu, Yu-cheng,
Jonathan.Cameron, eddyz87, yonghong.song, chenhuacai, shuah,
prsampat, kevin.brodsky, shijie, suzuki.poulose, itazur,
pbonzini, yuzenghui, dev.jain, gor, jackabt, daniel, agordeev,
andrii, mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, jannh, john.fastabend, kas, will, seanjc
On 22/01/2026 18:37, Ackerley Tng wrote:
> Nikita Kalyazin <kalyazin@amazon.com> writes:
>
>> On 16/01/2026 00:00, Edgecombe, Rick P wrote:
>>> On Wed, 2026-01-14 at 13:46 +0000, Kalyazin, Nikita wrote:
>>>> +static void kvm_gmem_folio_restore_direct_map(struct folio *folio)
>>>> +{
>>>> + /*
>>>> + * Direct map restoration cannot fail, as the only error condition
>>>> + * for direct map manipulation is failure to allocate page tables
>>>> + * when splitting huge pages, but this split would have already
>>>> + * happened in folio_zap_direct_map() in kvm_gmem_folio_zap_direct_map().
>
> Do you know if folio_restore_direct_map() will also end up merging page
> table entries to a higher level?
By looking at the callchain in x86 at least, I can't see how it would.
>
>>>> + * Thus folio_restore_direct_map() here only updates prot bits.
>>>> + */
>>>> + if (kvm_gmem_folio_no_direct_map(folio)) {
>>>> + WARN_ON_ONCE(folio_restore_direct_map(folio));
>>>> + folio->private = (void *)((u64)folio->private & ~KVM_GMEM_FOLIO_NO_DIRECT_MAP);
>>>> + }
>>>> +}
>>>> +
>>>
>>> Does this assume the folio would not have been split after it was zapped? As in,
>>> if it was zapped at 2MB granularity (no 4KB direct map split required) but then
>>> restored at 4KB (split required)? Or it gets merged somehow before this?
>
> I agree with the rest of the discussion that this will probably land
> before huge page support, so I will have to figure out the intersection
> of the two later.
>
>>
>> AFAIK it can't be zapped at 2MB granularity as the zapping code will
>> inevitably cause splitting because guest_memfd faults occur at the base
>> page granularity as of now.
>
> Here's what I'm thinking for now:
>
> [HugeTLB, no conversions]
> With initial HugeTLB support (no conversions), host userspace
> guest_memfd faults will be:
>
> + For guest_memfd with PUD-sized pages
> + At PUD level or PTE level
> + For guest_memfd with PMD-sized pages
> + At PMD level or PTE level
>
> Since this guest_memfd doesn't support conversions, the folio is never
> split/merged, so the direct map is restored at whatever level it was
> zapped. I think this works out well.
>
> [HugeTLB + conversions]
> For a guest_memfd with HugeTLB support and conversions, host userspace
> guest_memfd faults will always be at PTE level, so the direct map will
> be split and the faulted pages have the direct map zapped in 4K chunks
> as they are faulted.
>
> On conversion back to private, put those back into the direct map
> (putting aside whether to merge the direct map PTEs for now).
>
>
> Unfortunately there's no unmapping callback for guest_memfd to use, so
> perhaps the principle should be to put the folios back into the direct
> map ASAP - at unmapping if guest_memfd is doing the unmapping, otherwise
> at freeing time?
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-22 18:37 ` Ackerley Tng
2026-01-22 18:47 ` Nikita Kalyazin
@ 2026-01-26 16:56 ` Nikita Kalyazin
2026-01-28 0:21 ` Ackerley Tng
1 sibling, 1 reply; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-26 16:56 UTC (permalink / raw)
To: Ackerley Tng, Edgecombe, Rick P, linux-riscv, kalyazin, kernel,
linux-kselftest, linux-mm, linux-fsdevel, linux-s390, kvmarm,
linux-kernel, linux-arm-kernel, kvm, bpf, linux-doc, loongarch
Cc: david, palmer, catalin.marinas, svens, jgross, surenb, riel,
pfalcato, peterx, x86, rppt, thuth, maz, dave.hansen, ast,
vbabka, Annapurve, Vishal, borntraeger, alex, pjw, tglx, willy,
hca, wyihan, ryan.roberts, jolsa, yang, jmattson, luto,
aneesh.kumar, haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk,
jgg, hpa, song, oupton, peterz, maobibo, lorenzo.stoakes,
Liam.Howlett, jthoughton, martin.lau, jhubbard, Yu, Yu-cheng,
Jonathan.Cameron, eddyz87, yonghong.song, chenhuacai, shuah,
prsampat, kevin.brodsky, shijie, suzuki.poulose, itazur,
pbonzini, yuzenghui, dev.jain, gor, jackabt, daniel, agordeev,
andrii, mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, jannh, john.fastabend, kas, will, seanjc
On 22/01/2026 18:37, Ackerley Tng wrote:
> Nikita Kalyazin <kalyazin@amazon.com> writes:
>
>> On 16/01/2026 00:00, Edgecombe, Rick P wrote:
>>> On Wed, 2026-01-14 at 13:46 +0000, Kalyazin, Nikita wrote:
>>>> +static void kvm_gmem_folio_restore_direct_map(struct folio *folio)
>>>> +{
>>>> + /*
>>>> + * Direct map restoration cannot fail, as the only error condition
>>>> + * for direct map manipulation is failure to allocate page tables
>>>> + * when splitting huge pages, but this split would have already
>>>> + * happened in folio_zap_direct_map() in kvm_gmem_folio_zap_direct_map().
>
> Do you know if folio_restore_direct_map() will also end up merging page
> table entries to a higher level?
>
>>>> + * Thus folio_restore_direct_map() here only updates prot bits.
>>>> + */
>>>> + if (kvm_gmem_folio_no_direct_map(folio)) {
>>>> + WARN_ON_ONCE(folio_restore_direct_map(folio));
>>>> + folio->private = (void *)((u64)folio->private & ~KVM_GMEM_FOLIO_NO_DIRECT_MAP);
>>>> + }
>>>> +}
>>>> +
>>>
>>> Does this assume the folio would not have been split after it was zapped? As in,
>>> if it was zapped at 2MB granularity (no 4KB direct map split required) but then
>>> restored at 4KB (split required)? Or it gets merged somehow before this?
>
> I agree with the rest of the discussion that this will probably land
> before huge page support, so I will have to figure out the intersection
> of the two later.
>
>>
>> AFAIK it can't be zapped at 2MB granularity as the zapping code will
>> inevitably cause splitting because guest_memfd faults occur at the base
>> page granularity as of now.
>
> Here's what I'm thinking for now:
>
> [HugeTLB, no conversions]
> With initial HugeTLB support (no conversions), host userspace
> guest_memfd faults will be:
>
> + For guest_memfd with PUD-sized pages
> + At PUD level or PTE level
> + For guest_memfd with PMD-sized pages
> + At PMD level or PTE level
>
> Since this guest_memfd doesn't support conversions, the folio is never
> split/merged, so the direct map is restored at whatever level it was
> zapped. I think this works out well.
>
> [HugeTLB + conversions]
> For a guest_memfd with HugeTLB support and conversions, host userspace
> guest_memfd faults will always be at PTE level, so the direct map will
> be split and the faulted pages have the direct map zapped in 4K chunks
> as they are faulted.
>
> On conversion back to private, put those back into the direct map
> (putting aside whether to merge the direct map PTEs for now).
Makes sense to me.
>
>
> Unfortunately there's no unmapping callback for guest_memfd to use, so
> perhaps the principle should be to put the folios back into the direct
> map ASAP - at unmapping if guest_memfd is doing the unmapping, otherwise
> at freeing time?
I'm not sure I fully understand what you mean here. What would be the
purpose of hooking into unmapping? Why would making sure we put
folios back into the direct map whenever they are freed or converted to
private not be sufficient?
^ permalink raw reply [flat|nested] 62+ messages in thread
* Re: [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map
2026-01-26 16:56 ` Nikita Kalyazin
@ 2026-01-28 0:21 ` Ackerley Tng
0 siblings, 0 replies; 62+ messages in thread
From: Ackerley Tng @ 2026-01-28 0:21 UTC (permalink / raw)
To: kalyazin, Edgecombe, Rick P, linux-riscv, kalyazin, kernel,
linux-kselftest, linux-mm, linux-fsdevel, linux-s390, kvmarm,
linux-kernel, linux-arm-kernel, kvm, bpf, linux-doc, loongarch
Cc: david, palmer, catalin.marinas, svens, jgross, surenb, riel,
pfalcato, peterx, x86, rppt, thuth, maz, dave.hansen, ast,
vbabka, Annapurve, Vishal, borntraeger, alex, pjw, tglx, willy,
hca, wyihan, ryan.roberts, jolsa, yang, jmattson, luto,
aneesh.kumar, haoluo, patrick.roy, akpm, coxu, mhocko, mlevitsk,
jgg, hpa, song, oupton, peterz, maobibo, lorenzo.stoakes,
Liam.Howlett, jthoughton, martin.lau, jhubbard, Yu, Yu-cheng,
Jonathan.Cameron, eddyz87, yonghong.song, chenhuacai, shuah,
prsampat, kevin.brodsky, shijie, suzuki.poulose, itazur,
pbonzini, yuzenghui, dev.jain, gor, jackabt, daniel, agordeev,
andrii, mingo, aou, joey.gouly, derekmn, xmarcalx, kpsingh, sdf,
jackmanb, bp, corbet, jannh, john.fastabend, kas, will, seanjc
Nikita Kalyazin <kalyazin@amazon.com> writes:
> On 22/01/2026 18:37, Ackerley Tng wrote:
>> Nikita Kalyazin <kalyazin@amazon.com> writes:
>>
>>> On 16/01/2026 00:00, Edgecombe, Rick P wrote:
>>>> On Wed, 2026-01-14 at 13:46 +0000, Kalyazin, Nikita wrote:
>>>>> +static void kvm_gmem_folio_restore_direct_map(struct folio *folio)
>>>>> +{
>>>>> + /*
>>>>> + * Direct map restoration cannot fail, as the only error condition
>>>>> + * for direct map manipulation is failure to allocate page tables
>>>>> + * when splitting huge pages, but this split would have already
>>>>> + * happened in folio_zap_direct_map() in kvm_gmem_folio_zap_direct_map().
>>
>> Do you know if folio_restore_direct_map() will also end up merging page
>> table entries to a higher level?
>>
>>>>> + * Thus folio_restore_direct_map() here only updates prot bits.
>>>>> + */
>>>>> + if (kvm_gmem_folio_no_direct_map(folio)) {
>>>>> + WARN_ON_ONCE(folio_restore_direct_map(folio));
>>>>> + folio->private = (void *)((u64)folio->private & ~KVM_GMEM_FOLIO_NO_DIRECT_MAP);
>>>>> + }
>>>>> +}
>>>>> +
>>>>
>>>> Does this assume the folio would not have been split after it was zapped? As in,
>>>> if it was zapped at 2MB granularity (no 4KB direct map split required) but then
>>>> restored at 4KB (split required)? Or it gets merged somehow before this?
>>
>> I agree with the rest of the discussion that this will probably land
>> before huge page support, so I will have to figure out the intersection
>> of the two later.
>>
>>>
>>> AFAIK it can't be zapped at 2MB granularity as the zapping code will
>>> inevitably cause splitting because guest_memfd faults occur at the base
>>> page granularity as of now.
>>
>> Here's what I'm thinking for now:
>>
>> [HugeTLB, no conversions]
>> With initial HugeTLB support (no conversions), host userspace
>> guest_memfd faults will be:
>>
>> + For guest_memfd with PUD-sized pages
>> + At PUD level or PTE level
>> + For guest_memfd with PMD-sized pages
>> + At PMD level or PTE level
>>
>> Since this guest_memfd doesn't support conversions, the folio is never
>> split/merged, so the direct map is restored at whatever level it was
>> zapped. I think this works out well.
>>
>> [HugeTLB + conversions]
>> For a guest_memfd with HugeTLB support and conversions, host userspace
>> guest_memfd faults will always be at PTE level, so the direct map will
>> be split and the faulted pages have the direct map zapped in 4K chunks
>> as they are faulted.
>>
>> On conversion back to private, put those back into the direct map
>> (putting aside whether to merge the direct map PTEs for now).
>
> Makes sense to me.
>
>>
>>
>> Unfortunately there's no unmapping callback for guest_memfd to use, so
>> perhaps the principle should be to put the folios back into the direct
>> map ASAP - at unmapping if guest_memfd is doing the unmapping, otherwise
>> at freeing time?
>
> I'm not sure I fully understand what you mean here. What would be the
> purpose for hooking up to unmapping? Why would making sure we put
> folios back into the direct map whenever they are freed or converted to
> private not be sufficient?
I think putting the folios back into the direct map when the folios are
freed or converted to private should cover all cases.
I was just thinking that being able to hook up to unmapping is nice
since unmapping is the counterpart to mapping when the folios are
removed from the direct map.
^ permalink raw reply [flat|nested] 62+ messages in thread
* [PATCH v9 08/13] KVM: selftests: load elf via bounce buffer
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
` (6 preceding siblings ...)
2026-01-14 13:46 ` [PATCH v9 07/13] KVM: guest_memfd: Add flag to remove from direct map Kalyazin, Nikita
@ 2026-01-14 13:46 ` Kalyazin, Nikita
2026-01-14 13:46 ` [PATCH v9 09/13] KVM: selftests: set KVM_MEM_GUEST_MEMFD in vm_mem_add() if guest_memfd != -1 Kalyazin, Nikita
` (4 subsequent siblings)
12 siblings, 0 replies; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:46 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Patrick Roy <patrick.roy@linux.dev>
If guest memory is backed using a VMA that does not allow GUP (e.g. a
userspace mapping of guest_memfd when the fd was allocated using
GUEST_MEMFD_FLAG_NO_DIRECT_MAP), then directly loading the test ELF
binary into it via read(2) potentially does not work. To nevertheless
support loading binaries in these cases, do the read(2) syscall using a
bounce buffer, and then memcpy from the bounce buffer into guest memory.
Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
.../testing/selftests/kvm/include/test_util.h | 1 +
tools/testing/selftests/kvm/lib/elf.c | 8 +++----
tools/testing/selftests/kvm/lib/io.c | 23 +++++++++++++++++++
3 files changed, 28 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index b4872ba8ed12..8140e59b59e5 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -48,6 +48,7 @@ do { \
ssize_t test_write(int fd, const void *buf, size_t count);
ssize_t test_read(int fd, void *buf, size_t count);
+ssize_t test_read_bounce(int fd, void *buf, size_t count);
int test_seq_read(const char *path, char **bufp, size_t *sizep);
void __printf(5, 6) test_assert(bool exp, const char *exp_str,
diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c
index f34d926d9735..e829fbe0a11e 100644
--- a/tools/testing/selftests/kvm/lib/elf.c
+++ b/tools/testing/selftests/kvm/lib/elf.c
@@ -31,7 +31,7 @@ static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp)
* the real size of the ELF header.
*/
unsigned char ident[EI_NIDENT];
- test_read(fd, ident, sizeof(ident));
+ test_read_bounce(fd, ident, sizeof(ident));
TEST_ASSERT((ident[EI_MAG0] == ELFMAG0) && (ident[EI_MAG1] == ELFMAG1)
&& (ident[EI_MAG2] == ELFMAG2) && (ident[EI_MAG3] == ELFMAG3),
"ELF MAGIC Mismatch,\n"
@@ -79,7 +79,7 @@ static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp)
offset_rv = lseek(fd, 0, SEEK_SET);
TEST_ASSERT(offset_rv == 0, "Seek to ELF header failed,\n"
" rv: %zi expected: %i", offset_rv, 0);
- test_read(fd, hdrp, sizeof(*hdrp));
+ test_read_bounce(fd, hdrp, sizeof(*hdrp));
TEST_ASSERT(hdrp->e_phentsize == sizeof(Elf64_Phdr),
"Unexpected physical header size,\n"
" hdrp->e_phentsize: %x\n"
@@ -146,7 +146,7 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename)
/* Read in the program header. */
Elf64_Phdr phdr;
- test_read(fd, &phdr, sizeof(phdr));
+ test_read_bounce(fd, &phdr, sizeof(phdr));
/* Skip if this header doesn't describe a loadable segment. */
if (phdr.p_type != PT_LOAD)
@@ -187,7 +187,7 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename)
" expected: 0x%jx",
n1, errno, (intmax_t) offset_rv,
(intmax_t) phdr.p_offset);
- test_read(fd, addr_gva2hva(vm, phdr.p_vaddr),
+ test_read_bounce(fd, addr_gva2hva(vm, phdr.p_vaddr),
phdr.p_filesz);
}
}
diff --git a/tools/testing/selftests/kvm/lib/io.c b/tools/testing/selftests/kvm/lib/io.c
index fedb2a741f0b..74419becc8bc 100644
--- a/tools/testing/selftests/kvm/lib/io.c
+++ b/tools/testing/selftests/kvm/lib/io.c
@@ -155,3 +155,26 @@ ssize_t test_read(int fd, void *buf, size_t count)
return num_read;
}
+
+/* Test read via intermediary buffer
+ *
+ * Same as test_read, except read(2)s happen into a bounce buffer that is memcpy'd
+ * to buf. For use with buffers that cannot be GUP'd (e.g. guest_memfd VMAs if
+ * guest_memfd was created with GUEST_MEMFD_FLAG_NO_DIRECT_MAP).
+ */
+ssize_t test_read_bounce(int fd, void *buf, size_t count)
+{
+ void *bounce_buffer;
+ ssize_t num_read;
+
+ TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count);
+
+ bounce_buffer = malloc(count);
+ TEST_ASSERT(bounce_buffer != NULL, "Failed to allocate bounce buffer");
+
+ num_read = test_read(fd, bounce_buffer, count);
+ memcpy(buf, bounce_buffer, num_read);
+ free(bounce_buffer);
+
+ return num_read;
+}
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* [PATCH v9 09/13] KVM: selftests: set KVM_MEM_GUEST_MEMFD in vm_mem_add() if guest_memfd != -1
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
` (7 preceding siblings ...)
2026-01-14 13:46 ` [PATCH v9 08/13] KVM: selftests: load elf via bounce buffer Kalyazin, Nikita
@ 2026-01-14 13:46 ` Kalyazin, Nikita
2026-01-15 19:39 ` Ackerley Tng
2026-01-14 13:47 ` [PATCH v9 10/13] KVM: selftests: Add guest_memfd based vm_mem_backing_src_types Kalyazin, Nikita
` (3 subsequent siblings)
12 siblings, 1 reply; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:46 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Patrick Roy <patrick.roy@linux.dev>
Have vm_mem_add() always set KVM_MEM_GUEST_MEMFD in the memslot flags if
a guest_memfd is passed in as an argument. This eliminates the
possibility where a guest_memfd instance is passed to vm_mem_add(), but
it ends up being ignored because the flags argument does not specify
KVM_MEM_GUEST_MEMFD at the same time.
This makes it easy to support more scenarios in which vm_mem_add() is
not passed a guest_memfd instance, but is expected to allocate one.
Currently, this only happens if guest_memfd == -1 but flags &
KVM_MEM_GUEST_MEMFD != 0, but later vm_mem_add() will gain support for
loading the test code itself into guest_memfd (via
GUEST_MEMFD_FLAG_MMAP) if requested via a special
vm_mem_backing_src_type, at which point having to make sure the src_type
and flags are in-sync becomes cumbersome.
Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
tools/testing/selftests/kvm/lib/kvm_util.c | 24 +++++++++++++---------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 8279b6ced8d2..56ddbca91850 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1057,21 +1057,25 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
region->backing_src_type = src_type;
- if (flags & KVM_MEM_GUEST_MEMFD) {
- if (guest_memfd < 0) {
+ if (guest_memfd < 0) {
+ if (flags & KVM_MEM_GUEST_MEMFD) {
uint32_t guest_memfd_flags = 0;
TEST_ASSERT(!guest_memfd_offset,
"Offset must be zero when creating new guest_memfd");
guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
- } else {
- /*
- * Install a unique fd for each memslot so that the fd
- * can be closed when the region is deleted without
- * needing to track if the fd is owned by the framework
- * or by the caller.
- */
- guest_memfd = kvm_dup(guest_memfd);
}
+ } else {
+ /*
+ * Install a unique fd for each memslot so that the fd
+ * can be closed when the region is deleted without
+ * needing to track if the fd is owned by the framework
+ * or by the caller.
+ */
+ guest_memfd = kvm_dup(guest_memfd);
+ }
+
+ if (guest_memfd > 0) {
+ flags |= KVM_MEM_GUEST_MEMFD;
region->region.guest_memfd = guest_memfd;
region->region.guest_memfd_offset = guest_memfd_offset;
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* Re: [PATCH v9 09/13] KVM: selftests: set KVM_MEM_GUEST_MEMFD in vm_mem_add() if guest_memfd != -1
2026-01-14 13:46 ` [PATCH v9 09/13] KVM: selftests: set KVM_MEM_GUEST_MEMFD in vm_mem_add() if guest_memfd != -1 Kalyazin, Nikita
@ 2026-01-15 19:39 ` Ackerley Tng
2026-01-16 15:00 ` Nikita Kalyazin
0 siblings, 1 reply; 62+ messages in thread
From: Ackerley Tng @ 2026-01-15 19:39 UTC (permalink / raw)
To: Kalyazin, Nikita, kvm, linux-doc, linux-kernel, linux-arm-kernel,
kvmarm, linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
"Kalyazin, Nikita" <kalyazin@amazon.co.uk> writes:
> From: Patrick Roy <patrick.roy@linux.dev>
>
> Have vm_mem_add() always set KVM_MEM_GUEST_MEMFD in the memslot flags if
> a guest_memfd is passed in as an argument. This eliminates the
> possibility where a guest_memfd instance is passed to vm_mem_add(), but
> it ends up being ignored because the flags argument does not specify
> KVM_MEM_GUEST_MEMFD at the same time.
>
> This makes it easy to support more scenarios in which vm_mem_add() is
> not passed a guest_memfd instance, but is expected to allocate one.
> Currently, this only happens if guest_memfd == -1 but flags &
> KVM_MEM_GUEST_MEMFD != 0, but later vm_mem_add() will gain support for
> loading the test code itself into guest_memfd (via
> GUEST_MEMFD_FLAG_MMAP) if requested via a special
> vm_mem_backing_src_type, at which point having to make sure the src_type
> and flags are in-sync becomes cumbersome.
>
> Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
> ---
> tools/testing/selftests/kvm/lib/kvm_util.c | 24 +++++++++++++---------
> 1 file changed, 14 insertions(+), 10 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
> index 8279b6ced8d2..56ddbca91850 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -1057,21 +1057,25 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>
> region->backing_src_type = src_type;
>
> - if (flags & KVM_MEM_GUEST_MEMFD) {
> - if (guest_memfd < 0) {
> + if (guest_memfd < 0) {
> + if (flags & KVM_MEM_GUEST_MEMFD) {
> uint32_t guest_memfd_flags = 0;
> TEST_ASSERT(!guest_memfd_offset,
> "Offset must be zero when creating new guest_memfd");
> guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
> - } else {
> - /*
> - * Install a unique fd for each memslot so that the fd
> - * can be closed when the region is deleted without
> - * needing to track if the fd is owned by the framework
> - * or by the caller.
> - */
> - guest_memfd = kvm_dup(guest_memfd);
> }
> + } else {
> + /*
> + * Install a unique fd for each memslot so that the fd
> + * can be closed when the region is deleted without
> + * needing to track if the fd is owned by the framework
> + * or by the caller.
> + */
> + guest_memfd = kvm_dup(guest_memfd);
> + }
> +
> + if (guest_memfd > 0) {
Might 0 turn out to be a valid return from dup() for a guest_memfd?
> + flags |= KVM_MEM_GUEST_MEMFD;
>
> region->region.guest_memfd = guest_memfd;
> region->region.guest_memfd_offset = guest_memfd_offset;
Refactoring vm_mem_add() (/* FIXME: This thing needs to be ripped apart
and rewritten. */) should probably be a separate patch series, but I'd
like to take this opportunity to ask: Sean, what do you have in mind for
the rewritten version?
Would it be something like struct vm_shape, where there are default
mem_shapes, and the shapes get validated and then passed to
vm_mem_add()?
> --
> 2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* Re: [PATCH v9 09/13] KVM: selftests: set KVM_MEM_GUEST_MEMFD in vm_mem_add() if guest_memfd != -1
2026-01-15 19:39 ` Ackerley Tng
@ 2026-01-16 15:00 ` Nikita Kalyazin
0 siblings, 0 replies; 62+ messages in thread
From: Nikita Kalyazin @ 2026-01-16 15:00 UTC (permalink / raw)
To: Ackerley Tng, Kalyazin, Nikita, kvm, linux-doc, linux-kernel,
linux-arm-kernel, kvmarm, linux-fsdevel, linux-mm, bpf,
linux-kselftest, kernel, linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, maobibo, prsampat,
mlevitsk, jmattson, jthoughton, agordeev, alex, aou, borntraeger,
chenhuacai, dev.jain, gor, hca, Jonathan.Cameron, palmer, pjw,
shijie, svens, thuth, wyihan, yang, vannapurve, jackmanb,
aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri, Takahiro,
Manwaring, Derek, Cali, Marco
On 15/01/2026 19:39, Ackerley Tng wrote:
> "Kalyazin, Nikita" <kalyazin@amazon.co.uk> writes:
>
>> From: Patrick Roy <patrick.roy@linux.dev>
>>
>> Have vm_mem_add() always set KVM_MEM_GUEST_MEMFD in the memslot flags if
>> a guest_memfd is passed in as an argument. This eliminates the
>> possibility where a guest_memfd instance is passed to vm_mem_add(), but
>> it ends up being ignored because the flags argument does not specify
>> KVM_MEM_GUEST_MEMFD at the same time.
>>
>> This makes it easy to support more scenarios in which vm_mem_add() is
>> not passed a guest_memfd instance, but is expected to allocate one.
>> Currently, this only happens if guest_memfd == -1 but flags &
>> KVM_MEM_GUEST_MEMFD != 0, but later vm_mem_add() will gain support for
>> loading the test code itself into guest_memfd (via
>> GUEST_MEMFD_FLAG_MMAP) if requested via a special
>> vm_mem_backing_src_type, at which point having to make sure the src_type
>> and flags are in-sync becomes cumbersome.
>>
>> Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
>> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
>> ---
>> tools/testing/selftests/kvm/lib/kvm_util.c | 24 +++++++++++++---------
>> 1 file changed, 14 insertions(+), 10 deletions(-)
>>
>> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
>> index 8279b6ced8d2..56ddbca91850 100644
>> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
>> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
>> @@ -1057,21 +1057,25 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>>
>> region->backing_src_type = src_type;
>>
>> - if (flags & KVM_MEM_GUEST_MEMFD) {
>> - if (guest_memfd < 0) {
>> + if (guest_memfd < 0) {
>> + if (flags & KVM_MEM_GUEST_MEMFD) {
>> uint32_t guest_memfd_flags = 0;
>> TEST_ASSERT(!guest_memfd_offset,
>> "Offset must be zero when creating new guest_memfd");
>> guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
>> - } else {
>> - /*
>> - * Install a unique fd for each memslot so that the fd
>> - * can be closed when the region is deleted without
>> - * needing to track if the fd is owned by the framework
>> - * or by the caller.
>> - */
>> - guest_memfd = kvm_dup(guest_memfd);
>> }
>> + } else {
>> + /*
>> + * Install a unique fd for each memslot so that the fd
>> + * can be closed when the region is deleted without
>> + * needing to track if the fd is owned by the framework
>> + * or by the caller.
>> + */
>> + guest_memfd = kvm_dup(guest_memfd);
>> + }
>> +
>> + if (guest_memfd > 0) {
>
> Might 0 turn out to be a valid return from dup() for a guest_memfd?
Yes, you're right, it isn't impossible. Thanks!
>
>> + flags |= KVM_MEM_GUEST_MEMFD;
>>
>> region->region.guest_memfd = guest_memfd;
>> region->region.guest_memfd_offset = guest_memfd_offset;
>
> Refactoring vm_mem_add() (/* FIXME: This thing needs to be ripped apart
> and rewritten. */) should probably be a separate patch series, but I'd
> like to take this opportunity to ask: Sean, what do you have in mind for
> the rewritten version?
>
> Would it be something like struct vm_shape, where there are default
> mem_shapes, and the shapes get validated and then passed to
> vm_mem_add()?
>
>> --
>> 2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread
* [PATCH v9 10/13] KVM: selftests: Add guest_memfd based vm_mem_backing_src_types
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
` (8 preceding siblings ...)
2026-01-14 13:46 ` [PATCH v9 09/13] KVM: selftests: set KVM_MEM_GUEST_MEMFD in vm_mem_add() if guest_memfd != -1 Kalyazin, Nikita
@ 2026-01-14 13:47 ` Kalyazin, Nikita
2026-01-14 13:47 ` [PATCH v9 11/13] KVM: selftests: cover GUEST_MEMFD_FLAG_NO_DIRECT_MAP in existing selftests Kalyazin, Nikita
` (2 subsequent siblings)
12 siblings, 0 replies; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:47 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Patrick Roy <patrick.roy@linux.dev>
Allow selftests to configure their memslots such that userspace_addr is
set to a MAP_SHARED mapping of the guest_memfd that's associated with
the memslot. This setup is the configuration for non-CoCo VMs, where all
guest memory is backed by a guest_memfd whose folios are all marked
shared, but KVM is still able to access guest memory to provide
functionality such as MMIO emulation on x86.
Add backing types for normal guest_memfd, as well as direct map removed
guest_memfd.
Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
.../testing/selftests/kvm/include/kvm_util.h | 18 ++++++
.../testing/selftests/kvm/include/test_util.h | 7 +++
tools/testing/selftests/kvm/lib/kvm_util.c | 61 ++++++++++---------
tools/testing/selftests/kvm/lib/test_util.c | 8 +++
4 files changed, 65 insertions(+), 29 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 81f4355ff28a..6689b43810c1 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -641,6 +641,24 @@ static inline bool is_smt_on(void)
void vm_create_irqchip(struct kvm_vm *vm);
+static inline uint32_t backing_src_guest_memfd_flags(enum vm_mem_backing_src_type t)
+{
+ uint32_t flags = 0;
+
+ switch (t) {
+ case VM_MEM_SRC_GUEST_MEMFD_NO_DIRECT_MAP:
+ flags |= GUEST_MEMFD_FLAG_NO_DIRECT_MAP;
+ fallthrough;
+ case VM_MEM_SRC_GUEST_MEMFD:
+ flags |= GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED;
+ break;
+ default:
+ break;
+ }
+
+ return flags;
+}
+
static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size,
uint64_t flags)
{
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index 8140e59b59e5..ea6de20ce8ef 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -152,6 +152,8 @@ enum vm_mem_backing_src_type {
VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB,
VM_MEM_SRC_SHMEM,
VM_MEM_SRC_SHARED_HUGETLB,
+ VM_MEM_SRC_GUEST_MEMFD,
+ VM_MEM_SRC_GUEST_MEMFD_NO_DIRECT_MAP,
NUM_SRC_TYPES,
};
@@ -184,6 +186,11 @@ static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t)
return vm_mem_backing_src_alias(t)->flag & MAP_SHARED;
}
+static inline bool backing_src_is_guest_memfd(enum vm_mem_backing_src_type t)
+{
+ return t == VM_MEM_SRC_GUEST_MEMFD || t == VM_MEM_SRC_GUEST_MEMFD_NO_DIRECT_MAP;
+}
+
static inline bool backing_src_can_be_huge(enum vm_mem_backing_src_type t)
{
return t != VM_MEM_SRC_ANONYMOUS && t != VM_MEM_SRC_SHMEM;
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 56ddbca91850..28ee51253909 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1013,6 +1013,33 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
alignment = 1;
#endif
+ if (guest_memfd < 0) {
+ if ((flags & KVM_MEM_GUEST_MEMFD) || backing_src_is_guest_memfd(src_type)) {
+ uint32_t guest_memfd_flags = backing_src_guest_memfd_flags(src_type);
+
+ TEST_ASSERT(!guest_memfd_offset,
+ "Offset must be zero when creating new guest_memfd");
+ guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
+ }
+ } else {
+ /*
+ * Install a unique fd for each memslot so that the fd
+ * can be closed when the region is deleted without
+ * needing to track if the fd is owned by the framework
+ * or by the caller.
+ */
+ guest_memfd = kvm_dup(guest_memfd);
+ }
+
+ if (guest_memfd > 0) {
+ flags |= KVM_MEM_GUEST_MEMFD;
+
+ region->region.guest_memfd = guest_memfd;
+ region->region.guest_memfd_offset = guest_memfd_offset;
+ } else {
+ region->region.guest_memfd = -1;
+ }
+
/*
* When using THP mmap is not guaranteed to returned a hugepage aligned
* address so we have to pad the mmap. Padding is not needed for HugeTLB
@@ -1028,10 +1055,13 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
if (alignment > 1)
region->mmap_size += alignment;
- region->fd = -1;
- if (backing_src_is_shared(src_type))
+ if (backing_src_is_guest_memfd(src_type))
+ region->fd = guest_memfd;
+ else if (backing_src_is_shared(src_type))
region->fd = kvm_memfd_alloc(region->mmap_size,
src_type == VM_MEM_SRC_SHARED_HUGETLB);
+ else
+ region->fd = -1;
region->mmap_start = kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
vm_mem_backing_src_alias(src_type)->flag,
@@ -1056,33 +1086,6 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
}
region->backing_src_type = src_type;
-
- if (guest_memfd < 0) {
- if (flags & KVM_MEM_GUEST_MEMFD) {
- uint32_t guest_memfd_flags = 0;
- TEST_ASSERT(!guest_memfd_offset,
- "Offset must be zero when creating new guest_memfd");
- guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
- }
- } else {
- /*
- * Install a unique fd for each memslot so that the fd
- * can be closed when the region is deleted without
- * needing to track if the fd is owned by the framework
- * or by the caller.
- */
- guest_memfd = kvm_dup(guest_memfd);
- }
-
- if (guest_memfd > 0) {
- flags |= KVM_MEM_GUEST_MEMFD;
-
- region->region.guest_memfd = guest_memfd;
- region->region.guest_memfd_offset = guest_memfd_offset;
- } else {
- region->region.guest_memfd = -1;
- }
-
region->unused_phy_pages = sparsebit_alloc();
if (vm_arch_has_protected_memory(vm))
region->protected_phy_pages = sparsebit_alloc();
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index 8a1848586a85..ce9fe0271515 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -306,6 +306,14 @@ const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i)
*/
.flag = MAP_SHARED,
},
+ [VM_MEM_SRC_GUEST_MEMFD] = {
+ .name = "guest_memfd",
+ .flag = MAP_SHARED,
+ },
+ [VM_MEM_SRC_GUEST_MEMFD_NO_DIRECT_MAP] = {
+ .name = "guest_memfd_no_direct_map",
+ .flag = MAP_SHARED,
+ }
};
_Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES,
"Missing new backing src types?");
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* [PATCH v9 11/13] KVM: selftests: cover GUEST_MEMFD_FLAG_NO_DIRECT_MAP in existing selftests
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
` (9 preceding siblings ...)
2026-01-14 13:47 ` [PATCH v9 10/13] KVM: selftests: Add guest_memfd based vm_mem_backing_src_types Kalyazin, Nikita
@ 2026-01-14 13:47 ` Kalyazin, Nikita
2026-01-14 13:47 ` [PATCH v9 12/13] KVM: selftests: stuff vm_mem_backing_src_type into vm_shape Kalyazin, Nikita
2026-01-14 13:47 ` [PATCH v9 13/13] KVM: selftests: Test guest execution from direct map removed gmem Kalyazin, Nikita
12 siblings, 0 replies; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:47 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Patrick Roy <patrick.roy@linux.dev>
Extend mem conversion selftests to cover the scenario that the guest can
fault in and write gmem-backed guest memory even if its direct map is
removed. Also cover the new flag in guest_memfd_test.c tests.
Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
tools/testing/selftests/kvm/guest_memfd_test.c | 17 ++++++++++++++++-
.../kvm/x86/private_mem_conversions_test.c | 7 ++++---
2 files changed, 20 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
index 618c937f3c90..9615018a1a67 100644
--- a/tools/testing/selftests/kvm/guest_memfd_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_test.c
@@ -403,6 +403,17 @@ static void test_guest_memfd(unsigned long vm_type)
__test_guest_memfd(vm, GUEST_MEMFD_FLAG_MMAP |
GUEST_MEMFD_FLAG_INIT_SHARED);
+ if (flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP) {
+ __test_guest_memfd(vm, GUEST_MEMFD_FLAG_NO_DIRECT_MAP);
+ if (flags & GUEST_MEMFD_FLAG_MMAP)
+ __test_guest_memfd(vm, GUEST_MEMFD_FLAG_NO_DIRECT_MAP |
+ GUEST_MEMFD_FLAG_MMAP);
+ if (flags & GUEST_MEMFD_FLAG_INIT_SHARED)
+ __test_guest_memfd(vm, GUEST_MEMFD_FLAG_NO_DIRECT_MAP |
+ GUEST_MEMFD_FLAG_MMAP |
+ GUEST_MEMFD_FLAG_INIT_SHARED);
+ }
+
kvm_vm_free(vm);
}
@@ -445,10 +456,14 @@ static void test_guest_memfd_guest(void)
TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_INIT_SHARED,
"Default VM type should support INIT_SHARED, supported flags = 0x%x",
vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS));
+ TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_NO_DIRECT_MAP,
+ "Default VM type should support NO_DIRECT_MAP, supported flags = 0x%x",
+ vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS));
size = vm->page_size;
fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP |
- GUEST_MEMFD_FLAG_INIT_SHARED);
+ GUEST_MEMFD_FLAG_INIT_SHARED |
+ GUEST_MEMFD_FLAG_NO_DIRECT_MAP);
vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0);
mem = kvm_mmap(size, PROT_READ | PROT_WRITE, MAP_SHARED, fd);
diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
index 1969f4ab9b28..8767cb4a037e 100644
--- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
@@ -367,7 +367,7 @@ static void *__test_mem_conversions(void *__vcpu)
}
static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t nr_vcpus,
- uint32_t nr_memslots)
+ uint32_t nr_memslots, uint64_t gmem_flags)
{
/*
* Allocate enough memory so that each vCPU's chunk of memory can be
@@ -394,7 +394,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t
vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE));
- memfd = vm_create_guest_memfd(vm, memfd_size, 0);
+ memfd = vm_create_guest_memfd(vm, memfd_size, gmem_flags);
for (i = 0; i < nr_memslots; i++)
vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i,
@@ -474,7 +474,8 @@ int main(int argc, char *argv[])
}
}
- test_mem_conversions(src_type, nr_vcpus, nr_memslots);
+ test_mem_conversions(src_type, nr_vcpus, nr_memslots, 0);
+ test_mem_conversions(src_type, nr_vcpus, nr_memslots, GUEST_MEMFD_FLAG_NO_DIRECT_MAP);
return 0;
}
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* [PATCH v9 12/13] KVM: selftests: stuff vm_mem_backing_src_type into vm_shape
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
` (10 preceding siblings ...)
2026-01-14 13:47 ` [PATCH v9 11/13] KVM: selftests: cover GUEST_MEMFD_FLAG_NO_DIRECT_MAP in existing selftests Kalyazin, Nikita
@ 2026-01-14 13:47 ` Kalyazin, Nikita
2026-01-14 13:47 ` [PATCH v9 13/13] KVM: selftests: Test guest execution from direct map removed gmem Kalyazin, Nikita
12 siblings, 0 replies; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:47 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Patrick Roy <patrick.roy@linux.dev>
Use one of the padding fields in struct vm_shape to carry an enum
vm_mem_backing_src_type value, to give the option to overwrite the
default of VM_MEM_SRC_ANONYMOUS in __vm_create().
Overwriting this default will allow tests to create VMs where the test
code is backed by mmap'd guest_memfd instead of anonymous memory.
Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
.../testing/selftests/kvm/include/kvm_util.h | 19 ++++++++++---------
tools/testing/selftests/kvm/lib/kvm_util.c | 2 +-
tools/testing/selftests/kvm/lib/x86/sev.c | 1 +
.../selftests/kvm/pre_fault_memory_test.c | 1 +
4 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 6689b43810c1..4bc4af9a40cf 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -192,7 +192,7 @@ enum vm_guest_mode {
struct vm_shape {
uint32_t type;
uint8_t mode;
- uint8_t pad0;
+ uint8_t src_type;
uint16_t pad1;
};
@@ -200,14 +200,15 @@ kvm_static_assert(sizeof(struct vm_shape) == sizeof(uint64_t));
#define VM_TYPE_DEFAULT 0
-#define VM_SHAPE(__mode) \
-({ \
- struct vm_shape shape = { \
- .mode = (__mode), \
- .type = VM_TYPE_DEFAULT \
- }; \
- \
- shape; \
+#define VM_SHAPE(__mode) \
+({ \
+ struct vm_shape shape = { \
+ .mode = (__mode), \
+ .type = VM_TYPE_DEFAULT, \
+ .src_type = VM_MEM_SRC_ANONYMOUS \
+ }; \
+ \
+ shape; \
})
#if defined(__aarch64__)
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 28ee51253909..268a4520633b 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -467,7 +467,7 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
if (is_guest_memfd_required(shape))
flags |= KVM_MEM_GUEST_MEMFD;
- vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, flags);
+ vm_userspace_mem_region_add(vm, shape.src_type, 0, 0, nr_pages, flags);
for (i = 0; i < NR_MEM_REGIONS; i++)
vm->memslots[i] = 0;
diff --git a/tools/testing/selftests/kvm/lib/x86/sev.c b/tools/testing/selftests/kvm/lib/x86/sev.c
index c3a9838f4806..d920880e4fc0 100644
--- a/tools/testing/selftests/kvm/lib/x86/sev.c
+++ b/tools/testing/selftests/kvm/lib/x86/sev.c
@@ -164,6 +164,7 @@ struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code,
struct vm_shape shape = {
.mode = VM_MODE_DEFAULT,
.type = type,
+ .src_type = VM_MEM_SRC_ANONYMOUS,
};
struct kvm_vm *vm;
struct kvm_vcpu *cpus[1];
diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c
index 93e603d91311..8a4d5af53fab 100644
--- a/tools/testing/selftests/kvm/pre_fault_memory_test.c
+++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c
@@ -165,6 +165,7 @@ static void __test_pre_fault_memory(unsigned long vm_type, bool private)
const struct vm_shape shape = {
.mode = VM_MODE_DEFAULT,
.type = vm_type,
+ .src_type = VM_MEM_SRC_ANONYMOUS,
};
struct kvm_vcpu *vcpu;
struct kvm_run *run;
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread* [PATCH v9 13/13] KVM: selftests: Test guest execution from direct map removed gmem
2026-01-14 13:45 [PATCH v9 00/13] Direct Map Removal Support for guest_memfd Kalyazin, Nikita
` (11 preceding siblings ...)
2026-01-14 13:47 ` [PATCH v9 12/13] KVM: selftests: stuff vm_mem_backing_src_type into vm_shape Kalyazin, Nikita
@ 2026-01-14 13:47 ` Kalyazin, Nikita
12 siblings, 0 replies; 62+ messages in thread
From: Kalyazin, Nikita @ 2026-01-14 13:47 UTC (permalink / raw)
To: kvm, linux-doc, linux-kernel, linux-arm-kernel, kvmarm,
linux-fsdevel, linux-mm, bpf, linux-kselftest, kernel,
linux-riscv, linux-s390, loongarch
Cc: pbonzini, corbet, maz, oupton, joey.gouly, suzuki.poulose,
yuzenghui, catalin.marinas, will, seanjc, tglx, mingo, bp,
dave.hansen, x86, hpa, luto, peterz, willy, akpm, david,
lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko, ast,
daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, jgg, jhubbard,
peterx, jannh, pfalcato, shuah, riel, ryan.roberts, jgross,
yu-cheng.yu, kas, coxu, kevin.brodsky, ackerleytng, maobibo,
prsampat, mlevitsk, jmattson, jthoughton, agordeev, alex, aou,
borntraeger, chenhuacai, dev.jain, gor, hca, Jonathan.Cameron,
palmer, pjw, shijie, svens, thuth, wyihan, yang, vannapurve,
jackmanb, aneesh.kumar, patrick.roy, Thomson, Jack, Itazuri,
Takahiro, Manwaring, Derek, Cali, Marco, Kalyazin, Nikita
From: Patrick Roy <patrick.roy@linux.dev>
Add a selftest that loads itself into guest_memfd (via
GUEST_MEMFD_FLAG_MMAP) and triggers an MMIO exit when executed. This
exercises x86 MMIO emulation code inside KVM for guest_memfd-backed
memslots where the guest_memfd folios are direct map removed.
Particularly, it validates that x86 MMIO emulation code (guest page
table walks + instruction fetch) correctly accesses gmem through the VMA
that's been reflected into the memslot's userspace_addr field (instead
of trying to do direct map accesses).
Signed-off-by: Patrick Roy <patrick.roy@linux.dev>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
.../selftests/kvm/set_memory_region_test.c | 52 +++++++++++++++++--
1 file changed, 48 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
index 7fe427ff9b38..6c57fb036b20 100644
--- a/tools/testing/selftests/kvm/set_memory_region_test.c
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -602,6 +602,41 @@ static void test_mmio_during_vectoring(void)
kvm_vm_free(vm);
}
+
+static void guest_code_trigger_mmio(void)
+{
+ /*
+ * Read some GPA that is not backed by a memslot. KVM considers this
+ * as MMIO and tells userspace to emulate the read.
+ */
+ READ_ONCE(*((uint64_t *)MEM_REGION_GPA));
+
+ GUEST_DONE();
+}
+
+static void test_guest_memfd_mmio(void)
+{
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu;
+ struct vm_shape shape = {
+ .mode = VM_MODE_DEFAULT,
+ .src_type = VM_MEM_SRC_GUEST_MEMFD_NO_DIRECT_MAP,
+ };
+ pthread_t vcpu_thread;
+
+ pr_info("Testing MMIO emulation for instructions in gmem\n");
+
+ vm = __vm_create_shape_with_one_vcpu(shape, &vcpu, 0, guest_code_trigger_mmio);
+
+ virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 1);
+
+ pthread_create(&vcpu_thread, NULL, vcpu_worker, vcpu);
+
+ /* If the MMIO read was successfully emulated, the vcpu thread will exit */
+ pthread_join(vcpu_thread, NULL);
+
+ kvm_vm_free(vm);
+}
#endif
int main(int argc, char *argv[])
@@ -625,10 +660,19 @@ int main(int argc, char *argv[])
test_add_max_memory_regions();
#ifdef __x86_64__
- if (kvm_has_cap(KVM_CAP_GUEST_MEMFD) &&
- (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))) {
- test_add_private_memory_region();
- test_add_overlapping_private_memory_regions();
+ if (kvm_has_cap(KVM_CAP_GUEST_MEMFD)) {
+ uint64_t valid_flags = kvm_check_cap(KVM_CAP_GUEST_MEMFD_FLAGS);
+
+ if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) {
+ test_add_private_memory_region();
+ test_add_overlapping_private_memory_regions();
+ }
+
+ if ((valid_flags & GUEST_MEMFD_FLAG_MMAP)
+ && (valid_flags & GUEST_MEMFD_FLAG_NO_DIRECT_MAP))
+ test_guest_memfd_mmio();
+ else
+ pr_info("Skipping tests requiring GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_NO_DIRECT_MAP\n");
} else {
pr_info("Skipping tests for KVM_MEM_GUEST_MEMFD memory regions\n");
}
--
2.50.1
^ permalink raw reply [flat|nested] 62+ messages in thread