* [RFC PATCH 1/7] mm: Add softleaf_from_pud
2026-04-12 17:42 [RFC PATCH 0/7] Implement a new generic pagewalk API Oscar Salvador
@ 2026-04-12 17:42 ` Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 2/7] mm: Add {pmd,pud}_huge_lock helper Oscar Salvador
` (5 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Oscar Salvador @ 2026-04-12 17:42 UTC (permalink / raw)
To: Andrew Morton
Cc: David Hildenbrand, Michal Hocko, Vlastimil Babka, Muchun Song,
Lorenzo Stoakes, linux-kernel, linux-mm, Oscar Salvador
We want to be able to operate on HugeTLB pages as we do with normal
pages, which means we have to stop pretending everything is a pte in the
HugeTLB world and operate on the right entry level instead.
Since we can have HugeTLB as PUD entries, we need the infrastructure that
allows us to operate on them, so add softleaf_from_pud() together with the
helpers that come with it.
Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
arch/arm64/include/asm/pgtable.h | 12 +++++
arch/loongarch/include/asm/pgtable.h | 1 +
arch/powerpc/include/asm/book3s/64/pgtable.h | 7 +++
arch/s390/include/asm/pgtable.h | 38 ++++++++++++++++
arch/x86/include/asm/pgtable.h | 48 ++++++++++++++++++++
arch/x86/include/asm/pgtable_64.h | 2 +
include/asm-generic/pgtable_uffd.h | 15 ++++++
include/linux/leafops.h | 33 ++++++++++++++
include/linux/pgtable.h | 37 +++++++++++++++
9 files changed, 193 insertions(+)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b3e58735c49b..e42ad56a86d4 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -596,6 +596,13 @@ static inline int pmd_protnone(pmd_t pmd)
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mkinvalid(pmd) pte_pmd(pte_mkinvalid(pmd_pte(pmd)))
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+#define pud_uffd_wp(pud) pte_uffd_wp(pud_pte(pud))
+#define pud_mkuffd_wp(pud) pte_pud(pte_mkuffd_wp(pud_pte(pud)))
+#define pud_clear_uffd_wp(pud) pte_pud(pte_clear_uffd_wp(pud_pte(pud)))
+#define pud_swp_uffd_wp(pud) pte_swp_uffd_wp(pud_pte(pud))
+#define pud_swp_mkuffd_wp(pud) pte_pud(pte_swp_mkuffd_wp(pud_pte(pud)))
+#define pud_swp_clear_uffd_wp(pud) \
+ pte_pud(pte_swp_clear_uffd_wp(pud_pte(pud)))
#define pmd_uffd_wp(pmd) pte_uffd_wp(pmd_pte(pmd))
#define pmd_mkuffd_wp(pmd) pte_pmd(pte_mkuffd_wp(pmd_pte(pmd)))
#define pmd_clear_uffd_wp(pmd) pte_pmd(pte_clear_uffd_wp(pmd_pte(pmd)))
@@ -1528,6 +1535,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
#define __swp_entry_to_pmd(swp) __pmd((swp).val)
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+#ifdef CONFIG_HUGETLB_PAGE
+#define __pud_to_swp_entry(pud) ((swp_entry_t) { pud_val(pud) })
+#define __swp_entry_to_pud(swp) __pud((swp).val)
+#endif
+
/*
* Ensure that there are not more swap files than can be encoded in the kernel
* PTEs.
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index c33b3bcb733e..eba6d20f007f 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -335,6 +335,7 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
#define __swp_entry_to_pmd(x) __pmd((x).val | _PAGE_HUGE)
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) })
+#define __pud_to_swp_entry(pud) ((swp_entry_t) { pud_val(pud) })
static inline bool pte_swp_exclusive(pte_t pte)
{
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 1a91762b455d..476781c59d5f 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1065,6 +1065,13 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
#define pmd_swp_soft_dirty(pmd) pte_swp_soft_dirty(pmd_pte(pmd))
#define pmd_swp_clear_soft_dirty(pmd) pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd)))
#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define pud_swp_mksoft_dirty(pud) pte_pud(pte_swp_mksoft_dirty(pud_pte(pud)))
+#define pud_swp_soft_dirty(pud) pte_swp_soft_dirty(pud_pte(pud))
+#define pud_swp_clear_soft_dirty(pud) pte_pud(pte_swp_clear_soft_dirty(pud_pte(pud)))
+#endif
+
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
#ifdef CONFIG_NUMA_BALANCING
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 1c3c3be93be9..0d1d571215c4 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -901,11 +901,31 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
return clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_SOFT_DIRTY));
}
+static inline int pud_soft_dirty(pud_t pud)
+{
+ return pud_val(pud) & _REGION3_ENTRY_SOFT_DIRTY;
+}
+
+static inline pud_t pud_mksoft_dirty(pud_t pud)
+{
+ return set_pud_bit(pud, __pgprot(_REGION3_ENTRY_SOFT_DIRTY));
+}
+
+static inline pud_t pud_clear_soft_dirty(pud_t pud)
+{
+ return clear_pud_bit(pud, __pgprot(_REGION3_ENTRY_SOFT_DIRTY));
+}
+
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
#define pmd_swp_soft_dirty(pmd) pmd_soft_dirty(pmd)
#define pmd_swp_mksoft_dirty(pmd) pmd_mksoft_dirty(pmd)
#define pmd_swp_clear_soft_dirty(pmd) pmd_clear_soft_dirty(pmd)
#endif
+#ifdef CONFIG_HUGETLB_PAGE
+#define pud_swp_soft_dirty(pud) pud_soft_dirty(pud)
+#define pud_swp_mksoft_dirty(pud) pud_mksoft_dirty(pud)
+#define pud_swp_clear_soft_dirty(pud) pud_clear_soft_dirty(pud)
+#endif
/*
* query functions pte_write/pte_dirty/pte_young only work if
@@ -1901,6 +1921,24 @@ static inline unsigned long __swp_offset_rste(swp_entry_t entry)
* requires conversion of the swap type and offset, and not all the possible
* PTE bits.
*/
+static inline swp_entry_t __pud_to_swp_entry(pud_t pud)
+{
+ swp_entry_t arch_entry;
+ pte_t pte;
+
+ arch_entry = __rste_to_swp_entry(pud_val(pud));
+ pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry));
+ return __pte_to_swp_entry(pte);
+}
+
+static inline pud_t __swp_entry_to_pud(swp_entry_t arch_entry)
+{
+ pud_t pud;
+
+ pud = __pud(mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry)));
+ return pud;
+}
+
static inline swp_entry_t __pmd_to_swp_entry(pmd_t pmd)
{
swp_entry_t arch_entry;
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1662c5a8f445..a68ff339cd56 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -656,6 +656,23 @@ static inline pud_t pud_mkwrite(pud_t pud)
return pud_clear_saveddirty(pud);
}
+#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+static inline int pud_uffd_wp(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_UFFD_WP;
+}
+
+static inline pud_t pud_mkuffd_wp(pud_t pud)
+{
+ return pud_wrprotect(pud_set_flags(pud, _PAGE_UFFD_WP));
+}
+
+static inline pud_t pud_clear_uffd_wp(pud_t pud)
+{
+ return pud_clear_flags(pud, _PAGE_UFFD_WP);
+}
+#endif
+
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline int pte_soft_dirty(pte_t pte)
{
@@ -1557,6 +1574,22 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
}
#endif
+#ifdef CONFIG_HUGETLB_PAGE
+static inline pud_t pud_swp_mksoft_dirty(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline int pud_swp_soft_dirty(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_SWP_SOFT_DIRTY;
+}
+
+static inline pud_t pud_swp_clear_soft_dirty(pud_t pud)
+{
+ return pud_clear_flags(pud, _PAGE_SWP_SOFT_DIRTY);
+}
+#endif
#endif
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
@@ -1589,6 +1622,21 @@ static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP);
}
+
+static inline pud_t pud_swp_mkuffd_wp(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_SWP_UFFD_WP);
+}
+
+static inline int pud_swp_uffd_wp(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_SWP_UFFD_WP;
+}
+
+static inline pud_t pud_swp_clear_uffd_wp(pud_t pud)
+{
+ return pud_clear_flags(pud, _PAGE_SWP_UFFD_WP);
+}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
static inline u16 pte_flags_pkey(unsigned long pte_flags)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f06e5d6a2747..0cf02ddd3d4b 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -236,8 +236,10 @@ static inline void native_pgd_clear(pgd_t *pgd)
#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) })
+#define __pud_to_swp_entry(pud) ((swp_entry_t) { pud_val((pud)) })
#define __swp_entry_to_pte(x) (__pte((x).val))
#define __swp_entry_to_pmd(x) (__pmd((x).val))
+#define __swp_entry_to_pud(x) (__pud((x).val))
extern void cleanup_highmap(void);
diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h
index 0d85791efdf7..59c9d6762ec8 100644
--- a/include/asm-generic/pgtable_uffd.h
+++ b/include/asm-generic/pgtable_uffd.h
@@ -78,6 +78,21 @@ static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
{
return pmd;
}
+
+static inline pud_t pud_swp_mkuffd_wp(pud_t pud)
+{
+ return pud;
+}
+
+static inline int pud_swp_uffd_wp(pud_t pud)
+{
+ return 0;
+}
+
+static inline pud_t pud_swp_clear_uffd_wp(pud_t pud)
+{
+ return pud;
+}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */
diff --git a/include/linux/leafops.h b/include/linux/leafops.h
index a9ff94b744f2..122ac50aeb09 100644
--- a/include/linux/leafops.h
+++ b/include/linux/leafops.h
@@ -117,6 +117,39 @@ static inline softleaf_t softleaf_from_pmd(pmd_t pmd)
#endif
+#ifdef CONFIG_HUGETLB_PAGE
+/**
+ * softleaf_from_pud() - Obtain a leaf entry from a PUD entry.
+ * @pud: PUD entry.
+ *
+ * If @pud is present or none (therefore not a leaf entry) the function
+ * returns an empty leaf entry. Otherwise, it returns a leaf entry.
+ *
+ * Returns: Leaf entry.
+ */
+static inline softleaf_t softleaf_from_pud(pud_t pud)
+{
+ softleaf_t arch_entry;
+
+ if (pud_present(pud) || pud_none(pud))
+ return softleaf_mk_none();
+
+ if (pud_swp_soft_dirty(pud))
+ pud = pud_swp_clear_soft_dirty(pud);
+ if (pud_swp_uffd_wp(pud))
+ pud = pud_swp_clear_uffd_wp(pud);
+ arch_entry = __pud_to_swp_entry(pud);
+
+ /* Temporary until swp_entry_t eliminated. */
+ return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
+}
+#else
+static inline softleaf_t softleaf_from_pud(pud_t pud)
+{
+ return softleaf_mk_none();
+}
+#endif
+
/**
* softleaf_is_none() - Is the leaf entry empty?
* @entry: Leaf entry.
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index a50df42a893f..1abd9c52a4f2 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1761,6 +1761,22 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
return pmd;
}
#endif
+#ifndef CONFIG_HUGETLB_PAGE
+static inline pud_t pud_swp_mksoft_dirty(pud_t pud)
+{
+ return pud;
+}
+
+static inline int pud_swp_soft_dirty(pud_t pud)
+{
+ return 0;
+}
+
+static inline pud_t pud_swp_clear_soft_dirty(pud_t pud)
+{
+ return pud;
+}
+#endif
#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
static inline int pte_soft_dirty(pte_t pte)
{
@@ -1821,6 +1837,21 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
return pmd;
}
+
+static inline pud_t pud_swp_mksoft_dirty(pud_t pud)
+{
+ return pud;
+}
+
+static inline int pud_swp_soft_dirty(pud_t pud)
+{
+ return 0;
+}
+
+static inline pud_t pud_swp_clear_soft_dirty(pud_t pud)
+{
+ return pud;
+}
#endif
#ifndef __HAVE_PFNMAP_TRACKING
@@ -2369,4 +2400,10 @@ pgprot_t vm_get_page_prot(vm_flags_t vm_flags) \
} \
EXPORT_SYMBOL(vm_get_page_prot);
+#ifdef CONFIG_HUGETLB_PAGE
+#ifndef __pud_to_swp_entry
+#define __pud_to_swp_entry(pud) ((swp_entry_t) { pud_val(pud) })
+#endif
+#endif
+
#endif /* _LINUX_PGTABLE_H */
--
2.35.3
^ permalink raw reply	[flat|nested] 8+ messages in thread

* [RFC PATCH 2/7] mm: Add {pmd,pud}_huge_lock helper
2026-04-12 17:42 [RFC PATCH 0/7] Implement a new generic pagewalk API Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 1/7] mm: Add softleaf_from_pud Oscar Salvador
@ 2026-04-12 17:42 ` Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 3/7] mm: Implement folio_pmd_batch Oscar Salvador
` (4 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Oscar Salvador @ 2026-04-12 17:42 UTC (permalink / raw)
To: Andrew Morton
Cc: David Hildenbrand, Michal Hocko, Vlastimil Babka, Muchun Song,
Lorenzo Stoakes, linux-kernel, linux-mm, Oscar Salvador
HugeTLB and THP use the same lock for PUDs and PMDs, so create two helpers
that can be used directly by both, as they will be needed by the generic
pagewalkers.
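As an illustration (a hypothetical caller, not part of this patch), a walker
takes the lock only when the PMD is a huge leaf or a non-present entry such
as a migration entry, and descends to the PTE table otherwise:

	spinlock_t *ptl = pmd_huge_lock(pmdp, vma);

	if (ptl) {
		/* *pmdp is stable here: THP/HugeTLB leaf or non-present entry. */
		spin_unlock(ptl);
	} else {
		/* Not a huge leaf: walk the PTE table instead. */
	}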
Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
include/linux/mm_inline.h | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index fa2d6ba811b5..3ac77b50e91f 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -655,4 +655,36 @@ static inline size_t num_pages_contiguous(struct page **pages, size_t nr_pages)
return i;
}
+static inline spinlock_t *pmd_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
+{
+ spinlock_t *ptl;
+
+ if (pmd_present(*pmd) || !pmd_none(*pmd)) {
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (pmd_present(*pmd) && pmd_leaf(*pmd))
+ return ptl;
+ else if (!pmd_present(*pmd) && !pmd_none(*pmd))
+ return ptl;
+ spin_unlock(ptl);
+ }
+
+ return NULL;
+}
+
+static inline spinlock_t *pud_huge_lock(pud_t *pud, struct vm_area_struct *vma)
+{
+ spinlock_t *ptl;
+
+ if (pud_present(*pud) || !pud_none(*pud)) {
+ ptl = pud_lock(vma->vm_mm, pud);
+ if (pud_present(*pud) && pud_leaf(*pud))
+ return ptl;
+ else if (!pud_present(*pud) && !pud_none(*pud))
+ return ptl;
+ spin_unlock(ptl);
+ }
+
+ return NULL;
+}
+
#endif
--
2.35.3
^ permalink raw reply	[flat|nested] 8+ messages in thread

* [RFC PATCH 3/7] mm: Implement folio_pmd_batch
2026-04-12 17:42 [RFC PATCH 0/7] Implement a new generic pagewalk API Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 1/7] mm: Add softleaf_from_pud Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 2/7] mm: Add {pmd,pud}_huge_lock helper Oscar Salvador
@ 2026-04-12 17:42 ` Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 4/7] mm: Implement pt_range_walk Oscar Salvador
` (3 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Oscar Salvador @ 2026-04-12 17:42 UTC (permalink / raw)
To: Andrew Morton
Cc: David Hildenbrand, Michal Hocko, Vlastimil Babka, Muchun Song,
Lorenzo Stoakes, linux-kernel, linux-mm, Oscar Salvador
HugeTLB pages can be mapped as contiguous PMDs, so we need a way to batch
them as we do for contiguous PTEs.
Implement folio_pmd_batch() in order to do that.
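An illustrative sketch of the intended use (the variable names are made up
for the example), mirroring how folio_pte_batch() is used for PTEs; the PMD
lock is assumed to be held and 'pmdval' is the value read from 'pmdp':

	bool writable, young, dirty;
	int max_nr = folio_size(folio) / PMD_SIZE;
	int nr;

	nr = folio_pmd_batch(folio, pmdp, &pmdval, max_nr, 0,
			     &writable, &young, &dirty);
	/* 'nr' contiguous PMDs map this folio; write/young/dirty are merged. */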
Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
arch/arm64/include/asm/pgtable.h | 19 ++++++++
include/linux/pgtable.h | 30 +++++++++++++
mm/internal.h | 75 +++++++++++++++++++++++++++++++-
3 files changed, 123 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index e42ad56a86d4..5b5490505b94 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -170,6 +170,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
(__boundary - 1 < (end) - 1) ? __boundary : (end); \
})
+#define pmd_valid_cont(pmd) (pmd_valid(pmd) && pmd_cont(pmd))
+
#define pte_hw_dirty(pte) (pte_write(pte) && !pte_rdonly(pte))
#define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY))
#define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte))
@@ -670,6 +672,12 @@ static inline pgprot_t pmd_pgprot(pmd_t pmd)
return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd));
}
+#define pmd_advance_pfn pmd_advance_pfn
+static inline pmd_t pmd_advance_pfn(pmd_t pmd, unsigned long nr)
+{
+ return pfn_pmd(pmd_pfn(pmd) + nr, pmd_pgprot(pmd));
+}
+
#define pud_pgprot pud_pgprot
static inline pgprot_t pud_pgprot(pud_t pud)
{
@@ -1645,6 +1653,17 @@ extern void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long ad
pte_t *ptep, pte_t old_pte, pte_t pte,
unsigned int nr);
+#ifdef CONFIG_HUGETLB_PAGE
+#define pmd_batch_hint pmd_batch_hint
+static inline unsigned int pmd_batch_hint(pmd_t *pmdp, pmd_t pmd)
+{
+ if (!pmd_valid_cont(pmd))
+ return 1;
+
+ return CONT_PMDS - (((unsigned long)pmdp >> 3) & (CONT_PMDS - 1));
+}
+#endif
+
#ifdef CONFIG_ARM64_CONTPTE
/*
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 1abd9c52a4f2..6f01d5ed73f6 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -358,6 +358,36 @@ static inline void lazy_mmu_mode_pause(void) {}
static inline void lazy_mmu_mode_resume(void) {}
#endif
+#ifndef pmd_batch_hint
+/**
+ * pmd_batch_hint - Number of PMD entries that can be added to batch without scanning.
+ * @pmdp: Page table pointer for the entry.
+ * @pmd: Page table entry.
+ *
+ * Some architectures know that a set of contiguous pmds all map the same
+ * contiguous memory with the same permissions. In this case, it can provide a
+ * hint to aid pmd batching without the core code needing to scan every pmd.
+ *
+ * An architecture implementation may ignore the PMD accessed state. Further,
+ * the dirty state must apply atomically to all the PMDs described by the hint.
+ *
+ * May be overridden by the architecture, else pmd_batch_hint is always 1.
+ */
+static inline unsigned int pmd_batch_hint(pmd_t *pmdp, pmd_t pmd)
+{
+ return 1;
+}
+#endif
+
+#ifndef pmd_advance_pfn
+static inline pmd_t pmd_advance_pfn(pmd_t pmd, unsigned long nr)
+{
+ return __pmd(pmd_val(pmd) + (nr << PFN_PTE_SHIFT));
+}
+#endif
+
+#define pmd_next_pfn(pmd) pmd_advance_pfn(pmd, 1)
+
#ifndef pte_batch_hint
/**
* pte_batch_hint - Number of pages that can be added to batch without scanning.
diff --git a/mm/internal.h b/mm/internal.h
index cb0af847d7d9..8fa0681ff2af 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -269,7 +269,7 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma)
return __anon_vma_prepare(vma);
}
-/* Flags for folio_pte_batch(). */
+/* Flags for folio_{pmd,pte}_batch(). */
typedef int __bitwise fpb_t;
/* Compare PTEs respecting the dirty bit. */
@@ -293,6 +293,79 @@ typedef int __bitwise fpb_t;
*/
#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(4))
+static inline pmd_t __pmd_batch_clear_ignored(pmd_t pmd, fpb_t flags)
+{
+ if (!(flags & FPB_RESPECT_DIRTY))
+ pmd = pmd_mkclean(pmd);
+ if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY)))
+ pmd = pmd_clear_soft_dirty(pmd);
+ if (likely(!(flags & FPB_RESPECT_WRITE)))
+ pmd = pmd_wrprotect(pmd);
+ return pmd_mkold(pmd);
+}
+
+/**
+ * folio_pmd_batch - detect a PMD batch for a large folio.
+ *
+ * The only user of this is hugetlb, for contiguous PMDs.
+ */
+static inline int folio_pmd_batch(struct folio *folio, pmd_t *pmdp, pmd_t *pmdentp,
+ int max_nr, fpb_t flags, bool *any_writable,
+ bool *any_young, bool *any_dirty)
+{
+ pmd_t expected_pmd, pmd = *pmdentp;
+ bool writable, young, dirty;
+ int nr, cur_nr;
+
+ if (any_writable)
+ *any_writable = !!pmd_write(*pmdentp);
+ if (any_young)
+ *any_young = !!pmd_young(*pmdentp);
+ if (any_dirty)
+ *any_dirty = !!pmd_dirty(*pmdentp);
+
+ VM_WARN_ON_FOLIO(!pmd_present(pmd), folio);
+ VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
+ VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pmd_pfn(pmd))) != folio, folio);
+
+ /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
+ max_nr = min_t(unsigned long, max_nr,
+ (folio_pfn(folio) + folio_nr_pages(folio) -
+ pmd_pfn(pmd)) >> (PMD_SHIFT - PAGE_SHIFT));
+
+ nr = pmd_batch_hint(pmdp, pmd);
+ expected_pmd = __pmd_batch_clear_ignored(pmd_advance_pfn(pmd, nr << (PMD_SHIFT - PAGE_SHIFT)), flags);
+ pmdp = pmdp + nr;
+
+ while (nr < max_nr) {
+ pmd = pmdp_get(pmdp);
+ if (any_writable)
+ writable = !!pmd_write(pmd);
+ if (any_young)
+ young = !!pmd_young(pmd);
+ if (any_dirty)
+ dirty = !!pmd_dirty(pmd);
+ pmd = __pmd_batch_clear_ignored(pmd, flags);
+
+ if (!pmd_same(pmd, expected_pmd))
+ break;
+
+ if (any_writable)
+ *any_writable |= writable;
+ if (any_young)
+ *any_young |= young;
+ if (any_dirty)
+ *any_dirty |= dirty;
+
+ cur_nr = pmd_batch_hint(pmdp, pmd);
+ expected_pmd = pmd_advance_pfn(expected_pmd, cur_nr << (PMD_SHIFT - PAGE_SHIFT));
+ pmdp += cur_nr;
+ nr += cur_nr;
+ }
+
+ return min(nr, max_nr);
+}
+
static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
{
if (!(flags & FPB_RESPECT_DIRTY))
--
2.35.3
^ permalink raw reply	[flat|nested] 8+ messages in thread

* [RFC PATCH 4/7] mm: Implement pt_range_walk
2026-04-12 17:42 [RFC PATCH 0/7] Implement a new generic pagewalk API Oscar Salvador
` (2 preceding siblings ...)
2026-04-12 17:42 ` [RFC PATCH 3/7] mm: Implement folio_pmd_batch Oscar Salvador
@ 2026-04-12 17:42 ` Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 5/7] mm: Make /proc/pid/smaps use the new generic pagewalk API Oscar Salvador
` (2 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Oscar Salvador @ 2026-04-12 17:42 UTC (permalink / raw)
To: Andrew Morton
Cc: David Hildenbrand, Michal Hocko, Vlastimil Babka, Muchun Song,
Lorenzo Stoakes, linux-kernel, linux-mm, Oscar Salvador,
David Hildenbrand
Implement pt_range_walk, a pagewalk API that handles locking and batching
itself, and returns a struct containing information about the address space
backed by the vma.
It goes through the address range provided and returns whatever it finds
there (softleaf entries, folios, etc.), along with information about the
entry itself: whether it is dirty, shared or present, the size of the entry,
its page table level, the number of batched entries, and so on.
It defines the following types:
#define PT_TYPE_NONE
#define PT_TYPE_FOLIO
#define PT_TYPE_MARKER
#define PT_TYPE_PFN
#define PT_TYPE_SWAP
#define PT_TYPE_MIGRATION
#define PT_TYPE_DEVICE
#define PT_TYPE_HWPOISON
#define PT_TYPE_ALL
and it lets the caller be explicit about which types it is interested in.
If it finds a type the caller stated is of no interest, it keeps scanning
the address range until the next matching type is found, or until the range
is exhausted.
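For instance (illustrative only), a caller that only cares about mapped
folios and migration entries would pass:

	pt_type_flags_t flags = PT_TYPE_FOLIO | PT_TYPE_MIGRATION;

and every other entry type is skipped transparently while walking.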
We have three functions:
 - pt_range_walk_start()
 - pt_range_walk_next()
 - pt_range_walk_done()
pt_range_walk_start() starts scanning the range and returns the first type
it finds. We then keep calling pt_range_walk_next() until we get PTW_DONE,
which means we exhausted the range. Once that happens we have to call
pt_range_walk_done() in order to clean up pt_range_walk's internal state,
like locking.
An example below:

	pt_type_flags_t flags = PT_TYPE_ALL;
	enum pt_range_walk_type type;

	type = pt_range_walk_start(&ptw, vma, start, vma->vm_end, flags);
	while (type != PTW_DONE) {
		/* do something with ptw */
		type = pt_range_walk_next(&ptw, vma, start, vma->vm_end, flags);
	}
	pt_range_walk_done(&ptw);
The API manages locking within the interface, and also batching, which means
that it can handle contiguous ptes (or pmds in the case of hugetlb)
itself.
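For example (sketch only, 'mapped_bytes' is a made-up accumulator), a large
folio is returned with the batch already folded in, so the caller can
account the whole batch in one step:

	if (type == PTW_FOLIO && ptw.present)
		mapped_bytes += ptw.size;	/* nr_entries * entry size */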
Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
arch/arm64/include/asm/pgtable.h | 1 +
include/linux/mm.h | 2 +
include/linux/pagewalk.h | 104 ++++++++
mm/memory.c | 22 ++
mm/pagewalk.c | 400 +++++++++++++++++++++++++++++++
5 files changed, 529 insertions(+)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 5b5490505b94..9f8cca8880e0 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -642,6 +642,7 @@ static inline pmd_t pmd_mkspecial(pmd_t pmd)
#define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT)
#define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
+#define pud_dirty(pud) pte_dirty(pud_pte(pud))
#define pud_young(pud) pte_young(pud_pte(pud))
#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud)))
#define pud_write(pud) pte_write(pud_pte(pud))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5be3d8a8f806..c4e7fc558476 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2829,6 +2829,8 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
+struct folio *vm_normal_folio_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t pud);
struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t pud);
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 88e18615dd72..8662468b4a3f 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -204,4 +204,108 @@ struct folio *folio_walk_start(struct folio_walk *fw,
vma_pgtable_walk_end(__vma); \
} while (0)
+typedef int __bitwise pt_type_flags_t;
+
+/*
+ * Types we are interested in returning. Those which are not explicitly set
+ * will be silently skipped by continuing to walk the page tables.
+ */
+#define PT_TYPE_NONE ((__force pt_type_flags_t)BIT(0))
+#define PT_TYPE_FOLIO ((__force pt_type_flags_t)BIT(1))
+#define PT_TYPE_MARKER ((__force pt_type_flags_t)BIT(2))
+#define PT_TYPE_PFN ((__force pt_type_flags_t)BIT(3))
+#define PT_TYPE_SWAP ((__force pt_type_flags_t)BIT(4))
+#define PT_TYPE_MIGRATION ((__force pt_type_flags_t)BIT(5))
+#define PT_TYPE_DEVICE ((__force pt_type_flags_t)BIT(6))
+#define PT_TYPE_HWPOISON ((__force pt_type_flags_t)BIT(7))
+#define PT_TYPE_ALL (PT_TYPE_NONE | PT_TYPE_FOLIO | PT_TYPE_MARKER | \
+ PT_TYPE_PFN | PT_TYPE_SWAP | PT_TYPE_MIGRATION | \
+ PT_TYPE_DEVICE | PT_TYPE_HWPOISON)
+
+enum pt_range_walk_level {
+ PTW_PUD_LEVEL,
+ PTW_PMD_LEVEL,
+ PTW_PTE_LEVEL,
+};
+
+enum pt_range_walk_type {
+ PTW_ABORT,
+ PTW_DONE,
+ PTW_NONE,
+ PTW_FOLIO,
+ PTW_MARKER,
+ PTW_PFN,
+ PTW_SWAP,
+ PTW_MIGRATION,
+ PTW_DEVICE,
+ PTW_HWPOISON,
+};
+
+/**
+ * struct pt_range_walk - pt_range_walk()
+ * @page: exact folio page referenced (if applicable)
+ * @folio: folio mapped (if any)
+ * @nr_entries: number of contiguous entries of the same type
+ * @size: stores nr_batched * entry_size
+ * @softleaf_entry: softleaf entry (if any)
+ * @writable: whether it is writable
+ * @young: whether it is young
+ * @dirty: whether it is dirty
+ * @present: whether it is present in the page tables
+ * @vma_locked: whether we are holding the vma lock
+ * @pmd_shared: only used for hugetlb
+ * @curr_addr: current addr we are operating on
+ * @next_addr: next addr to be used to walk the page tables
+ * @level: page table level
+ * @pte: copy of the entry value (PTW_PTE_LEVEL).
+ * @pmd: copy of the entry value (PTW_PMD_LEVEL).
+ * @pud: copy of the entry value (PTW_PUD_LEVEL).
+ * @mm: the mm_struct we are walking
+ * @vma: the vma we are walking
+ * @ptl: pointer to the page table lock.
+ */
+
+struct pt_range_walk {
+ struct page *page;
+ struct folio *folio;
+ int nr_entries;
+ unsigned long size;
+ softleaf_t softleaf_entry;
+ bool writable;
+ bool young;
+ bool dirty;
+ bool present;
+ bool vma_locked;
+ bool pmd_shared;
+ unsigned long curr_addr;
+ unsigned long next_addr;
+ enum pt_range_walk_level level;
+ union {
+ pte_t *ptep;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ };
+ union {
+ pte_t pte;
+ pud_t pud;
+ pmd_t pmd;
+ };
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ spinlock_t *ptl;
+};
+
+enum pt_range_walk_type pt_range_walk(struct pt_range_walk *ptw,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ pt_type_flags_t flags);
+enum pt_range_walk_type pt_range_walk_start(struct pt_range_walk *ptw,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ pt_type_flags_t flags);
+enum pt_range_walk_type pt_range_walk_next(struct pt_range_walk *ptw,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ pt_type_flags_t flags);
+void pt_range_walk_done(struct pt_range_walk *ptw);
#endif /* _LINUX_PAGEWALK_H */
diff --git a/mm/memory.c b/mm/memory.c
index 07778814b4a8..e016bc7a49d9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -850,6 +850,28 @@ struct page *vm_normal_page_pud(struct vm_area_struct *vma,
return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud),
pud_val(pud), PGTABLE_LEVEL_PUD);
}
+
+/**
+ * vm_normal_folio_pud() - Get the "struct folio" associated with a PUD
+ * @vma: The VMA mapping the @pud.
+ * @addr: The address where the @pud is mapped.
+ * @pud: The PUD.
+ *
+ * Get the "struct folio" associated with a PUD. See __vm_normal_page()
+ * for details on "normal" and "special" mappings.
+ *
+ * Return: Returns the "struct folio" if this is a "normal" mapping. Returns
+ * NULL if this is a "special" mapping.
+ */
+struct folio *vm_normal_folio_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t pud)
+{
+ struct page *page = vm_normal_page_pud(vma, addr, pud);
+
+ if (page)
+ return page_folio(page);
+ return NULL;
+}
#endif
/**
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index a94c401ab2cf..4c5c28fdccd4 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -1029,3 +1029,403 @@ struct folio *folio_walk_start(struct folio_walk *fw,
fw->ptl = ptl;
return page_folio(page);
}
+
+enum pt_range_walk_type pt_range_walk(struct pt_range_walk *ptw,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ pt_type_flags_t flags)
+{
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+ int nr_batched = 1;
+ spinlock_t *ptl = NULL;
+ unsigned long entry_size;
+ struct page *page;
+ struct folio *folio;
+ enum pt_range_walk_type ret_type = PTW_DONE;
+ bool writable, young, dirty;
+ unsigned long curr_addr, next_addr = ptw->next_addr ? ptw->next_addr : addr;
+
+ if (WARN_ON_ONCE(next_addr < vma->vm_start || next_addr >= vma->vm_end))
+ return ret_type;
+
+ mmap_assert_locked(ptw->mm);
+
+ if (ptw->ptl) {
+ spin_unlock(ptw->ptl);
+ ptw->ptl = NULL;
+ }
+
+ if (ptw->level == PTW_PTE_LEVEL && ptw->ptep) {
+ pte_unmap(ptw->ptep);
+ ptw->ptep = NULL;
+ }
+
+ if (!ptw->vma_locked) {
+ vma_pgtable_walk_begin(vma);
+ ptw->vma_locked = true;
+ ptw->vma = vma;
+ }
+
+keep_walking:
+ ret_type = PTW_DONE;
+ folio = NULL;
+ page = NULL;
+ writable = young = dirty = false;
+ ptw->present = false;
+ ptw->pmd_shared = false;
+ ptw->folio = NULL;
+ ptw->page = NULL;
+
+ curr_addr = next_addr;
+ if (ptl) {
+ spin_unlock(ptl);
+ ptl = NULL;
+ }
+ /*
+ * If we keep walking the page tables because we are not interested
+ * in the type we found, make sure to check whether we reached the end.
+ */
+ if (curr_addr >= end) {
+ ptw->next_addr = next_addr;
+ return ret_type;
+ }
+again:
+ pgdp = pgd_offset(ptw->mm, curr_addr);
+ next_addr = pgd_addr_end(curr_addr, end);
+
+ if (pgd_none_or_clear_bad(pgdp))
+ /* PTW_ABORT? */
+ goto keep_walking;
+
+ next_addr = p4d_addr_end(curr_addr, end);
+ p4dp = p4d_offset(pgdp, curr_addr);
+ if (p4d_none_or_clear_bad(p4dp))
+ /* PTW_ABORT? */
+ goto keep_walking;
+
+ entry_size = PUD_SIZE;
+ ptw->level = PTW_PUD_LEVEL;
+ next_addr = pud_addr_end(curr_addr, end);
+ pudp = pud_offset(p4dp, curr_addr);
+ pud = pudp_get(pudp);
+ if (pud_none(pud)) {
+ if (!(flags & PT_TYPE_NONE))
+ goto keep_walking;
+ ret_type = PTW_NONE;
+ goto found;
+ }
+ /*
+ * For now, there are no architectures which support pgd or p4d
+ * leaves; pud is the first level that can be a leaf.
+ */
+ if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
+ (!pud_present(pud) || pud_leaf(pud))) {
+ ptl = pud_huge_lock(pudp, vma);
+ if (!ptl)
+ goto again;
+
+ pud = pudp_get(pudp);
+ ptw->pudp = pudp;
+ ptw->pud = pud;
+ if (pud_none(pud)) {
+ if (!(flags & PT_TYPE_NONE))
+ goto keep_walking;
+ ret_type = PTW_NONE;
+ } else if (pud_present(pud) && !pud_leaf(pud)) {
+ spin_unlock(ptl);
+ ptl = NULL;
+ goto pmd_table;
+ } else if (pud_present(pud)) {
+ /*
+ * We do not support PUD-device or pud-PFNMAP, so
+ * if it is present, we must have a folio (Tm).
+ */
+ page = vm_normal_page_pud(vma, curr_addr, pud);
+ if (!page || !(flags & PT_TYPE_FOLIO))
+ goto keep_walking;
+
+ ret_type = PTW_FOLIO;
+ folio = page_folio(page);
+ ptw->present = true;
+ dirty = !!pud_dirty(pud);
+ young = !!pud_young(pud);
+ writable = !!pud_write(pud);
+ } else if (!pud_none(pud)) {
+ /* PUD-hugetlbs can have special swap entries */
+ const softleaf_t entry = softleaf_from_pud(pud);
+
+ ptw->softleaf_entry = entry;
+
+ if (softleaf_is_marker(entry)) {
+ if (!(flags & PT_TYPE_MARKER))
+ goto keep_walking;
+ ret_type = PTW_MARKER;
+ } else if (softleaf_has_pfn(entry)) {
+ if (softleaf_is_migration(entry)) {
+ if (!(flags & PT_TYPE_MIGRATION))
+ goto keep_walking;
+ ret_type = PTW_MIGRATION;
+ } else if (softleaf_is_hwpoison(entry)) {
+ if (!(flags & PT_TYPE_HWPOISON))
+ goto keep_walking;
+ ret_type = PTW_HWPOISON;
+ }
+
+ page = softleaf_to_page(entry);
+ if (page)
+ folio = page_folio(page);
+ }
+ } else {
+ /* We found nothing, keep going */
+ goto keep_walking;
+ }
+
+ /* We found a type */
+ goto found;
+ }
+pmd_table:
+ entry_size = PMD_SIZE;
+ ptw->level = PTW_PMD_LEVEL;
+ next_addr = pmd_addr_end(curr_addr, end);
+ pmdp = pmd_offset(pudp, curr_addr);
+ pmd = pmdp_get_lockless(pmdp);
+ if (pmd_none(pmd)) {
+ if (!(flags & PT_TYPE_NONE))
+ goto keep_walking;
+ ret_type = PTW_NONE;
+ goto found;
+ }
+
+ if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
+ (!pmd_present(pmd) || pmd_leaf(pmd))) {
+ ptl = pmd_huge_lock(pmdp, vma);
+ if (!ptl)
+ goto again;
+
+ pmd = pmdp_get(pmdp);
+ ptw->pmdp = pmdp;
+ ptw->pmd = pmd;
+ if (pmd_none(pmd)) {
+ if (!(flags & PT_TYPE_NONE))
+ goto keep_walking;
+ ret_type = PTW_NONE;
+ } else if (pmd_present(pmd) && !pmd_leaf(pmd)) {
+ spin_unlock(ptl);
+ ptl = NULL;
+ goto pte_table;
+ } else if (pmd_present(pmd)) {
+ page = vm_normal_page_pmd(vma, curr_addr, pmd);
+ if (page) {
+ if (!(flags & PT_TYPE_FOLIO))
+ goto keep_walking;
+ ret_type = PTW_FOLIO;
+ folio = page_folio(page);
+ if (folio_size(folio) > entry_size) {
+ /* We can batch */
+ int max_nr = folio_size(folio) / entry_size;
+
+ nr_batched = folio_pmd_batch(folio, pmdp, &pmd,
+ max_nr, 0,
+ &writable,
+ &young,
+ &dirty);
+ } else {
+ dirty = !!pmd_dirty(pmd);
+ young = !!pmd_young(pmd);
+ writable = !!pmd_write(pmd);
+ }
+ } else if (!page && (is_huge_zero_pmd(pmd) ||
+ vma->vm_flags & VM_PFNMAP)) {
+ if (!(flags & PT_TYPE_PFN))
+ goto keep_walking;
+ /* Create a subtype to differentiate them? */
+ ret_type = PTW_PFN;
+ } else if (!page) {
+ goto keep_walking;
+ }
+ ptw->present = true;
+ next_addr += (nr_batched * entry_size) - entry_size;
+ } else if (!pmd_none(pmd)) {
+ const softleaf_t entry = softleaf_from_pmd(pmd);
+
+ ptw->softleaf_entry = entry;
+
+ if (softleaf_is_marker(entry)) {
+ if (!(flags & PT_TYPE_MARKER))
+ goto keep_walking;
+ ret_type = PTW_MARKER;
+ } else if (softleaf_has_pfn(entry)) {
+ if (softleaf_is_migration(entry)) {
+ if (!(flags & PT_TYPE_MIGRATION))
+ goto keep_walking;
+ ret_type = PTW_MIGRATION;
+ } else if (softleaf_is_hwpoison(entry)) {
+ if (!(flags & PT_TYPE_HWPOISON))
+ goto keep_walking;
+ ret_type = PTW_HWPOISON;
+ } else if (softleaf_is_device_private(entry) ||
+ softleaf_is_device_exclusive(entry)) {
+ if (!(flags & PT_TYPE_DEVICE))
+ goto not_found;
+ ptw->present = true;
+ ret_type = PTW_DEVICE;
+ }
+ page = softleaf_to_page(entry);
+ if (page)
+ folio = page_folio(page);
+ }
+ } else {
+ /* We found nothing, keep going */
+ goto keep_walking;
+ }
+
+ if (ret_type != PTW_NONE && is_vm_hugetlb_page(vma) &&
+ hugetlb_pmd_shared((pte_t *)pmdp))
+ ptw->pmd_shared = true;
+
+ goto found;
+ }
+pte_table:
+ entry_size = PAGE_SIZE;
+ ptw->level = PTW_PTE_LEVEL;
+ next_addr = curr_addr + PAGE_SIZE;
+ ptep = pte_offset_map_lock(vma->vm_mm, pmdp, curr_addr, &ptl);
+ if (!ptep)
+ goto again;
+
+ pte = ptep_get(ptep);
+ ptw->ptep = ptep;
+ ptw->pte = pte;
+ if (pte_none(pte)) {
+ if (!(flags & PT_TYPE_NONE))
+ goto not_found;
+ ret_type = PTW_NONE;
+ } else if (pte_present(pte)) {
+ page = vm_normal_page(vma, curr_addr, pte);
+ if (page) {
+ if (!(flags & PT_TYPE_FOLIO))
+ goto not_found;
+ ret_type = PTW_FOLIO;
+ folio = page_folio(page);
+ if (folio_test_large(folio)) {
+ /* We can batch */
+ unsigned long end_addr = pmd_addr_end(curr_addr, end);
+ int max_nr = (end_addr - curr_addr) >> PAGE_SHIFT;
+
+ nr_batched = folio_pte_batch_flags(folio, vma, ptep, &pte, max_nr,
+ FPB_MERGE_WRITE | FPB_MERGE_YOUNG_DIRTY);
+ }
+ } else if (!page && (is_zero_pfn(pte_pfn(pte)) ||
+ vma->vm_flags & VM_PFNMAP)) {
+ if (!(flags & PT_TYPE_PFN))
+ goto not_found;
+ ret_type = PTW_PFN;
+ }
+
+ dirty = !!pte_dirty(pte);
+ young = !!pte_young(pte);
+ writable = !!pte_write(pte);
+ ptw->present = true;
+ next_addr += (nr_batched * entry_size) - entry_size;
+ } else if (!pte_none(pte)) {
+ const softleaf_t entry = softleaf_from_pte(pte);
+
+ ptw->softleaf_entry = entry;
+
+ if (softleaf_is_marker(entry)) {
+ if (!(flags & PT_TYPE_MARKER))
+ goto not_found;
+ ret_type = PTW_MARKER;
+ } else if (softleaf_is_swap(entry)) {
+ unsigned long end_addr = pmd_addr_end(curr_addr, end);
+ int max_nr = (end_addr - curr_addr) >> PAGE_SHIFT;
+
+ if (!(flags & PT_TYPE_SWAP))
+ goto not_found;
+
+ nr_batched = swap_pte_batch(ptep, max_nr, pte);
+ next_addr += (nr_batched * entry_size) - entry_size;
+ ret_type = PTW_SWAP;
+ } else if (softleaf_has_pfn(entry)) {
+ if (softleaf_is_migration(entry)) {
+ if (!(flags & PT_TYPE_MIGRATION))
+ goto not_found;
+ ret_type = PTW_MIGRATION;
+ } else if (softleaf_is_hwpoison(entry)) {
+ if (!(flags & PT_TYPE_HWPOISON))
+ goto not_found;
+ ret_type = PTW_HWPOISON;
+ } else if (softleaf_is_device_private(entry) ||
+ softleaf_is_device_exclusive(entry)) {
+ if (!(flags & PT_TYPE_DEVICE))
+ goto not_found;
+ ptw->present = true;
+ ret_type = PTW_DEVICE;
+ }
+ page = softleaf_to_page(entry);
+ if (page)
+ folio = page_folio(page);
+ }
+ } else {
+not_found:
+ /* We found nothing, keep going */
+ pte_unmap_unlock(ptep, ptl);
+ ptw->ptep = NULL;
+ ptl = NULL;
+ goto keep_walking;
+ }
+
+found:
+ /* Fill in remaining ptw struct before returning */
+ ptw->ptl = ptl;
+ ptw->curr_addr = curr_addr;
+ ptw->next_addr = next_addr;
+ ptw->writable = writable;
+ ptw->young = young;
+ ptw->dirty = dirty;
+ ptw->nr_entries = nr_batched;
+ ptw->size = nr_batched * entry_size;
+ if (folio) {
+ ptw->folio = folio;
+ ptw->page = page + ((curr_addr & (entry_size - 1)) >> PAGE_SHIFT);
+ }
+ return ret_type;
+}
+
+enum pt_range_walk_type pt_range_walk_start(struct pt_range_walk *ptw,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ pt_type_flags_t flags)
+{
+ if (!ptw->mm)
+ return PTW_DONE;
+ if (addr >= end)
+ return PTW_DONE;
+ return pt_range_walk(ptw, vma, addr, end, flags);
+}
+
+enum pt_range_walk_type pt_range_walk_next(struct pt_range_walk *ptw,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ pt_type_flags_t flags)
+{
+ /* We went through the complete range */
+ if (ptw->next_addr >= end)
+ return PTW_DONE;
+ return pt_range_walk(ptw, vma, addr, end, flags);
+}
+
+void pt_range_walk_done(struct pt_range_walk *ptw)
+{
+ if (ptw->ptl)
+ spin_unlock(ptw->ptl);
+ if (ptw->level == PTW_PTE_LEVEL && ptw->ptep)
+ pte_unmap(ptw->ptep);
+ if (ptw->vma_locked)
+ vma_pgtable_walk_end(ptw->vma);
+ cond_resched();
+}
--
2.35.3
^ permalink raw reply	[flat|nested] 8+ messages in thread

* [RFC PATCH 5/7] mm: Make /proc/pid/smaps use the new generic pagewalk API
2026-04-12 17:42 [RFC PATCH 0/7] Implement a new generic pagewalk API Oscar Salvador
` (3 preceding siblings ...)
2026-04-12 17:42 ` [RFC PATCH 4/7] mm: Implement pt_range_walk Oscar Salvador
@ 2026-04-12 17:42 ` Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 6/7] mm: Make /proc/pid/numa_maps " Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 7/7] mm: Make /proc/pid/pagemap " Oscar Salvador
6 siblings, 0 replies; 8+ messages in thread
From: Oscar Salvador @ 2026-04-12 17:42 UTC (permalink / raw)
To: Andrew Morton
Cc: David Hildenbrand, Michal Hocko, Vlastimil Babka, Muchun Song,
Lorenzo Stoakes, linux-kernel, linux-mm, Oscar Salvador
Have /proc/pid/smaps make use of the new generic API, and remove
the code which was using the old one.
Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
fs/proc/task_mmu.c | 309 ++++++++++++---------------------------------
1 file changed, 84 insertions(+), 225 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e091931d7ca1..afbcdb11ad80 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -915,7 +915,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
static void smaps_account(struct mem_size_stats *mss, struct page *page,
bool compound, bool young, bool dirty, bool locked,
- bool present)
+ bool present, int ssize)
{
struct folio *folio = page_folio(page);
int i, nr = compound ? compound_nr(page) : 1;
@@ -923,6 +923,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
bool exclusive;
int mapcount;
+ if (ssize) {
+ nr = ssize / PAGE_SIZE;
+ size = ssize;
+ }
+
/*
* First accumulate quantities that depend only on |size| and the type
* of the compound page.
@@ -988,150 +993,6 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
}
}
-#ifdef CONFIG_SHMEM
-static int smaps_pte_hole(unsigned long addr, unsigned long end,
- __always_unused int depth, struct mm_walk *walk)
-{
- struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = walk->vma;
-
- mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
- linear_page_index(vma, addr),
- linear_page_index(vma, end));
-
- return 0;
-}
-#else
-#define smaps_pte_hole NULL
-#endif /* CONFIG_SHMEM */
-
-static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
-{
-#ifdef CONFIG_SHMEM
- if (walk->ops->pte_hole) {
- /* depth is not used */
- smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
- }
-#endif
-}
-
-static void smaps_pte_entry(pte_t *pte, unsigned long addr,
- struct mm_walk *walk)
-{
- struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = walk->vma;
- bool locked = !!(vma->vm_flags & VM_LOCKED);
- struct page *page = NULL;
- bool present = false, young = false, dirty = false;
- pte_t ptent = ptep_get(pte);
-
- if (pte_present(ptent)) {
- page = vm_normal_page(vma, addr, ptent);
- young = pte_young(ptent);
- dirty = pte_dirty(ptent);
- present = true;
- } else if (pte_none(ptent)) {
- smaps_pte_hole_lookup(addr, walk);
- } else {
- const softleaf_t entry = softleaf_from_pte(ptent);
-
- if (softleaf_is_swap(entry)) {
- int mapcount;
-
- mss->swap += PAGE_SIZE;
- mapcount = swp_swapcount(entry);
- if (mapcount >= 2) {
- u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
-
- do_div(pss_delta, mapcount);
- mss->swap_pss += pss_delta;
- } else {
- mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
- }
- } else if (softleaf_has_pfn(entry)) {
- if (softleaf_is_device_private(entry))
- present = true;
- page = softleaf_to_page(entry);
- }
- }
-
- if (!page)
- return;
-
- smaps_account(mss, page, false, young, dirty, locked, present);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
- struct mm_walk *walk)
-{
- struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = walk->vma;
- bool locked = !!(vma->vm_flags & VM_LOCKED);
- struct page *page = NULL;
- bool present = false;
- struct folio *folio;
-
- if (pmd_none(*pmd))
- return;
- if (pmd_present(*pmd)) {
- page = vm_normal_page_pmd(vma, addr, *pmd);
- present = true;
- } else if (unlikely(thp_migration_supported())) {
- const softleaf_t entry = softleaf_from_pmd(*pmd);
-
- if (softleaf_has_pfn(entry))
- page = softleaf_to_page(entry);
- }
- if (IS_ERR_OR_NULL(page))
- return;
- folio = page_folio(page);
- if (folio_test_anon(folio))
- mss->anonymous_thp += HPAGE_PMD_SIZE;
- else if (folio_test_swapbacked(folio))
- mss->shmem_thp += HPAGE_PMD_SIZE;
- else if (folio_is_zone_device(folio))
- /* pass */;
- else
- mss->file_thp += HPAGE_PMD_SIZE;
-
- smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
- locked, present);
-}
-#else
-static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
- struct mm_walk *walk)
-{
-}
-#endif
-
-static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
-{
- struct vm_area_struct *vma = walk->vma;
- pte_t *pte;
- spinlock_t *ptl;
-
- ptl = pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- smaps_pmd_entry(pmd, addr, walk);
- spin_unlock(ptl);
- goto out;
- }
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (!pte) {
- walk->action = ACTION_AGAIN;
- return 0;
- }
- for (; addr != end; pte++, addr += PAGE_SIZE)
- smaps_pte_entry(pte, addr, walk);
- pte_unmap_unlock(pte - 1, ptl);
-out:
- cond_resched();
- return 0;
-}
-
static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
/*
@@ -1228,58 +1089,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
seq_putc(m, '\n');
}
-#ifdef CONFIG_HUGETLB_PAGE
-static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
- unsigned long addr, unsigned long end,
- struct mm_walk *walk)
-{
- struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = walk->vma;
- struct folio *folio = NULL;
- bool present = false;
- spinlock_t *ptl;
- pte_t ptent;
-
- ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
- ptent = huge_ptep_get(walk->mm, addr, pte);
- if (pte_present(ptent)) {
- folio = page_folio(pte_page(ptent));
- present = true;
- } else {
- const softleaf_t entry = softleaf_from_pte(ptent);
-
- if (softleaf_has_pfn(entry))
- folio = softleaf_to_folio(entry);
- }
-
- if (folio) {
- /* We treat non-present entries as "maybe shared". */
- if (!present || folio_maybe_mapped_shared(folio) ||
- hugetlb_pmd_shared(pte))
- mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
- else
- mss->private_hugetlb += huge_page_size(hstate_vma(vma));
- }
- spin_unlock(ptl);
- return 0;
-}
-#else
-#define smaps_hugetlb_range NULL
-#endif /* HUGETLB_PAGE */
-
-static const struct mm_walk_ops smaps_walk_ops = {
- .pmd_entry = smaps_pte_range,
- .hugetlb_entry = smaps_hugetlb_range,
- .walk_lock = PGWALK_RDLOCK,
-};
-
-static const struct mm_walk_ops smaps_shmem_walk_ops = {
- .pmd_entry = smaps_pte_range,
- .hugetlb_entry = smaps_hugetlb_range,
- .pte_hole = smaps_pte_hole,
- .walk_lock = PGWALK_RDLOCK,
-};
-
/*
* Gather mem stats from @vma with the indicated beginning
* address @start, and keep them in @mss.
@@ -1287,40 +1096,90 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = {
* Use vm_start of @vma as the beginning address if @start is 0.
*/
static void smap_gather_stats(struct vm_area_struct *vma,
- struct mem_size_stats *mss, unsigned long start)
+ struct mem_size_stats *mss,
+ unsigned long start)
{
- const struct mm_walk_ops *ops = &smaps_walk_ops;
-
- /* Invalid start */
- if (start >= vma->vm_end)
- return;
+ struct pt_range_walk ptw = {
+ .mm = vma->vm_mm
+ };
+ enum pt_range_walk_type type;
+ pt_type_flags_t flags = PT_TYPE_ALL;
- if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
- /*
- * For shared or readonly shmem mappings we know that all
- * swapped out pages belong to the shmem object, and we can
- * obtain the swap value much more efficiently. For private
- * writable mappings, we might have COW pages that are
- * not affected by the parent swapped out pages of the shmem
- * object, so we have to distinguish them during the page walk.
- * Unless we know that the shmem object (or the part mapped by
- * our VMA) has no swapped out pages at all.
- */
- unsigned long shmem_swapped = shmem_swap_usage(vma);
+ if (!start)
+ start = vma->vm_start;
+
+ flags &= ~(PT_TYPE_NONE|PT_TYPE_PFN);
+
+ type = pt_range_walk_start(&ptw, vma, start, vma->vm_end, flags);
+ while (type != PTW_DONE) {
+ bool locked = !!(vma->vm_flags & VM_LOCKED);
+ bool compound = false, account = false;
+ unsigned long swap_size;
+ int mapcount;
+
+ switch (type) {
+ case PTW_FOLIO:
+ case PTW_MIGRATION:
+ case PTW_HWPOISON:
+ case PTW_DEVICE:
+ /*
+ * We either have a folio because vm_normal_folio was
+ * successful, or because we had a special swap entry
+ * and could retrieve it with softleaf_to_page.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ /* HugeTLB */
+ unsigned long size = huge_page_size(hstate_vma(ptw.vma));
+
+ if (!ptw.present || folio_maybe_mapped_shared(ptw.folio) ||
+ ptw.pmd_shared)
+ mss->shared_hugetlb += size;
+ else
+ mss->private_hugetlb += size;
+ } else {
+ account = true;
+ if (ptw.level == PTW_PMD_LEVEL) {
+ /* THP */
+ compound = true;
+ if (folio_test_anon(ptw.folio))
+ mss->anonymous_thp += ptw.size;
+ else if (folio_test_swapbacked(ptw.folio))
+ mss->shmem_thp += ptw.size;
+ else if (folio_is_zone_device(ptw.folio))
+ /* pass */;
+ else
+ mss->file_thp += ptw.size;
+ } else if (ptw.level == PTW_PTE_LEVEL && ptw.nr_entries > 1) {
+ compound = true;
+ }
+ }
+ break;
+ case PTW_SWAP:
+ account = true;
+ swap_size = PAGE_SIZE * ptw.nr_entries;
+ mss->swap += swap_size;
+ mapcount = swp_swapcount(ptw.softleaf_entry);
+ if (mapcount >= 2) {
+ u64 pss_delta = (u64)swap_size << PSS_SHIFT;
- if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
- !(vma->vm_flags & VM_WRITE))) {
- mss->swap += shmem_swapped;
- } else {
- ops = &smaps_shmem_walk_ops;
+ do_div(pss_delta, mapcount);
+ mss->swap_pss += pss_delta;
+ } else {
+ mss->swap_pss += (u64)swap_size << PSS_SHIFT;
+ }
+ break;
+ default:
+ /* Ooops */
+ break;
}
+
+ if (account && ptw.folio)
+ smaps_account(mss, ptw.page, compound, ptw.young,
+ ptw.dirty, locked, ptw.present, ptw.size);
+ type = pt_range_walk_next(&ptw, vma, start, vma->vm_end, flags);
}
- /* mmap_lock is held in m_start */
- if (!start)
- walk_page_vma(vma, ops, mss);
- else
- walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
+ pt_range_walk_done(&ptw);
}
#define SEQ_PUT_DEC(str, val) \
--
2.35.3
^ permalink raw reply	[flat|nested] 8+ messages in thread

* [RFC PATCH 6/7] mm: Make /proc/pid/numa_maps use the new generic pagewalk API
2026-04-12 17:42 [RFC PATCH 0/7] Implement a new generic pagewalk API Oscar Salvador
` (4 preceding siblings ...)
2026-04-12 17:42 ` [RFC PATCH 5/7] mm: Make /proc/pid/smaps use the new generic pagewalk API Oscar Salvador
@ 2026-04-12 17:42 ` Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 7/7] mm: Make /proc/pid/pagemap " Oscar Salvador
6 siblings, 0 replies; 8+ messages in thread
From: Oscar Salvador @ 2026-04-12 17:42 UTC (permalink / raw)
To: Andrew Morton
Cc: David Hildenbrand, Michal Hocko, Vlastimil Babka, Muchun Song,
Lorenzo Stoakes, linux-kernel, linux-mm, Oscar Salvador
Have /proc/pid/numa_maps make use of the new generic API, and remove
the code which was using the old one.
Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
fs/proc/task_mmu.c | 136 +++++++++++----------------------------------
1 file changed, 32 insertions(+), 104 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index afbcdb11ad80..776e7a6baf00 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -3048,108 +3048,6 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
return page;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
- struct vm_area_struct *vma,
- unsigned long addr)
-{
- struct page *page;
- int nid;
-
- if (!pmd_present(pmd))
- return NULL;
-
- page = vm_normal_page_pmd(vma, addr, pmd);
- if (!page)
- return NULL;
-
- if (PageReserved(page))
- return NULL;
-
- nid = page_to_nid(page);
- if (!node_isset(nid, node_states[N_MEMORY]))
- return NULL;
-
- return page;
-}
-#endif
-
-static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
- unsigned long end, struct mm_walk *walk)
-{
- struct numa_maps *md = walk->private;
- struct vm_area_struct *vma = walk->vma;
- spinlock_t *ptl;
- pte_t *orig_pte;
- pte_t *pte;
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- ptl = pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- struct page *page;
-
- page = can_gather_numa_stats_pmd(*pmd, vma, addr);
- if (page)
- gather_stats(page, md, pmd_dirty(*pmd),
- HPAGE_PMD_SIZE/PAGE_SIZE);
- spin_unlock(ptl);
- return 0;
- }
-#endif
- orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte) {
- walk->action = ACTION_AGAIN;
- return 0;
- }
- do {
- pte_t ptent = ptep_get(pte);
- struct page *page = can_gather_numa_stats(ptent, vma, addr);
- if (!page)
- continue;
- gather_stats(page, md, pte_dirty(ptent), 1);
-
- } while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap_unlock(orig_pte, ptl);
- cond_resched();
- return 0;
-}
-#ifdef CONFIG_HUGETLB_PAGE
-static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
- unsigned long addr, unsigned long end, struct mm_walk *walk)
-{
- pte_t huge_pte;
- struct numa_maps *md;
- struct page *page;
- spinlock_t *ptl;
-
- ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
- huge_pte = huge_ptep_get(walk->mm, addr, pte);
- if (!pte_present(huge_pte))
- goto out;
-
- page = pte_page(huge_pte);
-
- md = walk->private;
- gather_stats(page, md, pte_dirty(huge_pte), 1);
-out:
- spin_unlock(ptl);
- return 0;
-}
-
-#else
-static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
- unsigned long addr, unsigned long end, struct mm_walk *walk)
-{
- return 0;
-}
-#endif
-
-static const struct mm_walk_ops show_numa_ops = {
- .hugetlb_entry = gather_hugetlb_stats,
- .pmd_entry = gather_pte_stats,
- .walk_lock = PGWALK_RDLOCK,
-};
-
/*
* Display pages allocated per node and memory policy via /proc.
*/
@@ -3161,9 +3059,15 @@ static int show_numa_map(struct seq_file *m, void *v)
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
+ struct pt_range_walk ptw = {
+ .mm = mm
+ };
+ enum pt_range_walk_type type;
+ pt_type_flags_t flags;
char buffer[64];
struct mempolicy *pol;
pgoff_t ilx;
+ int nr_pages;
int nid;
if (!mm)
@@ -3194,8 +3098,32 @@ static int show_numa_map(struct seq_file *m, void *v)
if (is_vm_hugetlb_page(vma))
seq_puts(m, " huge");
- /* mmap_lock is held by m_start */
- walk_page_vma(vma, &show_numa_ops, md);
+ flags = PT_TYPE_FOLIO;
+ type = pt_range_walk_start(&ptw, vma, vma->vm_start, vma->vm_end, flags);
+ while (type != PTW_DONE) {
+
+ if (!ptw.folio || !ptw.page || PageReserved(ptw.page))
+ goto not_found;
+
+ nid = page_to_nid(ptw.page);
+ if (!node_isset(nid, node_states[N_MEMORY]))
+ goto not_found;
+
+ if (is_vm_hugetlb_page(vma))
+ /*
+ * As opposed to THP, HugeTLB counts the entire huge
+ * page as one unit size.
+ */
+ nr_pages = ptw.nr_entries;
+ else
+ nr_pages = ptw.size / PAGE_SIZE;
+
+ gather_stats(ptw.page, md, ptw.dirty, nr_pages);
+not_found:
+ type = pt_range_walk_next(&ptw, vma, vma->vm_start, vma->vm_end, flags);
+
+ }
+ pt_range_walk_done(&ptw);
if (!md->pages)
goto out;
--
2.35.3
^ permalink raw reply	[flat|nested] 8+ messages in thread

* [RFC PATCH 7/7] mm: Make /proc/pid/pagemap use the new generic pagewalk API
2026-04-12 17:42 [RFC PATCH 0/7] Implement a new generic pagewalk API Oscar Salvador
` (5 preceding siblings ...)
2026-04-12 17:42 ` [RFC PATCH 6/7] mm: Make /proc/pid/numa_maps " Oscar Salvador
@ 2026-04-12 17:42 ` Oscar Salvador
6 siblings, 0 replies; 8+ messages in thread
From: Oscar Salvador @ 2026-04-12 17:42 UTC (permalink / raw)
To: Andrew Morton
Cc: David Hildenbrand, Michal Hocko, Vlastimil Babka, Muchun Song,
Lorenzo Stoakes, linux-kernel, linux-mm, Oscar Salvador
Have /proc/pid/pagemap make use of the new generic API, and remove
the code which was using the old one.
Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
arch/x86/include/asm/pgtable.h | 4 +
arch/x86/mm/pgtable.c | 18 +-
fs/proc/task_mmu.c | 906 +++++++++++++++------------------
include/linux/leafops.h | 13 +
include/linux/pgtable.h | 30 ++
mm/pgtable-generic.c | 10 +
6 files changed, 481 insertions(+), 500 deletions(-)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a68ff339cd56..1d18f6177784 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1400,6 +1400,10 @@ static inline pud_t pudp_establish(struct vm_area_struct *vma,
}
#endif
+#define __HAVE_ARCH_PUDP_INVALIDATE_AD
+extern pud_t pudp_invalidate_ad(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp);
+
#define __HAVE_ARCH_PMDP_INVALIDATE_AD
extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 2e5ecfdce73c..828f5ca9195e 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -530,8 +530,22 @@ pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
}
#endif
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
- defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+#if (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) || \
+ defined CONFIG_HUGETLB_PAGE
+
+pud_t pudp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp)
+{
+ VM_WARN_ON_ONCE(!pud_present(*pudp));
+
+ /*
+ * No flush is necessary. Once an invalid PUD is established, the PUD's
+ * access and dirty bits cannot be updated.
+ */
+ return pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
+}
+
pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
pud_t *pudp)
{
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 776e7a6baf00..6b6d5a39cd5a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1856,192 +1856,6 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
return make_pme(frame, flags);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
- unsigned long end, struct vm_area_struct *vma,
- struct pagemapread *pm)
-{
- unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT;
- u64 flags = 0, frame = 0;
- pmd_t pmd = *pmdp;
- struct page *page = NULL;
- struct folio *folio = NULL;
- int err = 0;
-
- if (vma->vm_flags & VM_SOFTDIRTY)
- flags |= PM_SOFT_DIRTY;
-
- if (pmd_none(pmd))
- goto populate_pagemap;
-
- if (pmd_present(pmd)) {
- page = pmd_page(pmd);
-
- flags |= PM_PRESENT;
- if (pmd_soft_dirty(pmd))
- flags |= PM_SOFT_DIRTY;
- if (pmd_uffd_wp(pmd))
- flags |= PM_UFFD_WP;
- if (pm->show_pfn)
- frame = pmd_pfn(pmd) + idx;
- } else if (thp_migration_supported()) {
- const softleaf_t entry = softleaf_from_pmd(pmd);
- unsigned long offset;
-
- if (pm->show_pfn) {
- if (softleaf_has_pfn(entry))
- offset = softleaf_to_pfn(entry) + idx;
- else
- offset = swp_offset(entry) + idx;
- frame = swp_type(entry) |
- (offset << MAX_SWAPFILES_SHIFT);
- }
- flags |= PM_SWAP;
- if (pmd_swp_soft_dirty(pmd))
- flags |= PM_SOFT_DIRTY;
- if (pmd_swp_uffd_wp(pmd))
- flags |= PM_UFFD_WP;
- VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd));
- page = softleaf_to_page(entry);
- }
-
- if (page) {
- folio = page_folio(page);
- if (!folio_test_anon(folio))
- flags |= PM_FILE;
- }
-
-populate_pagemap:
- for (; addr != end; addr += PAGE_SIZE, idx++) {
- u64 cur_flags = flags;
- pagemap_entry_t pme;
-
- if (folio && (flags & PM_PRESENT) &&
- __folio_page_mapped_exclusively(folio, page))
- cur_flags |= PM_MMAP_EXCLUSIVE;
-
- pme = make_pme(frame, cur_flags);
- err = add_to_pagemap(&pme, pm);
- if (err)
- break;
- if (pm->show_pfn) {
- if (flags & PM_PRESENT)
- frame++;
- else if (flags & PM_SWAP)
- frame += (1 << MAX_SWAPFILES_SHIFT);
- }
- }
- return err;
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
-{
- struct vm_area_struct *vma = walk->vma;
- struct pagemapread *pm = walk->private;
- spinlock_t *ptl;
- pte_t *pte, *orig_pte;
- int err = 0;
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- ptl = pmd_trans_huge_lock(pmdp, vma);
- if (ptl) {
- err = pagemap_pmd_range_thp(pmdp, addr, end, vma, pm);
- spin_unlock(ptl);
- return err;
- }
-#endif
-
- /*
- * We can assume that @vma always points to a valid one and @end never
- * goes beyond vma->vm_end.
- */
- orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
- if (!pte) {
- walk->action = ACTION_AGAIN;
- return err;
- }
- for (; addr < end; pte++, addr += PAGE_SIZE) {
- pagemap_entry_t pme;
-
- pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
- err = add_to_pagemap(&pme, pm);
- if (err)
- break;
- }
- pte_unmap_unlock(orig_pte, ptl);
-
- cond_resched();
-
- return err;
-}
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* This function walks within one hugetlb entry in the single call */
-static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
- unsigned long addr, unsigned long end,
- struct mm_walk *walk)
-{
- struct pagemapread *pm = walk->private;
- struct vm_area_struct *vma = walk->vma;
- u64 flags = 0, frame = 0;
- spinlock_t *ptl;
- int err = 0;
- pte_t pte;
-
- if (vma->vm_flags & VM_SOFTDIRTY)
- flags |= PM_SOFT_DIRTY;
-
- ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep);
- pte = huge_ptep_get(walk->mm, addr, ptep);
- if (pte_present(pte)) {
- struct folio *folio = page_folio(pte_page(pte));
-
- if (!folio_test_anon(folio))
- flags |= PM_FILE;
-
- if (!folio_maybe_mapped_shared(folio) &&
- !hugetlb_pmd_shared(ptep))
- flags |= PM_MMAP_EXCLUSIVE;
-
- if (huge_pte_uffd_wp(pte))
- flags |= PM_UFFD_WP;
-
- flags |= PM_PRESENT;
- if (pm->show_pfn)
- frame = pte_pfn(pte) +
- ((addr & ~hmask) >> PAGE_SHIFT);
- } else if (pte_swp_uffd_wp_any(pte)) {
- flags |= PM_UFFD_WP;
- }
-
- for (; addr != end; addr += PAGE_SIZE) {
- pagemap_entry_t pme = make_pme(frame, flags);
-
- err = add_to_pagemap(&pme, pm);
- if (err)
- break;
- if (pm->show_pfn && (flags & PM_PRESENT))
- frame++;
- }
-
- spin_unlock(ptl);
- cond_resched();
-
- return err;
-}
-#else
-#define pagemap_hugetlb_range NULL
-#endif /* HUGETLB_PAGE */
-
-static const struct mm_walk_ops pagemap_ops = {
- .pmd_entry = pagemap_pmd_range,
- .pte_hole = pagemap_pte_hole,
- .hugetlb_entry = pagemap_hugetlb_range,
- .walk_lock = PGWALK_RDLOCK,
-};
-
/*
* /proc/pid/pagemap - an array mapping virtual pages to pfns
*
@@ -2070,99 +1884,6 @@ static const struct mm_walk_ops pagemap_ops = {
* determine which areas of memory are actually mapped and llseek to
* skip over unmapped regions.
*/
-static ssize_t pagemap_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct mm_struct *mm = file->private_data;
- struct pagemapread pm;
- unsigned long src;
- unsigned long svpfn;
- unsigned long start_vaddr;
- unsigned long end_vaddr;
- int ret = 0, copied = 0;
-
- if (!mm || !mmget_not_zero(mm))
- goto out;
-
- ret = -EINVAL;
- /* file position must be aligned */
- if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
- goto out_mm;
-
- ret = 0;
- if (!count)
- goto out_mm;
-
- /* do not disclose physical addresses: attack vector */
- pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
-
- pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
- pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
- ret = -ENOMEM;
- if (!pm.buffer)
- goto out_mm;
-
- src = *ppos;
- svpfn = src / PM_ENTRY_BYTES;
- end_vaddr = mm->task_size;
-
- /* watch out for wraparound */
- start_vaddr = end_vaddr;
- if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
- unsigned long end;
-
- ret = mmap_read_lock_killable(mm);
- if (ret)
- goto out_free;
- start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
- mmap_read_unlock(mm);
-
- end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT);
- if (end >= start_vaddr && end < mm->task_size)
- end_vaddr = end;
- }
-
- /* Ensure the address is inside the task */
- if (start_vaddr > mm->task_size)
- start_vaddr = end_vaddr;
-
- ret = 0;
- while (count && (start_vaddr < end_vaddr)) {
- int len;
- unsigned long end;
-
- pm.pos = 0;
- end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
- /* overflow ? */
- if (end < start_vaddr || end > end_vaddr)
- end = end_vaddr;
- ret = mmap_read_lock_killable(mm);
- if (ret)
- goto out_free;
- ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
- mmap_read_unlock(mm);
- start_vaddr = end;
-
- len = min(count, PM_ENTRY_BYTES * pm.pos);
- if (copy_to_user(buf, pm.buffer, len)) {
- ret = -EFAULT;
- goto out_free;
- }
- copied += len;
- buf += len;
- count -= len;
- }
- *ppos += copied;
- if (!ret || ret == PM_END_OF_BUFFER)
- ret = copied;
-
-out_free:
- kfree(pm.buffer);
-out_mm:
- mmput(mm);
-out:
- return ret;
-}
static int pagemap_open(struct inode *inode, struct file *file)
{
@@ -2267,6 +1988,23 @@ static void make_uffd_wp_pte(struct vm_area_struct *vma,
}
}
+#ifdef CONFIG_HUGETLB_PAGE
+static void make_uffd_wp_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp)
+{
+ pud_t old, pud = *pudp;
+
+ if (pud_present(pud)) {
+ old = pudp_invalidate_ad(vma, addr, pudp);
+ pud = pud_mkuffd_wp(old);
+ set_pud_at(vma->vm_mm, addr, pudp, pud);
+ } else if (pud_is_migration_entry(pud)) {
+ pud = pud_swp_mkuffd_wp(pud);
+ set_pud_at(vma->vm_mm, addr, pudp, pud);
+ }
+}
+#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
struct vm_area_struct *vma,
@@ -2539,216 +2277,6 @@ static int pagemap_scan_output(unsigned long categories,
return ret;
}
-static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
- unsigned long end, struct mm_walk *walk)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- struct pagemap_scan_private *p = walk->private;
- struct vm_area_struct *vma = walk->vma;
- unsigned long categories;
- spinlock_t *ptl;
- int ret = 0;
-
- ptl = pmd_trans_huge_lock(pmd, vma);
- if (!ptl)
- return -ENOENT;
-
- categories = p->cur_vma_category |
- pagemap_thp_category(p, vma, start, *pmd);
-
- if (!pagemap_scan_is_interesting_page(categories, p))
- goto out_unlock;
-
- ret = pagemap_scan_output(categories, p, start, &end);
- if (start == end)
- goto out_unlock;
-
- if (~p->arg.flags & PM_SCAN_WP_MATCHING)
- goto out_unlock;
- if (~categories & PAGE_IS_WRITTEN)
- goto out_unlock;
-
- /*
- * Break huge page into small pages if the WP operation
- * needs to be performed on a portion of the huge page.
- */
- if (end != start + HPAGE_SIZE) {
- spin_unlock(ptl);
- split_huge_pmd(vma, pmd, start);
- pagemap_scan_backout_range(p, start, end);
- /* Report as if there was no THP */
- return -ENOENT;
- }
-
- make_uffd_wp_pmd(vma, start, pmd);
- flush_tlb_range(vma, start, end);
-out_unlock:
- spin_unlock(ptl);
- return ret;
-#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
- return -ENOENT;
-#endif
-}
-
-static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
- unsigned long end, struct mm_walk *walk)
-{
- struct pagemap_scan_private *p = walk->private;
- struct vm_area_struct *vma = walk->vma;
- unsigned long addr, flush_end = 0;
- pte_t *pte, *start_pte;
- spinlock_t *ptl;
- int ret;
-
- ret = pagemap_scan_thp_entry(pmd, start, end, walk);
- if (ret != -ENOENT)
- return ret;
-
- ret = 0;
- start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
- if (!pte) {
- walk->action = ACTION_AGAIN;
- return 0;
- }
-
- lazy_mmu_mode_enable();
-
- if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
- /* Fast path for performing exclusive WP */
- for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
- pte_t ptent = ptep_get(pte);
-
- if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
- pte_swp_uffd_wp_any(ptent))
- continue;
- make_uffd_wp_pte(vma, addr, pte, ptent);
- if (!flush_end)
- start = addr;
- flush_end = addr + PAGE_SIZE;
- }
- goto flush_and_return;
- }
-
- if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
- p->arg.category_mask == PAGE_IS_WRITTEN &&
- p->arg.return_mask == PAGE_IS_WRITTEN) {
- for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
- unsigned long next = addr + PAGE_SIZE;
- pte_t ptent = ptep_get(pte);
-
- if ((pte_present(ptent) && pte_uffd_wp(ptent)) ||
- pte_swp_uffd_wp_any(ptent))
- continue;
- ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
- p, addr, &next);
- if (next == addr)
- break;
- if (~p->arg.flags & PM_SCAN_WP_MATCHING)
- continue;
- make_uffd_wp_pte(vma, addr, pte, ptent);
- if (!flush_end)
- start = addr;
- flush_end = next;
- }
- goto flush_and_return;
- }
-
- for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
- pte_t ptent = ptep_get(pte);
- unsigned long categories = p->cur_vma_category |
- pagemap_page_category(p, vma, addr, ptent);
- unsigned long next = addr + PAGE_SIZE;
-
- if (!pagemap_scan_is_interesting_page(categories, p))
- continue;
-
- ret = pagemap_scan_output(categories, p, addr, &next);
- if (next == addr)
- break;
-
- if (~p->arg.flags & PM_SCAN_WP_MATCHING)
- continue;
- if (~categories & PAGE_IS_WRITTEN)
- continue;
-
- make_uffd_wp_pte(vma, addr, pte, ptent);
- if (!flush_end)
- start = addr;
- flush_end = next;
- }
-
-flush_and_return:
- if (flush_end)
- flush_tlb_range(vma, start, addr);
-
- lazy_mmu_mode_disable();
- pte_unmap_unlock(start_pte, ptl);
-
- cond_resched();
- return ret;
-}
-
-#ifdef CONFIG_HUGETLB_PAGE
-static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
- unsigned long start, unsigned long end,
- struct mm_walk *walk)
-{
- struct pagemap_scan_private *p = walk->private;
- struct vm_area_struct *vma = walk->vma;
- unsigned long categories;
- spinlock_t *ptl;
- int ret = 0;
- pte_t pte;
-
- if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
- /* Go the short route when not write-protecting pages. */
-
- pte = huge_ptep_get(walk->mm, start, ptep);
- categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
-
- if (!pagemap_scan_is_interesting_page(categories, p))
- return 0;
-
- return pagemap_scan_output(categories, p, start, &end);
- }
-
- i_mmap_lock_write(vma->vm_file->f_mapping);
- ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
-
- pte = huge_ptep_get(walk->mm, start, ptep);
- categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
-
- if (!pagemap_scan_is_interesting_page(categories, p))
- goto out_unlock;
-
- ret = pagemap_scan_output(categories, p, start, &end);
- if (start == end)
- goto out_unlock;
-
- if (~categories & PAGE_IS_WRITTEN)
- goto out_unlock;
-
- if (end != start + HPAGE_SIZE) {
- /* Partial HugeTLB page WP isn't possible. */
- pagemap_scan_backout_range(p, start, end);
- p->arg.walk_end = start;
- ret = 0;
- goto out_unlock;
- }
-
- make_uffd_wp_huge_pte(vma, start, ptep, pte);
- flush_hugetlb_tlb_range(vma, start, end);
-
-out_unlock:
- spin_unlock(ptl);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
-
- return ret;
-}
-#else
-#define pagemap_scan_hugetlb_entry NULL
-#endif
-
static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
int depth, struct mm_walk *walk)
{
@@ -2773,13 +2301,6 @@ static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
return ret;
}
-static const struct mm_walk_ops pagemap_scan_ops = {
- .test_walk = pagemap_scan_test_walk,
- .pmd_entry = pagemap_scan_pmd_entry,
- .pte_hole = pagemap_scan_pte_hole,
- .hugetlb_entry = pagemap_scan_hugetlb_entry,
-};
-
static int pagemap_scan_get_args(struct pm_scan_arg *arg,
unsigned long uarg)
{
@@ -2877,6 +2398,135 @@ static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
return n;
}
+static unsigned long pagemap_set_category(struct pagemap_scan_private *p,
+ struct pt_range_walk *ptw,
+ enum pt_range_walk_type type)
+{
+ unsigned long categories = 0;
+
+ if (ptw->present) {
+ categories |= PAGE_IS_PRESENT;
+
+ if (type == PTW_FOLIO && !PageAnon(ptw->page))
+ categories |= PAGE_IS_FILE;
+ if (type == PTW_PFN)
+ categories |= PAGE_IS_PFNZERO;
+ } else {
+ categories |= PAGE_IS_SWAPPED;
+ }
+
+ switch (ptw->level) {
+ case PTW_PUD_LEVEL:
+ if (ptw->present) {
+ if (!pud_uffd_wp(ptw->pud))
+ categories |= PAGE_IS_WRITTEN;
+ if (pud_soft_dirty(ptw->pud))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ } else {
+ if (!pud_swp_uffd_wp(ptw->pud))
+ categories |= PAGE_IS_WRITTEN;
+ if (pud_swp_soft_dirty(ptw->pud))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ }
+ break;
+ case PTW_PMD_LEVEL:
+ if (ptw->present) {
+ if (!pmd_uffd_wp(ptw->pmd))
+ categories |= PAGE_IS_WRITTEN;
+ if (pmd_soft_dirty(ptw->pmd))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ } else {
+ const softleaf_t entry = softleaf_from_pmd(ptw->pmd);
+
+ if (softleaf_has_pfn(entry) &&
+ !folio_test_anon(softleaf_to_folio(entry)))
+ categories |= PAGE_IS_FILE;
+ if (!pmd_swp_uffd_wp(ptw->pmd))
+ categories |= PAGE_IS_WRITTEN;
+ if (pmd_swp_soft_dirty(ptw->pmd))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ }
+ break;
+ case PTW_PTE_LEVEL:
+ if (ptw->present) {
+ if (!pte_uffd_wp(ptw->pte))
+ categories |= PAGE_IS_WRITTEN;
+ if (pte_soft_dirty(ptw->pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ } else {
+ if (!pte_swp_uffd_wp_any(ptw->pte))
+ categories |= PAGE_IS_WRITTEN;
+ if (pte_swp_soft_dirty(ptw->pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
+ }
+ break;
+ }
+
+ return categories;
+}
+
+static int pagemap_scan_walk(struct vm_area_struct *vma, struct pagemap_scan_private *p,
+ unsigned long addr)
+{
+ int ret = 0;
+ struct pt_range_walk ptw = {
+ .mm = vma->vm_mm
+ };
+ enum pt_range_walk_type type;
+ pt_type_flags_t flags = PT_TYPE_ALL;
+
+keep_walking:
+ type = pt_range_walk_start(&ptw, vma, addr, vma->vm_end, flags);
+ while (type != PTW_DONE) {
+ unsigned long categories = p->cur_vma_category |
+ pagemap_set_category(p, &ptw, type);
+ unsigned long curr_addr = ptw.curr_addr;
+
+ if (pagemap_scan_is_interesting_page(categories, p)) {
+ unsigned long end;
+
+ end = ptw.next_addr;
+
+ ret = pagemap_scan_output(categories, p, curr_addr, &end);
+ if (curr_addr == end)
+ goto out;
+
+ if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ goto next_entry;
+ if (~categories & PAGE_IS_WRITTEN)
+ goto next_entry;
+
+ if (end != curr_addr + HPAGE_SIZE) {
+ if (is_vm_hugetlb_page(ptw.vma)) {
+ /* Partial HugeTLB page WP isn't possible. */
+ pagemap_scan_backout_range(p, curr_addr, end);
+ p->arg.walk_end = curr_addr;
+ ret = 0;
+ goto next_entry;
+ }
+ if (ptw.level == PTW_PMD_LEVEL) {
+ pt_range_walk_done(&ptw);
+ split_huge_pmd(ptw.vma, ptw.pmdp, curr_addr);
+ pagemap_scan_backout_range(p, curr_addr, end);
+ /* Relaunch from this entry now that we split the pmd */
+ addr = curr_addr;
+ goto keep_walking;
+ }
+ }
+
+ if (ptw.level == PTW_PUD_LEVEL)
+ make_uffd_wp_pud(ptw.vma, curr_addr, ptw.pudp);
+ if (ptw.level == PTW_PMD_LEVEL)
+ make_uffd_wp_pmd(ptw.vma, curr_addr, ptw.pmdp);
+ if (ptw.level == PTW_PTE_LEVEL)
+ make_uffd_wp_pte(ptw.vma, curr_addr, ptw.ptep, ptw.pte);
+ }
+next_entry:
+ type = pt_range_walk_next(&ptw, vma, vma->vm_start, vma->vm_end, flags);
+ }
+out:
+ pt_range_walk_done(&ptw);
+ return ret;
+}
+
static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
{
struct pagemap_scan_private p = {0};
@@ -2897,6 +2547,7 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
for (walk_start = p.arg.start; walk_start < p.arg.end;
walk_start = p.arg.walk_end) {
struct mmu_notifier_range range;
+ unsigned long next;
long n_out;
if (fatal_signal_pending(current)) {
@@ -2915,8 +2566,21 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
mmu_notifier_invalidate_range_start(&range);
}
- ret = walk_page_range(mm, walk_start, p.arg.end,
- &pagemap_scan_ops, &p);
+ do {
+ struct vm_area_struct *vma = find_vma(mm, walk_start);
+
+ if (vma) {
+ ret = pagemap_scan_walk(vma, &p, walk_start);
+ if (ret)
+ break;
+ walk_start = min(p.arg.end, vma->vm_end);
+ next = walk_start;
+ } else {
+ walk_start = p.arg.end;
+ next = p.arg.end;
+ }
+
+ } while (next < p.arg.end);
if (p.arg.flags & PM_SCAN_WP_MATCHING)
mmu_notifier_invalidate_range_end(&range);
@@ -2950,6 +2614,251 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
return ret;
}
+static int pagemap_read_walk_range(struct vm_area_struct *vma, unsigned long start,
+ struct pagemapread *pm)
+{
+ int err = 0;
+ struct pt_range_walk ptw = {
+ .mm = vma->vm_mm
+ };
+ enum pt_range_walk_type type;
+ pt_type_flags_t wflags = PT_TYPE_ALL;
+ pte_t *ptep;
+
+ wflags &= ~(PT_TYPE_NONE|PT_TYPE_PFN);
+
+ type = pt_range_walk_start(&ptw, vma, start, vma->vm_end, wflags);
+ while (type != PTW_DONE) {
+ unsigned long end;
+ u64 frame = 0, flags = 0;
+ struct page *page = NULL;
+ struct folio *folio = NULL;
+
+ end = 0;
+ switch (ptw.level) {
+ case PTW_PUD_LEVEL:
+ end = pud_addr_end(start, vma->vm_end);
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
+
+ if (pud_present(ptw.pud)) {
+ page = pud_page(ptw.pud);
+ folio = page_folio(page);
+ flags |= PM_PRESENT;
+ if (pud_uffd_wp(ptw.pud))
+ flags |= PM_UFFD_WP;
+
+ if (!folio_test_anon(folio))
+ flags |= PM_FILE;
+
+ if (pm->show_pfn) {
+ unsigned long hmask = huge_page_mask(hstate_vma(vma));
+
+ frame = pud_pfn(ptw.pud) +
+ ((start & ~hmask) >> PAGE_SHIFT);
+ }
+ } else if (pud_swp_uffd_wp(ptw.pud)) {
+ flags |= PM_UFFD_WP;
+ }
+ break;
+ case PTW_PMD_LEVEL: {
+ unsigned int idx = (start & ~PMD_MASK) >> PAGE_SHIFT;
+
+ end = pmd_addr_end(start, vma->vm_end);
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
+
+ if (pmd_present(ptw.pmd)) {
+ page = pmd_page(ptw.pmd);
+ flags |= PM_PRESENT;
+
+ if (pmd_soft_dirty(ptw.pmd))
+ flags |= PM_SOFT_DIRTY;
+ if (pmd_uffd_wp(ptw.pmd))
+ flags |= PM_UFFD_WP;
+ if (pm->show_pfn)
+ frame = pmd_pfn(ptw.pmd) + idx;
+ } else if (thp_migration_supported() || IS_ENABLED(CONFIG_HUGETLB_PAGE)) {
+ const softleaf_t entry = softleaf_from_pmd(ptw.pmd);
+ unsigned long offset;
+
+ if (pm->show_pfn) {
+ if (softleaf_has_pfn(entry))
+ offset = softleaf_to_pfn(entry) + idx;
+ else
+ offset = swp_offset(entry) + idx;
+ frame = swp_type(entry) |
+ (offset << MAX_SWAPFILES_SHIFT);
+ }
+
+ if (!is_vm_hugetlb_page(vma))
+ flags |= PM_SWAP;
+ if (pmd_swp_soft_dirty(ptw.pmd))
+ flags |= PM_SOFT_DIRTY;
+ if (pmd_swp_uffd_wp(ptw.pmd))
+ flags |= PM_UFFD_WP;
+
+ VM_WARN_ON_ONCE(!pmd_is_migration_entry(ptw.pmd));
+ page = softleaf_to_page(entry);
+ }
+
+ if (page) {
+ folio = page_folio(page);
+ if (!folio_test_anon(folio))
+ flags |= PM_FILE;
+ }
+
+ break;
+ }
+ case PTW_PTE_LEVEL:
+ end = pmd_addr_end(start, vma->vm_end);
+ break;
+ }
+
+ if (ptw.level == PTW_PTE_LEVEL) {
+ ptep = ptw.ptep;
+ for (; start < end; ptep++, start += PAGE_SIZE) {
+ pagemap_entry_t pme;
+
+ pme = pte_to_pagemap_entry(pm, vma, start, ptep_get(ptep));
+ err = add_to_pagemap(&pme, pm);
+ ptw.next_addr = start + PAGE_SIZE;
+ if (err)
+ break;
+ }
+ } else {
+ for (; start != end; start += PAGE_SIZE) {
+ u64 cur_flags = flags;
+ pagemap_entry_t pme;
+
+ if (folio && (flags & PM_PRESENT) &&
+ __folio_page_mapped_exclusively(folio, page))
+ cur_flags |= PM_MMAP_EXCLUSIVE;
+
+ pme = make_pme(frame, cur_flags);
+ err = add_to_pagemap(&pme, pm);
+ if (err)
+ break;
+ if (pm->show_pfn) {
+ if (flags & PM_PRESENT)
+ frame++;
+ else if (flags & PM_SWAP)
+ frame += (1 << MAX_SWAPFILES_SHIFT);
+ }
+ }
+ }
+ type = pt_range_walk_next(&ptw, vma, vma->vm_start, vma->vm_end, wflags);
+ }
+ pt_range_walk_done(&ptw);
+
+ return err;
+}
+
+static ssize_t pagemap_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct mm_struct *mm = file->private_data;
+ struct pagemapread pm;
+ unsigned long src;
+ unsigned long svpfn;
+ unsigned long start_vaddr;
+ unsigned long end_vaddr;
+ int ret = 0, copied = 0;
+
+ if (!mm || !mmget_not_zero(mm))
+ goto out;
+
+ ret = -EINVAL;
+ /* file position must be aligned */
+ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
+ goto out_mm;
+
+ ret = 0;
+ if (!count)
+ goto out_mm;
+
+ /* do not disclose physical addresses: attack vector */
+ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
+
+ pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+ pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!pm.buffer)
+ goto out_mm;
+
+ src = *ppos;
+ svpfn = src / PM_ENTRY_BYTES;
+ end_vaddr = mm->task_size;
+
+ /* watch out for wraparound */
+ start_vaddr = end_vaddr;
+ if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
+ unsigned long end;
+
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ goto out_free;
+ start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
+ mmap_read_unlock(mm);
+
+ end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT);
+ if (end >= start_vaddr && end < mm->task_size)
+ end_vaddr = end;
+ }
+
+ /* Ensure the address is inside the task */
+ if (start_vaddr > mm->task_size)
+ start_vaddr = end_vaddr;
+
+ ret = 0;
+
+ while (count && (start_vaddr < end_vaddr)) {
+ int len;
+ unsigned long end;
+ unsigned long next;
+
+ pm.pos = 0;
+ end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
+ if (end < start_vaddr || end > end_vaddr)
+ end = end_vaddr;
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ goto out_free;
+
+ do {
+ struct vm_area_struct *vma = find_vma(mm, start_vaddr);
+
+ if (vma) {
+ ret = pagemap_read_walk_range(vma, start_vaddr, &pm);
+ if (ret)
+ goto out_err;
+ start_vaddr = min(end, vma->vm_end);
+ next = start_vaddr;
+ } else {
+ start_vaddr = next = end;
+ }
+ } while (next < end);
+out_err:
+ mmap_read_unlock(mm);
+
+ len = min(count, PM_ENTRY_BYTES * pm.pos);
+ if (copy_to_user(buf, pm.buffer, len)) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+ copied += len;
+ buf += len;
+ count -= len;
+ }
+ *ppos += copied;
+ if (!ret || ret == PM_END_OF_BUFFER)
+ ret = copied;
+
+out_free:
+ kfree(pm.buffer);
+out_mm:
+ mmput(mm);
+out:
+ return ret;
+}
+
static long do_pagemap_cmd(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -2972,6 +2881,7 @@ const struct file_operations proc_pagemap_operations = {
.unlocked_ioctl = do_pagemap_cmd,
.compat_ioctl = do_pagemap_cmd,
};
+
#endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA
diff --git a/include/linux/leafops.h b/include/linux/leafops.h
index 122ac50aeb09..6444625c6fbb 100644
--- a/include/linux/leafops.h
+++ b/include/linux/leafops.h
@@ -618,6 +618,19 @@ static inline bool pmd_is_device_private_entry(pmd_t pmd)
#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+#ifdef CONFIG_HUGETLB_PAGE
+/**
+ * pud_is_migration_entry() - Does this PUD entry encode a migration entry?
+ * @pud: PUD entry.
+ *
+ * Returns: true if the PUD encodes a migration entry, otherwise false.
+ */
+static inline bool pud_is_migration_entry(pud_t pud)
+{
+ return softleaf_is_migration(softleaf_from_pud(pud));
+}
+#endif
+
/**
* pmd_is_migration_entry() - Does this PMD entry encode a migration entry?
* @pmd: PMD entry.
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 6f01d5ed73f6..6f8e83a5bb08 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1229,11 +1229,21 @@ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
}
#endif
+#ifndef __HAVE_ARCH_PUDP_INVALIDATE
+extern pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp);
+#endif
+
#ifndef __HAVE_ARCH_PMDP_INVALIDATE
extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp);
#endif
+#ifndef __HAVE_ARCH_PUDP_INVALIDATE_AD
+extern pud_t pudp_invalidate_ad(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp);
+#endif
+
#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
/*
@@ -1776,6 +1786,21 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static inline pud_t pud_swp_mksoft_dirty(pud_t pud)
+{
+ return pud;
+}
+
+static inline int pud_swp_soft_dirty(pud_t pud)
+{
+ return 0;
+}
+
+static inline pud_t pud_swp_clear_soft_dirty(pud_t pud)
+{
+ return pud;
+}
+
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
return pmd;
@@ -1818,6 +1843,11 @@ static inline int pmd_soft_dirty(pmd_t pmd)
return 0;
}
+static inline int pud_soft_dirty(pud_t pud)
+{
+ return 0;
+}
+
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
return pte;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index af7966169d69..f390c93b98b2 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -206,6 +206,16 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
}
#endif
+#ifndef __HAVE_ARCH_PUDP_INVALIDATE_AD
+pud_t pudp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp)
+{
+ VM_WARN_ON_ONCE(!pud_present(*pudp));
+ return pudp_invalidate(vma, address, pudp);
+}
+#endif
+
#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
--
2.35.3
^ permalink raw reply [flat|nested] 8+ messages in thread