From: Oscar Salvador <osalvador@suse.de>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@kernel.org>,
Michal Hocko <mhocko@suse.com>,
Vlastimil Babka <vbabka@kernel.org>,
Muchun Song <muchun.song@linux.dev>,
Lorenzo Stoakes <lorenzo.stoakes@oracle.com>,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
Oscar Salvador <osalvador@suse.de>
Subject: [RFC PATCH 3/7] mm: Implement folio_pmd_batch
Date: Sun, 12 Apr 2026 19:42:40 +0200 [thread overview]
Message-ID: <20260412174244.133715-4-osalvador@suse.de> (raw)
In-Reply-To: <20260412174244.133715-1-osalvador@suse.de>
HugeTLB pages can be mapped with contiguous PMDs, so we need a way to
batch them as we do for contiguous PTEs.
Implement folio_pmd_batch in order to do that.
Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
arch/arm64/include/asm/pgtable.h | 19 ++++++++
include/linux/pgtable.h | 30 +++++++++++++
mm/internal.h | 75 +++++++++++++++++++++++++++++++-
3 files changed, 123 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index e42ad56a86d4..5b5490505b94 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -170,6 +170,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
(__boundary - 1 < (end) - 1) ? __boundary : (end); \
})
+#define pmd_valid_cont(pmd) (pmd_valid(pmd) && pmd_cont(pmd))
+
#define pte_hw_dirty(pte) (pte_write(pte) && !pte_rdonly(pte))
#define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY))
#define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte))
@@ -670,6 +672,12 @@ static inline pgprot_t pmd_pgprot(pmd_t pmd)
return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd));
}
+#define pmd_advance_pfn pmd_advance_pfn
+static inline pmd_t pmd_advance_pfn(pmd_t pmd, unsigned long nr)
+{
+ return pfn_pmd(pmd_pfn(pmd) + nr, pmd_pgprot(pmd));
+}
+
#define pud_pgprot pud_pgprot
static inline pgprot_t pud_pgprot(pud_t pud)
{
@@ -1645,6 +1653,17 @@ extern void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long ad
pte_t *ptep, pte_t old_pte, pte_t pte,
unsigned int nr);
+#ifdef CONFIG_HUGETLB_PAGE
+#define pmd_batch_hint pmd_batch_hint
+static inline unsigned int pmd_batch_hint(pmd_t *pmdp, pmd_t pmd)
+{
+ if (!pmd_valid_cont(pmd))
+ return 1;
+
+ return CONT_PMDS - (((unsigned long)pmdp >> 3) & (CONT_PMDS - 1));
+}
+#endif
+
#ifdef CONFIG_ARM64_CONTPTE
/*
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 1abd9c52a4f2..6f01d5ed73f6 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -358,6 +358,36 @@ static inline void lazy_mmu_mode_pause(void) {}
static inline void lazy_mmu_mode_resume(void) {}
#endif
+#ifndef pmd_batch_hint
+/**
+ * pmd_batch_hint - Number of PMD entries that can be added to batch without scanning.
+ * @pmdp: Page table pointer for the entry.
+ * @pmd: Page table entry.
+ *
+ * Some architectures know that a set of contiguous pmds all map the same
+ * contiguous memory with the same permissions. In this case, it can provide a
+ * hint to aid pmd batching without the core code needing to scan every pmd.
+ *
+ * An architecture implementation may ignore the PMD accessed state. Further,
+ * the dirty state must apply atomically to all the PMDs described by the hint.
+ *
+ * May be overridden by the architecture, else pmd_batch_hint is always 1.
+ */
+static inline unsigned int pmd_batch_hint(pmd_t *pmdp, pmd_t pmd)
+{
+ return 1;
+}
+#endif
+
+#ifndef pmd_advance_pfn
+static inline pmd_t pmd_advance_pfn(pmd_t pmd, unsigned long nr)
+{
+ return __pmd(pmd_val(pmd) + (nr << PFN_PTE_SHIFT));
+}
+#endif
+
+#define pmd_next_pfn(pmd) pmd_advance_pfn(pmd, 1)
+
#ifndef pte_batch_hint
/**
* pte_batch_hint - Number of pages that can be added to batch without scanning.
diff --git a/mm/internal.h b/mm/internal.h
index cb0af847d7d9..8fa0681ff2af 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -269,7 +269,7 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma)
return __anon_vma_prepare(vma);
}
-/* Flags for folio_pte_batch(). */
+/* Flags for folio_{pmd,pte}_batch(). */
typedef int __bitwise fpb_t;
/* Compare PTEs respecting the dirty bit. */
@@ -293,6 +293,79 @@ typedef int __bitwise fpb_t;
*/
#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(4))
+static inline pmd_t __pmd_batch_clear_ignored(pmd_t pmd, fpb_t flags)
+{
+ if (!(flags & FPB_RESPECT_DIRTY))
+ pmd = pmd_mkclean(pmd);
+ if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY)))
+ pmd = pmd_clear_soft_dirty(pmd);
+ if (likely(!(flags & FPB_RESPECT_WRITE)))
+ pmd = pmd_wrprotect(pmd);
+ return pmd_mkold(pmd);
+}
+
+/**
+ * folio_pmd_batch - detect a PMD batch for a large folio.
+ *
+ * The only user of this is hugetlb, for contiguous PMDs.
+ */
+static inline int folio_pmd_batch(struct folio *folio, pmd_t *pmdp, pmd_t *pmdentp,
+ int max_nr, fpb_t flags, bool *any_writable,
+ bool *any_young, bool *any_dirty)
+{
+ pmd_t expected_pmd, pmd = *pmdentp;
+ bool writable, young, dirty;
+ int nr, cur_nr;
+
+ if (any_writable)
+ *any_writable = !!pmd_write(*pmdentp);
+ if (any_young)
+ *any_young = !!pmd_young(*pmdentp);
+ if (any_dirty)
+ *any_dirty = !!pmd_dirty(*pmdentp);
+
+ VM_WARN_ON_FOLIO(!pmd_present(pmd), folio);
+ VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
+ VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pmd_pfn(pmd))) != folio, folio);
+
+ /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
+ max_nr = min_t(unsigned long, max_nr,
+ (folio_pfn(folio) + folio_nr_pages(folio) -
+ pmd_pfn(pmd)) >> (PMD_SHIFT - PAGE_SHIFT));
+
+ nr = pmd_batch_hint(pmdp, pmd);
+ expected_pmd = __pmd_batch_clear_ignored(pmd_advance_pfn(pmd, nr << (PMD_SHIFT - PAGE_SHIFT)), flags);
+ pmdp = pmdp + nr;
+
+ while (nr < max_nr) {
+ pmd = pmdp_get(pmdp);
+ if (any_writable)
+ writable = !!pmd_write(pmd);
+ if (any_young)
+ young = !!pmd_young(pmd);
+ if (any_dirty)
+ dirty = !!pmd_dirty(pmd);
+ pmd = __pmd_batch_clear_ignored(pmd, flags);
+
+ if (!pmd_same(pmd, expected_pmd))
+ break;
+
+ if (any_writable)
+ *any_writable |= writable;
+ if (any_young)
+ *any_young |= young;
+ if (any_dirty)
+ *any_dirty |= dirty;
+
+ cur_nr = pmd_batch_hint(pmdp, pmd);
+ expected_pmd = pmd_advance_pfn(expected_pmd, cur_nr << (PMD_SHIFT - PAGE_SHIFT));
+ pmdp += cur_nr;
+ nr += cur_nr;
+ }
+
+ return min(nr, max_nr);
+}
+
static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
{
if (!(flags & FPB_RESPECT_DIRTY))
--
2.35.3
next prev parent reply other threads:[~2026-04-12 17:43 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-12 17:42 [RFC PATCH 0/7] Implement a new generic pagewalk API Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 1/7] mm: Add softleaf_from_pud Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 2/7] mm: Add {pmd,pud}_huge_lock helper Oscar Salvador
2026-04-12 17:42 ` Oscar Salvador [this message]
2026-04-12 17:42 ` [RFC PATCH 4/7] mm: Implement pt_range_walk Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 5/7] mm: Make /proc/pid/smaps use the new generic pagewalk API Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 6/7] mm: Make /proc/pid/numa_maps " Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 7/7] mm: Make /proc/pid/pagemap " Oscar Salvador
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260412174244.133715-4-osalvador@suse.de \
--to=osalvador@suse.de \
--cc=akpm@linux-foundation.org \
--cc=david@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=mhocko@suse.com \
--cc=muchun.song@linux.dev \
--cc=vbabka@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox