From: Oscar Salvador <osalvador@suse.de>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@kernel.org>,
Michal Hocko <mhocko@suse.com>,
Vlastimil Babka <vbabka@kernel.org>,
Muchun Song <muchun.song@linux.dev>,
Lorenzo Stoakes <lorenzo.stoakes@oracle.com>,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
Oscar Salvador <osalvador@suse.de>
Subject: [RFC PATCH 3/7] mm: Implement folio_pmd_batch
Date: Sun, 12 Apr 2026 19:42:40 +0200 [thread overview]
Message-ID: <20260412174244.133715-4-osalvador@suse.de> (raw)
In-Reply-To: <20260412174244.133715-1-osalvador@suse.de>
HugeTLB pages can be mapped with contiguous PMDs, so we need a way to
batch them as we do for contiguous PTEs.
Implement folio_pmd_batch in order to do that.
Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
arch/arm64/include/asm/pgtable.h | 19 ++++++++
include/linux/pgtable.h | 30 +++++++++++++
mm/internal.h | 75 +++++++++++++++++++++++++++++++-
3 files changed, 123 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index e42ad56a86d4..5b5490505b94 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -170,6 +170,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
(__boundary - 1 < (end) - 1) ? __boundary : (end); \
})
+#define pmd_valid_cont(pmd) (pmd_valid(pmd) && pmd_cont(pmd))
+
#define pte_hw_dirty(pte) (pte_write(pte) && !pte_rdonly(pte))
#define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY))
#define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte))
@@ -670,6 +672,12 @@ static inline pgprot_t pmd_pgprot(pmd_t pmd)
return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd));
}
+#define pmd_advance_pfn pmd_advance_pfn
+static inline pmd_t pmd_advance_pfn(pmd_t pmd, unsigned long nr)
+{
+ return pfn_pmd(pmd_pfn(pmd) + nr, pmd_pgprot(pmd));
+}
+
#define pud_pgprot pud_pgprot
static inline pgprot_t pud_pgprot(pud_t pud)
{
@@ -1645,6 +1653,17 @@ extern void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long ad
pte_t *ptep, pte_t old_pte, pte_t pte,
unsigned int nr);
+#ifdef CONFIG_HUGETLB_PAGE
+#define pmd_batch_hint pmd_batch_hint
+static inline unsigned int pmd_batch_hint(pmd_t *pmdp, pmd_t pmd)
+{
+ if (!pmd_valid_cont(pmd))
+ return 1;
+
+ return CONT_PMDS - (((unsigned long)pmdp >> 3) & (CONT_PMDS - 1));
+}
+#endif
+
#ifdef CONFIG_ARM64_CONTPTE
/*
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 1abd9c52a4f2..6f01d5ed73f6 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -358,6 +358,36 @@ static inline void lazy_mmu_mode_pause(void) {}
static inline void lazy_mmu_mode_resume(void) {}
#endif
+#ifndef pmd_batch_hint
+/**
+ * pmd_batch_hint - Number of PMD entries that can be added to batch without scanning.
+ * @pmdp: Page table pointer for the entry.
+ * @pmd: Page table entry.
+ *
+ * Some architectures know that a set of contiguous pmds all map the same
+ * contiguous memory with the same permissions. In this case, it can provide a
+ * hint to aid pmd batching without the core code needing to scan every pmd.
+ *
+ * An architecture implementation may ignore the PMD accessed state. Further,
+ * the dirty state must apply atomically to all the PMDs described by the hint.
+ *
+ * May be overridden by the architecture, else pmd_batch_hint is always 1.
+ */
+static inline unsigned int pmd_batch_hint(pmd_t *pmdp, pmd_t pmd)
+{
+ return 1;
+}
+#endif
+
+#ifndef pmd_advance_pfn
+static inline pmd_t pmd_advance_pfn(pmd_t pmd, unsigned long nr)
+{
+ return __pmd(pmd_val(pmd) + (nr << PFN_PTE_SHIFT));
+}
+#endif
+
+#define pmd_next_pfn(pmd) pmd_advance_pfn(pmd, 1)
+
#ifndef pte_batch_hint
/**
* pte_batch_hint - Number of pages that can be added to batch without scanning.
diff --git a/mm/internal.h b/mm/internal.h
index cb0af847d7d9..8fa0681ff2af 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -269,7 +269,7 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma)
return __anon_vma_prepare(vma);
}
-/* Flags for folio_pte_batch(). */
+/* Flags for folio_{pmd,pte}_batch(). */
typedef int __bitwise fpb_t;
/* Compare PTEs respecting the dirty bit. */
@@ -293,6 +293,79 @@ typedef int __bitwise fpb_t;
*/
#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(4))
+static inline pmd_t __pmd_batch_clear_ignored(pmd_t pmd, fpb_t flags)
+{
+ if (!(flags & FPB_RESPECT_DIRTY))
+ pmd = pmd_mkclean(pmd);
+ if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY)))
+ pmd = pmd_clear_soft_dirty(pmd);
+ if (likely(!(flags & FPB_RESPECT_WRITE)))
+ pmd = pmd_wrprotect(pmd);
+ return pmd_mkold(pmd);
+}
+
+/**
+ * folio_pmd_batch - detect a PMD batch for a large folio.
+ *
+ * The only user of this is hugetlb, for contiguous PMDs.
+ */
+static inline int folio_pmd_batch(struct folio *folio, pmd_t *pmdp, pmd_t *pmdentp,
+ int max_nr, fpb_t flags, bool *any_writable,
+ bool *any_young, bool *any_dirty)
+{
+ pmd_t expected_pmd, pmd = *pmdentp;
+ bool writable, young, dirty;
+ int nr, cur_nr;
+
+ if (any_writable)
+ *any_writable = !!pmd_write(*pmdentp);
+ if (any_young)
+ *any_young = !!pmd_young(*pmdentp);
+ if (any_dirty)
+ *any_dirty = !!pmd_dirty(*pmdentp);
+
+ VM_WARN_ON_FOLIO(!pmd_present(pmd), folio);
+ VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
+ VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pmd_pfn(pmd))) != folio, folio);
+
+ /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
+ max_nr = min_t(unsigned long, max_nr,
+ (folio_pfn(folio) + folio_nr_pages(folio) -
+ pmd_pfn(pmd)) >> (PMD_SHIFT - PAGE_SHIFT));
+
+ nr = pmd_batch_hint(pmdp, pmd);
+ expected_pmd = __pmd_batch_clear_ignored(pmd_advance_pfn(pmd, nr << (PMD_SHIFT - PAGE_SHIFT)), flags);
+ pmdp = pmdp + nr;
+
+ while (nr < max_nr) {
+ pmd = pmdp_get(pmdp);
+ if (any_writable)
+ writable = !!pmd_write(pmd);
+ if (any_young)
+ young = !!pmd_young(pmd);
+ if (any_dirty)
+ dirty = !!pmd_dirty(pmd);
+ pmd = __pmd_batch_clear_ignored(pmd, flags);
+
+ if (!pmd_same(pmd, expected_pmd))
+ break;
+
+ if (any_writable)
+ *any_writable |= writable;
+ if (any_young)
+ *any_young |= young;
+ if (any_dirty)
+ *any_dirty |= dirty;
+
+ cur_nr = pmd_batch_hint(pmdp, pmd);
+ expected_pmd = pmd_advance_pfn(expected_pmd, cur_nr << (PMD_SHIFT - PAGE_SHIFT));
+ pmdp += cur_nr;
+ nr += cur_nr;
+ }
+
+ return min(nr, max_nr);
+}
+
static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
{
if (!(flags & FPB_RESPECT_DIRTY))
--
2.35.3
next prev parent reply other threads:[~2026-04-12 17:43 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-12 17:42 [RFC PATCH 0/7] Implement a new generic pagewalk API Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 1/7] mm: Add softleaf_from_pud Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 2/7] mm: Add {pmd,pud}_huge_lock helper Oscar Salvador
2026-04-12 17:42 ` Oscar Salvador [this message]
2026-04-12 17:42 ` [RFC PATCH 4/7] mm: Implement pt_range_walk Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 5/7] mm: Make /proc/pid/smaps use the new generic pagewalk API Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 6/7] mm: Make /proc/pid/numa_maps " Oscar Salvador
2026-04-12 17:42 ` [RFC PATCH 7/7] mm: Make /proc/pid/pagemap " Oscar Salvador
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260412174244.133715-4-osalvador@suse.de \
--to=osalvador@suse.de \
--cc=akpm@linux-foundation.org \
--cc=david@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=mhocko@suse.com \
--cc=muchun.song@linux.dev \
--cc=vbabka@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox