From: Usama Arif <usamaarif642@gmail.com>
To: ziy@nvidia.com, Andrew Morton <akpm@linux-foundation.org>,
	David Hildenbrand <david@kernel.org>,
	lorenzo.stoakes@oracle.com, linux-mm@kvack.org
Cc: hannes@cmpxchg.org, riel@surriel.com, shakeel.butt@linux.dev,
	kas@kernel.org, baohua@kernel.org, dev.jain@arm.com,
	baolin.wang@linux.alibaba.com, npache@redhat.com,
	Liam.Howlett@oracle.com, ryan.roberts@arm.com, vbabka@suse.cz,
	lance.yang@linux.dev, linux-kernel@vger.kernel.org,
	kernel-team@meta.com, Usama Arif <usamaarif642@gmail.com>
Subject: [RFC 04/12] mm: thp: implement PUD THP split to PTE level
Date: Sun,  1 Feb 2026 16:50:21 -0800
Message-ID: <20260202005451.774496-5-usamaarif642@gmail.com>
In-Reply-To: <20260202005451.774496-1-usamaarif642@gmail.com>

Implement the split operation that converts a PUD THP mapping into
individual PTE mappings.

A PUD THP maps 1GB of memory with a single page table entry. When the
mapping needs to be broken - for COW, partial unmap, permission changes,
or reclaim - it must be split into smaller mappings. Unlike a PMD THP,
which splits into 512 PTEs at a single level, a PUD THP requires a
two-level split: the single PUD entry becomes 512 PMD entries, each
pointing to a PTE table of 512 PTEs, for a total of 262144 (512 * 512)
PTEs mapping the original 1GB region.
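
For orientation, the arithmetic (assuming 4KB base pages as on x86-64;
these are the values the generic THP constants take there):

	/*
	 * HPAGE_PUD_SIZE = 1GB  ->  HPAGE_PUD_NR = 262144 base pages
	 * HPAGE_PMD_SIZE = 2MB  ->  HPAGE_PMD_NR = 512 base pages
	 *
	 * PMD entries needed: HPAGE_PUD_NR / HPAGE_PMD_NR   = 512
	 * PTEs needed:        512 PMD slots * 512 PTEs each = 262144
	 */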

The split uses page tables that were pre-deposited when the PUD THP was
first allocated. This guarantees the split cannot fail for lack of page
tables, which is critical since splits often happen under memory
pressure during reclaim. (The only allocation on the split path is a
small GFP_ATOMIC bookkeeping array that holds the PTE table pointers.)
The deposited PMD table is installed in the PUD entry, and each PMD
slot receives one of the 512 deposited PTE tables.
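
A minimal sketch of that fault-time deposit, assuming the helper names
this patch withdraws from (pud_deposit_pte(),
pgtable_trans_huge_pud_deposit() and NR_PTE_TABLES_FOR_PUD come from
earlier patches in the series); the error-handling shape here is
illustrative, not the series' actual code:

	/* In the PUD fault path: pre-deposit one PMD table and 512
	 * PTE tables so the eventual split never has to allocate. */
	pmd_t *pmd_table = pmd_alloc_one(mm, haddr);
	pgtable_t pte_table;
	int i;

	if (!pmd_table)
		return VM_FAULT_OOM;
	for (i = 0; i < NR_PTE_TABLES_FOR_PUD; i++) {
		pte_table = pte_alloc_one(mm);
		if (!pte_table)
			goto release;	/* undo the deposits made so far */
		pud_deposit_pte(pmd_table, pte_table);
	}
	pgtable_trans_huge_pud_deposit(mm, pud, pmd_table);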

Each PTE is populated to map one 4KB page of the original 1GB folio.
The access bits of the original PUD entry (dirty, accessed, writable,
soft-dirty) are propagated to each PTE so that no state is lost. The
rmap is updated to replace the single PUD-level mapping entry with
262144 PTE-level mapping entries, and the folio reference count is
raised so that each PTE effectively holds one reference.
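
Condensed from the diff below, the refcount/rmap transition is:

	/* One reference per PTE; the folio already holds one. */
	folio_ref_add(folio, HPAGE_PUD_NR - 1);
	/* Add all 262144 PTE-level rmap entries in one call... */
	folio_add_anon_rmap_ptes(folio, page, HPAGE_PUD_NR,
				 vma, haddr, rmap_flags);
	/* ...then drop the single PUD-level entry. */
	folio_remove_rmap_pud(folio, page, vma);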

The split goes directly to PTE level rather than stopping at PMD
level. The kernel's rmap infrastructure assumes a PMD-level mapping
covers a PMD-sized folio, so mapping a PUD-sized folio with 512 PMD
entries would trip the rmap sanity checks and break the mapcount
accounting: the folio would appear to have 512 PMD-sized mappings
rather than one PUD-sized one. Going straight to PTE level avoids the
problem entirely.
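
The constraint is visible in __folio_rmap_sanity_checks() in
mm/internal.h (paraphrased here; exact code differs across kernel
versions):

	case RMAP_LEVEL_PMD:
		/* rmap does not support folios larger than one PMD. */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR,
				 folio);
		break;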

Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 mm/huge_memory.c | 181 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 173 insertions(+), 8 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7613caf1e7c30..39b8212b5abd4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3129,12 +3129,82 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	return 1;
 }
 
+/*
+ * Structure to hold page tables for PUD split.
+ * Tables are withdrawn from the pre-deposit made at fault time.
+ */
+struct pud_split_ptables {
+	pmd_t *pmd_table;
+	pgtable_t *pte_tables;  /* Array of 512 PTE tables */
+	int nr_pte_tables;      /* Number of PTE tables in array */
+};
+
+/*
+ * Withdraw pre-deposited page tables from PUD THP.
+ * Tables are always deposited at fault time in do_huge_pud_anonymous_page().
+ * Returns true on success, false if no tables deposited or array alloc fails.
+ */
+static bool withdraw_pud_split_ptables(struct mm_struct *mm, pud_t *pud,
+				       struct pud_split_ptables *tables)
+{
+	pmd_t *pmd_table;
+	pgtable_t pte_table;
+	int i;
+
+	tables->pmd_table = NULL;
+	tables->pte_tables = NULL;
+	tables->nr_pte_tables = 0;
+
+	/* Try to withdraw the deposited PMD table */
+	pmd_table = pgtable_trans_huge_pud_withdraw(mm, pud);
+	if (!pmd_table)
+		return false;
+
+	tables->pmd_table = pmd_table;
+
+	/* Bookkeeping array for PTE table pointers (GFP_ATOMIC: PUD lock held) */
+	tables->pte_tables = kmalloc_array(NR_PTE_TABLES_FOR_PUD,
+					   sizeof(pgtable_t), GFP_ATOMIC);
+	if (!tables->pte_tables)
+		goto fail;
+
+	/* Withdraw PTE tables from the PMD table */
+	for (i = 0; i < NR_PTE_TABLES_FOR_PUD; i++) {
+		pte_table = pud_withdraw_pte(pmd_table);
+		if (!pte_table)
+			goto fail;
+		tables->pte_tables[i] = pte_table;
+		tables->nr_pte_tables++;
+	}
+
+	return true;
+
+fail:
+	/* Put back any tables we withdrew */
+	for (i = 0; i < tables->nr_pte_tables; i++)
+		pud_deposit_pte(pmd_table, tables->pte_tables[i]);
+	kfree(tables->pte_tables);
+	pgtable_trans_huge_pud_deposit(mm, pud, pmd_table);
+	tables->pmd_table = NULL;
+	tables->pte_tables = NULL;
+	tables->nr_pte_tables = 0;
+	return false;
+}
+
 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
 		unsigned long haddr)
 {
+	bool dirty = false, young = false, write = false;
+	struct pud_split_ptables tables = { 0 };
+	struct mm_struct *mm = vma->vm_mm;
+	rmap_t rmap_flags = RMAP_NONE;
+	bool anon_exclusive = false;
+	bool soft_dirty = false;
 	struct folio *folio;
+	unsigned long addr;
 	struct page *page;
 	pud_t old_pud;
+	int i, j;
 
 	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
@@ -3145,20 +3215,115 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
 
 	old_pud = pudp_huge_clear_flush(vma, haddr, pud);
 
-	if (!vma_is_dax(vma))
+	if (!vma_is_anonymous(vma)) {
+		if (!vma_is_dax(vma))
+			return;
+
+		page = pud_page(old_pud);
+		folio = page_folio(page);
+
+		if (!folio_test_dirty(folio) && pud_dirty(old_pud))
+			folio_mark_dirty(folio);
+		if (!folio_test_referenced(folio) && pud_young(old_pud))
+			folio_set_referenced(folio);
+		folio_remove_rmap_pud(folio, page, vma);
+		folio_put(folio);
+		add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PUD_NR);
 		return;
+	}
+
+	/*
+	 * Anonymous PUD split: split directly to PTE level.
+	 *
+	 * We cannot create PMD huge entries pointing to portions of a larger
+	 * folio because the kernel's rmap infrastructure assumes PMD mappings
+	 * are for PMD-sized folios only (see __folio_rmap_sanity_checks).
+	 * Instead, we create a PMD table with 512 entries, each pointing to
+	 * a PTE table with 512 PTEs.
+	 *
+	 * Tables are always deposited at fault time in do_huge_pud_anonymous_page().
+	 */
+	if (!withdraw_pud_split_ptables(mm, pud, &tables)) {
+		WARN_ONCE(1, "PUD split: deposited page tables missing\n");
+		return;
+	}
 
 	page = pud_page(old_pud);
 	folio = page_folio(page);
 
-	if (!folio_test_dirty(folio) && pud_dirty(old_pud))
-		folio_mark_dirty(folio);
-	if (!folio_test_referenced(folio) && pud_young(old_pud))
-		folio_set_referenced(folio);
+	dirty = pud_dirty(old_pud);
+	write = pud_write(old_pud);
+	young = pud_young(old_pud);
+	soft_dirty = pud_soft_dirty(old_pud);
+	anon_exclusive = PageAnonExclusive(page);
+
+	if (dirty)
+		folio_set_dirty(folio);
+
+	/*
+	 * Add references for each page that will have its own PTE.
+	 * Original folio has 1 reference. After split, each of 262144 PTEs
+	 * will eventually be unmapped, each calling folio_put().
+	 */
+	folio_ref_add(folio, HPAGE_PUD_NR - 1);
+
+	/*
+	 * Add PTE-level rmap for all pages at once.
+	 */
+	if (anon_exclusive)
+		rmap_flags |= RMAP_EXCLUSIVE;
+	folio_add_anon_rmap_ptes(folio, page, HPAGE_PUD_NR,
+				 vma, haddr, rmap_flags);
+
+	/* Remove PUD-level rmap */
 	folio_remove_rmap_pud(folio, page, vma);
-	folio_put(folio);
-	add_mm_counter(vma->vm_mm, mm_counter_file(folio),
-		-HPAGE_PUD_NR);
+
+	/*
+	 * Create 512 PMD entries, each pointing to a PTE table.
+	 * Each PTE table has 512 PTEs pointing to individual pages.
+	 */
+	addr = haddr;
+	for (i = 0; i < (HPAGE_PUD_NR / HPAGE_PMD_NR); i++) {
+		pmd_t *pmd_entry = tables.pmd_table + i;
+		pgtable_t pte_table = tables.pte_tables[i];
+		pte_t *pte;
+		struct page *subpage_base = page + i * HPAGE_PMD_NR;
+
+		/* Populate the PTE table */
+		pte = page_address(pte_table);
+		for (j = 0; j < HPAGE_PMD_NR; j++) {
+			struct page *subpage = subpage_base + j;
+			pte_t entry;
+
+			entry = mk_pte(subpage, vma->vm_page_prot);
+			if (write)
+				entry = pte_mkwrite(entry, vma);
+			if (dirty)
+				entry = pte_mkdirty(entry);
+			if (young)
+				entry = pte_mkyoung(entry);
+			if (soft_dirty)
+				entry = pte_mksoft_dirty(entry);
+
+			set_pte_at(mm, addr + j * PAGE_SIZE, pte + j, entry);
+		}
+
+		/* Set PMD to point to PTE table */
+		pmd_populate(mm, pmd_entry, pte_table);
+		addr += HPAGE_PMD_SIZE;
+	}
+
+	/*
+	 * Memory barrier ensures all PMD entries are visible before
+	 * installing the PMD table in the PUD.
+	 */
+	smp_wmb();
+
+	/* Install the PMD table in the PUD */
+	pud_populate(mm, pud, tables.pmd_table);
+
+	/* Free the temporary array holding PTE table pointers */
+	kfree(tables.pte_tables);
 }
 
 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
-- 
2.47.3


