[PATCH mm-unstable v14 08/16] khugepaged: generalize collapse_huge_page for mTHP collapse

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Nico Pache <npache@redhat.com>
To: linux-mm@kvack.org, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org
Cc: npache@redhat.com, akpm@linux-foundation.org, david@kernel.org,
	lorenzo.stoakes@oracle.com, ziy@nvidia.com,
	baolin.wang@linux.alibaba.com, Liam.Howlett@oracle.com,
	ryan.roberts@arm.com, dev.jain@arm.com, baohua@kernel.org,
	lance.yang@linux.dev, vbabka@suse.cz, rppt@kernel.org,
	surenb@google.com, mhocko@suse.com, corbet@lwn.net,
	rostedt@goodmis.org, mhiramat@kernel.org,
	mathieu.desnoyers@efficios.com, matthew.brost@intel.com,
	joshua.hahnjy@gmail.com, rakie.kim@sk.com, byungchul@sk.com,
	gourry@gourry.net, ying.huang@linux.alibaba.com,
	apopple@nvidia.com, jannh@google.com, pfalcato@suse.de,
	jackmanb@google.com, hannes@cmpxchg.org, willy@infradead.org,
	peterx@redhat.com, wangkefeng.wang@huawei.com,
	usamaarif642@gmail.com, sunnanyong@huawei.com,
	vishal.moola@gmail.com, thomas.hellstrom@linux.intel.com,
	yang@os.amperecomputing.com, kas@kernel.org, aarcange@redhat.com,
	raquini@redhat.com, anshuman.khandual@arm.com,
	catalin.marinas@arm.com, tiwai@suse.de, will@kernel.org,
	dave.hansen@linux.intel.com, jack@suse.cz, cl@gentwo.org,
	jglisse@google.com, zokeefe@google.com, rientjes@google.com,
	rdunlap@infradead.org, hughd@google.com,
	richard.weiyang@gmail.com
Subject: [PATCH mm-unstable v14 08/16] khugepaged: generalize collapse_huge_page for mTHP collapse
Date: Thu, 22 Jan 2026 12:28:33 -0700	[thread overview]
Message-ID: <20260122192841.128719-9-npache@redhat.com> (raw)
In-Reply-To: <20260122192841.128719-1-npache@redhat.com>

Pass an order and offset to collapse_huge_page to support collapsing anon
memory to arbitrary orders within a PMD. order indicates what mTHP size we
are attempting to collapse to, and offset indicates were in the PMD to
start the collapse attempt.

For non-PMD collapse we must leave the anon VMA write locked until after
we collapse the mTHP-- in the PMD case all the pages are isolated, but in
the mTHP case this is not true, and we must keep the lock to prevent
changes to the VMA from occurring.

Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 mm/khugepaged.c | 111 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 71 insertions(+), 40 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 9b7e05827749..76cb17243793 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1151,44 +1151,54 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
 	return SCAN_SUCCEED;
 }
 
-static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
-		int referenced, int unmapped, struct collapse_control *cc)
+static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
+		int referenced, int unmapped, struct collapse_control *cc,
+		bool *mmap_locked, unsigned int order)
 {
 	LIST_HEAD(compound_pagelist);
 	pmd_t *pmd, _pmd;
-	pte_t *pte;
+	pte_t *pte = NULL;
 	pgtable_t pgtable;
 	struct folio *folio;
 	spinlock_t *pmd_ptl, *pte_ptl;
 	enum scan_result result = SCAN_FAIL;
 	struct vm_area_struct *vma;
 	struct mmu_notifier_range range;
+	bool anon_vma_locked = false;
+	const unsigned long nr_pages = 1UL << order;
+	const unsigned long pmd_address = start_addr & HPAGE_PMD_MASK;
 
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	VM_WARN_ON_ONCE(pmd_address & ~HPAGE_PMD_MASK);
 
 	/*
 	 * Before allocating the hugepage, release the mmap_lock read lock.
 	 * The allocation can take potentially a long time if it involves
 	 * sync compaction, and we do not need to hold the mmap_lock during
 	 * that. We will recheck the vma after taking it again in write mode.
+	 * If collapsing mTHPs we may have already released the read_lock.
 	 */
-	mmap_read_unlock(mm);
+	if (*mmap_locked) {
+		mmap_read_unlock(mm);
+		*mmap_locked = false;
+	}
 
-	result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
+	result = alloc_charge_folio(&folio, mm, cc, order);
 	if (result != SCAN_SUCCEED)
 		goto out_nolock;
 
 	mmap_read_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
-					 HPAGE_PMD_ORDER);
+	*mmap_locked = true;
+	result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
 	if (result != SCAN_SUCCEED) {
 		mmap_read_unlock(mm);
+		*mmap_locked = false;
 		goto out_nolock;
 	}
 
-	result = find_pmd_or_thp_or_none(mm, address, &pmd);
+	result = find_pmd_or_thp_or_none(mm, pmd_address, &pmd);
 	if (result != SCAN_SUCCEED) {
 		mmap_read_unlock(mm);
+		*mmap_locked = false;
 		goto out_nolock;
 	}
 
@@ -1198,13 +1208,16 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 		 * released when it fails. So we jump out_nolock directly in
 		 * that case.  Continuing to collapse causes inconsistency.
 		 */
-		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
-						     referenced, HPAGE_PMD_ORDER);
-		if (result != SCAN_SUCCEED)
+		result = __collapse_huge_page_swapin(mm, vma, start_addr, pmd,
+						     referenced, order);
+		if (result != SCAN_SUCCEED) {
+			*mmap_locked = false;
 			goto out_nolock;
+		}
 	}
 
 	mmap_read_unlock(mm);
+	*mmap_locked = false;
 	/*
 	 * Prevent all access to pagetables with the exception of
 	 * gup_fast later handled by the ptep_clear_flush and the VM
@@ -1214,20 +1227,20 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 	 * mmap_lock.
 	 */
 	mmap_write_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
-					 HPAGE_PMD_ORDER);
+	result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
 	if (result != SCAN_SUCCEED)
 		goto out_up_write;
 	/* check if the pmd is still valid */
 	vma_start_write(vma);
-	result = check_pmd_still_valid(mm, address, pmd);
+	result = check_pmd_still_valid(mm, pmd_address, pmd);
 	if (result != SCAN_SUCCEED)
 		goto out_up_write;
 
 	anon_vma_lock_write(vma->anon_vma);
+	anon_vma_locked = true;
 
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
-				address + HPAGE_PMD_SIZE);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start_addr,
+				start_addr + (PAGE_SIZE << order));
 	mmu_notifier_invalidate_range_start(&range);
 
 	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
@@ -1239,24 +1252,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 	 * Parallel GUP-fast is fine since GUP-fast will back off when
 	 * it detects PMD is changed.
 	 */
-	_pmd = pmdp_collapse_flush(vma, address, pmd);
+	_pmd = pmdp_collapse_flush(vma, pmd_address, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(&range);
 	tlb_remove_table_sync_one();
 
-	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+	pte = pte_offset_map_lock(mm, &_pmd, start_addr, &pte_ptl);
 	if (pte) {
-		result = __collapse_huge_page_isolate(vma, address, pte, cc,
-						      HPAGE_PMD_ORDER,
-						      &compound_pagelist);
+		result = __collapse_huge_page_isolate(vma, start_addr, pte, cc,
+						      order, &compound_pagelist);
 		spin_unlock(pte_ptl);
 	} else {
 		result = SCAN_NO_PTE_TABLE;
 	}
 
 	if (unlikely(result != SCAN_SUCCEED)) {
-		if (pte)
-			pte_unmap(pte);
 		spin_lock(pmd_ptl);
 		BUG_ON(!pmd_none(*pmd));
 		/*
@@ -1266,21 +1276,21 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 		 */
 		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
 		spin_unlock(pmd_ptl);
-		anon_vma_unlock_write(vma->anon_vma);
 		goto out_up_write;
 	}
 
 	/*
-	 * All pages are isolated and locked so anon_vma rmap
-	 * can't run anymore.
+	 * For PMD collapse all pages are isolated and locked so anon_vma
+	 * rmap can't run anymore. For mTHP collapse we must hold the lock
 	 */
-	anon_vma_unlock_write(vma->anon_vma);
+	if (is_pmd_order(order)) {
+		anon_vma_unlock_write(vma->anon_vma);
+		anon_vma_locked = false;
+	}
 
 	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
-					   vma, address, pte_ptl,
-					   HPAGE_PMD_ORDER,
-					   &compound_pagelist);
-	pte_unmap(pte);
+					   vma, start_addr, pte_ptl,
+					   order, &compound_pagelist);
 	if (unlikely(result != SCAN_SUCCEED))
 		goto out_up_write;
 
@@ -1290,20 +1300,42 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
 	 * write.
 	 */
 	__folio_mark_uptodate(folio);
-	pgtable = pmd_pgtable(_pmd);
+	if (is_pmd_order(order)) { /* PMD collapse */
+		pgtable = pmd_pgtable(_pmd);
 
-	spin_lock(pmd_ptl);
-	BUG_ON(!pmd_none(*pmd));
-	pgtable_trans_huge_deposit(mm, pmd, pgtable);
-	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
+		spin_lock(pmd_ptl);
+		WARN_ON_ONCE(!pmd_none(*pmd));
+		pgtable_trans_huge_deposit(mm, pmd, pgtable);
+		map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_address);
+	} else { /* mTHP collapse */
+		pte_t mthp_pte = mk_pte(folio_page(folio, 0), vma->vm_page_prot);
+
+		mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma);
+		spin_lock(pmd_ptl);
+		WARN_ON_ONCE(!pmd_none(*pmd));
+		folio_ref_add(folio, nr_pages - 1);
+		folio_add_new_anon_rmap(folio, vma, start_addr, RMAP_EXCLUSIVE);
+		folio_add_lru_vma(folio, vma);
+		set_ptes(vma->vm_mm, start_addr, pte, mthp_pte, nr_pages);
+		update_mmu_cache_range(NULL, vma, start_addr, pte, nr_pages);
+
+		smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
+		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+	}
 	spin_unlock(pmd_ptl);
 
 	folio = NULL;
 
 	result = SCAN_SUCCEED;
 out_up_write:
+	if (anon_vma_locked)
+		anon_vma_unlock_write(vma->anon_vma);
+	if (pte)
+		pte_unmap(pte);
 	mmap_write_unlock(mm);
+	*mmap_locked = false;
 out_nolock:
+	WARN_ON_ONCE(*mmap_locked);
 	if (folio)
 		folio_put(folio);
 	trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
@@ -1471,9 +1503,8 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
 	pte_unmap_unlock(pte, ptl);
 	if (result == SCAN_SUCCEED) {
 		result = collapse_huge_page(mm, start_addr, referenced,
-					    unmapped, cc);
-		/* collapse_huge_page will return with the mmap_lock released */
-		*mmap_locked = false;
+					    unmapped, cc, mmap_locked,
+					    HPAGE_PMD_ORDER);
 	}
 out:
 	trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
-- 
2.52.0

next prev parent reply	other threads:[~2026-01-22 19:31 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-22 19:28 [PATCH mm-unstable v14 00/16] khugepaged: mTHP support Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 01/16] mm: introduce is_pmd_order helper Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 02/16] khugepaged: rename hpage_collapse_* to collapse_* Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 03/16] introduce collapse_single_pmd to unify khugepaged and madvise_collapse Nico Pache
2026-01-23  5:07   ` Lance Yang
2026-01-23  9:31     ` Baolin Wang
2026-01-26 12:25       ` Lorenzo Stoakes
2026-01-23 23:26     ` Nico Pache
2026-01-24  4:41       ` Lance Yang
2026-01-26 12:25       ` Lorenzo Stoakes
2026-01-26 11:40     ` Lorenzo Stoakes
2026-01-26 15:09       ` Andrew Morton
2026-01-26 15:18         ` Lorenzo Stoakes
2026-01-28 16:38   ` Nico Pache
2026-02-03 11:43     ` Lorenzo Stoakes
2026-02-03 11:35   ` Lorenzo Stoakes
2026-01-22 19:28 ` [PATCH mm-unstable v14 04/16] khugepaged: generalize hugepage_vma_revalidate for mTHP support Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 05/16] khugepaged: generalize alloc_charge_folio() Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 06/16] khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 07/16] khugepaged: introduce collapse_max_ptes_none helper function Nico Pache
2026-02-03 12:08   ` Lorenzo Stoakes
2026-02-04 21:39     ` Nico Pache
2026-02-06 17:44     ` Nico Pache
2026-02-16 15:16       ` Lorenzo Stoakes
2026-01-22 19:28 ` Nico Pache [this message]
2026-02-03 13:07   ` [PATCH mm-unstable v14 08/16] khugepaged: generalize collapse_huge_page for mTHP collapse Lorenzo Stoakes
2026-02-04 22:00     ` Nico Pache
2026-02-16 15:20       ` Lorenzo Stoakes
2026-01-22 19:28 ` [PATCH mm-unstable v14 09/16] khugepaged: skip collapsing mTHP to smaller orders Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 10/16] khugepaged: add per-order mTHP collapse failure statistics Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 11/16] khugepaged: improve tracepoints for mTHP orders Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 12/16] khugepaged: introduce collapse_allowable_orders helper function Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 13/16] khugepaged: Introduce mTHP collapse support Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 14/16] khugepaged: avoid unnecessary mTHP collapse attempts Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 15/16] khugepaged: run khugepaged for all orders Nico Pache
2026-01-22 19:28 ` [PATCH mm-unstable v14 16/16] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
2026-01-26 11:21 ` [PATCH mm-unstable v14 00/16] khugepaged: mTHP support Lorenzo Stoakes
2026-01-26 11:32 ` Lorenzo Stoakes
2026-02-04 21:35   ` Nico Pache

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260122192841.128719-9-npache@redhat.com \
    --to=npache@redhat.com \
    --cc=Liam.Howlett@oracle.com \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=anshuman.khandual@arm.com \
    --cc=apopple@nvidia.com \
    --cc=baohua@kernel.org \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=byungchul@sk.com \
    --cc=catalin.marinas@arm.com \
    --cc=cl@gentwo.org \
    --cc=corbet@lwn.net \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@kernel.org \
    --cc=dev.jain@arm.com \
    --cc=gourry@gourry.net \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=jack@suse.cz \
    --cc=jackmanb@google.com \
    --cc=jannh@google.com \
    --cc=jglisse@google.com \
    --cc=joshua.hahnjy@gmail.com \
    --cc=kas@kernel.org \
    --cc=lance.yang@linux.dev \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-trace-kernel@vger.kernel.org \
    --cc=lorenzo.stoakes@oracle.com \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=matthew.brost@intel.com \
    --cc=mhiramat@kernel.org \
    --cc=mhocko@suse.com \
    --cc=peterx@redhat.com \
    --cc=pfalcato@suse.de \
    --cc=rakie.kim@sk.com \
    --cc=raquini@redhat.com \
    --cc=rdunlap@infradead.org \
    --cc=richard.weiyang@gmail.com \
    --cc=rientjes@google.com \
    --cc=rostedt@goodmis.org \
    --cc=rppt@kernel.org \
    --cc=ryan.roberts@arm.com \
    --cc=sunnanyong@huawei.com \
    --cc=surenb@google.com \
    --cc=thomas.hellstrom@linux.intel.com \
    --cc=tiwai@suse.de \
    --cc=usamaarif642@gmail.com \
    --cc=vbabka@suse.cz \
    --cc=vishal.moola@gmail.com \
    --cc=wangkefeng.wang@huawei.com \
    --cc=will@kernel.org \
    --cc=willy@infradead.org \
    --cc=yang@os.amperecomputing.com \
    --cc=ying.huang@linux.alibaba.com \
    --cc=ziy@nvidia.com \
    --cc=zokeefe@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox