* [PATCH 0/2] mm/huge_memory: optimize migration when huge PMD needs split
@ 2026-04-15 1:08 Wei Yang
2026-04-15 1:08 ` [PATCH 1/2] mm/huge_memory: return true if split_huge_pmd_locked() split PMD to migration entry Wei Yang
2026-04-15 1:08 ` [PATCH 2/2] mm/selftests: add split_shared_pmd() Wei Yang
0 siblings, 2 replies; 3+ messages in thread
From: Wei Yang @ 2026-04-15 1:08 UTC (permalink / raw)
To: akpm, david, ljs, ziy, baolin.wang, Liam.Howlett, npache,
ryan.roberts, dev.jain, baohua, lance.yang, riel, vbabka, harry,
jannh, rppt, surenb, mhocko, shuah
Cc: linux-mm, Wei Yang, Gavin Guo
This is a follow-up optimization to commit 939080834fef ("mm/huge_memory:
fix early failure try_to_migrate() when split huge pmd for shared THP").
When split_huge_pmd_locked() successfully splits the PMD entry with @freeze
= true, every PTE entry is properly set to a migration entry, so we can
return from try_to_migrate_one() directly.
Currently this is done in a sub-optimal way: the walk is always restarted
and each PTE entry is visited by page_vma_mapped_walk(), which then skips
all of them because the PMD has already been split to migration entries.
Let split_huge_pmd_locked() indicate whether it split the PMD to migration
entries, so that migration is optimized when a huge PMD needs to be split.
Also add a selftest to make sure the bug fixed in commit 939080834fef
("mm/huge_memory: fix early failure try_to_migrate() when split huge pmd
for shared THP") is not reintroduced.
Cc: Gavin Guo <gavinguo@igalia.com>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Lance Yang <lance.yang@linux.dev>
Wei Yang (2):
mm/huge_memory: return true if split_huge_pmd_locked() split PMD to
migration entry
mm/selftests: add split_shared_pmd()
include/linux/huge_mm.h | 9 ++-
mm/huge_memory.c | 21 ++++--
mm/rmap.c | 11 ++-
.../selftests/mm/split_huge_page_test.c | 73 ++++++++++++++++++-
4 files changed, 99 insertions(+), 15 deletions(-)
--
2.34.1
* [PATCH 1/2] mm/huge_memory: return true if split_huge_pmd_locked() split PMD to migration entry
2026-04-15 1:08 [PATCH 0/2] mm/huge_memory: optimize migration when huge PMD needs split Wei Yang
@ 2026-04-15 1:08 ` Wei Yang
2026-04-15 1:08 ` [PATCH 2/2] mm/selftests: add split_shared_pmd() Wei Yang
1 sibling, 0 replies; 3+ messages in thread
From: Wei Yang @ 2026-04-15 1:08 UTC (permalink / raw)
To: akpm, david, ljs, ziy, baolin.wang, Liam.Howlett, npache,
ryan.roberts, dev.jain, baohua, lance.yang, riel, vbabka, harry,
jannh, rppt, surenb, mhocko, shuah
Cc: linux-mm, Wei Yang, Gavin Guo
When @freeze is true, split_huge_pmd_locked() is intended to split the
PMD to migration entries. But if it does not manage to clear
PageAnonExclusive(), it only splits the PMD and leaves the folio mapped
through PTEs.
Let split_huge_pmd_locked() return true to indicate that it did split
the PMD to migration entries. With this knowledge, try_to_migrate_one()
can return directly when that is the case.
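The resulting contract, restated from the hunks below (no extra change on
top of them):
	/*
	 * Returns true only when the PMD was split into per-PTE migration
	 * entries, i.e. @freeze was honoured.  Returns false when there is
	 * nothing to split (e.g. the huge zero PMD or a special mapping),
	 * or when the folio stays mapped through regular PTEs, e.g. because
	 * PageAnonExclusive() could not be cleared.
	 */
	bool split_huge_pmd_locked(struct vm_area_struct *vma,
				   unsigned long address, pmd_t *pmd,
				   bool freeze);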
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Cc: Gavin Guo <gavinguo@igalia.com>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Lance Yang <lance.yang@linux.dev>
---
include/linux/huge_mm.h | 9 ++++++---
mm/huge_memory.c | 21 +++++++++++++--------
mm/rmap.c | 11 ++++++++---
3 files changed, 27 insertions(+), 14 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2949e5acff35..6ae423b8dbc0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -561,7 +561,7 @@ static inline bool thp_migration_supported(void)
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
}
-void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
+bool split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze);
bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct folio *folio);
@@ -658,9 +658,12 @@ static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze) {}
static inline void split_huge_pmd_address(struct vm_area_struct *vma,
unsigned long address, bool freeze) {}
-static inline void split_huge_pmd_locked(struct vm_area_struct *vma,
+static inline bool split_huge_pmd_locked(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
- bool freeze) {}
+ bool freeze)
+{
+ return false;
+}
static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 970e077019b7..ec84bb4a0cc3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3087,7 +3087,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_populate(mm, pmd, pgtable);
}
-static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
+static bool __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long haddr, bool freeze)
{
struct mm_struct *mm = vma->vm_mm;
@@ -3096,7 +3096,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
bool soft_dirty, uffd_wp = false, young = false, write = false;
- bool anon_exclusive = false, dirty = false;
+ bool anon_exclusive = false, dirty = false, ret = false;
unsigned long addr;
pte_t *pte;
int i;
@@ -3118,13 +3118,13 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd);
if (vma_is_special_huge(vma))
- return;
+ return ret;
if (unlikely(pmd_is_migration_entry(old_pmd))) {
const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
folio = softleaf_to_folio(old_entry);
} else if (is_huge_zero_pmd(old_pmd)) {
- return;
+ return ret;
} else {
page = pmd_page(old_pmd);
folio = page_folio(page);
@@ -3136,7 +3136,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
folio_put(folio);
}
add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
- return;
+ return ret;
}
if (is_huge_zero_pmd(*pmd)) {
@@ -3149,7 +3149,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* small page also write protected so it does not seems useful
* to invalidate secondary mmu at this time.
*/
- return __split_huge_zero_page_pmd(vma, haddr, pmd);
+ __split_huge_zero_page_pmd(vma, haddr, pmd);
+ return ret;
}
if (pmd_is_migration_entry(*pmd)) {
@@ -3309,6 +3310,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
VM_WARN_ON(!pte_none(ptep_get(pte + i)));
set_pte_at(mm, addr, pte + i, entry);
}
+ ret = true;
} else if (pmd_is_device_private_entry(old_pmd)) {
pte_t entry;
swp_entry_t swp_entry;
@@ -3366,14 +3368,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
+ return ret;
}
-void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
+bool split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze)
{
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd))
- __split_huge_pmd_locked(vma, pmd, address, freeze);
+ return __split_huge_pmd_locked(vma, pmd, address, freeze);
+ else
+ return false;
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
diff --git a/mm/rmap.c b/mm/rmap.c
index 78b7fb5f367c..91fb495bebbe 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2464,13 +2464,18 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
if (flags & TTU_SPLIT_HUGE_PMD) {
/*
- * split_huge_pmd_locked() might leave the
+ * If split_huge_pmd_locked() did split the PMD
+ * to migration entries, we are done.
+ * Otherwise it may have left the
* folio mapped through PTEs. Retry the walk
* so we can detect this scenario and properly
* abort the walk.
*/
- split_huge_pmd_locked(vma, pvmw.address,
- pvmw.pmd, true);
+ if (split_huge_pmd_locked(vma, pvmw.address,
+ pvmw.pmd, true)) {
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
flags &= ~TTU_SPLIT_HUGE_PMD;
page_vma_mapped_walk_restart(&pvmw);
continue;
--
2.34.1
* [PATCH 2/2] mm/selftests: add split_shared_pmd()
2026-04-15 1:08 [PATCH 0/2] mm/huge_memory: optimize migration when huge PMD needs split Wei Yang
2026-04-15 1:08 ` [PATCH 1/2] mm/huge_memory: return true if split_huge_pmd_locked() split PMD to migration entry Wei Yang
@ 2026-04-15 1:08 ` Wei Yang
1 sibling, 0 replies; 3+ messages in thread
From: Wei Yang @ 2026-04-15 1:08 UTC (permalink / raw)
To: akpm, david, ljs, ziy, baolin.wang, Liam.Howlett, npache,
ryan.roberts, dev.jain, baohua, lance.yang, riel, vbabka, harry,
jannh, rppt, surenb, mhocko, shuah
Cc: linux-mm, Wei Yang, Gavin Guo
Commit 60fbb14396d5 ("mm/huge_memory: adjust try_to_migrate_one() and
split_huge_pmd_locked()") introduced a bug that made try_to_migrate()
fail early by returning false unconditionally after
split_huge_pmd_locked() when the huge PMD is shared by multiple
processes.
This was fixed by commit 939080834fef ("mm/huge_memory: fix early
failure try_to_migrate() when split huge pmd for shared THP").
Add a selftest to make sure this does not regress.
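The new test boils down to the following flow (a simplified outline of the
split_shared_pmd() helper added below; write_debugfs(), PID_FMT and
check_after_split_folio_orders() are the helpers already present in
split_huge_page_test.c, and error handling is omitted):
	char *one_page = memalign(pmd_pagesize, pmd_pagesize);
	size_t i, level = 0;
	madvise(one_page, pmd_pagesize, MADV_HUGEPAGE);
	for (i = 0; i < pmd_pagesize; i++)	/* fault in one PMD-sized THP */
		one_page[i] = (char)i;
	/*
	 * Build a chain of forks so the PMD ends up shared by
	 * pmd_pagesize / pagesize processes: parents fall out of the loop
	 * (and later wait for their child), only the newest child keeps
	 * forking.
	 */
	while (fork() == 0)
		if (++level == pmd_pagesize / pagesize)
			break;
	if (level == pmd_pagesize / pagesize) {
		/* Deepest child: split the shared THP via debugfs ... */
		write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
			      (uint64_t)one_page + pmd_pagesize, 0);
		/*
		 * ... then verify every resulting folio is order 0 with
		 * check_after_split_folio_orders(), as in the full test.
		 */
	}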
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Cc: Gavin Guo <gavinguo@igalia.com>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Lance Yang <lance.yang@linux.dev>
---
.../selftests/mm/split_huge_page_test.c | 73 ++++++++++++++++++-
1 file changed, 72 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 500d07c4938b..9d1de67f9929 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -16,6 +16,7 @@
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/param.h>
+#include <sys/wait.h>
#include <malloc.h>
#include <stdbool.h>
#include <time.h>
@@ -332,6 +333,74 @@ static void split_pmd_zero_pages(void)
free(one_page);
}
+static void split_shared_pmd(void)
+{
+ char *one_page;
+ int nr_pmds = 1;
+ size_t len = nr_pmds * pmd_pagesize;
+ size_t i;
+ pid_t pid;
+ int status;
+ int ret = 0, level = 0;
+
+ one_page = memalign(pmd_pagesize, len);
+ if (!one_page)
+ ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno));
+
+ madvise(one_page, len, MADV_HUGEPAGE);
+
+ for (i = 0; i < len; i++)
+ one_page[i] = (char)i;
+
+ if (!check_huge_anon(one_page, nr_pmds, pmd_pagesize))
+ ksft_exit_fail_msg("No THP is allocated\n");
+
+ for (;;) {
+ pid = fork();
+
+ if (pid < 0) {
+ perror("Error: fork");
+ exit(KSFT_SKIP);
+ }
+
+ if (pid != 0)
+ break;
+
+ /*
+ * The current /sys/kernel/debug/split_huge_pages interface calls
+ * folio_split() for each page in the range. So we add one more
+ * mapping of the PMD per iteration, otherwise the split would still
+ * succeed after 512 = (pmd_pagesize / pagesize) attempts.
+ */
+ if (++level == (pmd_pagesize / pagesize)) {
+ /* split THPs */
+ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
+ (uint64_t)one_page + len, 0);
+
+ memset(expected_orders, 0, sizeof(int) * (pmd_order + 1));
+ expected_orders[0] = nr_pmds << pmd_order;
+
+ if (check_after_split_folio_orders(one_page, len, pagemap_fd,
+ kpageflags_fd, expected_orders,
+ (pmd_order + 1)))
+ exit(KSFT_FAIL);
+
+ exit(KSFT_PASS);
+ }
+ }
+
+ wait(&status);
+ free(one_page);
+
+ if (WIFEXITED(status))
+ ret = WEXITSTATUS(status);
+
+ if (level != 0)
+ exit(ret);
+
+ ksft_test_result_report(ret, "Split shared pmd\n");
+}
+
static void split_pmd_thp_to_order(int order)
{
char *one_page;
@@ -777,7 +846,7 @@ int main(int argc, char **argv)
if (!expected_orders)
ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno));
- tests = 2 + (pmd_order - 1) + (2 * pmd_order) + (pmd_order - 1) * 4 + 2;
+ tests = 3 + (pmd_order - 1) + (2 * pmd_order) + (pmd_order - 1) * 4 + 2;
ksft_set_plan(tests);
pagemap_fd = open(pagemap_proc, O_RDONLY);
@@ -792,6 +861,8 @@ int main(int argc, char **argv)
split_pmd_zero_pages();
+ split_shared_pmd();
+
for (i = 0; i < pmd_order; i++)
if (i != 1)
split_pmd_thp_to_order(i);
--
2.34.1