* [PATCH v12 mm-new 01/15] khugepaged: rename hpage_collapse_* to collapse_*
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-11-08 1:42 ` Wei Yang
2025-10-22 18:37 ` [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse Nico Pache
` (14 subsequent siblings)
15 siblings, 1 reply; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
The hpage_collapse_* functions are used by both madvise_collapse and
khugepaged. Remove the unnecessary hpage prefix to shorten the
function names.
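For quick reference, the renames in this patch (taken from the diff below)
are:

	hpage_collapse_test_exit()            -> collapse_test_exit()
	hpage_collapse_test_exit_or_disable() -> collapse_test_exit_or_disable()
	hpage_collapse_scan_abort()           -> collapse_scan_abort()
	hpage_collapse_find_target_node()     -> collapse_find_target_node()
	hpage_collapse_scan_pmd()             -> collapse_scan_pmd()
	hpage_collapse_scan_file()            -> collapse_scan_file()
	khugepaged_scan_mm_slot()             -> collapse_scan_mm_slot()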
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 73 ++++++++++++++++++++++++-------------------------
mm/mremap.c | 2 +-
2 files changed, 37 insertions(+), 38 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 5b7276bc14b1..6c4abc7f45cf 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -395,14 +395,14 @@ void __init khugepaged_destroy(void)
kmem_cache_destroy(mm_slot_cache);
}
-static inline int hpage_collapse_test_exit(struct mm_struct *mm)
+static inline int collapse_test_exit(struct mm_struct *mm)
{
return atomic_read(&mm->mm_users) == 0;
}
-static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
+static inline int collapse_test_exit_or_disable(struct mm_struct *mm)
{
- return hpage_collapse_test_exit(mm) ||
+ return collapse_test_exit(mm) ||
mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}
@@ -436,7 +436,7 @@ void __khugepaged_enter(struct mm_struct *mm)
int wakeup;
/* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
+ VM_BUG_ON_MM(collapse_test_exit(mm), mm);
if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
return;
@@ -490,7 +490,7 @@ void __khugepaged_exit(struct mm_struct *mm)
} else if (slot) {
/*
* This is required to serialize against
- * hpage_collapse_test_exit() (which is guaranteed to run
+ * collapse_test_exit() (which is guaranteed to run
* under mmap sem read mode). Stop here (after we return all
* pagetables will be destroyed) until khugepaged has finished
* working on the pagetables under the mmap_lock.
@@ -580,7 +580,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
folio = page_folio(page);
VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
- /* See hpage_collapse_scan_pmd(). */
+ /* See collapse_scan_pmd(). */
if (folio_maybe_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
@@ -831,7 +831,7 @@ struct collapse_control khugepaged_collapse_control = {
.is_khugepaged = true,
};
-static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
+static bool collapse_scan_abort(int nid, struct collapse_control *cc)
{
int i;
@@ -866,7 +866,7 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
}
#ifdef CONFIG_NUMA
-static int hpage_collapse_find_target_node(struct collapse_control *cc)
+static int collapse_find_target_node(struct collapse_control *cc)
{
int nid, target_node = 0, max_value = 0;
@@ -885,7 +885,7 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc)
return target_node;
}
#else
-static int hpage_collapse_find_target_node(struct collapse_control *cc)
+static int collapse_find_target_node(struct collapse_control *cc)
{
return 0;
}
@@ -906,7 +906,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
TVA_FORCED_COLLAPSE;
- if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
+ if (unlikely(collapse_test_exit_or_disable(mm)))
return SCAN_ANY_PROCESS;
*vmap = vma = find_vma(mm, address);
@@ -979,7 +979,7 @@ static int check_pmd_still_valid(struct mm_struct *mm,
/*
* Bring missing pages in from swap, to complete THP collapse.
- * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
+ * Only done if collapse_scan_pmd believes it is worthwhile.
*
* Called and returns without pte mapped or spinlocks held.
* Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
@@ -1065,7 +1065,7 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
{
gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
GFP_TRANSHUGE);
- int node = hpage_collapse_find_target_node(cc);
+ int node = collapse_find_target_node(cc);
struct folio *folio;
folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
@@ -1244,10 +1244,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
return result;
}
-static int hpage_collapse_scan_pmd(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long start_addr, bool *mmap_locked,
- struct collapse_control *cc)
+static int collapse_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long start_addr, bool *mmap_locked,
+ struct collapse_control *cc)
{
pmd_t *pmd;
pte_t *pte, *_pte;
@@ -1355,7 +1355,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* hit record.
*/
node = folio_nid(folio);
- if (hpage_collapse_scan_abort(node, cc)) {
+ if (collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
goto out_unmap;
}
@@ -1421,7 +1421,7 @@ static void collect_mm_slot(struct mm_slot *slot)
lockdep_assert_held(&khugepaged_mm_lock);
- if (hpage_collapse_test_exit(mm)) {
+ if (collapse_test_exit(mm)) {
/* free mm_slot */
hash_del(&slot->hash);
list_del(&slot->mm_node);
@@ -1741,7 +1741,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
continue;
- if (hpage_collapse_test_exit(mm))
+ if (collapse_test_exit(mm))
continue;
/*
* When a vma is registered with uffd-wp, we cannot recycle
@@ -2263,9 +2263,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
return result;
}
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start,
- struct collapse_control *cc)
+static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc)
{
struct folio *folio = NULL;
struct address_space *mapping = file->f_mapping;
@@ -2320,7 +2320,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
}
node = folio_nid(folio);
- if (hpage_collapse_scan_abort(node, cc)) {
+ if (collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
folio_put(folio);
break;
@@ -2370,7 +2370,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
return result;
}
-static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
+static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
struct collapse_control *cc)
__releases(&khugepaged_mm_lock)
__acquires(&khugepaged_mm_lock)
@@ -2405,7 +2405,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
goto breakouterloop_mmap_lock;
progress++;
- if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
+ if (unlikely(collapse_test_exit_or_disable(mm)))
goto breakouterloop;
vma_iter_init(&vmi, mm, khugepaged_scan.address);
@@ -2413,7 +2413,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
unsigned long hstart, hend;
cond_resched();
- if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
+ if (unlikely(collapse_test_exit_or_disable(mm))) {
progress++;
break;
}
@@ -2434,7 +2434,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
bool mmap_locked = true;
cond_resched();
- if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
+ if (unlikely(collapse_test_exit_or_disable(mm)))
goto breakouterloop;
VM_BUG_ON(khugepaged_scan.address < hstart ||
@@ -2447,12 +2447,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
mmap_read_unlock(mm);
mmap_locked = false;
- *result = hpage_collapse_scan_file(mm,
+ *result = collapse_scan_file(mm,
khugepaged_scan.address, file, pgoff, cc);
fput(file);
if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
mmap_read_lock(mm);
- if (hpage_collapse_test_exit_or_disable(mm))
+ if (collapse_test_exit_or_disable(mm))
goto breakouterloop;
*result = collapse_pte_mapped_thp(mm,
khugepaged_scan.address, false);
@@ -2461,7 +2461,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
mmap_read_unlock(mm);
}
} else {
- *result = hpage_collapse_scan_pmd(mm, vma,
+ *result = collapse_scan_pmd(mm, vma,
khugepaged_scan.address, &mmap_locked, cc);
}
@@ -2494,7 +2494,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
*/
- if (hpage_collapse_test_exit(mm) || !vma) {
+ if (collapse_test_exit(mm) || !vma) {
/*
* Make sure that if mm_users is reaching zero while
* khugepaged runs here, khugepaged_exit will find
@@ -2545,8 +2545,8 @@ static void khugepaged_do_scan(struct collapse_control *cc)
pass_through_head++;
if (khugepaged_has_work() &&
pass_through_head < 2)
- progress += khugepaged_scan_mm_slot(pages - progress,
- &result, cc);
+ progress += collapse_scan_mm_slot(pages - progress,
+ &result, cc);
else
progress = pages;
spin_unlock(&khugepaged_mm_lock);
@@ -2787,12 +2787,11 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmap_read_unlock(mm);
mmap_locked = false;
- result = hpage_collapse_scan_file(mm, addr, file, pgoff,
- cc);
+ result = collapse_scan_file(mm, addr, file, pgoff, cc);
fput(file);
} else {
- result = hpage_collapse_scan_pmd(mm, vma, addr,
- &mmap_locked, cc);
+ result = collapse_scan_pmd(mm, vma, addr,
+ &mmap_locked, cc);
}
if (!mmap_locked)
*lock_dropped = true;
diff --git a/mm/mremap.c b/mm/mremap.c
index bd7314898ec5..e2a1793b43ce 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -240,7 +240,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
goto out;
}
/*
- * Now new_pte is none, so hpage_collapse_scan_file() path can not find
+ * Now new_pte is none, so collapse_scan_file() path can not find
* this by traversing file->f_mapping, so there is no concurrency with
* retract_page_tables(). In addition, we already hold the exclusive
* mmap_lock, so this new_pte page is stable, so there is no need to get
--
2.51.0
* Re: [PATCH v12 mm-new 01/15] khugepaged: rename hpage_collapse_* to collapse_*
2025-10-22 18:37 ` [PATCH v12 mm-new 01/15] khugepaged: rename hpage_collapse_* to collapse_* Nico Pache
@ 2025-11-08 1:42 ` Wei Yang
0 siblings, 0 replies; 91+ messages in thread
From: Wei Yang @ 2025-11-08 1:42 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:03PM -0600, Nico Pache wrote:
>The hpage_collapse_* functions are used by both madvise_collapse and
>khugepaged. Remove the unnecessary hpage prefix to shorten the
>function names.
>
>Reviewed-by: Lance Yang <lance.yang@linux.dev>
>Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
>Reviewed-by: Zi Yan <ziy@nvidia.com>
>Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>Acked-by: David Hildenbrand <david@redhat.com>
>Signed-off-by: Nico Pache <npache@redhat.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
--
Wei Yang
Help you, Help me
* [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
2025-10-22 18:37 ` [PATCH v12 mm-new 01/15] khugepaged: rename hpage_collapse_* to collapse_* Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 9:00 ` Lance Yang
` (2 more replies)
2025-10-22 18:37 ` [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support Nico Pache
` (13 subsequent siblings)
15 siblings, 3 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
The khugepaged daemon and madvise_collapse have two different
implementations that do almost the same thing.
Create collapse_single_pmd to increase code reuse and provide a single
entry point for these two users.
Refactor madvise_collapse and collapse_scan_mm_slot to use the new
collapse_single_pmd function. This introduces a minor behavioral change
that addresses what is most likely an undiscovered bug. The current
implementation of khugepaged tests collapse_test_exit_or_disable before
calling collapse_pte_mapped_thp, but madvise_collapse did not perform
that check. By unifying these two callers, madvise_collapse now also
performs the check. We also change the return value in that case to
SCAN_ANY_PROCESS, which properly indicates that the process is no longer
valid to operate on.
We also guard the khugepaged_pages_collapsed variable to ensure it is
only incremented for khugepaged.
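A condensed sketch of the resulting flow, using only names that appear in
the diff below:

	/* khugepaged, in collapse_scan_mm_slot(): */
	*result = collapse_single_pmd(khugepaged_scan.address, vma, &mmap_locked, cc);

	/* madvise(MADV_COLLAPSE), in madvise_collapse(): */
	result = collapse_single_pmd(addr, vma, &mmap_locked, cc);

	/*
	 * collapse_single_pmd() then dispatches: anonymous VMAs go to
	 * collapse_scan_pmd(); file/shmem VMAs go to collapse_scan_file() and,
	 * on SCAN_PTE_MAPPED_HUGEPAGE, the helper retakes the mmap read lock,
	 * re-checks collapse_test_exit_or_disable() and calls
	 * collapse_pte_mapped_thp().
	 */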
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 97 ++++++++++++++++++++++++++-----------------------
1 file changed, 52 insertions(+), 45 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6c4abc7f45cf..36e31d99e507 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2370,6 +2370,53 @@ static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
return result;
}
+/*
+ * Try to collapse a single PMD starting at a PMD aligned addr, and return
+ * the results.
+ */
+static int collapse_single_pmd(unsigned long addr,
+ struct vm_area_struct *vma, bool *mmap_locked,
+ struct collapse_control *cc)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int result;
+ struct file *file;
+ pgoff_t pgoff;
+
+ if (vma_is_anonymous(vma)) {
+ result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
+ goto end;
+ }
+
+ file = get_file(vma->vm_file);
+ pgoff = linear_page_index(vma, addr);
+
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+ result = collapse_scan_file(mm, addr, file, pgoff, cc);
+ fput(file);
+ if (result != SCAN_PTE_MAPPED_HUGEPAGE)
+ goto end;
+
+ mmap_read_lock(mm);
+ *mmap_locked = true;
+ if (collapse_test_exit_or_disable(mm)) {
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+ return SCAN_ANY_PROCESS;
+ }
+ result = collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
+ if (result == SCAN_PMD_MAPPED)
+ result = SCAN_SUCCEED;
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+
+end:
+ if (cc->is_khugepaged && result == SCAN_SUCCEED)
+ ++khugepaged_pages_collapsed;
+ return result;
+}
+
static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
struct collapse_control *cc)
__releases(&khugepaged_mm_lock)
@@ -2440,34 +2487,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (!vma_is_anonymous(vma)) {
- struct file *file = get_file(vma->vm_file);
- pgoff_t pgoff = linear_page_index(vma,
- khugepaged_scan.address);
-
- mmap_read_unlock(mm);
- mmap_locked = false;
- *result = collapse_scan_file(mm,
- khugepaged_scan.address, file, pgoff, cc);
- fput(file);
- if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
- mmap_read_lock(mm);
- if (collapse_test_exit_or_disable(mm))
- goto breakouterloop;
- *result = collapse_pte_mapped_thp(mm,
- khugepaged_scan.address, false);
- if (*result == SCAN_PMD_MAPPED)
- *result = SCAN_SUCCEED;
- mmap_read_unlock(mm);
- }
- } else {
- *result = collapse_scan_pmd(mm, vma,
- khugepaged_scan.address, &mmap_locked, cc);
- }
-
- if (*result == SCAN_SUCCEED)
- ++khugepaged_pages_collapsed;
+ *result = collapse_single_pmd(khugepaged_scan.address,
+ vma, &mmap_locked, cc);
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
@@ -2781,34 +2803,19 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- if (!vma_is_anonymous(vma)) {
- struct file *file = get_file(vma->vm_file);
- pgoff_t pgoff = linear_page_index(vma, addr);
- mmap_read_unlock(mm);
- mmap_locked = false;
- result = collapse_scan_file(mm, addr, file, pgoff, cc);
- fput(file);
- } else {
- result = collapse_scan_pmd(mm, vma, addr,
- &mmap_locked, cc);
- }
+ result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
+
if (!mmap_locked)
*lock_dropped = true;
-handle_result:
switch (result) {
case SCAN_SUCCEED:
case SCAN_PMD_MAPPED:
++thps;
break;
- case SCAN_PTE_MAPPED_HUGEPAGE:
- BUG_ON(mmap_locked);
- mmap_read_lock(mm);
- result = collapse_pte_mapped_thp(mm, addr, true);
- mmap_read_unlock(mm);
- goto handle_result;
/* Whitelisted set of results where continuing OK */
+ case SCAN_PTE_MAPPED_HUGEPAGE:
case SCAN_PMD_NULL:
case SCAN_PTE_NON_PRESENT:
case SCAN_PTE_UFFD_WP:
--
2.51.0
* Re: [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse
2025-10-22 18:37 ` [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse Nico Pache
@ 2025-10-27 9:00 ` Lance Yang
2025-10-27 15:44 ` Lorenzo Stoakes
2025-11-08 1:44 ` Wei Yang
2 siblings, 0 replies; 91+ messages in thread
From: Lance Yang @ 2025-10-27 9:00 UTC (permalink / raw)
To: Nico Pache
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, linux-mm, linux-trace-kernel,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
linux-kernel, vishal.moola, thomas.hellstrom, linux-doc, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
vbabka, rppt, jannh, pfalcato
On 2025/10/23 02:37, Nico Pache wrote:
> The khugepaged daemon and madvise_collapse have two different
> implementations that do almost the same thing.
>
> Create collapse_single_pmd to increase code reuse and provide a single
> entry point for these two users.
>
> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> collapse_single_pmd function. This introduces a minor behavioral change
> that addresses what is most likely an undiscovered bug. The current
> implementation of khugepaged tests collapse_test_exit_or_disable before
> calling collapse_pte_mapped_thp, but madvise_collapse did not perform
> that check. By unifying these two callers, madvise_collapse now also
> performs the check. We also change the return value in that case to
> SCAN_ANY_PROCESS, which properly indicates that the process is no longer
> valid to operate on.
>
> We also guard the khugepaged_pages_collapsed variable to ensure it is
> only incremented for khugepaged.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
Nice cleanup! LGTM.
Reviewed-by: Lance Yang <lance.yang@linux.dev>
> mm/khugepaged.c | 97 ++++++++++++++++++++++++++-----------------------
> 1 file changed, 52 insertions(+), 45 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 6c4abc7f45cf..36e31d99e507 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2370,6 +2370,53 @@ static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> return result;
> }
>
> +/*
> + * Try to collapse a single PMD starting at a PMD aligned addr, and return
> + * the results.
> + */
> +static int collapse_single_pmd(unsigned long addr,
> + struct vm_area_struct *vma, bool *mmap_locked,
> + struct collapse_control *cc)
> +{
> + struct mm_struct *mm = vma->vm_mm;
> + int result;
> + struct file *file;
> + pgoff_t pgoff;
> +
> + if (vma_is_anonymous(vma)) {
> + result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
> + goto end;
> + }
> +
> + file = get_file(vma->vm_file);
> + pgoff = linear_page_index(vma, addr);
> +
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> + result = collapse_scan_file(mm, addr, file, pgoff, cc);
> + fput(file);
> + if (result != SCAN_PTE_MAPPED_HUGEPAGE)
> + goto end;
> +
> + mmap_read_lock(mm);
> + *mmap_locked = true;
> + if (collapse_test_exit_or_disable(mm)) {
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> + return SCAN_ANY_PROCESS;
> + }
> + result = collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
> + if (result == SCAN_PMD_MAPPED)
> + result = SCAN_SUCCEED;
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> +
> +end:
> + if (cc->is_khugepaged && result == SCAN_SUCCEED)
> + ++khugepaged_pages_collapsed;
> + return result;
> +}
> +
> static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
> struct collapse_control *cc)
> __releases(&khugepaged_mm_lock)
> @@ -2440,34 +2487,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
> VM_BUG_ON(khugepaged_scan.address < hstart ||
> khugepaged_scan.address + HPAGE_PMD_SIZE >
> hend);
> - if (!vma_is_anonymous(vma)) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma,
> - khugepaged_scan.address);
> -
> - mmap_read_unlock(mm);
> - mmap_locked = false;
> - *result = collapse_scan_file(mm,
> - khugepaged_scan.address, file, pgoff, cc);
> - fput(file);
> - if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> - mmap_read_lock(mm);
> - if (collapse_test_exit_or_disable(mm))
> - goto breakouterloop;
> - *result = collapse_pte_mapped_thp(mm,
> - khugepaged_scan.address, false);
> - if (*result == SCAN_PMD_MAPPED)
> - *result = SCAN_SUCCEED;
> - mmap_read_unlock(mm);
> - }
> - } else {
> - *result = collapse_scan_pmd(mm, vma,
> - khugepaged_scan.address, &mmap_locked, cc);
> - }
> -
> - if (*result == SCAN_SUCCEED)
> - ++khugepaged_pages_collapsed;
>
> + *result = collapse_single_pmd(khugepaged_scan.address,
> + vma, &mmap_locked, cc);
> /* move to next address */
> khugepaged_scan.address += HPAGE_PMD_SIZE;
> progress += HPAGE_PMD_NR;
> @@ -2781,34 +2803,19 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> mmap_assert_locked(mm);
> memset(cc->node_load, 0, sizeof(cc->node_load));
> nodes_clear(cc->alloc_nmask);
> - if (!vma_is_anonymous(vma)) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma, addr);
>
> - mmap_read_unlock(mm);
> - mmap_locked = false;
> - result = collapse_scan_file(mm, addr, file, pgoff, cc);
> - fput(file);
> - } else {
> - result = collapse_scan_pmd(mm, vma, addr,
> - &mmap_locked, cc);
> - }
> + result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
> +
> if (!mmap_locked)
> *lock_dropped = true;
>
> -handle_result:
> switch (result) {
> case SCAN_SUCCEED:
> case SCAN_PMD_MAPPED:
> ++thps;
> break;
> - case SCAN_PTE_MAPPED_HUGEPAGE:
> - BUG_ON(mmap_locked);
> - mmap_read_lock(mm);
> - result = collapse_pte_mapped_thp(mm, addr, true);
> - mmap_read_unlock(mm);
> - goto handle_result;
> /* Whitelisted set of results where continuing OK */
> + case SCAN_PTE_MAPPED_HUGEPAGE:
> case SCAN_PMD_NULL:
> case SCAN_PTE_NON_PRESENT:
> case SCAN_PTE_UFFD_WP:
* Re: [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse
2025-10-22 18:37 ` [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse Nico Pache
2025-10-27 9:00 ` Lance Yang
@ 2025-10-27 15:44 ` Lorenzo Stoakes
2025-11-08 1:44 ` Wei Yang
2 siblings, 0 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-27 15:44 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:04PM -0600, Nico Pache wrote:
> The khugepaged daemon and madvise_collapse have two different
> implementations that do almost the same thing.
>
> Create collapse_single_pmd to increase code reuse and provide a single
> entry point for these two users.
>
> Refactor madvise_collapse and collapse_scan_mm_slot to use the new
> collapse_single_pmd function. This introduces a minor behavioral change
> that addresses what is most likely an undiscovered bug. The current
> implementation of khugepaged tests collapse_test_exit_or_disable before
> calling collapse_pte_mapped_thp, but madvise_collapse did not perform
> that check. By unifying these two callers, madvise_collapse now also
> performs the check. We also change the return value in that case to
> SCAN_ANY_PROCESS, which properly indicates that the process is no longer
> valid to operate on.
>
> We also guard the khugepaged_pages_collapsed variable to ensure it is
> only incremented for khugepaged.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
Thanks, this LGTM so:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/khugepaged.c | 97 ++++++++++++++++++++++++++-----------------------
> 1 file changed, 52 insertions(+), 45 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 6c4abc7f45cf..36e31d99e507 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -2370,6 +2370,53 @@ static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> return result;
> }
>
> +/*
> + * Try to collapse a single PMD starting at a PMD aligned addr, and return
> + * the results.
> + */
> +static int collapse_single_pmd(unsigned long addr,
> + struct vm_area_struct *vma, bool *mmap_locked,
> + struct collapse_control *cc)
> +{
> + struct mm_struct *mm = vma->vm_mm;
> + int result;
> + struct file *file;
> + pgoff_t pgoff;
> +
> + if (vma_is_anonymous(vma)) {
> + result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
> + goto end;
> + }
> +
> + file = get_file(vma->vm_file);
> + pgoff = linear_page_index(vma, addr);
> +
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> + result = collapse_scan_file(mm, addr, file, pgoff, cc);
> + fput(file);
> + if (result != SCAN_PTE_MAPPED_HUGEPAGE)
> + goto end;
> +
> + mmap_read_lock(mm);
> + *mmap_locked = true;
> + if (collapse_test_exit_or_disable(mm)) {
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> + return SCAN_ANY_PROCESS;
> + }
> + result = collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
> + if (result == SCAN_PMD_MAPPED)
> + result = SCAN_SUCCEED;
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> +
> +end:
> + if (cc->is_khugepaged && result == SCAN_SUCCEED)
> + ++khugepaged_pages_collapsed;
> + return result;
> +}
> +
> static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
> struct collapse_control *cc)
> __releases(&khugepaged_mm_lock)
> @@ -2440,34 +2487,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
> VM_BUG_ON(khugepaged_scan.address < hstart ||
> khugepaged_scan.address + HPAGE_PMD_SIZE >
> hend);
> - if (!vma_is_anonymous(vma)) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma,
> - khugepaged_scan.address);
> -
> - mmap_read_unlock(mm);
> - mmap_locked = false;
> - *result = collapse_scan_file(mm,
> - khugepaged_scan.address, file, pgoff, cc);
> - fput(file);
> - if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> - mmap_read_lock(mm);
> - if (collapse_test_exit_or_disable(mm))
> - goto breakouterloop;
> - *result = collapse_pte_mapped_thp(mm,
> - khugepaged_scan.address, false);
> - if (*result == SCAN_PMD_MAPPED)
> - *result = SCAN_SUCCEED;
> - mmap_read_unlock(mm);
> - }
> - } else {
> - *result = collapse_scan_pmd(mm, vma,
> - khugepaged_scan.address, &mmap_locked, cc);
> - }
> -
> - if (*result == SCAN_SUCCEED)
> - ++khugepaged_pages_collapsed;
>
> + *result = collapse_single_pmd(khugepaged_scan.address,
> + vma, &mmap_locked, cc);
> /* move to next address */
> khugepaged_scan.address += HPAGE_PMD_SIZE;
> progress += HPAGE_PMD_NR;
> @@ -2781,34 +2803,19 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> mmap_assert_locked(mm);
> memset(cc->node_load, 0, sizeof(cc->node_load));
> nodes_clear(cc->alloc_nmask);
> - if (!vma_is_anonymous(vma)) {
> - struct file *file = get_file(vma->vm_file);
> - pgoff_t pgoff = linear_page_index(vma, addr);
>
> - mmap_read_unlock(mm);
> - mmap_locked = false;
> - result = collapse_scan_file(mm, addr, file, pgoff, cc);
> - fput(file);
> - } else {
> - result = collapse_scan_pmd(mm, vma, addr,
> - &mmap_locked, cc);
> - }
> + result = collapse_single_pmd(addr, vma, &mmap_locked, cc);
> +
> if (!mmap_locked)
> *lock_dropped = true;
>
> -handle_result:
> switch (result) {
> case SCAN_SUCCEED:
> case SCAN_PMD_MAPPED:
> ++thps;
> break;
> - case SCAN_PTE_MAPPED_HUGEPAGE:
> - BUG_ON(mmap_locked);
> - mmap_read_lock(mm);
> - result = collapse_pte_mapped_thp(mm, addr, true);
> - mmap_read_unlock(mm);
> - goto handle_result;
> /* Whitelisted set of results where continuing OK */
> + case SCAN_PTE_MAPPED_HUGEPAGE:
> case SCAN_PMD_NULL:
> case SCAN_PTE_NON_PRESENT:
> case SCAN_PTE_UFFD_WP:
> --
> 2.51.0
>
* Re: [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse
2025-10-22 18:37 ` [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse Nico Pache
2025-10-27 9:00 ` Lance Yang
2025-10-27 15:44 ` Lorenzo Stoakes
@ 2025-11-08 1:44 ` Wei Yang
2 siblings, 0 replies; 91+ messages in thread
From: Wei Yang @ 2025-11-08 1:44 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:04PM -0600, Nico Pache wrote:
>The khugepaged daemon and madvise_collapse have two different
>implementations that do almost the same thing.
>
>Create collapse_single_pmd to increase code reuse and provide a single
>entry point for these two users.
>
>Refactor madvise_collapse and collapse_scan_mm_slot to use the new
>collapse_single_pmd function. This introduces a minor behavioral change
>that addresses what is most likely an undiscovered bug. The current
>implementation of khugepaged tests collapse_test_exit_or_disable before
>calling collapse_pte_mapped_thp, but madvise_collapse did not perform
>that check. By unifying these two callers, madvise_collapse now also
>performs the check. We also change the return value in that case to
>SCAN_ANY_PROCESS, which properly indicates that the process is no longer
>valid to operate on.
>
>We also guard the khugepaged_pages_collapsed variable to ensure it is
>only incremented for khugepaged.
>
>Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>Acked-by: David Hildenbrand <david@redhat.com>
>Signed-off-by: Nico Pache <npache@redhat.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
One nit below.
>---
> mm/khugepaged.c | 97 ++++++++++++++++++++++++++-----------------------
> 1 file changed, 52 insertions(+), 45 deletions(-)
>
>diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>index 6c4abc7f45cf..36e31d99e507 100644
>--- a/mm/khugepaged.c
>+++ b/mm/khugepaged.c
>@@ -2370,6 +2370,53 @@ static int collapse_scan_file(struct mm_struct *mm, unsigned long addr,
> return result;
> }
>
>+/*
>+ * Try to collapse a single PMD starting at a PMD aligned addr, and return
>+ * the results.
>+ */
>+static int collapse_single_pmd(unsigned long addr,
>+ struct vm_area_struct *vma, bool *mmap_locked,
>+ struct collapse_control *cc)
>+{
>+ struct mm_struct *mm = vma->vm_mm;
>+ int result;
>+ struct file *file;
>+ pgoff_t pgoff;
>+
>+ if (vma_is_anonymous(vma)) {
>+ result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc);
>+ goto end;
>+ }
>+
>+ file = get_file(vma->vm_file);
>+ pgoff = linear_page_index(vma, addr);
>+
>+ mmap_read_unlock(mm);
>+ *mmap_locked = false;
>+ result = collapse_scan_file(mm, addr, file, pgoff, cc);
>+ fput(file);
>+ if (result != SCAN_PTE_MAPPED_HUGEPAGE)
>+ goto end;
>+
>+ mmap_read_lock(mm);
>+ *mmap_locked = true;
>+ if (collapse_test_exit_or_disable(mm)) {
>+ mmap_read_unlock(mm);
>+ *mmap_locked = false;
>+ return SCAN_ANY_PROCESS;
>+ }
>+ result = collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
>+ if (result == SCAN_PMD_MAPPED)
>+ result = SCAN_SUCCEED;
>+ mmap_read_unlock(mm);
>+ *mmap_locked = false;
For all cases, we end up setting mmap_locked to false. Not sure it is
worth adjusting it here.
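One way to read the nit, sketched here as an illustration only (not
something posted in the thread): since every path through this branch ends
with the lock dropped, the helper could record the final lock state once,
e.g.:

	mmap_read_lock(mm);
	if (collapse_test_exit_or_disable(mm)) {
		result = SCAN_ANY_PROCESS;
	} else {
		result = collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged);
		if (result == SCAN_PMD_MAPPED)
			result = SCAN_SUCCEED;
	}
	mmap_read_unlock(mm);
	*mmap_locked = false;	/* single place the final lock state is recorded */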
--
Wei Yang
Help you, Help me
* [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
2025-10-22 18:37 ` [PATCH v12 mm-new 01/15] khugepaged: rename hpage_collapse_* to collapse_* Nico Pache
2025-10-22 18:37 ` [PATCH v12 mm-new 02/15] introduce collapse_single_pmd to unify khugepaged and madvise_collapse Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 9:02 ` Lance Yang
2025-11-08 1:54 ` Wei Yang
2025-10-22 18:37 ` [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio() Nico Pache
` (12 subsequent siblings)
15 siblings, 2 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
For khugepaged to support different mTHP orders, we must generalize
hugepage_vma_revalidate() to check that the PMD is not shared by another
VMA and that the requested order is enabled.
No functional change in this patch. Also correct a comment about the
functionality of the revalidation.
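The resulting shape of the check, condensed from the diff below:

	static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
					   bool expect_anon, struct vm_area_struct **vmap,
					   struct collapse_control *cc, unsigned int order);

	/* The PMD-order fit is always validated (the PMD region must not run
	 * into another VMA), while the enabled-orders check is made against
	 * the caller-supplied order: */
	if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
		return SCAN_ADDRESS_RANGE;
	if (!thp_vma_allowable_orders(vma, vma->vm_flags, type, BIT(order)))
		return SCAN_VMA_CHECK;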
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 36e31d99e507..6cf8700823f9 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -893,14 +893,13 @@ static int collapse_find_target_node(struct collapse_control *cc)
/*
* If mmap_lock temporarily dropped, revalidate vma
- * before taking mmap_lock.
+ * after taking the mmap_lock again.
* Returns enum scan_result value.
*/
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
- bool expect_anon,
- struct vm_area_struct **vmap,
- struct collapse_control *cc)
+ bool expect_anon, struct vm_area_struct **vmap,
+ struct collapse_control *cc, unsigned int order)
{
struct vm_area_struct *vma;
enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
@@ -913,15 +912,16 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!vma)
return SCAN_VMA_NULL;
+ /* Always check the PMD order to ensure it is not shared by another VMA */
if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
+ if (!thp_vma_allowable_orders(vma, vma->vm_flags, type, BIT(order)))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
* remapped to file after khugepaged reaquired the mmap_lock.
*
- * thp_vma_allowable_order may return true for qualified file
+ * thp_vma_allowable_orders may return true for qualified file
* vmas.
*/
if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
@@ -1117,7 +1117,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
goto out_nolock;
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
+ HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
@@ -1151,7 +1152,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* mmap_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
+ HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
@@ -2792,7 +2794,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
mmap_read_lock(mm);
mmap_locked = true;
result = hugepage_vma_revalidate(mm, addr, false, &vma,
- cc);
+ cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED) {
last_fail = result;
goto out_nolock;
--
2.51.0
* Re: [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support
2025-10-22 18:37 ` [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support Nico Pache
@ 2025-10-27 9:02 ` Lance Yang
2025-11-08 1:54 ` Wei Yang
1 sibling, 0 replies; 91+ messages in thread
From: Lance Yang @ 2025-10-27 9:02 UTC (permalink / raw)
To: Nico Pache
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, linux-kernel, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, linux-doc, linux-mm,
linux-trace-kernel, aarcange, raquini, anshuman.khandual,
catalin.marinas, tiwai, will, dave.hansen, jack, cl, jglisse,
surenb, zokeefe, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, vbabka, rppt, jannh, pfalcato
On 2025/10/23 02:37, Nico Pache wrote:
> For khugepaged to support different mTHP orders, we must generalize
> hugepage_vma_revalidate() to check that the PMD is not shared by another
> VMA and that the requested order is enabled.
>
> No functional change in this patch. Also correct a comment about the
> functionality of the revalidation.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
LGTM!
Reviewed-by: Lance Yang <lance.yang@linux.dev>
> mm/khugepaged.c | 20 +++++++++++---------
> 1 file changed, 11 insertions(+), 9 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 36e31d99e507..6cf8700823f9 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -893,14 +893,13 @@ static int collapse_find_target_node(struct collapse_control *cc)
>
> /*
> * If mmap_lock temporarily dropped, revalidate vma
> - * before taking mmap_lock.
> + * after taking the mmap_lock again.
> * Returns enum scan_result value.
> */
>
> static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> - bool expect_anon,
> - struct vm_area_struct **vmap,
> - struct collapse_control *cc)
> + bool expect_anon, struct vm_area_struct **vmap,
> + struct collapse_control *cc, unsigned int order)
> {
> struct vm_area_struct *vma;
> enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
> @@ -913,15 +912,16 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
> if (!vma)
> return SCAN_VMA_NULL;
>
> + /* Always check the PMD order to ensure it is not shared by another VMA */
> if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
> return SCAN_ADDRESS_RANGE;
> - if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
> + if (!thp_vma_allowable_orders(vma, vma->vm_flags, type, BIT(order)))
> return SCAN_VMA_CHECK;
> /*
> * Anon VMA expected, the address may be unmapped then
> * remapped to file after khugepaged reaquired the mmap_lock.
> *
> - * thp_vma_allowable_order may return true for qualified file
> + * thp_vma_allowable_orders may return true for qualified file
> * vmas.
> */
> if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
> @@ -1117,7 +1117,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> goto out_nolock;
>
> mmap_read_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
> + result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> + HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED) {
> mmap_read_unlock(mm);
> goto out_nolock;
> @@ -1151,7 +1152,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * mmap_lock.
> */
> mmap_write_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
> + result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> + HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED)
> goto out_up_write;
> /* check if the pmd is still valid */
> @@ -2792,7 +2794,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> mmap_read_lock(mm);
> mmap_locked = true;
> result = hugepage_vma_revalidate(mm, addr, false, &vma,
> - cc);
> + cc, HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED) {
> last_fail = result;
> goto out_nolock;
* Re: [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support
2025-10-22 18:37 ` [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support Nico Pache
2025-10-27 9:02 ` Lance Yang
@ 2025-11-08 1:54 ` Wei Yang
1 sibling, 0 replies; 91+ messages in thread
From: Wei Yang @ 2025-11-08 1:54 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:05PM -0600, Nico Pache wrote:
>For khugepaged to support different mTHP orders, we must generalize
>hugepage_vma_revalidate() to check that the PMD is not shared by another
>VMA and that the requested order is enabled.
>
>No functional change in this patch. Also correct a comment about the
>functionality of the revalidation.
>
>Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>Acked-by: David Hildenbrand <david@redhat.com>
>Co-developed-by: Dev Jain <dev.jain@arm.com>
>Signed-off-by: Dev Jain <dev.jain@arm.com>
>Signed-off-by: Nico Pache <npache@redhat.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
--
Wei Yang
Help you, Help me
* [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio()
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (2 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 03/15] khugepaged: generalize hugepage_vma_revalidate for mTHP support Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 9:05 ` Lance Yang
2025-11-08 2:34 ` Wei Yang
2025-10-22 18:37 ` [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
` (11 subsequent siblings)
15 siblings, 2 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
From: Dev Jain <dev.jain@arm.com>
Pass order to alloc_charge_folio() and update mTHP statistics.
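Condensed from the diff below, the accounting rule this introduces is that
the existing THP_COLLAPSE_ALLOC / THP_COLLAPSE_ALLOC_FAILED vmstat events
stay PMD-order only, while the new MTHP_STAT_COLLAPSE_ALLOC /
MTHP_STAT_COLLAPSE_ALLOC_FAILED counters are bumped for whatever order was
requested:

	if (order == HPAGE_PMD_ORDER)
		count_vm_event(THP_COLLAPSE_ALLOC);		/* legacy, PMD only */
	count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC);	/* new, per order */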
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Co-developed-by: Nico Pache <npache@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
Documentation/admin-guide/mm/transhuge.rst | 8 ++++++++
include/linux/huge_mm.h | 2 ++
mm/huge_memory.c | 4 ++++
mm/khugepaged.c | 17 +++++++++++------
4 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 1654211cc6cf..13269a0074d4 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -634,6 +634,14 @@ anon_fault_fallback_charge
instead falls back to using huge pages with lower orders or
small pages even though the allocation was successful.
+collapse_alloc
+ is incremented every time a huge page is successfully allocated for a
+ khugepaged collapse.
+
+collapse_alloc_failed
+ is incremented every time a huge page allocation fails during a
+ khugepaged collapse.
+
zswpout
is incremented every time a huge page is swapped out to zswap in one
piece without splitting.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7698b3542c4f..3d29624c4f3f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -128,6 +128,8 @@ enum mthp_stat_item {
MTHP_STAT_ANON_FAULT_ALLOC,
MTHP_STAT_ANON_FAULT_FALLBACK,
MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+ MTHP_STAT_COLLAPSE_ALLOC,
+ MTHP_STAT_COLLAPSE_ALLOC_FAILED,
MTHP_STAT_ZSWPOUT,
MTHP_STAT_SWPIN,
MTHP_STAT_SWPIN_FALLBACK,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 370ecfd6a182..0063d1ba926e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -620,6 +620,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(collapse_alloc, MTHP_STAT_COLLAPSE_ALLOC);
+DEFINE_MTHP_STAT_ATTR(collapse_alloc_failed, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
@@ -685,6 +687,8 @@ static struct attribute *any_stats_attrs[] = {
#endif
&split_attr.attr,
&split_failed_attr.attr,
+ &collapse_alloc_attr.attr,
+ &collapse_alloc_failed_attr.attr,
NULL,
};
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6cf8700823f9..36ee659acfbb 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1061,21 +1061,26 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
}
static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
- struct collapse_control *cc)
+ struct collapse_control *cc, unsigned int order)
{
gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
GFP_TRANSHUGE);
int node = collapse_find_target_node(cc);
struct folio *folio;
- folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
+ folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
if (!folio) {
*foliop = NULL;
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ if (order == HPAGE_PMD_ORDER)
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
return SCAN_ALLOC_HUGE_PAGE_FAIL;
}
- count_vm_event(THP_COLLAPSE_ALLOC);
+ if (order == HPAGE_PMD_ORDER)
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC);
+
if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
folio_put(folio);
*foliop = NULL;
@@ -1112,7 +1117,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
*/
mmap_read_unlock(mm);
- result = alloc_charge_folio(&folio, mm, cc);
+ result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_nolock;
@@ -1850,7 +1855,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
- result = alloc_charge_folio(&new_folio, mm, cc);
+ result = alloc_charge_folio(&new_folio, mm, cc, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out;
--
2.51.0
* Re: [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio()
2025-10-22 18:37 ` [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio() Nico Pache
@ 2025-10-27 9:05 ` Lance Yang
2025-11-08 2:34 ` Wei Yang
1 sibling, 0 replies; 91+ messages in thread
From: Lance Yang @ 2025-10-27 9:05 UTC (permalink / raw)
To: Nico Pache, dev.jain
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, linux-trace-kernel, willy, peterx, wangkefeng.wang,
linux-mm, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, linux-kernel, cl, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, vbabka, rppt,
jannh, pfalcato, linux-doc
On 2025/10/23 02:37, Nico Pache wrote:
> From: Dev Jain <dev.jain@arm.com>
>
> Pass order to alloc_charge_folio() and update mTHP statistics.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Co-developed-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
Cool! LGTM.
Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Documentation/admin-guide/mm/transhuge.rst | 8 ++++++++
> include/linux/huge_mm.h | 2 ++
> mm/huge_memory.c | 4 ++++
> mm/khugepaged.c | 17 +++++++++++------
> 4 files changed, 25 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 1654211cc6cf..13269a0074d4 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -634,6 +634,14 @@ anon_fault_fallback_charge
> instead falls back to using huge pages with lower orders or
> small pages even though the allocation was successful.
>
> +collapse_alloc
> + is incremented every time a huge page is successfully allocated for a
> + khugepaged collapse.
> +
> +collapse_alloc_failed
> + is incremented every time a huge page allocation fails during a
> + khugepaged collapse.
> +
> zswpout
> is incremented every time a huge page is swapped out to zswap in one
> piece without splitting.
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 7698b3542c4f..3d29624c4f3f 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -128,6 +128,8 @@ enum mthp_stat_item {
> MTHP_STAT_ANON_FAULT_ALLOC,
> MTHP_STAT_ANON_FAULT_FALLBACK,
> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> + MTHP_STAT_COLLAPSE_ALLOC,
> + MTHP_STAT_COLLAPSE_ALLOC_FAILED,
> MTHP_STAT_ZSWPOUT,
> MTHP_STAT_SWPIN,
> MTHP_STAT_SWPIN_FALLBACK,
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 370ecfd6a182..0063d1ba926e 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -620,6 +620,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
> DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
> DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
> DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +DEFINE_MTHP_STAT_ATTR(collapse_alloc, MTHP_STAT_COLLAPSE_ALLOC);
> +DEFINE_MTHP_STAT_ATTR(collapse_alloc_failed, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
> DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
> DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
> DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
> @@ -685,6 +687,8 @@ static struct attribute *any_stats_attrs[] = {
> #endif
> &split_attr.attr,
> &split_failed_attr.attr,
> + &collapse_alloc_attr.attr,
> + &collapse_alloc_failed_attr.attr,
> NULL,
> };
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 6cf8700823f9..36ee659acfbb 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1061,21 +1061,26 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> }
>
> static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> - struct collapse_control *cc)
> + struct collapse_control *cc, unsigned int order)
> {
> gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
> GFP_TRANSHUGE);
> int node = collapse_find_target_node(cc);
> struct folio *folio;
>
> - folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
> + folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
> if (!folio) {
> *foliop = NULL;
> - count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> + if (order == HPAGE_PMD_ORDER)
> + count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
> return SCAN_ALLOC_HUGE_PAGE_FAIL;
> }
>
> - count_vm_event(THP_COLLAPSE_ALLOC);
> + if (order == HPAGE_PMD_ORDER)
> + count_vm_event(THP_COLLAPSE_ALLOC);
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC);
> +
> if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
> folio_put(folio);
> *foliop = NULL;
> @@ -1112,7 +1117,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> */
> mmap_read_unlock(mm);
>
> - result = alloc_charge_folio(&folio, mm, cc);
> + result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
>
> @@ -1850,7 +1855,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
> VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
>
> - result = alloc_charge_folio(&new_folio, mm, cc);
> + result = alloc_charge_folio(&new_folio, mm, cc, HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED)
> goto out;
>
* Re: [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio()
2025-10-22 18:37 ` [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio() Nico Pache
2025-10-27 9:05 ` Lance Yang
@ 2025-11-08 2:34 ` Wei Yang
1 sibling, 0 replies; 91+ messages in thread
From: Wei Yang @ 2025-11-08 2:34 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:06PM -0600, Nico Pache wrote:
>From: Dev Jain <dev.jain@arm.com>
>
>Pass order to alloc_charge_folio() and update mTHP statistics.
>
>Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>Acked-by: David Hildenbrand <david@redhat.com>
>Co-developed-by: Nico Pache <npache@redhat.com>
>Signed-off-by: Nico Pache <npache@redhat.com>
>Signed-off-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
--
Wei Yang
Help you, Help me
* [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (3 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 04/15] khugepaged: generalize alloc_charge_folio() Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 9:17 ` Lance Yang
` (2 more replies)
2025-10-22 18:37 ` [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function Nico Pache
` (10 subsequent siblings)
15 siblings, 3 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
generalize the order of the __collapse_huge_page_* functions
to support future mTHP collapse.
mTHP collapse will not honor the khugepaged_max_ptes_shared or
khugepaged_max_ptes_swap parameters, and will fail if it encounters a
shared or swapped entry.
No functional changes in this patch.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand <david@redhat.com>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++-------------------
1 file changed, 48 insertions(+), 30 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 36ee659acfbb..4ccebf5dda97 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -537,25 +537,25 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
- unsigned long start_addr,
- pte_t *pte,
- struct collapse_control *cc,
- struct list_head *compound_pagelist)
+ unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
+ unsigned int order, struct list_head *compound_pagelist)
{
struct page *page = NULL;
struct folio *folio = NULL;
unsigned long addr = start_addr;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
+ const unsigned long nr_pages = 1UL << order;
+ int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+ for (_pte = pte; _pte < pte + nr_pages;
_pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (pte_none_or_zero(pteval)) {
++none_or_zero;
if (!userfaultfd_armed(vma) &&
(!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
+ none_or_zero <= max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
@@ -583,8 +583,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
/* See collapse_scan_pmd(). */
if (folio_maybe_mapped_shared(folio)) {
++shared;
- if (cc->is_khugepaged &&
- shared > khugepaged_max_ptes_shared) {
+ /*
+ * TODO: Support shared pages without leading to further
+ * mTHP collapses. Currently bringing in new pages via
+ * shared may cause a future higher order collapse on a
+ * rescan of the same range.
+ */
+ if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
+ shared > khugepaged_max_ptes_shared)) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out;
@@ -677,18 +683,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
static void __collapse_huge_page_copy_succeeded(pte_t *pte,
- struct vm_area_struct *vma,
- unsigned long address,
- spinlock_t *ptl,
- struct list_head *compound_pagelist)
+ struct vm_area_struct *vma, unsigned long address,
+ spinlock_t *ptl, unsigned int order,
+ struct list_head *compound_pagelist)
{
- unsigned long end = address + HPAGE_PMD_SIZE;
+ unsigned long end = address + (PAGE_SIZE << order);
struct folio *src, *tmp;
pte_t pteval;
pte_t *_pte;
unsigned int nr_ptes;
+ const unsigned long nr_pages = 1UL << order;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
+ for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
address += nr_ptes * PAGE_SIZE) {
nr_ptes = 1;
pteval = ptep_get(_pte);
@@ -741,13 +747,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
}
static void __collapse_huge_page_copy_failed(pte_t *pte,
- pmd_t *pmd,
- pmd_t orig_pmd,
- struct vm_area_struct *vma,
- struct list_head *compound_pagelist)
+ pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
+ unsigned int order, struct list_head *compound_pagelist)
{
spinlock_t *pmd_ptl;
-
+ const unsigned long nr_pages = 1UL << order;
/*
* Re-establish the PMD to point to the original page table
* entry. Restoring PMD needs to be done prior to releasing
@@ -761,7 +765,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
* Release both raw and compound pages isolated
* in __collapse_huge_page_isolate.
*/
- release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
+ release_pte_pages(pte, pte + nr_pages, compound_pagelist);
}
/*
@@ -781,16 +785,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
*/
static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
- unsigned long address, spinlock_t *ptl,
+ unsigned long address, spinlock_t *ptl, unsigned int order,
struct list_head *compound_pagelist)
{
unsigned int i;
int result = SCAN_SUCCEED;
-
+ const unsigned long nr_pages = 1UL << order;
/*
* Copying pages' contents is subject to memory poison at any iteration.
*/
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0; i < nr_pages; i++) {
pte_t pteval = ptep_get(pte + i);
struct page *page = folio_page(folio, i);
unsigned long src_addr = address + i * PAGE_SIZE;
@@ -809,10 +813,10 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
if (likely(result == SCAN_SUCCEED))
__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
- compound_pagelist);
+ order, compound_pagelist);
else
__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
- compound_pagelist);
+ order, compound_pagelist);
return result;
}
@@ -985,13 +989,12 @@ static int check_pmd_still_valid(struct mm_struct *mm,
* Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
*/
static int __collapse_huge_page_swapin(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long start_addr, pmd_t *pmd,
- int referenced)
+ struct vm_area_struct *vma, unsigned long start_addr,
+ pmd_t *pmd, int referenced, unsigned int order)
{
int swapped_in = 0;
vm_fault_t ret = 0;
- unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
+ unsigned long addr, end = start_addr + (PAGE_SIZE << order);
int result;
pte_t *pte = NULL;
spinlock_t *ptl;
@@ -1022,6 +1025,19 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
if (!is_swap_pte(vmf.orig_pte))
continue;
+ /*
+ * TODO: Support swapin without leading to further mTHP
+ * collapses. Currently bringing in new pages via swapin may
+ * cause a future higher order collapse on a rescan of the same
+ * range.
+ */
+ if (order != HPAGE_PMD_ORDER) {
+ pte_unmap(pte);
+ mmap_read_unlock(mm);
+ result = SCAN_EXCEED_SWAP_PTE;
+ goto out;
+ }
+
vmf.pte = pte;
vmf.ptl = ptl;
ret = do_swap_page(&vmf);
@@ -1142,7 +1158,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* that case. Continuing to collapse causes inconsistency.
*/
result = __collapse_huge_page_swapin(mm, vma, address, pmd,
- referenced);
+ referenced, HPAGE_PMD_ORDER);
if (result != SCAN_SUCCEED)
goto out_nolock;
}
@@ -1190,6 +1206,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
if (pte) {
result = __collapse_huge_page_isolate(vma, address, pte, cc,
+ HPAGE_PMD_ORDER,
&compound_pagelist);
spin_unlock(pte_ptl);
} else {
@@ -1220,6 +1237,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
vma, address, pte_ptl,
+ HPAGE_PMD_ORDER,
&compound_pagelist);
pte_unmap(pte);
if (unlikely(result != SCAN_SUCCEED))
--
2.51.0
* Re: [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support
2025-10-22 18:37 ` [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
@ 2025-10-27 9:17 ` Lance Yang
2025-10-27 16:00 ` Lorenzo Stoakes
2025-11-08 3:01 ` Wei Yang
2 siblings, 0 replies; 91+ messages in thread
From: Lance Yang @ 2025-10-27 9:17 UTC (permalink / raw)
To: Nico Pache
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, linux-kernel, baohua, willy, peterx,
wangkefeng.wang, linux-mm, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, vbabka, rppt, jannh, pfalcato,
linux-trace-kernel, linux-doc
On 2025/10/23 02:37, Nico Pache wrote:
> generalize the order of the __collapse_huge_page_* functions
> to support future mTHP collapse.
>
> mTHP collapse will not honor the khugepaged_max_ptes_shared or
> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> shared or swapped entry.
Yeah, IMHO, it's the right call to avoid the complexity of potential
"collapse creep" at this stage and get the core functionality right first ;)
>
> No functional changes in this patch.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
Cool! LGTM.
Reviewed-by: Lance Yang <lance.yang@linux.dev>
> mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++-------------------
> 1 file changed, 48 insertions(+), 30 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 36ee659acfbb..4ccebf5dda97 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -537,25 +537,25 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
> }
>
> static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> - unsigned long start_addr,
> - pte_t *pte,
> - struct collapse_control *cc,
> - struct list_head *compound_pagelist)
> + unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
> + unsigned int order, struct list_head *compound_pagelist)
> {
> struct page *page = NULL;
> struct folio *folio = NULL;
> unsigned long addr = start_addr;
> pte_t *_pte;
> int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> + const unsigned long nr_pages = 1UL << order;
> + int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>
> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> + for (_pte = pte; _pte < pte + nr_pages;
> _pte++, addr += PAGE_SIZE) {
> pte_t pteval = ptep_get(_pte);
> if (pte_none_or_zero(pteval)) {
> ++none_or_zero;
> if (!userfaultfd_armed(vma) &&
> (!cc->is_khugepaged ||
> - none_or_zero <= khugepaged_max_ptes_none)) {
> + none_or_zero <= max_ptes_none)) {
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
> @@ -583,8 +583,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> /* See collapse_scan_pmd(). */
> if (folio_maybe_mapped_shared(folio)) {
> ++shared;
> - if (cc->is_khugepaged &&
> - shared > khugepaged_max_ptes_shared) {
> + /*
> + * TODO: Support shared pages without leading to further
> + * mTHP collapses. Currently bringing in new pages via
> + * shared may cause a future higher order collapse on a
> + * rescan of the same range.
> + */
> + if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> + shared > khugepaged_max_ptes_shared)) {
> result = SCAN_EXCEED_SHARED_PTE;
> count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> goto out;
> @@ -677,18 +683,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> }
>
> static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> - struct vm_area_struct *vma,
> - unsigned long address,
> - spinlock_t *ptl,
> - struct list_head *compound_pagelist)
> + struct vm_area_struct *vma, unsigned long address,
> + spinlock_t *ptl, unsigned int order,
> + struct list_head *compound_pagelist)
> {
> - unsigned long end = address + HPAGE_PMD_SIZE;
> + unsigned long end = address + (PAGE_SIZE << order);
> struct folio *src, *tmp;
> pte_t pteval;
> pte_t *_pte;
> unsigned int nr_ptes;
> + const unsigned long nr_pages = 1UL << order;
>
> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
> + for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
> address += nr_ptes * PAGE_SIZE) {
> nr_ptes = 1;
> pteval = ptep_get(_pte);
> @@ -741,13 +747,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> }
>
> static void __collapse_huge_page_copy_failed(pte_t *pte,
> - pmd_t *pmd,
> - pmd_t orig_pmd,
> - struct vm_area_struct *vma,
> - struct list_head *compound_pagelist)
> + pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> + unsigned int order, struct list_head *compound_pagelist)
> {
> spinlock_t *pmd_ptl;
> -
> + const unsigned long nr_pages = 1UL << order;
> /*
> * Re-establish the PMD to point to the original page table
> * entry. Restoring PMD needs to be done prior to releasing
> @@ -761,7 +765,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> * Release both raw and compound pages isolated
> * in __collapse_huge_page_isolate.
> */
> - release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> + release_pte_pages(pte, pte + nr_pages, compound_pagelist);
> }
>
> /*
> @@ -781,16 +785,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> */
> static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
> pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> - unsigned long address, spinlock_t *ptl,
> + unsigned long address, spinlock_t *ptl, unsigned int order,
> struct list_head *compound_pagelist)
> {
> unsigned int i;
> int result = SCAN_SUCCEED;
> -
> + const unsigned long nr_pages = 1UL << order;
> /*
> * Copying pages' contents is subject to memory poison at any iteration.
> */
> - for (i = 0; i < HPAGE_PMD_NR; i++) {
> + for (i = 0; i < nr_pages; i++) {
> pte_t pteval = ptep_get(pte + i);
> struct page *page = folio_page(folio, i);
> unsigned long src_addr = address + i * PAGE_SIZE;
> @@ -809,10 +813,10 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
>
> if (likely(result == SCAN_SUCCEED))
> __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> - compound_pagelist);
> + order, compound_pagelist);
> else
> __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> - compound_pagelist);
> + order, compound_pagelist);
>
> return result;
> }
> @@ -985,13 +989,12 @@ static int check_pmd_still_valid(struct mm_struct *mm,
> * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
> */
> static int __collapse_huge_page_swapin(struct mm_struct *mm,
> - struct vm_area_struct *vma,
> - unsigned long start_addr, pmd_t *pmd,
> - int referenced)
> + struct vm_area_struct *vma, unsigned long start_addr,
> + pmd_t *pmd, int referenced, unsigned int order)
> {
> int swapped_in = 0;
> vm_fault_t ret = 0;
> - unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
> + unsigned long addr, end = start_addr + (PAGE_SIZE << order);
> int result;
> pte_t *pte = NULL;
> spinlock_t *ptl;
> @@ -1022,6 +1025,19 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> if (!is_swap_pte(vmf.orig_pte))
> continue;
>
> + /*
> + * TODO: Support swapin without leading to further mTHP
> + * collapses. Currently bringing in new pages via swapin may
> + * cause a future higher order collapse on a rescan of the same
> + * range.
> + */
> + if (order != HPAGE_PMD_ORDER) {
> + pte_unmap(pte);
> + mmap_read_unlock(mm);
> + result = SCAN_EXCEED_SWAP_PTE;
> + goto out;
> + }
> +
> vmf.pte = pte;
> vmf.ptl = ptl;
> ret = do_swap_page(&vmf);
> @@ -1142,7 +1158,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * that case. Continuing to collapse causes inconsistency.
> */
> result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> - referenced);
> + referenced, HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
> }
> @@ -1190,6 +1206,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> if (pte) {
> result = __collapse_huge_page_isolate(vma, address, pte, cc,
> + HPAGE_PMD_ORDER,
> &compound_pagelist);
> spin_unlock(pte_ptl);
> } else {
> @@ -1220,6 +1237,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>
> result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> vma, address, pte_ptl,
> + HPAGE_PMD_ORDER,
> &compound_pagelist);
> pte_unmap(pte);
> if (unlikely(result != SCAN_SUCCEED))
* Re: [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support
2025-10-22 18:37 ` [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
2025-10-27 9:17 ` Lance Yang
@ 2025-10-27 16:00 ` Lorenzo Stoakes
2025-11-10 13:20 ` Nico Pache
2025-11-08 3:01 ` Wei Yang
2 siblings, 1 reply; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-27 16:00 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:07PM -0600, Nico Pache wrote:
> generalize the order of the __collapse_huge_page_* functions
> to support future mTHP collapse.
>
> mTHP collapse will not honor the khugepaged_max_ptes_shared or
> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> shared or swapped entry.
>
> No functional changes in this patch.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
Thanks for addressing the v10 stuff (didn't check at v11).
Overall LGTM, so:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Few minor nits below.
> ---
> mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++-------------------
> 1 file changed, 48 insertions(+), 30 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 36ee659acfbb..4ccebf5dda97 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -537,25 +537,25 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
> }
>
> static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> - unsigned long start_addr,
> - pte_t *pte,
> - struct collapse_control *cc,
> - struct list_head *compound_pagelist)
> + unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
> + unsigned int order, struct list_head *compound_pagelist)
This series isn't the right place for it, but god do we need helper structs in
this code... :)
> {
> struct page *page = NULL;
> struct folio *folio = NULL;
> unsigned long addr = start_addr;
> pte_t *_pte;
> int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> + const unsigned long nr_pages = 1UL << order;
> + int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
Nit, but we should const-ify this too.
>
> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> + for (_pte = pte; _pte < pte + nr_pages;
> _pte++, addr += PAGE_SIZE) {
> pte_t pteval = ptep_get(_pte);
> if (pte_none_or_zero(pteval)) {
> ++none_or_zero;
> if (!userfaultfd_armed(vma) &&
> (!cc->is_khugepaged ||
> - none_or_zero <= khugepaged_max_ptes_none)) {
> + none_or_zero <= max_ptes_none)) {
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
> @@ -583,8 +583,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> /* See collapse_scan_pmd(). */
> if (folio_maybe_mapped_shared(folio)) {
> ++shared;
> - if (cc->is_khugepaged &&
> - shared > khugepaged_max_ptes_shared) {
> + /*
> + * TODO: Support shared pages without leading to further
> + * mTHP collapses. Currently bringing in new pages via
> + * shared may cause a future higher order collapse on a
> + * rescan of the same range.
> + */
Yeah, I wish we could find a way to address this in some other way but given the
mire of THP code putting this comment here for now is probably the only sensible
way.
> + if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> + shared > khugepaged_max_ptes_shared)) {
> result = SCAN_EXCEED_SHARED_PTE;
> count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> goto out;
> @@ -677,18 +683,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> }
>
> static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> - struct vm_area_struct *vma,
> - unsigned long address,
> - spinlock_t *ptl,
> - struct list_head *compound_pagelist)
> + struct vm_area_struct *vma, unsigned long address,
> + spinlock_t *ptl, unsigned int order,
> + struct list_head *compound_pagelist)
> {
> - unsigned long end = address + HPAGE_PMD_SIZE;
> + unsigned long end = address + (PAGE_SIZE << order);
> struct folio *src, *tmp;
> pte_t pteval;
> pte_t *_pte;
> unsigned int nr_ptes;
> + const unsigned long nr_pages = 1UL << order;
>
> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
> + for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
> address += nr_ptes * PAGE_SIZE) {
> nr_ptes = 1;
> pteval = ptep_get(_pte);
> @@ -741,13 +747,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> }
>
> static void __collapse_huge_page_copy_failed(pte_t *pte,
> - pmd_t *pmd,
> - pmd_t orig_pmd,
> - struct vm_area_struct *vma,
> - struct list_head *compound_pagelist)
> + pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> + unsigned int order, struct list_head *compound_pagelist)
> {
> spinlock_t *pmd_ptl;
> -
> + const unsigned long nr_pages = 1UL << order;
> /*
> * Re-establish the PMD to point to the original page table
> * entry. Restoring PMD needs to be done prior to releasing
> @@ -761,7 +765,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> * Release both raw and compound pages isolated
> * in __collapse_huge_page_isolate.
> */
> - release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> + release_pte_pages(pte, pte + nr_pages, compound_pagelist);
> }
>
> /*
> @@ -781,16 +785,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> */
> static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
> pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> - unsigned long address, spinlock_t *ptl,
> + unsigned long address, spinlock_t *ptl, unsigned int order,
> struct list_head *compound_pagelist)
> {
> unsigned int i;
> int result = SCAN_SUCCEED;
> -
> + const unsigned long nr_pages = 1UL << order;
> /*
> * Copying pages' contents is subject to memory poison at any iteration.
> */
> - for (i = 0; i < HPAGE_PMD_NR; i++) {
> + for (i = 0; i < nr_pages; i++) {
> pte_t pteval = ptep_get(pte + i);
> struct page *page = folio_page(folio, i);
> unsigned long src_addr = address + i * PAGE_SIZE;
> @@ -809,10 +813,10 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
>
> if (likely(result == SCAN_SUCCEED))
> __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> - compound_pagelist);
> + order, compound_pagelist);
> else
> __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> - compound_pagelist);
> + order, compound_pagelist);
>
> return result;
> }
> @@ -985,13 +989,12 @@ static int check_pmd_still_valid(struct mm_struct *mm,
> * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
> */
> static int __collapse_huge_page_swapin(struct mm_struct *mm,
> - struct vm_area_struct *vma,
> - unsigned long start_addr, pmd_t *pmd,
> - int referenced)
> + struct vm_area_struct *vma, unsigned long start_addr,
> + pmd_t *pmd, int referenced, unsigned int order)
Nit, super nit really, but since other __collapse_huge_page_*() functions have
..., order, param) as their last parameters, perhaps worth flipping referenced +
order here?
Not a big deal though.
> {
> int swapped_in = 0;
> vm_fault_t ret = 0;
> - unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
> + unsigned long addr, end = start_addr + (PAGE_SIZE << order);
> int result;
> pte_t *pte = NULL;
> spinlock_t *ptl;
> @@ -1022,6 +1025,19 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> if (!is_swap_pte(vmf.orig_pte))
> continue;
>
> + /*
> + * TODO: Support swapin without leading to further mTHP
> + * collapses. Currently bringing in new pages via swapin may
> + * cause a future higher order collapse on a rescan of the same
> + * range.
> + */
Same comment as above re: this, i.e. that it's a pity but probably unavoidable
for now.
> + if (order != HPAGE_PMD_ORDER) {
> + pte_unmap(pte);
> + mmap_read_unlock(mm);
> + result = SCAN_EXCEED_SWAP_PTE;
> + goto out;
> + }
> +
> vmf.pte = pte;
> vmf.ptl = ptl;
> ret = do_swap_page(&vmf);
> @@ -1142,7 +1158,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * that case. Continuing to collapse causes inconsistency.
> */
> result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> - referenced);
> + referenced, HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
> }
> @@ -1190,6 +1206,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> if (pte) {
> result = __collapse_huge_page_isolate(vma, address, pte, cc,
> + HPAGE_PMD_ORDER,
> &compound_pagelist);
> spin_unlock(pte_ptl);
> } else {
> @@ -1220,6 +1237,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>
> result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> vma, address, pte_ptl,
> + HPAGE_PMD_ORDER,
> &compound_pagelist);
> pte_unmap(pte);
> if (unlikely(result != SCAN_SUCCEED))
> --
> 2.51.0
>
* Re: [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support
2025-10-27 16:00 ` Lorenzo Stoakes
@ 2025-11-10 13:20 ` Nico Pache
0 siblings, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-11-10 13:20 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Mon, Oct 27, 2025 at 10:02 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:07PM -0600, Nico Pache wrote:
> > generalize the order of the __collapse_huge_page_* functions
> > to support future mTHP collapse.
> >
> > mTHP collapse will not honor the khugepaged_max_ptes_shared or
> > khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> > shared or swapped entry.
> >
> > No functional changes in this patch.
> >
> > Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > Acked-by: David Hildenbrand <david@redhat.com>
> > Co-developed-by: Dev Jain <dev.jain@arm.com>
> > Signed-off-by: Dev Jain <dev.jain@arm.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
>
> Thanks for addressing the v10 stuff (didn't check at v11).
>
> Overall LGTM, so:
>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Thanks!
>
> Few minor nits below.
>
> > ---
> > mm/khugepaged.c | 78 ++++++++++++++++++++++++++++++-------------------
> > 1 file changed, 48 insertions(+), 30 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 36ee659acfbb..4ccebf5dda97 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -537,25 +537,25 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
> > }
> >
> > static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > - unsigned long start_addr,
> > - pte_t *pte,
> > - struct collapse_control *cc,
> > - struct list_head *compound_pagelist)
> > + unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
> > + unsigned int order, struct list_head *compound_pagelist)
>
> This series isn't the right place for it, but god do we need helper structs in
> this code... :)
Well, we have collapse_control! I can spend some time in a follow-up
series to better leverage this struct.
>
> > {
> > struct page *page = NULL;
> > struct folio *folio = NULL;
> > unsigned long addr = start_addr;
> > pte_t *_pte;
> > int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> > + const unsigned long nr_pages = 1UL << order;
> > + int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>
> Nit, but we should const-ify this too.
This gets converted to collapse_max_ptes_none in the future.
>
> >
> > - for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> > + for (_pte = pte; _pte < pte + nr_pages;
> > _pte++, addr += PAGE_SIZE) {
> > pte_t pteval = ptep_get(_pte);
> > if (pte_none_or_zero(pteval)) {
> > ++none_or_zero;
> > if (!userfaultfd_armed(vma) &&
> > (!cc->is_khugepaged ||
> > - none_or_zero <= khugepaged_max_ptes_none)) {
> > + none_or_zero <= max_ptes_none)) {
> > continue;
> > } else {
> > result = SCAN_EXCEED_NONE_PTE;
> > @@ -583,8 +583,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > /* See collapse_scan_pmd(). */
> > if (folio_maybe_mapped_shared(folio)) {
> > ++shared;
> > - if (cc->is_khugepaged &&
> > - shared > khugepaged_max_ptes_shared) {
> > + /*
> > + * TODO: Support shared pages without leading to further
> > + * mTHP collapses. Currently bringing in new pages via
> > + * shared may cause a future higher order collapse on a
> > + * rescan of the same range.
> > + */
>
> Yeah, I wish we could find a way to address this in some other way but given the
> mire of THP code putting this comment here for now is probably the only sensible
> way.
>
> > + if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> > + shared > khugepaged_max_ptes_shared)) {
> > result = SCAN_EXCEED_SHARED_PTE;
> > count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> > goto out;
> > @@ -677,18 +683,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > }
> >
> > static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> > - struct vm_area_struct *vma,
> > - unsigned long address,
> > - spinlock_t *ptl,
> > - struct list_head *compound_pagelist)
> > + struct vm_area_struct *vma, unsigned long address,
> > + spinlock_t *ptl, unsigned int order,
> > + struct list_head *compound_pagelist)
> > {
> > - unsigned long end = address + HPAGE_PMD_SIZE;
> > + unsigned long end = address + (PAGE_SIZE << order);
> > struct folio *src, *tmp;
> > pte_t pteval;
> > pte_t *_pte;
> > unsigned int nr_ptes;
> > + const unsigned long nr_pages = 1UL << order;
> >
> > - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
> > + for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
> > address += nr_ptes * PAGE_SIZE) {
> > nr_ptes = 1;
> > pteval = ptep_get(_pte);
> > @@ -741,13 +747,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> > }
> >
> > static void __collapse_huge_page_copy_failed(pte_t *pte,
> > - pmd_t *pmd,
> > - pmd_t orig_pmd,
> > - struct vm_area_struct *vma,
> > - struct list_head *compound_pagelist)
> > + pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> > + unsigned int order, struct list_head *compound_pagelist)
> > {
> > spinlock_t *pmd_ptl;
> > -
> > + const unsigned long nr_pages = 1UL << order;
> > /*
> > * Re-establish the PMD to point to the original page table
> > * entry. Restoring PMD needs to be done prior to releasing
> > @@ -761,7 +765,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> > * Release both raw and compound pages isolated
> > * in __collapse_huge_page_isolate.
> > */
> > - release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> > + release_pte_pages(pte, pte + nr_pages, compound_pagelist);
> > }
> >
> > /*
> > @@ -781,16 +785,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> > */
> > static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
> > pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> > - unsigned long address, spinlock_t *ptl,
> > + unsigned long address, spinlock_t *ptl, unsigned int order,
> > struct list_head *compound_pagelist)
> > {
> > unsigned int i;
> > int result = SCAN_SUCCEED;
> > -
> > + const unsigned long nr_pages = 1UL << order;
> > /*
> > * Copying pages' contents is subject to memory poison at any iteration.
> > */
> > - for (i = 0; i < HPAGE_PMD_NR; i++) {
> > + for (i = 0; i < nr_pages; i++) {
> > pte_t pteval = ptep_get(pte + i);
> > struct page *page = folio_page(folio, i);
> > unsigned long src_addr = address + i * PAGE_SIZE;
> > @@ -809,10 +813,10 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
> >
> > if (likely(result == SCAN_SUCCEED))
> > __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> > - compound_pagelist);
> > + order, compound_pagelist);
> > else
> > __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> > - compound_pagelist);
> > + order, compound_pagelist);
> >
> > return result;
> > }
> > @@ -985,13 +989,12 @@ static int check_pmd_still_valid(struct mm_struct *mm,
> > * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
> > */
> > static int __collapse_huge_page_swapin(struct mm_struct *mm,
> > - struct vm_area_struct *vma,
> > - unsigned long start_addr, pmd_t *pmd,
> > - int referenced)
> > + struct vm_area_struct *vma, unsigned long start_addr,
> > + pmd_t *pmd, int referenced, unsigned int order)
>
> Nit, super nit really, but since other __collapse_huge_page_*() functions have
> ..., order, param) as their last parameters, perhaps worth flipping referenced +
> order here?
>
> Not a big deal though.
>
> > {
> > int swapped_in = 0;
> > vm_fault_t ret = 0;
> > - unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
> > + unsigned long addr, end = start_addr + (PAGE_SIZE << order);
> > int result;
> > pte_t *pte = NULL;
> > spinlock_t *ptl;
> > @@ -1022,6 +1025,19 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> > if (!is_swap_pte(vmf.orig_pte))
> > continue;
> >
> > + /*
> > + * TODO: Support swapin without leading to further mTHP
> > + * collapses. Currently bringing in new pages via swapin may
> > + * cause a future higher order collapse on a rescan of the same
> > + * range.
> > + */
>
> Same comment as above re: this, i.e. that it's a pity but probably unavoidable
> for now.
>
> > + if (order != HPAGE_PMD_ORDER) {
> > + pte_unmap(pte);
> > + mmap_read_unlock(mm);
> > + result = SCAN_EXCEED_SWAP_PTE;
> > + goto out;
> > + }
> > +
> > vmf.pte = pte;
> > vmf.ptl = ptl;
> > ret = do_swap_page(&vmf);
> > @@ -1142,7 +1158,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > * that case. Continuing to collapse causes inconsistency.
> > */
> > result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> > - referenced);
> > + referenced, HPAGE_PMD_ORDER);
> > if (result != SCAN_SUCCEED)
> > goto out_nolock;
> > }
> > @@ -1190,6 +1206,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> > if (pte) {
> > result = __collapse_huge_page_isolate(vma, address, pte, cc,
> > + HPAGE_PMD_ORDER,
> > &compound_pagelist);
> > spin_unlock(pte_ptl);
> > } else {
> > @@ -1220,6 +1237,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >
> > result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> > vma, address, pte_ptl,
> > + HPAGE_PMD_ORDER,
> > &compound_pagelist);
> > pte_unmap(pte);
> > if (unlikely(result != SCAN_SUCCEED))
> > --
> > 2.51.0
> >
>
* Re: [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support
2025-10-22 18:37 ` [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
2025-10-27 9:17 ` Lance Yang
2025-10-27 16:00 ` Lorenzo Stoakes
@ 2025-11-08 3:01 ` Wei Yang
2 siblings, 0 replies; 91+ messages in thread
From: Wei Yang @ 2025-11-08 3:01 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:07PM -0600, Nico Pache wrote:
>generalize the order of the __collapse_huge_page_* functions
>to support future mTHP collapse.
>
>mTHP collapse will not honor the khugepaged_max_ptes_shared or
>khugepaged_max_ptes_swap parameters, and will fail if it encounters a
>shared or swapped entry.
>
>No functional changes in this patch.
>
>Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>Acked-by: David Hildenbrand <david@redhat.com>
>Co-developed-by: Dev Jain <dev.jain@arm.com>
>Signed-off-by: Dev Jain <dev.jain@arm.com>
>Signed-off-by: Nico Pache <npache@redhat.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
--
Wei Yang
Help you, Help me
* [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (4 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 05/15] khugepaged: generalize __collapse_huge_page_* for mTHP support Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 17:53 ` Lorenzo Stoakes
2025-10-22 18:37 ` [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse Nico Pache
` (9 subsequent siblings)
15 siblings, 1 reply; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
The current mechanism for determining mTHP collapse scales the
khugepaged_max_ptes_none value based on the target order. This
introduces an undesirable feedback loop, or "creep", when max_ptes_none
is set to a value greater than HPAGE_PMD_NR / 2.
With this configuration, a successful collapse to order N will populate
enough pages to satisfy the collapse condition on order N+1 on the next
scan. This leads to unnecessary work and memory churn.
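As an illustration (assuming a 4K base page size, so HPAGE_PMD_NR is 512 and
HPAGE_PMD_ORDER is 9): with max_ptes_none = 256, a successful order-4 collapse
populates 16 PTEs, leaving 16 none PTEs in the surrounding 32-PTE order-5
region; the limit scaled for order 5 is 256 >> 4 = 16, so that region also
qualifies on the next scan, and the same arithmetic repeats all the way up to
PMD order. Capping the value at 255 breaks the loop, since 255 >> 4 = 15 < 16.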
To fix this issue introduce a helper function that caps the max_ptes_none
to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
the max_ptes_none number by the (PMD_ORDER - target collapse order).
The limits can be ignored by passing full_scan=true, this is useful for
madvise_collapse (which ignores limits), or in the case of
collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
collapse is available.
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
1 file changed, 34 insertions(+), 1 deletion(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4ccebf5dda97..286c3a7afdee 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
wake_up_interruptible(&khugepaged_wait);
}
+/**
+ * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
+ * @order: The folio order being collapsed to
+ * @full_scan: Whether this is a full scan (ignore limits)
+ *
+ * For madvise-triggered collapses (full_scan=true), all limits are bypassed
+ * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
+ *
+ * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
+ * khugepaged_max_ptes_none value.
+ *
+ * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
+ * order, but caps it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
+ *
+ * Return: Maximum number of empty PTEs allowed for the collapse operation
+ */
+static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
+{
+ unsigned int max_ptes_none;
+
+ /* ignore max_ptes_none limits */
+ if (full_scan)
+ return HPAGE_PMD_NR - 1;
+
+ if (order == HPAGE_PMD_ORDER)
+ return khugepaged_max_ptes_none;
+
+ max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
+
+ return max_ptes_none >> (HPAGE_PMD_ORDER - order);
+
+}
+
void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
@@ -546,7 +579,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
const unsigned long nr_pages = 1UL << order;
- int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
+ int max_ptes_none = collapse_max_ptes_none(order, !cc->is_khugepaged);
for (_pte = pte; _pte < pte + nr_pages;
_pte++, addr += PAGE_SIZE) {
--
2.51.0
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-22 18:37 ` [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function Nico Pache
@ 2025-10-27 17:53 ` Lorenzo Stoakes
2025-10-28 10:09 ` Baolin Wang
2025-10-28 13:36 ` Nico Pache
0 siblings, 2 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-27 17:53 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
> The current mechanism for determining mTHP collapse scales the
> khugepaged_max_ptes_none value based on the target order. This
> introduces an undesirable feedback loop, or "creep", when max_ptes_none
> is set to a value greater than HPAGE_PMD_NR / 2.
>
> With this configuration, a successful collapse to order N will populate
> enough pages to satisfy the collapse condition on order N+1 on the next
> scan. This leads to unnecessary work and memory churn.
>
> To fix this issue introduce a helper function that caps the max_ptes_none
> to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
> the max_ptes_none number by the (PMD_ORDER - target collapse order).
>
> The limits can be ignored by passing full_scan=true, this is useful for
> madvise_collapse (which ignores limits), or in the case of
> collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
> collapse is available.
>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
> 1 file changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 4ccebf5dda97..286c3a7afdee 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
> wake_up_interruptible(&khugepaged_wait);
> }
>
> +/**
> + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
> + * @order: The folio order being collapsed to
> + * @full_scan: Whether this is a full scan (ignore limits)
> + *
> + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
> + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
> + *
> + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> + * khugepaged_max_ptes_none value.
> + *
> + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
> + * order, but caps it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
> + *
> + * Return: Maximum number of empty PTEs allowed for the collapse operation
> + */
> +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> +{
> + unsigned int max_ptes_none;
> +
> + /* ignore max_ptes_none limits */
> + if (full_scan)
> + return HPAGE_PMD_NR - 1;
> +
> + if (order == HPAGE_PMD_ORDER)
> + return khugepaged_max_ptes_none;
> +
> + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
I mean not to beat a dead horse re: v11 commentary, but I thought we were going
to implement David's idea re: the new 'eagerness' tunable, and now we're just
implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
I'm still really quite uncomfortable with us silently capping this value.
If we're putting forward theoretical ideas that are to be later built upon, this
series should be an RFC.
But if we really intend to silently ignore user input, the problem is that it then
becomes established uAPI.
I think it's _sensible_ to avoid this mTHP escalation problem, but the issue is
visibility I think.
I think people are going to find it odd that you set it to something, but then
get something else.
As an alternative we could have a new sysfs field:
/sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
That shows the cap clearly.
In fact, it could be read-only... and just expose it to the user. That reduces
complexity.
We can then bring in eagerness later and have the same situation of
max_ptes_none being a parameter that exists (plus this additional read-only
parameter).
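To make that concrete, a minimal sketch of what such a read-only attribute
could look like (the name, the cap shown and the __ATTR_RO wiring are all
illustrative only, not a settled proposal):

static ssize_t max_mthp_ptes_none_show(struct kobject *kobj,
                                       struct kobj_attribute *attr, char *buf)
{
        /* Expose the effective cap applied to mTHP collapses. */
        return sysfs_emit(buf, "%u\n",
                          min_t(unsigned int, khugepaged_max_ptes_none,
                                HPAGE_PMD_NR / 2 - 1));
}

static struct kobj_attribute max_mthp_ptes_none_attr =
        __ATTR_RO(max_mthp_ptes_none);

It would then just need hooking into the existing khugepaged sysfs attribute
group next to max_ptes_none.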
> +
> + return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> +
> +}
> +
> void khugepaged_enter_vma(struct vm_area_struct *vma,
> vm_flags_t vm_flags)
> {
> @@ -546,7 +579,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> pte_t *_pte;
> int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> const unsigned long nr_pages = 1UL << order;
> - int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> + int max_ptes_none = collapse_max_ptes_none(order, !cc->is_khugepaged);
>
> for (_pte = pte; _pte < pte + nr_pages;
> _pte++, addr += PAGE_SIZE) {
> --
> 2.51.0
>
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-27 17:53 ` Lorenzo Stoakes
@ 2025-10-28 10:09 ` Baolin Wang
2025-10-28 13:57 ` Nico Pache
2025-10-28 17:07 ` Lorenzo Stoakes
2025-10-28 13:36 ` Nico Pache
1 sibling, 2 replies; 91+ messages in thread
From: Baolin Wang @ 2025-10-28 10:09 UTC (permalink / raw)
To: Lorenzo Stoakes, Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, Liam.Howlett, ryan.roberts, dev.jain, corbet, rostedt,
mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On 2025/10/28 01:53, Lorenzo Stoakes wrote:
> On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
>> The current mechanism for determining mTHP collapse scales the
>> khugepaged_max_ptes_none value based on the target order. This
>> introduces an undesirable feedback loop, or "creep", when max_ptes_none
>> is set to a value greater than HPAGE_PMD_NR / 2.
>>
>> With this configuration, a successful collapse to order N will populate
>> enough pages to satisfy the collapse condition on order N+1 on the next
>> scan. This leads to unnecessary work and memory churn.
>>
>> To fix this issue introduce a helper function that caps the max_ptes_none
>> to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
>> the max_ptes_none number by the (PMD_ORDER - target collapse order).
>>
>> The limits can be ignored by passing full_scan=true, this is useful for
>> madvise_collapse (which ignores limits), or in the case of
>> collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
>> collapse is available.
>>
>> Signed-off-by: Nico Pache <npache@redhat.com>
>> ---
>> mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
>> 1 file changed, 34 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 4ccebf5dda97..286c3a7afdee 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
>> wake_up_interruptible(&khugepaged_wait);
>> }
>>
>> +/**
>> + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
>> + * @order: The folio order being collapsed to
>> + * @full_scan: Whether this is a full scan (ignore limits)
>> + *
>> + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
>> + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
>> + *
>> + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
>> + * khugepaged_max_ptes_none value.
>> + *
>> + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
>> + * order, but caps it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
>> + *
>> + * Return: Maximum number of empty PTEs allowed for the collapse operation
>> + */
>> +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
>> +{
>> + unsigned int max_ptes_none;
>> +
>> + /* ignore max_ptes_none limits */
>> + if (full_scan)
>> + return HPAGE_PMD_NR - 1;
>> +
>> + if (order == HPAGE_PMD_ORDER)
>> + return khugepaged_max_ptes_none;
>> +
>> + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
>
> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> to implement David's idea re: the new 'eagerness' tunable, and now we're just
> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
>
> I'm still really quite uncomfortable with us silently capping this value.
>
> If we're putting forward theoretical ideas that are to be later built upon, this
> series should be an RFC.
>
> But if we really intend to silently ignore user input, the problem is that it then
> becomes established uAPI.
>
> I think it's _sensible_ to avoid this mTHP escalation problem, but the issue is
> visibility I think.
>
> I think people are going to find it odd that you set it to something, but then
> get something else.
>
> As an alternative we could have a new sysfs field:
>
> /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
>
> That shows the cap clearly.
>
> In fact, it could be read-only... and just expose it to the user. That reduces
> complexity.
>
> We can then bring in eagerness later and have the same situation of
> max_ptes_none being a parameter that exists (plus this additional read-only
> parameter).
We all know that ultimately using David's suggestion to add the
'eagerness' tunable parameter is the best approach, but for now, we need
an initial version to support mTHP collapse (as we've already discussed
extensively here :)).
I don't like the idea of adding another, potentially confusing
'max_mthp_ptes_none' interface, which might make it more difficult to
accommodate the 'eagerness' parameter in the future.
If Nico's current proposal still doesn't satisfy everyone, I personally
lean towards David's earlier simplified approach:
max_ptes_none == 511 -> collapse mTHP always
max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
Let's get an initial approach in place first, which will also simplify
adding the 'eagerness' tunable parameter later.
Nico, Lorenzo, and David, what do you think?
Code should be:
static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
{
        /* ignore max_ptes_none limits */
        if (full_scan)
                return HPAGE_PMD_NR - 1;

        if (order == HPAGE_PMD_ORDER)
                return khugepaged_max_ptes_none;

        /*
         * For mTHP collapse, we can simplify the logic:
         * max_ptes_none == 511 -> collapse mTHP always
         * max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
         */
        if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
                return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);

        return 0;
}
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 10:09 ` Baolin Wang
@ 2025-10-28 13:57 ` Nico Pache
2025-10-28 17:07 ` Lorenzo Stoakes
1 sibling, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-28 13:57 UTC (permalink / raw)
To: Baolin Wang
Cc: Lorenzo Stoakes, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, david, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Tue, Oct 28, 2025 at 4:10 AM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
>
>
> On 2025/10/28 01:53, Lorenzo Stoakes wrote:
> > On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
> >> The current mechanism for determining mTHP collapse scales the
> >> khugepaged_max_ptes_none value based on the target order. This
> >> introduces an undesirable feedback loop, or "creep", when max_ptes_none
> >> is set to a value greater than HPAGE_PMD_NR / 2.
> >>
> >> With this configuration, a successful collapse to order N will populate
> >> enough pages to satisfy the collapse condition on order N+1 on the next
> >> scan. This leads to unnecessary work and memory churn.
> >>
> >> To fix this issue introduce a helper function that caps the max_ptes_none
> >> to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
> >> the max_ptes_none number by the (PMD_ORDER - target collapse order).
> >>
> >> The limits can be ignored by passing full_scan=true, this is useful for
> >> madvise_collapse (which ignores limits), or in the case of
> >> collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
> >> collapse is available.
> >>
> >> Signed-off-by: Nico Pache <npache@redhat.com>
> >> ---
> >> mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
> >> 1 file changed, 34 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >> index 4ccebf5dda97..286c3a7afdee 100644
> >> --- a/mm/khugepaged.c
> >> +++ b/mm/khugepaged.c
> >> @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
> >> wake_up_interruptible(&khugepaged_wait);
> >> }
> >>
> >> +/**
> >> + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
> >> + * @order: The folio order being collapsed to
> >> + * @full_scan: Whether this is a full scan (ignore limits)
> >> + *
> >> + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
> >> + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
> >> + *
> >> + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> >> + * khugepaged_max_ptes_none value.
> >> + *
> >> + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
> >> + * order, but caps it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
> >> + *
> >> + * Return: Maximum number of empty PTEs allowed for the collapse operation
> >> + */
> >> +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> >> +{
> >> +        unsigned int max_ptes_none;
> >> +
> >> +        /* ignore max_ptes_none limits */
> >> +        if (full_scan)
> >> +                return HPAGE_PMD_NR - 1;
> >> +
> >> +        if (order == HPAGE_PMD_ORDER)
> >> +                return khugepaged_max_ptes_none;
> >> +
> >> +        max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
> >
> > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> >
> > I'm still really quite uncomfortable with us silently capping this value.
> >
> > If we're putting forward theoretical ideas that are to be later built upon, this
> > series should be an RFC.
> >
> > But if we really intend to silently ignore user input, the problem is that it
> > then becomes established uAPI.
> >
> > I think it's _sensible_ to avoid this mTHP escalation problem, but the issue is
> > visibility I think.
> >
> > I think people are going to find it odd that you set it to something, but then
> > get something else.
> >
> > As an alternative we could have a new sysfs field:
> >
> > /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
> >
> > That shows the cap clearly.
> >
> > In fact, it could be read-only... and just expose it to the user. That reduces
> > complexity.
> >
> > We can then bring in eagerness later and have the same situation of
> > max_ptes_none being a parameter that exists (plus this additional read-only
> > parameter).
>
Hey Baolin,
> We all know that ultimately using David's suggestion to add the
> 'eagerness' tunable parameter is the best approach, but for now, we need
> an initial version to support mTHP collapse (as we've already discussed
> extensively here:)).
>
> I don't like the idea of adding another and potentially confusing
> 'max_mthp_ptes_none' interface, which might make it more difficult to
> accommodate the 'eagerness' parameter in the future.
>
> If Nico's current proposal still doesn't satisfy everyone, I personally
> lean towards David's earlier simplified approach:
> max_ptes_none == 511 -> collapse mTHP always
> max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
>
> Let's first have an initial approach in place, which will also simplify
> the following addition of the 'eagerness' tunable parameter.
>
> Nico, Lorenzo, and David, what do you think?
I still believe capping it at PMD_NR/2 provides the right mix between
preventing the undesired behavior and keeping some degree of
tunability, since the admin guide suggests max_ptes_none should be used.
I would be willing to compromise and take this other approach until
"eagerness" is in place. However, I do believe David's idea for
eagerness is to also cap max_ptes_none at PMD_NR/2 for the
second-highest eagerness level (i.e. 511, 255, ...). So in practice, we
won't see any behavioral changes when that series comes around; whereas
setting max_ptes_none=0 for mTHP initially, then adding eagerness, will
result in a change in behavior from the initial implementation.
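(For anyone skimming the thread: the cap-then-scale behaviour being debated
looks roughly like the sketch below. It is reconstructed from the commit
message and the quoted hunk above, which is trimmed here, so treat it as a
userspace approximation for poking at the numbers rather than the actual
patch; the constants and helper name only mirror the kernel's.)

#include <stdio.h>
#include <stdbool.h>

#define HPAGE_PMD_ORDER 9                       /* 4K pages assumed */
#define HPAGE_PMD_NR    (1 << HPAGE_PMD_ORDER)

static unsigned int khugepaged_max_ptes_none = 511;

static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
{
        unsigned int max_ptes_none;

        /* madvise_collapse and full PMD scans ignore the limits */
        if (full_scan)
                return HPAGE_PMD_NR - 1;

        if (order == HPAGE_PMD_ORDER)
                return khugepaged_max_ptes_none;

        /* cap below half the PMD to break the creep feedback loop ... */
        max_ptes_none = khugepaged_max_ptes_none;
        if (max_ptes_none > HPAGE_PMD_NR / 2 - 1)
                max_ptes_none = HPAGE_PMD_NR / 2 - 1;

        /* ... then scale proportionally to the target order */
        return max_ptes_none >> (HPAGE_PMD_ORDER - order);
}

int main(void)
{
        for (unsigned int order = HPAGE_PMD_ORDER; order >= 4; order--)
                printf("order %u: limit %u\n", order,
                       collapse_max_ptes_none(order, false));
        return 0;
}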
With that said, Lorenzo, David, what's the final verdict?
-- Nico
>
> Code should be:
> static unsigned int collapse_max_ptes_none(unsigned int order, bool
> full_scan)
> {
> unsigned int max_ptes_none;
>
> /* ignore max_ptes_none limits */
> if (full_scan)
> return HPAGE_PMD_NR - 1;
>
> if (order == HPAGE_PMD_ORDER)
> return khugepaged_max_ptes_none;
>
> /*
> * For mTHP collapse, we can simplify the logic:
> * max_ptes_none == 511 -> collapse mTHP always
> * max_ptes_none != 511 -> collapse mTHP only if we all PTEs
> are non-none/zero
> */
> if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
> return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER -
> order);
>
> return 0;
> }
Side note: Thank you Baolin for your review/testing of the V12 :)
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 10:09 ` Baolin Wang
2025-10-28 13:57 ` Nico Pache
@ 2025-10-28 17:07 ` Lorenzo Stoakes
2025-10-28 17:56 ` David Hildenbrand
1 sibling, 1 reply; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 17:07 UTC (permalink / raw)
To: Baolin Wang
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, david, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Tue, Oct 28, 2025 at 06:09:43PM +0800, Baolin Wang wrote:
>
>
> On 2025/10/28 01:53, Lorenzo Stoakes wrote:
> > On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
> > > The current mechanism for determining mTHP collapse scales the
> > > khugepaged_max_ptes_none value based on the target order. This
> > > introduces an undesirable feedback loop, or "creep", when max_ptes_none
> > > is set to a value greater than HPAGE_PMD_NR / 2.
> > >
> > > With this configuration, a successful collapse to order N will populate
> > > enough pages to satisfy the collapse condition on order N+1 on the next
> > > scan. This leads to unnecessary work and memory churn.
> > >
> > > To fix this issue introduce a helper function that caps the max_ptes_none
> > > to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
> > > the max_ptes_none number by the (PMD_ORDER - target collapse order).
> > >
> > > The limits can be ignored by passing full_scan=true, this is useful for
> > > madvise_collapse (which ignores limits), or in the case of
> > > collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
> > > collapse is available.
> > >
> > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > ---
> > > mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
> > > 1 file changed, 34 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index 4ccebf5dda97..286c3a7afdee 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
> > > wake_up_interruptible(&khugepaged_wait);
> > > }
> > >
> > > +/**
> > > + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
> > > + * @order: The folio order being collapsed to
> > > + * @full_scan: Whether this is a full scan (ignore limits)
> > > + *
> > > + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
> > > + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
> > > + *
> > > + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> > > + * khugepaged_max_ptes_none value.
> > > + *
> > > + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
> > > + * order, but caps it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
> > > + *
> > > + * Return: Maximum number of empty PTEs allowed for the collapse operation
> > > + */
> > > +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > > +{
> > > +        unsigned int max_ptes_none;
> > > +
> > > +        /* ignore max_ptes_none limits */
> > > +        if (full_scan)
> > > +                return HPAGE_PMD_NR - 1;
> > > +
> > > +        if (order == HPAGE_PMD_ORDER)
> > > +                return khugepaged_max_ptes_none;
> > > +
> > > +        max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
> >
> > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> >
> > I'm still really quite uncomfortable with us silently capping this value.
> >
> > If we're putting forward theoretical ideas that are to be later built upon, this
> > series should be an RFC.
> >
> > But if we really intend to silently ignore user input, the problem is that it
> > then becomes established uAPI.
> >
> > I think it's _sensible_ to avoid this mTHP escalation problem, but the issue is
> > visibility I think.
> >
> > I think people are going to find it odd that you set it to something, but then
> > get something else.
> >
> > As an alternative we could have a new sysfs field:
> >
> > /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
> >
> > That shows the cap clearly.
> >
> > In fact, it could be read-only... and just expose it to the user. That reduces
> > complexity.
> >
> > We can then bring in eagerness later and have the same situation of
> > max_ptes_none being a parameter that exists (plus this additional read-only
> > parameter).
>
> We all know that ultimately using David's suggestion to add the 'eagerness'
> tunable parameter is the best approach, but for now, we need an initial
> version to support mTHP collapse (as we've already discussed extensively
> here:)).
>
> I don't like the idea of adding another and potentially confusing
> 'max_mthp_ptes_none' interface, which might make it more difficult to
> accommodate the 'eagerness' parameter in the future.
See my reply to Nico, I disagree that it affects eagerness.
>
> If Nico's current proposal still doesn't satisfy everyone, I personally lean
It's not upstreamable. We cannot silently violate user expectation or silently
change behaviour like this.
> towards David's earlier simplified approach:
> max_ptes_none == 511 -> collapse mTHP always
> max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
Pretty sure David's suggestion was that max_ptes_none would literally get set to
511 if you specified 511, or 0 if you specified anything else.
Which would make things visible to users and not ignore their tunable setting,
which is the whole issue IMO.
But we can't do that, because we know at the very least Meta use small non-zero
values that they expect to be honoured.
So again we're stuck in the situation of max_ptes_none being ignored for mTHP
and users being totally unaware.
>
> Let's first have an initial approach in place, which will also simplify the
Well hang on, this isn't the same as 'do anything we like'.
It immediately becomes uAPI, and 'I'll do that later' often becomes 'I'll never
do that because I got too busy'.
Yes perhaps we have to wait for the eagerness parameter, but any interim
solution must be _solid_ and not do strange/unexpected things.
We've (and of course, it was a silly thing to do) provided the ability for users
to specify this max_ptes_none behaviour for khugepaged.
Suddenly putting an asterisk next to that like '*except mTHP where we totally
ignore you if you specify values we don't like' doesn't seem like a great way
forward.
As I said to Nico too, we _have_ to export and support max_ptes_none for uAPI
reasons. And presumably eagerness will want to specify different settings for
mTHP vs. PMD THP, so exposing this (read-only mind you) somehow isn't as crazy
as it might seem.
> following addition of the 'eagerness' tunable parameter.
>
> Nico, Lorenzo, and David, what do you think?
>
> Code should be:
> static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> {
>         unsigned int max_ptes_none;
>
>         /* ignore max_ptes_none limits */
>         if (full_scan)
>                 return HPAGE_PMD_NR - 1;
>
>         if (order == HPAGE_PMD_ORDER)
>                 return khugepaged_max_ptes_none;
>
>         /*
>          * For mTHP collapse, we can simplify the logic:
>          * max_ptes_none == 511 -> collapse mTHP always
>          * max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
>          */
>         if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
>                 return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>
>         return 0;
> }
Thanks, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 17:07 ` Lorenzo Stoakes
@ 2025-10-28 17:56 ` David Hildenbrand
2025-10-28 18:09 ` Lorenzo Stoakes
0 siblings, 1 reply; 91+ messages in thread
From: David Hildenbrand @ 2025-10-28 17:56 UTC (permalink / raw)
To: Lorenzo Stoakes, Baolin Wang
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
[...]
>
>> towards David's earlier simplified approach:
>> max_ptes_none == 511 -> collapse mTHP always
>> max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
>
> Pretty sure David's suggestion was that max_ptes_none would literally get set to
> 511 if you specified 511, or 0 if you specified anything else.
We had multiple incarnations of this approach, but the first one really was:
max_ptes_none == 511 -> collapse mTHP always
max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
And for the intermediate values
(1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
supported yet with other values
(2) treat it like max_ptes_none == 0 or (maybe better?) just disable
mTHP collapse
I still like that approach because it lets us defer solving the creep
problem until later and doesn't add a silent capping.
Using intermediate max_ptes_none values is really only reasonable with
the deferred shrinker today. And that one does not support mTHP even
with this series, so it's future work either way.
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 17:56 ` David Hildenbrand
@ 2025-10-28 18:09 ` Lorenzo Stoakes
2025-10-28 18:17 ` David Hildenbrand
0 siblings, 1 reply; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 18:09 UTC (permalink / raw)
To: David Hildenbrand
Cc: Baolin Wang, Nico Pache, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
(It'd be good if we could keep all the 'solutions' in one thread as I made a
detailed reply there and now all that will get lost across two threads but
*sigh* never mind. Insert rant about email development here.)
On Tue, Oct 28, 2025 at 06:56:10PM +0100, David Hildenbrand wrote:
> [...]
>
> >
> > > towards David's earlier simplified approach:
> > > max_ptes_none == 511 -> collapse mTHP always
> > > max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
> >
> > Pretty sure David's suggestion was that max_ptes_none would literally get set to
> > 511 if you specified 511, or 0 if you specified anything else.
>
> We had multiple incarnations of this approach, but the first one really was:
>
> max_ptes_none == 511 -> collapse mTHP always
But won't 511 mean we just 'creep' to maximum collapse again? Does that solve
anything?
> max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
>
> And for the intermediate values
>
> (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
> supported yet with other values
It feels a bit much to issue a kernel warning every time somebody twiddles that
value, and it's kind of against user expectation a bit.
But maybe it's the least worst way of communicating things. It's still
absolutely gross.
> (2) treat it like max_ptes_none == 0 or (maybe better?) just disable mTHP
> collapse
Yeah disabling mTHP collapse for these values seems sane, but it also seems that
we should be capping for this to work correctly no?
Also I think all this probably violates requirements of users who want to have
different behaviour for mTHP and PMD THP.
The default is 511 so we're in creep territory even with the damn default :)
>
>
> I still like that approach because it lets us defer solving the creep
> problem later and doesn't add a silent capping.
I mean it seems you're more or less saying allow creep. Which I'm kind of ok
with for a first pass thing, and defer it for later.
>
> Using intermediate max_ptes_none values are really only reasonable with the
> deferred shrinker today. And that one does not support mTHP even with this
> series, so it's future work either way.
Right, that's a nice fact to be aware of.
>
> --
> Cheers
>
> David / dhildenb
>
Thanks, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:09 ` Lorenzo Stoakes
@ 2025-10-28 18:17 ` David Hildenbrand
2025-10-28 18:41 ` Lorenzo Stoakes
0 siblings, 1 reply; 91+ messages in thread
From: David Hildenbrand @ 2025-10-28 18:17 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Baolin Wang, Nico Pache, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On 28.10.25 19:09, Lorenzo Stoakes wrote:
> (It'd be good if we could keep all the 'solutions' in one thread as I made a
> detailed reply there and now all that will get lost across two threads but
> *sigh* never mind. Insert rant about email development here.)
Yeah, I focused in my other mails on things to avoid creep while
allowing for mTHP collapse.
>
> On Tue, Oct 28, 2025 at 06:56:10PM +0100, David Hildenbrand wrote:
>> [...]
>>
>>>
>>>> towards David's earlier simplified approach:
>>>> max_ptes_none == 511 -> collapse mTHP always
>>>> max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
>>>
>>> Pretty sure David's suggestion was that max_ptes_none would literally get set to
>>> 511 if you specified 511, or 0 if you specified anything else.
>>
>> We had multiple incarnations of this approach, but the first one really was:
>>
>> max_ptes_none == 511 -> collapse mTHP always
>
> But won't 511 mean we just 'creep' to maximum collapse again? Does that solve
> anything?
No creep, because you'll always collapse.
Creep only happens if you wouldn't collapse a PMD without prior mTHP
collapse, but suddenly would in the same scenario simply because you had
prior mTHP collapse.
At least that's my understanding.
>
>> max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
>>
>> And for the intermediate values
>>
>> (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
>> supported yet with other values
>
> It feels a bit much to issue a kernel warning every time somebody twiddles that
> value, and it's kind of against user expectation a bit.
pr_warn_once() is what I meant.
>
> But maybe it's the least worst way of communicating things. It's still
> absolutely gross.
>
>> (2) treat it like max_ptes_none == 0 or (maybe better?) just disable mTHP
>> collapse
>
> Yeah disabling mTHP collapse for these values seems sane, but it also seems that
> we should be capping for this to work correctly no?
I didn't get the interaction with capping, can you elaborate?
>
> Also I think all this probably violates requirements of users who want to have
> different behaviour for mTHP and PMD THP.
>
> The default is 511 so we're in creep territory even with the damn default :)
I don't think so, but maybe I am wrong.
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:17 ` David Hildenbrand
@ 2025-10-28 18:41 ` Lorenzo Stoakes
2025-10-29 15:04 ` David Hildenbrand
0 siblings, 1 reply; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 18:41 UTC (permalink / raw)
To: David Hildenbrand
Cc: Baolin Wang, Nico Pache, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Tue, Oct 28, 2025 at 07:17:16PM +0100, David Hildenbrand wrote:
> On 28.10.25 19:09, Lorenzo Stoakes wrote:
> > (It'd be good if we could keep all the 'solutions' in one thread as I made a
> > detailed reply there and now all that will get lost across two threads but
> > *sigh* never mind. Insert rant about email development here.)
>
> Yeah, I focused in my other mails on things to avoid creep while allowing
> for mTHP collapse.
>
> >
> > On Tue, Oct 28, 2025 at 06:56:10PM +0100, David Hildenbrand wrote:
> > > [...]
> > >
> > > >
> > > > > towards David's earlier simplified approach:
> > > > > max_ptes_none == 511 -> collapse mTHP always
> > > > > max_ptes_none != 511 -> collapse mTHP only if all PTEs are non-none/zero
> > > >
> > > > Pretty sure David's suggestion was that max_ptes_none would literally get set to
> > > > 511 if you specified 511, or 0 if you specified anything else.
> > >
> > > We had multiple incarnations of this approach, but the first one really was:
> > >
> > > max_ptes_none == 511 -> collapse mTHP always
> >
> > But won't 511 mean we just 'creep' to maximum collapse again? Does that solve
> > anything?
>
> No creep, because you'll always collapse.
OK so in the 511 scenario, do we simply immediately collapse to the largest
possible _mTHP_ page size based on adjacent none/zero page entries in the
PTE, and _never_ collapse to PMD on this basis even if we do have sufficient
none/zero PTE entries to do so?
And only collapse to PMD size if we have sufficient adjacent PTE entries that
are populated?
Let's really nail this down actually so we can be super clear what the issue is
here.
>
> Creep only happens if you wouldn't collapse a PMD without prior mTHP
> collapse, but suddenly would in the same scenario simply because you had
> prior mTHP collapse.
>
> At least that's my understanding.
OK, that makes sense, is the logic (this may be part of the bit I haven't
reviewed yet tbh) then that for khugepaged mTHP we have the system where we
always require prior mTHP collapse _first_?
>
> >
> > > max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
> > >
> > > And for the intermediate values
> > >
> > > (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
> > > supported yet with other values
> >
> > It feels a bit much to issue a kernel warning every time somebody twiddles that
> > value, and it's kind of against user expectation a bit.
>
> pr_warn_once() is what I meant.
Right, but even then it feels a bit extreme, warnings are pretty serious
things. Then again there's precedent for this, and it may be the least worst
solution.
I just picture a cloud provider turning this on with mTHP then getting their
monitoring team reporting some urgent communication about warnings in dmesg :)
>
> >
> > But maybe it's the least worst way of communicating things. It's still
> > absolutely gross.
> >
> > > (2) treat it like max_ptes_none == 0 or (maybe better?) just disable mTHP
> > > collapse
> >
> > Yeah disabling mTHP collapse for these values seems sane, but it also seems that
> > we should be capping for this to work correctly no?
>
> I didn't get the interaction with capping, can you elaborate?
I think that's addressed in the discussion above, once we clarify the creep
thing then the rest should fall out.
>
> >
> > Also I think all this probably violates requirements of users who want to have
> > different behaviour for mTHP and PMD THP.
> >
> > The default is 511 so we're in creep territory even with the damn default :)
>
> I don't think so, but maybe I am wrong.
Discussed above.
>
>
> --
> Cheers
>
> David / dhildenb
>
Thanks, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:41 ` Lorenzo Stoakes
@ 2025-10-29 15:04 ` David Hildenbrand
2025-10-29 18:41 ` Lorenzo Stoakes
2025-10-29 20:45 ` Nico Pache
0 siblings, 2 replies; 91+ messages in thread
From: David Hildenbrand @ 2025-10-29 15:04 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Baolin Wang, Nico Pache, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
>>
>> No creep, because you'll always collapse.
>
> OK so in the 511 scenario, do we simply immediately collapse to the largest
> possible _mTHP_ page size if based on adjacent none/zero page entries in the
> PTE, and _never_ collapse to PMD on this basis even if we do have sufficient
> none/zero PTE entries to do so?
Right. And if we fail to allocate a PMD, we would collapse to smaller
sizes, and later, once a PMD is possible, collapse to a PMD.
But there is no creep, as we would have collapsed a PMD right from the
start either way.
>
> And only collapse to PMD size if we have sufficient adjacent PTE entries that
> are populated?
>
> Let's really nail this down actually so we can be super clear what the issue is
> here.
>
I hope what I wrote above made sense.
>
>>
>> Creep only happens if you wouldn't collapse a PMD without prior mTHP
>> collapse, but suddenly would in the same scenario simply because you had
>> prior mTHP collapse.
>>
>> At least that's my understanding.
>
> OK, that makes sense, is the logic (this may be part of the bit I haven't
> reviewed yet tbh) then that for khugepaged mTHP we have the system where we
> always require prior mTHP collapse _first_?
So I would describe creep as
"we would not collapse a PMD THP because max_ptes_none is violated, but
because we collapsed smaller mTHP THPs before, we essentially suddenly
have more PTEs that are not none-or-zero, making us suddenly collapse a
PMD THP at the same place".
Assume the following: max_ptes_none = 256
This means we would only collapse if at most half (256/512) of the PTEs
are none-or-zero.
But imagine the (simplified) PTE layout with PMD = 8 entries to simplify:
[ P Z P Z P Z Z Z ]
3 Present vs. 5 Zero -> do not collapse a PMD (8)
But assume we collapse smaller mTHP (2 entries) first
[ P P P P P P Z Z ]
We collapsed 3x "P Z" into "P P" because the ratio allowed for it.
Suddenly we have
6 Present vs 2 Zero and we collapse a PMD (8)
[ P P P P P P P P ]
That's the "creep" problem.
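If it helps to see that end to end, here is a tiny userspace model of the
same 8-entry example (the constants and helpers below are invented purely for
the illustration; nothing here is kernel code):

#include <stdio.h>
#include <stdbool.h>

#define PMD_ENTRIES   8                         /* pretend a PMD spans 8 PTEs */
#define PMD_ORDER     3                         /* log2(PMD_ENTRIES) */
#define MAX_PTES_NONE (PMD_ENTRIES / 2)         /* the "256 of 512" setting */

/* scaled threshold for a smaller collapse order, as in the series */
static int scaled_max_none(int order)
{
        return MAX_PTES_NONE >> (PMD_ORDER - order);
}

static int count_none(const bool *pte, int start, int nr)
{
        int none = 0;

        for (int i = start; i < start + nr; i++)
                if (!pte[i])
                        none++;
        return none;
}

int main(void)
{
        /* [ P Z P Z P Z Z Z ] from the example above */
        bool pte[PMD_ENTRIES] = { true, false, true, false,
                                  true, false, false, false };

        /* first scan: the PMD collapse is refused (5 none > limit 4) ... */
        printf("PMD scan:   %d none (limit %d)\n",
               count_none(pte, 0, PMD_ENTRIES), scaled_max_none(PMD_ORDER));

        /* ... but every order-1 pair with a single none PTE passes (1 <= 1) */
        for (int i = 0; i < PMD_ENTRIES; i += 2)
                if (count_none(pte, i, 2) <= scaled_max_none(1))
                        pte[i] = pte[i + 1] = true;     /* "collapse" fills the pair */

        /* second scan: only 2 none remain, so now the PMD collapses too */
        printf("PMD rescan: %d none (limit %d)\n",
               count_none(pte, 0, PMD_ENTRIES), scaled_max_none(PMD_ORDER));
        return 0;
}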
>
>>
>>>
>>>> max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
>>>>
>>>> And for the intermediate values
>>>>
>>>> (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
>>>> supported yet with other values
>>>
>>> It feels a bit much to issue a kernel warning every time somebody twiddles that
>>> value, and it's kind of against user expectation a bit.
>>
>> pr_warn_once() is what I meant.
>
> Right, but even then it feels a bit extreme, warnings are pretty serious
> things. Then again there's precedent for this, and it may be the least worse
> solution.
>
> I just picture a cloud provider turning this on with mTHP then getting their
> monitoring team reporting some urgent communication about warnings in dmesg :)
I mean, one could make the states mutually exclusive, maybe?
Disallow enabling mTHP with max_ptes_none set to unsupported values and
the other way around.
That would probably be cleanest, although the implementation might get a
bit more involved (but it's solvable).
But the concern could be that there are configs that could suddenly
break: someone that set max_ptes_none and enabled mTHP.
I'll note that we could also consider only supporting "max_ptes_none =
511" (default) to start with.
The nice thing about that value is that it is fully supported with the
underused shrinker, because max_ptes_none=511 -> never shrink.
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 15:04 ` David Hildenbrand
@ 2025-10-29 18:41 ` Lorenzo Stoakes
2025-10-29 21:10 ` Nico Pache
2025-10-29 20:45 ` Nico Pache
1 sibling, 1 reply; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-29 18:41 UTC (permalink / raw)
To: David Hildenbrand
Cc: Baolin Wang, Nico Pache, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 29, 2025 at 04:04:06PM +0100, David Hildenbrand wrote:
> > >
> > > No creep, because you'll always collapse.
> >
> > OK so in the 511 scenario, do we simply immediately collapse to the largest
> > possible _mTHP_ page size if based on adjacent none/zero page entries in the
> > PTE, and _never_ collapse to PMD on this basis even if we do have sufficient
> > none/zero PTE entries to do so?
>
> Right. And if we fail to allocate a PMD, we would collapse to smaller sizes,
> and later, once a PMD is possible, collapse to a PMD.
>
> But there is no creep, as we would have collapsed a PMD right from the start
> either way.
Hmm, would this mean at 511 mTHP collapse _across zero entries_ would only
ever collapse to PMD, except in cases where, for instance, PTE entries
belong to distinct VMAs and so you have to collapse to mTHP as a result?
Or IOW 'always collapse to the largest size you can I don't care if it
takes up more memory'
And at 0, we'd never collapse anything across zero entries, and only when
adjacent present entries can be collapsed to mTHP/PMD do we do so?
>
> >
> > And only collapse to PMD size if we have sufficient adjacent PTE entries that
> > are populated?
> >
> > Let's really nail this down actually so we can be super clear what the issue is
> > here.
> >
>
> I hope what I wrote above made sense.
Asking some q's still, probably more a me thing :)
>
> >
> > >
> > > Creep only happens if you wouldn't collapse a PMD without prior mTHP
> > > collapse, but suddenly would in the same scenario simply because you had
> > > prior mTHP collapse.
> > >
> > > At least that's my understanding.
> >
> > OK, that makes sense, is the logic (this may be part of the bit I haven't
> > reviewed yet tbh) then that for khugepaged mTHP we have the system where we
> > always require prior mTHP collapse _first_?
>
> So I would describe creep as
>
> "we would not collapse a PMD THP because max_ptes_none is violated, but
> because we collapsed smaller mTHP THPs before, we essentially suddenly have
> more PTEs that are not none-or-zero, making us suddenly collapse a PMD THP
> at the same place".
Yeah that makes sense.
>
> Assume the following: max_ptes_none = 256
>
> This means we would only collapse if at most half (256/512) of the PTEs are
> none-or-zero.
>
> But imagine the (simplified) PTE layout with PMD = 8 entries to simplify:
>
> [ P Z P Z P Z Z Z ]
>
> 3 Present vs. 5 Zero -> do not collapse a PMD (8)
OK I'm thinking this is more about /ratio/ than anything else.
PMD - <=50% - ok 5/8 = 62.5% no collapse.
>
> But assume we collapse smaller mTHP (2 entries) first
>
> [ P P P P P P Z Z ]
...512 KB mTHP (2 entries) - <= 50% means we can do...
>
> We collapsed 3x "P Z" into "P P" because the ratio allowed for it.
Yes so that's:
[ P Z P Z P Z Z Z ]
->
[ P P P P P P Z Z ]
Right?
>
> Suddenly we have
>
> 6 Present vs 2 Zero and we collapse a PMD (8)
>
> [ P P P P P P P P ]
>
> That's the "creep" problem.
I guess we try PMD collapse first then mTHP, but the worry is another pass
will collapse to PMD right?
Whereas < 50% ratio means we never end up 'propagating' or 'creeping' like
this because each collapse never provides enough reduction in zero entries
to allow for higher order collapse.
Hence the idea of capping at 255
>
> >
> > >
> > > >
> > > > > max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
> > > > >
> > > > > And for the intermediate values
> > > > >
> > > > > (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
> > > > > supported yet with other values
> > > >
> > > > It feels a bit much to issue a kernel warning every time somebody twiddles that
> > > > value, and it's kind of against user expectation a bit.
> > >
> > > pr_warn_once() is what I meant.
> >
> > Right, but even then it feels a bit extreme, warnings are pretty serious
> > things. Then again there's precedent for this, and it may be the least worse
> > solution.
> >
> > I just picture a cloud provider turning this on with mTHP then getting their
> > monitoring team reporting some urgent communication about warnings in dmesg :)
>
> I mean, one could make the states mutually exclusive, maybe?
>
> Disallow enabling mTHP with max_ptes_none set to unsupported values and the
> other way around.
>
> That would probably be cleanest, although the implementation might get a bit
> more involved (but it's solvable).
>
> But the concern could be that there are configs that could suddenly break:
> someone that set max_ptes_none and enabled mTHP.
Yeah we could always return an error on setting to an unsupported value.
I mean pr_warn() is nasty but maybe necessary.
>
>
> I'll note that we could also consider only supporting "max_ptes_none = 511"
> (default) to start with.
>
> The nice thing about that value is that it is fully supported with the
> underused shrinker, because max_ptes_none=511 -> never shrink.
It feels like = 0 would be useful though?
>
> --
> Cheers
>
> David / dhildenb
>
Thanks, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 18:41 ` Lorenzo Stoakes
@ 2025-10-29 21:10 ` Nico Pache
2025-10-30 18:03 ` Lorenzo Stoakes
0 siblings, 1 reply; 91+ messages in thread
From: Nico Pache @ 2025-10-29 21:10 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: David Hildenbrand, Baolin Wang, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 29, 2025 at 12:42 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 29, 2025 at 04:04:06PM +0100, David Hildenbrand wrote:
> > > >
> > > > No creep, because you'll always collapse.
> > >
> > > OK so in the 511 scenario, do we simply immediately collapse to the largest
> > > possible _mTHP_ page size if based on adjacent none/zero page entries in the
> > > PTE, and _never_ collapse to PMD on this basis even if we do have sufficient
> > > none/zero PTE entries to do so?
> >
> > Right. And if we fail to allocate a PMD, we would collapse to smaller sizes,
> > and later, once a PMD is possible, collapse to a PMD.
> >
> > But there is no creep, as we would have collapsed a PMD right from the start
> > either way.
>
> Hmm, would this mean at 511 mTHP collapse _across zero entries_ would only
> ever collapse to PMD, except in cases where, for instance, PTE entries
> belong to distinct VMAs and so you have to collapse to mTHP as a result?
There are a few failure cases, like exceeding thresholds or allocation
failures, but yes, your assessment is correct.
At 511, the PMD collapse will be satisfied by a single present PTE. If the
collapse fails, we will try both halves of the PMD (1024KB, 1024KB); the one
that contains the non-none PTE will collapse.
This is where the (HPAGE_PMD_ORDER - order) shift comes from. Take the 511
case above:
511 >> (HPAGE_PMD_ORDER - 9) == 511 >> 0 = 511 max_ptes_none (PMD, order 9)
511 >> (HPAGE_PMD_ORDER - 8) == 511 >> 1 = 255 max_ptes_none (1024KB, order 8)
Both of these align to the order's PTE count minus 1.
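A trivial way to sanity-check that arithmetic (toy userspace code assuming
4K pages, so HPAGE_PMD_ORDER is 9; it only shows the shift, not anything
khugepaged actually runs):

#include <stdio.h>

#define HPAGE_PMD_ORDER 9

int main(void)
{
        unsigned int max_ptes_none = 511;       /* the "always collapse" setting */

        /* each order's limit ends up being that order's PTE count minus 1 */
        for (int order = HPAGE_PMD_ORDER; order >= 2; order--)
                printf("order %d (%4d PTEs): max_ptes_none = %u\n",
                       order, 1 << order,
                       max_ptes_none >> (HPAGE_PMD_ORDER - order));
        return 0;
}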
>
> Or IOW 'always collapse to the largest size you can I don't care if it
> takes up more memory'
>
> And at 0, we'd never collapse anything across zero entries, and only when
> adjacent present entries can be collapse to mTHP/PMD do we do so?
Yep!
max_ptes_none = 0 + all mTHP sizes enabled gives you a really good
distribution of mTHP sizes in the system, as zero memory will be
wasted and the most space-efficient size will be found -- at least
for the memory allocated through khugepaged. The Defer patchset I had
on top of this series was exactly for that purpose: allow khugepaged
to determine all the THP usage in the system (other than madvise), and
allow granular control of memory waste.
>
> >
> > >
> > > And only collapse to PMD size if we have sufficient adjacent PTE entries that
> > > are populated?
> > >
> > > Let's really nail this down actually so we can be super clear what the issue is
> > > here.
> > >
> >
> > I hope what I wrote above made sense.
>
> Asking some q's still, probably more a me thing :)
>
> >
> > >
> > > >
> > > > Creep only happens if you wouldn't collapse a PMD without prior mTHP
> > > > collapse, but suddenly would in the same scenario simply because you had
> > > > prior mTHP collapse.
> > > >
> > > > At least that's my understanding.
> > >
> > > OK, that makes sense, is the logic (this may be part of the bit I haven't
> > > reviewed yet tbh) then that for khugepaged mTHP we have the system where we
> > > always require prior mTHP collapse _first_?
> >
> > So I would describe creep as
> >
> > "we would not collapse a PMD THP because max_ptes_none is violated, but
> > because we collapsed smaller mTHP THPs before, we essentially suddenly have
> > more PTEs that are not none-or-zero, making us suddenly collapse a PMD THP
> > at the same place".
>
> Yeah that makes sense.
>
> >
> > Assume the following: max_ptes_none = 256
> >
> > This means we would only collapse if at most half (256/512) of the PTEs are
> > none-or-zero.
> >
> > But imagine the (simplified) PTE layout with PMD = 8 entries to simplify:
> >
> > [ P Z P Z P Z Z Z ]
> >
> > 3 Present vs. 5 Zero -> do not collapse a PMD (8)
>
> OK I'm thinking this is more about /ratio/ than anything else.
>
> PMD - <=50% - ok 5/8 = 62.5% no collapse.
< 50%*.
At 50% it's 256 which is actually the worst case scenario. But I read
further, and it seems like you grasped the issue.
>
> >
> > But assume we collapse smaller mTHP (2 entries) first
> >
> > [ P P P P P P Z Z ]
>
> ...512 KB mTHP (2 entries) - <= 50% means we can do...
>
> >
> > We collapsed 3x "P Z" into "P P" because the ratio allowed for it.
>
> Yes so that's:
>
> [ P Z P Z P Z Z Z ]
>
> ->
>
> [ P P P P P P Z Z ]
>
> Right?
>
> >
> > Suddenly we have
> >
> > 6 Present vs 2 Zero and we collapse a PMD (8)
> >
> > [ P P P P P P P P ]
> >
> > That's the "creep" problem.
>
> I guess we try PMD collapse first then mTHP, but the worry is another pass
> will collapse to PMD right?
>
>
> Whereas < 50% ratio means we never end up 'propagating' or 'creeping' like
> this because each collapse never provides enough reduction in zero entries
> to allow for higher order collapse.
>
> Hence the idea of capping at 255
Yep! We've discussed other solutions, like tracking collapsed pages,
or the solutions brought up by David. But this seemed like the most
logical to me, as it keeps some of the tunability. I now understand
the concern wasn't so much the capping, but rather the silent nature of
it, and the uAPI expectations surrounding enforcing such a limit (for
both past and future behavioral expectations).
>
> >
> > >
> > > >
> > > > >
> > > > > > max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
> > > > > >
> > > > > > And for the intermediate values
> > > > > >
> > > > > > (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
> > > > > > supported yet with other values
> > > > >
> > > > > It feels a bit much to issue a kernel warning every time somebody twiddles that
> > > > > value, and it's kind of against user expectation a bit.
> > > >
> > > > pr_warn_once() is what I meant.
> > >
> > > Right, but even then it feels a bit extreme, warnings are pretty serious
> > > things. Then again there's precedent for this, and it may be the least worse
> > > solution.
> > >
> > > I just picture a cloud provider turning this on with mTHP then getting their
> > > monitoring team reporting some urgent communication about warnings in dmesg :)
> >
> > I mean, one could make the states mutually exclusive, maybe?
> >
> > Disallow enabling mTHP with max_ptes_none set to unsupported values and the
> > other way around.
> >
> > That would probably be cleanest, although the implementation might get a bit
> > more involved (but it's solvable).
> >
> > But the concern could be that there are configs that could suddenly break:
> > someone that set max_ptes_none and enabled mTHP.
>
> Yeah we could always return an error on setting to an unsupported value.
>
> I mean pr_warn() is nasty but maybe necessary.
>
> >
> >
> > I'll note that we could also consider only supporting "max_ptes_none = 511"
> > (default) to start with.
> >
> > The nice thing about that value is that it is fully supported with the
> > underused shrinker, because max_ptes_none=511 -> never shrink.
>
> It feels like = 0 would be useful though?
I personally think the default of 511 is wrong and should be on the
lower end of the scale. The exception being thp=always, where I
believe the kernel should treat it as 511.
But the second part of that would also violate the user's max_ptes_none
setting, so it's probably much harder in practice, and also not really
part of this series, just my opinion.
Cheers.
-- Nico
>
> >
> > --
> > Cheers
> >
> > David / dhildenb
> >
>
> Thanks, Lorenzo
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 21:10 ` Nico Pache
@ 2025-10-30 18:03 ` Lorenzo Stoakes
0 siblings, 0 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-30 18:03 UTC (permalink / raw)
To: Nico Pache
Cc: David Hildenbrand, Baolin Wang, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 29, 2025 at 03:10:19PM -0600, Nico Pache wrote:
> On Wed, Oct 29, 2025 at 12:42 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > On Wed, Oct 29, 2025 at 04:04:06PM +0100, David Hildenbrand wrote:
> > > > >
> > > > > No creep, because you'll always collapse.
> > > >
> > > > OK so in the 511 scenario, do we simply immediately collapse to the largest
> > > > possible _mTHP_ page size if based on adjacent none/zero page entries in the
> > > > PTE, and _never_ collapse to PMD on this basis even if we do have sufficient
> > > > none/zero PTE entries to do so?
> > >
> > > Right. And if we fail to allocate a PMD, we would collapse to smaller sizes,
> > > and later, once a PMD is possible, collapse to a PMD.
> > >
> > > But there is no creep, as we would have collapsed a PMD right from the start
> > > either way.
> >
> > Hmm, would this mean at 511 mTHP collapse _across zero entries_ would only
> > ever collapse to PMD, except in cases where, for instance, PTE entries
> > belong to distinct VMAs and so you have to collapse to mTHP as a result?
>
> There are a few failure cases, like exceeding thresholds, or
> allocations failures, but yes your assessment is correct.
Yeah of course being mm there are thorny edge cases :) we do love those...
>
> At 511, the PMD collapse will be satisfied by a single PTE. If the
> collapse fails we will try both sides of the PMD (1024kb , 1024kb).
> the one that contains the non-none PTE will collapse
Right yes.
>
> This is where the (HPAGE_PMD_ORDER - order) comes from.
> imagine the 511 case above
> 511 >> HPAGE_PMD_ORDER - 9 == 511 >> 0 = 511 max ptes none
> 511 >> PMD_ORDER - 8 (1024kb) == 511 >> 1 = 255 max_ptes_none
>
> both of these align to the orders size minus 1.
Right.
>
> >
> > Or IOW 'always collapse to the largest size you can I don't care if it
> > takes up more memory'
> >
> > And at 0, we'd never collapse anything across zero entries, and only when
> > adjacent present entries can be collapse to mTHP/PMD do we do so?
>
> Yep!
>
> max_pte_none =0 + all mTHP sizes enabled, gives you a really good
> distribution of mTHP sizes in the systems, as zero memory will be
> wasted and the most optimal size (space wise) will eb found. At least
> for the memory allocated through khugepaged. The Defer patchset I had
> on top of this series was exactly for that purpose-- Allow khugepaged
> to determine all the THP usage in the system (other than madvise), and
> allow granular control of memory waste.
Yeah, well it's a trade off really isn't it on 'eagerness' to collapse
non-present entries :)
But we'll come back to that when David has time :)
>
> >
> > >
> > > >
> > > > And only collapse to PMD size if we have sufficient adjacent PTE entries that
> > > > are populated?
> > > >
> > > > Let's really nail this down actually so we can be super clear what the issue is
> > > > here.
> > > >
> > >
> > > I hope what I wrote above made sense.
> >
> > Asking some q's still, probably more a me thing :)
> >
> > >
> > > >
> > > > >
> > > > > Creep only happens if you wouldn't collapse a PMD without prior mTHP
> > > > > collapse, but suddenly would in the same scenario simply because you had
> > > > > prior mTHP collapse.
> > > > >
> > > > > At least that's my understanding.
> > > >
> > > > OK, that makes sense, is the logic (this may be part of the bit I haven't
> > > > reviewed yet tbh) then that for khugepaged mTHP we have the system where we
> > > > always require prior mTHP collapse _first_?
> > >
> > > So I would describe creep as
> > >
> > > "we would not collapse a PMD THP because max_ptes_none is violated, but
> > > because we collapsed smaller mTHP THPs before, we essentially suddenly have
> > > more PTEs that are not none-or-zero, making us suddenly collapse a PMD THP
> > > at the same place".
> >
> > Yeah that makes sense.
> >
> > >
> > > Assume the following: max_ptes_none = 256
> > >
> > > This means we would only collapse if at most half (256/512) of the PTEs are
> > > none-or-zero.
> > >
> > > But imagine the (simplified) PTE layout with PMD = 8 entries to simplify:
> > >
> > > [ P Z P Z P Z Z Z ]
> > >
> > > 3 Present vs. 5 Zero -> do not collapse a PMD (8)
> >
> > OK I'm thinking this is more about /ratio/ than anything else.
> >
> > PMD - <=50% - ok 5/8 = 62.5% no collapse.
>
> < 50%*.
>
> At 50% it's 256 which is actually the worst case scenario. But I read
> further, and it seems like you grasped the issue.
Yeah this is < 50% vs. <= 50% which are fundamentally different obviously :)
>
> >
> > >
> > > But assume we collapse smaller mTHP (2 entries) first
> > >
> > > [ P P P P P P Z Z ]
> >
> > ...512 KB mTHP (2 entries) - <= 50% means we can do...
> >
> > >
> > > We collapsed 3x "P Z" into "P P" because the ratio allowed for it.
> >
> > Yes so that's:
> >
> > [ P Z P Z P Z Z Z ]
> >
> > ->
> >
> > [ P P P P P P Z Z ]
> >
> > Right?
> >
> > >
> > > Suddenly we have
> > >
> > > 6 Present vs 2 Zero and we collapse a PMD (8)
> > >
> > > [ P P P P P P P P ]
> > >
> > > That's the "creep" problem.
> >
> > I guess we try PMD collapse first then mTHP, but the worry is another pass
> > will collapse to PMD right?
> >
> >
> > Whereas < 50% ratio means we never end up 'propagating' or 'creeping' like
> > this because each collapse never provides enough reduction in zero entries
> > to allow for higher order collapse.
> >
> > Hence the idea of capping at 255
>
> Yep! We've discussed other solutions, like tracking collapsed pages,
> or the solutions brought up by David. But this seemed like the most
> logical to me, as it keeps some of the tunability. I now understand
> the concern wasnt so much the capping, but rather the silent nature of
> it, and the uAPI expectations surrounding enforcing such a limit (for
> both past and future behavioral expectations).
Yes, that's the primary concern on my side.
>
> >
> > >
> > > >
> > > > >
> > > > > >
> > > > > > > max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
> > > > > > >
> > > > > > > And for the intermediate values
> > > > > > >
> > > > > > > (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
> > > > > > > supported yet with other values
> > > > > >
> > > > > > It feels a bit much to issue a kernel warning every time somebody twiddles that
> > > > > > value, and it's kind of against user expectation a bit.
> > > > >
> > > > > pr_warn_once() is what I meant.
> > > >
> > > > Right, but even then it feels a bit extreme, warnings are pretty serious
> > > > things. Then again there's precedent for this, and it may be the least worse
> > > > solution.
> > > >
> > > > I just picture a cloud provider turning this on with mTHP then getting their
> > > > monitoring team reporting some urgent communication about warnings in dmesg :)
> > >
> > > I mean, one could make the states mutually exclusive, maybe?
> > >
> > > Disallow enabling mTHP with max_ptes_none set to unsupported values and the
> > > other way around.
> > >
> > > That would probably be cleanest, although the implementation might get a bit
> > > more involved (but it's solvable).
> > >
> > > But the concern could be that there are configs that could suddenly break:
> > > someone that set max_ptes_none and enabled mTHP.
> >
> > Yeah we could always return an error on setting to an unsupported value.
> >
> > I mean pr_warn() is nasty but maybe necessary.
> >
> > >
> > >
> > > I'll note that we could also consider only supporting "max_ptes_none = 511"
> > > (default) to start with.
> > >
> > > The nice thing about that value is that it is fully supported with the
> > > underused shrinker, because max_ptes_none=511 -> never shrink.
> >
> > It feels like = 0 would be useful though?
>
> I personally think the default of 511 is wrong and should be on the
> lower end of the scale. The exception being thp=always, where I
> believe the kernel should treat it as 511.
I think that'd be confusing to have different behaviour for thp=always, and I'd
rather we didn't do that.
But ultimately it's all moot I think as these are all uAPI things now.
It was a mistake to even export this IMO, but that can't be helped now :)
>
> But the second part of that would also violate the users max_ptes_none
> setting, so it's probably much harder in practice, and also not really
> part of this series, just my opinion.
I'm confused what you mean here?
In any case I think the 511/0 solution is the way forwards.
>
> Cheers.
> -- Nico
>
> >
> > >
> > > --
> > > Cheers
> > >
> > > David / dhildenb
> > >
> >
> > Thanks, Lorenzo
> >
>
Cheers, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 15:04 ` David Hildenbrand
2025-10-29 18:41 ` Lorenzo Stoakes
@ 2025-10-29 20:45 ` Nico Pache
1 sibling, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-29 20:45 UTC (permalink / raw)
To: David Hildenbrand
Cc: Lorenzo Stoakes, Baolin Wang, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 29, 2025 at 9:04 AM David Hildenbrand <david@redhat.com> wrote:
>
> >>
> >> No creep, because you'll always collapse.
> >
> > OK so in the 511 scenario, do we simply immediately collapse to the largest
> > possible _mTHP_ page size if based on adjacent none/zero page entries in the
> > PTE, and _never_ collapse to PMD on this basis even if we do have sufficient
> > none/zero PTE entries to do so?
>
> Right. And if we fail to allocate a PMD, we would collapse to smaller
> sizes, and later, once a PMD is possible, collapse to a PMD.
>
> But there is no creep, as we would have collapsed a PMD right from the
> start either way.
>
> >
> > And only collapse to PMD size if we have sufficient adjacent PTE entries that
> > are populated?
> >
> > Let's really nail this down actually so we can be super clear what the issue is
> > here.
> >
>
> I hope what I wrote above made sense.
>
> >
> >>
> >> Creep only happens if you wouldn't collapse a PMD without prior mTHP
> >> collapse, but suddenly would in the same scenario simply because you had
> >> prior mTHP collapse.
> >>
> >> At least that's my understanding.
> >
> > OK, that makes sense, is the logic (this may be part of the bit I haven't
> > reviewed yet tbh) then that for khugepaged mTHP we have the system where we
> > always require prior mTHP collapse _first_?
>
> So I would describe creep as
>
> "we would not collapse a PMD THP because max_ptes_none is violated, but
> because we collapsed smaller mTHPs before, we essentially suddenly
> have more PTEs that are not none-or-zero, making us collapse a
> PMD THP at the same place".
>
> Assume the following: max_ptes_none = 256
>
> This means we would only collapse if at most half (256/512) of the PTEs
> are none-or-zero.
>
> But imagine a (simplified) PTE layout with a PMD of 8 entries:
>
> [ P Z P Z P Z Z Z ]
>
> 3 Present vs. 5 Zero -> do not collapse a PMD (8)
>
> But assume we collapse smaller mTHPs (2 entries) first
>
> [ P P P P P P Z Z ]
>
> We collapsed 3x "P Z" into "P P" because the ratio allowed for it.
>
> Suddenly we have
>
> 6 Present vs 2 Zero and we collapse a PMD (8)
>
> [ P P P P P P P P ]
>
> That's the "creep" problem.
I'd like to add a little to this:
The worst case scenario is all mTHP sizes enabled and a value of 256.
A 16kB collapse would then lead all the way up to a PMD collapse,
stopping to collapse at each mTHP level on each subsequent scan of the
same PMD range. The larger the max_ptes_none value is, the fewer "stops"
it will make before reaching PMD size, but it will ultimately creep
up to a PMD. Hence the cap. At 511, a single present PTE in a range will
always satisfy the PMD collapse, so we will never attempt any other
orders (other than in the case of the collapse failing, which David
explains above).
Hopefully that helps give some more insight into the creep problem.
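For anyone who wants to see that sequence play out, here is a minimal
userspace sketch (purely illustrative, not kernel code; the 8-entry "PMD",
MAX_PTES_NONE = 4 and the scan() helper are stand-ins mirroring David's
simplified example above, using the same per-order scaling as this series,
max_ptes_none >> (PMD_ORDER - order)):

#include <stdio.h>
#include <string.h>

#define PMD_ENTRIES   8	/* stand-in for HPAGE_PMD_NR */
#define PMD_ORDER     3	/* log2(PMD_ENTRIES) */
#define MAX_PTES_NONE 4	/* analog of 256 out of 512 */

/* 'P' = present PTE, 'Z' = none/zero PTE */
static int count_none(const char *pte, int start, int n)
{
	int i, none = 0;

	for (i = start; i < start + n; i++)
		if (pte[i] == 'Z')
			none++;
	return none;
}

/*
 * One simulated khugepaged pass over the PMD range: for each naturally
 * aligned window, collapse (fill with 'P') when the scaled none limit is
 * met and at least one PTE is present.
 */
static void scan(char *pte)
{
	int order;

	for (order = PMD_ORDER; order >= 1; order--) {
		int n = 1 << order;
		int limit = MAX_PTES_NONE >> (PMD_ORDER - order);
		int start;

		for (start = 0; start < PMD_ENTRIES; start += n) {
			int none = count_none(pte, start, n);

			if (none < n && none <= limit)
				memset(pte + start, 'P', n);
		}
	}
}

int main(void)
{
	char pte[PMD_ENTRIES + 1] = "PZPZPZZZ";
	int i;

	printf("initial: %s\n", pte);
	for (i = 1; i <= 2; i++) {
		scan(pte);
		printf("scan %d:  %s\n", i, pte);
	}
	return 0;
}

The first scan cannot collapse the PMD (5 none > 4) but fills in the
smaller windows; the second scan then collapses the PMD only because of
that backfill, which is exactly the feedback loop the cap is meant to
break.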
Cheers
-- Nico
>
> >
> >>
> >>>
> >>>> max_ptes_none == 0 -> collapse mTHP only if all non-none/zero
> >>>>
> >>>> And for the intermediate values
> >>>>
> >>>> (1) pr_warn() when mTHPs are enabled, stating that mTHP collapse is not
> >>>> supported yet with other values
> >>>
> >>> It feels a bit much to issue a kernel warning every time somebody twiddles that
> >>> value, and it's kind of against user expectation a bit.
> >>
> >> pr_warn_once() is what I meant.
> >
> > Right, but even then it feels a bit extreme; warnings are pretty serious
> > things. Then again there's precedent for this, and it may be the least worst
> > solution.
> >
> > I just picture a cloud provider turning this on with mTHP then getting their
> > monitoring team reporting some urgent communication about warnings in dmesg :)
>
> I mean, one could make the states mutually exclusive, maybe?
>
> Disallow enabling mTHP with max_ptes_none set to unsupported values and
> the other way around.
>
> That would probably be cleanest, although the implementation might get a
> bit more involved (but it's solvable).
>
> But the concern could be that there are configs that could suddenly
> break: someone that set max_ptes_none and enabled mTHP.
>
>
> I'll note that we could also consider only supporting "max_ptes_none =
> 511" (default) to start with.
>
> The nice thing about that value is that it is fully supported with the
> underused shrinker, because max_ptes_none=511 -> never shrink.
>
> --
> Cheers
>
> David / dhildenb
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-27 17:53 ` Lorenzo Stoakes
2025-10-28 10:09 ` Baolin Wang
@ 2025-10-28 13:36 ` Nico Pache
2025-10-28 14:15 ` David Hildenbrand
2025-10-28 16:57 ` Lorenzo Stoakes
1 sibling, 2 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-28 13:36 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Mon, Oct 27, 2025 at 11:54 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
> > The current mechanism for determining mTHP collapse scales the
> > khugepaged_max_ptes_none value based on the target order. This
> > introduces an undesirable feedback loop, or "creep", when max_ptes_none
> > is set to a value greater than HPAGE_PMD_NR / 2.
> >
> > With this configuration, a successful collapse to order N will populate
> > enough pages to satisfy the collapse condition on order N+1 on the next
> > scan. This leads to unnecessary work and memory churn.
> >
> > To fix this issue introduce a helper function that caps the max_ptes_none
> > to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
> > the max_ptes_none number by the (PMD_ORDER - target collapse order).
> >
> > The limits can be ignored by passing full_scan=true, this is useful for
> > madvise_collapse (which ignores limits), or in the case of
> > collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
> > collapse is available.
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
> > 1 file changed, 34 insertions(+), 1 deletion(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 4ccebf5dda97..286c3a7afdee 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
> > wake_up_interruptible(&khugepaged_wait);
> > }
> >
> > +/**
> > + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
> > + * @order: The folio order being collapsed to
> > + * @full_scan: Whether this is a full scan (ignore limits)
> > + *
> > + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
> > + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
> > + *
> > + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> > + * khugepaged_max_ptes_none value.
> > + *
> > + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
> > + * order, but caps it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
> > + *
> > + * Return: Maximum number of empty PTEs allowed for the collapse operation
> > + */
> > +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > +{
> > + unsigned int max_ptes_none;
> > +
> > + /* ignore max_ptes_none limits */
> > + if (full_scan)
> > + return HPAGE_PMD_NR - 1;
> > +
> > + if (order == HPAGE_PMD_ORDER)
> > + return khugepaged_max_ptes_none;
> > +
> > + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
>
Hey Lorenzo,
> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
I spoke to David and he said to continue forward with this series; the
"eagerness" tunable will take some time, and may require further
considerations/discussion.
>
> I'm still really quite uncomfortable with us silently capping this value.
>
> If we're putting forward theoretical ideas that are to be later built upon, this
> series should be an RFC.
>
> But if we really intend to silently ignore user input, the problem is that this
> then becomes established uAPI.
>
> I think it's _sensible_ to avoid this mTHP escalation problem, but the issue is
> visibility I think.
>
> I think people are going to find it odd that you set it to something, but then
> get something else.
The alternative solution is to not support max_ptes_none for mTHP
collapse and not allow none/zero pages. This is essentially "capping"
the value too.
>
> As an alternative we could have a new sysfs field:
>
> /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
>
> That shows the cap clearly.
>
> In fact, it could be read-only... and just expose it to the user. That reduces
> complexity.
I agree with Baolin here; adding another tunable will only increase
the complexity for our future goals, and also provides needless
insight into the internals when they cannot be customized.
Cheers,
-- Nico
>
> We can then bring in eagerness later and have the same situation of
> max_ptes_none being a parameter that exists (plus this additional read-only
> parameter).
>
> > +
> > + return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > +
> > +}
> > +
> > void khugepaged_enter_vma(struct vm_area_struct *vma,
> > vm_flags_t vm_flags)
> > {
> > @@ -546,7 +579,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > pte_t *_pte;
> > int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> > const unsigned long nr_pages = 1UL << order;
> > - int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > + int max_ptes_none = collapse_max_ptes_none(order, !cc->is_khugepaged);
> >
> > for (_pte = pte; _pte < pte + nr_pages;
> > _pte++, addr += PAGE_SIZE) {
> > --
> > 2.51.0
> >
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 13:36 ` Nico Pache
@ 2025-10-28 14:15 ` David Hildenbrand
2025-10-28 17:29 ` Lorenzo Stoakes
2025-10-28 16:57 ` Lorenzo Stoakes
1 sibling, 1 reply; 91+ messages in thread
From: David Hildenbrand @ 2025-10-28 14:15 UTC (permalink / raw)
To: Nico Pache, Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On 28.10.25 14:36, Nico Pache wrote:
> On Mon, Oct 27, 2025 at 11:54 AM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
>>
>> On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
>>> The current mechanism for determining mTHP collapse scales the
>>> khugepaged_max_ptes_none value based on the target order. This
>>> introduces an undesirable feedback loop, or "creep", when max_ptes_none
>>> is set to a value greater than HPAGE_PMD_NR / 2.
>>>
>>> With this configuration, a successful collapse to order N will populate
>>> enough pages to satisfy the collapse condition on order N+1 on the next
>>> scan. This leads to unnecessary work and memory churn.
>>>
>>> To fix this issue introduce a helper function that caps the max_ptes_none
>>> to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
>>> the max_ptes_none number by the (PMD_ORDER - target collapse order).
>>>
>>> The limits can be ignored by passing full_scan=true, this is useful for
>>> madvise_collapse (which ignores limits), or in the case of
>>> collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
>>> collapse is available.
>>>
>>> Signed-off-by: Nico Pache <npache@redhat.com>
>>> ---
>>> mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
>>> 1 file changed, 34 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index 4ccebf5dda97..286c3a7afdee 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
>>> wake_up_interruptible(&khugepaged_wait);
>>> }
>>>
>>> +/**
>>> + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
>>> + * @order: The folio order being collapsed to
>>> + * @full_scan: Whether this is a full scan (ignore limits)
>>> + *
>>> + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
>>> + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
>>> + *
>>> + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
>>> + * khugepaged_max_ptes_none value.
>>> + *
>>> + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
>>> + * order, but caps it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
>>> + *
>>> + * Return: Maximum number of empty PTEs allowed for the collapse operation
>>> + */
>>> +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
>>> +{
>>> + unsigned int max_ptes_none;
>>> +
>>> + /* ignore max_ptes_none limits */
>>> + if (full_scan)
>>> + return HPAGE_PMD_NR - 1;
>>> +
>>> + if (order == HPAGE_PMD_ORDER)
>>> + return khugepaged_max_ptes_none;
>>> +
>>> + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
>>
>
> Hey Lorenzo,
>
>> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
>> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
>> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
>
> I spoke to David and he said to continue forward with this series; the
> "eagerness" tunable will take some time, and may require further
> considerations/discussion.
Right, after talking to Johannes it got clearer that what we envisioned
with "eagerness" would not be like swappiness, and we will really have
to be careful here. I don't know yet when I will have time to look into
that.
If we want to avoid the implicit capping, I think there are the
following possible approaches
(1) Tolerate creep for now, maybe warning if the user configures it.
(2) Avoid creep by counting zero-filled pages towards none_or_zero.
(3) Have separate toggles for each THP size. Doesn't quite solve the
problem, only shifts it.
Anything else?
IIUC, creep is less of a problem when we have the underused shrinker
enabled: whatever we over-allocated can (unless longterm-pinned etc) get
reclaimed again.
So maybe having underused-shrinker support for mTHP as well would be a
solution to tackle (1) later?
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 14:15 ` David Hildenbrand
@ 2025-10-28 17:29 ` Lorenzo Stoakes
2025-10-28 17:36 ` Lorenzo Stoakes
2025-10-28 18:08 ` David Hildenbrand
0 siblings, 2 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 17:29 UTC (permalink / raw)
To: David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Tue, Oct 28, 2025 at 03:15:26PM +0100, David Hildenbrand wrote:
> On 28.10.25 14:36, Nico Pache wrote:
> > On Mon, Oct 27, 2025 at 11:54 AM Lorenzo Stoakes
> > <lorenzo.stoakes@oracle.com> wrote:
> > >
> > > On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
> > > > The current mechanism for determining mTHP collapse scales the
> > > > khugepaged_max_ptes_none value based on the target order. This
> > > > introduces an undesirable feedback loop, or "creep", when max_ptes_none
> > > > is set to a value greater than HPAGE_PMD_NR / 2.
> > > >
> > > > With this configuration, a successful collapse to order N will populate
> > > > enough pages to satisfy the collapse condition on order N+1 on the next
> > > > scan. This leads to unnecessary work and memory churn.
> > > >
> > > > To fix this issue introduce a helper function that caps the max_ptes_none
> > > > to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
> > > > the max_ptes_none number by the (PMD_ORDER - target collapse order).
> > > >
> > > > The limits can be ignored by passing full_scan=true, this is useful for
> > > > madvise_collapse (which ignores limits), or in the case of
> > > > collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
> > > > collapse is available.
> > > >
> > > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > > ---
> > > > mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
> > > > 1 file changed, 34 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > > index 4ccebf5dda97..286c3a7afdee 100644
> > > > --- a/mm/khugepaged.c
> > > > +++ b/mm/khugepaged.c
> > > > @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
> > > > wake_up_interruptible(&khugepaged_wait);
> > > > }
> > > >
> > > > +/**
> > > > + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
> > > > + * @order: The folio order being collapsed to
> > > > + * @full_scan: Whether this is a full scan (ignore limits)
> > > > + *
> > > > + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
> > > > + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
> > > > + *
> > > > + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> > > > + * khugepaged_max_ptes_none value.
> > > > + *
> > > > + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
> > > > + * order, but caps it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
> > > > + *
> > > > + * Return: Maximum number of empty PTEs allowed for the collapse operation
> > > > + */
> > > > +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > > > +{
> > > > + unsigned int max_ptes_none;
> > > > +
> > > > + /* ignore max_ptes_none limits */
> > > > + if (full_scan)
> > > > + return HPAGE_PMD_NR - 1;
> > > > +
> > > > + if (order == HPAGE_PMD_ORDER)
> > > > + return khugepaged_max_ptes_none;
> > > > +
> > > > + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
> > >
> >
> > Hey Lorenzo,
> >
> > > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> >
> > I spoke to David and he said to continue forward with this series; the
> > "eagerness" tunable will take some time, and may require further
> > considerations/discussion.
>
> Right, after talking to Johannes it got clearer that what we envisioned with
I'm not sure that you meant to say go ahead with the series as-is with this
silent capping?
Either way we need better communication of this, because I wasn't aware that was
the plan for one, and it means this patch directly ignores review from 2
versions ago, which needs to be documented _somewhere_ so people aren't confused.
And it would maybe have allowed us to have this conversation ahead of time rather
than now.
> "eagerness" would not be like swappiness, and we will really have to be
> careful here. I don't know yet when I will have time to look into that.
I guess I missed this part of the conversation, what do you mean?
The whole concept is that we have a parameter whose value is _abstracted_ and
whose meaning we control.
I'm not sure exactly why that would now be problematic? The fundamental concept
seems sound, no? Last I remember of the conversation this was the case.
>
> If we want to avoid the implicit capping, I think there are the following
> possible approaches
>
> (1) Tolerate creep for now, maybe warning if the user configures it.
I mean this seems a viable option if there is pressure to land this series
before we have a viable uAPI for configuring this.
A part of me thinks we shouldn't rush the series in for that reason though, and
should require that we have a proper control here.
But I guess this approach is the least-worst as it leaves us with the most
options moving forwards.
> (2) Avoid creep by counting zero-filled pages towards none_or_zero.
Would this really make all that much difference?
> (3) Have separate toggles for each THP size. Doesn't quite solve the
> problem, only shifts it.
Yeah I did wonder about this as an alternative solution. But of course it then
makes it vague what the parent value means with respect to the individual levels,
unless we have an 'inherit' mode there too (possible).
It's going to be confusing though as max_ptes_none sits at the root khugepaged/
level and I don't think any other parameter from khugepaged/ is exposed at
individual page size levels.
And of course doing this means we
>
> Anything else?
Err... I mean I'm not sure if you missed it but I suggested an approach in the
sub-thread - exposing mthp_max_ptes_none as a _READ-ONLY_ field at:
/sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
Then we allow the capping, but simply document that we specify what the capped
value will be here for mTHP.
That struck me as the simplest way of getting this series landed without
necessarily violating any future eagerness which:
a. Must still support khugepaged/max_ptes_none - we aren't getting away from
this, it's uAPI.
b. Surely must want to do different things for mTHP in eagerness, so if we're
exposing some PTE value in max_ptes_none doing so in
khugepaged/mthp_max_ptes_none wouldn't be problematic (note again - it's
readonly so unlike max_ptes_none we don't have to worry about the other
direction).
HOWEVER, eagerness might want to change this behaviour per-mTHP size, in
which case perhaps mthp_max_ptes_none would be problematic in that it is some
kind of average.
Then again we could always revert to putting this parameter as in (3) in that
case, ugly but kinda viable.
>
> IIUC, creep is less of a problem when we have the underused shrinker
> enabled: whatever we over-allocated can (unless longterm-pinned etc) get
> reclaimed again.
>
> So maybe having underused-shrinker support for mTHP as well would be a
> solution to tackle (1) later?
How viable is this in the short term?
>
> --
> Cheers
>
> David / dhildenb
>
Another possible solution:
If mthp_max_ptes_none is not workable, we could have a toggle at, e.g.:
/sys/kernel/mm/transparent_hugepage/khugepaged/mthp_cap_collapse_none
As a simple boolean. If switched on then we document that it caps mTHP as
per Nico's suggestion.
That way we avoid the 'silent' issue I have with all this and it's an
explicit setting.
Cheers, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 17:29 ` Lorenzo Stoakes
@ 2025-10-28 17:36 ` Lorenzo Stoakes
2025-10-28 18:08 ` David Hildenbrand
1 sibling, 0 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 17:36 UTC (permalink / raw)
To: David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Tue, Oct 28, 2025 at 05:29:59PM +0000, Lorenzo Stoakes wrote:
> >
> > If we want to avoid the implicit capping, I think there are the following
> > possible approaches
> >
> > (1) Tolerate creep for now, maybe warning if the user configures it.
>
> I mean this seems a viable option if there is pressure to land this series
> before we have a viable uAPI for configuring this.
>
> A part of me thinks we shouldn't rush series in for that reason though and
> should require that we have a proper control here.
>
> But I guess this approach is the least-worst as it leaves us with the most
> options moving forwards.
>
> > (2) Avoid creep by counting zero-filled pages towards none_or_zero.
>
> Would this really make all that much difference?
>
> > (3) Have separate toggles for each THP size. Doesn't quite solve the
> > problem, only shifts it.
>
> Yeah I did wonder about this as an alternative solution. But of course it then
> makes it vague what the parent values means in respect of the individual levels,
> unless we have an 'inherit' mode there too (possible).
>
> It's going to be confusing though as max_ptes_none sits at the root khugepaged/
> level and I don't think any other parameter from khugepaged/ is exposed at
> individual page size levels.
>
> And of course doing this means we
Oops didn't finish the thought!
Here it is:
And of course this means we continue to propagate this max_ptes_none concept
only now in more places which is yuck.
Unless you meant putting something other than max_ptes_none at different levels?
Cheers, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 17:29 ` Lorenzo Stoakes
2025-10-28 17:36 ` Lorenzo Stoakes
@ 2025-10-28 18:08 ` David Hildenbrand
2025-10-28 18:59 ` Lorenzo Stoakes
1 sibling, 1 reply; 91+ messages in thread
From: David Hildenbrand @ 2025-10-28 18:08 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
>>> Hey Lorenzo,
>>>
>>>> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
>>>> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
>>>> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
>>>
>>> I spoke to David and he said to continue forward with this series; the
>>> "eagerness" tunable will take some time, and may require further
>>> considerations/discussion.
>>
>> Right, after talking to Johannes it got clearer that what we envisioned with
>
> I'm not sure that you meant to say go ahead with the series as-is with this
> silent capping?
No, "go ahead" as in "let's find some way forward that works for all and
is not too crazy".
[...]
>> "eagerness" would not be like swappiness, and we will really have to be
>> careful here. I don't know yet when I will have time to look into that.
>
> I guess I missed this part of the converastion, what do you mean?
Johannes raised issues with that on the list and afterwards we had an
offline discussion about some of the details and why something
unpredictable is not good.
>
> The whole concept is that we have a paramaeter whose value is _abstracted_ and
> which we control what it means.
>
> I'm not sure exactly why that would now be problematic? The fundamental concept
> seems sound no? Last I remember of the conversation this was the case.
The basic idea was to do something abstracted, like swappiness. Turns out
"swappiness" is really something predictable, not something where we can randomly
change how it behaves under the hood.
So we'd have to find something similar for "eagerness", and that's where
it stops being easy.
>
>>
>> If we want to avoid the implicit capping, I think there are the following
>> possible approaches
>>
>> (1) Tolerate creep for now, maybe warning if the user configures it.
>
> I mean this seems a viable option if there is pressure to land this series
> before we have a viable uAPI for configuring this.
>
> A part of me thinks we shouldn't rush series in for that reason though and
> should require that we have a proper control here.
>
> But I guess this approach is the least-worst as it leaves us with the most
> options moving forwards.
Yes. There is also the alternative of respecting only 0 / 511 for mTHP
collapse for now as discussed in the other thread.
>
>> (2) Avoid creep by counting zero-filled pages towards none_or_zero.
>
> Would this really make all that much difference?
It solves the creep problem I think, but it's a bit nasty IMHO.
>
>> (3) Have separate toggles for each THP size. Doesn't quite solve the
>> problem, only shifts it.
>
> Yeah I did wonder about this as an alternative solution. But of course it then
> makes it vague what the parent values means in respect of the individual levels,
> unless we have an 'inherit' mode there too (possible).
>
> It's going to be confusing though as max_ptes_none sits at the root khugepaged/
> level and I don't think any other parameter from khugepaged/ is exposed at
> individual page size levels.
>
> And of course doing this means we
>
>>
>> Anything else?
>
> Err... I mean I'm not sure if you missed it but I suggested an approach in the
> sub-thread - exposing mthp_max_ptes_none as a _READ-ONLY_ field at:
>
> /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
>
> Then we allow the capping, but simply document that we specify what the capped
> value will be here for mTHP.
I did not have time to read the details on that so far.
It would be one solution forward. I dislike it because I think the whole
capping is an intermediate thing that can be (and likely must be, when
considering mTHP underused shrinking I think) solved in the future
differently. That's why I would prefer adding this only if there is no
other, simpler, way forward.
>
> That struck me as the simplest way of getting this series landed without
> necessarily violating any future eagerness which:
>
> a. Must still support khugepaged/max_ptes_none - we aren't getting away from
> this, it's uAPI.
>
> b. Surely must want to do different things for mTHP in eagerness, so if we're
> exposing some PTE value in max_ptes_none doing so in
> khugepaged/mthp_max_ptes_none wouldn't be problematic (note again - it's
> readonly so unlike max_ptes_none we don't have to worry about the other
> direction).
>
> HOWEVER, eagerness might want want to change this behaviour per-mTHP size, in
> which case perhaps mthp_max_ptes_none would be problematic in that it is some
> kind of average.
>
> Then again we could always revert to putting this parameter as in (3) in that
> case, ugly but kinda viable.
>
>>
>> IIUC, creep is less of a problem when we have the underused shrinker
>> enabled: whatever we over-allocated can (unless longterm-pinned etc) get
>> reclaimed again.
>>
>> So maybe having underused-shrinker support for mTHP as well would be a
>> solution to tackle (1) later?
>
> How viable is this in the short term?
I once started looking into it, but it will require quite some work,
because the lists will essentially include each and every (m)THP in the
system ... so I think we will need some redesign.
>
> Another possible solution:
>
> If mthp_max_ptes_none is not workable, we could have a toggle at, e.g.:
>
> /sys/kernel/mm/transparent_hugepage/khugepaged/mthp_cap_collapse_none
>
> As a simple boolean. If switched on then we document that it caps mTHP as
> per Nico's suggestion.
>
> That way we avoid the 'silent' issue I have with all this and it's an
> explicit setting.
Right, but it's another toggle I wish we wouldn't need. We could of
course also make it some compile-time option, but not sure if that's
really any better.
I'd hope we find an easy way forward that doesn't require new toggles,
at least for now ...
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:08 ` David Hildenbrand
@ 2025-10-28 18:59 ` Lorenzo Stoakes
2025-10-28 19:08 ` Lorenzo Stoakes
` (3 more replies)
0 siblings, 4 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 18:59 UTC (permalink / raw)
To: David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Tue, Oct 28, 2025 at 07:08:38PM +0100, David Hildenbrand wrote:
>
> > > > Hey Lorenzo,
> > > >
> > > > > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > > > > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > > > > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> > > >
> > > > I spoke to David and he said to continue forward with this series; the
> > > > "eagerness" tunable will take some time, and may require further
> > > > considerations/discussion.
> > >
> > > Right, after talking to Johannes it got clearer that what we envisioned with
> >
> > I'm not sure that you meant to say go ahead with the series as-is with this
> > silent capping?
>
> No, "go ahead" as in "let's find some way forward that works for all and is
> not too crazy".
Right we clearly needed to discuss that further at the time but that's moot now,
we're figuring it out now :)
>
> [...]
>
> > > "eagerness" would not be like swappiness, and we will really have to be
> > > careful here. I don't know yet when I will have time to look into that.
> >
> > I guess I missed this part of the converastion, what do you mean?
>
> Johannes raised issues with that on the list and afterwards we had an
> offline discussion about some of the details and why something unpredictable
> is not good.
Could we get these details on-list so we can discuss them? This doesn't have to
be urgent, but I would like to have a say in this or at least be part of the
conversation, please.
>
> >
> > The whole concept is that we have a paramaeter whose value is _abstracted_ and
> > which we control what it means.
> >
> > I'm not sure exactly why that would now be problematic? The fundamental concept
> > seems sound no? Last I remember of the conversation this was the case.
>
> The basic idea was to do something abstracted as swappiness. Turns out
> "swappiness" is really something predictable, not something we can randomly
> change how it behaves under the hood.
>
> So we'd have to find something similar for "eagerness", and that's where it
> stops being easy.
I think we shouldn't be too stuck on
>
> >
> > >
> > > If we want to avoid the implicit capping, I think there are the following
> > > possible approaches
> > >
> > > (1) Tolerate creep for now, maybe warning if the user configures it.
> >
> > I mean this seems a viable option if there is pressure to land this series
> > before we have a viable uAPI for configuring this.
> >
> > A part of me thinks we shouldn't rush series in for that reason though and
> > should require that we have a proper control here.
> >
> > But I guess this approach is the least-worst as it leaves us with the most
> > options moving forwards.
>
> Yes. There is also the alternative of respecting only 0 / 511 for mTHP
> collapse for now as discussed in the other thread.
Yes I guess let's carry that on over there.
I mean this is why I said it's better to try to keep things in one thread :) but
anyway, we've forked and it can't be helped now.
To be clear that was a criticism of - email development - not you.
It's _extremely easy_ to have this happen because one thread naturally leads to
a broader discussion of a given topic, whereas another has questions from
somebody else about the same topic, to which people reply and then... you have a
fork and it can't be helped.
I guess I'm saying it'd be good if we could say 'ok let's move this to X'.
But that's also broken in its own way, you can't stop people from replying in
the other thread still and yeah. It's a limitation of this model :)
>
> >
> > > (2) Avoid creep by counting zero-filled pages towards none_or_zero.
> >
> > Would this really make all that much difference?
>
> It solves the creep problem I think, but it's a bit nasty IMHO.
Ah, because you'd end up with a bunch of zeroed pages from the prior mTHP
collapses, interesting...
Scanning for that does seem a bit nasty though yes...
>
> >
> > > (3) Have separate toggles for each THP size. Doesn't quite solve the
> > > problem, only shifts it.
> >
> > Yeah I did wonder about this as an alternative solution. But of course it then
> > makes it vague what the parent values means in respect of the individual levels,
> > unless we have an 'inherit' mode there too (possible).
> >
> > It's going to be confusing though as max_ptes_none sits at the root khugepaged/
> > level and I don't think any other parameter from khugepaged/ is exposed at
> > individual page size levels.
> >
> > And of course doing this means we
> >
> > >
> > > Anything else?
> >
> > Err... I mean I'm not sure if you missed it but I suggested an approach in the
> > sub-thread - exposing mthp_max_ptes_none as a _READ-ONLY_ field at:
> >
> > /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
> >
> > Then we allow the capping, but simply document that we specify what the capped
> > value will be here for mTHP.
>
> I did not have time to read the details on that so far.
OK. It is a bit nasty, yes. The idea is to find something that allows the
capping to work.
>
> It would be one solution forward. I dislike it because I think the whole
> capping is an intermediate thing that can be (and likely must be, when
> considering mTHP underused shrinking I think) solved in the future
> differently. That's why I would prefer adding this only if there is no
> other, simpler, way forward.
Yes I agree that if we could avoid it it'd be great.
Really I proposed this solution on the basis that we were somehow ok with the
capping.
If we can avoid it, that'd be ideal as it reduces complexity and 'unexpected'
behaviour.
We'll clarify on the other thread, but the 511/0 was compelling to me before as
a simplification, and if we can have a straightforward model of how mTHP
collapse across none/zero page PTEs behaves, this is ideal.
The only question is w.r.t. warnings etc. but we can handle details there.
>
> >
> > That struck me as the simplest way of getting this series landed without
> > necessarily violating any future eagerness which:
> >
> > a. Must still support khugepaged/max_ptes_none - we aren't getting away from
> > this, it's uAPI.
> >
> > b. Surely must want to do different things for mTHP in eagerness, so if we're
> > exposing some PTE value in max_ptes_none doing so in
> > khugepaged/mthp_max_ptes_none wouldn't be problematic (note again - it's
> > readonly so unlike max_ptes_none we don't have to worry about the other
> > direction).
> >
> > HOWEVER, eagerness might want want to change this behaviour per-mTHP size, in
> > which case perhaps mthp_max_ptes_none would be problematic in that it is some
> > kind of average.
> >
> > Then again we could always revert to putting this parameter as in (3) in that
> > case, ugly but kinda viable.
> >
> > >
> > > IIUC, creep is less of a problem when we have the underused shrinker
> > > enabled: whatever we over-allocated can (unless longterm-pinned etc) get
> > > reclaimed again.
> > >
> > > So maybe having underused-shrinker support for mTHP as well would be a
> > > solution to tackle (1) later?
> >
> > How viable is this in the short term?
>
> I once started looking into it, but it will require quite some work, because
> the lists will essentially include each and every (m)THP in the system ...
> so i think we will need some redesign.
Ack.
This aligns with non-0/511 settings being non-functional for mTHP atm anyway.
>
> >
> > Another possible solution:
> >
> > If mthp_max_ptes_none is not workable, we could have a toggle at, e.g.:
> >
> > /sys/kernel/mm/transparent_hugepage/khugepaged/mthp_cap_collapse_none
> >
> > As a simple boolean. If switched on then we document that it caps mTHP as
> > per Nico's suggestion.
> >
> > That way we avoid the 'silent' issue I have with all this and it's an
> > explicit setting.
>
> Right, but it's another toggle I wish we wouldn't need. We could of course
> also make it some compile-time option, but not sure if that's really any
> better.
>
> I'd hope we find an easy way forward that doesn't require new toggles, at
> least for now ...
Right, well I agree if we can make this 0/511 thing work, let's do that.
Toggles are just 'least worst' workarounds on the assumption that capping is needed.
>
> --
> Cheers
>
> David / dhildenb
>
Thanks, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:59 ` Lorenzo Stoakes
@ 2025-10-28 19:08 ` Lorenzo Stoakes
2025-10-29 2:09 ` Baolin Wang
` (2 subsequent siblings)
3 siblings, 0 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 19:08 UTC (permalink / raw)
To: David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Tue, Oct 28, 2025 at 06:59:31PM +0000, Lorenzo Stoakes wrote:
> On Tue, Oct 28, 2025 at 07:08:38PM +0100, David Hildenbrand wrote:
> > >
> > > The whole concept is that we have a paramaeter whose value is _abstracted_ and
> > > which we control what it means.
> > >
> > > I'm not sure exactly why that would now be problematic? The fundamental concept
> > > seems sound no? Last I remember of the conversation this was the case.
> >
> > The basic idea was to do something abstracted as swappiness. Turns out
> > "swappiness" is really something predictable, not something we can randomly
> > change how it behaves under the hood.
> >
> > So we'd have to find something similar for "eagerness", and that's where it
> > stops being easy.
>
> I think we shouldn't be too stuck on
>
I really am the master of the unfinished sentence :)
I was going to say we shouldn't be too stuck on the analogy to swappiness and
just maintain the broad concept that eagerness is abstracted and we get to
determine what that looks like.
But I absolutely accept that it's highly sensitive and likely embodies a great
many moving parts, and we must be cautious.
This is something that can be deferred for later.
Cheers, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:59 ` Lorenzo Stoakes
2025-10-28 19:08 ` Lorenzo Stoakes
@ 2025-10-29 2:09 ` Baolin Wang
2025-10-29 2:49 ` Nico Pache
2025-10-29 18:55 ` Lorenzo Stoakes
2025-10-29 2:47 ` Nico Pache
2025-10-31 11:12 ` David Hildenbrand
3 siblings, 2 replies; 91+ messages in thread
From: Baolin Wang @ 2025-10-29 2:09 UTC (permalink / raw)
To: Lorenzo Stoakes, David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On 2025/10/29 02:59, Lorenzo Stoakes wrote:
> On Tue, Oct 28, 2025 at 07:08:38PM +0100, David Hildenbrand wrote:
>>
>>>>> Hey Lorenzo,
>>>>>
>>>>>> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
>>>>>> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
>>>>>> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
>>>>>
>>>>> I spoke to David and he said to continue forward with this series; the
>>>>> "eagerness" tunable will take some time, and may require further
>>>>> considerations/discussion.
>>>>
>>>> Right, after talking to Johannes it got clearer that what we envisioned with
>>>
>>> I'm not sure that you meant to say go ahead with the series as-is with this
>>> silent capping?
>>
>> No, "go ahead" as in "let's find some way forward that works for all and is
>> not too crazy".
>
> Right we clearly needed to discuss that further at the time but that's moot now,
> we're figuring it out now :)
>
>>
>> [...]
>>
>>>> "eagerness" would not be like swappiness, and we will really have to be
>>>> careful here. I don't know yet when I will have time to look into that.
>>>
>>> I guess I missed this part of the converastion, what do you mean?
>>
>> Johannes raised issues with that on the list and afterwards we had an
>> offline discussion about some of the details and why something unpredictable
>> is not good.
>
> Could we get these details on-list so we can discuss them? This doesn't have to
> be urgent, but I would like to have a say in this or at least be part of the
> converastion please.
>
>>
>>>
>>> The whole concept is that we have a paramaeter whose value is _abstracted_ and
>>> which we control what it means.
>>>
>>> I'm not sure exactly why that would now be problematic? The fundamental concept
>>> seems sound no? Last I remember of the conversation this was the case.
>>
>> The basic idea was to do something abstracted as swappiness. Turns out
>> "swappiness" is really something predictable, not something we can randomly
>> change how it behaves under the hood.
>>
>> So we'd have to find something similar for "eagerness", and that's where it
>> stops being easy.
>
> I think we shouldn't be too stuck on
>
>>
>>>
>>>>
>>>> If we want to avoid the implicit capping, I think there are the following
>>>> possible approaches
>>>>
>>>> (1) Tolerate creep for now, maybe warning if the user configures it.
>>>
>>> I mean this seems a viable option if there is pressure to land this series
>>> before we have a viable uAPI for configuring this.
>>>
>>> A part of me thinks we shouldn't rush series in for that reason though and
>>> should require that we have a proper control here.
>>>
>>> But I guess this approach is the least-worst as it leaves us with the most
>>> options moving forwards.
>>
>> Yes. There is also the alternative of respecting only 0 / 511 for mTHP
>> collapse for now as discussed in the other thread.
>
> Yes I guess let's carry that on over there.
>
> I mean this is why I said it's better to try to keep things in one thread :) but
> anyway, we've forked and can't be helped now.
>
> To be clear that was a criticism of - email development - not you.
>
> It's _extremely easy_ to have this happen because one thread naturally leads to
> a broader discussion of a given topic, whereas another has questions from
> somebody else about the same topic, to which people reply and then... you have a
> fork and it can't be helped.
>
> I guess I'm saying it'd be good if we could say 'ok let's move this to X'.
>
> But that's also broken in its own way, you can't stop people from replying in
> the other thread still and yeah. It's a limitation of this model :)
>
>>
>>>
>>>> (2) Avoid creep by counting zero-filled pages towards none_or_zero.
>>>
>>> Would this really make all that much difference?
>>
>> It solves the creep problem I think, but it's a bit nasty IMHO.
>
> Ah because you'd end up wtih a bunch of zeroed pages from the prior mTHP
> collapses, interesting...
>
> Scanning for that does seem a bit nasty though yes...
>
>>
>>>
>>>> (3) Have separate toggles for each THP size. Doesn't quite solve the
>>>> problem, only shifts it.
>>>
>>> Yeah I did wonder about this as an alternative solution. But of course it then
>>> makes it vague what the parent values means in respect of the individual levels,
>>> unless we have an 'inherit' mode there too (possible).
>>>
>>> It's going to be confusing though as max_ptes_none sits at the root khugepaged/
>>> level and I don't think any other parameter from khugepaged/ is exposed at
>>> individual page size levels.
>>>
>>> And of course doing this means we
>>>
>>>>
>>>> Anything else?
>>>
>>> Err... I mean I'm not sure if you missed it but I suggested an approach in the
>>> sub-thread - exposing mthp_max_ptes_none as a _READ-ONLY_ field at:
>>>
>>> /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
>>>
>>> Then we allow the capping, but simply document that we specify what the capped
>>> value will be here for mTHP.
>>
>> I did not have time to read the details on that so far.
>
> OK. It is a bit nasty, yes. The idea is to find something that allows the
> capping to work.
>
>>
>> It would be one solution forward. I dislike it because I think the whole
>> capping is an intermediate thing that can be (and likely must be, when
>> considering mTHP underused shrinking I think) solved in the future
>> differently. That's why I would prefer adding this only if there is no
>> other, simpler, way forward.
>
> Yes I agree that if we could avoid it it'd be great.
>
> Really I proposed this solution on the basis that we were somehow ok with the
> capping.
>
> If we can avoid that'd be ideal as it reduces complexity and 'unexpected'
> behaviour.
>
> We'll clarify on the other thread, but the 511/0 was compelling to me before as
> a simplification, and if we can have a straightforward model of how mTHP
> collapse across none/zero page PTEs behaves this is ideal.
>
> The only question is w.r.t. warnings etc. but we can handle details there.
>
>>
>>>
>>> That struck me as the simplest way of getting this series landed without
>>> necessarily violating any future eagerness which:
>>>
>>> a. Must still support khugepaged/max_ptes_none - we aren't getting away from
>>> this, it's uAPI.
>>>
>>> b. Surely must want to do different things for mTHP in eagerness, so if we're
>>> exposing some PTE value in max_ptes_none doing so in
>>> khugepaged/mthp_max_ptes_none wouldn't be problematic (note again - it's
>>> readonly so unlike max_ptes_none we don't have to worry about the other
>>> direction).
>>>
>>> HOWEVER, eagerness might want want to change this behaviour per-mTHP size, in
>>> which case perhaps mthp_max_ptes_none would be problematic in that it is some
>>> kind of average.
>>>
>>> Then again we could always revert to putting this parameter as in (3) in that
>>> case, ugly but kinda viable.
>>>
>>>>
>>>> IIUC, creep is less of a problem when we have the underused shrinker
>>>> enabled: whatever we over-allocated can (unless longterm-pinned etc) get
>>>> reclaimed again.
>>>>
>>>> So maybe having underused-shrinker support for mTHP as well would be a
>>>> solution to tackle (1) later?
>>>
>>> How viable is this in the short term?
>>
>> I once started looking into it, but it will require quite some work, because
>> the lists will essentially include each and every (m)THP in the system ...
>> so i think we will need some redesign.
>
> Ack.
>
> This aligns with non-0/511 settings being non-functional for mTHP atm anyway.
>
>>
>>>
>>> Another possible solution:
>>>
>>> If mthp_max_ptes_none is not workable, we could have a toggle at, e.g.:
>>>
>>> /sys/kernel/mm/transparent_hugepage/khugepaged/mthp_cap_collapse_none
>>>
>>> As a simple boolean. If switched on then we document that it caps mTHP as
>>> per Nico's suggestion.
>>>
>>> That way we avoid the 'silent' issue I have with all this and it's an
>>> explicit setting.
>>
>> Right, but it's another toggle I wish we wouldn't need. We could of course
>> also make it some compile-time option, but not sure if that's really any
>> better.
>>
>> I'd hope we find an easy way forward that doesn't require new toggles, at
>> least for now ...
>
> Right, well I agree if we can make this 0/511 thing work, let's do that.
>
> Toggle are just 'least worst' workarounds on assumption of the need for capping.
I finally finished reading through the discussions across multiple
threads :), and it looks like we've reached a preliminary consensus (make
0/511 work). Great, and thanks!
IIUC, the strategy is: configuring it to 511 means always enabling mTHP
collapse, configuring it to 0 means collapsing mTHP only if all PTEs are
non-none/zero, and for other values we issue a warning and prohibit
mTHP collapse (avoiding Lorenzo's concern about silently changing
max_ptes_none). Then the implementation of collapse_max_ptes_none()
would be as follows:
static int collapse_max_ptes_none(unsigned int order, bool full_scan)
{
	/* ignore max_ptes_none limits */
	if (full_scan)
		return HPAGE_PMD_NR - 1;

	if (order == HPAGE_PMD_ORDER)
		return khugepaged_max_ptes_none;

	/*
	 * To prevent creeping towards larger order collapses for mTHP
	 * collapse, we restrict khugepaged_max_ptes_none to only 511 or 0,
	 * simplifying the logic. This means:
	 *   max_ptes_none == 511 -> collapse mTHP always
	 *   max_ptes_none == 0   -> collapse mTHP only if all PTEs are
	 *                           non-none/zero
	 */
	if (!khugepaged_max_ptes_none ||
	    khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
		return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);

	pr_warn_once("mTHP collapse only supports khugepaged_max_ptes_none configured as 0 or %d\n",
		     HPAGE_PMD_NR - 1);
	return -EINVAL;
}
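To make the 0/511 behaviour concrete (my reading of the helper above,
assuming HPAGE_PMD_NR = 512 and HPAGE_PMD_ORDER = 9): max_ptes_none = 511
scales to 511 >> (9 - order), i.e. 63 empty PTEs allowed for an order-6
(64 PTE) collapse and 15 for an order-4 (16 PTE) collapse, so an mTHP
collapse only needs a single present PTE; max_ptes_none = 0 stays 0 at
every order; and any other value hits the pr_warn_once() path and returns
-EINVAL.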
So what do you think?
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 2:09 ` Baolin Wang
@ 2025-10-29 2:49 ` Nico Pache
2025-10-29 18:55 ` Lorenzo Stoakes
1 sibling, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-29 2:49 UTC (permalink / raw)
To: Baolin Wang
Cc: Lorenzo Stoakes, David Hildenbrand, linux-kernel,
linux-trace-kernel, linux-mm, linux-doc, ziy, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
On Tue, Oct 28, 2025 at 8:10 PM Baolin Wang
<baolin.wang@linux.alibaba.com> wrote:
>
>
>
> On 2025/10/29 02:59, Lorenzo Stoakes wrote:
> > On Tue, Oct 28, 2025 at 07:08:38PM +0100, David Hildenbrand wrote:
> >>
> >>>>> Hey Lorenzo,
> >>>>>
> >>>>>> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> >>>>>> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> >>>>>> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> >>>>>
> >>>>> I spoke to David and he said to continue forward with this series; the
> >>>>> "eagerness" tunable will take some time, and may require further
> >>>>> considerations/discussion.
> >>>>
> >>>> Right, after talking to Johannes it got clearer that what we envisioned with
> >>>
> >>> I'm not sure that you meant to say go ahead with the series as-is with this
> >>> silent capping?
> >>
> >> No, "go ahead" as in "let's find some way forward that works for all and is
> >> not too crazy".
> >
> > Right we clearly needed to discuss that further at the time but that's moot now,
> > we're figuring it out now :)
> >
> >>
> >> [...]
> >>
> >>>> "eagerness" would not be like swappiness, and we will really have to be
> >>>> careful here. I don't know yet when I will have time to look into that.
> >>>
> >>> I guess I missed this part of the converastion, what do you mean?
> >>
> >> Johannes raised issues with that on the list and afterwards we had an
> >> offline discussion about some of the details and why something unpredictable
> >> is not good.
> >
> > Could we get these details on-list so we can discuss them? This doesn't have to
> > be urgent, but I would like to have a say in this or at least be part of the
> > converastion please.
> >
> >>
> >>>
> >>> The whole concept is that we have a paramaeter whose value is _abstracted_ and
> >>> which we control what it means.
> >>>
> >>> I'm not sure exactly why that would now be problematic? The fundamental concept
> >>> seems sound no? Last I remember of the conversation this was the case.
> >>
> >> The basic idea was to do something abstracted as swappiness. Turns out
> >> "swappiness" is really something predictable, not something we can randomly
> >> change how it behaves under the hood.
> >>
> >> So we'd have to find something similar for "eagerness", and that's where it
> >> stops being easy.
> >
> > I think we shouldn't be too stuck on
> >
> >>
> >>>
> >>>>
> >>>> If we want to avoid the implicit capping, I think there are the following
> >>>> possible approaches
> >>>>
> >>>> (1) Tolerate creep for now, maybe warning if the user configures it.
> >>>
> >>> I mean this seems a viable option if there is pressure to land this series
> >>> before we have a viable uAPI for configuring this.
> >>>
> >>> A part of me thinks we shouldn't rush series in for that reason though and
> >>> should require that we have a proper control here.
> >>>
> >>> But I guess this approach is the least-worst as it leaves us with the most
> >>> options moving forwards.
> >>
> >> Yes. There is also the alternative of respecting only 0 / 511 for mTHP
> >> collapse for now as discussed in the other thread.
> >
> > Yes I guess let's carry that on over there.
> >
> > I mean this is why I said it's better to try to keep things in one thread :) but
> > anyway, we've forked and can't be helped now.
> >
> > To be clear that was a criticism of - email development - not you.
> >
> > It's _extremely easy_ to have this happen because one thread naturally leads to
> > a broader discussion of a given topic, whereas another has questions from
> > somebody else about the same topic, to which people reply and then... you have a
> > fork and it can't be helped.
> >
> > I guess I'm saying it'd be good if we could say 'ok let's move this to X'.
> >
> > But that's also broken in its own way, you can't stop people from replying in
> > the other thread still and yeah. It's a limitation of this model :)
> >
> >>
> >>>
> >>>> (2) Avoid creep by counting zero-filled pages towards none_or_zero.
> >>>
> >>> Would this really make all that much difference?
> >>
> >> It solves the creep problem I think, but it's a bit nasty IMHO.
> >
> > Ah because you'd end up with a bunch of zeroed pages from the prior mTHP
> > collapses, interesting...
> >
> > Scanning for that does seem a bit nasty though yes...
> >
> >>
> >>>
> >>>> (3) Have separate toggles for each THP size. Doesn't quite solve the
> >>>> problem, only shifts it.
> >>>
> >>> Yeah I did wonder about this as an alternative solution. But of course it then
> >>> makes it vague what the parent values means in respect of the individual levels,
> >>> unless we have an 'inherit' mode there too (possible).
> >>>
> >>> It's going to be confusing though as max_ptes_none sits at the root khugepaged/
> >>> level and I don't think any other parameter from khugepaged/ is exposed at
> >>> individual page size levels.
> >>>
> >>> And of course doing this means we
> >>>
> >>>>
> >>>> Anything else?
> >>>
> >>> Err... I mean I'm not sure if you missed it but I suggested an approach in the
> >>> sub-thread - exposing mthp_max_ptes_none as a _READ-ONLY_ field at:
> >>>
> >>> /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
> >>>
> >>> Then we allow the capping, but simply document that we specify what the capped
> >>> value will be here for mTHP.
> >>
> >> I did not have time to read the details on that so far.
> >
> > OK. It is a bit nasty, yes. The idea is to find something that allows the
> > capping to work.
> >
> >>
> >> It would be one solution forward. I dislike it because I think the whole
> >> capping is an intermediate thing that can be (and likely must be, when
> >> considering mTHP underused shrinking I think) solved in the future
> >> differently. That's why I would prefer adding this only if there is no
> >> other, simpler, way forward.
> >
> > Yes I agree that if we could avoid it it'd be great.
> >
> > Really I proposed this solution on the basis that we were somehow ok with the
> > capping.
> >
> > If we can avoid that'd be ideal as it reduces complexity and 'unexpected'
> > behaviour.
> >
> > We'll clarify on the other thread, but the 511/0 was compelling to me before as
> > a simplification, and if we can have a straightforward model of how mTHP
> > collapse across none/zero page PTEs behaves this is ideal.
> >
> > The only question is w.r.t. warnings etc. but we can handle details there.
> >
> >>
> >>>
> >>> That struck me as the simplest way of getting this series landed without
> >>> necessarily violating any future eagerness which:
> >>>
> >>> a. Must still support khugepaged/max_ptes_none - we aren't getting away from
> >>> this, it's uAPI.
> >>>
> >>> b. Surely must want to do different things for mTHP in eagerness, so if we're
> >>> exposing some PTE value in max_ptes_none doing so in
> >>> khugepaged/mthp_max_ptes_none wouldn't be problematic (note again - it's
> >>> readonly so unlike max_ptes_none we don't have to worry about the other
> >>> direction).
> >>>
> >>> HOWEVER, eagerness might want to change this behaviour per-mTHP size, in
> >>> which case perhaps mthp_max_ptes_none would be problematic in that it is some
> >>> kind of average.
> >>>
> >>> Then again we could always revert to putting this parameter as in (3) in that
> >>> case, ugly but kinda viable.
> >>>
> >>>>
> >>>> IIUC, creep is less of a problem when we have the underused shrinker
> >>>> enabled: whatever we over-allocated can (unless longterm-pinned etc) get
> >>>> reclaimed again.
> >>>>
> >>>> So maybe having underused-shrinker support for mTHP as well would be a
> >>>> solution to tackle (1) later?
> >>>
> >>> How viable is this in the short term?
> >>
> >> I once started looking into it, but it will require quite some work, because
> >> the lists will essentially include each and every (m)THP in the system ...
> >> so I think we will need some redesign.
> >
> > Ack.
> >
> > This aligns with non-0/511 settings being non-functional for mTHP atm anyway.
> >
> >>
> >>>
> >>> Another possible solution:
> >>>
> >>> If mthp_max_ptes_none is not workable, we could have a toggle at, e.g.:
> >>>
> >>> /sys/kernel/mm/transparent_hugepage/khugepaged/mthp_cap_collapse_none
> >>>
> >>> As a simple boolean. If switched on then we document that it caps mTHP as
> >>> per Nico's suggestion.
> >>>
> >>> That way we avoid the 'silent' issue I have with all this and it's an
> >>> explicit setting.
> >>
> >> Right, but it's another toggle I wish we wouldn't need. We could of course
> >> also make it some compile-time option, but not sure if that's really any
> >> better.
> >>
> >> I'd hope we find an easy way forward that doesn't require new toggles, at
> >> least for now ...
> >
> > Right, well I agree if we can make this 0/511 thing work, let's do that.
> >
> > Toggle are just 'least worst' workarounds on assumption of the need for capping.
>
> I finally finished reading through the discussions across multiple
> threads:), and it looks like we've reached a preliminary consensus (make
> 0/511 work). Great and thanks!
>
> IIUC, the strategy is, configuring it to 511 means always enabling mTHP
> collapse, configuring it to 0 means collapsing mTHP only if all PTEs are
> non-none/zero, and for other values, we issue a warning and prohibit
> mTHP collapse (avoid Lorenzo's concern about silently changing
> max_ptes_none). Then the implementation for collapse_max_ptes_none()
> should be as follows:
>
> static int collapse_max_ptes_none(unsigned int order, bool full_scan)
> {
>         /* ignore max_ptes_none limits */
>         if (full_scan)
>                 return HPAGE_PMD_NR - 1;
>
>         if (order == HPAGE_PMD_ORDER)
>                 return khugepaged_max_ptes_none;
>
>         /*
>          * To prevent creeping towards larger order collapses for mTHP collapse,
>          * we restrict khugepaged_max_ptes_none to only 511 or 0, simplifying the
>          * logic. This means:
>          *   max_ptes_none == 511 -> collapse mTHP always
>          *   max_ptes_none == 0   -> collapse mTHP only if all PTEs are non-none/zero
>          */
>         if (!khugepaged_max_ptes_none || khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
>                 return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>
>         pr_warn_once("mTHP collapse only supports khugepaged_max_ptes_none configured as 0 or %d\n",
>                      HPAGE_PMD_NR - 1);
>         return -EINVAL;
> }
>
> So what do you think?
Yes, I'm glad we finally came to some consensus, despite it being a
less than ideal solution.
Hopefully the eagerness patchset re-introduces all the lost
functionality in the future.
Cheers
-- Nico
>
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 2:09 ` Baolin Wang
2025-10-29 2:49 ` Nico Pache
@ 2025-10-29 18:55 ` Lorenzo Stoakes
2025-10-29 21:14 ` Nico Pache
1 sibling, 1 reply; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-29 18:55 UTC (permalink / raw)
To: Baolin Wang
Cc: David Hildenbrand, Nico Pache, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 29, 2025 at 10:09:43AM +0800, Baolin Wang wrote:
> I finally finished reading through the discussions across multiple
> threads:), and it looks like we've reached a preliminary consensus (make
> 0/511 work). Great and thanks!
Yes we're getting there :) it's a sincere effort to try to find a way to move
forwards.
>
> IIUC, the strategy is, configuring it to 511 means always enabling mTHP
> collapse, configuring it to 0 means collapsing mTHP only if all PTEs are
> non-none/zero, and for other values, we issue a warning and prohibit mTHP
> collapse (avoid Lorenzo's concern about silently changing max_ptes_none).
> Then the implementation for collapse_max_ptes_none() should be as follows:
>
> static int collapse_max_ptes_none(unsigned int order, bool full_scan)
> {
>         /* ignore max_ptes_none limits */
>         if (full_scan)
>                 return HPAGE_PMD_NR - 1;
>
>         if (order == HPAGE_PMD_ORDER)
>                 return khugepaged_max_ptes_none;
>
>         /*
>          * To prevent creeping towards larger order collapses for mTHP collapse,
>          * we restrict khugepaged_max_ptes_none to only 511 or 0, simplifying the
>          * logic. This means:
>          *   max_ptes_none == 511 -> collapse mTHP always
>          *   max_ptes_none == 0   -> collapse mTHP only if all PTEs are non-none/zero
>          */
>         if (!khugepaged_max_ptes_none || khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
>                 return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>
>         pr_warn_once("mTHP collapse only supports khugepaged_max_ptes_none configured as 0 or %d\n",
>                      HPAGE_PMD_NR - 1);
>         return -EINVAL;
> }
>
> So what do you think?
Yeah I think something like this.
Though I'd implement it more explicitly like:
/* Zero/non-present collapse disabled. */
if (!khugepaged_max_ptes_none)
        return 0;

/* Collapse the maximum number of zero/non-present PTEs. */
if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
        return (1 << order) - 1;
Then we can do away with this confusing (HPAGE_PMD_ORDER - order) stuff.
A quick check in google sheets suggests my maths is ok here but do correct me if
I'm wrong :)
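For anyone who wants to sanity-check the arithmetic without a spreadsheet, here
is a minimal userspace sketch (not kernel code; it assumes 4K pages, so
HPAGE_PMD_ORDER == 9 and HPAGE_PMD_NR == 512) asserting that the explicit form
matches the existing shift for the two supported settings:

#include <assert.h>

int main(void)
{
        const unsigned int pmd_order = 9;            /* HPAGE_PMD_ORDER on 4K pages */
        const unsigned int pmd_nr = 1u << pmd_order; /* HPAGE_PMD_NR == 512 */
        unsigned int order;

        for (order = 2; order < pmd_order; order++) {
                /* max_ptes_none == 511: the shift form equals (1 << order) - 1 */
                assert(((pmd_nr - 1) >> (pmd_order - order)) == (1u << order) - 1);
                /* max_ptes_none == 0: both forms return 0 */
                assert((0u >> (pmd_order - order)) == 0);
        }
        return 0;
}

In other words 511 is 2^9 - 1, so shifting it right by (HPAGE_PMD_ORDER - order)
just drops low bits and leaves 2^order - 1; the explicit (1 << order) - 1 makes
that visible rather than implicit.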
Cheers, Lorenzo
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 18:55 ` Lorenzo Stoakes
@ 2025-10-29 21:14 ` Nico Pache
2025-10-30 1:15 ` Baolin Wang
0 siblings, 1 reply; 91+ messages in thread
From: Nico Pache @ 2025-10-29 21:14 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Baolin Wang, David Hildenbrand, linux-kernel, linux-trace-kernel,
linux-mm, linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 29, 2025 at 12:56 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 29, 2025 at 10:09:43AM +0800, Baolin Wang wrote:
> > I finally finished reading through the discussions across multiple
> > threads:), and it looks like we've reached a preliminary consensus (make
> > 0/511 work). Great and thanks!
>
> Yes we're getting there :) it's a sincere effort to try to find a way to move
> forwards.
>
> >
> > IIUC, the strategy is, configuring it to 511 means always enabling mTHP
> > collapse, configuring it to 0 means collapsing mTHP only if all PTEs are
> > non-none/zero, and for other values, we issue a warning and prohibit mTHP
> > collapse (avoid Lorenzo's concern about silently changing max_ptes_none).
> > Then the implementation for collapse_max_ptes_none() should be as follows:
> >
> > static int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > {
> >         /* ignore max_ptes_none limits */
> >         if (full_scan)
> >                 return HPAGE_PMD_NR - 1;
> >
> >         if (order == HPAGE_PMD_ORDER)
> >                 return khugepaged_max_ptes_none;
> >
> >         /*
> >          * To prevent creeping towards larger order collapses for mTHP collapse,
> >          * we restrict khugepaged_max_ptes_none to only 511 or 0, simplifying the
> >          * logic. This means:
> >          *   max_ptes_none == 511 -> collapse mTHP always
> >          *   max_ptes_none == 0   -> collapse mTHP only if all PTEs are non-none/zero
> >          */
> >         if (!khugepaged_max_ptes_none || khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
> >                 return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> >
> >         pr_warn_once("mTHP collapse only supports khugepaged_max_ptes_none configured as 0 or %d\n",
> >                      HPAGE_PMD_NR - 1);
> >         return -EINVAL;
> > }
> >
> > So what do you think?
>
> Yeah I think something like this.
>
> Though I'd implement it more explicitly like:
>
> /* Zero/non-present collapse disabled. */
> if (!khugepaged_max_ptes_none)
> return 0;
>
> /* Collapse the maximum number of zero/non-present PTEs. */
> if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
> return (1 << order) - 1;
>
> Then we can do away with this confusing (HPAGE_PMD_ORDER - order) stuff.
This looks cleaner/more explicit given the limits we are enforcing!
I'll go for something like that.
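For reference, a minimal sketch of how the helper might look with those
explicit branches folded into Baolin's structure (this only captures the
0/511 policy discussed above, not a final patch):

static int collapse_max_ptes_none(unsigned int order, bool full_scan)
{
        /* madvise_collapse ignores the max_ptes_none limits */
        if (full_scan)
                return HPAGE_PMD_NR - 1;

        if (order == HPAGE_PMD_ORDER)
                return khugepaged_max_ptes_none;

        /* Zero/non-present collapse disabled for mTHP. */
        if (!khugepaged_max_ptes_none)
                return 0;

        /* Collapse the maximum number of zero/non-present PTEs. */
        if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
                return (1 << order) - 1;

        /* Intermediate values are not supported for mTHP collapse. */
        pr_warn_once("mTHP collapse only supports khugepaged_max_ptes_none set to 0 or %d\n",
                     HPAGE_PMD_NR - 1);
        return -EINVAL;
}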
>
> A quick check in google sheets suggests my maths is ok here but do correct me if
> I'm wrong :)
LGTM!
Thanks for all the reviews! I'm glad we were able to find a solution :)
-- Nico
>
> Cheers, Lorenzo
>
^ permalink raw reply [flat|nested] 91+ messages in thread* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 21:14 ` Nico Pache
@ 2025-10-30 1:15 ` Baolin Wang
0 siblings, 0 replies; 91+ messages in thread
From: Baolin Wang @ 2025-10-30 1:15 UTC (permalink / raw)
To: Nico Pache, Lorenzo Stoakes
Cc: David Hildenbrand, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On 2025/10/30 05:14, Nico Pache wrote:
> On Wed, Oct 29, 2025 at 12:56 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
>>
>> On Wed, Oct 29, 2025 at 10:09:43AM +0800, Baolin Wang wrote:
>>> I finally finished reading through the discussions across multiple
>>> threads:), and it looks like we've reached a preliminary consensus (make
>>> 0/511 work). Great and thanks!
>>
>> Yes we're getting there :) it's a sincere effort to try to find a way to move
>> forwards.
>>
>>>
>>> IIUC, the strategy is, configuring it to 511 means always enabling mTHP
>>> collapse, configuring it to 0 means collapsing mTHP only if all PTEs are
>>> non-none/zero, and for other values, we issue a warning and prohibit mTHP
>>> collapse (avoid Lorenzo's concern about silently changing max_ptes_none).
>>> Then the implementation for collapse_max_ptes_none() should be as follows:
>>>
>>> static int collapse_max_ptes_none(unsigned int order, bool full_scan)
>>> {
>>>         /* ignore max_ptes_none limits */
>>>         if (full_scan)
>>>                 return HPAGE_PMD_NR - 1;
>>>
>>>         if (order == HPAGE_PMD_ORDER)
>>>                 return khugepaged_max_ptes_none;
>>>
>>>         /*
>>>          * To prevent creeping towards larger order collapses for mTHP collapse,
>>>          * we restrict khugepaged_max_ptes_none to only 511 or 0, simplifying the
>>>          * logic. This means:
>>>          *   max_ptes_none == 511 -> collapse mTHP always
>>>          *   max_ptes_none == 0   -> collapse mTHP only if all PTEs are non-none/zero
>>>          */
>>>         if (!khugepaged_max_ptes_none || khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
>>>                 return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>>>
>>>         pr_warn_once("mTHP collapse only supports khugepaged_max_ptes_none configured as 0 or %d\n",
>>>                      HPAGE_PMD_NR - 1);
>>>         return -EINVAL;
>>> }
>>>
>>> So what do you think?
>>
>> Yeah I think something like this.
>>
>> Though I'd implement it more explicitly like:
>>
>> /* Zero/non-present collapse disabled. */
>> if (!khugepaged_max_ptes_none)
>> return 0;
>>
>> /* Collapse the maximum number of zero/non-present PTEs. */
>> if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
>> return (1 << order) - 1;
>>
>> Then we can do away with this confusing (HPAGE_PMD_ORDER - order) stuff.
>
> This looks cleaner/more explicit given the limits we are enforcing!
>
> I'll go for something like that.
>
>>
>> A quick check in google sheets suggests my maths is ok here but do correct me if
>> I'm wrong :)
>
> LGTM!
LGTM. Thanks.
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:59 ` Lorenzo Stoakes
2025-10-28 19:08 ` Lorenzo Stoakes
2025-10-29 2:09 ` Baolin Wang
@ 2025-10-29 2:47 ` Nico Pache
2025-10-29 18:58 ` Lorenzo Stoakes
2025-10-31 11:12 ` David Hildenbrand
3 siblings, 1 reply; 91+ messages in thread
From: Nico Pache @ 2025-10-29 2:47 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: David Hildenbrand, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Tue, Oct 28, 2025 at 1:00 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Tue, Oct 28, 2025 at 07:08:38PM +0100, David Hildenbrand wrote:
> >
> > > > > Hey Lorenzo,
> > > > >
> > > > > > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > > > > > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > > > > > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> > > > >
> > > > > I spoke to David and he said to continue forward with this series; the
> > > > > "eagerness" tunable will take some time, and may require further
> > > > > considerations/discussion.
> > > >
> > > > Right, after talking to Johannes it got clearer that what we envisioned with
> > >
> > > I'm not sure that you meant to say go ahead with the series as-is with this
> > > silent capping?
> >
> > No, "go ahead" as in "let's find some way forward that works for all and is
> > not too crazy".
>
> Right we clearly needed to discuss that further at the time but that's moot now,
> we're figuring it out now :)
>
> >
> > [...]
> >
> > > > "eagerness" would not be like swappiness, and we will really have to be
> > > > careful here. I don't know yet when I will have time to look into that.
> > >
> > > I guess I missed this part of the conversation, what do you mean?
> >
> > Johannes raised issues with that on the list and afterwards we had an
> > offline discussion about some of the details and why something unpredictable
> > is not good.
>
> Could we get these details on-list so we can discuss them? This doesn't have to
> be urgent, but I would like to have a say in this or at least be part of the
> conversation please.
>
> >
> > >
> > > The whole concept is that we have a parameter whose value is _abstracted_ and
> > > which we control what it means.
> > >
> > > I'm not sure exactly why that would now be problematic? The fundamental concept
> > > seems sound no? Last I remember of the conversation this was the case.
> >
> > The basic idea was to do something abstracted as swappiness. Turns out
> > "swappiness" is really something predictable, not something we can randomly
> > change how it behaves under the hood.
> >
> > So we'd have to find something similar for "eagerness", and that's where it
> > stops being easy.
>
> I think we shouldn't be too stuck on
>
> >
> > >
> > > >
> > > > If we want to avoid the implicit capping, I think there are the following
> > > > possible approaches
> > > >
> > > > (1) Tolerate creep for now, maybe warning if the user configures it.
> > >
> > > I mean this seems a viable option if there is pressure to land this series
> > > before we have a viable uAPI for configuring this.
> > >
> > > A part of me thinks we shouldn't rush series in for that reason though and
> > > should require that we have a proper control here.
> > >
> > > But I guess this approach is the least-worst as it leaves us with the most
> > > options moving forwards.
> >
> > Yes. There is also the alternative of respecting only 0 / 511 for mTHP
> > collapse for now as discussed in the other thread.
>
> Yes I guess let's carry that on over there.
>
> I mean this is why I said it's better to try to keep things in one thread :) but
> anyway, we've forked and can't be helped now.
>
> To be clear that was a criticism of - email development - not you.
>
> It's _extremely easy_ to have this happen because one thread naturally leads to
> a broader discussion of a given topic, whereas another has questions from
> somebody else about the same topic, to which people reply and then... you have a
> fork and it can't be helped.
>
> I guess I'm saying it'd be good if we could say 'ok let's move this to X'.
>
> But that's also broken in its own way, you can't stop people from replying in
> the other thread still and yeah. It's a limitation of this model :)
>
> >
> > >
> > > > (2) Avoid creep by counting zero-filled pages towards none_or_zero.
> > >
> > > Would this really make all that much difference?
> >
> > It solves the creep problem I think, but it's a bit nasty IMHO.
>
> Ah because you'd end up with a bunch of zeroed pages from the prior mTHP
> collapses, interesting...
>
> Scanning for that does seem a bit nasty though yes...
>
> >
> > >
> > > > (3) Have separate toggles for each THP size. Doesn't quite solve the
> > > > problem, only shifts it.
> > >
> > > Yeah I did wonder about this as an alternative solution. But of course it then
> > > makes it vague what the parent values means in respect of the individual levels,
> > > unless we have an 'inherit' mode there too (possible).
> > >
> > > It's going to be confusing though as max_ptes_none sits at the root khugepaged/
> > > level and I don't think any other parameter from khugepaged/ is exposed at
> > > individual page size levels.
> > >
> > > And of course doing this means we
> > >
> > > >
> > > > Anything else?
> > >
> > > Err... I mean I'm not sure if you missed it but I suggested an approach in the
> > > sub-thread - exposing mthp_max_ptes_none as a _READ-ONLY_ field at:
> > >
> > > /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
> > >
> > > Then we allow the capping, but simply document that we specify what the capped
> > > value will be here for mTHP.
> >
> > I did not have time to read the details on that so far.
>
> OK. It is a bit nasty, yes. The idea is to find something that allows the
> capping to work.
>
> >
> > It would be one solution forward. I dislike it because I think the whole
> > capping is an intermediate thing that can be (and likely must be, when
> > considering mTHP underused shrinking I think) solved in the future
> > differently. That's why I would prefer adding this only if there is no
> > other, simpler, way forward.
>
> Yes I agree that if we could avoid it it'd be great.
>
> Really I proposed this solution on the basis that we were somehow ok with the
> capping.
>
> If we can avoid that'd be ideal as it reduces complexity and 'unexpected'
> behaviour.
>
> We'll clarify on the other thread, but the 511/0 was compelling to me before as
> a simplification, and if we can have a straightforward model of how mTHP
> collapse across none/zero page PTEs behaves this is ideal.
>
> The only question is w.r.t. warnings etc. but we can handle details there.
>
> >
> > >
> > > That struck me as the simplest way of getting this series landed without
> > > necessarily violating any future eagerness which:
> > >
> > > a. Must still support khugepaged/max_ptes_none - we aren't getting away from
> > > this, it's uAPI.
> > >
> > > b. Surely must want to do different things for mTHP in eagerness, so if we're
> > > exposing some PTE value in max_ptes_none doing so in
> > > khugepaged/mthp_max_ptes_none wouldn't be problematic (note again - it's
> > > readonly so unlike max_ptes_none we don't have to worry about the other
> > > direction).
> > >
> > > HOWEVER, eagerness might want to change this behaviour per-mTHP size, in
> > > which case perhaps mthp_max_ptes_none would be problematic in that it is some
> > > kind of average.
> > >
> > > Then again we could always revert to putting this parameter as in (3) in that
> > > case, ugly but kinda viable.
> > >
> > > >
> > > > IIUC, creep is less of a problem when we have the underused shrinker
> > > > enabled: whatever we over-allocated can (unless longterm-pinned etc) get
> > > > reclaimed again.
> > > >
> > > > So maybe having underused-shrinker support for mTHP as well would be a
> > > > solution to tackle (1) later?
> > >
> > > How viable is this in the short term?
> >
> > I once started looking into it, but it will require quite some work, because
> > the lists will essentially include each and every (m)THP in the system ...
> > so I think we will need some redesign.
>
> Ack.
>
> This aligns with non-0/511 settings being non-functional for mTHP atm anyway.
>
> >
> > >
> > > Another possible solution:
> > >
> > > If mthp_max_ptes_none is not workable, we could have a toggle at, e.g.:
> > >
> > > /sys/kernel/mm/transparent_hugepage/khugepaged/mthp_cap_collapse_none
> > >
> > > As a simple boolean. If switched on then we document that it caps mTHP as
> > > per Nico's suggestion.
> > >
> > > That way we avoid the 'silent' issue I have with all this and it's an
> > > explicit setting.
> >
> > Right, but it's another toggle I wish we wouldn't need. We could of course
> > also make it some compile-time option, but not sure if that's really any
> > better.
> >
> > I'd hope we find an easy way forward that doesn't require new toggles, at
> > least for now ...
>
> Right, well I agree if we can make this 0/511 thing work, let's do that.
Ok, great, some consensus! I will go ahead with that solution.
Just to make sure we are all on the same page,
the max_ptes_none value will be treated as 0 for anything other than
PMD collapse, or in the case of 511. Or will the max_ptes_none only
work for mTHP collapse when it is 0.
static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
{
        unsigned int max_ptes_none = khugepaged_max_ptes_none;

        /* ignore max_ptes_none limits */
        if (full_scan)
                return HPAGE_PMD_NR - 1;

        if (order == HPAGE_PMD_ORDER)
                return khugepaged_max_ptes_none;

        if (khugepaged_max_ptes_none != HPAGE_PMD_NR - 1)
                return 0;

        return max_ptes_none >> (HPAGE_PMD_ORDER - order);
}
Here's the implementation for the first approach, looks like Baolin
was able to catch up and beat me to the other solution while I was
mulling over the thread lol
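To make the practical difference between the two snippets concrete (numbers
are hypothetical, assuming 4K pages): take max_ptes_none = 255 and an order-4
(16-page) candidate region with 12 present PTEs:

  - the snippet above (treat any non-511 value as 0 for mTHP): the threshold
    becomes 0, so this candidate is skipped, but a fully populated order-4
    region would still collapse;
  - Baolin's version (warn and return -EINVAL): mTHP collapse is skipped
    entirely until max_ptes_none is set back to 0 or 511.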
Cheers,
-- Nico
>
> Toggle are just 'least worst' workarounds on assumption of the need for capping.
>
> >
> > --
> > Cheers
> >
> > David / dhildenb
> >
>
> Thanks, Lorenzo
>
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 2:47 ` Nico Pache
@ 2025-10-29 18:58 ` Lorenzo Stoakes
2025-10-29 21:23 ` Nico Pache
0 siblings, 1 reply; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-29 18:58 UTC (permalink / raw)
To: Nico Pache
Cc: David Hildenbrand, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Tue, Oct 28, 2025 at 08:47:12PM -0600, Nico Pache wrote:
> On Tue, Oct 28, 2025 at 1:00 PM Lorenzo Stoakes
> > Right, well I agree if we can make this 0/511 thing work, let's do that.
>
> Ok, great, some consensus! I will go ahead with that solution.
:) awesome.
>
> Just to make sure we are all on the same page,
I am still stabilising my understanding of the creep issue, see the thread
where David kindly + patiently goes in detail, I think I am at a
(pre-examining algorithm itself) broad understanding of this.
>
> the max_ptes_none value will be treated as 0 for anything other than
> PMD collapse, or in the case of 511. Or will the max_ptes_none only
> work for mTHP collapse when it is 0.
511 implies always collapse zero/none, 0 implies never, as I understand it.
>
> static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> {
> unsigned int max_ptes_none;
>
> /* ignore max_ptes_none limits */
> if (full_scan)
> return HPAGE_PMD_NR - 1;
>
> if (order == HPAGE_PMD_ORDER)
> return khugepaged_max_ptes_none;
>
> if (khugepaged_max_ptes_none != HPAGE_PMD_NR - 1)
> return 0;
>
> return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> }
>
> Here's the implementation for the first approach, looks like Baolin
> was able to catch up and beat me to the other solution while I was
> mulling over the thread lol
Broadly looks similar to Baolin's, I made some suggestions over there
though!
>
> Cheers,
> -- Nico
Thanks, Lorenzo
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 18:58 ` Lorenzo Stoakes
@ 2025-10-29 21:23 ` Nico Pache
2025-10-30 10:15 ` Lorenzo Stoakes
0 siblings, 1 reply; 91+ messages in thread
From: Nico Pache @ 2025-10-29 21:23 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: David Hildenbrand, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 29, 2025 at 12:59 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Tue, Oct 28, 2025 at 08:47:12PM -0600, Nico Pache wrote:
> > On Tue, Oct 28, 2025 at 1:00 PM Lorenzo Stoakes
> > > Right, well I agree if we can make this 0/511 thing work, let's do that.
> >
> > Ok, great, some consensus! I will go ahead with that solution.
>
> :) awesome.
>
> >
> > Just to make sure we are all on the same page,
>
> I am still stabilising my understanding of the creep issue, see the thread
> where David kindly + patiently goes in detail, I think I am at a
> (pre-examining algorithm itself) broad understanding of this.
I added some details of the creep issue in my other replies, hopefully
that also helps!
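For a concrete illustration of the creep (numbers are hypothetical; this
assumes 4K pages, so HPAGE_PMD_NR == 512, and the old proportional scaling
max_ptes_none >> (HPAGE_PMD_ORDER - order)): take max_ptes_none = 300, which
is above HPAGE_PMD_NR / 2, and a 16-PTE region with only 7 present pages:

  order 4 (16 PTEs):  threshold 300 >> 5 = 9,    9 none  <= 9   -> collapse, 16 present
  order 5 (32 PTEs):  threshold 300 >> 4 = 18,   16 none <= 18  -> collapse, 32 present
  order 6 (64 PTEs):  threshold 300 >> 3 = 37,   32 none <= 37  -> collapse, 64 present
  order 7 (128 PTEs): threshold 300 >> 2 = 75,   64 none <= 75  -> collapse, 128 present
  order 8 (256 PTEs): threshold 300 >> 1 = 150,  128 none <= 150 -> collapse, 256 present
  order 9 (PMD):      threshold 300,             256 none <= 300 -> collapse, full THP

Assuming the surrounding regions stay aligned and otherwise empty, 7 touched
pages creep, one scan at a time, into a full PMD THP.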
>
> >
> > the max_ptes_none value will be treated as 0 for anything other than
> > PMD collapse, or in the case of 511. Or will the max_ptes_none only
> > work for mTHP collapse when it is 0.
>
> 511 implies always collapse zero/none, 0 implies never, as I understand it.
0 implies only collapse if a given mTHP size is fully occupied by
present PTEs. Since we start at PMD and work our way down we will
always end up with a PMD range of fully occupied mTHPs, potentially of
all different sizes.
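As a quick worked example of that (layout hypothetical, assuming 4K pages and
how I understand the top-down scan): say PTEs 0-383 of a PMD range are present
and PTEs 384-511 are none, with max_ptes_none == 0:

  order 9 (512 PTEs): 128 none                -> fail
  order 8 (256 PTEs): [0,255] fully present   -> collapse; [256,511] fails
  order 7 (128 PTEs): [256,383] fully present -> collapse; [384,511] fails
  lower orders on [384,511]: all none         -> fail

So the 384 present pages end up covered by one order-8 and one order-7 mTHP,
and nothing is allocated for the untouched tail.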
>
> >
> > static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > {
> > unsigned int max_ptes_none;
> >
> > /* ignore max_ptes_none limits */
> > if (full_scan)
> > return HPAGE_PMD_NR - 1;
> >
> > if (order == HPAGE_PMD_ORDER)
> > return khugepaged_max_ptes_none;
> >
> > if (khugepaged_max_ptes_none != HPAGE_PMD_NR - 1)
> > return 0;
> >
> > return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > }
> >
> > Here's the implementation for the first approach, looks like Baolin
> > was able to catch up and beat me to the other solution while I was
> > mulling over the thread lol
>
> Broadly looks similar to Baolin's, I made some suggestions over there
> though!
Thanks! They are both based on my current collapse_max_ptes_none! Just
a slight difference in behavior surrounding the two suggested
solutions by David.
I will still have to implement the logic for not attempting mTHP
collapses if it is any intermediate value (i.e. the function returns
-EINVAL).
-- Nico
>
> >
> > Cheers,
> > -- Nico
>
> Thanks, Lorenzo
>
^ permalink raw reply [flat|nested] 91+ messages in thread* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-29 21:23 ` Nico Pache
@ 2025-10-30 10:15 ` Lorenzo Stoakes
0 siblings, 0 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-30 10:15 UTC (permalink / raw)
To: Nico Pache
Cc: David Hildenbrand, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 29, 2025 at 03:23:27PM -0600, Nico Pache wrote:
> On Wed, Oct 29, 2025 at 12:59 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > On Tue, Oct 28, 2025 at 08:47:12PM -0600, Nico Pache wrote:
> > > On Tue, Oct 28, 2025 at 1:00 PM Lorenzo Stoakes
> > > > Right, well I agree if we can make this 0/511 thing work, let's do that.
> > >
> > > Ok, great, some consensus! I will go ahead with that solution.
> >
> > :) awesome.
> >
> > >
> > > Just to make sure we are all on the same page,
> >
> > I am still stabilising my understanding of the creep issue, see the thread
> > where David kindly + patiently goes in detail, I think I am at a
> > (pre-examining algorithm itself) broad understanding of this.
>
> I added some details of the creep issue in my other replies, hopefully
> that also helps!
>
> >
> > >
> > > the max_ptes_none value will be treated as 0 for anything other than
> > > PMD collapse, or in the case of 511. Or will the max_ptes_none only
> > > work for mTHP collapse when it is 0.
> >
> > 511 implies always collapse zero/none, 0 implies never, as I understand it.
>
> 0 implies only collapse if a given mTHP size is fully occupied by
> present PTEs. Since we start at PMD and work our way down we will
> always end up with a PMD range of fully occupied mTHPs, potentially of
> all different sizes.
Yeah this was my understanding, I mean terminology is tricky here (+ I am
probably not being entirely clear tbh), so I mean less so '0 means no
collapse' but rather '0 means no collapse of zero/none' but of course can
allow for collapse of present PTEs (within the same VMA).
>
> >
> > >
> > > static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > > {
> > > unsigned int max_ptes_none;
> > >
> > > /* ignore max_ptes_none limits */
> > > if (full_scan)
> > > return HPAGE_PMD_NR - 1;
> > >
> > > if (order == HPAGE_PMD_ORDER)
> > > return khugepaged_max_ptes_none;
> > >
> > > if (khugepaged_max_ptes_none != HPAGE_PMD_NR - 1)
> > > return 0;
> > >
> > > return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > > }
> > >
> > > Here's the implementation for the first approach, looks like Baolin
> > > was able to catch up and beat me to the other solution while I was
> > > mulling over the thread lol
> >
> > Broadly looks similar to Baolin's, I made some suggestions over there
> > though!
>
> Thanks! They are both based on my current collapse_max_ptes_none! Just
> a slight difference in behavior surrounding the two suggested
> solutions by David.
Yes which is convenient as it's less delta for you!
>
> I will still have to implement the logic for not attempting mTHP
> collapses if it is any intermediate value (i.e. the function returns
> -EINVAL).
Ack
>
> -- Nico
>
> >
> > >
> > > Cheers,
> > > -- Nico
> >
> > Thanks, Lorenzo
> >
>
Cheers, Lorenzo
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 18:59 ` Lorenzo Stoakes
` (2 preceding siblings ...)
2025-10-29 2:47 ` Nico Pache
@ 2025-10-31 11:12 ` David Hildenbrand
3 siblings, 0 replies; 91+ messages in thread
From: David Hildenbrand @ 2025-10-31 11:12 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
>>>> "eagerness" would not be like swappiness, and we will really have to be
>>>> careful here. I don't know yet when I will have time to look into that.
>>>
>>> I guess I missed this part of the converastion, what do you mean?
>>
>> Johannes raised issues with that on the list and afterwards we had an
>> offline discussion about some of the details and why something unpredictable
>> is not good.
>
> Could we get these details on-list so we can discuss them? This doesn't have to
> be urgent, but I would like to have a say in this or at least be part of the
> converastion please.
Sorry, I only now found time to reply on this point. Johannes raised the
point in [1], and afterwards we went a bit into detail in an off-list
discussion.
In essence, I think he is right that this is something we have to be very
careful about. So it turned out as something that will take a lot more
time+effort on my side than I originally thought, making it not
feasible in the short term given how I already lag behind on so many
other things.
So I concluded that it's probably best to have such an effort be
independent of this series. And in some way it is either way, because
max_ptes_none is just a horrible interface given the values are
architecture dependent.
I'll be happy if we can focus in this series on the bare minimum initial
support, and avoid any magic (scaling / capping) as it all turned out to
be much more tricky (interaction with the deferred shrinker ...) than
most of us initially thought.
But I think we're already on the same page here, just wanted to share a
bit more details on the max_ptes_none vs. eagerness idea.
[1] https://lkml.kernel.org/r/20250915134359.GA827803@cmpxchg.org
--
Cheers
David / dhildenb
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 13:36 ` Nico Pache
2025-10-28 14:15 ` David Hildenbrand
@ 2025-10-28 16:57 ` Lorenzo Stoakes
2025-10-28 17:49 ` David Hildenbrand
1 sibling, 1 reply; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 16:57 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Tue, Oct 28, 2025 at 07:36:55AM -0600, Nico Pache wrote:
> On Mon, Oct 27, 2025 at 11:54 AM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > On Wed, Oct 22, 2025 at 12:37:08PM -0600, Nico Pache wrote:
> > > The current mechanism for determining mTHP collapse scales the
> > > khugepaged_max_ptes_none value based on the target order. This
> > > introduces an undesirable feedback loop, or "creep", when max_ptes_none
> > > is set to a value greater than HPAGE_PMD_NR / 2.
> > >
> > > With this configuration, a successful collapse to order N will populate
> > > enough pages to satisfy the collapse condition on order N+1 on the next
> > > scan. This leads to unnecessary work and memory churn.
> > >
> > > To fix this issue introduce a helper function that caps the max_ptes_none
> > > to HPAGE_PMD_NR / 2 - 1 (255 on 4k page size). The function also scales
> > > the max_ptes_none number by the (PMD_ORDER - target collapse order).
> > >
> > > The limits can be ignored by passing full_scan=true, this is useful for
> > > madvise_collapse (which ignores limits), or in the case of
> > > collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
> > > collapse is available.
> > >
> > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > ---
> > > mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
> > > 1 file changed, 34 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index 4ccebf5dda97..286c3a7afdee 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
> > > wake_up_interruptible(&khugepaged_wait);
> > > }
> > >
> > > +/**
> > > + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
> > > + * @order: The folio order being collapsed to
> > > + * @full_scan: Whether this is a full scan (ignore limits)
> > > + *
> > > + * For madvise-triggered collapses (full_scan=true), all limits are bypassed
> > > + * and allow up to HPAGE_PMD_NR - 1 empty PTEs.
> > > + *
> > > + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> > > + * khugepaged_max_ptes_none value.
> > > + *
> > > + * For mTHP collapses, scale down the max_ptes_none proportionally to the folio
> > > + * order, but caps it at HPAGE_PMD_NR/2-1 to prevent a collapse feedback loop.
> > > + *
> > > + * Return: Maximum number of empty PTEs allowed for the collapse operation
> > > + */
> > > +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > > +{
> > > + unsigned int max_ptes_none;
> > > +
> > > + /* ignore max_ptes_none limits */
> > > + if (full_scan)
> > > + return HPAGE_PMD_NR - 1;
> > > +
> > > + if (order == HPAGE_PMD_ORDER)
> > > + return khugepaged_max_ptes_none;
> > > +
> > > + max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
> >
>
> Hey Lorenzo,
>
> > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
>
> I spoke to David and he said to continue forward with this series; the
> "eagerness" tunable will take some time, and may require further
> considerations/discussion.
It would be good to communicate this in the patch, I wasn't aware he had said go
ahead with it. Maybe I missed the mail.
Also others might not be aware. When you're explicitly ignoring prior
review from 2 versions ago you really do need to spell out why, at least for
civility's sake.
Apologies if there was communication I've forgotten about/missed. But
either way please can we very explicitly communicate these things.
>
> >
> > I'm still really quite uncomfortable with us silently capping this value.
> >
> > If we're putting forward theoretical ideas that are to be later built upon, this
> > series should be an RFC.
> >
> > But if we really intend to silently ignore user input the problem is that then
> > becomes established uAPI.
> >
> > I think it's _sensible_ to avoid this mTHP escalation problem, but the issue is
> > visibility I think.
> >
> > I think people are going to find it odd that you set it to something, but then
> > get something else.
>
> The alternative solution is to not support max_ptes_none for mTHP
> collapse and not allow none/zero pages. This is essentially "capping"
> the value too.
No that alternative equally _silently_ ignores the user-specified tunable,
which is my objection.
The problem you have here is max_ptes_none _defaults_ to a value that
violates the cap for mTHP (511).
So neither solution is workable.
>
> >
> > As an alternative we could have a new sysfs field:
> >
> > /sys/kernel/mm/transparent_hugepage/khugepaged/max_mthp_ptes_none
> >
> > That shows the cap clearly.
> >
> > In fact, it could be read-only... and just expose it to the user. That reduces
> > complexity.
>
> I agree with Baolin here; adding another tunable will only increase
> the complexity for our future goals, and also provides needless
> insight into the internals when they can not be customized.
We already have needless insight into internals with max_ptes_none which we can
never, ever remove due to uAPI so that ship has sailed I'm afraid.
I don't personally think adding a read-only view of this data really makes
that much worse.
Also if we do go ahead with eagerness, I expect we are going to want to
have different max_ptes_none values for mTHP/non-mTHP.
We _will_ need to convert between max_ptes_none and eagerness in some way
(though when eagerness comes along, we can start having 'detent' values,
that is if a user specifies max_ptes_none of 237 we could change it to 128
for instance) and as a result show eagerness _in terms of_ max_ptes_none.
Since we _have_ to do this for uAPI reasons, it doesn't seem really that
harmful or adding to complexity to do the equivalent for a _read-only_
field for mTHP.
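Purely to illustrate the 'detent' idea (the table, values and helper below are
hypothetical, not a proposed interface):

/* Hypothetical detents an eagerness-style control could snap to. */
static const unsigned int max_ptes_none_detents[] = { 0, 64, 128, 256, 511 };

/* Round a user-supplied max_ptes_none down to the nearest detent,
 * e.g. 237 -> 128 as in the example above. */
static unsigned int round_down_to_detent(unsigned int max_ptes_none)
{
        int i;

        for (i = ARRAY_SIZE(max_ptes_none_detents) - 1; i >= 0; i--) {
                if (max_ptes_none_detents[i] <= max_ptes_none)
                        return max_ptes_none_detents[i];
        }
        return 0;
}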
AFAIC this patch right now is not upstreamable for the simple reason of
violating user expectation (even if that expectation might be silly) and
_silently_ updating max_ptes_none for mTHP.
So this suggestion was designed to try to get us towards something
upstreamable.
So it's not a case of 'sorry I don't like that we can't do it' + we go
ahead with things as they are, it's a case of - we really need to find a
way to do this not-silently or AFAICT, the series is blocked on this until
this is resolved.
Perhaps we should have discussed 'what to do for v12' more on-list and
could have avoided this ahead of time...
Thanks, Lorenzo
>
> Cheers,
> -- Nico
>
> >
> > We can then bring in eagerness later and have the same situation of
> > max_ptes_none being a parameter that exists (plus this additional read-only
> > parameter).
> >
> > > +
> > > + return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > > +
> > > +}
> > > +
> > > void khugepaged_enter_vma(struct vm_area_struct *vma,
> > > vm_flags_t vm_flags)
> > > {
> > > @@ -546,7 +579,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > > pte_t *_pte;
> > > int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> > > const unsigned long nr_pages = 1UL << order;
> > > - int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > > + int max_ptes_none = collapse_max_ptes_none(order, !cc->is_khugepaged);
> > >
> > > for (_pte = pte; _pte < pte + nr_pages;
> > > _pte++, addr += PAGE_SIZE) {
> > > --
> > > 2.51.0
> > >
> >
>
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 16:57 ` Lorenzo Stoakes
@ 2025-10-28 17:49 ` David Hildenbrand
2025-10-28 17:59 ` Lorenzo Stoakes
0 siblings, 1 reply; 91+ messages in thread
From: David Hildenbrand @ 2025-10-28 17:49 UTC (permalink / raw)
To: Lorenzo Stoakes, Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, ziy,
baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
>> Hey Lorenzo,
>>
>>> I mean not to beat a dead horse re: v11 commentary, but I thought we were going
>>> to implement David's idea re: the new 'eagerness' tunable, and again we're now just
>>> implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
>>
>> I spoke to David and he said to continue forward with this series; the
>> "eagerness" tunable will take some time, and may require further
>> considerations/discussion.
>
> It would be good to communicate this in the patch, I wasn't aware he had said go
> ahead with it. Maybe I missed the mail.
Just to clarify: yes, I think we should find a way to move forward with
this series without an eagerness toggle.
That doesn't imply that we'll be using the capping as proposed here (I
hate it, it's just tricky to work around it for now).
And ideally, we can do that without any temporary tunables, because I'm
sure it is a problem we can solve internally long-term.
--
Cheers
David / dhildenb
* Re: [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function
2025-10-28 17:49 ` David Hildenbrand
@ 2025-10-28 17:59 ` Lorenzo Stoakes
0 siblings, 0 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-28 17:59 UTC (permalink / raw)
To: David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Tue, Oct 28, 2025 at 06:49:48PM +0100, David Hildenbrand wrote:
> > > Hey Lorenzo,
> > >
> > > > I mean not to beat a dead horse re: v11 commentary, but I thought we were going
> > > > to implement David's idea re: the new 'eagerness' tunable, and again we're now just
> > > > implementing the capping at HPAGE_PMD_NR/2 - 1 thing again?
> > >
> > > I spoke to David and he said to continue forward with this series; the
> > > "eagerness" tunable will take some time, and may require further
> > > considerations/discussion.
> >
> > It would be good to communicate this in the patch, I wasn't aware he had said go
> > ahead with it. Maybe I missed the mail.
>
> Just to clarify: yes, I think we should find a way to move forward with this
> series without an eagerness toggle.
OK, let's please communicate this clearly in future. Maybe I missed the comms on
that.
>
> That doesn't imply that we'll be using the capping as proposed here (I hate
> it, it's just tricky to work around it for now).
OK well this is what I thought, that you hadn't meant that we should go ahead
with the logic completely unaltered from that which was explicitly pushed back
on in v10 I think.
We obviously need to figure out a way forward on this so let's get that
done as quickly as we can.
>
> And ideally, we can do that without any temporary tunables, because I'm sure
> it is a problem we can solve internally long-term.
I went into great detail replying on the relevant thread about this, that's
have that discussion there for sanity's sake.
>
> --
> Cheers
>
> David / dhildenb
>
Thanks, Lorenzo
* [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (5 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 06/15] khugepaged: introduce collapse_max_ptes_none helper function Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 3:25 ` Baolin Wang
2025-11-06 18:14 ` Lorenzo Stoakes
2025-10-22 18:37 ` [PATCH v12 mm-new 08/15] khugepaged: skip collapsing mTHP to smaller orders Nico Pache
` (8 subsequent siblings)
15 siblings, 2 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
Pass an order and offset to collapse_huge_page to support collapsing anon
memory to arbitrary orders within a PMD. order indicates what mTHP size we
are attempting to collapse to, and offset indicates where in the PMD to
start the collapse attempt.
For non-PMD collapse we must leave the anon VMA write locked until after
we collapse the mTHP; in the PMD case all the pages are isolated, but in
the mTHP case this is not true, and we must keep the lock to prevent
changes to the VMA from occurring.
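As a concrete example of the new parameters (values hypothetical, assuming 4K
pages): an order-4 (64K) collapse attempt starting 32 PTEs into the PMD range
works out as

        unsigned long pmd_address  = 0x7f0000000000UL;  /* PMD-aligned */
        unsigned int  order        = 4;
        unsigned long offset       = 32;
        unsigned long mthp_address = pmd_address + offset * PAGE_SIZE;
        /* mthp_address == 0x7f0000020000, and the collapse covers
         * [mthp_address, mthp_address + (PAGE_SIZE << order)). */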
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 108 ++++++++++++++++++++++++++++++------------------
1 file changed, 67 insertions(+), 41 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 286c3a7afdee..75e7ebdccc36 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1142,43 +1142,50 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
return SCAN_SUCCEED;
}
-static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
- int referenced, int unmapped,
- struct collapse_control *cc)
+static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
+ int referenced, int unmapped, struct collapse_control *cc,
+ bool *mmap_locked, unsigned int order, unsigned long offset)
{
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
- pte_t *pte;
+ pte_t *pte = NULL, mthp_pte;
pgtable_t pgtable;
struct folio *folio;
spinlock_t *pmd_ptl, *pte_ptl;
int result = SCAN_FAIL;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
+ bool anon_vma_locked = false;
+ const unsigned long nr_pages = 1UL << order;
+ unsigned long mthp_address = pmd_address + offset * PAGE_SIZE;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(pmd_address & ~HPAGE_PMD_MASK);
/*
* Before allocating the hugepage, release the mmap_lock read lock.
* The allocation can take potentially a long time if it involves
* sync compaction, and we do not need to hold the mmap_lock during
* that. We will recheck the vma after taking it again in write mode.
+ * If collapsing mTHPs we may have already released the read_lock.
*/
- mmap_read_unlock(mm);
+ if (*mmap_locked) {
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+ }
- result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
+ result = alloc_charge_folio(&folio, mm, cc, order);
if (result != SCAN_SUCCEED)
goto out_nolock;
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
- HPAGE_PMD_ORDER);
+ *mmap_locked = true;
+ result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ result = find_pmd_or_thp_or_none(mm, pmd_address, &pmd);
if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
@@ -1190,13 +1197,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* released when it fails. So we jump out_nolock directly in
* that case. Continuing to collapse causes inconsistency.
*/
- result = __collapse_huge_page_swapin(mm, vma, address, pmd,
- referenced, HPAGE_PMD_ORDER);
+ result = __collapse_huge_page_swapin(mm, vma, mthp_address, pmd,
+ referenced, order);
if (result != SCAN_SUCCEED)
goto out_nolock;
}
mmap_read_unlock(mm);
+ *mmap_locked = false;
/*
* Prevent all access to pagetables with the exception of
* gup_fast later handled by the ptep_clear_flush and the VM
@@ -1206,20 +1214,20 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* mmap_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
- HPAGE_PMD_ORDER);
+ result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
vma_start_write(vma);
- result = check_pmd_still_valid(mm, address, pmd);
+ result = check_pmd_still_valid(mm, pmd_address, pmd);
if (result != SCAN_SUCCEED)
goto out_up_write;
anon_vma_lock_write(vma->anon_vma);
+ anon_vma_locked = true;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
- address + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, mthp_address,
+ mthp_address + (PAGE_SIZE << order));
mmu_notifier_invalidate_range_start(&range);
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
@@ -1231,24 +1239,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* Parallel GUP-fast is fine since GUP-fast will back off when
* it detects PMD is changed.
*/
- _pmd = pmdp_collapse_flush(vma, address, pmd);
+ _pmd = pmdp_collapse_flush(vma, pmd_address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
tlb_remove_table_sync_one();
- pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
+ pte = pte_offset_map_lock(mm, &_pmd, mthp_address, &pte_ptl);
if (pte) {
- result = __collapse_huge_page_isolate(vma, address, pte, cc,
- HPAGE_PMD_ORDER,
- &compound_pagelist);
+ result = __collapse_huge_page_isolate(vma, mthp_address, pte, cc,
+ order, &compound_pagelist);
spin_unlock(pte_ptl);
} else {
result = SCAN_PMD_NULL;
}
if (unlikely(result != SCAN_SUCCEED)) {
- if (pte)
- pte_unmap(pte);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
/*
@@ -1258,21 +1263,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
*/
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
- anon_vma_unlock_write(vma->anon_vma);
goto out_up_write;
}
/*
- * All pages are isolated and locked so anon_vma rmap
- * can't run anymore.
+ * For PMD collapse all pages are isolated and locked so anon_vma
+ * rmap can't run anymore. For mTHP collapse we must hold the lock
*/
- anon_vma_unlock_write(vma->anon_vma);
+ if (order == HPAGE_PMD_ORDER) {
+ anon_vma_unlock_write(vma->anon_vma);
+ anon_vma_locked = false;
+ }
result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
- vma, address, pte_ptl,
- HPAGE_PMD_ORDER,
- &compound_pagelist);
- pte_unmap(pte);
+ vma, mthp_address, pte_ptl,
+ order, &compound_pagelist);
if (unlikely(result != SCAN_SUCCEED))
goto out_up_write;
@@ -1282,20 +1287,42 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
* write.
*/
__folio_mark_uptodate(folio);
- pgtable = pmd_pgtable(_pmd);
+ if (order == HPAGE_PMD_ORDER) {
+ pgtable = pmd_pgtable(_pmd);
- spin_lock(pmd_ptl);
- BUG_ON(!pmd_none(*pmd));
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
- map_anon_folio_pmd_nopf(folio, pmd, vma, address);
- spin_unlock(pmd_ptl);
+ spin_lock(pmd_ptl);
+ WARN_ON_ONCE(!pmd_none(*pmd));
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_address);
+ spin_unlock(pmd_ptl);
+ } else { /* mTHP collapse */
+ mthp_pte = mk_pte(folio_page(folio, 0), vma->vm_page_prot);
+ mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma);
+
+ spin_lock(pmd_ptl);
+ WARN_ON_ONCE(!pmd_none(*pmd));
+ folio_ref_add(folio, nr_pages - 1);
+ folio_add_new_anon_rmap(folio, vma, mthp_address, RMAP_EXCLUSIVE);
+ folio_add_lru_vma(folio, vma);
+ set_ptes(vma->vm_mm, mthp_address, pte, mthp_pte, nr_pages);
+ update_mmu_cache_range(NULL, vma, mthp_address, pte, nr_pages);
+
+ smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
+ pmd_populate(mm, pmd, pmd_pgtable(_pmd));
+ spin_unlock(pmd_ptl);
+ }
folio = NULL;
result = SCAN_SUCCEED;
out_up_write:
+ if (anon_vma_locked)
+ anon_vma_unlock_write(vma->anon_vma);
+ if (pte)
+ pte_unmap(pte);
mmap_write_unlock(mm);
out_nolock:
+ *mmap_locked = false;
if (folio)
folio_put(folio);
trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
@@ -1463,9 +1490,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
result = collapse_huge_page(mm, start_addr, referenced,
- unmapped, cc);
- /* collapse_huge_page will return with the mmap_lock released */
- *mmap_locked = false;
+ unmapped, cc, mmap_locked,
+ HPAGE_PMD_ORDER, 0);
}
out:
trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
--
2.51.0
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse
2025-10-22 18:37 ` [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse Nico Pache
@ 2025-10-27 3:25 ` Baolin Wang
2025-11-06 18:14 ` Lorenzo Stoakes
1 sibling, 0 replies; 91+ messages in thread
From: Baolin Wang @ 2025-10-27 3:25 UTC (permalink / raw)
To: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On 2025/10/23 02:37, Nico Pache wrote:
> Pass an order and offset to collapse_huge_page to support collapsing anon
> memory to arbitrary orders within a PMD. order indicates what mTHP size we
> are attempting to collapse to, and offset indicates where in the PMD to
> start the collapse attempt.
>
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> the mTHP case this is not true, and we must keep the lock to prevent
> changes to the VMA from occurring.
>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
LGTM. And passed my mTHP collapse testing cases. So:
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse
2025-10-22 18:37 ` [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse Nico Pache
2025-10-27 3:25 ` Baolin Wang
@ 2025-11-06 18:14 ` Lorenzo Stoakes
2025-11-07 3:09 ` Dev Jain
2025-11-07 19:33 ` Nico Pache
1 sibling, 2 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-11-06 18:14 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:09PM -0600, Nico Pache wrote:
> Pass an order and offset to collapse_huge_page to support collapsing anon
> memory to arbitrary orders within a PMD. order indicates what mTHP size we
> are attempting to collapse to, and offset indicates where in the PMD to
> start the collapse attempt.
>
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
NIT but is this -- a typo?
> the mTHP case this is not true, and we must keep the lock to prevent
> changes to the VMA from occurring.
>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> mm/khugepaged.c | 108 ++++++++++++++++++++++++++++++------------------
> 1 file changed, 67 insertions(+), 41 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 286c3a7afdee..75e7ebdccc36 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1142,43 +1142,50 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> return SCAN_SUCCEED;
> }
>
> -static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> - int referenced, int unmapped,
> - struct collapse_control *cc)
> +static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
Presumably pmd_address is the PMD-aligned address?
> + int referenced, int unmapped, struct collapse_control *cc,
> + bool *mmap_locked, unsigned int order, unsigned long offset)
It'd be nice to pass through a helper struct at this point having so many params
but perhaps we can deal with that in a follow up series.
If PMD address is the PMD-aligned address, and mthp_address = pmd_address +
offset * PAGE_SIZE, couldn't we just pass in the mthp address and get the
PMD address by aligning down to PMD size and reduce the number of args by
1?
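Something like this perhaps (just a sketch, reusing the names from this patch):
	/* derive the PMD-aligned base from the mTHP address */
	const unsigned long pmd_address = mthp_address & HPAGE_PMD_MASK;
Then only the mthp address (plus the order) needs passing in.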
> {
> LIST_HEAD(compound_pagelist);
> pmd_t *pmd, _pmd;
> - pte_t *pte;
> + pte_t *pte = NULL, mthp_pte;
mthp_pte is only used in a single if () branch and can be declared there
AFAICT?
> pgtable_t pgtable;
> struct folio *folio;
> spinlock_t *pmd_ptl, *pte_ptl;
> int result = SCAN_FAIL;
> struct vm_area_struct *vma;
> struct mmu_notifier_range range;
> + bool anon_vma_locked = false;
> + const unsigned long nr_pages = 1UL << order;
> + unsigned long mthp_address = pmd_address + offset * PAGE_SIZE;
Do we ever update this? If not we can const-ify.
>
> - VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> + VM_BUG_ON(pmd_address & ~HPAGE_PMD_MASK);
NIT: Be nice to convert this to a VM_WARN_ON_ONCE(), as VM_BUG_ON() is not
right here.
>
> /*
> * Before allocating the hugepage, release the mmap_lock read lock.
> * The allocation can take potentially a long time if it involves
> * sync compaction, and we do not need to hold the mmap_lock during
> * that. We will recheck the vma after taking it again in write mode.
> + * If collapsing mTHPs we may have already released the read_lock.
> */
> - mmap_read_unlock(mm);
> + if (*mmap_locked) {
> + mmap_read_unlock(mm);
> + *mmap_locked = false;
> + }
>
> - result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> + result = alloc_charge_folio(&folio, mm, cc, order);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
>
> mmap_read_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> - HPAGE_PMD_ORDER);
> + *mmap_locked = true;
> + result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
> if (result != SCAN_SUCCEED) {
> mmap_read_unlock(mm);
I don't really love the semantics of 'sometimes we set *mmap_locked false
when we unlock, sometimes we rely on out_nolock doing it'.
Let's just set it false when we unlock and VM_WARN_ON_ONCE(*mmap_locked) in
out_nolock.
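i.e. roughly (sketch only):
	mmap_read_unlock(mm);
	*mmap_locked = false;
	goto out_nolock;
	...
out_nolock:
	VM_WARN_ON_ONCE(*mmap_locked);
so every unlock site keeps the flag in sync and the label only sanity-checks it.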
> goto out_nolock;
> }
>
> - result = find_pmd_or_thp_or_none(mm, address, &pmd);
> + result = find_pmd_or_thp_or_none(mm, pmd_address, &pmd);
> if (result != SCAN_SUCCEED) {
> mmap_read_unlock(mm);
> goto out_nolock;
> @@ -1190,13 +1197,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * released when it fails. So we jump out_nolock directly in
> * that case. Continuing to collapse causes inconsistency.
> */
> - result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> - referenced, HPAGE_PMD_ORDER);
> + result = __collapse_huge_page_swapin(mm, vma, mthp_address, pmd,
> + referenced, order);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
> }
>
> mmap_read_unlock(mm);
> + *mmap_locked = false;
> /*
> * Prevent all access to pagetables with the exception of
> * gup_fast later handled by the ptep_clear_flush and the VM
> @@ -1206,20 +1214,20 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * mmap_lock.
> */
> mmap_write_lock(mm);
> - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> - HPAGE_PMD_ORDER);
> + result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
> if (result != SCAN_SUCCEED)
> goto out_up_write;
> /* check if the pmd is still valid */
> vma_start_write(vma);
> - result = check_pmd_still_valid(mm, address, pmd);
> + result = check_pmd_still_valid(mm, pmd_address, pmd);
> if (result != SCAN_SUCCEED)
> goto out_up_write;
>
> anon_vma_lock_write(vma->anon_vma);
> + anon_vma_locked = true;
>
> - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> - address + HPAGE_PMD_SIZE);
> + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, mthp_address,
> + mthp_address + (PAGE_SIZE << order));
> mmu_notifier_invalidate_range_start(&range);
>
> pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> @@ -1231,24 +1239,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * Parallel GUP-fast is fine since GUP-fast will back off when
> * it detects PMD is changed.
> */
> - _pmd = pmdp_collapse_flush(vma, address, pmd);
> + _pmd = pmdp_collapse_flush(vma, pmd_address, pmd);
Not your fault but I do so hate this _p** convention. One for a follow up I
suppose.
> spin_unlock(pmd_ptl);
> mmu_notifier_invalidate_range_end(&range);
> tlb_remove_table_sync_one();
>
> - pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> + pte = pte_offset_map_lock(mm, &_pmd, mthp_address, &pte_ptl);
> if (pte) {
> - result = __collapse_huge_page_isolate(vma, address, pte, cc,
> - HPAGE_PMD_ORDER,
> - &compound_pagelist);
> + result = __collapse_huge_page_isolate(vma, mthp_address, pte, cc,
> + order, &compound_pagelist);
> spin_unlock(pte_ptl);
> } else {
> result = SCAN_PMD_NULL;
> }
>
> if (unlikely(result != SCAN_SUCCEED)) {
> - if (pte)
> - pte_unmap(pte);
OK I guess we drop this because it's handled in out_up_write. I assume no
issue keeping PTE mapped here?
> spin_lock(pmd_ptl);
> BUG_ON(!pmd_none(*pmd));
> /*
> @@ -1258,21 +1263,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> */
> pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> spin_unlock(pmd_ptl);
> - anon_vma_unlock_write(vma->anon_vma);
> goto out_up_write;
> }
>
> /*
> - * All pages are isolated and locked so anon_vma rmap
> - * can't run anymore.
> + * For PMD collapse all pages are isolated and locked so anon_vma
> + * rmap can't run anymore. For mTHP collapse we must hold the lock
> */
> - anon_vma_unlock_write(vma->anon_vma);
> + if (order == HPAGE_PMD_ORDER) {
> + anon_vma_unlock_write(vma->anon_vma);
> + anon_vma_locked = false;
> + }
>
> result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> - vma, address, pte_ptl,
> - HPAGE_PMD_ORDER,
> - &compound_pagelist);
> - pte_unmap(pte);
> + vma, mthp_address, pte_ptl,
> + order, &compound_pagelist);
Looking through __collapse_huge_page_copy() there doesn't seem to be any
issue with holding anon lock here.
> if (unlikely(result != SCAN_SUCCEED))
> goto out_up_write;
>
> @@ -1282,20 +1287,42 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> * write.
> */
> __folio_mark_uptodate(folio);
> - pgtable = pmd_pgtable(_pmd);
> + if (order == HPAGE_PMD_ORDER) {
> + pgtable = pmd_pgtable(_pmd);
>
> - spin_lock(pmd_ptl);
> - BUG_ON(!pmd_none(*pmd));
> - pgtable_trans_huge_deposit(mm, pmd, pgtable);
> - map_anon_folio_pmd_nopf(folio, pmd, vma, address);
> - spin_unlock(pmd_ptl);
> + spin_lock(pmd_ptl);
> + WARN_ON_ONCE(!pmd_none(*pmd));
> + pgtable_trans_huge_deposit(mm, pmd, pgtable);
> + map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_address);
> + spin_unlock(pmd_ptl);
> + } else { /* mTHP collapse */
As per above, let's just declare mthp_pte here.
> + mthp_pte = mk_pte(folio_page(folio, 0), vma->vm_page_prot);
Hm, so we make a PTE that references the first page of the folio? I guess
the folio will be an mTHP folio so we're just creating essentially a
> + mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma);
In set_pte_range() we have a whole host of other checks like dirty,
uffd_wp, etc. I wonder if we need to consider those?
> +
> + spin_lock(pmd_ptl);
We're duplicating this in both branches, why not do outside if/else?
> + WARN_ON_ONCE(!pmd_none(*pmd));
Hmm so the PMD entry will still always be empty on mTHP collapse? Surely we
could be collapsing more than one mTHP into an existing PTE table no? I may
be missing something here/confused :)
> + folio_ref_add(folio, nr_pages - 1);
If we're setting the refcount here, where is the ref count being set in the
PMD path?
> + folio_add_new_anon_rmap(folio, vma, mthp_address, RMAP_EXCLUSIVE);
> + folio_add_lru_vma(folio, vma);
> + set_ptes(vma->vm_mm, mthp_address, pte, mthp_pte, nr_pages);
> + update_mmu_cache_range(NULL, vma, mthp_address, pte, nr_pages);
Prior to this change the only user of this are functions in memory.c, I
do wonder if this is the wrong abstraction here.
But maybe that's _yet another_ thing for a follow up (the THP code is a
mess).
> +
> + smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
Feels like we could avoid open-coding this by just using pmd_install()?
Also are we therefore missing a mm_inc_nr_ptes() invocation here, or do we
update mm->pgtables_bytes elsewhere?
> + pmd_populate(mm, pmd, pmd_pgtable(_pmd));
Why are we referencing pmd in PMD branch and _pmd here?
> + spin_unlock(pmd_ptl);
The PMD case does this stuff in map_anon_folio_pmd_nopf(), could we add one for
mTHP?
This function is already horribly overwrought (not your fault) so I'd like
to avoid adding open-coded blocks as much as possible.
> + }
>
> folio = NULL;
>
> result = SCAN_SUCCEED;
> out_up_write:
> + if (anon_vma_locked)
> + anon_vma_unlock_write(vma->anon_vma);
> + if (pte)
> + pte_unmap(pte);
> mmap_write_unlock(mm);
> out_nolock:
> + *mmap_locked = false;
See above comment about setting this prior to jumping to out_nolock.
> if (folio)
> folio_put(folio);
> trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> @@ -1463,9 +1490,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> pte_unmap_unlock(pte, ptl);
> if (result == SCAN_SUCCEED) {
> result = collapse_huge_page(mm, start_addr, referenced,
> - unmapped, cc);
> - /* collapse_huge_page will return with the mmap_lock released */
> - *mmap_locked = false;
> + unmapped, cc, mmap_locked,
> + HPAGE_PMD_ORDER, 0);
> }
> out:
> trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
> --
> 2.51.0
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse
2025-11-06 18:14 ` Lorenzo Stoakes
@ 2025-11-07 3:09 ` Dev Jain
2025-11-07 9:18 ` Lorenzo Stoakes
2025-11-07 19:33 ` Nico Pache
1 sibling, 1 reply; 91+ messages in thread
From: Dev Jain @ 2025-11-07 3:09 UTC (permalink / raw)
To: Lorenzo Stoakes, Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, corbet, rostedt,
mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
> ----------[snip]------------
>
>> +
>> + spin_lock(pmd_ptl);
> We're duplicating this in both branches, why not do outside if/else?
>
>> + WARN_ON_ONCE(!pmd_none(*pmd));
> Hmm so the PMD entry will still always be empty on mTHP collapse? Surely we
> could be collapsing more than one mTHP into an existing PTE table no? I may
> be missing something here/confused :)
After this code path isolates the PTE table, we don't want any other code path
doing "Hey, I see an empty PMD, let's install a PTE table here". One of the
reasons why all the heavy locking is required here.
Also, I want to ask a question about WARN vs BUG_ON: suppose that the
race I described above occurs. After khugepaged isolates the PTE table, someone
faults in a PTE table there, and eventually writes data in the underlying folios.
Then the buggy khugepaged nukes out that table and installs a new one, installing
an mTHP folio which had old data. How do we decide whether such a condition is
worthy of a BUG_ON (leading to system crash) vs letting this pass with WARN?
>
> ------------[snip]----------
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse
2025-11-07 3:09 ` Dev Jain
@ 2025-11-07 9:18 ` Lorenzo Stoakes
0 siblings, 0 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-11-07 9:18 UTC (permalink / raw)
To: Dev Jain
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, david, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
corbet, rostedt, mhiramat, mathieu.desnoyers, akpm, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Fri, Nov 07, 2025 at 08:39:03AM +0530, Dev Jain wrote:
> > ----------[snip]------------
Please when you snip can you not snip away the code being referenced?
That's really unhelpful and now this sub-thread loses a ton of context...
> >
> > > +
> > > + spin_lock(pmd_ptl);
> > We're duplicating this in both branches, why not do outside if/else?
> >
> > > + WARN_ON_ONCE(!pmd_none(*pmd));
> > Hmm so the PMD entry will still always be empty on mTHP collapse? Surely we
> > could be collapsing more than one mTHP into an existing PTE table no? I may
> > be missing something here/confused :)
>
> After this code path isolates the PTE table, we don't want any other code path
> doing "Hey, I see an empty PMD, let's install a PTE table here". One of the
> reasons why all the heavy locking is required here.
That wasn't the question, the question was why we are not able to install mTHP
entries in an existing PTE table.
I'm obviously aware that we need to lock here.
>
> Also, I want to ask a question about WARN vs BUG_ON: suppose that the
> race I described above occurs. After khugepaged isolates the PTE table, someone
> faults in a PTE table there, and eventually writes data in the underlying folios.
> Then the buggy khugepaged nukes out that table and installs a new one, installing
> an mTHP folio which had old data. How do we decide whether such a condition is
> worthy of a BUG_ON (leading to system crash) vs letting this pass with WARN?
To all intents and purposes just use a WARN_ON(). A BUG_ON() is almost
never right. This has been done to death.
Probably the WARN_ON() should be a VM_WARN_ON_ONCE() because this is
something that should simply not be happening in practice.
Or we can make if (WARN_ON_ONCE(...)) abort, but then we complicate already
very complicated code.
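For illustration, the abort variant would be something along the lines of
(sketch only, the error code is just a placeholder):
	if (VM_WARN_ON_ONCE(!pmd_none(*pmd))) {
		spin_unlock(pmd_ptl);
		result = SCAN_FAIL;	/* placeholder */
		goto out_up_write;
	}
plus whatever further unwinding is needed at that point, which is exactly the
sort of complication I'd rather we avoid.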
>
>
> >
> > ------------[snip]----------
> >
Thanks, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse
2025-11-06 18:14 ` Lorenzo Stoakes
2025-11-07 3:09 ` Dev Jain
@ 2025-11-07 19:33 ` Nico Pache
1 sibling, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-11-07 19:33 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Thu, Nov 6, 2025 at 11:15 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:09PM -0600, Nico Pache wrote:
> > Pass an order and offset to collapse_huge_page to support collapsing anon
> > memory to arbitrary orders within a PMD. order indicates what mTHP size we
> > are attempting to collapse to, and offset indicates where in the PMD to
> > start the collapse attempt.
> >
> > For non-PMD collapse we must leave the anon VMA write locked until after
> > we collapse the mTHP-- in the PMD case all the pages are isolated, but in
>
> NIT but is this -- a typo?
No, it's an em dash. I can replace it with a period if you'd like, but
both work in this context.
>
> > the mTHP case this is not true, and we must keep the lock to prevent
> > changes to the VMA from occurring.
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > mm/khugepaged.c | 108 ++++++++++++++++++++++++++++++------------------
> > 1 file changed, 67 insertions(+), 41 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 286c3a7afdee..75e7ebdccc36 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -1142,43 +1142,50 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> > return SCAN_SUCCEED;
> > }
> >
> > -static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > - int referenced, int unmapped,
> > - struct collapse_control *cc)
> > +static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
>
> Presumably pmd_address is the PMD-aligned address?
>
> > + int referenced, int unmapped, struct collapse_control *cc,
> > + bool *mmap_locked, unsigned int order, unsigned long offset)
>
> It'd be nice to pass through a helper struct at this point having so many params
> but perhaps we can deal with that in a follow up series.
>
> If PMD address is the PMD-aligned address, and mthp_address = pmd_address +
> offset * PAGE_SIZE, couldn't we just pass in the mthp address and get the
> PMD address by aligning down to PMD size and reduce the number of args by
> 1?
Yeah that seems like a good idea. Thanks
>
> > {
> > LIST_HEAD(compound_pagelist);
> > pmd_t *pmd, _pmd;
> > - pte_t *pte;
> > + pte_t *pte = NULL, mthp_pte;
>
> mthp_pte is only used in a single if () branch and can be declared there
> AFAICT?
ack!
>
> > pgtable_t pgtable;
> > struct folio *folio;
> > spinlock_t *pmd_ptl, *pte_ptl;
> > int result = SCAN_FAIL;
> > struct vm_area_struct *vma;
> > struct mmu_notifier_range range;
> > + bool anon_vma_locked = false;
> > + const unsigned long nr_pages = 1UL << order;
> > + unsigned long mthp_address = pmd_address + offset * PAGE_SIZE;
>
> Do we ever update this? If not we can const-ify.
ack!
>
> >
> > - VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> > + VM_BUG_ON(pmd_address & ~HPAGE_PMD_MASK);
>
> NIT: Be nice to convert this to a VM_WARN_ON_ONCE(), as VM_BUG_ON() is not
> right here.
>
> >
> > /*
> > * Before allocating the hugepage, release the mmap_lock read lock.
> > * The allocation can take potentially a long time if it involves
> > * sync compaction, and we do not need to hold the mmap_lock during
> > * that. We will recheck the vma after taking it again in write mode.
> > + * If collapsing mTHPs we may have already released the read_lock.
> > */
> > - mmap_read_unlock(mm);
> > + if (*mmap_locked) {
> > + mmap_read_unlock(mm);
> > + *mmap_locked = false;
> > + }
> >
> > - result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER);
> > + result = alloc_charge_folio(&folio, mm, cc, order);
> > if (result != SCAN_SUCCEED)
> > goto out_nolock;
> >
> > mmap_read_lock(mm);
> > - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> > - HPAGE_PMD_ORDER);
> > + *mmap_locked = true;
> > + result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
> > if (result != SCAN_SUCCEED) {
> > mmap_read_unlock(mm);
>
> I don't really love the semantics of 'sometimes we set *mmap_locked false
> when we unlock, sometimes we rely on out_nolock doing it'.
>
> Let's just set it false when we unlock and VM_WARN_ON_ONCE(*mmap_locked) in
> out_nolock.
Ok that sounds like a good idea! thanks
>
> > goto out_nolock;
> > }
> >
> > - result = find_pmd_or_thp_or_none(mm, address, &pmd);
> > + result = find_pmd_or_thp_or_none(mm, pmd_address, &pmd);
> > if (result != SCAN_SUCCEED) {
> > mmap_read_unlock(mm);
> > goto out_nolock;
> > @@ -1190,13 +1197,14 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > * released when it fails. So we jump out_nolock directly in
> > * that case. Continuing to collapse causes inconsistency.
> > */
> > - result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> > - referenced, HPAGE_PMD_ORDER);
> > + result = __collapse_huge_page_swapin(mm, vma, mthp_address, pmd,
> > + referenced, order);
> > if (result != SCAN_SUCCEED)
> > goto out_nolock;
> > }
> >
> > mmap_read_unlock(mm);
> > + *mmap_locked = false;
> > /*
> > * Prevent all access to pagetables with the exception of
> > * gup_fast later handled by the ptep_clear_flush and the VM
> > @@ -1206,20 +1214,20 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > * mmap_lock.
> > */
> > mmap_write_lock(mm);
> > - result = hugepage_vma_revalidate(mm, address, true, &vma, cc,
> > - HPAGE_PMD_ORDER);
> > + result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order);
> > if (result != SCAN_SUCCEED)
> > goto out_up_write;
> > /* check if the pmd is still valid */
> > vma_start_write(vma);
> > - result = check_pmd_still_valid(mm, address, pmd);
> > + result = check_pmd_still_valid(mm, pmd_address, pmd);
> > if (result != SCAN_SUCCEED)
> > goto out_up_write;
> >
> > anon_vma_lock_write(vma->anon_vma);
> > + anon_vma_locked = true;
> >
> > - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
> > - address + HPAGE_PMD_SIZE);
> > + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, mthp_address,
> > + mthp_address + (PAGE_SIZE << order));
> > mmu_notifier_invalidate_range_start(&range);
> >
> > pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
> > @@ -1231,24 +1239,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > * Parallel GUP-fast is fine since GUP-fast will back off when
> > * it detects PMD is changed.
> > */
> > - _pmd = pmdp_collapse_flush(vma, address, pmd);
> > + _pmd = pmdp_collapse_flush(vma, pmd_address, pmd);
>
> Not your fault but so hate this _p** convention. One for a follow up I
> suppose.
>
> > spin_unlock(pmd_ptl);
> > mmu_notifier_invalidate_range_end(&range);
> > tlb_remove_table_sync_one();
> >
> > - pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> > + pte = pte_offset_map_lock(mm, &_pmd, mthp_address, &pte_ptl);
> > if (pte) {
> > - result = __collapse_huge_page_isolate(vma, address, pte, cc,
> > - HPAGE_PMD_ORDER,
> > - &compound_pagelist);
> > + result = __collapse_huge_page_isolate(vma, mthp_address, pte, cc,
> > + order, &compound_pagelist);
> > spin_unlock(pte_ptl);
> > } else {
> > result = SCAN_PMD_NULL;
> > }
> >
> > if (unlikely(result != SCAN_SUCCEED)) {
> > - if (pte)
> > - pte_unmap(pte);
>
> OK I guess we drop this because it's handled in out_up_write. I assume no
> issue keeping PTE mapped here?
Correct, I don't think there are any issues here. The checks for pte
and anon_vma_locked in out_up_write should keep everything in order.
>
> > spin_lock(pmd_ptl);
> > BUG_ON(!pmd_none(*pmd));
> > /*
> > @@ -1258,21 +1263,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > */
> > pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> > spin_unlock(pmd_ptl);
> > - anon_vma_unlock_write(vma->anon_vma);
> > goto out_up_write;
> > }
> >
> > /*
> > - * All pages are isolated and locked so anon_vma rmap
> > - * can't run anymore.
> > + * For PMD collapse all pages are isolated and locked so anon_vma
> > + * rmap can't run anymore. For mTHP collapse we must hold the lock
> > */
> > - anon_vma_unlock_write(vma->anon_vma);
> > + if (order == HPAGE_PMD_ORDER) {
> > + anon_vma_unlock_write(vma->anon_vma);
> > + anon_vma_locked = false;
> > + }
> >
> > result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> > - vma, address, pte_ptl,
> > - HPAGE_PMD_ORDER,
> > - &compound_pagelist);
> > - pte_unmap(pte);
> > + vma, mthp_address, pte_ptl,
> > + order, &compound_pagelist);
>
> Looking through __collapse_huge_page_copy() there doesn't seem to be any
> issue with holding anon lock here.
>
> > if (unlikely(result != SCAN_SUCCEED))
> > goto out_up_write;
> >
> > @@ -1282,20 +1287,42 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > * write.
> > */
> > __folio_mark_uptodate(folio);
> > - pgtable = pmd_pgtable(_pmd);
> > + if (order == HPAGE_PMD_ORDER) {
> > + pgtable = pmd_pgtable(_pmd);
> >
> > - spin_lock(pmd_ptl);
> > - BUG_ON(!pmd_none(*pmd));
> > - pgtable_trans_huge_deposit(mm, pmd, pgtable);
> > - map_anon_folio_pmd_nopf(folio, pmd, vma, address);
> > - spin_unlock(pmd_ptl);
> > + spin_lock(pmd_ptl);
> > + WARN_ON_ONCE(!pmd_none(*pmd));
> > + pgtable_trans_huge_deposit(mm, pmd, pgtable);
> > + map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_address);
> > + spin_unlock(pmd_ptl);
> > + } else { /* mTHP collapse */
>
> As per above, let's just declare mthp_pte here.
ack
>
> > + mthp_pte = mk_pte(folio_page(folio, 0), vma->vm_page_prot);
>
> Hm, so we make a PTE that references the first page of the folio? I guess
> the folio will be an mTHP folio so we're just creating essentially a
>
> > + mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma);
>
> In set_pte_range() we have a whole host of other checks like dirty,
> uffd_wp, etc. I wonder if we need to consider those?
I don't believe so because those checks are coming from fault handling.
Here we are doing almost the same thing that the PMD case was doing
with some influence from do_anonymous_page()
>
> > +
> > + spin_lock(pmd_ptl);
>
> We're duplicating this in both branches, why not do outside if/else?
ack
>
> > + WARN_ON_ONCE(!pmd_none(*pmd));
>
> Hmm so the PMD entry will still always be empty on mTHP collapse? Surely we
> could be collapsing more than one mTHP into an existing PTE table no? I may
> be missing something here/confused :)
We remove the PMD entry to ensure no GUP-fast call can operate on this PMD.
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
* This removes any huge TLB entry from the CPU so we won't allow
* huge and small TLB entries for the same virtual address to
* avoid the risk of CPU bugs in that area.
*
* Parallel GUP-fast is fine since GUP-fast will back off when
* it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, pmd_address, pmd);
pmdp_collapse_flush removes the PMD
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
In the PMD case we install a new PMD, in the mTHP case (and in the
failure cases), we reinstall the same PMD once we are done/exit.
>
> > + folio_ref_add(folio, nr_pages - 1);
>
> If we're setting the refcount here, where is the ref count being set in the
> PMD path?
Both folios start out with a single ref. A PMD folio only needs 1 ref,
while an mTHP folio needs a ref for each PTE; hence we increment by
nr_pages - 1.
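For example, an order-4 mTHP covers 16 pages: the folio is allocated with one
reference, folio_ref_add(folio, 15) brings it to 16, and set_ptes() then maps
16 PTEs, one reference per PTE.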
>
> > + folio_add_new_anon_rmap(folio, vma, mthp_address, RMAP_EXCLUSIVE);
> > + folio_add_lru_vma(folio, vma);
> > + set_ptes(vma->vm_mm, mthp_address, pte, mthp_pte, nr_pages);
> > + update_mmu_cache_range(NULL, vma, mthp_address, pte, nr_pages);
>
> Prior to this change the only user of this are functions in memory.c, I
> do wonder if this is the wrong abstraction here.
>
> But maybe that's _yet another_ thing for a follow up (the THP code is a
> mess).
Yes, I tried to do something similar to the new
map_anon_folio_pmd_nopf, but it proved to be harder than expected. The
other cases that do similar operations all differ slightly so unifying
is going to be tricky/require more testing.
>
> > +
> > + smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
>
> Feels like we could avoid open-coding this by just using pmd_install()?
The locking seems to differ which may make that tricky.
>
> Also are we therefore missing a mm_inc_nr_ptes() invocation here, or do we
> update mm->pgtables_bytes elsewhere?
If I understand correctly, we already have accounted for the ptes when
we alloc'd them and their parent PMD. Since we are operating on an
already allocated PMD, I don't think we need to handle accounting for
PMD or mTHP collapse. I'll spend some time confirming this before
posting.
>
>
> > + pmd_populate(mm, pmd, pmd_pgtable(_pmd));
>
> Why are we referencing pmd in PMD branch and _pmd here?
I explained it a little more above, but we are reinstalling the
original PMD entry, which was removed for gup race reasons.
>
> > + spin_unlock(pmd_ptl);
>
> The PMD case does this stuff in map_anon_folio_pmd_nopf(), could we add one for
> mTHP?
Yes but I believe we should clean it up after. Unifying most of the
callers proved tricky.
>
> This function is already horribly overwrought (not your fault) so I'd like
> to avoid adding open-coded blocks as much as possible.
>
> > + }
> >
> > folio = NULL;
> >
> > result = SCAN_SUCCEED;
> > out_up_write:
> > + if (anon_vma_locked)
> > + anon_vma_unlock_write(vma->anon_vma);
> > + if (pte)
> > + pte_unmap(pte);
> > mmap_write_unlock(mm);
> > out_nolock:
> > + *mmap_locked = false;
>
> See above comment about setting this prior to jumping to out_nolock.
ack
Thanks for the reviews!
-- Nico
>
> > if (folio)
> > folio_put(folio);
> > trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
> > @@ -1463,9 +1490,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > pte_unmap_unlock(pte, ptl);
> > if (result == SCAN_SUCCEED) {
> > result = collapse_huge_page(mm, start_addr, referenced,
> > - unmapped, cc);
> > - /* collapse_huge_page will return with the mmap_lock released */
> > - *mmap_locked = false;
> > + unmapped, cc, mmap_locked,
> > + HPAGE_PMD_ORDER, 0);
> > }
> > out:
> > trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
> > --
> > 2.51.0
> >
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v12 mm-new 08/15] khugepaged: skip collapsing mTHP to smaller orders
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (6 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 07/15] khugepaged: generalize collapse_huge_page for mTHP collapse Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-22 18:37 ` [PATCH v12 mm-new 09/15] khugepaged: add per-order mTHP collapse failure statistics Nico Pache
` (7 subsequent siblings)
15 siblings, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
khugepaged may try to collapse an mTHP to a smaller mTHP, resulting in
some pages being unmapped. Skip these cases until we have a way to check
if it's ok to collapse to a smaller mTHP size (like in the case of a
partially mapped folio).
This patch is inspired by Dev Jain's work on khugepaged mTHP support [1].
[1] https://lore.kernel.org/lkml/20241216165105.56185-11-dev.jain@arm.com/
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand <david@redhat.com>
Co-developed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 75e7ebdccc36..d741af15e18c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -629,6 +629,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
}
+ /*
+ * TODO: In some cases of partially-mapped folios, we'd actually
+ * want to collapse.
+ */
+ if (order != HPAGE_PMD_ORDER && folio_order(folio) >= order) {
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
+ goto out;
+ }
if (folio_test_large(folio)) {
struct folio *f;
--
2.51.0
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v12 mm-new 09/15] khugepaged: add per-order mTHP collapse failure statistics
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (7 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 08/15] khugepaged: skip collapsing mTHP to smaller orders Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-11-06 18:45 ` Lorenzo Stoakes
2025-10-22 18:37 ` [PATCH v12 mm-new 10/15] khugepaged: improve tracepoints for mTHP orders Nico Pache
` (6 subsequent siblings)
15 siblings, 1 reply; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
Add three new mTHP statistics to track collapse failures for different
orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:
- collapse_exceed_swap_pte: Increment when mTHP collapse fails due to swap
PTEs
- collapse_exceed_none_pte: Counts when mTHP collapse fails due to
exceeding the none PTE threshold for the given order
- collapse_exceed_shared_pte: Counts when mTHP collapse fails due to shared
PTEs
These statistics complement the existing THP_SCAN_EXCEED_* events by
providing per-order granularity for mTHP collapse attempts. The stats are
exposed via sysfs under
`/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
supported hugepage size.
As we currently don't support collapsing mTHPs that contain a swap or
shared entry, those statistics keep track of how often we are
encountering failed mTHP collapses due to these restrictions.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
Documentation/admin-guide/mm/transhuge.rst | 23 ++++++++++++++++++++++
include/linux/huge_mm.h | 3 +++
mm/huge_memory.c | 7 +++++++
mm/khugepaged.c | 16 ++++++++++++---
4 files changed, 46 insertions(+), 3 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 13269a0074d4..7c71cda8aea1 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -709,6 +709,29 @@ nr_anon_partially_mapped
an anonymous THP as "partially mapped" and count it here, even though it
is not actually partially mapped anymore.
+collapse_exceed_none_pte
+ The number of anonymous mTHP pte ranges where the number of none PTEs
+ exceeded the max_ptes_none threshold. For mTHP collapse, khugepaged
+ checks a PMD region and tracks which PTEs are present. It then tries
+ to collapse to the largest enabled mTHP size. The allowed number of empty
+ PTEs is the max_ptes_none threshold scaled by the collapse order. This
+ counter records the number of times a collapse attempt was skipped for
+ this reason, and khugepaged moved on to try the next available mTHP size.
+
+collapse_exceed_swap_pte
+ The number of anonymous mTHP pte ranges which contain at least one swap
+ PTE. Currently khugepaged does not support collapsing mTHP regions
+ that contain a swap PTE. This counter can be used to monitor the
+ number of khugepaged mTHP collapses that failed due to the presence
+ of a swap PTE.
+
+collapse_exceed_shared_pte
+ The number of anonymous mTHP pte ranges which contain at least one shared
+ PTE. Currently khugepaged does not support collapsing mTHP pte ranges
+ that contain a shared PTE. This counter can be used to monitor the
+ number of khugepaged mTHP collapses that failed due to the presence
+ of a shared PTE.
+
As the system ages, allocating huge pages may be expensive as the
system uses memory compaction to copy data around memory to free a
huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 3d29624c4f3f..4b2773235041 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -144,6 +144,9 @@ enum mthp_stat_item {
MTHP_STAT_SPLIT_DEFERRED,
MTHP_STAT_NR_ANON,
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
+ MTHP_STAT_COLLAPSE_EXCEED_SWAP,
+ MTHP_STAT_COLLAPSE_EXCEED_NONE,
+ MTHP_STAT_COLLAPSE_EXCEED_SHARED,
__MTHP_STAT_COUNT
};
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0063d1ba926e..7335b92969d6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -638,6 +638,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+
static struct attribute *anon_stats_attrs[] = {
&anon_fault_alloc_attr.attr,
@@ -654,6 +658,9 @@ static struct attribute *anon_stats_attrs[] = {
&split_deferred_attr.attr,
&nr_anon_attr.attr,
&nr_anon_partially_mapped_attr.attr,
+ &collapse_exceed_swap_pte_attr.attr,
+ &collapse_exceed_none_pte_attr.attr,
+ &collapse_exceed_shared_pte_attr.attr,
NULL,
};
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d741af15e18c..053202141ea3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -592,7 +592,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
- count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ if (order == HPAGE_PMD_ORDER)
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
goto out;
}
}
@@ -622,10 +624,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* shared may cause a future higher order collapse on a
* rescan of the same range.
*/
- if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
- shared > khugepaged_max_ptes_shared)) {
+ if (order != HPAGE_PMD_ORDER) {
+ result = SCAN_EXCEED_SHARED_PTE;
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+ goto out;
+ }
+
+ if (cc->is_khugepaged &&
+ shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
goto out;
}
}
@@ -1073,6 +1082,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
* range.
*/
if (order != HPAGE_PMD_ORDER) {
+ count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
pte_unmap(pte);
mmap_read_unlock(mm);
result = SCAN_EXCEED_SWAP_PTE;
--
2.51.0
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 09/15] khugepaged: add per-order mTHP collapse failure statistics
2025-10-22 18:37 ` [PATCH v12 mm-new 09/15] khugepaged: add per-order mTHP collapse failure statistics Nico Pache
@ 2025-11-06 18:45 ` Lorenzo Stoakes
2025-11-07 17:14 ` Nico Pache
0 siblings, 1 reply; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-11-06 18:45 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:11PM -0600, Nico Pache wrote:
> Add three new mTHP statistics to track collapse failures for different
> orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:
>
> - collapse_exceed_swap_pte: Increment when mTHP collapse fails due to swap
> PTEs
>
> - collapse_exceed_none_pte: Counts when mTHP collapse fails due to
> exceeding the none PTE threshold for the given order
>
> - collapse_exceed_shared_pte: Counts when mTHP collapse fails due to shared
> PTEs
>
> These statistics complement the existing THP_SCAN_EXCEED_* events by
> providing per-order granularity for mTHP collapse attempts. The stats are
> exposed via sysfs under
> `/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
> supported hugepage size.
>
> As we currently don't support collapsing mTHPs that contain a swap or
> shared entry, those statistics keep track of how often we are
> encountering failed mTHP collapses due to these restrictions.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> Documentation/admin-guide/mm/transhuge.rst | 23 ++++++++++++++++++++++
> include/linux/huge_mm.h | 3 +++
> mm/huge_memory.c | 7 +++++++
> mm/khugepaged.c | 16 ++++++++++++---
> 4 files changed, 46 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 13269a0074d4..7c71cda8aea1 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -709,6 +709,29 @@ nr_anon_partially_mapped
> an anonymous THP as "partially mapped" and count it here, even though it
> is not actually partially mapped anymore.
>
> +collapse_exceed_none_pte
> + The number of anonymous mTHP pte ranges where the number of none PTEs
Ranges? Is the count per-mTHP folio? Or per PTE entry? Let's clarify.
> + exceeded the max_ptes_none threshold. For mTHP collapse, khugepaged
> + checks a PMD region and tracks which PTEs are present. It then tries
> + to collapse to the largest enabled mTHP size. The allowed number of empty
Well, and then it tries to collapse to the next size and so on, right? So maybe worth
mentioning?
> + PTEs is the max_ptes_none threshold scaled by the collapse order. This
I think this needs clarification, scaled how? Also obviously with the proposed
new approach we will need to correct this to reflect the 511/0 situation.
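If it is the usual shift-by-order-delta scaling (something like
max_ptes_none >> (HPAGE_PMD_ORDER - order), which is just my assumption of
what the series does), a worked example would help, e.g. with the default
max_ptes_none of 511 an order-4 collapse would tolerate 511 >> 5 = 15 none
PTEs out of 16.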
> + counter records the number of times a collapse attempt was skipped for
> + this reason, and khugepaged moved on to try the next available mTHP size.
OK you mention the moving on here, so for each attempted mTHP size which exceeds
max_none_pte we increment this stat correct? Probably worth clarifying that.
> +
> +collapse_exceed_swap_pte
> + The number of anonymous mTHP pte ranges which contain at least one swap
> + PTE. Currently khugepaged does not support collapsing mTHP regions
> + that contain a swap PTE. This counter can be used to monitor the
> + number of khugepaged mTHP collapses that failed due to the presence
> + of a swap PTE.
OK so as soon as we encounter a swap PTE we abort and this counts each instance
of that?
I guess worth spelling that out? Given we don't support it, surely the opening
description should be 'The number of anonymous mTHP PTE ranges which were unable
to be collapsed due to containing one or more swap PTEs'.
> +
> +collapse_exceed_shared_pte
> + The number of anonymous mTHP pte ranges which contain at least one shared
> + PTE. Currently khugepaged does not support collapsing mTHP pte ranges
> + that contain a shared PTE. This counter can be used to monitor the
> + number of khugepaged mTHP collapses that failed due to the presence
> + of a shared PTE.
Same comments as above.
> +
> As the system ages, allocating huge pages may be expensive as the
> system uses memory compaction to copy data around memory to free a
> huge page for use. There are some counters in ``/proc/vmstat`` to help
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 3d29624c4f3f..4b2773235041 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -144,6 +144,9 @@ enum mthp_stat_item {
> MTHP_STAT_SPLIT_DEFERRED,
> MTHP_STAT_NR_ANON,
> MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
> + MTHP_STAT_COLLAPSE_EXCEED_SWAP,
> + MTHP_STAT_COLLAPSE_EXCEED_NONE,
> + MTHP_STAT_COLLAPSE_EXCEED_SHARED,
> __MTHP_STAT_COUNT
> };
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 0063d1ba926e..7335b92969d6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -638,6 +638,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
> DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
> DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> +
>
> static struct attribute *anon_stats_attrs[] = {
> &anon_fault_alloc_attr.attr,
> @@ -654,6 +658,9 @@ static struct attribute *anon_stats_attrs[] = {
> &split_deferred_attr.attr,
> &nr_anon_attr.attr,
> &nr_anon_partially_mapped_attr.attr,
> + &collapse_exceed_swap_pte_attr.attr,
> + &collapse_exceed_none_pte_attr.attr,
> + &collapse_exceed_shared_pte_attr.attr,
> NULL,
> };
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index d741af15e18c..053202141ea3 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -592,7 +592,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
> - count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> + if (order == HPAGE_PMD_ORDER)
> + count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> goto out;
> }
> }
> @@ -622,10 +624,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> * shared may cause a future higher order collapse on a
> * rescan of the same range.
> */
> - if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> - shared > khugepaged_max_ptes_shared)) {
> + if (order != HPAGE_PMD_ORDER) {
A little nit/idea in general for the series - since we do this order !=
HPAGE_PMD_ORDER check all over, maybe have a predicate function like:
static bool is_mthp_order(unsigned int order)
{
	return order != HPAGE_PMD_ORDER;
}
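Then the check below would read e.g.:
	if (is_mthp_order(order)) {
		result = SCAN_EXCEED_SHARED_PTE;
		count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
		goto out;
	}
which I find a little easier to scan than the raw comparison.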
> + result = SCAN_EXCEED_SHARED_PTE;
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> + goto out;
> + }
> +
> + if (cc->is_khugepaged &&
> + shared > khugepaged_max_ptes_shared) {
> result = SCAN_EXCEED_SHARED_PTE;
> count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
OK I _think_ I mentioned this in a previous revision so forgive me for being
repetitious but we also count PMD orders here?
But in the MTHP_STAT_COLLAPSE_EXCEED_NONE and MTHP_STAT_COLLAPSE_EXCEED_SWAP
cases we don't? Why's that?
> goto out;
> }
> }
> @@ -1073,6 +1082,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> * range.
> */
> if (order != HPAGE_PMD_ORDER) {
> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> pte_unmap(pte);
> mmap_read_unlock(mm);
> result = SCAN_EXCEED_SWAP_PTE;
> --
> 2.51.0
>
Thanks, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 09/15] khugepaged: add per-order mTHP collapse failure statistics
2025-11-06 18:45 ` Lorenzo Stoakes
@ 2025-11-07 17:14 ` Nico Pache
0 siblings, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-11-07 17:14 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Thu, Nov 6, 2025 at 11:47 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:11PM -0600, Nico Pache wrote:
> > Add three new mTHP statistics to track collapse failures for different
> > orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:
> >
> > - collapse_exceed_swap_pte: Increment when mTHP collapse fails due to swap
> > PTEs
> >
> > - collapse_exceed_none_pte: Counts when mTHP collapse fails due to
> > exceeding the none PTE threshold for the given order
> >
> > - collapse_exceed_shared_pte: Counts when mTHP collapse fails due to shared
> > PTEs
> >
> > These statistics complement the existing THP_SCAN_EXCEED_* events by
> > providing per-order granularity for mTHP collapse attempts. The stats are
> > exposed via sysfs under
> > `/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
> > supported hugepage size.
> >
> > As we currently don't support collapsing mTHPs that contain a swap or
> > shared entry, those statistics keep track of how often we are
> > encountering failed mTHP collapses due to these restrictions.
> >
> > Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > Documentation/admin-guide/mm/transhuge.rst | 23 ++++++++++++++++++++++
> > include/linux/huge_mm.h | 3 +++
> > mm/huge_memory.c | 7 +++++++
> > mm/khugepaged.c | 16 ++++++++++++---
> > 4 files changed, 46 insertions(+), 3 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> > index 13269a0074d4..7c71cda8aea1 100644
> > --- a/Documentation/admin-guide/mm/transhuge.rst
> > +++ b/Documentation/admin-guide/mm/transhuge.rst
> > @@ -709,6 +709,29 @@ nr_anon_partially_mapped
> > an anonymous THP as "partially mapped" and count it here, even though it
> > is not actually partially mapped anymore.
> >
> > +collapse_exceed_none_pte
> > + The number of anonymous mTHP pte ranges where the number of none PTEs
>
> Ranges? Is the count per-mTHP folio? Or per PTE entry? Let's clarify.
I don't know the proper terminology. But what we have here is a range
of PTEs that is being considered for mTHP folio collapse; however, it
is still not an mTHP folio, which is why I hesitated to call it that.
Given this counter is per mTHP size, I think the proper way to say this would be:
The number of collapse attempts that failed due to exceeding the
max_ptes_none threshold.
>
> > + exceeded the max_ptes_none threshold. For mTHP collapse, khugepaged
> > + checks a PMD region and tracks which PTEs are present. It then tries
> > + to collapse to the largest enabled mTHP size. The allowed number of empty
>
> Well and then tries to collapse to the next size and so on, right? So maybe worth
> mentioning?
>
> > + PTEs is the max_ptes_none threshold scaled by the collapse order. This
>
> I think this needs clarification, scaled how? Also obviously with the proposed
> new approach we will need to correct this to reflect the 511/0 situation.
>
> > + counter records the number of times a collapse attempt was skipped for
> > + this reason, and khugepaged moved on to try the next available mTHP size.
>
> OK you mention the moving on here, so for each attempted mTHP size which exceeds
> max_none_pte we increment this stat correct? Probably worth clarifying that.
>
> > +
> > +collapse_exceed_swap_pte
> > + The number of anonymous mTHP pte ranges which contain at least one swap
> > + PTE. Currently khugepaged does not support collapsing mTHP regions
> > + that contain a swap PTE. This counter can be used to monitor the
> > + number of khugepaged mTHP collapses that failed due to the presence
> > + of a swap PTE.
>
> OK so as soon as we encounter a swap PTE we abort and this counts each instance
> of that?
>
> I guess worth spelling that out? Given we don't support it, surely the opening
> description should be 'The number of anonymous mTHP PTE ranges which were unable
> to be collapsed due to containing one or more swap PTEs'.
>
> > +
> > +collapse_exceed_shared_pte
> > + The number of anonymous mTHP pte ranges which contain at least one shared
> > + PTE. Currently khugepaged does not support collapsing mTHP pte ranges
> > + that contain a shared PTE. This counter can be used to monitor the
> > + number of khugepaged mTHP collapses that failed due to the presence
> > + of a shared PTE.
>
> Same comments as above.
>
> > +
> > As the system ages, allocating huge pages may be expensive as the
> > system uses memory compaction to copy data around memory to free a
> > huge page for use. There are some counters in ``/proc/vmstat`` to help
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 3d29624c4f3f..4b2773235041 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -144,6 +144,9 @@ enum mthp_stat_item {
> > MTHP_STAT_SPLIT_DEFERRED,
> > MTHP_STAT_NR_ANON,
> > MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
> > + MTHP_STAT_COLLAPSE_EXCEED_SWAP,
> > + MTHP_STAT_COLLAPSE_EXCEED_NONE,
> > + MTHP_STAT_COLLAPSE_EXCEED_SHARED,
> > __MTHP_STAT_COUNT
> > };
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 0063d1ba926e..7335b92969d6 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -638,6 +638,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
> > DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> > DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
> > DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
> > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> > +
> >
> > static struct attribute *anon_stats_attrs[] = {
> > &anon_fault_alloc_attr.attr,
> > @@ -654,6 +658,9 @@ static struct attribute *anon_stats_attrs[] = {
> > &split_deferred_attr.attr,
> > &nr_anon_attr.attr,
> > &nr_anon_partially_mapped_attr.attr,
> > + &collapse_exceed_swap_pte_attr.attr,
> > + &collapse_exceed_none_pte_attr.attr,
> > + &collapse_exceed_shared_pte_attr.attr,
> > NULL,
> > };
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index d741af15e18c..053202141ea3 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -592,7 +592,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > continue;
> > } else {
> > result = SCAN_EXCEED_NONE_PTE;
> > - count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> > + if (order == HPAGE_PMD_ORDER)
> > + count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> > + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> > goto out;
> > }
> > }
> > @@ -622,10 +624,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > * shared may cause a future higher order collapse on a
> > * rescan of the same range.
> > */
> > - if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> > - shared > khugepaged_max_ptes_shared)) {
> > + if (order != HPAGE_PMD_ORDER) {
>
Thanks for the review! I'll go clean these up for the next version.
> A little nit/idea in general for series - since we do this order !=
> HPAGE_PMD_ORDER check all over, maybe have a predict function like:
>
> static bool is_mthp_order(unsigned int order)
> {
> return order != HPAGE_PMD_ORDER;
> }
sure!
>
> > + result = SCAN_EXCEED_SHARED_PTE;
> > + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> > + goto out;
> > + }
> > +
> > + if (cc->is_khugepaged &&
> > + shared > khugepaged_max_ptes_shared) {
> > result = SCAN_EXCEED_SHARED_PTE;
> > count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> > + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
>
> OK I _think_ I mentioned this in a previous revision so forgive me for being
> repetitious but we also count PMD orders here?
>
> But in the MTHP_STAT_COLLAPSE_EXCEED_NONE and MTHP_STAT_COLLAPSE_EXCEED_SWAP
> cases we don't? Why's that?
Hmm I could have sworn I fixed that... perhaps I reintroduced the
missing stat update when I had to rebase/undo the cleanup series by
Lance. I will fix this.
Cheers.
-- Nico
>
>
> > goto out;
> > }
> > }
> > @@ -1073,6 +1082,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> > * range.
> > */
> > if (order != HPAGE_PMD_ORDER) {
> > + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> > pte_unmap(pte);
> > mmap_read_unlock(mm);
> > result = SCAN_EXCEED_SWAP_PTE;
> > --
> > 2.51.0
> >
>
> Thanks, Lorenzo
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v12 mm-new 10/15] khugepaged: improve tracepoints for mTHP orders
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (8 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 09/15] khugepaged: add per-order mTHP collapse failure statistics Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-22 18:37 ` [PATCH v12 mm-new 11/15] khugepaged: introduce collapse_allowable_orders helper function Nico Pache
` (5 subsequent siblings)
15 siblings, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
Add the order to the mm_collapse_huge_page<_swapin,_isolate> tracepoints to
give better insight into what order is being operated on.
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
include/trace/events/huge_memory.h | 34 +++++++++++++++++++-----------
mm/khugepaged.c | 9 ++++----
2 files changed, 27 insertions(+), 16 deletions(-)
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index dd94d14a2427..19d99b2549e6 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -88,40 +88,44 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
TRACE_EVENT(mm_collapse_huge_page,
- TP_PROTO(struct mm_struct *mm, int isolated, int status),
+ TP_PROTO(struct mm_struct *mm, int isolated, int status, unsigned int order),
- TP_ARGS(mm, isolated, status),
+ TP_ARGS(mm, isolated, status, order),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(int, isolated)
__field(int, status)
+ __field(unsigned int, order)
),
TP_fast_assign(
__entry->mm = mm;
__entry->isolated = isolated;
__entry->status = status;
+ __entry->order = order;
),
- TP_printk("mm=%p, isolated=%d, status=%s",
+ TP_printk("mm=%p, isolated=%d, status=%s order=%u",
__entry->mm,
__entry->isolated,
- __print_symbolic(__entry->status, SCAN_STATUS))
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->order)
);
TRACE_EVENT(mm_collapse_huge_page_isolate,
TP_PROTO(struct folio *folio, int none_or_zero,
- int referenced, int status),
+ int referenced, int status, unsigned int order),
- TP_ARGS(folio, none_or_zero, referenced, status),
+ TP_ARGS(folio, none_or_zero, referenced, status, order),
TP_STRUCT__entry(
__field(unsigned long, pfn)
__field(int, none_or_zero)
__field(int, referenced)
__field(int, status)
+ __field(unsigned int, order)
),
TP_fast_assign(
@@ -129,26 +133,30 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
__entry->none_or_zero = none_or_zero;
__entry->referenced = referenced;
__entry->status = status;
+ __entry->order = order;
),
- TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s",
+ TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s order=%u",
__entry->pfn,
__entry->none_or_zero,
__entry->referenced,
- __print_symbolic(__entry->status, SCAN_STATUS))
+ __print_symbolic(__entry->status, SCAN_STATUS),
+ __entry->order)
);
TRACE_EVENT(mm_collapse_huge_page_swapin,
- TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret),
+ TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret,
+ unsigned int order),
- TP_ARGS(mm, swapped_in, referenced, ret),
+ TP_ARGS(mm, swapped_in, referenced, ret, order),
TP_STRUCT__entry(
__field(struct mm_struct *, mm)
__field(int, swapped_in)
__field(int, referenced)
__field(int, ret)
+ __field(unsigned int, order)
),
TP_fast_assign(
@@ -156,13 +164,15 @@ TRACE_EVENT(mm_collapse_huge_page_swapin,
__entry->swapped_in = swapped_in;
__entry->referenced = referenced;
__entry->ret = ret;
+ __entry->order = order;
),
- TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d",
+ TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d, order=%u",
__entry->mm,
__entry->swapped_in,
__entry->referenced,
- __entry->ret)
+ __entry->ret,
+ __entry->order)
);
TRACE_EVENT(mm_khugepaged_scan_file,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 053202141ea3..0dbbe04c31fe 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -722,13 +722,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
} else {
result = SCAN_SUCCEED;
trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
- referenced, result);
+ referenced, result, order);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
- referenced, result);
+ referenced, result, order);
return result;
}
@@ -1123,7 +1123,8 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
result = SCAN_SUCCEED;
out:
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result,
+ order);
return result;
}
@@ -1343,7 +1344,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
*mmap_locked = false;
if (folio)
folio_put(folio);
- trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
+ trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result, order);
return result;
}
--
2.51.0
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v12 mm-new 11/15] khugepaged: introduce collapse_allowable_orders helper function
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (9 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 10/15] khugepaged: improve tracepoints for mTHP orders Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-11-06 18:49 ` Lorenzo Stoakes
2025-10-22 18:37 ` [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support Nico Pache
` (4 subsequent siblings)
15 siblings, 1 reply; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
Add collapse_allowable_orders() to generalize THP order eligibility. The
function determines which THP orders are permitted based on collapse
context (khugepaged vs madv_collapse).
This consolidates collapse configuration logic and provides a clean
interface for future mTHP collapse support where the orders may be
different.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0dbbe04c31fe..89a105124790 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -489,7 +489,16 @@ static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
return max_ptes_none >> (HPAGE_PMD_ORDER - order);
+}
+
+/* Check what orders are allowed based on the vma and collapse type */
+static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
+ vm_flags_t vm_flags, bool is_khugepaged)
+{
+ enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
+ unsigned long orders = BIT(HPAGE_PMD_ORDER);
+ return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
}
void khugepaged_enter_vma(struct vm_area_struct *vma,
@@ -497,7 +506,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
{
if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
hugepage_pmd_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
+ if (collapse_allowable_orders(vma, vm_flags, true))
__khugepaged_enter(vma->vm_mm);
}
}
@@ -2567,7 +2576,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
progress++;
break;
}
- if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
+ if (!collapse_allowable_orders(vma, vma->vm_flags, true)) {
skip:
progress++;
continue;
@@ -2873,7 +2882,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
+ if (!collapse_allowable_orders(vma, vma->vm_flags, false))
return -EINVAL;
cc = kmalloc(sizeof(*cc), GFP_KERNEL);
--
2.51.0
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 11/15] khugepaged: introduce collapse_allowable_orders helper function
2025-10-22 18:37 ` [PATCH v12 mm-new 11/15] khugepaged: introduce collapse_allowable_orders helper function Nico Pache
@ 2025-11-06 18:49 ` Lorenzo Stoakes
2025-11-07 18:01 ` Nico Pache
0 siblings, 1 reply; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-11-06 18:49 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:13PM -0600, Nico Pache wrote:
> Add collapse_allowable_orders() to generalize THP order eligibility. The
> function determines which THP orders are permitted based on collapse
> context (khugepaged vs madv_collapse).
>
> This consolidates collapse configuration logic and provides a clean
> interface for future mTHP collapse support where the orders may be
> different.
>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
With nits below addressed, LGTM so:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/khugepaged.c | 15 ++++++++++++---
> 1 file changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 0dbbe04c31fe..89a105124790 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -489,7 +489,16 @@ static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
>
> return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> +}
> +
> +/* Check what orders are allowed based on the vma and collapse type */
> +static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
> + vm_flags_t vm_flags, bool is_khugepaged)
> +{
> + enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> + unsigned long orders = BIT(HPAGE_PMD_ORDER);
Nit, but can const-ify.
>
> + return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
> }
>
> void khugepaged_enter_vma(struct vm_area_struct *vma,
> @@ -497,7 +506,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
> {
> if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
> hugepage_pmd_enabled()) {
> - if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
> + if (collapse_allowable_orders(vma, vm_flags, true))
If we have a 'mystery meat' boolean parameter can we always use the convention of:
collapse_allowable_orders(vma, vm_flags, /*is_khugepaged=*/true)
Please? Same goes for other invocations obviously.
> __khugepaged_enter(vma->vm_mm);
> }
> }
> @@ -2567,7 +2576,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
> progress++;
> break;
> }
> - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
> + if (!collapse_allowable_orders(vma, vma->vm_flags, true)) {
> skip:
> progress++;
> continue;
> @@ -2873,7 +2882,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> BUG_ON(vma->vm_start > start);
> BUG_ON(vma->vm_end < end);
>
> - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
> + if (!collapse_allowable_orders(vma, vma->vm_flags, false))
> return -EINVAL;
>
> cc = kmalloc(sizeof(*cc), GFP_KERNEL);
> --
> 2.51.0
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 11/15] khugepaged: introduce collapse_allowable_orders helper function
2025-11-06 18:49 ` Lorenzo Stoakes
@ 2025-11-07 18:01 ` Nico Pache
0 siblings, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-11-07 18:01 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Thu, Nov 6, 2025 at 11:51 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:13PM -0600, Nico Pache wrote:
> > Add collapse_allowable_orders() to generalize THP order eligibility. The
> > function determines which THP orders are permitted based on collapse
> > context (khugepaged vs madv_collapse).
> >
> > This consolidates collapse configuration logic and provides a clean
> > interface for future mTHP collapse support where the orders may be
> > different.
> >
> > Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
>
> With nits below addressed, LGTM so:
>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Thank you!
> > ---
> > mm/khugepaged.c | 15 ++++++++++++---
> > 1 file changed, 12 insertions(+), 3 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 0dbbe04c31fe..89a105124790 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -489,7 +489,16 @@ static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
> > max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
> >
> > return max_ptes_none >> (HPAGE_PMD_ORDER - order);
> > +}
> > +
> > +/* Check what orders are allowed based on the vma and collapse type */
> > +static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
> > + vm_flags_t vm_flags, bool is_khugepaged)
> > +{
> > + enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> > + unsigned long orders = BIT(HPAGE_PMD_ORDER);
>
> Nit, but can const-ify.
It becomes a function of is_khugepaged in a later patch.
>
> >
> > + return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
> > }
> >
> > void khugepaged_enter_vma(struct vm_area_struct *vma,
> > @@ -497,7 +506,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
> > {
> > if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
> > hugepage_pmd_enabled()) {
> > - if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
> > + if (collapse_allowable_orders(vma, vm_flags, true))
>
> If we have a 'mystery meat' boolean parameter can we always use the convention of:
>
> collapse_allowable_orders(vma, vm_flags, /*is_khugepaged=*/true)
>
> Please? Same goes for other invocations obviously.
Sounds good! I'll fix those up.
Thanks,
-- Nico
>
>
> > __khugepaged_enter(vma->vm_mm);
> > }
> > }
> > @@ -2567,7 +2576,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
> > progress++;
> > break;
> > }
> > - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
> > + if (!collapse_allowable_orders(vma, vma->vm_flags, true)) {
> > skip:
> > progress++;
> > continue;
> > @@ -2873,7 +2882,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
> > BUG_ON(vma->vm_start > start);
> > BUG_ON(vma->vm_end < end);
> >
> > - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
> > + if (!collapse_allowable_orders(vma, vma->vm_flags, false))
> > return -EINVAL;
> >
> > cc = kmalloc(sizeof(*cc), GFP_KERNEL);
> > --
> > 2.51.0
> >
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (10 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 11/15] khugepaged: introduce collapse_allowable_orders helper function Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-27 6:28 ` Baolin Wang
` (2 more replies)
2025-10-22 18:37 ` [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts Nico Pache
` (3 subsequent siblings)
15 siblings, 3 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
During PMD range scanning, track occupied pages in a bitmap. If mTHPs are
enabled we remove the restriction of max_ptes_none during the scan phase
to avoid missing potential mTHP candidates.
Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
and determine the best eligible order for the collapse. A stack struct is
used instead of traditional recursion. The algorithm splits the bitmap
into smaller chunks to find the best fit mTHP. max_ptes_none is scaled by
the attempted collapse order to determine how "full" an order must be
before being considered for collapse.
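As a worked example of that scaling (illustrative numbers only: 4K pages, so
HPAGE_PMD_ORDER = 9, and an assumed effective max_ptes_none of 255; names
follow the code below):

	scaled_max_none = max_ptes_none >> (HPAGE_PMD_ORDER - order)
	                = 255 >> (9 - 4) = 7	/* order-4 region, 16 PTEs */
	threshold_bits  = (1 << order) - scaled_max_none - 1 = 16 - 7 - 1 = 8

so an order-4 region is only attempted when more than 8 of its 16 PTEs are
occupied.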
Once we determine which mTHP sizes fit best in that PMD range, a collapse
is attempted. A minimum collapse order of 2 is used as this is the lowest
order supported by anon memory.
mTHP collapses reject regions containing swapped out or shared pages.
This is because adding new entries can lead to new none pages, and these
may lead to constant promotion into a higher order (m)THP. A similar
issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
introducing at least 2x the number of pages, and on a future scan will
satisfy the promotion condition once again. This issue is prevented via
the collapse_allowable_orders() function.
Currently madv_collapse is not supported and will only attempt PMD
collapse.
We can also remove the check for is_khugepaged inside the PMD scan as
the collapse_max_ptes_none() function handles this logic now.
Signed-off-by: Nico Pache <npache@redhat.com>
---
include/linux/khugepaged.h | 2 +
mm/khugepaged.c | 128 ++++++++++++++++++++++++++++++++++---
2 files changed, 122 insertions(+), 8 deletions(-)
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index eb1946a70cff..179ce716e769 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -1,6 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KHUGEPAGED_H
#define _LINUX_KHUGEPAGED_H
+#define KHUGEPAGED_MIN_MTHP_ORDER 2
+#define MAX_MTHP_BITMAP_STACK (1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
#include <linux/mm.h>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 89a105124790..e2319bfd0065 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -93,6 +93,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __ro_after_init;
+struct scan_bit_state {
+ u8 order;
+ u16 offset;
+};
+
struct collapse_control {
bool is_khugepaged;
@@ -101,6 +106,13 @@ struct collapse_control {
/* nodemask for allocation fallback */
nodemask_t alloc_nmask;
+
+ /*
+ * bitmap used to collapse mTHP sizes.
+ */
+ DECLARE_BITMAP(mthp_bitmap, HPAGE_PMD_NR);
+ DECLARE_BITMAP(mthp_bitmap_mask, HPAGE_PMD_NR);
+ struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_STACK];
};
/**
@@ -1357,6 +1369,85 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
return result;
}
+static void push_mthp_bitmap_stack(struct collapse_control *cc, int *top,
+ u8 order, u16 offset)
+{
+ cc->mthp_bitmap_stack[++*top] = (struct scan_bit_state)
+ { order, offset };
+}
+
+/*
+ * collapse_scan_bitmap() consumes the bitmap that is generated during
+ * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
+ *
+ * Each bit in the bitmap represents a single occupied (!none/zero) page.
+ * A stack structure cc->mthp_bitmap_stack is used to check different regions
+ * of the bitmap for collapse eligibility. We start at the PMD order and
+ * check if it is eligible for collapse; if not, we add two entries to the
+ * stack at a lower order to represent the left and right halves of the region.
+ *
+ * For each region, we calculate the number of set bits and compare it
+ * against a threshold derived from collapse_max_ptes_none(). A region is
+ * eligible if the number of set bits exceeds this threshold.
+ */
+static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
+ int referenced, int unmapped, struct collapse_control *cc,
+ bool *mmap_locked, unsigned long enabled_orders)
+{
+ u8 order, next_order;
+ u16 offset, mid_offset;
+ int num_chunks;
+ int bits_set, threshold_bits;
+ int top = -1;
+ int collapsed = 0;
+ int ret;
+ struct scan_bit_state state;
+ unsigned int max_none_ptes;
+
+ push_mthp_bitmap_stack(cc, &top, HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0);
+
+ while (top >= 0) {
+ state = cc->mthp_bitmap_stack[top--];
+ order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
+ offset = state.offset;
+ num_chunks = 1UL << order;
+
+ /* Skip mTHP orders that are not enabled */
+ if (!test_bit(order, &enabled_orders))
+ goto next_order;
+
+ max_none_ptes = collapse_max_ptes_none(order, !cc->is_khugepaged);
+
+ /* Calculate weight of the range */
+ bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
+ bitmap_set(cc->mthp_bitmap_mask, offset, num_chunks);
+ bits_set = bitmap_weight_and(cc->mthp_bitmap,
+ cc->mthp_bitmap_mask, HPAGE_PMD_NR);
+
+ threshold_bits = (1UL << order) - max_none_ptes - 1;
+
+ /* Check if the region is eligible based on the threshold */
+ if (bits_set > threshold_bits) {
+ ret = collapse_huge_page(mm, address, referenced,
+ unmapped, cc, mmap_locked,
+ order, offset);
+ if (ret == SCAN_SUCCEED) {
+ collapsed += 1UL << order;
+ continue;
+ }
+ }
+
+next_order:
+ if (state.order > 0) {
+ next_order = state.order - 1;
+ mid_offset = offset + (num_chunks / 2);
+ push_mthp_bitmap_stack(cc, &top, next_order, mid_offset);
+ push_mthp_bitmap_stack(cc, &top, next_order, offset);
+ }
+ }
+ return collapsed;
+}
+
static int collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long start_addr, bool *mmap_locked,
@@ -1364,11 +1455,15 @@ static int collapse_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
+ int i;
int result = SCAN_FAIL, referenced = 0;
- int none_or_zero = 0, shared = 0;
+ int none_or_zero = 0, shared = 0, nr_collapsed = 0;
struct page *page = NULL;
+ unsigned int max_ptes_none;
struct folio *folio = NULL;
unsigned long addr;
+ unsigned long enabled_orders;
+ bool full_scan = true;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
@@ -1378,16 +1473,29 @@ static int collapse_scan_pmd(struct mm_struct *mm,
if (result != SCAN_SUCCEED)
goto out;
+ bitmap_zero(cc->mthp_bitmap, HPAGE_PMD_NR);
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
+
+ enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged);
+
+ /*
+ * If PMD is the only enabled order, enforce max_ptes_none, otherwise
+ * scan all pages to populate the bitmap for mTHP collapse.
+ */
+ if (cc->is_khugepaged && enabled_orders == _BITUL(HPAGE_PMD_ORDER))
+ full_scan = false;
+ max_ptes_none = collapse_max_ptes_none(HPAGE_PMD_ORDER, full_scan);
+
pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
if (!pte) {
result = SCAN_PMD_NULL;
goto out;
}
- for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, addr += PAGE_SIZE) {
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ _pte = pte + i;
+ addr = start_addr + i * PAGE_SIZE;
pte_t pteval = ptep_get(_pte);
if (is_swap_pte(pteval)) {
++unmapped;
@@ -1412,8 +1520,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
if (pte_none_or_zero(pteval)) {
++none_or_zero;
if (!userfaultfd_armed(vma) &&
- (!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
+ none_or_zero <= max_ptes_none) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
@@ -1461,6 +1568,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
}
}
+ /* Set bit for occupied pages */
+ bitmap_set(cc->mthp_bitmap, i, 1);
/*
* Record which node the original page is from and save this
* information to cc->node_load[].
@@ -1517,9 +1626,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
out_unmap:
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
- result = collapse_huge_page(mm, start_addr, referenced,
- unmapped, cc, mmap_locked,
- HPAGE_PMD_ORDER, 0);
+ nr_collapsed = collapse_scan_bitmap(mm, start_addr, referenced, unmapped,
+ cc, mmap_locked, enabled_orders);
+ if (nr_collapsed > 0)
+ result = SCAN_SUCCEED;
+ else
+ result = SCAN_FAIL;
}
out:
trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
--
2.51.0
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support
2025-10-22 18:37 ` [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support Nico Pache
@ 2025-10-27 6:28 ` Baolin Wang
2025-11-09 2:08 ` Wei Yang
2025-11-19 11:53 ` Lorenzo Stoakes
2 siblings, 0 replies; 91+ messages in thread
From: Baolin Wang @ 2025-10-27 6:28 UTC (permalink / raw)
To: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On 2025/10/23 02:37, Nico Pache wrote:
> During PMD range scanning, track occupied pages in a bitmap. If mTHPs are
> enabled we remove the restriction of max_ptes_none during the scan phase
> to avoid missing potential mTHP candidates.
>
> Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
> and determine the best eligible order for the collapse. A stack struct is
> used instead of traditional recursion. The algorithm splits the bitmap
> into smaller chunks to find the best fit mTHP. max_ptes_none is scaled by
> the attempted collapse order to determine how "full" an order must be
> before being considered for collapse.
>
> Once we determine what mTHP sizes fits best in that PMD range a collapse
> is attempted. A minimum collapse order of 2 is used as this is the lowest
> order supported by anon memory.
>
> mTHP collapses reject regions containing swapped out or shared pages.
> This is because adding new entries can lead to new none pages, and these
> may lead to constant promotion into a higher order (m)THP. A similar
> issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> introducing at least 2x the number of pages, and on a future scan will
> satisfy the promotion condition once again. This issue is prevented via
> the collapse_allowable_orders() function.
>
> Currently madv_collapse is not supported and will only attempt PMD
> collapse.
>
> We can also remove the check for is_khugepaged inside the PMD scan as
> the collapse_max_ptes_none() function handles this logic now.
>
> Signed-off-by: Nico Pache <npache@redhat.com>
I've tested this patch, and it works as expected. (Some nits are listed
below)
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---
> include/linux/khugepaged.h | 2 +
> mm/khugepaged.c | 128 ++++++++++++++++++++++++++++++++++---
> 2 files changed, 122 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index eb1946a70cff..179ce716e769 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -1,6 +1,8 @@
> /* SPDX-License-Identifier: GPL-2.0 */
> #ifndef _LINUX_KHUGEPAGED_H
> #define _LINUX_KHUGEPAGED_H
> +#define KHUGEPAGED_MIN_MTHP_ORDER 2
> +#define MAX_MTHP_BITMAP_STACK (1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
>
> #include <linux/mm.h>
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 89a105124790..e2319bfd0065 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -93,6 +93,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>
> static struct kmem_cache *mm_slot_cache __ro_after_init;
>
> +struct scan_bit_state {
> + u8 order;
> + u16 offset;
> +};
> +
> struct collapse_control {
> bool is_khugepaged;
>
> @@ -101,6 +106,13 @@ struct collapse_control {
>
> /* nodemask for allocation fallback */
> nodemask_t alloc_nmask;
> +
> + /*
> + * bitmap used to collapse mTHP sizes.
> + */
> + DECLARE_BITMAP(mthp_bitmap, HPAGE_PMD_NR);
> + DECLARE_BITMAP(mthp_bitmap_mask, HPAGE_PMD_NR);
Nit: please remove the extra spaces.
> + struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_STACK];
> };
>
> /**
> @@ -1357,6 +1369,85 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
> return result;
> }
>
> +static void push_mthp_bitmap_stack(struct collapse_control *cc, int *top,
> + u8 order, u16 offset)
> +{
> + cc->mthp_bitmap_stack[++*top] = (struct scan_bit_state)
> + { order, offset };
> +}
> +
> +/*
> + * collapse_scan_bitmap() consumes the bitmap that is generated during
> + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
> + *
> + * Each bit in the bitmap represents a single occupied (!none/zero) page.
> + * A stack structure cc->mthp_bitmap_stack is used to check different regions
> + * of the bitmap for collapse eligibility. We start at the PMD order and
> + * check if it is eligible for collapse; if not, we add two entries to the
> + * stack at a lower order to represent the left and right halves of the region.
> + *
> + * For each region, we calculate the number of set bits and compare it
> + * against a threshold derived from collapse_max_ptes_none(). A region is
> + * eligible if the number of set bits exceeds this threshold.
> + */
> +static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> + int referenced, int unmapped, struct collapse_control *cc,
> + bool *mmap_locked, unsigned long enabled_orders)
> +{
> + u8 order, next_order;
> + u16 offset, mid_offset;
> + int num_chunks;
> + int bits_set, threshold_bits;
> + int top = -1;
> + int collapsed = 0;
> + int ret;
> + struct scan_bit_state state;
> + unsigned int max_none_ptes;
Nit: could you rearrange the order of variable definitions? Like reverse
Christmas trees.
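For reference, "reverse Christmas tree" means longest declaration lines first,
so the block above could be rearranged roughly like this (a sketch only,
keeping the declarations from the patch as-is):

	int bits_set, threshold_bits;
	struct scan_bit_state state;
	unsigned int max_none_ptes;
	u16 offset, mid_offset;
	u8 order, next_order;
	int collapsed = 0;
	int num_chunks;
	int top = -1;
	int ret;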
> +
> + push_mthp_bitmap_stack(cc, &top, HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0);
> +
> + while (top >= 0) {
> + state = cc->mthp_bitmap_stack[top--];
> + order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
> + offset = state.offset;
> + num_chunks = 1UL << order;
Nit: ‘num_chunks’ should be 'unsigned long'.
> +
> + /* Skip mTHP orders that are not enabled */
> + if (!test_bit(order, &enabled_orders))
> + goto next_order;
> +
> + max_none_ptes = collapse_max_ptes_none(order, !cc->is_khugepaged);
> +
> + /* Calculate weight of the range */
> + bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> + bitmap_set(cc->mthp_bitmap_mask, offset, num_chunks);
> + bits_set = bitmap_weight_and(cc->mthp_bitmap,
> + cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> +
> + threshold_bits = (1UL << order) - max_none_ptes - 1;
> +
> + /* Check if the region is eligible based on the threshold */
> + if (bits_set > threshold_bits) {
> + ret = collapse_huge_page(mm, address, referenced,
> + unmapped, cc, mmap_locked,
> + order, offset);
> + if (ret == SCAN_SUCCEED) {
> + collapsed += 1UL << order;
> + continue;
> + }
> + }
> +
> +next_order:
> + if (state.order > 0) {
> + next_order = state.order - 1;
> + mid_offset = offset + (num_chunks / 2);
> + push_mthp_bitmap_stack(cc, &top, next_order, mid_offset);
> + push_mthp_bitmap_stack(cc, &top, next_order, offset);
> + }
> + }
> + return collapsed;
> +}
> +
> static int collapse_scan_pmd(struct mm_struct *mm,
> struct vm_area_struct *vma,
> unsigned long start_addr, bool *mmap_locked,
> @@ -1364,11 +1455,15 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> {
> pmd_t *pmd;
> pte_t *pte, *_pte;
> + int i;
> int result = SCAN_FAIL, referenced = 0;
> - int none_or_zero = 0, shared = 0;
> + int none_or_zero = 0, shared = 0, nr_collapsed = 0;
> struct page *page = NULL;
> + unsigned int max_ptes_none;
> struct folio *folio = NULL;
> unsigned long addr;
> + unsigned long enabled_orders;
> + bool full_scan = true;
> spinlock_t *ptl;
> int node = NUMA_NO_NODE, unmapped = 0;
>
> @@ -1378,16 +1473,29 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> if (result != SCAN_SUCCEED)
> goto out;
>
> + bitmap_zero(cc->mthp_bitmap, HPAGE_PMD_NR);
> memset(cc->node_load, 0, sizeof(cc->node_load));
> nodes_clear(cc->alloc_nmask);
> +
> + enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged);
> +
> + /*
> + * If PMD is the only enabled order, enforce max_ptes_none, otherwise
> + * scan all pages to populate the bitmap for mTHP collapse.
> + */
> + if (cc->is_khugepaged && enabled_orders == _BITUL(HPAGE_PMD_ORDER))
> + full_scan = false;
> + max_ptes_none = collapse_max_ptes_none(HPAGE_PMD_ORDER, full_scan);
> +
> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> if (!pte) {
> result = SCAN_PMD_NULL;
> goto out;
> }
>
> - for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> - _pte++, addr += PAGE_SIZE) {
> + for (i = 0; i < HPAGE_PMD_NR; i++) {
> + _pte = pte + i;
> + addr = start_addr + i * PAGE_SIZE;
> pte_t pteval = ptep_get(_pte);
> if (is_swap_pte(pteval)) {
> ++unmapped;
> @@ -1412,8 +1520,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> if (pte_none_or_zero(pteval)) {
> ++none_or_zero;
> if (!userfaultfd_armed(vma) &&
> - (!cc->is_khugepaged ||
> - none_or_zero <= khugepaged_max_ptes_none)) {
> + none_or_zero <= max_ptes_none) {
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
> @@ -1461,6 +1568,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> }
> }
>
> + /* Set bit for occupied pages */
> + bitmap_set(cc->mthp_bitmap, i, 1);
> /*
> * Record which node the original page is from and save this
> * information to cc->node_load[].
> @@ -1517,9 +1626,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> out_unmap:
> pte_unmap_unlock(pte, ptl);
> if (result == SCAN_SUCCEED) {
> - result = collapse_huge_page(mm, start_addr, referenced,
> - unmapped, cc, mmap_locked,
> - HPAGE_PMD_ORDER, 0);
> + nr_collapsed = collapse_scan_bitmap(mm, start_addr, referenced, unmapped,
> + cc, mmap_locked, enabled_orders);
> + if (nr_collapsed > 0)
> + result = SCAN_SUCCEED;
> + else
> + result = SCAN_FAIL;
> }
> out:
> trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support
2025-10-22 18:37 ` [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support Nico Pache
2025-10-27 6:28 ` Baolin Wang
@ 2025-11-09 2:08 ` Wei Yang
2025-11-11 21:56 ` Nico Pache
2025-11-19 11:53 ` Lorenzo Stoakes
2 siblings, 1 reply; 91+ messages in thread
From: Wei Yang @ 2025-11-09 2:08 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:14PM -0600, Nico Pache wrote:
>During PMD range scanning, track occupied pages in a bitmap. If mTHPs are
>enabled we remove the restriction of max_ptes_none during the scan phase
>to avoid missing potential mTHP candidates.
>
>Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
>and determine the best eligible order for the collapse. A stack struct is
>used instead of traditional recursion. The algorithm splits the bitmap
>into smaller chunks to find the best fit mTHP. max_ptes_none is scaled by
>the attempted collapse order to determine how "full" an order must be
>before being considered for collapse.
>
>Once we determine what mTHP sizes fits best in that PMD range a collapse
>is attempted. A minimum collapse order of 2 is used as this is the lowest
>order supported by anon memory.
>
>mTHP collapses reject regions containing swapped out or shared pages.
>This is because adding new entries can lead to new none pages, and these
>may lead to constant promotion into a higher order (m)THP. A similar
>issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
>introducing at least 2x the number of pages, and on a future scan will
>satisfy the promotion condition once again. This issue is prevented via
>the collapse_allowable_orders() function.
>
>Currently madv_collapse is not supported and will only attempt PMD
>collapse.
>
>We can also remove the check for is_khugepaged inside the PMD scan as
>the collapse_max_ptes_none() function handles this logic now.
>
>Signed-off-by: Nico Pache <npache@redhat.com>
Generally LGTM.
Some nits below.
>---
> include/linux/khugepaged.h | 2 +
> mm/khugepaged.c | 128 ++++++++++++++++++++++++++++++++++---
> 2 files changed, 122 insertions(+), 8 deletions(-)
>
>diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
>index eb1946a70cff..179ce716e769 100644
>--- a/include/linux/khugepaged.h
>+++ b/include/linux/khugepaged.h
>@@ -1,6 +1,8 @@
> /* SPDX-License-Identifier: GPL-2.0 */
> #ifndef _LINUX_KHUGEPAGED_H
> #define _LINUX_KHUGEPAGED_H
>+#define KHUGEPAGED_MIN_MTHP_ORDER 2
>+#define MAX_MTHP_BITMAP_STACK (1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
>
> #include <linux/mm.h>
>
>diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>index 89a105124790..e2319bfd0065 100644
>--- a/mm/khugepaged.c
>+++ b/mm/khugepaged.c
>@@ -93,6 +93,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>
> static struct kmem_cache *mm_slot_cache __ro_after_init;
>
>+struct scan_bit_state {
>+ u8 order;
>+ u16 offset;
>+};
>+
> struct collapse_control {
> bool is_khugepaged;
>
>@@ -101,6 +106,13 @@ struct collapse_control {
>
> /* nodemask for allocation fallback */
> nodemask_t alloc_nmask;
>+
>+ /*
>+ * bitmap used to collapse mTHP sizes.
>+ */
>+ DECLARE_BITMAP(mthp_bitmap, HPAGE_PMD_NR);
>+ DECLARE_BITMAP(mthp_bitmap_mask, HPAGE_PMD_NR);
>+ struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_STACK];
Looks like an indent issue.
> };
>
> /**
>@@ -1357,6 +1369,85 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
> return result;
> }
>
>+static void push_mthp_bitmap_stack(struct collapse_control *cc, int *top,
>+ u8 order, u16 offset)
>+{
>+ cc->mthp_bitmap_stack[++*top] = (struct scan_bit_state)
>+ { order, offset };
>+}
>+
For me, I may introduce pop_mthp_bitmap_stack().
And use it ...
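A minimal sketch of such a helper, mirroring push_mthp_bitmap_stack() above
(name and exact shape are only a suggestion):

	static struct scan_bit_state pop_mthp_bitmap_stack(struct collapse_control *cc, int *top)
	{
		return cc->mthp_bitmap_stack[(*top)--];
	}

so the loop below would simply start with state = pop_mthp_bitmap_stack(cc, &top);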
>+/*
>+ * collapse_scan_bitmap() consumes the bitmap that is generated during
>+ * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
>+ *
>+ * Each bit in the bitmap represents a single occupied (!none/zero) page.
>+ * A stack structure cc->mthp_bitmap_stack is used to check different regions
>+ * of the bitmap for collapse eligibility. We start at the PMD order and
>+ * check if it is eligible for collapse; if not, we add two entries to the
>+ * stack at a lower order to represent the left and right halves of the region.
>+ *
>+ * For each region, we calculate the number of set bits and compare it
>+ * against a threshold derived from collapse_max_ptes_none(). A region is
>+ * eligible if the number of set bits exceeds this threshold.
>+ */
>+static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
>+ int referenced, int unmapped, struct collapse_control *cc,
>+ bool *mmap_locked, unsigned long enabled_orders)
>+{
>+ u8 order, next_order;
>+ u16 offset, mid_offset;
>+ int num_chunks;
>+ int bits_set, threshold_bits;
>+ int top = -1;
>+ int collapsed = 0;
>+ int ret;
>+ struct scan_bit_state state;
>+ unsigned int max_none_ptes;
>+
>+ push_mthp_bitmap_stack(cc, &top, HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0);
>+
>+ while (top >= 0) {
>+ state = cc->mthp_bitmap_stack[top--];
... here.
>+ order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
We push real_order - KHUGEPAGED_MIN_MTHP_ORDER, and get it back by adding
KHUGEPAGED_MIN_MTHP_ORDER.
Maybe we can push real_order ...
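Together with the check suggested further down, a rough sketch of that
variant would be:

	push_mthp_bitmap_stack(cc, &top, HPAGE_PMD_ORDER, 0);
	...
		state = cc->mthp_bitmap_stack[top--];
		order = state.order;
	...
		if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
			next_order = order - 1;
			mid_offset = offset + (num_chunks / 2);
			push_mthp_bitmap_stack(cc, &top, next_order, mid_offset);
			push_mthp_bitmap_stack(cc, &top, next_order, offset);
		}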
>+ offset = state.offset;
>+ num_chunks = 1UL << order;
>+
>+ /* Skip mTHP orders that are not enabled */
>+ if (!test_bit(order, &enabled_orders))
>+ goto next_order;
>+
>+ max_none_ptes = collapse_max_ptes_none(order, !cc->is_khugepaged);
>+
>+ /* Calculate weight of the range */
>+ bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
>+ bitmap_set(cc->mthp_bitmap_mask, offset, num_chunks);
>+ bits_set = bitmap_weight_and(cc->mthp_bitmap,
>+ cc->mthp_bitmap_mask, HPAGE_PMD_NR);
>+
>+ threshold_bits = (1UL << order) - max_none_ptes - 1;
>+
>+ /* Check if the region is eligible based on the threshold */
>+ if (bits_set > threshold_bits) {
>+ ret = collapse_huge_page(mm, address, referenced,
>+ unmapped, cc, mmap_locked,
>+ order, offset);
>+ if (ret == SCAN_SUCCEED) {
>+ collapsed += 1UL << order;
>+ continue;
>+ }
>+ }
>+
>+next_order:
>+ if (state.order > 0) {
...and if (order > KHUGEPAGED_MIN_MTHP_ORDER) here?
Not sure you would like it.
>+ next_order = state.order - 1;
>+ mid_offset = offset + (num_chunks / 2);
>+ push_mthp_bitmap_stack(cc, &top, next_order, mid_offset);
>+ push_mthp_bitmap_stack(cc, &top, next_order, offset);
>+ }
>+ }
>+ return collapsed;
>+}
>+
> static int collapse_scan_pmd(struct mm_struct *mm,
> struct vm_area_struct *vma,
> unsigned long start_addr, bool *mmap_locked,
>@@ -1364,11 +1455,15 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> {
> pmd_t *pmd;
> pte_t *pte, *_pte;
>+ int i;
> int result = SCAN_FAIL, referenced = 0;
>- int none_or_zero = 0, shared = 0;
>+ int none_or_zero = 0, shared = 0, nr_collapsed = 0;
> struct page *page = NULL;
>+ unsigned int max_ptes_none;
> struct folio *folio = NULL;
> unsigned long addr;
>+ unsigned long enabled_orders;
>+ bool full_scan = true;
> spinlock_t *ptl;
> int node = NUMA_NO_NODE, unmapped = 0;
>
>@@ -1378,16 +1473,29 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> if (result != SCAN_SUCCEED)
> goto out;
>
>+ bitmap_zero(cc->mthp_bitmap, HPAGE_PMD_NR);
> memset(cc->node_load, 0, sizeof(cc->node_load));
> nodes_clear(cc->alloc_nmask);
>+
>+ enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged);
>+
>+ /*
>+ * If PMD is the only enabled order, enforce max_ptes_none, otherwise
>+ * scan all pages to populate the bitmap for mTHP collapse.
>+ */
>+ if (cc->is_khugepaged && enabled_orders == _BITUL(HPAGE_PMD_ORDER))
We sometimes use BIT(), e.g. in collapse_allowable_orders().
And sometimes use _BITUL().
Suggest using the same form.
Nothing else, great job!
>+ full_scan = false;
>+ max_ptes_none = collapse_max_ptes_none(HPAGE_PMD_ORDER, full_scan);
>+
> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> if (!pte) {
> result = SCAN_PMD_NULL;
> goto out;
> }
>
>- for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
>- _pte++, addr += PAGE_SIZE) {
>+ for (i = 0; i < HPAGE_PMD_NR; i++) {
>+ _pte = pte + i;
>+ addr = start_addr + i * PAGE_SIZE;
> pte_t pteval = ptep_get(_pte);
> if (is_swap_pte(pteval)) {
> ++unmapped;
>@@ -1412,8 +1520,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> if (pte_none_or_zero(pteval)) {
> ++none_or_zero;
> if (!userfaultfd_armed(vma) &&
>- (!cc->is_khugepaged ||
>- none_or_zero <= khugepaged_max_ptes_none)) {
>+ none_or_zero <= max_ptes_none) {
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
>@@ -1461,6 +1568,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> }
> }
>
>+ /* Set bit for occupied pages */
>+ bitmap_set(cc->mthp_bitmap, i, 1);
> /*
> * Record which node the original page is from and save this
> * information to cc->node_load[].
>@@ -1517,9 +1626,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> out_unmap:
> pte_unmap_unlock(pte, ptl);
> if (result == SCAN_SUCCEED) {
>- result = collapse_huge_page(mm, start_addr, referenced,
>- unmapped, cc, mmap_locked,
>- HPAGE_PMD_ORDER, 0);
>+ nr_collapsed = collapse_scan_bitmap(mm, start_addr, referenced, unmapped,
>+ cc, mmap_locked, enabled_orders);
>+ if (nr_collapsed > 0)
>+ result = SCAN_SUCCEED;
>+ else
>+ result = SCAN_FAIL;
> }
> out:
> trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
>--
>2.51.0
--
Wei Yang
Help you, Help me
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support
2025-11-09 2:08 ` Wei Yang
@ 2025-11-11 21:56 ` Nico Pache
0 siblings, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-11-11 21:56 UTC (permalink / raw)
To: Wei Yang
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, lance.yang, vbabka, rppt, jannh, pfalcato
On Sat, Nov 8, 2025 at 7:08 PM Wei Yang <richard.weiyang@gmail.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:14PM -0600, Nico Pache wrote:
> >During PMD range scanning, track occupied pages in a bitmap. If mTHPs are
> >enabled we remove the restriction of max_ptes_none during the scan phase
> >to avoid missing potential mTHP candidates.
> >
> >Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
> >and determine the best eligible order for the collapse. A stack struct is
> >used instead of traditional recursion. The algorithm splits the bitmap
> >into smaller chunks to find the best fit mTHP. max_ptes_none is scaled by
> >the attempted collapse order to determine how "full" an order must be
> >before being considered for collapse.
> >
> >Once we determine what mTHP sizes fits best in that PMD range a collapse
> >is attempted. A minimum collapse order of 2 is used as this is the lowest
> >order supported by anon memory.
> >
> >mTHP collapses reject regions containing swapped out or shared pages.
> >This is because adding new entries can lead to new none pages, and these
> >may lead to constant promotion into a higher order (m)THP. A similar
> >issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> >introducing at least 2x the number of pages, and on a future scan will
> >satisfy the promotion condition once again. This issue is prevented via
> >the collapse_allowable_orders() function.
> >
> >Currently madv_collapse is not supported and will only attempt PMD
> >collapse.
> >
> >We can also remove the check for is_khugepaged inside the PMD scan as
> >the collapse_max_ptes_none() function handles this logic now.
> >
> >Signed-off-by: Nico Pache <npache@redhat.com>
>
> Generally LGTM.
>
> Some nit below.
>
> >---
> > include/linux/khugepaged.h | 2 +
> > mm/khugepaged.c | 128 ++++++++++++++++++++++++++++++++++---
> > 2 files changed, 122 insertions(+), 8 deletions(-)
> >
> >diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> >index eb1946a70cff..179ce716e769 100644
> >--- a/include/linux/khugepaged.h
> >+++ b/include/linux/khugepaged.h
> >@@ -1,6 +1,8 @@
> > /* SPDX-License-Identifier: GPL-2.0 */
> > #ifndef _LINUX_KHUGEPAGED_H
> > #define _LINUX_KHUGEPAGED_H
> >+#define KHUGEPAGED_MIN_MTHP_ORDER 2
> >+#define MAX_MTHP_BITMAP_STACK (1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
> >
> > #include <linux/mm.h>
> >
> >diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >index 89a105124790..e2319bfd0065 100644
> >--- a/mm/khugepaged.c
> >+++ b/mm/khugepaged.c
> >@@ -93,6 +93,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> >
> > static struct kmem_cache *mm_slot_cache __ro_after_init;
> >
> >+struct scan_bit_state {
> >+ u8 order;
> >+ u16 offset;
> >+};
> >+
> > struct collapse_control {
> > bool is_khugepaged;
> >
> >@@ -101,6 +106,13 @@ struct collapse_control {
> >
> > /* nodemask for allocation fallback */
> > nodemask_t alloc_nmask;
> >+
> >+ /*
> >+ * bitmap used to collapse mTHP sizes.
> >+ */
> >+ DECLARE_BITMAP(mthp_bitmap, HPAGE_PMD_NR);
> >+ DECLARE_BITMAP(mthp_bitmap_mask, HPAGE_PMD_NR);
> >+ struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_STACK];
>
> Looks like an indent issue.
Thanks!
>
> > };
> >
> > /**
> >@@ -1357,6 +1369,85 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
> > return result;
> > }
> >
> >+static void push_mthp_bitmap_stack(struct collapse_control *cc, int *top,
> >+ u8 order, u16 offset)
> >+{
> >+ cc->mthp_bitmap_stack[++*top] = (struct scan_bit_state)
> >+ { order, offset };
> >+}
> >+
>
> For me, I may introduce pop_mthp_bitmap_stack().
>
> And use it ...
>
> >+/*
> >+ * collapse_scan_bitmap() consumes the bitmap that is generated during
> >+ * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
> >+ *
> >+ * Each bit in the bitmap represents a single occupied (!none/zero) page.
> >+ * A stack structure cc->mthp_bitmap_stack is used to check different regions
> >+ * of the bitmap for collapse eligibility. We start at the PMD order and
> >+ * check if it is eligible for collapse; if not, we add two entries to the
> >+ * stack at a lower order to represent the left and right halves of the region.
> >+ *
> >+ * For each region, we calculate the number of set bits and compare it
> >+ * against a threshold derived from collapse_max_ptes_none(). A region is
> >+ * eligible if the number of set bits exceeds this threshold.
> >+ */
> >+static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> >+ int referenced, int unmapped, struct collapse_control *cc,
> >+ bool *mmap_locked, unsigned long enabled_orders)
> >+{
> >+ u8 order, next_order;
> >+ u16 offset, mid_offset;
> >+ int num_chunks;
> >+ int bits_set, threshold_bits;
> >+ int top = -1;
> >+ int collapsed = 0;
> >+ int ret;
> >+ struct scan_bit_state state;
> >+ unsigned int max_none_ptes;
> >+
> >+ push_mthp_bitmap_stack(cc, &top, HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0);
> >+
> >+ while (top >= 0) {
> >+ state = cc->mthp_bitmap_stack[top--];
>
> ... here.
Ack!
>
> >+ order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
>
> We push real_order - KHUGEPAGED_MIN_MTHP_ORDER, and get it back by adding
> KHUGEPAGED_MIN_MTHP_ORDER.
>
> Maybe we can push real_order ...
>
> >+ offset = state.offset;
> >+ num_chunks = 1UL << order;
> >+
> >+ /* Skip mTHP orders that are not enabled */
> >+ if (!test_bit(order, &enabled_orders))
> >+ goto next_order;
> >+
> >+ max_none_ptes = collapse_max_ptes_none(order, !cc->is_khugepaged);
> >+
> >+ /* Calculate weight of the range */
> >+ bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> >+ bitmap_set(cc->mthp_bitmap_mask, offset, num_chunks);
> >+ bits_set = bitmap_weight_and(cc->mthp_bitmap,
> >+ cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> >+
> >+ threshold_bits = (1UL << order) - max_none_ptes - 1;
> >+
> >+ /* Check if the region is eligible based on the threshold */
> >+ if (bits_set > threshold_bits) {
> >+ ret = collapse_huge_page(mm, address, referenced,
> >+ unmapped, cc, mmap_locked,
> >+ order, offset);
> >+ if (ret == SCAN_SUCCEED) {
> >+ collapsed += 1UL << order;
> >+ continue;
> >+ }
> >+ }
> >+
> >+next_order:
> >+ if (state.order > 0) {
>
> ...and if (order > KHUGEPAGED_MIN_MTHP_ORDER) here?
>
> Not sure you would like it.
I went ahead and implemented this based on real order. Thanks for the
suggestion, it's much cleaner now. It made more sense like this when I
had the bitmap compressed into 128 bits.
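For reference, here is roughly what the reworked function looks like with the
real order pushed onto the stack - hand-copied and untested as pasted here, so
treat it as a sketch of the direction rather than the final v13 code (only the
order encoding changes relative to v12):

static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
                int referenced, int unmapped, struct collapse_control *cc,
                bool *mmap_locked, unsigned long enabled_orders)
{
        u8 order, next_order;
        u16 offset, mid_offset;
        int num_chunks;
        int bits_set, threshold_bits;
        int top = -1;
        int collapsed = 0;
        int ret;
        struct scan_bit_state state;
        unsigned int max_none_ptes;

        /* Push the real order now, no KHUGEPAGED_MIN_MTHP_ORDER bias */
        push_mthp_bitmap_stack(cc, &top, HPAGE_PMD_ORDER, 0);

        while (top >= 0) {
                state = cc->mthp_bitmap_stack[top--];
                order = state.order;
                offset = state.offset;
                num_chunks = 1UL << order;

                /* Skip mTHP orders that are not enabled */
                if (!test_bit(order, &enabled_orders))
                        goto next_order;

                max_none_ptes = collapse_max_ptes_none(order, !cc->is_khugepaged);

                /* Calculate weight of the range */
                bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
                bitmap_set(cc->mthp_bitmap_mask, offset, num_chunks);
                bits_set = bitmap_weight_and(cc->mthp_bitmap,
                                cc->mthp_bitmap_mask, HPAGE_PMD_NR);

                threshold_bits = (1UL << order) - max_none_ptes - 1;

                /* Check if the region is eligible based on the threshold */
                if (bits_set > threshold_bits) {
                        ret = collapse_huge_page(mm, address, referenced,
                                        unmapped, cc, mmap_locked,
                                        order, offset);
                        if (ret == SCAN_SUCCEED) {
                                collapsed += 1UL << order;
                                continue;
                        }
                }

next_order:
                /* Descend only while we are above the minimum mTHP order */
                if (order > KHUGEPAGED_MIN_MTHP_ORDER) {
                        next_order = order - 1;
                        mid_offset = offset + (num_chunks / 2);
                        push_mthp_bitmap_stack(cc, &top, next_order, mid_offset);
                        push_mthp_bitmap_stack(cc, &top, next_order, offset);
                }
        }
        return collapsed;
}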
>
> >+ next_order = state.order - 1;
> >+ mid_offset = offset + (num_chunks / 2);
> >+ push_mthp_bitmap_stack(cc, &top, next_order, mid_offset);
> >+ push_mthp_bitmap_stack(cc, &top, next_order, offset);
> >+ }
> >+ }
> >+ return collapsed;
> >+}
> >+
> > static int collapse_scan_pmd(struct mm_struct *mm,
> > struct vm_area_struct *vma,
> > unsigned long start_addr, bool *mmap_locked,
> >@@ -1364,11 +1455,15 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > {
> > pmd_t *pmd;
> > pte_t *pte, *_pte;
> >+ int i;
> > int result = SCAN_FAIL, referenced = 0;
> >- int none_or_zero = 0, shared = 0;
> >+ int none_or_zero = 0, shared = 0, nr_collapsed = 0;
> > struct page *page = NULL;
> >+ unsigned int max_ptes_none;
> > struct folio *folio = NULL;
> > unsigned long addr;
> >+ unsigned long enabled_orders;
> >+ bool full_scan = true;
> > spinlock_t *ptl;
> > int node = NUMA_NO_NODE, unmapped = 0;
> >
> >@@ -1378,16 +1473,29 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > if (result != SCAN_SUCCEED)
> > goto out;
> >
> >+ bitmap_zero(cc->mthp_bitmap, HPAGE_PMD_NR);
> > memset(cc->node_load, 0, sizeof(cc->node_load));
> > nodes_clear(cc->alloc_nmask);
> >+
> >+ enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged);
> >+
> >+ /*
> >+ * If PMD is the only enabled order, enforce max_ptes_none, otherwise
> >+ * scan all pages to populate the bitmap for mTHP collapse.
> >+ */
> >+ if (cc->is_khugepaged && enabled_orders == _BITUL(HPAGE_PMD_ORDER))
>
> We sometimes use BIT(), e.g. in collapse_allowable_orders().
> And sometimes use _BITUL().
>
> Suggest to use the same form.
Yeah I caught this after posting, I missed this one!
>
> Nothing else, great job!
Thank you :) I appreciate the reviews!
>
> >+ full_scan = false;
> >+ max_ptes_none = collapse_max_ptes_none(HPAGE_PMD_ORDER, full_scan);
> >+
> > pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> > if (!pte) {
> > result = SCAN_PMD_NULL;
> > goto out;
> > }
> >
> >- for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> >- _pte++, addr += PAGE_SIZE) {
> >+ for (i = 0; i < HPAGE_PMD_NR; i++) {
> >+ _pte = pte + i;
> >+ addr = start_addr + i * PAGE_SIZE;
> > pte_t pteval = ptep_get(_pte);
> > if (is_swap_pte(pteval)) {
> > ++unmapped;
> >@@ -1412,8 +1520,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > if (pte_none_or_zero(pteval)) {
> > ++none_or_zero;
> > if (!userfaultfd_armed(vma) &&
> >- (!cc->is_khugepaged ||
> >- none_or_zero <= khugepaged_max_ptes_none)) {
> >+ none_or_zero <= max_ptes_none) {
> > continue;
> > } else {
> > result = SCAN_EXCEED_NONE_PTE;
> >@@ -1461,6 +1568,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > }
> > }
> >
> >+ /* Set bit for occupied pages */
> >+ bitmap_set(cc->mthp_bitmap, i, 1);
> > /*
> > * Record which node the original page is from and save this
> > * information to cc->node_load[].
> >@@ -1517,9 +1626,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > out_unmap:
> > pte_unmap_unlock(pte, ptl);
> > if (result == SCAN_SUCCEED) {
> >- result = collapse_huge_page(mm, start_addr, referenced,
> >- unmapped, cc, mmap_locked,
> >- HPAGE_PMD_ORDER, 0);
> >+ nr_collapsed = collapse_scan_bitmap(mm, start_addr, referenced, unmapped,
> >+ cc, mmap_locked, enabled_orders);
> >+ if (nr_collapsed > 0)
> >+ result = SCAN_SUCCEED;
> >+ else
> >+ result = SCAN_FAIL;
> > }
> > out:
> > trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
> >--
> >2.51.0
>
> --
> Wei Yang
> Help you, Help me
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support
2025-10-22 18:37 ` [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support Nico Pache
2025-10-27 6:28 ` Baolin Wang
2025-11-09 2:08 ` Wei Yang
@ 2025-11-19 11:53 ` Lorenzo Stoakes
2025-11-19 12:08 ` Lorenzo Stoakes
2025-11-20 22:32 ` Nico Pache
2 siblings, 2 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-11-19 11:53 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:14PM -0600, Nico Pache wrote:
> During PMD range scanning, track occupied pages in a bitmap. If mTHPs are
> enabled we remove the restriction of max_ptes_none during the scan phase
> to avoid missing potential mTHP candidates.
It's a bit odd to open the commit message with a very specific
implementation detail, I think we should instead open with a broad
description of what we intend here, e.g. to permit mTHP collapse, before:
- Discussing the algorithm used (in more detail than below!)
- How and under what circumstances this algorithm is invoked
- (Mention MADV_COLLAPSE not supporting mTHP as of yet)
- THEN super-specific details like this.
>
> Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
> and determine the best eligible order for the collapse. A stack struct is
> used instead of traditional recursion. The algorithm splits the bitmap
> into smaller chunks to find the best fit mTHP. max_ptes_none is scaled by
> the attempted collapse order to determine how "full" an order must be
> before being considered for collapse.
I feel this is a _very_ brief description of a complicated algorithm. I
think we should go into a lot more detail here. 'Binary recursion' is pretty
hand-wavey, and you go from hand waving that to being super-specific about
max_ptes_none before handwaving about 'fullness' of an order.
All in all I find it super confusing - so I think you need to take a step
back and 'explain it to me like I'm 5' here :)
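For instance - assuming the scaling is simply max_ptes_none >> (HPAGE_PMD_ORDER
- order), which is my reading of collapse_max_ptes_none() rather than something
stated here - with max_ptes_none = 511 an order-4 candidate tolerates
511 >> 5 = 15 none PTEs, i.e. a 16-PTE range qualifies with a single populated
PTE, whereas with max_ptes_none = 0 every PTE in the range must be populated.
Spelling out a worked example like that in the commit message would go a long
way.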
>
> > Once we determine what mTHP sizes fit best in that PMD range a collapse
> is attempted. A minimum collapse order of 2 is used as this is the lowest
> order supported by anon memory.
I don't know what 'lowest order supported by anon memory' means?
I guess really this is the minimum order contptes support for arm64 right?
>
> mTHP collapses reject regions containing swapped out or shared pages.
> This is because adding new entries can lead to new none pages, and these
> may lead to constant promotion into a higher order (m)THP. A similar
> issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> introducing at least 2x the number of pages, and on a future scan will
> satisfy the promotion condition once again. This issue is prevented via
> the collapse_allowable_orders() function.
Obviously this has been discussed to death, but you should update this to
reflect the decided upon course (0, 511 + warning, etc.).
>
> Currently madv_collapse is not supported and will only attempt PMD
> collapse.
Good to highlight this.
>
> We can also remove the check for is_khugepaged inside the PMD scan as
> the collapse_max_ptes_none() function handles this logic now.
Again we're kind of leaping from mega handwaving to super-specific details
:) let's make it all a lot more specific + clear, and then put the really
niche details like this at the end of the commit msg (I mean this one is
fine where it is ofc as a result :)
>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> include/linux/khugepaged.h | 2 +
> mm/khugepaged.c | 128 ++++++++++++++++++++++++++++++++++---
> 2 files changed, 122 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index eb1946a70cff..179ce716e769 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -1,6 +1,8 @@
> /* SPDX-License-Identifier: GPL-2.0 */
> #ifndef _LINUX_KHUGEPAGED_H
> #define _LINUX_KHUGEPAGED_H
> +#define KHUGEPAGED_MIN_MTHP_ORDER 2
> +#define MAX_MTHP_BITMAP_STACK (1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
This is an internal implementation detail, I don't think we need this in a
header do we? I think this define should be in khugepaged.c.
Also this is a really fiddly and confusing value, I don't think it's a good idea
to just put this here without explanation.
It's not even clear what it is. I'd probably rename it to MTHP_STACK_SIZE?
We need a comment that explains how you're deriving this, something like:
/*
* In order to determine mTHP order, we consider every possible mTHP size
* starting with MAX_PTRS_PER_PTE PTE entries and stopping at
* 2^KHUGEPAGED_MIN_MTHP_ORDER.
*
* We store (offset, order) pairs on the stack to do so, each describing a
* candidate mTHP collapse.
*
* For each (offset, order) candidate mTHP range that we consider, we must
* also consider candidate mTHPs at (offset, order - 1), and (offset + (1 <<
* order), order - 1).
*
*
* This is because each order can be split into two (an order expresses the
* power-of-two size), so we examine each half of the next lower order
* mTHP:
*
* offset mid_offset
* . |
* . v
* |---------------.-------------------|
* | PTE page table |
* |---------------.-------------------|
* <--------><-------->
* order-1 order-1
*
* Given we must consider the range of 2^KHUGEPAGED_MIN_MTHP_ORDER to
* MAX_PTRS_PER_PTE PTE entries, this is the same as saying we
* must consider KHUGEPAGED_MIN_MTHP_ORDER to lg2(MAX_PTRS_PER_PTE) mTHP
* orders.
*
* As we must consider 2 possible mTHP ranges for each order, this requires
* our stack to be 2^n, where n is the number of orders we must consider.
*
* And thus MTHP_STACK_SIZE is 2^(lg2(MAX_PTRS_PER_PTE) -
* KHUGEPAGED_MIN_MTHP_ORDER).
*/
This may seem (very) long-winded, but this is all really non-obvious.
You can additionally rephrase this and utilise it in the commit message,
description of the iterative recursion function and possibly elsewhere to
describe the algorithm more clearly.
In fact, since this should really be declared in khugepaged.c, and since
you can place it just before the mthp collapse function, you could expand
this to describe the algorithm as a whole and simply put the define and the
function immediately next to each other after the comment?
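i.e., roughly this (placement and the rename are only suggestions, and I've
elided the big derivation comment above):

/* mm/khugepaged.c */

/* See the derivation in the comment above. */
#define MTHP_STACK_SIZE \
        (1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))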
>
> #include <linux/mm.h>
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 89a105124790..e2319bfd0065 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -93,6 +93,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>
> static struct kmem_cache *mm_slot_cache __ro_after_init;
>
> +struct scan_bit_state {
Scan bit state is a bit of a weird name. Scanning what? What bit? State is
kind of implied?
struct order_offset_pair?
struct mthp_range?
> + u8 order;
> + u16 offset;
Real mega nit, but feels more natural to put offset first here. As
(position, size) seems more naturally the way to view this than (size,
position).
> +};
> +
Also needs comments...? Order of what? Offset in what?
> struct collapse_control {
> bool is_khugepaged;
>
> @@ -101,6 +106,13 @@ struct collapse_control {
>
> /* nodemask for allocation fallback */
> nodemask_t alloc_nmask;
> +
> + /*
> + * bitmap used to collapse mTHP sizes.
> + */
Nit but this should be on one line /* Bitmap used to collapse mTHP sizes */.
But we're not storing sizes though are we? And we're declaring two bitmaps?
> + DECLARE_BITMAP(mthp_bitmap, HPAGE_PMD_NR);
Really this is more of a PTE table bitmap but probably fine to call it this.
> + DECLARE_BITMAP(mthp_bitmap_mask, HPAGE_PMD_NR);
You've added random whitespace after the tab twice here? [tab][space]DECLARE_...
> + struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_STACK];
> };
>
> /**
> @@ -1357,6 +1369,85 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
> return result;
> }
>
> +static void push_mthp_bitmap_stack(struct collapse_control *cc, int *top,
> + u8 order, u16 offset)
Not sure we need to say mthp_bitmap_stack everywhere. This is a local
static function we can be a little more succinct.
mthp_stack_push()?
> +{
> + cc->mthp_bitmap_stack[++*top] = (struct scan_bit_state)
> + { order, offset };
This feels rather difficult to read, cc->mthp_bitmap_stack[++*top] in
particular is rather too succinct.
This would be better more broken out, e.g.:
static void mthp_stack_push(struct collapse_control *cc, int *sizep,
u8 order, u16 offset)
{
const int size = *sizep;
struct scan_bit_state *stack = &cc->mthp_bitmap_stack[size];
VM_WARN_ON_ONCE(size >= MAX_MTHP_BITMAP_STACK);
stack->order = order;
stack->offset = offset;
(*sizep)++;
}
(Note this requires the change I suggest below re: not defaulting top to -1
but instead renaming it to stack_size and starting at 0, see below).
Re: below comment having pop as a helper too, that can be:
static struct scan_bit_state mthp_stack_pop(struct collapse_control *cc,
int *sizep)
{
const int size = *sizep;
VM_WARN_ON_ONCE(size <= 0);
(*sizep)--;
return cc->mthp_bitmap_stack[size - 1];
}
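With both helpers in place the walk itself then reads something like (untested):

        int stack_size = 0;

        mthp_stack_push(cc, &stack_size, HPAGE_PMD_ORDER, 0);

        while (stack_size > 0) {
                struct scan_bit_state state = mthp_stack_pop(cc, &stack_size);

                /* ... examine state.offset / state.order as before ... */
        }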
> +}
> +
> +/*
> + * collapse_scan_bitmap() consumes the bitmap that is generated during
> + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
> + *
> + * Each bit in the bitmap represents a single occupied (!none/zero) page.
In which bitmap? There are 2 that are declared. Be specific - cc->mthp_bitmap.
> + * A stack structure cc->mthp_bitmap_stack is used to check different regions
> + * of the bitmap for collapse eligibility. We start at the PMD order and
> + * check if it is eligible for collapse; if not, we add two entries to the
I questioned this since you start at HPAGE_PMD_ORDER -
KHUGEPAGED_MIN_MTHP_ORDER, but then realised you're intentionally
offsetting like that.
See comments below about changing this.
> + * stack at a lower order to represent the left and right halves of the region.
> + *
> + * For each region, we calculate the number of set bits and compare it
> + * against a threshold derived from collapse_max_ptes_none(). A region is
> + * eligible if the number of set bits exceeds this threshold.
> + */
I think we could be clearer here. Something like:
...
* stack at a lower order to represent the left and right halves of the
* portion of the PTE page table we are examining.
*
* For each of these, we determine how many PTE entries are occupied in the
* range of PTE entries we propose to collapse, then compare this to the
* number of PTE entries which would need to be set for a collapse to be
* permitted at that order (accounting for max_ptes_none).
*
* If a collapse is permissible, we attempt to perform one. We do so for
* every possible mTHP in the PTE page table.
*/
> +static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
Really inconsistent naming going on here, we're collapsing and scanning and
what's the bitmap?
How about mthp_collapse()?
> + int referenced, int unmapped, struct collapse_control *cc,
> + bool *mmap_locked, unsigned long enabled_orders)
> +{
> + u8 order, next_order;
> + u16 offset, mid_offset;
> + int num_chunks;
> + int bits_set, threshold_bits;
> + int top = -1;
This seems unnecessary and confusing. Just start at 0 and treat this as the
exclusive end of the stack.
You can rename this stack_size to make that clear. Have commented above
about adjustments to push function and introducing pop helper.
> + int collapsed = 0;
> + int ret;
> + struct scan_bit_state state;
> + unsigned int max_none_ptes;
Everywhere else we say max_ptes_none, let's maintain that convention here
please.
> +
> + push_mthp_bitmap_stack(cc, &top, HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0);
See below re: order here, we should change this.
> +
> + while (top >= 0) {
> + state = cc->mthp_bitmap_stack[top--];
I hate that we have a push helper but then do pop manually. See above.
> + order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
OK so now the order isn't state.order but is instead state.order +
KHUGEPAGED_MIN_MTHP_ORDER? :/ this is extremely confusing.
We shouldn't call this field order if you're doing a hack where state.order
isn't the order but instead is order - KHUGEPAGED_MIN_MTHP_ORDER.
Just have state.order be the actual order? And change the below condition
as mentioned there.
> + offset = state.offset;
> + num_chunks = 1UL << order;
What's a chunk? You do need to clarify these things. This is a new term not
used elsewhere in THP code as far as I can tell?
This is the number of pte entries no?
nr_entries? nr_pte_entries?
> +
> + /* Skip mTHP orders that are not enabled */
Note we're also considering PMD here :) Probably we can just delete this
comment, the code below makes it clear what you're doing.
> + if (!test_bit(order, &enabled_orders))
> + goto next_order;
> +
> + max_none_ptes = collapse_max_ptes_none(order, !cc->is_khugepaged);
OK so this is going to be scaled to order.
> +
> + /* Calculate weight of the range */
What's the weight of a range? This isn't a very helpful comment. You're
counting the Hamming weight or much more clearly - the number of set bits.
So it seems you're simply counting the number of bits you have accumulated
so far in cc->mthp_bitmap, adding in the latest offset.
So I'd say add a comment saying something like:
/*
* Determine how many PTE entries are populated in the range in which we
* propose to collapse this mTHP.
*/
> + bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> + bitmap_set(cc->mthp_bitmap_mask, offset, num_chunks);
> + bits_set = bitmap_weight_and(cc->mthp_bitmap,
I think this variable name is pretty horrible, we don't care that it's the
number of bits set, we care about what it _means_ - that is the number of
PTE occupied entries.
So nr_occupied_pte_entries? Or nr_occupied_ptes?
> + cc->mthp_bitmap_mask, HPAGE_PMD_NR);
Frustrating there isn't a bitmap_weight_offset() or something, as you could
do that in one go then...
I think this could be made clearer by separating out the gnarly bitmap
stuff into a helper function:
static int mthp_nr_occupied_pte_entries(struct collapse_control *cc,
struct scan_bit_state state)
{
const int nr_pte_entries = 1 << state.order;
/* Setup cc->mthp_bitmap_mask to contain mask for candidate mTHP. */
bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
bitmap_set(cc->mthp_bitmap_mask, state.offset, nr_pte_entries);
/* Mask against actually occupied PTE entries in PTE table. */
return bitmap_weight_and(cc->mthp_bitmap,
cc->mthp_bitmap_mask, HPAGE_PMD_NR);
}
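The call site then becomes a one-liner (with the rename to nr_occupied_ptes
suggested above):

                nr_occupied_ptes = mthp_nr_occupied_pte_entries(cc, state);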
> +
> + threshold_bits = (1UL << order) - max_none_ptes - 1;
We defined num_chunks to 1UL << order then don't use here? :)
I'm not sure we need this to be a separate value, and I don't think we need
the -1 either, which only confuses matter more.
How about just changing the below conditional to (assuming we've renamed
num_chunks to something sensible like nr_pte_entries and bits_set to
nr_occupied_pte_entries):
if (nr_occupied_pte_entries >= nr_pte_entries - max_none_ptes) {
...
}
> +
> + /* Check if the region is eligible based on the threshold */
Probably don't need this comment with the change above.
> + if (bits_set > threshold_bits) {
> + ret = collapse_huge_page(mm, address, referenced,
> + unmapped, cc, mmap_locked,
> + order, offset);
We declare ret at the top of the function scope, then only use it
here. That's confusing and unnecessary, just declare it in block scope
here.
> + if (ret == SCAN_SUCCEED) {
> + collapsed += 1UL << order;
Again we have defined num_chunks or rather nr_pte_entries but then
open-code 1UL << order, let's just use the value we declared here.
> + continue;
This is kinda subtle, we don't bother considering lower orders any longer
*in this range*, but do continue to consider mTHP collapse in other
portions of the PTE page table.
This shouldn't just be a 'continue' :) we need a comment here to explain
that.
E.g.:
/*
* We have collapsed an mTHP in this range at the maximum order we
* could, so we do not push lower orders on to the stack.
*/
continue;
> + }
> + }
> +
> +next_order:
> + if (state.order > 0) {
This is a great example of how this is confusing by making state.order not
actually be the order but the order - KHUGEPAGED_MIN_MTHP_ORDER.
Just make the order correct and change this to > KHUGEPAGED_MIN_MTHP_ORDER.
> + next_order = state.order - 1;
Not sure we should have a label and a variable be the same thing.
Also why are we decl'ing next_order at the top of the function but only using here?
Just declare this here, like:
if (state.order > KHUGEPAGED_MIN_MTHP_ORDER) {
const u16 new_order = state.order - 1;
...
}
> + mid_offset = offset + (num_chunks / 2);
> + push_mthp_bitmap_stack(cc, &top, next_order, mid_offset);
> + push_mthp_bitmap_stack(cc, &top, next_order, offset);
I guess one subtlety that wouldn't be obvious at first glance is that
num_chunks (oh so badly needs a rename :) is a power-of-2 so we never get
weird 'what if num_chunks is odd' scenarios to worry about.
Probably doesn't really need a comment though. But this _does_ badly need
an ASCII diagram :):
/*
* The next lowest mTHP order possesses half the number of PTE
* entries of the current one. We therefore must consider both
* halves of the current mTHP:
*
* offset mid_offset
* . |
* . v
* |---------------.-------------------|
* | PTE page table |
* |---------------.-------------------|
* <--------><-------->
* order-1 order-1
*/
Since writing this I copied this above in another suggestion :P so you
could always say 'see comment above for details' or something.
> + }
> + }
> + return collapsed;
> +}
I've commented this function to death here, but a few more things to note.
(BTW - I'm sorry I personally _hate_ repeated iterations of review when
there's stuff you could have commented in prior iterations BUT I think I
may end up having to once we respin due to the subtleties here.)
- I wonder if we could just use a helper struct to make this all a little
easier. Perhaps as it's relatively short code it's not so necessary, but a bit
horrid to pass around all these parameters all the time. Maybe something
for later THP rework.
- Could we exit early if it's obvious that we won't be able to collapse due
to max_ptes_none? I mean for one, we could at least check if the next
lowest order is empty. If max_ptes_none was 511, then we would have
already collapsed so can surely throw that out?
I was thinking we could go 'upwards', starting with the lowest order and
increasing order (essentially reverse things) then not collapsing until
we can't collapse at a given order (so collapse at next lowest). That
might be less efficient though.
- Given that we're going to be only permitting max_ptes_none of 0 and 511
for mTHP to start with, maybe things can be simplified - either all bits
have to be 1 or we don't care what they are and we attempt collapse anyway?
But then again, maybe it's worth having the generic algorithm in place
for future flexibility? Thoughts?
- How much have you tested this? This is pretty subtle stuff... it _seems_
correct to me logically, but this is crying out for some userland testing
that exhaustively throws every possible permutation of state at this
function and asserts it's all correct - see the rough sketch after this list.
- Are we not missing a bunch of stat counts? Didn't we add a bunch but now
aren't actually setting them? E.g. if we reject mTHP candidates due to
pte_max_none?
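To make the testing point concrete, here's the sort of thing I have in mind - a
quick userspace re-implementation of just the (offset, order) stack walk so the
selection maths can be hammered with arbitrary occupancy patterns. Completely
untested, and the max_ptes_none scaling is my assumption of what
collapse_max_ptes_none() does (a shift by HPAGE_PMD_ORDER - order), so adjust to
taste:

/* Userspace sketch of the (offset, order) selection walk - not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define NR_PTES		512
#define PMD_ORDER	9
#define MIN_ORDER	2	/* KHUGEPAGED_MIN_MTHP_ORDER */

static bool occupied[NR_PTES];	/* stand-in for cc->mthp_bitmap */

static int weight(int offset, int nr)
{
	int i, w = 0;

	for (i = offset; i < offset + nr; i++)
		w += occupied[i];
	return w;
}

/* Assumed scaling of max_ptes_none to the candidate order. */
static int scaled_max_none(int order, int max_ptes_none)
{
	return max_ptes_none >> (PMD_ORDER - order);
}

static int scan(int max_ptes_none)
{
	struct { int order, offset; } stack[1 << (PMD_ORDER - MIN_ORDER)];
	int top = 0, collapsed = 0;

	stack[0].order = PMD_ORDER;
	stack[0].offset = 0;

	while (top >= 0) {
		const int order = stack[top].order;
		const int offset = stack[top].offset;
		const int nr = 1 << order;

		top--;
		/* a "collapse" is always assumed to succeed in this model */
		if (weight(offset, nr) >= nr - scaled_max_none(order, max_ptes_none)) {
			printf("collapse order %d at offset %d\n", order, offset);
			collapsed += nr;
			continue;
		}
		if (order > MIN_ORDER) {
			stack[++top].order = order - 1;
			stack[top].offset = offset + nr / 2;
			stack[++top].order = order - 1;
			stack[top].offset = offset;
		}
	}
	return collapsed;
}

int main(void)
{
	int i;

	/* example pattern: first half of the PTE table populated */
	for (i = 0; i < NR_PTES / 2; i++)
		occupied[i] = true;
	printf("total PTEs collapsed: %d\n", scan(0));
	return 0;
}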
> +
> static int collapse_scan_pmd(struct mm_struct *mm,
> struct vm_area_struct *vma,
> unsigned long start_addr, bool *mmap_locked,
> @@ -1364,11 +1455,15 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> {
> pmd_t *pmd;
> pte_t *pte, *_pte;
> + int i;
> int result = SCAN_FAIL, referenced = 0;
> - int none_or_zero = 0, shared = 0;
> + int none_or_zero = 0, shared = 0, nr_collapsed = 0;
> struct page *page = NULL;
> + unsigned int max_ptes_none;
Correct spelling of this :)
> struct folio *folio = NULL;
> unsigned long addr;
> + unsigned long enabled_orders;
> + bool full_scan = true;
> spinlock_t *ptl;
> int node = NUMA_NO_NODE, unmapped = 0;
>
> @@ -1378,16 +1473,29 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> if (result != SCAN_SUCCEED)
> goto out;
>
> + bitmap_zero(cc->mthp_bitmap, HPAGE_PMD_NR);
> memset(cc->node_load, 0, sizeof(cc->node_load));
> nodes_clear(cc->alloc_nmask);
> +
> + enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged);
> +
> + /*
> + * If PMD is the only enabled order, enforce max_ptes_none, otherwise
> + * scan all pages to populate the bitmap for mTHP collapse.
> + */
Ugh this is quite ugly. I don't really love that we've converted this from
doing the actual work to _mostly_ just populating the bitmap for the mthp
logic.
Then again it's only a couple places where this is checked, but it's pretty
horrible that what once was 'the logic that determines what is being
considered for THP collapse' is now turned into 'the logic that populates a
bitmap'.
> + if (cc->is_khugepaged && enabled_orders == _BITUL(HPAGE_PMD_ORDER))
I think this should be BIT(HPAGE_PMD_ORDER), I realise I reviewed the
opposite before (or think I did) but as per David we prefer BIT() :)
> + full_scan = false;
> + max_ptes_none = collapse_max_ptes_none(HPAGE_PMD_ORDER, full_scan);
Again really quite nasty, this may as well be:
if (cc->is_khugepaged && enabled_orders == BIT(HPAGE_PMD_ORDER))
max_ptes_none = khugepaged_max_ptes_none;
else
max_ptes_none = HPAGE_PMD_NR - 1;
It makes this hack a lot more obvious.
But also, what if !cc->is_khugepaged? We're going to scan everything even
if we only have PMD? I thought we only considered PMD size for MADV_COLLAPSE?
> +
> pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> if (!pte) {
> result = SCAN_PMD_NULL;
> goto out;
> }
>
> - for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> - _pte++, addr += PAGE_SIZE) {
> + for (i = 0; i < HPAGE_PMD_NR; i++) {
> + _pte = pte + i;
> + addr = start_addr + i * PAGE_SIZE;
That's nicer. I still hate _pte...
> pte_t pteval = ptep_get(_pte);
> if (is_swap_pte(pteval)) {
> ++unmapped;
> @@ -1412,8 +1520,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> if (pte_none_or_zero(pteval)) {
> ++none_or_zero;
> if (!userfaultfd_armed(vma) &&
> - (!cc->is_khugepaged ||
> - none_or_zero <= khugepaged_max_ptes_none)) {
> + none_or_zero <= max_ptes_none) {
Why are we dropping !cc->is_khugepaged?
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
> @@ -1461,6 +1568,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> }
> }
>
> + /* Set bit for occupied pages */
> + bitmap_set(cc->mthp_bitmap, i, 1);
Maybe worth highlighting this is now _the entire point_ of the loop.
I wonder if we shouldn't just separate this logic out and name it
appropriately? As we're into realms of real confusion here.
> /*
> * Record which node the original page is from and save this
> * information to cc->node_load[].
> @@ -1517,9 +1626,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> out_unmap:
> pte_unmap_unlock(pte, ptl);
> if (result == SCAN_SUCCEED) {
> - result = collapse_huge_page(mm, start_addr, referenced,
> - unmapped, cc, mmap_locked,
> - HPAGE_PMD_ORDER, 0);
Hmm... what's actually enforcing that MADV_COLLAPSE isn't using this?
You've not done any cc->khugepaged checks afaict?
It seems that you _are_ enabling this for MADV_COLLAPSE unless I've missed
something?
> + nr_collapsed = collapse_scan_bitmap(mm, start_addr, referenced, unmapped,
> + cc, mmap_locked, enabled_orders);
> + if (nr_collapsed > 0)
> + result = SCAN_SUCCEED;
> + else
> + result = SCAN_FAIL;
> }
> out:
> trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
> --
> 2.51.0
>
Thanks, Lorenzo
^ permalink raw reply	[flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support
2025-11-19 11:53 ` Lorenzo Stoakes
@ 2025-11-19 12:08 ` Lorenzo Stoakes
2025-11-20 22:32 ` Nico Pache
1 sibling, 0 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-11-19 12:08 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
Oh I forgot to add -
In collapse_scan_pmd() there are cases where you just bail out altogether.
E.g.: pte_uffd_wp() for _any_ PTE entry in the range.
Or !folio_test_anon() for _any_ PTE entry in the range.
Etc.
Surely these are cases where an mTHP scan on part of the range might still
succeed?
You then in the subsequent patch seem to check for collapse failures
specifically due to some of these, but surely you will never hit them as you
already discarded the whole PTE page table?
I'm not sure you've updated collapse_scan_pmd() sufficiently to account for the
mTHP logic.
Cheers, Lorenzo
On Wed, Nov 19, 2025 at 11:53:16AM +0000, Lorenzo Stoakes wrote:
> On Wed, Oct 22, 2025 at 12:37:14PM -0600, Nico Pache wrote:
> > During PMD range scanning, track occupied pages in a bitmap. If mTHPs are
> > enabled we remove the restriction of max_ptes_none during the scan phase
> > to avoid missing potential mTHP candidates.
>
> It's a bit odd to open the commit message with a very specific
> implementation detail, I think we should instead open with a broad
> description of what we intend here, e.g. to permit mTHP collapse, before:
>
> - Discussing the algorithm used (in more detail than below!)
> - How and under what circumstances this algorithm is invoked
> - (Mention MADV_COLLAPSE not supporting mTHP as of yet)
> - THEN super-specific details like this.
>
> >
> > Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
> > and determine the best eligible order for the collapse. A stack struct is
> > used instead of traditional recursion. The algorithm splits the bitmap
> > into smaller chunks to find the best fit mTHP. max_ptes_none is scaled by
> > the attempted collapse order to determine how "full" an order must be
> > before being considered for collapse.
>
> I feel this is a _very_ brief description of a complicated algorithm. I
> think we should go into a lot more detail here. 'Binary recursion' is pretty
> hand-wavey, and you go from hand waving that to being super-specific about
> max_ptes_none before handwaving about 'fullness' of an order.
>
> All in all I find it super confusing - so I think you need to take a step
> back and 'explain it to me like I'm 5' here :)
>
> >
> > Once we determine what mTHP sizes fit best in that PMD range a collapse
> > is attempted. A minimum collapse order of 2 is used as this is the lowest
> > order supported by anon memory.
>
> I don't know what 'lowest order supported by anon memory' means?
>
> I guess really this is the minimum order contptes support for arm64 right?
>
> >
> > mTHP collapses reject regions containing swapped out or shared pages.
> > This is because adding new entries can lead to new none pages, and these
> > may lead to constant promotion into a higher order (m)THP. A similar
> > issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> > introducing at least 2x the number of pages, and on a future scan will
> > satisfy the promotion condition once again. This issue is prevented via
> > the collapse_allowable_orders() function.
>
> Obviously this has been discussed to death, but you should update this to
> reflect the decided upon course (0, 511 + warning, etc.).
>
> >
> > Currently madv_collapse is not supported and will only attempt PMD
> > collapse.
>
> Good to highlight this.
>
> >
> > We can also remove the check for is_khugepaged inside the PMD scan as
> > the collapse_max_ptes_none() function handles this logic now.
>
> Again we're kind of leaping from mega handwaving to super-specific details
> :) let's make it all a lot more specific + clear, and then put the really
> niche details like this at the end of the commit msg (I mean this one is
> fine where it is ofc as a result :)
>
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > include/linux/khugepaged.h | 2 +
> > mm/khugepaged.c | 128 ++++++++++++++++++++++++++++++++++---
> > 2 files changed, 122 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> > index eb1946a70cff..179ce716e769 100644
> > --- a/include/linux/khugepaged.h
> > +++ b/include/linux/khugepaged.h
> > @@ -1,6 +1,8 @@
> > /* SPDX-License-Identifier: GPL-2.0 */
> > #ifndef _LINUX_KHUGEPAGED_H
> > #define _LINUX_KHUGEPAGED_H
> > +#define KHUGEPAGED_MIN_MTHP_ORDER 2
> > +#define MAX_MTHP_BITMAP_STACK (1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
>
> This is an internal implementation detail, I don't think we need this in a
> header do we? I think this define should be in khugepaged.c.
>
> Also this is a really fiddly and confusing value, I don't think it's a good idea
> to just put this here without explanation.
>
> It's not even clear what it is. I'd probably rename it to MTHP_STACK_SIZE?
>
> We need a comment that explains how you're deriving this, something like:
>
> /*
> * In order to determine mTHP order, we consider every possible mTHP size
> * starting with MAX_PTRS_PER_PTE PTE entries and stopping at
> * 2^KHUGEPAGED_MIN_MTHP_ORDER.
> *
> * We store (offset, order) pairs on the stack to do so, each describing a
> * candidate mTHP collapse.
> *
> * For each (offset, order) candidate mTHP range that we consider, we must
> * also consider candidate mTHPs at (offset, order - 1), and (offset + (1 <<
> * order), order - 1).
> *
> *
> * This is because each order can be split into two (an order expresses the
> * power-of-two size), so we examine each half of the next lower order
> * mTHP:
> *
> * offset mid_offset
> * . |
> * . v
> * |---------------.-------------------|
> * | PTE page table |
> * |---------------.-------------------|
> * <--------><-------->
> * order-1 order-1
> *
> * Given we must consider the range of 2^KHUGEPAGED_MIN_MTHP_ORDER to
> * MAX_PTRS_PER_PTE PTE entries, this is the same as saying we
> * must consider KHUGEPAGED_MIN_MTHP_ORDER to lg2(MAX_PTRS_PER_PTE) mTHP
> * orders.
> *
> * As we must consider 2 possible mTHP ranges for each order, this requires
> * our stack to be 2^n, where n is the number of orders we must consider.
> *
> * And thus MTHP_STACK_SIZE is 2^(lg2(MAX_PTRS_PER_PTE) -
> * KHUGEPAGED_MIN_MTHP_ORDER).
> */
>
> This may seem (very) long-winded, but this is all really non-obvious.
>
> You can additionally rephrase this and utilise it in the commit message,
> description of the iterative recursion function and possibly elsewhere to
> describe the algorithm more clearly.
>
> In fact, since this should really be declared in khugepaged.c, and since
> you can place it just before the mthp collapse function, you could expand
> this to describe the algorithm as a whole and simply put the define and the
> function immediately next to each other after the comment?
>
> >
> > #include <linux/mm.h>
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 89a105124790..e2319bfd0065 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -93,6 +93,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> >
> > static struct kmem_cache *mm_slot_cache __ro_after_init;
> >
> > +struct scan_bit_state {
>
> Scan bit state is a bit of a weird name. Scanning what? What bit? State is
> kind of implied?
>
> struct order_offset_pair?
>
> struct mthp_range?
>
> > + u8 order;
> > + u16 offset;
>
> Real mega nit, but feels more natural to put offset first here. As
> (position, size) seems more naturally the way to view this than (size,
> position).
>
> > +};
> > +
>
> Also needs comments...? Order of what? Offset in what?
>
> > struct collapse_control {
> > bool is_khugepaged;
> >
> > @@ -101,6 +106,13 @@ struct collapse_control {
> >
> > /* nodemask for allocation fallback */
> > nodemask_t alloc_nmask;
> > +
> > + /*
> > + * bitmap used to collapse mTHP sizes.
> > + */
>
> Nit but this should be on one line /* Bitmap used to collapse mTHP sizes */.
>
> But we're not storing sizes though are we? And we're declaring two bitmaps?
>
> > + DECLARE_BITMAP(mthp_bitmap, HPAGE_PMD_NR);
>
> Really this is more of a PTE table bitmap but probably fine to call it this.
>
> > + DECLARE_BITMAP(mthp_bitmap_mask, HPAGE_PMD_NR);
>
> You've added random whitespace after the tab twice here? [tab][space]DECLARE_...
>
> > + struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_STACK];
> > };
> >
> > /**
> > @@ -1357,6 +1369,85 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
> > return result;
> > }
> >
> > +static void push_mthp_bitmap_stack(struct collapse_control *cc, int *top,
> > + u8 order, u16 offset)
>
> Not sure we need to say mthp_bitmap_stack everywhere. This is a local
> static function we can be a little more succinct.
>
> mthp_stack_push()?
>
> > +{
> > + cc->mthp_bitmap_stack[++*top] = (struct scan_bit_state)
> > + { order, offset };
>
> This feels rather difficult to read, cc->mthp_bitmap_stack[++*top] in
> particular is rather too succinct.
>
> This would be better more broken out, e.g.:
>
> static void mthp_stack_push(struct collapse_control *cc, int *sizep,
> u8 order, u16 offset)
> {
> const int size = *sizep;
> struct scan_bit_state *stack = &cc->mthp_bitmap_stack[size];
>
> VM_WARN_ON_ONCE(size >= MAX_MTHP_BITMAP_STACK);
> stack->order = order;
> stack->offset = offset;
> (*sizep)++;
> }
>
> (Note this requires the change I suggest below re: not defaulting top to -1
> but instead renaming it to stack_size and starting at 0, see below).
>
> Re: below comment having pop as a helper too, that can be:
>
> static struct scan_bit_state mthp_stack_pop(struct collapse_control *cc,
> int *sizep)
> {
> const int size = *sizep;
>
> VM_WARN_ON_ONCE(size <= 0);
> (*sizep)--;
> return cc->mthp_bitmap_stack[size - 1];
> }
>
> > +}
> > +
> > +/*
> > + * collapse_scan_bitmap() consumes the bitmap that is generated during
> > + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
> > + *
> > + * Each bit in the bitmap represents a single occupied (!none/zero) page.
>
> In which bitmap? There are 2 that are declared. Be specific - cc->mthp_bitmap.
>
> > + * A stack structure cc->mthp_bitmap_stack is used to check different regions
>
> > + * of the bitmap for collapse eligibility. We start at the PMD order and
> > + * check if it is eligible for collapse; if not, we add two entries to the
>
> I questioned this since you start at HPAGE_PMD_ORDER -
> KHUGEPAGED_MIN_MTHP_ORDER, but then realised you're intentionally
> offsetting like that.
>
> See comments below about changing this.
>
> > + * stack at a lower order to represent the left and right halves of the region.
> > + *
> > + * For each region, we calculate the number of set bits and compare it
> > + * against a threshold derived from collapse_max_ptes_none(). A region is
> > + * eligible if the number of set bits exceeds this threshold.
> > + */
>
> I think we could be clearer here. Something like:
>
> ...
> * stack at a lower order to represent the left and right halves of the
> * portion of the PTE page table we are examining.
> *
>
> * For each of these, we determine how many PTE entries are occupied in the
> * range of PTE entries we propose to collapse, then compare this to the
> * number of PTE entries which would need to be set for a collapse to be
> * permitted at that order (accounting for max_ptes_none).
> *
> * If a collapse is permissible, we attempt to perform one. We do so for
> * every possible mTHP in the PTE page table.
> */
>
> > +static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
>
> Really inconsistent naming going on here, we're collapsing and scanning and
> what's the bitmap?
>
> How about mthp_collapse()?
>
> > + int referenced, int unmapped, struct collapse_control *cc,
> > + bool *mmap_locked, unsigned long enabled_orders)
> > +{
> > + u8 order, next_order;
> > + u16 offset, mid_offset;
> > + int num_chunks;
> > + int bits_set, threshold_bits;
> > + int top = -1;
>
> This seems unnecessary and confusing. Just start at 0 and treat this as the
> exclusive end of the stack.
>
> You can rename this stack_size to make that clear. Have commented above
> about adjustments to push function and introducing pop helper.
>
>
> > + int collapsed = 0;
> > + int ret;
> > + struct scan_bit_state state;
> > + unsigned int max_none_ptes;
>
> Everywhere else we say max_ptes_none, let's maintain that convention here
> please.
>
> > +
> > + push_mthp_bitmap_stack(cc, &top, HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0);
>
> See below re: order here, we should change this.
>
> > +
> > + while (top >= 0) {
> > + state = cc->mthp_bitmap_stack[top--];
>
> I hate that we have a push helper but then do pop manually. See above.
>
> > + order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
>
> OK so now the order isn't state.order but is instead state.order +
> KHUGEPAGED_MIN_MTHP_ORDER? :/ this is extremely confusing.
>
> We shouldn't call this field order if you're doing a hack where state.order
> isn't the order but instead is order - KHUGEPAGED_MIN_MTHP_ORDER.
>
> Just have state.order be the actual order? And change the below condition
> as mentioned there.
>
> > + offset = state.offset;
> > + num_chunks = 1UL << order;
>
> What's a chunk? You do need to clarify these things. This is a new term not
> used elsewhere in THP code as far as I can tell?
>
> This is the number of pte entries no?
>
> nr_entries? nr_pte_entries?
>
> > +
> > + /* Skip mTHP orders that are not enabled */
>
> Note we're also considering PMD here :) Probably we can just delete this
> comment, the code below makes it clear what you're doing.
>
> > + if (!test_bit(order, &enabled_orders))
> > + goto next_order;
> > +
> > + max_none_ptes = collapse_max_ptes_none(order, !cc->is_khugepaged);
>
> OK so this is going to be scaled to order.
>
> > +
> > + /* Calculate weight of the range */
>
> What's the weight of a range? This isn't a very helpful comment. You're
> counting the Hamming weight or much more clearly - the number of set bits.
>
> So it seems you're simply counting the number of bits you have accumulated
> so far in cc->mthp_bitmap, adding in the latest offset.
>
> So I'd say add a comment saying something like:
>
> /*
> * Determine how many PTE entries are populated in the range in which we
> * propose to collapse this mTHP.
> */
>
> > + bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> > + bitmap_set(cc->mthp_bitmap_mask, offset, num_chunks);
> > + bits_set = bitmap_weight_and(cc->mthp_bitmap,
>
> I think this variable name is pretty horrible, we don't care that it's the
> number of bits set, we care about what it _means_ - that is the number of
> PTE occupied entries.
>
> So nr_occupied_pte_entries? Or nr_occupied_ptes?
>
> > + cc->mthp_bitmap_mask, HPAGE_PMD_NR);
>
> Frustrating there isn't a bitmap_weight_offset() or something, as you could
> do that in one go then...
>
> I think this could be made clearer by separating out the gnarly bitmap
> stuff into a helper function:
>
> static int mthp_nr_occupied_pte_entries(struct collapse_control *cc,
> struct scan_bit_state state)
> {
> const int nr_pte_entries = 1 << state.order;
>
> /* Setup cc->mthp_bitmap_mask to contain mask for candidate mTHP. */
> bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> bitmap_set(cc->mthp_bitmap_mask, state.offset, nr_pte_entries);
> /* Mask against actually occupied PTE entries in PTE table. */
> return bitmap_weight_and(cc->mthp_bitmap,
> cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> }
>
> > +
> > + threshold_bits = (1UL << order) - max_none_ptes - 1;
>
> We defined num_chunks to 1UL << order then don't use here? :)
>
> I'm not sure we need this to be a separate value, and I don't think we need
> the -1 either, which only confuses matter more.
>
> How about just changing the below conditional to (assuming we've renamed
> num_chunks to something sensible like nr_pte_entries and bits_set to
> nr_occupied_pte_entries):
>
> if (nr_occupied_pte_entries >= nr_pte_entries - max_none_ptes) {
> ...
> }
>
> > +
> > + /* Check if the region is eligible based on the threshold */
>
> Probably don't need this comment with the change above.
>
> > + if (bits_set > threshold_bits) {
> > + ret = collapse_huge_page(mm, address, referenced,
> > + unmapped, cc, mmap_locked,
> > + order, offset);
>
> We declare ret at the top of the function scope, then only use it
> here. That's confusing and unnecessary, just declare it in block scope
> here.
>
> > + if (ret == SCAN_SUCCEED) {
> > + collapsed += 1UL << order;
>
> Again we have defined num_chunks or rather nr_pte_entries but then
> open-code 1UL << order, let's just use the value we declared here.
>
> > + continue;
>
> This is kinda subtle, we don't bother considering lower orders any longer
> *in this range*, but do continue to consider mTHP collapse in other
> portions of the PTE page table.
>
> This shouldn't just be a 'continue' :) we need a comment here to explain
> that.
>
> E.g.:
>
> /*
> * We have collapsed an mTHP in this range at the maximum order we
> * could, so we do not push lower orders on to the stack.
> */
> continue;
>
>
> > + }
> > + }
> > +
> > +next_order:
> > + if (state.order > 0) {
>
> This is a great example of how this is confusing by making state.order not
> actually be the order but the order - KHUGEPAGED_MIN_MTHP_ORDER.
>
> Just make the order correct and change this to > KHUGEPAGED_MIN_MTHP_ORDER.
>
> > + next_order = state.order - 1;
>
> Not sure we should have a label and a variable be the same thing.
>
> Also why are we decl'ing next_order at the top of the function but only using here?
>
> Just declare this here, like:
>
> if (state.order > KHUGEPAGED_MIN_MTHP_ORDER) {
> const u16 new_order = state.order - 1;
>
> ...
> }
>
> > + mid_offset = offset + (num_chunks / 2);
> > + push_mthp_bitmap_stack(cc, &top, next_order, mid_offset);
> > + push_mthp_bitmap_stack(cc, &top, next_order, offset);
>
> I guess one subtlety that wouldn't be obvious at first glance is that
> num_chunks (oh so badly needs a rename :) is a power-of-2 so we never get
> weird 'what if num_chunks is odd' scenarios to worry about.
>
> Probably doesn't really need a comment though. But this _does_ badly need
> an ASCII diagram :):
>
> /*
> * The next lowest mTHP order possesses half the number of PTE
> * entries of the current one. We therefore must consider both
> * halves of the current mTHP:
> *
> * offset mid_offset
> * . |
> * . v
> * |---------------.-------------------|
> * | PTE page table |
> * |---------------.-------------------|
> * <--------><-------->
> * order-1 order-1
> */
>
> Since writing this I copied this above in another suggestion :P so you
> could always say 'see comment above for details' or something.
>
> > + }
> > + }
> > + return collapsed;
> > +}
>
> I've commented this function to death here, but a few more things to note.
>
> (BTW - I'm sorry I personally _hate_ repeated iterations of review when
> there's stuff you could have commented in prior iterations BUT I think I
> may end up having to once we respin due to the subtleties here.)
>
> - I wonder if we could just use a helper struct to make this all a little
> easier. Perhaps as it's relatively short code it's not so necessary, but a bit
> horrid to pass around all these parameters all the time. Maybe something
> for later THP rework.
>
> - Could we exit early if it's obvious that we won't be able to collapse due
> to max_ptes_none? I mean for one, we could at least check if the next
> lowest order is empty. If max_ptes_none was 511, then we would have
> already collapsed so can surely throw that out?
>
> I was thinking we could go 'upwards', starting with the lowest order and
> increasing order (essentially reverse things) then not collapsing until
> we can't collapse at a given order (so collapse at next lowest). That
> might be less efficient though.
>
> - Given that we're going to be only permitting max_ptes_none of 0 and 511
> for mTHP to start with, maybe things can be simplified - either all bits
> have to be 1 or we don't care what they are and we attempt collapse anyway?
>
> But then again, maybe it's worth having the generic algorithm in place
> for future flexibility? Thoughts?
>
> - How much have you tested this? This is pretty subtle stuff... it _seems_
> correct to me logically, but this is crying out for some userland testing
> that exhaustively throws every possible permutation of state at this
> function and asserts it's all correct.
>
> - Are we not missing a bunch of stat counts? Didn't we add a bunch but now
> aren't actually setting them? E.g. if we reject mTHP candidates due to
> pte_max_none?
>
> > +
> > static int collapse_scan_pmd(struct mm_struct *mm,
> > struct vm_area_struct *vma,
> > unsigned long start_addr, bool *mmap_locked,
> > @@ -1364,11 +1455,15 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > {
> > pmd_t *pmd;
> > pte_t *pte, *_pte;
> > + int i;
> > int result = SCAN_FAIL, referenced = 0;
> > - int none_or_zero = 0, shared = 0;
> > + int none_or_zero = 0, shared = 0, nr_collapsed = 0;
> > struct page *page = NULL;
> > + unsigned int max_ptes_none;
>
> Correct spelling of this :)
>
> > struct folio *folio = NULL;
> > unsigned long addr;
> > + unsigned long enabled_orders;
> > + bool full_scan = true;
> > spinlock_t *ptl;
> > int node = NUMA_NO_NODE, unmapped = 0;
> >
> > @@ -1378,16 +1473,29 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > if (result != SCAN_SUCCEED)
> > goto out;
> >
> > + bitmap_zero(cc->mthp_bitmap, HPAGE_PMD_NR);
> > memset(cc->node_load, 0, sizeof(cc->node_load));
> > nodes_clear(cc->alloc_nmask);
> > +
> > + enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged);
> > +
> > + /*
> > + * If PMD is the only enabled order, enforce max_ptes_none, otherwise
> > + * scan all pages to populate the bitmap for mTHP collapse.
> > + */
>
> Ugh this is quite ugly. I don't really love that we've converted this from
> doing the actual work to _mostly_ just populating the bitmap for the mthp
> logic.
>
> Then again it's only a couple places where this is checked, but it's pretty
> horrible that what once was 'the logic that determines what is being
> considered for THP collapse' is now turned into 'the logic that populates a
> bitmap'.
>
> > + if (cc->is_khugepaged && enabled_orders == _BITUL(HPAGE_PMD_ORDER))
>
> I think this should be BIT(HPAGE_PMD_ORDER), I realise I reviewed the
> opposite before (or think I did) but as per David we prefer BIT() :)
>
> > + full_scan = false;
> > + max_ptes_none = collapse_max_ptes_none(HPAGE_PMD_ORDER, full_scan);
>
> Again really quite nasty, this may as well be:
>
> if (cc->is_khugepaged && enabled_orders == BIT(HPAGE_PMD_ORDER))
> max_ptes_none = khugepaged_max_ptes_none;
> else
> max_ptes_none = HPAGE_PMD_NR - 1;
>
> It makes this hack a lot more obvious.
>
> But also, what if !cc->is_khugepaged? We're going to scan everything even
> if we only have PMD? I thought we only considered PMD size for MADV_COLLAPSE?
>
> > +
> > pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> > if (!pte) {
> > result = SCAN_PMD_NULL;
> > goto out;
> > }
> >
> > - for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> > - _pte++, addr += PAGE_SIZE) {
> > + for (i = 0; i < HPAGE_PMD_NR; i++) {
> > + _pte = pte + i;
> > + addr = start_addr + i * PAGE_SIZE;
>
> That's nicer. I still hate _pte...
>
> > pte_t pteval = ptep_get(_pte);
> > if (is_swap_pte(pteval)) {
> > ++unmapped;
> > @@ -1412,8 +1520,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > if (pte_none_or_zero(pteval)) {
> > ++none_or_zero;
> > if (!userfaultfd_armed(vma) &&
> > - (!cc->is_khugepaged ||
> > - none_or_zero <= khugepaged_max_ptes_none)) {
> > + none_or_zero <= max_ptes_none) {
>
> Why are we dropping !cc->is_khugepaged?
>
> > continue;
> > } else {
> > result = SCAN_EXCEED_NONE_PTE;
> > @@ -1461,6 +1568,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > }
> > }
> >
> > + /* Set bit for occupied pages */
> > + bitmap_set(cc->mthp_bitmap, i, 1);
>
> Maybe worth highlighting this is now _the entire point_ of the loop.
>
> I wonder if we shouldn't just separate this logic out and name it
> appropriately? As we're into realms of real confusion here.
>
> > /*
> > * Record which node the original page is from and save this
> > * information to cc->node_load[].
> > @@ -1517,9 +1626,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > out_unmap:
> > pte_unmap_unlock(pte, ptl);
> > if (result == SCAN_SUCCEED) {
> > - result = collapse_huge_page(mm, start_addr, referenced,
> > - unmapped, cc, mmap_locked,
> > - HPAGE_PMD_ORDER, 0);
>
> Hmm... what's actually enforcing that MADV_COLLAPSE isn't using this?
> You've not done any cc->khugepaged checks afaict?
>
> It seems that you _are_ enabling this for MADV_COLLAPSE unless I've missed
> something?
>
> > + nr_collapsed = collapse_scan_bitmap(mm, start_addr, referenced, unmapped,
> > + cc, mmap_locked, enabled_orders);
> > + if (nr_collapsed > 0)
> > + result = SCAN_SUCCEED;
> > + else
> > + result = SCAN_FAIL;
> > }
> > out:
> > trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
> > --
> > 2.51.0
> >
>
> Thanks, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support
2025-11-19 11:53 ` Lorenzo Stoakes
2025-11-19 12:08 ` Lorenzo Stoakes
@ 2025-11-20 22:32 ` Nico Pache
1 sibling, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-11-20 22:32 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Nov 19, 2025 at 4:56 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:14PM -0600, Nico Pache wrote:
> > During PMD range scanning, track occupied pages in a bitmap. If mTHPs are
> > enabled we remove the restriction of max_ptes_none during the scan phase
> > to avoid missing potential mTHP candidates.
>
> It's a bit odd to open the commit message with a very specific
> implementation detail, I think we should instead open with a broad
> description of what we intend here, e.g. to permit mTHP collapse, before:
>
> - Discussing the algorithm used (in more detail than below!)
> - How and under what circumstances this algorithm is invoked
> - (Mention MADV_COLLAPSE not supporting mTHP as of yet)
> - THEN super-specific details like this
>
> >
> > Implement collapse_scan_bitmap() to perform binary recursion on the bitmap
> > and determine the best eligible order for the collapse. A stack struct is
> > used instead of traditional recursion. The algorithm splits the bitmap
> > into smaller chunks to find the best fit mTHP. max_ptes_none is scaled by
> > the attempted collapse order to determine how "full" an order must be
> > before being considered for collapse.
>
> I feel this is a _very_ brief description of a complicated algorithm. I
> think we should go into a lot more detail here. 'Binary recursion' is pretty
> hand-wavey, and you go from hand waving that to being super-specific about
> max_ptes_none before handwaving about 'fullness' of an order.
>
> All in all I find it super confusing - so I think you need to take a step
> back and 'explain it to me like I'm 5' here :)
Sounds good, I'll rework the commit message with your feedback in mind! Thanks!
>
> >
> > Once we determine what mTHP sizes fits best in that PMD range a collapse
> > is attempted. A minimum collapse order of 2 is used as this is the lowest
> > order supported by anon memory.
>
> I don't know what 'lowest order supported by anon memory' means?
>
> I guess really this is the minimum order contptes support for arm64 right?
Anonymous memory supports mTHP sizes of order 2 or greater.
>
> >
> > mTHP collapses reject regions containing swapped out or shared pages.
> > This is because adding new entries can lead to new none pages, and these
> > may lead to constant promotion into a higher order (m)THP. A similar
> > issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
> > introducing at least 2x the number of pages, and on a future scan will
> > satisfy the promotion condition once again. This issue is prevented via
> > the collapse_allowable_orders() function.
>
> Obviously this has been discussed to death, but you should update this to
> reflect the decided upon course (0, 511 + warning, etc.).
Yeah, I wasn't sure whether to reference collapse_allowable_orders(),
which should now dictate this limitation, or directly reference the
limitations here. I'll do both.
>
> >
> > Currently madv_collapse is not supported and will only attempt PMD
> > collapse.
>
> Good to highlight this.
>
> >
> > We can also remove the check for is_khugepaged inside the PMD scan as
> > the collapse_max_ptes_none() function handles this logic now.
>
> Again we're kind of leaping from mega handwaving to super-specific details
> :) let's make it all a lot more specific + clear, and then put the really
> niche details like this at the end of the commit msg (I mean this one is
> fine where it is ofc as a result :)
>
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > include/linux/khugepaged.h | 2 +
> > mm/khugepaged.c | 128 ++++++++++++++++++++++++++++++++++---
> > 2 files changed, 122 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> > index eb1946a70cff..179ce716e769 100644
> > --- a/include/linux/khugepaged.h
> > +++ b/include/linux/khugepaged.h
> > @@ -1,6 +1,8 @@
> > /* SPDX-License-Identifier: GPL-2.0 */
> > #ifndef _LINUX_KHUGEPAGED_H
> > #define _LINUX_KHUGEPAGED_H
> > +#define KHUGEPAGED_MIN_MTHP_ORDER 2
> > +#define MAX_MTHP_BITMAP_STACK (1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER))
>
> This is an internal implementation detail, I don't think we need this in a
> header do we? I think this define should be in khugepaged.c.
sounds good!
>
> Also this is a really fiddly and confusing value, I don't think it's a good idea
> to just put this here without explanation.
This is sadly an outcome of ppc64. HPAGE_PMD_ORDER cannot be used due
to ppc defining this at runtime, so instead we use
lg(MAX_PTRS_PER_PTE).
>
> It's not even clear what it is. I'd probably rename it to MTHP_STACK_SIZE?
Yeah MTHP_STACK_SIZE is better!
>
> We need a comment that explains how you're deriving this, something like:
>
> /*
> * In order to determine mTHP order, we consider every possible mTHP size
> * starting with MAX_PTRS_PER_PTE PTE entries and stopping at
> * 2^KHUGEPAGED_MIN_MTHP_ORDER.
> *
> * We store (offset, order) pairs on the stack to do so, each describing a
> * candidate mTHP collapse.
> *
> * For each (offset, order) candidate mTHP range that we consider, we must
> * also consider candidate mTHPs at (offset, order - 1), and
> * (offset + (1 << (order - 1)), order - 1).
> *
> * This is because each order can be split into two (an order expresses the
> * power-of-two size), so we examine each half of the next lower order
> * mTHP:
> *
> * offset mid_offset
> * . |
> * . v
> * |---------------.-------------------|
> * | PTE page table |
> * |---------------.-------------------|
> * <--------><-------->
> * order-1 order-1
> *
> * Given we must consider ranges of 2^KHUGEPAGED_MIN_MTHP_ORDER to
> * MAX_PTRS_PER_PTE PTE entries, this is the same as saying we
> * must consider KHUGEPAGED_MIN_MTHP_ORDER to lg2(MAX_PTRS_PER_PTE) mTHP
> * orders.
> *
> * As we must consider 2 possible mTHP ranges for each order, this requires
> * our stack to be 2^n, where n is the number of orders we must consider.
> *
> * And thus MTHP_STACK_SIZE is 2^(lg2(MAX_PTRS_PER_PTE) -
> * KHUGEPAGED_MIN_MTHP_ORDER).
> */
>
> This may seem (very) long-winded, but this is all really non-obvious.
>
> You can additionally rephrase this and utilise it in the commit message,
> description of the iterative recursion function and possibly elsewhere to
> describe the algorithm more clearly.
>
> In fact, since this should really be declared in khugepaged.c, and since
> you can place it just before the mthp collapse function, you could expand
> this to describe the algorithm as a whole and simply put the define and the
> function immediately next to each other after the comment?
Sounds good, I'll break this down in more detail and try to group the
functions/comments into one section. It may not be fully possible to
keep the code together, as some things are needed early in the code (i.e.
MTHP_STACK_SIZE would be needed at the collapse_control definition)
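Just to spell the arithmetic out for the common case (a sketch assuming 4K
pages, where MAX_PTRS_PER_PTE is 512 and so ilog2(MAX_PTRS_PER_PTE) is 9):

    MAX_MTHP_BITMAP_STACK = 1UL << (ilog2(512) - KHUGEPAGED_MIN_MTHP_ORDER)
                          = 1UL << (9 - 2)
                          = 128

i.e. one stack slot per order-2 (4-page) chunk of the PMD range, which is a
safe upper bound on how many candidate (offset, order) pairs can ever be
outstanding at once.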
>
> >
> > #include <linux/mm.h>
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 89a105124790..e2319bfd0065 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -93,6 +93,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
> >
> > static struct kmem_cache *mm_slot_cache __ro_after_init;
> >
> > +struct scan_bit_state {
>
> Scan bit state is a bit of a weird name. Scanning what? What bit? State is
> kind of implied?
>
> struct order_offset_pair?
>
> struct mthp_range?
mthp_range sounds good to me!
>
> > + u8 order;
> > + u16 offset;
>
> Real mega nit, but feels more natural to put offset first here. As
> (position, size) seems more naturally the way to view this than (size,
> position).
ack!
>
> > +};
> > +
>
> Also needs comments...? Order of what? Offset in what?
>
> > struct collapse_control {
> > bool is_khugepaged;
> >
> > @@ -101,6 +106,13 @@ struct collapse_control {
> >
> > /* nodemask for allocation fallback */
> > nodemask_t alloc_nmask;
> > +
> > + /*
> > + * bitmap used to collapse mTHP sizes.
> > + */
>
> Nit but this should be on one line /* Bitmap used to collapse mTHP sizes */.
ACK, already handled this one!
>
> But we're not storing sizes though are we? And we're declaring two bitmaps?
>
> > + DECLARE_BITMAP(mthp_bitmap, HPAGE_PMD_NR);
>
> Really this is more of a PTE table bitmap but probably fine to call it this.
>
> > + DECLARE_BITMAP(mthp_bitmap_mask, HPAGE_PMD_NR);
>
> You've added random whitespace after the tab twice here? [tab][space]DECLARE_...
>
> > + struct scan_bit_state mthp_bitmap_stack[MAX_MTHP_BITMAP_STACK];
> > };
> >
> > /**
> > @@ -1357,6 +1369,85 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long pmd_address,
> > return result;
> > }
> >
> > +static void push_mthp_bitmap_stack(struct collapse_control *cc, int *top,
> > + u8 order, u16 offset)
>
> Not sure we need to say mthp_bitmap_stack everywhere. This is a local
> static function we can be a little more succinct.
>
> mthp_stack_push()?
looks good!
>
> > +{
> > + cc->mthp_bitmap_stack[++*top] = (struct scan_bit_state)
> > + { order, offset };
>
> This feels rather difficult to read, cc->mthp_bitmap_stack[++*top] in
> particular is rather too succinct.
>
> This would be better more broken out, e.g.:
>
> static void mthp_stack_push(struct collapse_control *cc, int *sizep,
> u8 order, u16 offset)
> {
> const int size = *sizep;
> struct scan_bit_state *stack = &cc->mthp_bitmap_stack[size];
>
> VM_WARN_ON_ONCE(size >= MAX_MTHP_BITMAP_STACK);
> stack->order = order;
> stack->offset = offset;
> (*sizep)++;
> }
>
> (Note this requires the change I suggest below re: not defaulting top to -1
> but instead renaming it to stack_size and starting at 0, see below).
>
> Re: below comment having pop as a helper too, that can be:
>
> static struct scan_bit_state mthp_stack_pop(struct collapse_control *cc,
> int *sizep)
> {
> const int size = *sizep;
>
> VM_WARN_ON_ONCE(size <= 0);
> (*sizep)--;
> return cc->mthp_bitmap_stack[size - 1];
> }
ack sounds good. I implemented these more verbosely!
>
> > +}
> > +
> > +/*
> > + * collapse_scan_bitmap() consumes the bitmap that is generated during
> > + * collapse_scan_pmd() to determine what regions and mTHP orders fit best.
> > + *
> > + * Each bit in the bitmap represents a single occupied (!none/zero) page.
>
> In which bitmap? There are 2 that are declared. Be specific - cc->mthp_bitmap.
>
> > + * A stack structure cc->mthp_bitmap_stack is used to check different regions
>
> > + * of the bitmap for collapse eligibility. We start at the PMD order and
> > + * check if it is eligible for collapse; if not, we add two entries to the
>
> I questioned this since you start at HPAGE_PMD_ORDER -
> KHUGEPAGED_MIN_MTHP_ORDER, but then realised you're intentionally
> offsetting like that.
>
> See comments below about changing this.
>
> > + * stack at a lower order to represent the left and right halves of the region.
> > + *
> > + * For each region, we calculate the number of set bits and compare it
> > + * against a threshold derived from collapse_max_ptes_none(). A region is
> > + * eligible if the number of set bits exceeds this threshold.
> > + */
>
> I think we could be clearer here. Something like:
>
> ...
> * stack at a lower order to represent the left and right halves of the
> * portion of the PTE page table we are examining.
> *
>
> * For each of these, we determine how many PTE entries are occupied in the
> * range of PTE entries we propose to collapse, then compare this to the
> * number of PTE entries which would need to be set for a collapse to be
> * permitted at that order (accounting for max_ptes_none).
> *
> * If a collapse is permissible, we attempt to perform one. We do so for
> * every possible mTHP in the PTE page table.
> */
>
> > +static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
>
> Really inconsistent naming going on here, we're collapsing and scanning and
> what's the bitmap?
>
> How about mthp_collapse()?
ok sounds good
>
> > + int referenced, int unmapped, struct collapse_control *cc,
> > + bool *mmap_locked, unsigned long enabled_orders)
> > +{
> > + u8 order, next_order;
> > + u16 offset, mid_offset;
> > + int num_chunks;
> > + int bits_set, threshold_bits;
> > + int top = -1;
>
> This seems unnecessary and confusing. Just start at 0 and treat this as the
> exclusive end of the stack.
>
> You can rename this stack_size to make that clear. Have commented above
> about adjustments to push function and introducing pop helper.
>
>
> > + int collapsed = 0;
> > + int ret;
> > + struct scan_bit_state state;
> > + unsigned int max_none_ptes;
>
> Everywhere else we say max_ptes_none, let's maintain that convention here
> please.
ack, didn't realize it was different. Must have been tired eyes.
>
> > +
> > + push_mthp_bitmap_stack(cc, &top, HPAGE_PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER, 0);
>
> See below re: order here, we should change this.
>
> > +
> > + while (top >= 0) {
> > + state = cc->mthp_bitmap_stack[top--];
>
> I hate that we have a push helper but then do pop manually. See above.
>
> > + order = state.order + KHUGEPAGED_MIN_MTHP_ORDER;
>
> OK so now the order isn't state.order but is instead state.order +
> KHUGEPAGED_MIN_MTHP_ORDER? :/ this is extremely confusing.
>
> We shouldn't call this field order if you're doing a hack where state.order
> isn't the order but instead is order - KHUGEPAGED_MIN_MTHP_ORDER.
>
> Just have state.order be the actual order? And change the below condition
> as mentioned there.
Sounds good, Wei already suggested something similar. This made more
sense when we were compressing the bitmap into 128 bits. Already
changed :)
>
> > + offset = state.offset;
> > + num_chunks = 1UL << order;
>
> What's a chunk? You do need to clarify these things. This is a new term not
> used elsewhere in THP code as far as I can tell?
>
> This is the number of pte entries no?
>
> nr_entries? nr_pte_entries?
Yeah that looks much cleaner. Some remnants from my RFC.
>
> > +
> > + /* Skip mTHP orders that are not enabled */
>
> Note we're also considering PMD here :) Probably we can just delete this
> comment, the code below makes it clear what you're doing.
>
> > + if (!test_bit(order, &enabled_orders))
> > + goto next_order;
> > +
> > + max_none_ptes = collapse_max_ptes_none(order, !cc->is_khugepaged);
>
> OK so this is going to be scaled to order.
>
> > +
> > + /* Calculate weight of the range */
>
> What's the weight of a range? This isn't a very helpful comment. You're
> counting the Hamming weight or much more clearly - the number of set bits.
>
> So it seems you're simply counting the number of bits you have accumulated
> so far in cc->mthp_bitmap, adding in the latest offset.
>
> So I'd say add a comment saying something like:
>
> /*
> * Determine how many PTE entries are populated in the range in which we
> * propose to collapse this mTHP.
> */
>
> > + bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> > + bitmap_set(cc->mthp_bitmap_mask, offset, num_chunks);
> > + bits_set = bitmap_weight_and(cc->mthp_bitmap,
>
> I think this variable name is pretty horrible, we don't care that it's the
> number of bits set, we care about what it _means_ - that is the number of
> PTE occupied entries.
>
> So nr_occupied_pte_entries? Or nr_occupied_ptes?
ack, looks better!
>
> > + cc->mthp_bitmap_mask, HPAGE_PMD_NR);
>
> Frustrating there isn't a bitmap_weight_offset() or something, as you could
> do that in one go then...
Yeah it's a shame, my previous implementation was much worse. I found
a better solution (this one) a few versions ago.
>
> I think this could be made clearer by separating out the gnarly bitmap
> stuff into a helper function:
>
> static int mthp_nr_occupied_pte_entries(struct collapse_control *cc,
> struct scan_bit_state state)
> {
> const int nr_pte_entries = 1 << state.order;
>
> /* Setup cc->mthp_bitmap_mask to contain mask for candidate mTHP. */
> bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> bitmap_set(cc->mthp_bitmap_mask, state.offset, nr_pte_entries);
> /* Mask against actually occupied PTE entries in PTE table. */
> return bitmap_weight_and(cc->mthp_bitmap,
> cc->mthp_bitmap_mask, HPAGE_PMD_NR);
> }
Done, this fits well with all the other helpers!
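For what it's worth, with that helper plus the renames suggested above, the
eligibility check in the main loop boils down to something like this (a rough
sketch combining the suggestions in this thread, not the final code):

    nr_occupied_ptes = mthp_nr_occupied_pte_entries(cc, state);
    if (nr_occupied_ptes >= nr_pte_entries - max_ptes_none)
            ret = collapse_huge_page(mm, address, referenced, unmapped,
                                     cc, mmap_locked, state.order,
                                     state.offset);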
>
> > +
> > + threshold_bits = (1UL << order) - max_none_ptes - 1;
>
> We defined num_chunks to 1UL << order then don't use here? :)
Cleaned these up! thanks :)
>
> I'm not sure we need this to be a separate value, and I don't think we need
> the -1 either, which only confuses matters more.
>
> How about just changing the below conditional to (assuming we've renamed
> num_chunks to something sensible like nr_pte_entries and bits_set to
> nr_occupied_pte_entries):
>
> if (nr_occupied_pte_entries >= nr_pte_entries - max_none_ptes) {
> ...
> }
>
> > +
> > + /* Check if the region is eligible based on the threshold */
>
> Probably don't need this comment with the change above.
ack, that does look cleaner. Although iirc there was some weird corner
case that required the -1. This was back when we were compressing the
bitmap. I reviewed the logic, I think we can go with this solution
now. I'll make sure to test the corner cases I have.
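For anyone following along, a quick worked example showing the two forms are
equivalent (numbers picked for illustration only):

    /* order 4 candidate: nr_pte_entries = 1 << 4 = 16           */
    /* old form: bits_set >  16 - max_none_ptes - 1              */
    /* new form: bits_set >= 16 - max_none_ptes                  */
    /* max_none_ptes = 0  -> all 16 PTEs must be occupied        */
    /* max_none_ptes = 15 -> a single occupied PTE is enough     */

Since everything here is an integer, x > n - 1 and x >= n always agree, so
dropping the -1 shouldn't change behaviour.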
>
> > + if (bits_set > threshold_bits) {
> > + ret = collapse_huge_page(mm, address, referenced,
> > + unmapped, cc, mmap_locked,
> > + order, offset);
>
> We declare ret at the top of the function scope, then only use it
> here. That's confusing and unnecessary, just declare it in block scope
> here.
>
> > + if (ret == SCAN_SUCCEED) {
> > + collapsed += 1UL << order;
>
> Again we have defined num_chunks or rather nr_pte_entries but then
> open-code 1UL << order, let's just use the value we declared here.
>
> > + continue;
>
> This is kinda subtle, we don't bother considering lower orders any longer
> *in this range*, but do continue to consider mTHP collapse in other
> portions of the PTE page table.
>
> This shouldn't just be a 'continue' :) we need a comment here to explain
> that.
Sounds good, I'll separate and add more to the comments to help explain
the flow. (more applicable to patch 13)
>
> E.g.:
>
> /*
> * We have collapsed an mTHP in this range at the maximum order we
> * could, so we do not push lower orders on to the stack.
> */
> continue;
>
>
> > + }
> > + }
> > +
> > +next_order:
> > + if (state.order > 0) {
>
> This is a great example of how this is confusing by making state.order not
> actually be the order but the order - KHUGEPAGED_MIN_MTHP_ORDER.
>
> Just make the order correct and change this to > KHUGEPAGED_MIN_MTHP_ORDER.
>
> > + next_order = state.order - 1;
>
> Not sure we should have a label and a variable be the same thing.
>
> Also why are we decl'ing next_order at the top of the function but only using here?
ack.
>
> Just declare this here, like:
>
> if (state.order > KHUGEPAGED_MIN_MTHP_ORDER) {
> const u16 new_order = state.order - 1;
>
> ...
> }
>
> > + mid_offset = offset + (num_chunks / 2);
> > + push_mthp_bitmap_stack(cc, &top, next_order, mid_offset);
> > + push_mthp_bitmap_stack(cc, &top, next_order, offset);
>
> I guess one subtlety that wouldn't be obvious at first glance is that
> num_chunks (oh so badly needs a rename :) is a power-of-2 so we never get
> weird 'what if num_chunks is odd' scenarios to worry about.
>
> Probably doesn't really need a comment though. But this _does_ badly need
> an ASCII diagram :):
>
> /*
> * The next lowest mTHP order possesses half the number of PTE
> * entries of the current one. We therefore must consider both
> * halves of the current mTHP:
> *
> * offset mid_offset
> * . |
> * . v
> * |---------------.-------------------|
> * | PTE page table |
> * |---------------.-------------------|
> * <--------><-------->
> * order-1 order-1
> */
>
yeah a diagram would help a lot!
> Since writing this I copied this above in another suggestion :P so you
> could always say 'see comment above for details' or something.
>
> > + }
> > + }
> > + return collapsed;
> > +}
>
> I've commented this function to death here, but a few more things to note.
>
> (BTW - I'm sorry I personally _hate_ repeated iterations of review when
> there's stuff you could have commented in prior iterations BUT I think I
> may end up having to once we respin due to the subtleties here.)
>
> - I wonder if we could just use a helper struct to make this all a little
> easier. Perhaps as it's relatively short code not so necessary, but a bit
> horrid to pass around all these parameters all the time. Maybe something
> for later THP rework.
>
> - Could we exit early if it's obvious that we won't be able to collapse due
> to max_ptes_none? I mean for one, we could at least check if the next
> lowest order is empty. If max_ptes_none was 511, then we would have
> already collapsed so can surely throw that out?
>
> I was thinking we could go 'upwards', starting with the lowest order and
> increasing order (essentially reverse things) then not collapsing until
> we can't collapse at a given order (so collapse at next lowest). That
> might be less efficient though.
>
> - Given that we're going to be only permitting max_ptes_none of 0 and 511
> for mTHP to start with, maybe things can be simplified - either all bits
> have to be 1 or we don't care what they are and we attempt collapse anyway?
>
> But then again, maybe it's worth having the generic algorithm in place
> for future flexibility? Thoughts?
I'd prefer to leave the generic algorithm for future work. ie
eagerness, and Baolins shmem mthp collapse support.
>
> - How much have you tested this? This is pretty subtle stuff... it _seems_
> correct to me logically, but this is crying out for some userland testing
> that exhaustively throws every possible permutation of state at this
> function and asserts it's all correct.
Lots! check out https://gitlab.com/npache/khugepaged_mthp_test I use
this to test a number of edge cases, gather statistics, etc.
We've also run a number of our internal CI on this including
performance testing.
>
> - Are we not missing a bunch of stat counts? Didn't we add a bunch but are
> not actually setting them? E.g. if we reject mTHP candidates due to
> max_ptes_none?
They should already be added in the generalization patches.
>
> > +
> > static int collapse_scan_pmd(struct mm_struct *mm,
> > struct vm_area_struct *vma,
> > unsigned long start_addr, bool *mmap_locked,
> > @@ -1364,11 +1455,15 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > {
> > pmd_t *pmd;
> > pte_t *pte, *_pte;
> > + int i;
> > int result = SCAN_FAIL, referenced = 0;
> > - int none_or_zero = 0, shared = 0;
> > + int none_or_zero = 0, shared = 0, nr_collapsed = 0;
> > struct page *page = NULL;
> > + unsigned int max_ptes_none;
>
> Correct spelling of this :)
>
> > struct folio *folio = NULL;
> > unsigned long addr;
> > + unsigned long enabled_orders;
> > + bool full_scan = true;
> > spinlock_t *ptl;
> > int node = NUMA_NO_NODE, unmapped = 0;
> >
> > @@ -1378,16 +1473,29 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > if (result != SCAN_SUCCEED)
> > goto out;
> >
> > + bitmap_zero(cc->mthp_bitmap, HPAGE_PMD_NR);
> > memset(cc->node_load, 0, sizeof(cc->node_load));
> > nodes_clear(cc->alloc_nmask);
> > +
> > + enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged);
> > +
> > + /*
> > + * If PMD is the only enabled order, enforce max_ptes_none, otherwise
> > + * scan all pages to populate the bitmap for mTHP collapse.
> > + */
>
> Ugh this is quite ugly. I don't really love that we've converted this from
> doing the actual work to _mostly_ just populating the bitmap for the mthp
> logic.
>
> Then again it's only a couple places where this is checked, but it's pretty
> horrible that what once was _the logic that determines what is being
> considered for THP collapse' is now turned into 'the logic that populates a
> bitmap'.
>
> > + if (cc->is_khugepaged && enabled_orders == _BITUL(HPAGE_PMD_ORDER))
>
> I think this should be BIT(HPAGE_PMD_ORDER), I realise I reviewed the
> opposite before (or think I did) but as per David we prefer BIT() :)
>
> > + full_scan = false;
> > + max_ptes_none = collapse_max_ptes_none(HPAGE_PMD_ORDER, full_scan);
>
> Again really quite nasty, this may as well be:
>
> if (cc->is_khugepaged && enabled_orders == BIT(HPAGE_PMD_ORDER))
> max_ptes_none = khugepaged_max_ptes_none;
> else
> max_ptes_none = HPAGE_PMD_NR - 1;
>
> It makes this hack a lot more obvious.
The point of collapse_max_ptes_none was to centralize all this logic
into a helper function.
This check/toggle is mostly to preserve the original khugepaged
behavior (aborting during the scan phase) if only PMD is enabled, i.e.
full scan vs abort early.
>
> But also, what if !cc->is_khugepaged? We're going to scan everything even
> if we only have PMD? I thought we only considered PMD size for MADV_COLLAPSE?
MADV_COLLAPSE also ignores sysfs tunables. So if !khugepaged we still
do the full scan.
>
> > +
> > pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
> > if (!pte) {
> > result = SCAN_PMD_NULL;
> > goto out;
> > }
> >
> > - for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
> > - _pte++, addr += PAGE_SIZE) {
> > + for (i = 0; i < HPAGE_PMD_NR; i++) {
> > + _pte = pte + i;
> > + addr = start_addr + i * PAGE_SIZE;
>
> That's nicer. I still hate _pte...
>
> > pte_t pteval = ptep_get(_pte);
> > if (is_swap_pte(pteval)) {
> > ++unmapped;
> > @@ -1412,8 +1520,7 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > if (pte_none_or_zero(pteval)) {
> > ++none_or_zero;
> > if (!userfaultfd_armed(vma) &&
> > - (!cc->is_khugepaged ||
> > - none_or_zero <= khugepaged_max_ptes_none)) {
> > + none_or_zero <= max_ptes_none) {
>
> Why are we dropping !cc->is_khugepaged?
One of the nice things about using the collapse_max_ptes_none helper
is we simplify the logic here. If !cc->is_khugepaged (i.e.
madv_collapse) we ignore the max_ptes_none value. But the helper
already does this by returning HPAGE_PMD_NR in the case of
madv_collapse.
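To make the behaviour being described concrete, the helper presumably looks
roughly like this (an assumed shape for illustration only - in particular the
scaling shown here is a guess, not lifted from the series):

    static unsigned int collapse_max_ptes_none(unsigned int order, bool no_limit)
    {
            /* MADV_COLLAPSE, or a full scan for mTHP: effectively no limit */
            if (no_limit)
                    return 1 << order;
            /* khugepaged: scale the sysfs tunable down to this order */
            return khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
    }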
>
> > continue;
> > } else {
> > result = SCAN_EXCEED_NONE_PTE;
> > @@ -1461,6 +1568,8 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > }
> > }
> >
> > + /* Set bit for occupied pages */
> > + bitmap_set(cc->mthp_bitmap, i, 1);
>
> Maybe worth highlighting this is now _the entire point_ of the loop.
>
> I wonder if we shouldn't just separate this logic out and name it
> appropriately? As we're into realms of real confusion here.
That is the cleanup that conflicts with my series. We decided to wait
until after, as with my changes the helper that was suggested needs to
be reworked.
>
> > /*
> > * Record which node the original page is from and save this
> > * information to cc->node_load[].
> > @@ -1517,9 +1626,12 @@ static int collapse_scan_pmd(struct mm_struct *mm,
> > out_unmap:
> > pte_unmap_unlock(pte, ptl);
> > if (result == SCAN_SUCCEED) {
> > - result = collapse_huge_page(mm, start_addr, referenced,
> > - unmapped, cc, mmap_locked,
> > - HPAGE_PMD_ORDER, 0);
>
> Hmm... what's actually enforcing that MADV_COLLAPSE isn't using this?
> You've not done any cc->khugepaged checks afaict?
The collapse_allowable_orders helper function handles this.
>
> It seems that you _are_ enabling this for MADV_COLLAPSE unless I've missed
> something?
>
> > + nr_collapsed = collapse_scan_bitmap(mm, start_addr, referenced, unmapped,
> > + cc, mmap_locked, enabled_orders);
> > + if (nr_collapsed > 0)
> > + result = SCAN_SUCCEED;
> > + else
> > + result = SCAN_FAIL;
> > }
> > out:
> > trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
> > --
> > 2.51.0
> >
>
> Thanks, Lorenzo
Thanks for the very thorough review! Hopefully I didn't miss any of
your points. I'll get these changes in place before my next version :)
Cheers,
-- Nico
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (11 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 12/15] khugepaged: Introduce mTHP collapse support Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-11-09 2:40 ` Wei Yang
2025-11-19 12:05 ` Lorenzo Stoakes
2025-10-22 18:37 ` [PATCH v12 mm-new 14/15] khugepaged: run khugepaged for all orders Nico Pache
` (2 subsequent siblings)
15 siblings, 2 replies; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
There are cases where, if an attempted collapse fails, all subsequent
orders are guaranteed to also fail. Avoid these collapse attempts by
bailing out early.
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 31 ++++++++++++++++++++++++++++++-
1 file changed, 30 insertions(+), 1 deletion(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e2319bfd0065..54f5c7888e46 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1431,10 +1431,39 @@ static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
ret = collapse_huge_page(mm, address, referenced,
unmapped, cc, mmap_locked,
order, offset);
- if (ret == SCAN_SUCCEED) {
+
+ /*
+ * Analyze failure reason to determine next action:
+ * - goto next_order: try smaller orders in same region
+ * - continue: try other regions at same order
+ * - break: stop all attempts (system-wide failure)
+ */
+ switch (ret) {
+ /* Cases where we should continue to the next region */
+ case SCAN_SUCCEED:
collapsed += 1UL << order;
+ fallthrough;
+ case SCAN_PTE_MAPPED_HUGEPAGE:
continue;
+ /* Cases where lower orders might still succeed */
+ case SCAN_LACK_REFERENCED_PAGE:
+ case SCAN_EXCEED_NONE_PTE:
+ case SCAN_EXCEED_SWAP_PTE:
+ case SCAN_EXCEED_SHARED_PTE:
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_COUNT:
+ case SCAN_PAGE_LRU:
+ case SCAN_PAGE_NULL:
+ case SCAN_DEL_PAGE_LRU:
+ case SCAN_PTE_NON_PRESENT:
+ case SCAN_PTE_UFFD_WP:
+ case SCAN_ALLOC_HUGE_PAGE_FAIL:
+ goto next_order;
+ /* All other cases should stop collapse attempts */
+ default:
+ break;
}
+ break;
}
next_order:
--
2.51.0
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts
2025-10-22 18:37 ` [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts Nico Pache
@ 2025-11-09 2:40 ` Wei Yang
2025-11-17 18:16 ` Nico Pache
2025-11-19 12:05 ` Lorenzo Stoakes
1 sibling, 1 reply; 91+ messages in thread
From: Wei Yang @ 2025-11-09 2:40 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:15PM -0600, Nico Pache wrote:
>There are cases where, if an attempted collapse fails, all subsequent
>orders are guaranteed to also fail. Avoid these collapse attempts by
>bailing out early.
>
>Signed-off-by: Nico Pache <npache@redhat.com>
>---
> mm/khugepaged.c | 31 ++++++++++++++++++++++++++++++-
> 1 file changed, 30 insertions(+), 1 deletion(-)
>
>diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>index e2319bfd0065..54f5c7888e46 100644
>--- a/mm/khugepaged.c
>+++ b/mm/khugepaged.c
>@@ -1431,10 +1431,39 @@ static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> ret = collapse_huge_page(mm, address, referenced,
> unmapped, cc, mmap_locked,
> order, offset);
>- if (ret == SCAN_SUCCEED) {
>+
>+ /*
>+ * Analyze failure reason to determine next action:
>+ * - goto next_order: try smaller orders in same region
>+ * - continue: try other regions at same order
>+ * - break: stop all attempts (system-wide failure)
>+ */
>+ switch (ret) {
>+ /* Cases were we should continue to the next region */
>+ case SCAN_SUCCEED:
> collapsed += 1UL << order;
>+ fallthrough;
>+ case SCAN_PTE_MAPPED_HUGEPAGE:
> continue;
>+ /* Cases were lower orders might still succeed */
>+ case SCAN_LACK_REFERENCED_PAGE:
>+ case SCAN_EXCEED_NONE_PTE:
>+ case SCAN_EXCEED_SWAP_PTE:
>+ case SCAN_EXCEED_SHARED_PTE:
>+ case SCAN_PAGE_LOCK:
>+ case SCAN_PAGE_COUNT:
>+ case SCAN_PAGE_LRU:
>+ case SCAN_PAGE_NULL:
>+ case SCAN_DEL_PAGE_LRU:
>+ case SCAN_PTE_NON_PRESENT:
>+ case SCAN_PTE_UFFD_WP:
>+ case SCAN_ALLOC_HUGE_PAGE_FAIL:
>+ goto next_order;
>+ /* All other cases should stop collapse attempts */
>+ default:
>+ break;
> }
>+ break;
One question here:
Suppose we have iterated several orders and not collapsed successfully yet. So
the mthp_bitmap_stack[] would look like this:
[8 7 6 6]
^
|
Now we find this one passes the threshold check, but it fails with some other
result.
The current code looks like it would give up entirely, but we may still have a
chance to collapse the remaining 3 ranges?
> }
>
> next_order:
>--
>2.51.0
--
Wei Yang
Help you, Help me
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts
2025-11-09 2:40 ` Wei Yang
@ 2025-11-17 18:16 ` Nico Pache
2025-11-18 2:00 ` Wei Yang
0 siblings, 1 reply; 91+ messages in thread
From: Nico Pache @ 2025-11-17 18:16 UTC (permalink / raw)
To: Wei Yang
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, lance.yang, vbabka, rppt, jannh, pfalcato
On Sat, Nov 8, 2025 at 7:40 PM Wei Yang <richard.weiyang@gmail.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:15PM -0600, Nico Pache wrote:
> >There are cases where, if an attempted collapse fails, all subsequent
> >orders are guaranteed to also fail. Avoid these collapse attempts by
> >bailing out early.
> >
> >Signed-off-by: Nico Pache <npache@redhat.com>
> >---
> > mm/khugepaged.c | 31 ++++++++++++++++++++++++++++++-
> > 1 file changed, 30 insertions(+), 1 deletion(-)
> >
> >diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >index e2319bfd0065..54f5c7888e46 100644
> >--- a/mm/khugepaged.c
> >+++ b/mm/khugepaged.c
> >@@ -1431,10 +1431,39 @@ static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> > ret = collapse_huge_page(mm, address, referenced,
> > unmapped, cc, mmap_locked,
> > order, offset);
> >- if (ret == SCAN_SUCCEED) {
> >+
> >+ /*
> >+ * Analyze failure reason to determine next action:
> >+ * - goto next_order: try smaller orders in same region
> >+ * - continue: try other regions at same order
> >+ * - break: stop all attempts (system-wide failure)
> >+ */
> >+ switch (ret) {
> >+ /* Cases were we should continue to the next region */
> >+ case SCAN_SUCCEED:
> > collapsed += 1UL << order;
> >+ fallthrough;
> >+ case SCAN_PTE_MAPPED_HUGEPAGE:
> > continue;
> >+ /* Cases were lower orders might still succeed */
> >+ case SCAN_LACK_REFERENCED_PAGE:
> >+ case SCAN_EXCEED_NONE_PTE:
> >+ case SCAN_EXCEED_SWAP_PTE:
> >+ case SCAN_EXCEED_SHARED_PTE:
> >+ case SCAN_PAGE_LOCK:
> >+ case SCAN_PAGE_COUNT:
> >+ case SCAN_PAGE_LRU:
> >+ case SCAN_PAGE_NULL:
> >+ case SCAN_DEL_PAGE_LRU:
> >+ case SCAN_PTE_NON_PRESENT:
> >+ case SCAN_PTE_UFFD_WP:
> >+ case SCAN_ALLOC_HUGE_PAGE_FAIL:
> >+ goto next_order;
> >+ /* All other cases should stop collapse attempts */
> >+ default:
> >+ break;
> > }
> >+ break;
>
> One question here:
Hi Wei Yang,
Sorry I forgot to get back to this email.
>
> Suppose we have iterated several orders and not collapse successfully yet. So
> the mthp_bitmap_stack[] would look like this:
>
> [8 7 6 6]
> ^
> |
so we always pop before pushing. So it would go
[9]
pop
if (collapse fails)
[8 8]
let's say we pop and successfully collapse an order 8
[8]
Then we fail the other order 8
[7 7]
then if we fail the first order 7
[7 6 6]
I believe we are now in the state you wanted to describe.
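In pseudo-code, the walk that produces those stack states is roughly (a
simplified sketch of the intent, not the exact kernel code):

    while (stack is not empty) {
            state = pop();
            if (order is enabled && enough PTEs are occupied &&
                collapse_huge_page(..., state.order, state.offset) == SCAN_SUCCEED)
                    continue;       /* collapsed: don't split this range further */
            if (state.order > KHUGEPAGED_MIN_MTHP_ORDER) {
                    /* otherwise try both halves at the next lower order */
                    push(state.order - 1, state.offset + (1 << (state.order - 1)));
                    push(state.order - 1, state.offset);
            }
    }

So a range is only split after its own collapse attempt has been given up on,
which matches the pop-before-push behaviour above.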
>
> Now we found this one pass the threshold check, but it fails with other
> result.
OK, let's say we pass the threshold checks, but the collapse fails for
any reason that is described in the
/* Cases where lower orders might still succeed */
In this case we would continue to order 5 (or lower). Once we are done
with this branch of the tree we go back to the other order 6 collapse,
and eventually the order 7.
>
> Current code looks it would give up at all, but we may still have a chance to
> collapse the above 3 range?
for cases under /* All other cases should stop collapse attempts */
Yes we would bail out and skip some collapses. I tried to think about
all the cases where we would still want to continue trying, vs cases
where the system is probably out of resources or hitting some major
failure, and we should just break out (as others will probably fail
too).
But this is also why I separated this patch out on its own. I was
hoping to have some more focus on the different cases, and make sure I
handled them in the best possible way. So I really appreciate the
question :)
* I did some digging through old messages to find this *
I believe these are the remaining cases. If these are hit I figured
it's better to abort.
/* cases where we must stop collapse attempts */
case SCAN_CGROUP_CHARGE_FAIL:
case SCAN_COPY_MC:
case SCAN_ADDRESS_RANGE:
case SCAN_PMD_NULL:
case SCAN_ANY_PROCESS:
case SCAN_VMA_NULL:
case SCAN_VMA_CHECK:
case SCAN_SCAN_ABORT:
case SCAN_PMD_NONE:
case SCAN_PAGE_ANON:
case SCAN_PMD_MAPPED:
case SCAN_FAIL:
Please let me know if you think we should move these to either the
`continue` or `next order` cases.
Cheers,
-- Nico
>
> > }
> >
> > next_order:
> >--
> >2.51.0
>
> --
> Wei Yang
> Help you, Help me
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts
2025-11-17 18:16 ` Nico Pache
@ 2025-11-18 2:00 ` Wei Yang
0 siblings, 0 replies; 91+ messages in thread
From: Wei Yang @ 2025-11-18 2:00 UTC (permalink / raw)
To: Nico Pache
Cc: Wei Yang, linux-kernel, linux-trace-kernel, linux-mm, linux-doc,
david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, lance.yang, vbabka,
rppt, jannh, pfalcato
On Mon, Nov 17, 2025 at 11:16:53AM -0700, Nico Pache wrote:
>On Sat, Nov 8, 2025 at 7:40 PM Wei Yang <richard.weiyang@gmail.com> wrote:
>>
>> On Wed, Oct 22, 2025 at 12:37:15PM -0600, Nico Pache wrote:
>> >There are cases where, if an attempted collapse fails, all subsequent
>> >orders are guaranteed to also fail. Avoid these collapse attempts by
>> >bailing out early.
>> >
>> >Signed-off-by: Nico Pache <npache@redhat.com>
>> >---
>> > mm/khugepaged.c | 31 ++++++++++++++++++++++++++++++-
>> > 1 file changed, 30 insertions(+), 1 deletion(-)
>> >
>> >diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> >index e2319bfd0065..54f5c7888e46 100644
>> >--- a/mm/khugepaged.c
>> >+++ b/mm/khugepaged.c
>> >@@ -1431,10 +1431,39 @@ static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
>> > ret = collapse_huge_page(mm, address, referenced,
>> > unmapped, cc, mmap_locked,
>> > order, offset);
>> >- if (ret == SCAN_SUCCEED) {
>> >+
>> >+ /*
>> >+ * Analyze failure reason to determine next action:
>> >+ * - goto next_order: try smaller orders in same region
>> >+ * - continue: try other regions at same order
>> >+ * - break: stop all attempts (system-wide failure)
>> >+ */
>> >+ switch (ret) {
>> >+ /* Cases were we should continue to the next region */
>> >+ case SCAN_SUCCEED:
>> > collapsed += 1UL << order;
>> >+ fallthrough;
>> >+ case SCAN_PTE_MAPPED_HUGEPAGE:
>> > continue;
>> >+ /* Cases were lower orders might still succeed */
>> >+ case SCAN_LACK_REFERENCED_PAGE:
>> >+ case SCAN_EXCEED_NONE_PTE:
>> >+ case SCAN_EXCEED_SWAP_PTE:
>> >+ case SCAN_EXCEED_SHARED_PTE:
>> >+ case SCAN_PAGE_LOCK:
>> >+ case SCAN_PAGE_COUNT:
>> >+ case SCAN_PAGE_LRU:
>> >+ case SCAN_PAGE_NULL:
>> >+ case SCAN_DEL_PAGE_LRU:
>> >+ case SCAN_PTE_NON_PRESENT:
>> >+ case SCAN_PTE_UFFD_WP:
>> >+ case SCAN_ALLOC_HUGE_PAGE_FAIL:
>> >+ goto next_order;
>> >+ /* All other cases should stop collapse attempts */
>> >+ default:
>> >+ break;
>> > }
>> >+ break;
>>
>> One question here:
>
>Hi Wei Yang,
>
>Sorry I forgot to get back to this email.
>
No problem, thanks for taking a look.
>>
>> Suppose we have iterated several orders and not collapse successfully yet. So
>> the mthp_bitmap_stack[] would look like this:
>>
>> [8 7 6 6]
>> ^
>> |
>
>so we always pop before pushing. So it would go
>
>[9]
>pop
>if (collapse fails)
>[8 8]
>lets say we pop and successfully collapse a order 8
>[8]
>Then we fail the other order 8
>[7 7]
>now if we succeed the first order 7
>[7 6 6]
>I believe we are now in the state you wanted to describe.
>
>>
>> Now we found this one pass the threshold check, but it fails with other
>> result.
>
>ok lets say we pass the threshold checks, but the collapse fails for
>any reason that is described in the
>/* Cases were lower orders might still succeed */
>In this case we would continue to order 5 (or lower). Once we are done
>with this branch of the tree we go back to the other order 6 collapse.
>and eventually the order 7.
>
>>
>> Current code looks it would give up at all, but we may still have a chance to
>> collapse the above 3 range?
>
>for cases under /* All other cases should stop collapse attempts */
>Yes we would bail out and skip some collapses. I tried to think about
>all the cases were we would still want to continue trying, vs cases
>where the system is probably out of resources or hitting some major
>failure, and we should just break out (as others will probably fail
>too).
>
Thanks, your explanation is very clear.
>But this is also why I separated this patch out on its own. I was
>hoping to have some more focus on the different cases, and make sure I
>handled them in the best possible way. So I really appreciate the
>question :)
>
>* I did some digging through old message to find this *
>
>I believe these are the remaining cases. If these are hit I figured
>it's better to abort.
>
I agree we need to take care of those cases.
>/* cases where we must stop collapse attempts */
>case SCAN_CGROUP_CHARGE_FAIL:
>case SCAN_COPY_MC:
>case SCAN_ADDRESS_RANGE:
>case SCAN_PMD_NULL:
>case SCAN_ANY_PROCESS:
>case SCAN_VMA_NULL:
>case SCAN_VMA_CHECK:
>case SCAN_SCAN_ABORT:
>case SCAN_PMD_NONE:
>case SCAN_PAGE_ANON:
>case SCAN_PMD_MAPPED:
>case SCAN_FAIL:
>
>Please let me know if you think we should move these to either the
>`continue` or `next order` cases.
Taking a look into these cases, it looks good to me now.
Also, one of my concerns is that this coding style is a little hard to maintain. If
we introduce a new result, we should remember to add it here. Otherwise
we may stop the collapse too early.
Though that may be separate work after this patch set is merged.
>
>Cheers,
>-- Nico
>
>>
>> > }
>> >
>> > next_order:
>> >--
>> >2.51.0
>>
>> --
>> Wei Yang
>> Help you, Help me
>>
--
Wei Yang
Help you, Help me
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts
2025-10-22 18:37 ` [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts Nico Pache
2025-11-09 2:40 ` Wei Yang
@ 2025-11-19 12:05 ` Lorenzo Stoakes
2025-11-26 23:16 ` Nico Pache
2025-11-26 23:29 ` Nico Pache
1 sibling, 2 replies; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-11-19 12:05 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:15PM -0600, Nico Pache wrote:
> There are cases where, if an attempted collapse fails, all subsequent
> orders are guaranteed to also fail. Avoid these collapse attempts by
> bailing out early.
>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> mm/khugepaged.c | 31 ++++++++++++++++++++++++++++++-
> 1 file changed, 30 insertions(+), 1 deletion(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index e2319bfd0065..54f5c7888e46 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1431,10 +1431,39 @@ static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> ret = collapse_huge_page(mm, address, referenced,
> unmapped, cc, mmap_locked,
> order, offset);
> - if (ret == SCAN_SUCCEED) {
> +
> + /*
> + * Analyze failure reason to determine next action:
> + * - goto next_order: try smaller orders in same region
> + * - continue: try other regions at same order
The stack is a DFS, so this isn't correct, you may have pushed a bunch of higher
order candidate mTHPs (I don't like plain 'region') which you will also try.
> + * - break: stop all attempts (system-wide failure)
> + */
This comment isn't hugely helpful, just put the relevant comments next to each
of the goto, continue, break (soon to be return re: review below) statements
please.
> + switch (ret) {
> + /* Cases were we should continue to the next region */
> + case SCAN_SUCCEED:
> collapsed += 1UL << order;
> + fallthrough;
> + case SCAN_PTE_MAPPED_HUGEPAGE:
> continue;
Would we not run into trouble potentially in the previous patch's implementation
of this, examining candidate mTHPs that are part of an already existing huge page,
or would a folio check in the collapse just make this wasted work?
> + /* Cases were lower orders might still succeed */
> + case SCAN_LACK_REFERENCED_PAGE:
> + case SCAN_EXCEED_NONE_PTE:
How can we, having checked max_ptes_none, still fail due to this?
> + case SCAN_EXCEED_SWAP_PTE:
> + case SCAN_EXCEED_SHARED_PTE:
> + case SCAN_PAGE_LOCK:
> + case SCAN_PAGE_COUNT:
> + case SCAN_PAGE_LRU:
> + case SCAN_PAGE_NULL:
> + case SCAN_DEL_PAGE_LRU:
> + case SCAN_PTE_NON_PRESENT:
> + case SCAN_PTE_UFFD_WP:
> + case SCAN_ALLOC_HUGE_PAGE_FAIL:
> + goto next_order;
> + /* All other cases should stop collapse attempts */
I don't love us having a catch-all, please spell out all cases.
> + default:
> + break;
> }
> + break;
_Hate_ this double break. Just return collapsed please.
> }
>
> next_order:
> --
> 2.51.0
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts
2025-11-19 12:05 ` Lorenzo Stoakes
@ 2025-11-26 23:16 ` Nico Pache
2025-11-26 23:29 ` Nico Pache
1 sibling, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-11-26 23:16 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Nov 19, 2025 at 5:06 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:15PM -0600, Nico Pache wrote:
> > There are cases where, if an attempted collapse fails, all subsequent
> > orders are guaranteed to also fail. Avoid these collapse attempts by
> > bailing out early.
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > mm/khugepaged.c | 31 ++++++++++++++++++++++++++++++-
> > 1 file changed, 30 insertions(+), 1 deletion(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index e2319bfd0065..54f5c7888e46 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -1431,10 +1431,39 @@ static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> > ret = collapse_huge_page(mm, address, referenced,
> > unmapped, cc, mmap_locked,
> > order, offset);
> > - if (ret == SCAN_SUCCEED) {
> > +
> > + /*
> > + * Analyze failure reason to determine next action:
> > + * - goto next_order: try smaller orders in same region
> > + * - continue: try other regions at same order
>
> The stack is a DFS, so this isn't correct, you may have pushed a bunch of higher
> order candidate mTHPs (I don't like plain 'region') which you will also true.
Ah yeah, so it should just be "try other regions", or in this case we
want something like "try to collapse another mTHP candidate in the
stack".
>
> > + * - break: stop all attempts (system-wide failure)
> > + */
>
> This comment isn't hugely helpful, just put the relevant comments next to each
> of the goto, continue, break (soon to be return re: review below) statements
> please.
ack
>
> > + switch (ret) {
> > + /* Cases were we should continue to the next region */
> > + case SCAN_SUCCEED:
> > collapsed += 1UL << order;
> > + fallthrough;
> > + case SCAN_PTE_MAPPED_HUGEPAGE:
> > continue;
>
> Would we not run into trouble potentially in the previous patch's implementation
> of this examing candidate mTHPs that are part of an already existing huge page,
> or would a folio check in the collapse just make this wasted work?
>
> > + /* Cases were lower orders might still succeed */
> > + case SCAN_LACK_REFERENCED_PAGE:
> > + case SCAN_EXCEED_NONE_PTE:
>
> How can we, having checked the max_pte_none, still fail due to this?
There are two phases in the khugepaged code, scan and collapse. In
between them is an alloc which requires dropping the lock, and
reconfirming values (in the collapse phase) after relocking.
During this time, the state of the PMD range might have changed and
our thresholds may have been exceeded.
This was true for PMD collapse and holds true for mTHP collapse too.
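For readers less steeped in khugepaged, the flow being described is roughly
(an outline, not literal code):

    collapse_scan_pmd()                  /* phase 1: scan under the PTL */
        -> collapse_huge_page()          /* allocation, locks dropped   */
             -> __collapse_huge_page_isolate()  /* phase 2: retake the
                    locks and re-verify the PTEs - the range may have
                    changed in the meantime, so e.g. SCAN_EXCEED_NONE_PTE
                    can still show up here */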
>
> > + case SCAN_EXCEED_SWAP_PTE:
> > + case SCAN_EXCEED_SHARED_PTE:
> > + case SCAN_PAGE_LOCK:
> > + case SCAN_PAGE_COUNT:
> > + case SCAN_PAGE_LRU:
> > + case SCAN_PAGE_NULL:
> > + case SCAN_DEL_PAGE_LRU:
> > + case SCAN_PTE_NON_PRESENT:
> > + case SCAN_PTE_UFFD_WP:
> > + case SCAN_ALLOC_HUGE_PAGE_FAIL:
> > + goto next_order;
> > + /* All other cases should stop collapse attempts */
>
> I don't love us having a catch-all, plase spell out all cases.
Ok sounds good, quick question, do we spell out ALL the enums or just
the ones that are reachable from here?
>
> > + default:
> > + break;
> > }
> > + break;
>
> _Hate_ this double break. Just return collapsed please.
ack, yeah that's much better. Thanks!
-- Nico
>
> > }
> >
> > next_order:
> > --
> > 2.51.0
> >
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts
2025-11-19 12:05 ` Lorenzo Stoakes
2025-11-26 23:16 ` Nico Pache
@ 2025-11-26 23:29 ` Nico Pache
1 sibling, 0 replies; 91+ messages in thread
From: Nico Pache @ 2025-11-26 23:29 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Nov 19, 2025 at 5:06 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Wed, Oct 22, 2025 at 12:37:15PM -0600, Nico Pache wrote:
> > There are cases where, if an attempted collapse fails, all subsequent
> > orders are guaranteed to also fail. Avoid these collapse attempts by
> > bailing out early.
> >
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > mm/khugepaged.c | 31 ++++++++++++++++++++++++++++++-
> > 1 file changed, 30 insertions(+), 1 deletion(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index e2319bfd0065..54f5c7888e46 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -1431,10 +1431,39 @@ static int collapse_scan_bitmap(struct mm_struct *mm, unsigned long address,
> > ret = collapse_huge_page(mm, address, referenced,
> > unmapped, cc, mmap_locked,
> > order, offset);
> > - if (ret == SCAN_SUCCEED) {
> > +
> > + /*
> > + * Analyze failure reason to determine next action:
> > + * - goto next_order: try smaller orders in same region
> > + * - continue: try other regions at same order
>
> The stack is a DFS, so this isn't correct, you may have pushed a bunch of higher
> order candidate mTHPs (I don't like plain 'region') which you will also true.
>
> > + * - break: stop all attempts (system-wide failure)
> > + */
>
> This comment isn't hugely helpful, just put the relevant comments next to each
> of the goto, continue, break (soon to be return re: review below) statements
> please.
>
> > + switch (ret) {
> > + /* Cases were we should continue to the next region */
> > + case SCAN_SUCCEED:
> > collapsed += 1UL << order;
> > + fallthrough;
> > + case SCAN_PTE_MAPPED_HUGEPAGE:
> > continue;
>
> Would we not run into trouble potentially in the previous patch's implementation
> of this, examining candidate mTHPs that are part of an already existing huge page,
> or would a folio check in the collapse just make this wasted work?
Whoops, almost missed this comment.
There is a folio check in the __collapse_huge_page_isolate function
that handles this. I think Dev has some plans to try to add
partially-mapped support, as the TODO comment suggests (I think he
already has some patches from earlier mTHP work).
/*
* TODO: In some cases of partially-mapped folios, we'd actually
* want to collapse.
*/
>
> > + /* Cases were lower orders might still succeed */
> > + case SCAN_LACK_REFERENCED_PAGE:
> > + case SCAN_EXCEED_NONE_PTE:
>
> How can we, having checked the max_pte_none, still fail due to this?
>
> > + case SCAN_EXCEED_SWAP_PTE:
> > + case SCAN_EXCEED_SHARED_PTE:
> > + case SCAN_PAGE_LOCK:
> > + case SCAN_PAGE_COUNT:
> > + case SCAN_PAGE_LRU:
> > + case SCAN_PAGE_NULL:
> > + case SCAN_DEL_PAGE_LRU:
> > + case SCAN_PTE_NON_PRESENT:
> > + case SCAN_PTE_UFFD_WP:
> > + case SCAN_ALLOC_HUGE_PAGE_FAIL:
> > + goto next_order;
> > + /* All other cases should stop collapse attempts */
>
> I don't love us having a catch-all, please spell out all cases.
>
> > + default:
> > + break;
> > }
> > + break;
>
> _Hate_ this double break. Just return collapsed please.
>
> > }
> >
> > next_order:
> > --
> > 2.51.0
> >
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v12 mm-new 14/15] khugepaged: run khugepaged for all orders
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (12 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 13/15] khugepaged: avoid unnecessary mTHP collapse attempts Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-11-19 12:13 ` Lorenzo Stoakes
2025-10-22 18:37 ` [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
2025-10-22 20:13 ` [PATCH v12 mm-new 00/15] khugepaged: mTHP support Andrew Morton
15 siblings, 1 reply; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato
From: Baolin Wang <baolin.wang@linux.alibaba.com>
If any order (m)THP is enabled we should allow running khugepaged to
attempt scanning and collapsing mTHPs. In order for khugepaged to operate
when only mTHP sizes are specified in sysfs, we must modify the predicate
function that determines whether it ought to run to do so.
This function is currently called hugepage_pmd_enabled(), this patch
renames it to hugepage_enabled() and updates the logic to check to
determine whether any valid orders may exist which would justify
khugepaged running.
We must also update collapse_allowable_orders() to check all orders if
the vma is anonymous and the collapse is khugepaged.
After this patch khugepaged mTHP collapse is fully enabled.
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
mm/khugepaged.c | 25 +++++++++++++------------
1 file changed, 13 insertions(+), 12 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 54f5c7888e46..8ed9f8e2d376 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -418,23 +418,23 @@ static inline int collapse_test_exit_or_disable(struct mm_struct *mm)
mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}
-static bool hugepage_pmd_enabled(void)
+static bool hugepage_enabled(void)
{
/*
* We cover the anon, shmem and the file-backed case here; file-backed
* hugepages, when configured in, are determined by the global control.
- * Anon pmd-sized hugepages are determined by the pmd-size control.
+ * Anon hugepages are determined by its per-size mTHP control.
* Shmem pmd-sized hugepages are also determined by its pmd-size control,
* except when the global shmem_huge is set to SHMEM_HUGE_DENY.
*/
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
hugepage_global_enabled())
return true;
- if (test_bit(PMD_ORDER, &huge_anon_orders_always))
+ if (READ_ONCE(huge_anon_orders_always))
return true;
- if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
+ if (READ_ONCE(huge_anon_orders_madvise))
return true;
- if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
+ if (READ_ONCE(huge_anon_orders_inherit) &&
hugepage_global_enabled())
return true;
if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
@@ -508,7 +508,8 @@ static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags, bool is_khugepaged)
{
enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
- unsigned long orders = BIT(HPAGE_PMD_ORDER);
+ unsigned long orders = is_khugepaged && vma_is_anonymous(vma) ?
+ THP_ORDERS_ALL_ANON : BIT(HPAGE_PMD_ORDER);
return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
}
@@ -517,7 +518,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
- hugepage_pmd_enabled()) {
+ hugepage_enabled()) {
if (collapse_allowable_orders(vma, vm_flags, true))
__khugepaged_enter(vma->vm_mm);
}
@@ -2791,7 +2792,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
static int khugepaged_has_work(void)
{
- return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
+ return !list_empty(&khugepaged_scan.mm_head) && hugepage_enabled();
}
static int khugepaged_wait_event(void)
@@ -2864,7 +2865,7 @@ static void khugepaged_wait_work(void)
return;
}
- if (hugepage_pmd_enabled())
+ if (hugepage_enabled())
wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
@@ -2895,7 +2896,7 @@ static void set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
- if (!hugepage_pmd_enabled()) {
+ if (!hugepage_enabled()) {
calculate_min_free_kbytes();
goto update_wmarks;
}
@@ -2945,7 +2946,7 @@ int start_stop_khugepaged(void)
int err = 0;
mutex_lock(&khugepaged_mutex);
- if (hugepage_pmd_enabled()) {
+ if (hugepage_enabled()) {
if (!khugepaged_thread)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
@@ -2971,7 +2972,7 @@ int start_stop_khugepaged(void)
void khugepaged_min_free_kbytes_update(void)
{
mutex_lock(&khugepaged_mutex);
- if (hugepage_pmd_enabled() && khugepaged_thread)
+ if (hugepage_enabled() && khugepaged_thread)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}
--
2.51.0
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 14/15] khugepaged: run khugepaged for all orders
2025-10-22 18:37 ` [PATCH v12 mm-new 14/15] khugepaged: run khugepaged for all orders Nico Pache
@ 2025-11-19 12:13 ` Lorenzo Stoakes
2025-11-20 6:37 ` Baolin Wang
0 siblings, 1 reply; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-11-19 12:13 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, Liam.Howlett, ryan.roberts, dev.jain, corbet,
rostedt, mhiramat, mathieu.desnoyers, akpm, baohua, willy,
peterx, wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, Oct 22, 2025 at 12:37:16PM -0600, Nico Pache wrote:
> From: Baolin Wang <baolin.wang@linux.alibaba.com>
>
> If any order (m)THP is enabled we should allow running khugepaged to
> attempt scanning and collapsing mTHPs. In order for khugepaged to operate
> when only mTHP sizes are specified in sysfs, we must modify the predicate
> function that determines whether it ought to run to do so.
>
> This function is currently called hugepage_pmd_enabled(), this patch
> renames it to hugepage_enabled() and updates the logic to check to
> determine whether any valid orders may exist which would justify
> khugepaged running.
>
> We must also update collapse_allowable_orders() to check all orders if
> the vma is anonymous and the collapse is khugepaged.
>
> After this patch khugepaged mTHP collapse is fully enabled.
>
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> mm/khugepaged.c | 25 +++++++++++++------------
> 1 file changed, 13 insertions(+), 12 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 54f5c7888e46..8ed9f8e2d376 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -418,23 +418,23 @@ static inline int collapse_test_exit_or_disable(struct mm_struct *mm)
> mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
> }
>
> -static bool hugepage_pmd_enabled(void)
> +static bool hugepage_enabled(void)
> {
> /*
> * We cover the anon, shmem and the file-backed case here; file-backed
> * hugepages, when configured in, are determined by the global control.
> - * Anon pmd-sized hugepages are determined by the pmd-size control.
> + * Anon hugepages are determined by its per-size mTHP control.
> * Shmem pmd-sized hugepages are also determined by its pmd-size control,
> * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
> */
> if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
> hugepage_global_enabled())
> return true;
> - if (test_bit(PMD_ORDER, &huge_anon_orders_always))
> + if (READ_ONCE(huge_anon_orders_always))
> return true;
> - if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
> + if (READ_ONCE(huge_anon_orders_madvise))
> return true;
> - if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
> + if (READ_ONCE(huge_anon_orders_inherit) &&
> hugepage_global_enabled())
> return true;
> if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
> @@ -508,7 +508,8 @@ static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
> vm_flags_t vm_flags, bool is_khugepaged)
> {
> enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
> - unsigned long orders = BIT(HPAGE_PMD_ORDER);
> + unsigned long orders = is_khugepaged && vma_is_anonymous(vma) ?
> + THP_ORDERS_ALL_ANON : BIT(HPAGE_PMD_ORDER);
Why are we doing this? If this is explicitly enabling mTHP for anon, which it
seems to be, can we please make this a little more explicit :)
I'd prefer this not to be a horribly squashed ternary, rather:
	unsigned long orders;

	/* We explicitly allow mTHP collapse for anonymous khugepaged ONLY. */
	if (is_khugepaged && vma_is_anonymous(vma))
		orders = THP_ORDERS_ALL_ANON;
	else
		orders = BIT(HPAGE_PMD_ORDER);
Also, does THP_ORDERS_ALL_ANON account for KHUGEPAGED_MIN_MTHP_ORDER? It's weird
to say that an order is allowed that isn't permitted by mTHP (e.g. order-0).
>
> return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
> }
> @@ -517,7 +518,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
> vm_flags_t vm_flags)
> {
> if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
> - hugepage_pmd_enabled()) {
> + hugepage_enabled()) {
> if (collapse_allowable_orders(vma, vm_flags, true))
> __khugepaged_enter(vma->vm_mm);
> }
> @@ -2791,7 +2792,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result,
>
> static int khugepaged_has_work(void)
> {
> - return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
> + return !list_empty(&khugepaged_scan.mm_head) && hugepage_enabled();
> }
>
> static int khugepaged_wait_event(void)
> @@ -2864,7 +2865,7 @@ static void khugepaged_wait_work(void)
> return;
> }
>
> - if (hugepage_pmd_enabled())
> + if (hugepage_enabled())
> wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
> }
>
> @@ -2895,7 +2896,7 @@ static void set_recommended_min_free_kbytes(void)
> int nr_zones = 0;
> unsigned long recommended_min;
>
> - if (!hugepage_pmd_enabled()) {
> + if (!hugepage_enabled()) {
> calculate_min_free_kbytes();
> goto update_wmarks;
> }
> @@ -2945,7 +2946,7 @@ int start_stop_khugepaged(void)
> int err = 0;
>
> mutex_lock(&khugepaged_mutex);
> - if (hugepage_pmd_enabled()) {
> + if (hugepage_enabled()) {
> if (!khugepaged_thread)
> khugepaged_thread = kthread_run(khugepaged, NULL,
> "khugepaged");
> @@ -2971,7 +2972,7 @@ int start_stop_khugepaged(void)
> void khugepaged_min_free_kbytes_update(void)
> {
> mutex_lock(&khugepaged_mutex);
> - if (hugepage_pmd_enabled() && khugepaged_thread)
> + if (hugepage_enabled() && khugepaged_thread)
> set_recommended_min_free_kbytes();
> mutex_unlock(&khugepaged_mutex);
> }
> --
> 2.51.0
>
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 14/15] khugepaged: run khugepaged for all orders
2025-11-19 12:13 ` Lorenzo Stoakes
@ 2025-11-20 6:37 ` Baolin Wang
0 siblings, 0 replies; 91+ messages in thread
From: Baolin Wang @ 2025-11-20 6:37 UTC (permalink / raw)
To: Lorenzo Stoakes, Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, Liam.Howlett, ryan.roberts, dev.jain, corbet, rostedt,
mhiramat, mathieu.desnoyers, akpm, baohua, willy, peterx,
wangkefeng.wang, usamaarif642, sunnanyong, vishal.moola,
thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On 2025/11/19 20:13, Lorenzo Stoakes wrote:
> On Wed, Oct 22, 2025 at 12:37:16PM -0600, Nico Pache wrote:
>> From: Baolin Wang <baolin.wang@linux.alibaba.com>
>>
>> If any order (m)THP is enabled we should allow running khugepaged to
>> attempt scanning and collapsing mTHPs. In order for khugepaged to operate
>> when only mTHP sizes are specified in sysfs, we must modify the predicate
>> function that determines whether it ought to run to do so.
>>
>> This function is currently called hugepage_pmd_enabled(), this patch
>> renames it to hugepage_enabled() and updates the logic to check to
>> determine whether any valid orders may exist which would justify
>> khugepaged running.
>>
>> We must also update collapse_allowable_orders() to check all orders if
>> the vma is anonymous and the collapse is khugepaged.
>>
>> After this patch khugepaged mTHP collapse is fully enabled.
>>
>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> Signed-off-by: Nico Pache <npache@redhat.com>
>> ---
>> mm/khugepaged.c | 25 +++++++++++++------------
>> 1 file changed, 13 insertions(+), 12 deletions(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 54f5c7888e46..8ed9f8e2d376 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -418,23 +418,23 @@ static inline int collapse_test_exit_or_disable(struct mm_struct *mm)
>> mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
>> }
>>
>> -static bool hugepage_pmd_enabled(void)
>> +static bool hugepage_enabled(void)
>> {
>> /*
>> * We cover the anon, shmem and the file-backed case here; file-backed
>> * hugepages, when configured in, are determined by the global control.
>> - * Anon pmd-sized hugepages are determined by the pmd-size control.
>> + * Anon hugepages are determined by its per-size mTHP control.
>> * Shmem pmd-sized hugepages are also determined by its pmd-size control,
>> * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
>> */
>> if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
>> hugepage_global_enabled())
>> return true;
>> - if (test_bit(PMD_ORDER, &huge_anon_orders_always))
>> + if (READ_ONCE(huge_anon_orders_always))
>> return true;
>> - if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
>> + if (READ_ONCE(huge_anon_orders_madvise))
>> return true;
>> - if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
>> + if (READ_ONCE(huge_anon_orders_inherit) &&
>> hugepage_global_enabled())
>> return true;
>> if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
>> @@ -508,7 +508,8 @@ static unsigned long collapse_allowable_orders(struct vm_area_struct *vma,
>> vm_flags_t vm_flags, bool is_khugepaged)
>> {
>> enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE;
>> - unsigned long orders = BIT(HPAGE_PMD_ORDER);
>> + unsigned long orders = is_khugepaged && vma_is_anonymous(vma) ?
>> + THP_ORDERS_ALL_ANON : BIT(HPAGE_PMD_ORDER);
>
> Why are we doing this? If this is explicitly enabling mTHP for anon, which it
> seems to be, can we please make this a little more explicit :)
>
> I'd prefer this not to be a horribly squashed ternary, rather:
>
>	unsigned long orders;
>
>	/* We explicitly allow mTHP collapse for anonymous khugepaged ONLY. */
>	if (is_khugepaged && vma_is_anonymous(vma))
>		orders = THP_ORDERS_ALL_ANON;
>	else
>		orders = BIT(HPAGE_PMD_ORDER);
Yes, LGTM.
> Also, does THP_ORDERS_ALL_ANON account for KHUGEPAGED_MIN_MTHP_ORDER? It's weird
> to say that an order is allowed that isn't permitted by mTHP (e.g. order-0).
The THP_ORDERS_ALL_ANON mask has already filtered out order 0 and order 1,
so it matches the definition of KHUGEPAGED_MIN_MTHP_ORDER.
/*
 * Mask of all large folio orders supported for anonymous THP; all orders up to
 * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
 * (which is a limitation of the THP implementation).
 */
#define THP_ORDERS_ALL_ANON ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
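As a concrete example of the arithmetic (illustrative only): with 4K base
pages, PMD_ORDER is 9, so the mask works out as follows:

	/*
	 * With 4K base pages, PMD_ORDER == 9 (2M / 4K == 512 == 1 << 9):
	 *
	 *   BIT(PMD_ORDER + 1) - 1   -> 0b11_1111_1111  (orders 0..9)
	 *   & ~(BIT(0) | BIT(1))     -> 0b11_1111_1100  (orders 2..9 only)
	 *
	 * i.e. every anon mTHP size from 16K up to the 2M PMD size, so
	 * orders 0 and 1 never need extra filtering in the khugepaged paths.
	 */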
^ permalink raw reply [flat|nested] 91+ messages in thread
* [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (13 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 14/15] khugepaged: run khugepaged for all orders Nico Pache
@ 2025-10-22 18:37 ` Nico Pache
2025-10-22 19:52 ` Christoph Lameter (Ampere)
2025-10-22 20:13 ` [PATCH v12 mm-new 00/15] khugepaged: mTHP support Andrew Morton
15 siblings, 1 reply; 91+ messages in thread
From: Nico Pache @ 2025-10-22 18:37 UTC (permalink / raw)
To: linux-kernel, linux-trace-kernel, linux-mm, linux-doc
Cc: david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, cl, jglisse, surenb, zokeefe,
hannes, rientjes, mhocko, rdunlap, hughd, richard.weiyang,
lance.yang, vbabka, rppt, jannh, pfalcato, Bagas Sanjaya
Now that we can collapse to mTHPs, let's update the admin guide to
reflect these changes and provide proper guidance on how to utilize it.
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
Documentation/admin-guide/mm/transhuge.rst | 53 ++++++++++++----------
1 file changed, 30 insertions(+), 23 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 7c71cda8aea1..2569a92fd96c 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -63,7 +63,8 @@ often.
THP can be enabled system wide or restricted to certain tasks or even
memory ranges inside task's address space. Unless THP is completely
disabled, there is ``khugepaged`` daemon that scans memory and
-collapses sequences of basic pages into PMD-sized huge pages.
+collapses sequences of basic pages into huge pages of either PMD size
+or mTHP sizes, if the system is configured to do so
The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
interface and using madvise(2) and prctl(2) system calls.
@@ -212,17 +213,17 @@ PMD-mappable transparent hugepage::
All THPs at fault and collapse time will be added to _deferred_list,
and will therefore be split under memory pressure if they are considered
"underused". A THP is underused if the number of zero-filled pages in
-the THP is above max_ptes_none (see below). It is possible to disable
-this behaviour by writing 0 to shrink_underused, and enable it by writing
-1 to it::
+the THP is above max_ptes_none (see below) scaled by the THP order. It is
+possible to disable this behaviour by writing 0 to shrink_underused, and enable
+it by writing 1 to it::
echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused
echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused
-khugepaged will be automatically started when PMD-sized THP is enabled
+khugepaged will be automatically started when any THP size is enabled
(either of the per-size anon control or the top-level control are set
to "always" or "madvise"), and it'll be automatically shutdown when
-PMD-sized THP is disabled (when both the per-size anon control and the
+all THP sizes are disabled (when both the per-size anon control and the
top-level control are "never")
process THP controls
@@ -264,11 +265,6 @@ support the following arguments::
Khugepaged controls
-------------------
-.. note::
- khugepaged currently only searches for opportunities to collapse to
- PMD-sized THP and no attempt is made to collapse to other THP
- sizes.
-
khugepaged runs usually at low frequency so while one may not want to
invoke defrag algorithms synchronously during the page faults, it
should be worth invoking defrag at least in khugepaged. However it's
@@ -296,11 +292,11 @@ allocation failure to throttle the next allocation attempt::
The khugepaged progress can be seen in the number of pages collapsed (note
that this counter may not be an exact count of the number of pages
collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping
-being replaced by a PMD mapping, or (2) All 4K physical pages replaced by
-one 2M hugepage. Each may happen independently, or together, depending on
-the type of memory and the failures that occur. As such, this value should
-be interpreted roughly as a sign of progress, and counters in /proc/vmstat
-consulted for more accurate accounting)::
+being replaced by a PMD mapping, or (2) physical pages replaced by one
+hugepage of various sizes (PMD-sized or mTHP). Each may happen independently,
+or together, depending on the type of memory and the failures that occur.
+As such, this value should be interpreted roughly as a sign of progress,
+and counters in /proc/vmstat consulted for more accurate accounting)::
/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
@@ -308,16 +304,18 @@ for each pass::
/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
-``max_ptes_none`` specifies how many extra small pages (that are
-not already mapped) can be allocated when collapsing a group
-of small pages into one large page::
+``max_ptes_none`` specifies how many empty (none/zero) pages are allowed
+when collapsing a group of small pages into one large page::
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
-A higher value leads to use additional memory for programs.
-A lower value leads to gain less thp performance. Value of
-max_ptes_none can waste cpu time very little, you can
-ignore it.
+For PMD-sized THP collapse, this directly limits the number of empty pages
+allowed in the 2MB region. For mTHP collapse, the kernel might use a more
+conservative value when determining eligibility.
+
+A higher value allows more empty pages, potentially leading to more memory
+usage but better THP performance. A lower value is more conservative and
+may result in fewer THP collapses.
``max_ptes_swap`` specifies how many pages can be brought in from
swap when collapsing a group of pages into a transparent huge page::
@@ -337,6 +335,15 @@ that THP is shared. Exceeding the number would block the collapse::
A higher value may increase memory footprint for some workloads.
+.. note::
+ For mTHP collapse, khugepaged does not support collapsing regions that
+ contain shared or swapped out pages, as this could lead to continuous
+ promotion to higher orders. The collapse will fail if any shared or
+ swapped PTEs are encountered during the scan.
+
+ Currently, madvise_collapse only supports collapsing to PMD-sized THPs
+ and does not attempt mTHP collapses.
+
Boot parameters
===============
--
2.51.0
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-22 18:37 ` [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
@ 2025-10-22 19:52 ` Christoph Lameter (Ampere)
2025-10-22 20:22 ` David Hildenbrand
0 siblings, 1 reply; 91+ messages in thread
From: Christoph Lameter (Ampere) @ 2025-10-22 19:52 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato, Bagas Sanjaya
On Wed, 22 Oct 2025, Nico Pache wrote:
> + Currently, madvise_collapse only supports collapsing to PMD-sized THPs
> + and does not attempt mTHP collapses.
madvise collapse is frequently used as far as I can tell from the THP
loads being tested. Could we support madvise collapse for mTHP?
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-22 19:52 ` Christoph Lameter (Ampere)
@ 2025-10-22 20:22 ` David Hildenbrand
2025-10-23 8:00 ` Lorenzo Stoakes
2025-10-23 23:41 ` Christoph Lameter (Ampere)
0 siblings, 2 replies; 91+ messages in thread
From: David Hildenbrand @ 2025-10-22 20:22 UTC (permalink / raw)
To: Christoph Lameter (Ampere), Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, ziy,
baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato, Bagas Sanjaya
On 22.10.25 21:52, Christoph Lameter (Ampere) wrote:
> On Wed, 22 Oct 2025, Nico Pache wrote:
>
>> + Currently, madvise_collapse only supports collapsing to PMD-sized THPs
>> + and does not attempt mTHP collapses.
>
> madvise collapse is frequently used as far as I can tell from the THP
> loads being tested. Could we support madvise collapse for mTHP?
The big question is still how user space can communicate the desired
order, and how we can not break existing users.
So I guess there will definitely be some support to trigger collapse to
mTHP in the future, the big question is through which interface. So it
will happen after this series.
Maybe through process_madvise() where we have an additional parameter, I
think that was what people discussed in the past.
--
Cheers
David / dhildenb
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-22 20:22 ` David Hildenbrand
@ 2025-10-23 8:00 ` Lorenzo Stoakes
2025-10-23 8:44 ` Pedro Falcato
2025-10-23 23:41 ` Christoph Lameter (Ampere)
1 sibling, 1 reply; 91+ messages in thread
From: Lorenzo Stoakes @ 2025-10-23 8:00 UTC (permalink / raw)
To: David Hildenbrand
Cc: Christoph Lameter (Ampere),
Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato, Bagas Sanjaya
On Wed, Oct 22, 2025 at 10:22:08PM +0200, David Hildenbrand wrote:
> On 22.10.25 21:52, Christoph Lameter (Ampere) wrote:
> > On Wed, 22 Oct 2025, Nico Pache wrote:
> >
> > > + Currently, madvise_collapse only supports collapsing to PMD-sized THPs
> > > + and does not attempt mTHP collapses.
> >
> > madvise collapse is frequently used as far as I can tell from the THP
> > loads being tested. Could we support madvise collapse for mTHP?
>
> The big question is still how user space can communicate the desired order,
> and how we can not break existing users.
Yes, and let's go one step at a time, this series still needs careful scrutiny
and we need to ensure the _fundamentals_ are in place for khugepaged before we
get into MADV_COLLAPSE :)
>
> So I guess there will definitely be some support to trigger collapse to mTHP
> in the future, the big question is through which interface. So it will
> happen after this series.
Yes.
>
> Maybe through process_madvise() where we have an additional parameter, I
> think that was what people discussed in the past.
I wouldn't absolutely love us doing that, given it is a general parameter so
would seem applicable to any madvise() option and could lead to confusion, also
process_madvise() was originally for cross-process madvise vector operations.
I expanded this to make it applicable to the current process (and introduced
PIDFD_SELF to make that more sane), and SJ has optimised it across vector
operations (thanks SJ! :), but in general - it seems very weird to have
madvise() provide an operation that process_madvise() provides another version
of, with an extra parameter.
As usual we've painted ourselves into a corner with an API... :)
Perhaps we'll have to accept the process_madvise() compromise and add
MADV_COLLAPSE_MHTP that only works with it or something.
Of course adding a new syscall isn't impossible... madvise2() not very appealing
however...
TL;DR I guess we'll deal with that when we come to it :)
>
> --
> Cheers
>
> David / dhildenb
>
Cheers, Lorenzo
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-23 8:00 ` Lorenzo Stoakes
@ 2025-10-23 8:44 ` Pedro Falcato
2025-10-24 13:54 ` Zach O'Keefe
0 siblings, 1 reply; 91+ messages in thread
From: Pedro Falcato @ 2025-10-23 8:44 UTC (permalink / raw)
To: Lorenzo Stoakes, David Hildenbrand
Cc: Christoph Lameter (Ampere),
Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
Bagas Sanjaya
On Thu, Oct 23, 2025 at 09:00:10AM +0100, Lorenzo Stoakes wrote:
> On Wed, Oct 22, 2025 at 10:22:08PM +0200, David Hildenbrand wrote:
> > On 22.10.25 21:52, Christoph Lameter (Ampere) wrote:
> > > On Wed, 22 Oct 2025, Nico Pache wrote:
> > >
> > > > + Currently, madvise_collapse only supports collapsing to PMD-sized THPs
> > > > + and does not attempt mTHP collapses.
> > >
> > > madvise collapse is frequently used as far as I can tell from the THP
> > > loads being tested. Could we support madvise collapse for mTHP?
> >
> > The big question is still how user space can communicate the desired order,
> > and how we can not break existing users.
>
Do we want to let userspace communicate order? It seems like an extremely
specific thing to do. A more simple&sane semantic could be something like:
"MADV_COLLAPSE collapses a given [addr, addr+len] range into the highest
order THP it can/thinks it should.". The implementation details of PMD or
contpte or <...> are lost by the time we get to userspace.
The man page itself is pretty vaguely written to allow us to do whatever
we want. It sounds to me like allowing userspace to create arbitrary order
mTHPs would be another Pandora's box we shouldn't get into.
> Yes, and let's go one step at a time, this series still needs careful scrutiny
> and we need to ensure the _fundamentals_ are in place for khugepaged before we
> get into MADV_COLLAPSE :)
>
> >
> > So I guess there will definitely be some support to trigger collapse to mTHP
> > in the future, the big question is through which interface. So it will
> > happen after this series.
>
> Yes.
>
> >
> > Maybe through process_madvise() where we have an additional parameter, I
> > think that was what people discussed in the past.
>
> I wouldn't absolutely love us doing that, given it is a general parameter so
> would seem applicable to any madvise() option and could lead to confusion, also
> process_madvise() was originally for cross-process madvise vector operations.
For what it's worth, it would probably not be too hard to devise a generic
separation there between "generic flags" and "behavior-specific flags".
And then stuff the desired THP order into MADV_COLLAPSE-specific flags.
>
> I expanded this to make it applicable to the current process (and introduced
> PIDFD_SELF to make that more sane), and SJ has optimised it across vector
> operations (thanks SJ! :), but in general - it seems very weird to have
> madvise() provide an operation that process_madvise() provides another version
> of, with an extra parameter.
>
> As usual we've painted ourselves into a corner with an API... :)
But yes, I agree it would feel weird.
>
> Perhaps we'll have to accept the process_madvise() compromise and add
> MADV_COLLAPSE_MHTP that only works with it or something.
>
> Of course adding a new syscall isn't impossible... madvise2() not very appealing
> however...
It is my impression that process_madvise() is already madvise2(), but
poorly named.
>
> TL;DR I guess we'll deal with that when we come to it :)
Amen :)
--
Pedro
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-23 8:44 ` Pedro Falcato
@ 2025-10-24 13:54 ` Zach O'Keefe
0 siblings, 0 replies; 91+ messages in thread
From: Zach O'Keefe @ 2025-10-24 13:54 UTC (permalink / raw)
To: Pedro Falcato
Cc: Lorenzo Stoakes, David Hildenbrand, Christoph Lameter (Ampere),
Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, akpm,
baohua, willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, jglisse, surenb, hannes, rientjes, mhocko, rdunlap, hughd,
richard.weiyang, lance.yang, vbabka, rppt, jannh, Bagas Sanjaya
On Thu, Oct 23, 2025 at 1:44 AM Pedro Falcato <pfalcato@suse.de> wrote:
>
> On Thu, Oct 23, 2025 at 09:00:10AM +0100, Lorenzo Stoakes wrote:
> > On Wed, Oct 22, 2025 at 10:22:08PM +0200, David Hildenbrand wrote:
> > > On 22.10.25 21:52, Christoph Lameter (Ampere) wrote:
> > > > On Wed, 22 Oct 2025, Nico Pache wrote:
> > > >
> > > > > + Currently, madvise_collapse only supports collapsing to PMD-sized THPs
> > > > > + and does not attempt mTHP collapses.
> > > >
> > > > madvise collapse is frequently used as far as I can tell from the THP
> > > > loads being tested. Could we support madvise collapse for mTHP?
> > >
> > > The big question is still how user space can communicate the desired order,
> > > and how we can not break existing users.
> >
>
> Do we want to let userspace communicate order? It seems like an extremely
> specific thing to do. A more simple&sane semantic could be something like:
> "MADV_COLLAPSE collapses a given [addr, addr+len] range into the highest
> order THP it can/thinks it should.". The implementation details of PMD or
> contpte or <...> are lost by the time we get to userspace.
>
> The man page itself is pretty vaguely written to allow us to do whatever
> we want. It sounds to me that allowing userspace to create arbitrary order
> mTHPs would be another pandora's box we shouldn't get into.
>
> > Yes, and let's go one step at a time, this series still needs careful scrutiny
> > and we need to ensure the _fundamentals_ are in place for khugepaged before we
> > get into MADV_COLLAPSE :)
> >
> > >
> > > So I guess there will definitely be some support to trigger collapse to mTHP
> > > in the future, the big question is through which interface. So it will
> > > happen after this series.
> >
> > Yes.
> >
> > >
> > > Maybe through process_madvise() where we have an additional parameter, I
> > > think that was what people discussed in the past.
> >
> > I wouldn't absolutely love us doing that, given it is a general parameter so
> > would seem applicable to any madvise() option and could lead to confusion, also
> > process_madvise() was originally for cross-process madvise vector operations.
>
> For what it's worth, it would probably not be too hard to devise a generic
> separation there between "generic flags" and "behavior-specific flags".
> And then stuff the desired THP order into MADV_COLLAPSE-specific flags.
Yeah, this is how I envisioned the flags to be leveraged; reserve some
number of bits for generic, and overload the others for
advice-specific. I suspect once the seal is broken on this, more
advice-specific flags will promptly follow.
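Purely as a hypothetical illustration of that split (every name and the bit
layout below are invented for the sketch; nothing like this exists in the
UAPI today):

	/* Hypothetical flag-space split for process_madvise(), not real ABI. */
	#define PMADV_FLAG_GENERIC_MASK		0x0000ffffUL	/* valid for any advice */
	#define PMADV_FLAG_BEHAVIOR_SHIFT	16
	#define PMADV_FLAG_BEHAVIOR_MASK	0xffff0000UL	/* meaning depends on MADV_* */

	/* MADV_COLLAPSE-specific: carry a requested order in the behavior bits. */
	#define PMADV_COLLAPSE_ORDER(order)	\
		(((unsigned long)(order) << PMADV_FLAG_BEHAVIOR_SHIFT) & PMADV_FLAG_BEHAVIOR_MASK)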
> >
> > I expanded this to make it applicable to the current process (and introduced
> > PIDFD_SELF to make that more sane), and SJ has optimised it across vector
> > operations (thanks SJ! :), but in general - it seems very weird to have
> > madvise() provide an operation that process_madvise() providse another version
> > of that has an extra parameter.
> >
> > As usual we've painted ourselves into a corner with an API... :)
>
> But yes, I agree it would feel weird.
>
> >
> > Perhaps we'll to accept the process_madvise() compromise and add
> > MADV_COLLAPSE_MHTP that only works with it or something.
> >
> > Of course adding a new syscall isn't impossible... madvise2() not very appealing
> > however...
>
> It is my impression that process_madvise() is already madvise2(), but
> poorly named.
+1
> >
> > TL;DR I guess we'll deal with that when we come to it :)
>
> Amen :)
>
> --
> Pedro
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse
2025-10-22 20:22 ` David Hildenbrand
2025-10-23 8:00 ` Lorenzo Stoakes
@ 2025-10-23 23:41 ` Christoph Lameter (Ampere)
1 sibling, 0 replies; 91+ messages in thread
From: Christoph Lameter (Ampere) @ 2025-10-23 23:41 UTC (permalink / raw)
To: David Hildenbrand
Cc: Nico Pache, linux-kernel, linux-trace-kernel, linux-mm,
linux-doc, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
ryan.roberts, dev.jain, corbet, rostedt, mhiramat,
mathieu.desnoyers, akpm, baohua, willy, peterx, wangkefeng.wang,
usamaarif642, sunnanyong, vishal.moola, thomas.hellstrom, yang,
kas, aarcange, raquini, anshuman.khandual, catalin.marinas,
tiwai, will, dave.hansen, jack, jglisse, surenb, zokeefe, hannes,
rientjes, mhocko, rdunlap, hughd, richard.weiyang, lance.yang,
vbabka, rppt, jannh, pfalcato, Bagas Sanjaya
On Wed, 22 Oct 2025, David Hildenbrand wrote:
> The big question is still how user space can communicate the desired order,
> and how we can not break existing users.
>
> So I guess there will definitely be some support to trigger collapse to mTHP
> in the future, the big question is through which interface. So it will happen
> after this series.
Well, we have the possibility of a memory policy for each VMA, and we can
set memory policies for arbitrary memory ranges as well as per process
through the existing APIs from user space.
Extending memory policies with a parameter to allow setting a preferred
order would let us use these mechanisms.
Memory policies can already be used to control NUMA balancing and
migration. The ability to specify page sizes is similar, I think.
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 8fbbe613611a..429117bbd2f4 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -31,6 +31,7 @@ enum {
#define MPOL_F_STATIC_NODES (1 << 15)
#define MPOL_F_RELATIVE_NODES (1 << 14)
#define MPOL_F_NUMA_BALANCING (1 << 13) /* Optimize with NUMA balancing if possible */
+#define MPOL_F_PAGE_ORDER (1 << 12)
/*
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
@@ -56,6 +57,9 @@ enum {
MPOL_MF_MOVE | \
MPOL_MF_MOVE_ALL)
+#define MPOL_MF_PAGE_ORDER (1<<5) /* Set preferred page order */
+
+
/*
* Internal flags that share the struct mempolicy flags word with
* "mode flags". These flags are allocated from bit 0 up, as they
^ permalink raw reply [flat|nested] 91+ messages in thread
* Re: [PATCH v12 mm-new 00/15] khugepaged: mTHP support
2025-10-22 18:37 [PATCH v12 mm-new 00/15] khugepaged: mTHP support Nico Pache
` (14 preceding siblings ...)
2025-10-22 18:37 ` [PATCH v12 mm-new 15/15] Documentation: mm: update the admin guide for mTHP collapse Nico Pache
@ 2025-10-22 20:13 ` Andrew Morton
15 siblings, 0 replies; 91+ messages in thread
From: Andrew Morton @ 2025-10-22 20:13 UTC (permalink / raw)
To: Nico Pache
Cc: linux-kernel, linux-trace-kernel, linux-mm, linux-doc, david,
ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett, ryan.roberts,
dev.jain, corbet, rostedt, mhiramat, mathieu.desnoyers, baohua,
willy, peterx, wangkefeng.wang, usamaarif642, sunnanyong,
vishal.moola, thomas.hellstrom, yang, kas, aarcange, raquini,
anshuman.khandual, catalin.marinas, tiwai, will, dave.hansen,
jack, cl, jglisse, surenb, zokeefe, hannes, rientjes, mhocko,
rdunlap, hughd, richard.weiyang, lance.yang, vbabka, rppt, jannh,
pfalcato
On Wed, 22 Oct 2025 12:37:02 -0600 Nico Pache <npache@redhat.com> wrote:
> The following series provides khugepaged with the capability to collapse
> anonymous memory regions to mTHPs.
I added this to mm.git's mm-new branch, thanks.
I suppressed the 500 added-to-mm emails.
^ permalink raw reply [flat|nested] 91+ messages in thread