* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-15 9:04 ` [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed Vernon Yang
@ 2025-12-15 11:52 ` Lance Yang
2025-12-16 6:27 ` Vernon Yang
2025-12-15 21:45 ` kernel test robot
` (4 subsequent siblings)
5 siblings, 1 reply; 42+ messages in thread
From: Lance Yang @ 2025-12-15 11:52 UTC (permalink / raw)
To: Vernon Yang
Cc: ziy, npache, baohua, linux-mm, linux-kernel, Vernon Yang, akpm,
lorenzo.stoakes, david
Hi Vernon,
Thanks for the patches!
On 2025/12/15 17:04, Vernon Yang wrote:
> The following data was traced by bpftrace on a desktop system. After
> the system had been left idle for 10 minutes after booting, a lot of
> SCAN_PMD_MAPPED or SCAN_PMD_NONE results were observed during a full
> scan by khugepaged.
>
> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> @scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
> @scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
> total progress size: 701 MB
> Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
>
> The khugepaged_scan list saves all tasks that support collapse into hugepages;
> as long as the take is not destroyed, khugepaged will not remove it from
Nit: s/take/task/
> the khugepaged_scan list. This leads to a situation where a task has already
> collapsed all of its memory regions into hugepages, but khugepaged continues
> to scan it, which wastes CPU time, and the khugepaged_scan_sleep_millisecs
> delay (default 10s) causes a long wait when scanning a large number of such
> invalid tasks, so really valid tasks are scanned much later.
>
> After applying this patch, when all memory is either SCAN_PMD_MAPPED or
> SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
> list. If the task page-faults or calls MADV_HUGEPAGE again, the mm is
> added back to khugepaged.
>
> Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
> ---
> mm/khugepaged.c | 35 +++++++++++++++++++++++++----------
> 1 file changed, 25 insertions(+), 10 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 0598a19a98cc..1ec1af5be3c8 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -115,6 +115,7 @@ struct khugepaged_scan {
> struct list_head mm_head;
> struct mm_slot *mm_slot;
> unsigned long address;
> + bool maybe_collapse;
At a quick glance, the name of "maybe_collapse" is a bit ambiguous ...
Perhaps "scan_needed" or "collapse_possible" would be clearer to
indicate that the mm should be kept in the scan list?
> };
>
> static struct khugepaged_scan khugepaged_scan = {
> @@ -1420,22 +1421,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> return result;
> }
>
> -static void collect_mm_slot(struct mm_slot *slot)
> +static void collect_mm_slot(struct mm_slot *slot, bool maybe_collapse)
> {
> struct mm_struct *mm = slot->mm;
>
> lockdep_assert_held(&khugepaged_mm_lock);
>
> - if (hpage_collapse_test_exit(mm)) {
> + if (hpage_collapse_test_exit(mm) || !maybe_collapse) {
> /* free mm_slot */
> hash_del(&slot->hash);
> list_del(&slot->mm_node);
>
> - /*
> - * Not strictly needed because the mm exited already.
> - *
> - * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
> - */
> + if (!maybe_collapse)
> + mm_flags_clear(MMF_VM_HUGEPAGE, mm);
>
> /* khugepaged_mm_lock actually not necessary for the below */
> mm_slot_free(mm_slot_cache, slot);
> @@ -2397,6 +2395,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> struct mm_slot, mm_node);
> khugepaged_scan.address = 0;
> khugepaged_scan.mm_slot = slot;
> + khugepaged_scan.maybe_collapse = false;
> }
> spin_unlock(&khugepaged_mm_lock);
>
> @@ -2470,8 +2469,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> khugepaged_scan.address, &mmap_locked, cc);
> }
>
> - if (*result == SCAN_SUCCEED)
> + switch (*result) {
> + case SCAN_PMD_NULL:
> + case SCAN_PMD_NONE:
> + case SCAN_PMD_MAPPED:
> + case SCAN_PTE_MAPPED_HUGEPAGE:
> + break;
> + case SCAN_SUCCEED:
> ++khugepaged_pages_collapsed;
> + fallthrough;
> + default:
> + khugepaged_scan.maybe_collapse = true;
> + }
>
> /* move to next address */
> khugepaged_scan.address += HPAGE_PMD_SIZE;
> @@ -2500,6 +2509,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> * if we scanned all vmas of this mm.
> */
> if (hpage_collapse_test_exit(mm) || !vma) {
> + bool maybe_collapse = khugepaged_scan.maybe_collapse;
> +
> + if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
> + maybe_collapse = true;
> +
> /*
> * Make sure that if mm_users is reaching zero while
> * khugepaged runs here, khugepaged_exit will find
> @@ -2508,12 +2522,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
> khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
> khugepaged_scan.address = 0;
> + khugepaged_scan.maybe_collapse = false;
> } else {
> khugepaged_scan.mm_slot = NULL;
> khugepaged_full_scans++;
> }
>
> - collect_mm_slot(slot);
> + collect_mm_slot(slot, maybe_collapse);
> }
>
> trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
> @@ -2616,7 +2631,7 @@ static int khugepaged(void *none)
> slot = khugepaged_scan.mm_slot;
> khugepaged_scan.mm_slot = NULL;
> if (slot)
> - collect_mm_slot(slot);
> + collect_mm_slot(slot, true);
> spin_unlock(&khugepaged_mm_lock);
> return 0;
> }
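
To make the new lifecycle easier to follow, here is a userspace toy model
of the rule this patch introduces: an mm stays on the scan list only while
a full pass saw at least one result that might still be collapsed. This is
not kernel code; collect(), scan_pass() and the numeric stand-ins for the
SCAN_* values are invented purely for illustration.

#include <stdbool.h>
#include <stdio.h>

struct slot {
	int mm_id;
	bool on_list;
};

/* Mirrors the reworked collect_mm_slot(): drop the slot once nothing
 * in the mm can be collapsed any more. */
static void collect(struct slot *s, bool maybe_collapse)
{
	if (!maybe_collapse) {
		s->on_list = false;	/* models hash_del()/list_del() */
		printf("mm %d removed from scan list\n", s->mm_id);
	}
}

/* One scan pass: 3 and 4 stand in for SCAN_PMD_NONE/SCAN_PMD_MAPPED,
 * which do NOT keep the mm on the list; any other result does. */
static bool scan_pass(const int *results, int n)
{
	bool maybe_collapse = false;

	for (int i = 0; i < n; i++) {
		switch (results[i]) {
		case 3:			/* SCAN_PMD_NONE   */
		case 4:			/* SCAN_PMD_MAPPED */
			break;
		default:
			maybe_collapse = true;
		}
	}
	return maybe_collapse;
}

int main(void)
{
	struct slot s = { .mm_id = 42, .on_list = true };
	int all_done[] = { 4, 4, 3 };	/* everything already collapsed */

	collect(&s, scan_pass(all_done, 3));
	return 0;
}

Note the patch additionally forces maybe_collapse to true when
MMF_DISABLE_THP_COMPLETELY is set, so that special case is not modelled
here.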
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-15 11:52 ` Lance Yang
@ 2025-12-16 6:27 ` Vernon Yang
0 siblings, 0 replies; 42+ messages in thread
From: Vernon Yang @ 2025-12-16 6:27 UTC (permalink / raw)
To: Lance Yang
Cc: ziy, baohua, linux-mm, linux-kernel, Vernon Yang, akpm,
lorenzo.stoakes, david
On Mon, Dec 15, 2025 at 07:52:41PM +0800, Lance Yang wrote:
> Hi Vernon,
>
> Thanks for the patches!
>
> On 2025/12/15 17:04, Vernon Yang wrote:
> > The following data was traced by bpftrace on a desktop system. After
> > the system had been left idle for 10 minutes after booting, a lot of
> > SCAN_PMD_MAPPED or SCAN_PMD_NONE results were observed during a full
> > scan by khugepaged.
> >
> > @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> > @scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
> > @scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
> > total progress size: 701 MB
> > Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
> >
> > The khugepaged_scan list saves all tasks that support collapse into hugepages;
> > as long as the take is not destroyed, khugepaged will not remove it from
>
> Nit: s/take/task/
Thanks, I'll fix it in the next version.
> > the khugepaged_scan list. This leads to a situation where a task has already
> > collapsed all of its memory regions into hugepages, but khugepaged continues
> > to scan it, which wastes CPU time, and the khugepaged_scan_sleep_millisecs
> > delay (default 10s) causes a long wait when scanning a large number of such
> > invalid tasks, so really valid tasks are scanned much later.
> >
> > After applying this patch, when all memory is either SCAN_PMD_MAPPED or
> > SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
> > list. If the task page-faults or calls MADV_HUGEPAGE again, the mm is
> > added back to khugepaged.
> >
> > Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
> > ---
> > mm/khugepaged.c | 35 +++++++++++++++++++++++++----------
> > 1 file changed, 25 insertions(+), 10 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 0598a19a98cc..1ec1af5be3c8 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -115,6 +115,7 @@ struct khugepaged_scan {
> > struct list_head mm_head;
> > struct mm_slot *mm_slot;
> > unsigned long address;
> > + bool maybe_collapse;
>
> At a quick glance, the name of "maybe_collapse" is a bit ambiguous ...
>
> Perhaps "scan_needed" or "collapse_possible" would be clearer to
> indicate that the mm should be kept in the scan list?
The "collapse_possible" sounds good to me, Thanks! I will do it in the
next version.
--
Thanks,
Vernon
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-15 9:04 ` [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed Vernon Yang
2025-12-15 11:52 ` Lance Yang
@ 2025-12-15 21:45 ` kernel test robot
2025-12-16 6:30 ` Vernon Yang
2025-12-15 23:01 ` kernel test robot
` (3 subsequent siblings)
5 siblings, 1 reply; 42+ messages in thread
From: kernel test robot @ 2025-12-15 21:45 UTC (permalink / raw)
To: Vernon Yang, akpm, david, lorenzo.stoakes
Cc: llvm, oe-kbuild-all, ziy, npache, baohua, lance.yang, linux-mm,
linux-kernel, Vernon Yang
Hi Vernon,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.19-rc1 next-20251215]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Vernon-Yang/mm-khugepaged-add-trace_mm_khugepaged_scan-event/20251215-171046
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20251215090419.174418-3-yanglincheng%40kylinos.cn
patch subject: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20251216/202512160533.KuHwyJTP-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251216/202512160533.KuHwyJTP-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512160533.KuHwyJTP-lkp@intel.com/
All errors (new ones prefixed by >>):
>> mm/khugepaged.c:2490:9: error: use of undeclared identifier 'SCAN_PMD_NULL'; did you mean 'SCAN_VMA_NULL'?
2490 | case SCAN_PMD_NULL:
| ^~~~~~~~~~~~~
| SCAN_VMA_NULL
mm/khugepaged.c:50:2: note: 'SCAN_VMA_NULL' declared here
50 | SCAN_VMA_NULL,
| ^
>> mm/khugepaged.c:2491:9: error: use of undeclared identifier 'SCAN_PMD_NONE'
2491 | case SCAN_PMD_NONE:
| ^
2 errors generated.
vim +2490 mm/khugepaged.c
2392
2393 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
2394 struct collapse_control *cc)
2395 __releases(&khugepaged_mm_lock)
2396 __acquires(&khugepaged_mm_lock)
2397 {
2398 struct vma_iterator vmi;
2399 struct mm_slot *slot;
2400 struct mm_struct *mm;
2401 struct vm_area_struct *vma;
2402 int progress = 0;
2403
2404 VM_BUG_ON(!pages);
2405 lockdep_assert_held(&khugepaged_mm_lock);
2406 *result = SCAN_FAIL;
2407
2408 if (khugepaged_scan.mm_slot) {
2409 slot = khugepaged_scan.mm_slot;
2410 } else {
2411 slot = list_first_entry(&khugepaged_scan.mm_head,
2412 struct mm_slot, mm_node);
2413 khugepaged_scan.address = 0;
2414 khugepaged_scan.mm_slot = slot;
2415 khugepaged_scan.maybe_collapse = false;
2416 }
2417 spin_unlock(&khugepaged_mm_lock);
2418
2419 mm = slot->mm;
2420 /*
2421 * Don't wait for semaphore (to avoid long wait times). Just move to
2422 * the next mm on the list.
2423 */
2424 vma = NULL;
2425 if (unlikely(!mmap_read_trylock(mm)))
2426 goto breakouterloop_mmap_lock;
2427
2428 progress++;
2429 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
2430 goto breakouterloop;
2431
2432 vma_iter_init(&vmi, mm, khugepaged_scan.address);
2433 for_each_vma(vmi, vma) {
2434 unsigned long hstart, hend;
2435
2436 cond_resched();
2437 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
2438 progress++;
2439 break;
2440 }
2441 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
2442 skip:
2443 progress++;
2444 continue;
2445 }
2446 hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
2447 hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
2448 if (khugepaged_scan.address > hend)
2449 goto skip;
2450 if (khugepaged_scan.address < hstart)
2451 khugepaged_scan.address = hstart;
2452 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2453
2454 while (khugepaged_scan.address < hend) {
2455 bool mmap_locked = true;
2456
2457 cond_resched();
2458 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
2459 goto breakouterloop;
2460
2461 VM_BUG_ON(khugepaged_scan.address < hstart ||
2462 khugepaged_scan.address + HPAGE_PMD_SIZE >
2463 hend);
2464 if (!vma_is_anonymous(vma)) {
2465 struct file *file = get_file(vma->vm_file);
2466 pgoff_t pgoff = linear_page_index(vma,
2467 khugepaged_scan.address);
2468
2469 mmap_read_unlock(mm);
2470 mmap_locked = false;
2471 *result = hpage_collapse_scan_file(mm,
2472 khugepaged_scan.address, file, pgoff, cc);
2473 fput(file);
2474 if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
2475 mmap_read_lock(mm);
2476 if (hpage_collapse_test_exit_or_disable(mm))
2477 goto breakouterloop;
2478 *result = collapse_pte_mapped_thp(mm,
2479 khugepaged_scan.address, false);
2480 if (*result == SCAN_PMD_MAPPED)
2481 *result = SCAN_SUCCEED;
2482 mmap_read_unlock(mm);
2483 }
2484 } else {
2485 *result = hpage_collapse_scan_pmd(mm, vma,
2486 khugepaged_scan.address, &mmap_locked, cc);
2487 }
2488
2489 switch (*result) {
> 2490 case SCAN_PMD_NULL:
> 2491 case SCAN_PMD_NONE:
2492 case SCAN_PMD_MAPPED:
2493 case SCAN_PTE_MAPPED_HUGEPAGE:
2494 break;
2495 case SCAN_SUCCEED:
2496 ++khugepaged_pages_collapsed;
2497 fallthrough;
2498 default:
2499 khugepaged_scan.maybe_collapse = true;
2500 }
2501
2502 /* move to next address */
2503 khugepaged_scan.address += HPAGE_PMD_SIZE;
2504 progress += HPAGE_PMD_NR;
2505 if (!mmap_locked)
2506 /*
2507 * We released mmap_lock so break loop. Note
2508 * that we drop mmap_lock before all hugepage
2509 * allocations, so if allocation fails, we are
2510 * guaranteed to break here and report the
2511 * correct result back to caller.
2512 */
2513 goto breakouterloop_mmap_lock;
2514 if (progress >= pages)
2515 goto breakouterloop;
2516 }
2517 }
2518 breakouterloop:
2519 mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
2520 breakouterloop_mmap_lock:
2521
2522 spin_lock(&khugepaged_mm_lock);
2523 VM_BUG_ON(khugepaged_scan.mm_slot != slot);
2524 /*
2525 * Release the current mm_slot if this mm is about to die, or
2526 * if we scanned all vmas of this mm.
2527 */
2528 if (hpage_collapse_test_exit(mm) || !vma) {
2529 bool maybe_collapse = khugepaged_scan.maybe_collapse;
2530
2531 if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
2532 maybe_collapse = true;
2533
2534 /*
2535 * Make sure that if mm_users is reaching zero while
2536 * khugepaged runs here, khugepaged_exit will find
2537 * mm_slot not pointing to the exiting mm.
2538 */
2539 if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
2540 khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
2541 khugepaged_scan.address = 0;
2542 khugepaged_scan.maybe_collapse = false;
2543 } else {
2544 khugepaged_scan.mm_slot = NULL;
2545 khugepaged_full_scans++;
2546 }
2547
2548 collect_mm_slot(slot, maybe_collapse);
2549 }
2550
2551 trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
2552
2553 return progress;
2554 }
2555
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-15 21:45 ` kernel test robot
@ 2025-12-16 6:30 ` Vernon Yang
0 siblings, 0 replies; 42+ messages in thread
From: Vernon Yang @ 2025-12-16 6:30 UTC (permalink / raw)
To: kernel test robot
Cc: akpm, david, lorenzo.stoakes, llvm, oe-kbuild-all, ziy, baohua,
lance.yang, linux-mm, linux-kernel, Vernon Yang
On Tue, Dec 16, 2025 at 05:45:31AM +0800, kernel test robot wrote:
> Hi Vernon,
>
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on akpm-mm/mm-everything]
> [also build test ERROR on linus/master v6.19-rc1 next-20251215]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url: https://github.com/intel-lab-lkp/linux/commits/Vernon-Yang/mm-khugepaged-add-trace_mm_khugepaged_scan-event/20251215-171046
> base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
> patch link: https://lore.kernel.org/r/20251215090419.174418-3-yanglincheng%40kylinos.cn
> patch subject: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
> config: x86_64-kexec (https://download.01.org/0day-ci/archive/20251216/202512160533.KuHwyJTP-lkp@intel.com/config)
> compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251216/202512160533.KuHwyJTP-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202512160533.KuHwyJTP-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
> >> mm/khugepaged.c:2490:9: error: use of undeclared identifier 'SCAN_PMD_NULL'; did you mean 'SCAN_VMA_NULL'?
> 2490 | case SCAN_PMD_NULL:
> | ^~~~~~~~~~~~~
> | SCAN_VMA_NULL
> mm/khugepaged.c:50:2: note: 'SCAN_VMA_NULL' declared here
> 50 | SCAN_VMA_NULL,
> | ^
> >> mm/khugepaged.c:2491:9: error: use of undeclared identifier 'SCAN_PMD_NONE'
> 2491 | case SCAN_PMD_NONE:
> | ^
> 2 errors generated.
This series is based on Linux v6.18; v6.19-rc1 added "mm/khugepaged:
unify SCAN_PMD_NONE and SCAN_PMD_NULL into SCAN_NO_PTE_TABLE" [1], which
triggers these build errors. I'll fix it in the next version, thanks!
[1] https://lkml.kernel.org/r/20251114030028.7035-4-richard.weiyang@gmail.com
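
For the curious, a sketch of how the switch might read after that rebase.
This is purely hypothetical until v2 is posted; the enum below is a mock
subset so the snippet builds in userspace (the real definitions live in
mm/khugepaged.c), and update_maybe_collapse() is an invented wrapper:

#include <stdbool.h>
#include <stdio.h>

#define fallthrough __attribute__((__fallthrough__))

enum scan_result {			/* mock, for illustration only */
	SCAN_FAIL,
	SCAN_SUCCEED,
	SCAN_NO_PTE_TABLE,		/* replaces SCAN_PMD_NONE/SCAN_PMD_NULL */
	SCAN_PMD_MAPPED,
	SCAN_PTE_MAPPED_HUGEPAGE,
};

/* Returns true when the result should keep the mm on the scan list. */
static bool update_maybe_collapse(enum scan_result result,
				  unsigned long *pages_collapsed)
{
	bool maybe_collapse = false;

	switch (result) {
	case SCAN_NO_PTE_TABLE:
	case SCAN_PMD_MAPPED:
	case SCAN_PTE_MAPPED_HUGEPAGE:
		break;			/* nothing left to collapse here */
	case SCAN_SUCCEED:
		(*pages_collapsed)++;
		fallthrough;
	default:
		maybe_collapse = true;	/* mm may still have work to do */
	}
	return maybe_collapse;
}

int main(void)
{
	unsigned long collapsed = 0;

	printf("%d\n", update_maybe_collapse(SCAN_NO_PTE_TABLE, &collapsed));	/* 0 */
	printf("%d\n", update_maybe_collapse(SCAN_SUCCEED, &collapsed));	/* 1 */
	return 0;
}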
>
> vim +2490 mm/khugepaged.c
>
> 2392
> 2393 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> 2394 struct collapse_control *cc)
> 2395 __releases(&khugepaged_mm_lock)
> 2396 __acquires(&khugepaged_mm_lock)
> 2397 {
> 2398 struct vma_iterator vmi;
> 2399 struct mm_slot *slot;
> 2400 struct mm_struct *mm;
> 2401 struct vm_area_struct *vma;
> 2402 int progress = 0;
> 2403
> 2404 VM_BUG_ON(!pages);
> 2405 lockdep_assert_held(&khugepaged_mm_lock);
> 2406 *result = SCAN_FAIL;
> 2407
> 2408 if (khugepaged_scan.mm_slot) {
> 2409 slot = khugepaged_scan.mm_slot;
> 2410 } else {
> 2411 slot = list_first_entry(&khugepaged_scan.mm_head,
> 2412 struct mm_slot, mm_node);
> 2413 khugepaged_scan.address = 0;
> 2414 khugepaged_scan.mm_slot = slot;
> 2415 khugepaged_scan.maybe_collapse = false;
> 2416 }
> 2417 spin_unlock(&khugepaged_mm_lock);
> 2418
> 2419 mm = slot->mm;
> 2420 /*
> 2421 * Don't wait for semaphore (to avoid long wait times). Just move to
> 2422 * the next mm on the list.
> 2423 */
> 2424 vma = NULL;
> 2425 if (unlikely(!mmap_read_trylock(mm)))
> 2426 goto breakouterloop_mmap_lock;
> 2427
> 2428 progress++;
> 2429 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
> 2430 goto breakouterloop;
> 2431
> 2432 vma_iter_init(&vmi, mm, khugepaged_scan.address);
> 2433 for_each_vma(vmi, vma) {
> 2434 unsigned long hstart, hend;
> 2435
> 2436 cond_resched();
> 2437 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
> 2438 progress++;
> 2439 break;
> 2440 }
> 2441 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
> 2442 skip:
> 2443 progress++;
> 2444 continue;
> 2445 }
> 2446 hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
> 2447 hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
> 2448 if (khugepaged_scan.address > hend)
> 2449 goto skip;
> 2450 if (khugepaged_scan.address < hstart)
> 2451 khugepaged_scan.address = hstart;
> 2452 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
> 2453
> 2454 while (khugepaged_scan.address < hend) {
> 2455 bool mmap_locked = true;
> 2456
> 2457 cond_resched();
> 2458 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
> 2459 goto breakouterloop;
> 2460
> 2461 VM_BUG_ON(khugepaged_scan.address < hstart ||
> 2462 khugepaged_scan.address + HPAGE_PMD_SIZE >
> 2463 hend);
> 2464 if (!vma_is_anonymous(vma)) {
> 2465 struct file *file = get_file(vma->vm_file);
> 2466 pgoff_t pgoff = linear_page_index(vma,
> 2467 khugepaged_scan.address);
> 2468
> 2469 mmap_read_unlock(mm);
> 2470 mmap_locked = false;
> 2471 *result = hpage_collapse_scan_file(mm,
> 2472 khugepaged_scan.address, file, pgoff, cc);
> 2473 fput(file);
> 2474 if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> 2475 mmap_read_lock(mm);
> 2476 if (hpage_collapse_test_exit_or_disable(mm))
> 2477 goto breakouterloop;
> 2478 *result = collapse_pte_mapped_thp(mm,
> 2479 khugepaged_scan.address, false);
> 2480 if (*result == SCAN_PMD_MAPPED)
> 2481 *result = SCAN_SUCCEED;
> 2482 mmap_read_unlock(mm);
> 2483 }
> 2484 } else {
> 2485 *result = hpage_collapse_scan_pmd(mm, vma,
> 2486 khugepaged_scan.address, &mmap_locked, cc);
> 2487 }
> 2488
> 2489 switch (*result) {
> > 2490 case SCAN_PMD_NULL:
> > 2491 case SCAN_PMD_NONE:
> 2492 case SCAN_PMD_MAPPED:
> 2493 case SCAN_PTE_MAPPED_HUGEPAGE:
> 2494 break;
> 2495 case SCAN_SUCCEED:
> 2496 ++khugepaged_pages_collapsed;
> 2497 fallthrough;
> 2498 default:
> 2499 khugepaged_scan.maybe_collapse = true;
> 2500 }
> 2501
> 2502 /* move to next address */
> 2503 khugepaged_scan.address += HPAGE_PMD_SIZE;
> 2504 progress += HPAGE_PMD_NR;
> 2505 if (!mmap_locked)
> 2506 /*
> 2507 * We released mmap_lock so break loop. Note
> 2508 * that we drop mmap_lock before all hugepage
> 2509 * allocations, so if allocation fails, we are
> 2510 * guaranteed to break here and report the
> 2511 * correct result back to caller.
> 2512 */
> 2513 goto breakouterloop_mmap_lock;
> 2514 if (progress >= pages)
> 2515 goto breakouterloop;
> 2516 }
> 2517 }
> 2518 breakouterloop:
> 2519 mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
> 2520 breakouterloop_mmap_lock:
> 2521
> 2522 spin_lock(&khugepaged_mm_lock);
> 2523 VM_BUG_ON(khugepaged_scan.mm_slot != slot);
> 2524 /*
> 2525 * Release the current mm_slot if this mm is about to die, or
> 2526 * if we scanned all vmas of this mm.
> 2527 */
> 2528 if (hpage_collapse_test_exit(mm) || !vma) {
> 2529 bool maybe_collapse = khugepaged_scan.maybe_collapse;
> 2530
> 2531 if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
> 2532 maybe_collapse = true;
> 2533
> 2534 /*
> 2535 * Make sure that if mm_users is reaching zero while
> 2536 * khugepaged runs here, khugepaged_exit will find
> 2537 * mm_slot not pointing to the exiting mm.
> 2538 */
> 2539 if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
> 2540 khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
> 2541 khugepaged_scan.address = 0;
> 2542 khugepaged_scan.maybe_collapse = false;
> 2543 } else {
> 2544 khugepaged_scan.mm_slot = NULL;
> 2545 khugepaged_full_scans++;
> 2546 }
> 2547
> 2548 collect_mm_slot(slot, maybe_collapse);
> 2549 }
> 2550
> 2551 trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
> 2552
> 2553 return progress;
> 2554 }
> 2555
>
> --
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests/wiki
--
Thanks,
Vernon
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-15 9:04 ` [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed Vernon Yang
2025-12-15 11:52 ` Lance Yang
2025-12-15 21:45 ` kernel test robot
@ 2025-12-15 23:01 ` kernel test robot
2025-12-16 6:32 ` Vernon Yang
2025-12-17 3:31 ` Wei Yang
` (2 subsequent siblings)
5 siblings, 1 reply; 42+ messages in thread
From: kernel test robot @ 2025-12-15 23:01 UTC (permalink / raw)
To: Vernon Yang, akpm, david, lorenzo.stoakes
Cc: oe-kbuild-all, ziy, npache, baohua, lance.yang, linux-mm,
linux-kernel, Vernon Yang
Hi Vernon,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.19-rc1 next-20251215]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Vernon-Yang/mm-khugepaged-add-trace_mm_khugepaged_scan-event/20251215-171046
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20251215090419.174418-3-yanglincheng%40kylinos.cn
patch subject: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
config: x86_64-rhel-9.4 (https://download.01.org/0day-ci/archive/20251216/202512160619.3Ut4sxaJ-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251216/202512160619.3Ut4sxaJ-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512160619.3Ut4sxaJ-lkp@intel.com/
All errors (new ones prefixed by >>):
mm/khugepaged.c: In function 'khugepaged_scan_mm_slot':
>> mm/khugepaged.c:2490:30: error: 'SCAN_PMD_NULL' undeclared (first use in this function); did you mean 'SCAN_VMA_NULL'?
2490 | case SCAN_PMD_NULL:
| ^~~~~~~~~~~~~
| SCAN_VMA_NULL
mm/khugepaged.c:2490:30: note: each undeclared identifier is reported only once for each function it appears in
>> mm/khugepaged.c:2491:30: error: 'SCAN_PMD_NONE' undeclared (first use in this function)
2491 | case SCAN_PMD_NONE:
| ^~~~~~~~~~~~~
vim +2490 mm/khugepaged.c
2392
2393 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
2394 struct collapse_control *cc)
2395 __releases(&khugepaged_mm_lock)
2396 __acquires(&khugepaged_mm_lock)
2397 {
2398 struct vma_iterator vmi;
2399 struct mm_slot *slot;
2400 struct mm_struct *mm;
2401 struct vm_area_struct *vma;
2402 int progress = 0;
2403
2404 VM_BUG_ON(!pages);
2405 lockdep_assert_held(&khugepaged_mm_lock);
2406 *result = SCAN_FAIL;
2407
2408 if (khugepaged_scan.mm_slot) {
2409 slot = khugepaged_scan.mm_slot;
2410 } else {
2411 slot = list_first_entry(&khugepaged_scan.mm_head,
2412 struct mm_slot, mm_node);
2413 khugepaged_scan.address = 0;
2414 khugepaged_scan.mm_slot = slot;
2415 khugepaged_scan.maybe_collapse = false;
2416 }
2417 spin_unlock(&khugepaged_mm_lock);
2418
2419 mm = slot->mm;
2420 /*
2421 * Don't wait for semaphore (to avoid long wait times). Just move to
2422 * the next mm on the list.
2423 */
2424 vma = NULL;
2425 if (unlikely(!mmap_read_trylock(mm)))
2426 goto breakouterloop_mmap_lock;
2427
2428 progress++;
2429 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
2430 goto breakouterloop;
2431
2432 vma_iter_init(&vmi, mm, khugepaged_scan.address);
2433 for_each_vma(vmi, vma) {
2434 unsigned long hstart, hend;
2435
2436 cond_resched();
2437 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
2438 progress++;
2439 break;
2440 }
2441 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
2442 skip:
2443 progress++;
2444 continue;
2445 }
2446 hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
2447 hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
2448 if (khugepaged_scan.address > hend)
2449 goto skip;
2450 if (khugepaged_scan.address < hstart)
2451 khugepaged_scan.address = hstart;
2452 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2453
2454 while (khugepaged_scan.address < hend) {
2455 bool mmap_locked = true;
2456
2457 cond_resched();
2458 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
2459 goto breakouterloop;
2460
2461 VM_BUG_ON(khugepaged_scan.address < hstart ||
2462 khugepaged_scan.address + HPAGE_PMD_SIZE >
2463 hend);
2464 if (!vma_is_anonymous(vma)) {
2465 struct file *file = get_file(vma->vm_file);
2466 pgoff_t pgoff = linear_page_index(vma,
2467 khugepaged_scan.address);
2468
2469 mmap_read_unlock(mm);
2470 mmap_locked = false;
2471 *result = hpage_collapse_scan_file(mm,
2472 khugepaged_scan.address, file, pgoff, cc);
2473 fput(file);
2474 if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
2475 mmap_read_lock(mm);
2476 if (hpage_collapse_test_exit_or_disable(mm))
2477 goto breakouterloop;
2478 *result = collapse_pte_mapped_thp(mm,
2479 khugepaged_scan.address, false);
2480 if (*result == SCAN_PMD_MAPPED)
2481 *result = SCAN_SUCCEED;
2482 mmap_read_unlock(mm);
2483 }
2484 } else {
2485 *result = hpage_collapse_scan_pmd(mm, vma,
2486 khugepaged_scan.address, &mmap_locked, cc);
2487 }
2488
2489 switch (*result) {
> 2490 case SCAN_PMD_NULL:
> 2491 case SCAN_PMD_NONE:
2492 case SCAN_PMD_MAPPED:
2493 case SCAN_PTE_MAPPED_HUGEPAGE:
2494 break;
2495 case SCAN_SUCCEED:
2496 ++khugepaged_pages_collapsed;
2497 fallthrough;
2498 default:
2499 khugepaged_scan.maybe_collapse = true;
2500 }
2501
2502 /* move to next address */
2503 khugepaged_scan.address += HPAGE_PMD_SIZE;
2504 progress += HPAGE_PMD_NR;
2505 if (!mmap_locked)
2506 /*
2507 * We released mmap_lock so break loop. Note
2508 * that we drop mmap_lock before all hugepage
2509 * allocations, so if allocation fails, we are
2510 * guaranteed to break here and report the
2511 * correct result back to caller.
2512 */
2513 goto breakouterloop_mmap_lock;
2514 if (progress >= pages)
2515 goto breakouterloop;
2516 }
2517 }
2518 breakouterloop:
2519 mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
2520 breakouterloop_mmap_lock:
2521
2522 spin_lock(&khugepaged_mm_lock);
2523 VM_BUG_ON(khugepaged_scan.mm_slot != slot);
2524 /*
2525 * Release the current mm_slot if this mm is about to die, or
2526 * if we scanned all vmas of this mm.
2527 */
2528 if (hpage_collapse_test_exit(mm) || !vma) {
2529 bool maybe_collapse = khugepaged_scan.maybe_collapse;
2530
2531 if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
2532 maybe_collapse = true;
2533
2534 /*
2535 * Make sure that if mm_users is reaching zero while
2536 * khugepaged runs here, khugepaged_exit will find
2537 * mm_slot not pointing to the exiting mm.
2538 */
2539 if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
2540 khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
2541 khugepaged_scan.address = 0;
2542 khugepaged_scan.maybe_collapse = false;
2543 } else {
2544 khugepaged_scan.mm_slot = NULL;
2545 khugepaged_full_scans++;
2546 }
2547
2548 collect_mm_slot(slot, maybe_collapse);
2549 }
2550
2551 trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
2552
2553 return progress;
2554 }
2555
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-15 23:01 ` kernel test robot
@ 2025-12-16 6:32 ` Vernon Yang
0 siblings, 0 replies; 42+ messages in thread
From: Vernon Yang @ 2025-12-16 6:32 UTC (permalink / raw)
To: kernel test robot
Cc: akpm, david, lorenzo.stoakes, oe-kbuild-all, ziy, baohua,
lance.yang, linux-mm, linux-kernel, Vernon Yang
On Tue, Dec 16, 2025 at 07:01:18AM +0800, kernel test robot wrote:
> Hi Vernon,
>
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on akpm-mm/mm-everything]
> [also build test ERROR on linus/master v6.19-rc1 next-20251215]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url: https://github.com/intel-lab-lkp/linux/commits/Vernon-Yang/mm-khugepaged-add-trace_mm_khugepaged_scan-event/20251215-171046
> base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
> patch link: https://lore.kernel.org/r/20251215090419.174418-3-yanglincheng%40kylinos.cn
> patch subject: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
> config: x86_64-rhel-9.4 (https://download.01.org/0day-ci/archive/20251216/202512160619.3Ut4sxaJ-lkp@intel.com/config)
> compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251216/202512160619.3Ut4sxaJ-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202512160619.3Ut4sxaJ-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
> mm/khugepaged.c: In function 'khugepaged_scan_mm_slot':
> >> mm/khugepaged.c:2490:30: error: 'SCAN_PMD_NULL' undeclared (first use in this function); did you mean 'SCAN_VMA_NULL'?
> 2490 | case SCAN_PMD_NULL:
> | ^~~~~~~~~~~~~
> | SCAN_VMA_NULL
> mm/khugepaged.c:2490:30: note: each undeclared identifier is reported only once for each function it appears in
> >> mm/khugepaged.c:2491:30: error: 'SCAN_PMD_NONE' undeclared (first use in this function)
> 2491 | case SCAN_PMD_NONE:
> | ^~~~~~~~~~~~~
Same as above, thanks.
>
> vim +2490 mm/khugepaged.c
>
> 2392
> 2393 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> 2394 struct collapse_control *cc)
> 2395 __releases(&khugepaged_mm_lock)
> 2396 __acquires(&khugepaged_mm_lock)
> 2397 {
> 2398 struct vma_iterator vmi;
> 2399 struct mm_slot *slot;
> 2400 struct mm_struct *mm;
> 2401 struct vm_area_struct *vma;
> 2402 int progress = 0;
> 2403
> 2404 VM_BUG_ON(!pages);
> 2405 lockdep_assert_held(&khugepaged_mm_lock);
> 2406 *result = SCAN_FAIL;
> 2407
> 2408 if (khugepaged_scan.mm_slot) {
> 2409 slot = khugepaged_scan.mm_slot;
> 2410 } else {
> 2411 slot = list_first_entry(&khugepaged_scan.mm_head,
> 2412 struct mm_slot, mm_node);
> 2413 khugepaged_scan.address = 0;
> 2414 khugepaged_scan.mm_slot = slot;
> 2415 khugepaged_scan.maybe_collapse = false;
> 2416 }
> 2417 spin_unlock(&khugepaged_mm_lock);
> 2418
> 2419 mm = slot->mm;
> 2420 /*
> 2421 * Don't wait for semaphore (to avoid long wait times). Just move to
> 2422 * the next mm on the list.
> 2423 */
> 2424 vma = NULL;
> 2425 if (unlikely(!mmap_read_trylock(mm)))
> 2426 goto breakouterloop_mmap_lock;
> 2427
> 2428 progress++;
> 2429 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
> 2430 goto breakouterloop;
> 2431
> 2432 vma_iter_init(&vmi, mm, khugepaged_scan.address);
> 2433 for_each_vma(vmi, vma) {
> 2434 unsigned long hstart, hend;
> 2435
> 2436 cond_resched();
> 2437 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
> 2438 progress++;
> 2439 break;
> 2440 }
> 2441 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
> 2442 skip:
> 2443 progress++;
> 2444 continue;
> 2445 }
> 2446 hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
> 2447 hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
> 2448 if (khugepaged_scan.address > hend)
> 2449 goto skip;
> 2450 if (khugepaged_scan.address < hstart)
> 2451 khugepaged_scan.address = hstart;
> 2452 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
> 2453
> 2454 while (khugepaged_scan.address < hend) {
> 2455 bool mmap_locked = true;
> 2456
> 2457 cond_resched();
> 2458 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
> 2459 goto breakouterloop;
> 2460
> 2461 VM_BUG_ON(khugepaged_scan.address < hstart ||
> 2462 khugepaged_scan.address + HPAGE_PMD_SIZE >
> 2463 hend);
> 2464 if (!vma_is_anonymous(vma)) {
> 2465 struct file *file = get_file(vma->vm_file);
> 2466 pgoff_t pgoff = linear_page_index(vma,
> 2467 khugepaged_scan.address);
> 2468
> 2469 mmap_read_unlock(mm);
> 2470 mmap_locked = false;
> 2471 *result = hpage_collapse_scan_file(mm,
> 2472 khugepaged_scan.address, file, pgoff, cc);
> 2473 fput(file);
> 2474 if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
> 2475 mmap_read_lock(mm);
> 2476 if (hpage_collapse_test_exit_or_disable(mm))
> 2477 goto breakouterloop;
> 2478 *result = collapse_pte_mapped_thp(mm,
> 2479 khugepaged_scan.address, false);
> 2480 if (*result == SCAN_PMD_MAPPED)
> 2481 *result = SCAN_SUCCEED;
> 2482 mmap_read_unlock(mm);
> 2483 }
> 2484 } else {
> 2485 *result = hpage_collapse_scan_pmd(mm, vma,
> 2486 khugepaged_scan.address, &mmap_locked, cc);
> 2487 }
> 2488
> 2489 switch (*result) {
> > 2490 case SCAN_PMD_NULL:
> > 2491 case SCAN_PMD_NONE:
> 2492 case SCAN_PMD_MAPPED:
> 2493 case SCAN_PTE_MAPPED_HUGEPAGE:
> 2494 break;
> 2495 case SCAN_SUCCEED:
> 2496 ++khugepaged_pages_collapsed;
> 2497 fallthrough;
> 2498 default:
> 2499 khugepaged_scan.maybe_collapse = true;
> 2500 }
> 2501
> 2502 /* move to next address */
> 2503 khugepaged_scan.address += HPAGE_PMD_SIZE;
> 2504 progress += HPAGE_PMD_NR;
> 2505 if (!mmap_locked)
> 2506 /*
> 2507 * We released mmap_lock so break loop. Note
> 2508 * that we drop mmap_lock before all hugepage
> 2509 * allocations, so if allocation fails, we are
> 2510 * guaranteed to break here and report the
> 2511 * correct result back to caller.
> 2512 */
> 2513 goto breakouterloop_mmap_lock;
> 2514 if (progress >= pages)
> 2515 goto breakouterloop;
> 2516 }
> 2517 }
> 2518 breakouterloop:
> 2519 mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
> 2520 breakouterloop_mmap_lock:
> 2521
> 2522 spin_lock(&khugepaged_mm_lock);
> 2523 VM_BUG_ON(khugepaged_scan.mm_slot != slot);
> 2524 /*
> 2525 * Release the current mm_slot if this mm is about to die, or
> 2526 * if we scanned all vmas of this mm.
> 2527 */
> 2528 if (hpage_collapse_test_exit(mm) || !vma) {
> 2529 bool maybe_collapse = khugepaged_scan.maybe_collapse;
> 2530
> 2531 if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
> 2532 maybe_collapse = true;
> 2533
> 2534 /*
> 2535 * Make sure that if mm_users is reaching zero while
> 2536 * khugepaged runs here, khugepaged_exit will find
> 2537 * mm_slot not pointing to the exiting mm.
> 2538 */
> 2539 if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
> 2540 khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
> 2541 khugepaged_scan.address = 0;
> 2542 khugepaged_scan.maybe_collapse = false;
> 2543 } else {
> 2544 khugepaged_scan.mm_slot = NULL;
> 2545 khugepaged_full_scans++;
> 2546 }
> 2547
> 2548 collect_mm_slot(slot, maybe_collapse);
> 2549 }
> 2550
> 2551 trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
> 2552
> 2553 return progress;
> 2554 }
> 2555
>
> --
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests/wiki
--
Thanks,
Vernon
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-15 9:04 ` [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed Vernon Yang
` (2 preceding siblings ...)
2025-12-15 23:01 ` kernel test robot
@ 2025-12-17 3:31 ` Wei Yang
2025-12-18 3:27 ` Vernon Yang
2025-12-18 9:29 ` David Hildenbrand (Red Hat)
2025-12-22 19:00 ` kernel test robot
5 siblings, 1 reply; 42+ messages in thread
From: Wei Yang @ 2025-12-17 3:31 UTC (permalink / raw)
To: Vernon Yang
Cc: akpm, david, lorenzo.stoakes, ziy, npache, baohua, lance.yang,
linux-mm, linux-kernel, Vernon Yang
On Mon, Dec 15, 2025 at 05:04:17PM +0800, Vernon Yang wrote:
>The following data was traced by bpftrace on a desktop system. After
>the system had been left idle for 10 minutes after booting, a lot of
>SCAN_PMD_MAPPED or SCAN_PMD_NONE results were observed during a full
>scan by khugepaged.
>
>@scan_pmd_status[1]: 1 ## SCAN_SUCCEED
>@scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
>@scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
>total progress size: 701 MB
>Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
>
>The khugepaged_scan list saves all tasks that support collapse into hugepages;
>as long as the take is not destroyed, khugepaged will not remove it from
>the khugepaged_scan list. This leads to a situation where a task has already
>collapsed all of its memory regions into hugepages, but khugepaged continues
>to scan it, which wastes CPU time, and the khugepaged_scan_sleep_millisecs
>delay (default 10s) causes a long wait when scanning a large number of such
>invalid tasks, so really valid tasks are scanned much later.
>
>After applying this patch, when all memory is either SCAN_PMD_MAPPED or
>SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
>list. If the task page-faults or calls MADV_HUGEPAGE again, the mm is
>added back to khugepaged.
Two things come to my mind:
* what happens if we split the huge page under memory pressure?
* would this interfere with mTHP collapse?
>
>Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
>---
> mm/khugepaged.c | 35 +++++++++++++++++++++++++----------
> 1 file changed, 25 insertions(+), 10 deletions(-)
>
>diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>index 0598a19a98cc..1ec1af5be3c8 100644
>--- a/mm/khugepaged.c
>+++ b/mm/khugepaged.c
>@@ -115,6 +115,7 @@ struct khugepaged_scan {
> struct list_head mm_head;
> struct mm_slot *mm_slot;
> unsigned long address;
>+ bool maybe_collapse;
> };
>
> static struct khugepaged_scan khugepaged_scan = {
>@@ -1420,22 +1421,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> return result;
> }
>
>-static void collect_mm_slot(struct mm_slot *slot)
>+static void collect_mm_slot(struct mm_slot *slot, bool maybe_collapse)
> {
> struct mm_struct *mm = slot->mm;
>
> lockdep_assert_held(&khugepaged_mm_lock);
>
>- if (hpage_collapse_test_exit(mm)) {
>+ if (hpage_collapse_test_exit(mm) || !maybe_collapse) {
> /* free mm_slot */
> hash_del(&slot->hash);
> list_del(&slot->mm_node);
>
>- /*
>- * Not strictly needed because the mm exited already.
>- *
>- * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
>- */
>+ if (!maybe_collapse)
>+ mm_flags_clear(MMF_VM_HUGEPAGE, mm);
>
> /* khugepaged_mm_lock actually not necessary for the below */
> mm_slot_free(mm_slot_cache, slot);
>@@ -2397,6 +2395,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> struct mm_slot, mm_node);
> khugepaged_scan.address = 0;
> khugepaged_scan.mm_slot = slot;
>+ khugepaged_scan.maybe_collapse = false;
> }
> spin_unlock(&khugepaged_mm_lock);
>
>@@ -2470,8 +2469,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> khugepaged_scan.address, &mmap_locked, cc);
> }
>
>- if (*result == SCAN_SUCCEED)
>+ switch (*result) {
>+ case SCAN_PMD_NULL:
>+ case SCAN_PMD_NONE:
>+ case SCAN_PMD_MAPPED:
>+ case SCAN_PTE_MAPPED_HUGEPAGE:
>+ break;
>+ case SCAN_SUCCEED:
> ++khugepaged_pages_collapsed;
>+ fallthrough;
If the collapse succeeds, don't we need to set maybe_collapse to true?
>+ default:
>+ khugepaged_scan.maybe_collapse = true;
>+ }
>
> /* move to next address */
> khugepaged_scan.address += HPAGE_PMD_SIZE;
>@@ -2500,6 +2509,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> * if we scanned all vmas of this mm.
> */
> if (hpage_collapse_test_exit(mm) || !vma) {
>+ bool maybe_collapse = khugepaged_scan.maybe_collapse;
>+
>+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
>+ maybe_collapse = true;
>+
> /*
> * Make sure that if mm_users is reaching zero while
> * khugepaged runs here, khugepaged_exit will find
>@@ -2508,12 +2522,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
> khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
> khugepaged_scan.address = 0;
>+ khugepaged_scan.maybe_collapse = false;
> } else {
> khugepaged_scan.mm_slot = NULL;
> khugepaged_full_scans++;
> }
>
>- collect_mm_slot(slot);
>+ collect_mm_slot(slot, maybe_collapse);
> }
>
> trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
>@@ -2616,7 +2631,7 @@ static int khugepaged(void *none)
> slot = khugepaged_scan.mm_slot;
> khugepaged_scan.mm_slot = NULL;
> if (slot)
>- collect_mm_slot(slot);
>+ collect_mm_slot(slot, true);
> spin_unlock(&khugepaged_mm_lock);
> return 0;
> }
>--
>2.51.0
>
--
Wei Yang
Help you, Help me
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-17 3:31 ` Wei Yang
@ 2025-12-18 3:27 ` Vernon Yang
2025-12-18 3:48 ` Wei Yang
0 siblings, 1 reply; 42+ messages in thread
From: Vernon Yang @ 2025-12-18 3:27 UTC (permalink / raw)
To: Wei Yang
Cc: akpm, david, lorenzo.stoakes, ziy, baohua, lance.yang, linux-mm,
linux-kernel, Vernon Yang
On Wed, Dec 17, 2025 at 03:31:55AM +0000, Wei Yang wrote:
> On Mon, Dec 15, 2025 at 05:04:17PM +0800, Vernon Yang wrote:
> >The following data was traced by bpftrace on a desktop system. After
> >the system had been left idle for 10 minutes after booting, a lot of
> >SCAN_PMD_MAPPED or SCAN_PMD_NONE results were observed during a full
> >scan by khugepaged.
> >
> >@scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> >@scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
> >@scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
> >total progress size: 701 MB
> >Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
> >
> >The khugepaged_scan list saves all tasks that support collapse into hugepages;
> >as long as the take is not destroyed, khugepaged will not remove it from
> >the khugepaged_scan list. This leads to a situation where a task has already
> >collapsed all of its memory regions into hugepages, but khugepaged continues
> >to scan it, which wastes CPU time, and the khugepaged_scan_sleep_millisecs
> >delay (default 10s) causes a long wait when scanning a large number of such
> >invalid tasks, so really valid tasks are scanned much later.
> >
> >After applying this patch, when all memory is either SCAN_PMD_MAPPED or
> >SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
> >list. If the task page-faults or calls MADV_HUGEPAGE again, the mm is
> >added back to khugepaged.
>
> Two things come to my mind:
>
> * what happens if we split the huge page under memory pressure?
static unsigned int shrink_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat, struct scan_control *sc,
struct reclaim_stat *stat, bool ignore_references,
struct mem_cgroup *memcg)
{
...
folio = lru_to_folio(folio_list);
...
references = folio_check_references(folio, sc);
switch (references) {
case FOLIOREF_ACTIVATE:
goto activate_locked;
case FOLIOREF_KEEP:
stat->nr_ref_keep += nr_pages;
goto keep_locked;
case FOLIOREF_RECLAIM:
case FOLIOREF_RECLAIM_CLEAN:
; /* try to reclaim the folio below */
}
...
split_folio_to_list(folio, folio_list);
}
During memory reclaim above, only inactive folios are split. This also
implies that the folio is cold, meaning it hasn't been used recently, so
we do not expect to put the mm back onto the khugepaged scan list to
continue scanning/collapsing. khugepaged should preferentially scan and
collapse hot folios to avoid wasting CPU.
> * would this interfere with mTHP collapse?
It has no impact on mTHP collapse; the mm is removed automatically only
when all memory is either SCAN_PMD_MAPPED or SCAN_PMD_NONE. In all
other cases it is not removed.
Please let me know if I missed something, thanks!
>
> >
> >Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
> >---
> > mm/khugepaged.c | 35 +++++++++++++++++++++++++----------
> > 1 file changed, 25 insertions(+), 10 deletions(-)
> >
> >diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >index 0598a19a98cc..1ec1af5be3c8 100644
> >--- a/mm/khugepaged.c
> >+++ b/mm/khugepaged.c
> >@@ -115,6 +115,7 @@ struct khugepaged_scan {
> > struct list_head mm_head;
> > struct mm_slot *mm_slot;
> > unsigned long address;
> >+ bool maybe_collapse;
> > };
> >
> > static struct khugepaged_scan khugepaged_scan = {
> >@@ -1420,22 +1421,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> > return result;
> > }
> >
> >-static void collect_mm_slot(struct mm_slot *slot)
> >+static void collect_mm_slot(struct mm_slot *slot, bool maybe_collapse)
> > {
> > struct mm_struct *mm = slot->mm;
> >
> > lockdep_assert_held(&khugepaged_mm_lock);
> >
> >- if (hpage_collapse_test_exit(mm)) {
> >+ if (hpage_collapse_test_exit(mm) || !maybe_collapse) {
> > /* free mm_slot */
> > hash_del(&slot->hash);
> > list_del(&slot->mm_node);
> >
> >- /*
> >- * Not strictly needed because the mm exited already.
> >- *
> >- * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
> >- */
> >+ if (!maybe_collapse)
> >+ mm_flags_clear(MMF_VM_HUGEPAGE, mm);
> >
> > /* khugepaged_mm_lock actually not necessary for the below */
> > mm_slot_free(mm_slot_cache, slot);
> >@@ -2397,6 +2395,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> > struct mm_slot, mm_node);
> > khugepaged_scan.address = 0;
> > khugepaged_scan.mm_slot = slot;
> >+ khugepaged_scan.maybe_collapse = false;
> > }
> > spin_unlock(&khugepaged_mm_lock);
> >
> >@@ -2470,8 +2469,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> > khugepaged_scan.address, &mmap_locked, cc);
> > }
> >
> >- if (*result == SCAN_SUCCEED)
> >+ switch (*result) {
> >+ case SCAN_PMD_NULL:
> >+ case SCAN_PMD_NONE:
> >+ case SCAN_PMD_MAPPED:
> >+ case SCAN_PTE_MAPPED_HUGEPAGE:
> >+ break;
> >+ case SCAN_SUCCEED:
> > ++khugepaged_pages_collapsed;
> >+ fallthrough;
>
> If the collapse succeeds, don't we need to set maybe_collapse to true?
Above "fallthrough" explicitly tells the compiler that when the collapse is
successful, run below "khugepaged_scan.maybe_collapse = true" :)
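
For anyone unfamiliar with the kernel's fallthrough pseudo-keyword: it
expands to a compiler attribute that both documents the intent and keeps
gcc's -Wimplicit-fallthrough quiet. A tiny self-contained illustration
(the #define approximates the kernel's definition from
<linux/compiler_attributes.h>; classify() and its values are invented):

#include <stdio.h>

#define fallthrough __attribute__((__fallthrough__))

static int classify(int result)
{
	int maybe_collapse = 0;

	switch (result) {
	case 1:				/* stand-in for SCAN_SUCCEED */
		printf("collapsed one region\n");
		fallthrough;		/* without this (or a break), gcc
					 * -Wextra warns about the implicit
					 * fall through */
	default:
		maybe_collapse = 1;	/* runs for case 1 as well */
	}
	return maybe_collapse;
}

int main(void)
{
	return classify(1) ? 0 : 1;
}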
> >+ default:
> >+ khugepaged_scan.maybe_collapse = true;
> >+ }
> >
> > /* move to next address */
> > khugepaged_scan.address += HPAGE_PMD_SIZE;
> >@@ -2500,6 +2509,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> > * if we scanned all vmas of this mm.
> > */
> > if (hpage_collapse_test_exit(mm) || !vma) {
> >+ bool maybe_collapse = khugepaged_scan.maybe_collapse;
> >+
> >+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
> >+ maybe_collapse = true;
> >+
> > /*
> > * Make sure that if mm_users is reaching zero while
> > * khugepaged runs here, khugepaged_exit will find
> >@@ -2508,12 +2522,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> > if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
> > khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
> > khugepaged_scan.address = 0;
> >+ khugepaged_scan.maybe_collapse = false;
> > } else {
> > khugepaged_scan.mm_slot = NULL;
> > khugepaged_full_scans++;
> > }
> >
> >- collect_mm_slot(slot);
> >+ collect_mm_slot(slot, maybe_collapse);
> > }
> >
> > trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
> >@@ -2616,7 +2631,7 @@ static int khugepaged(void *none)
> > slot = khugepaged_scan.mm_slot;
> > khugepaged_scan.mm_slot = NULL;
> > if (slot)
> >- collect_mm_slot(slot);
> >+ collect_mm_slot(slot, true);
> > spin_unlock(&khugepaged_mm_lock);
> > return 0;
> > }
> >--
> >2.51.0
> >
>
> --
> Wei Yang
> Help you, Help me
--
Thanks,
Vernon
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-18 3:27 ` Vernon Yang
@ 2025-12-18 3:48 ` Wei Yang
2025-12-18 4:41 ` Vernon Yang
0 siblings, 1 reply; 42+ messages in thread
From: Wei Yang @ 2025-12-18 3:48 UTC (permalink / raw)
To: Vernon Yang
Cc: Wei Yang, akpm, david, lorenzo.stoakes, ziy, baohua, lance.yang,
linux-mm, linux-kernel, Vernon Yang
On Thu, Dec 18, 2025 at 11:27:24AM +0800, Vernon Yang wrote:
>On Wed, Dec 17, 2025 at 03:31:55AM +0000, Wei Yang wrote:
>> On Mon, Dec 15, 2025 at 05:04:17PM +0800, Vernon Yang wrote:
>> >The following data was traced by bpftrace on a desktop system. After
>> >the system had been left idle for 10 minutes after booting, a lot of
>> >SCAN_PMD_MAPPED or SCAN_PMD_NONE results were observed during a full
>> >scan by khugepaged.
>> >
>> >@scan_pmd_status[1]: 1 ## SCAN_SUCCEED
>> >@scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
>> >@scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
>> >total progress size: 701 MB
>> >Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
>> >
>> >The khugepaged_scan list saves all tasks that support collapse into hugepages;
>> >as long as the take is not destroyed, khugepaged will not remove it from
>> >the khugepaged_scan list. This leads to a situation where a task has already
>> >collapsed all of its memory regions into hugepages, but khugepaged continues
>> >to scan it, which wastes CPU time, and the khugepaged_scan_sleep_millisecs
>> >delay (default 10s) causes a long wait when scanning a large number of such
>> >invalid tasks, so really valid tasks are scanned much later.
>> >
>> >After applying this patch, when all memory is either SCAN_PMD_MAPPED or
>> >SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
>> >list. If the task page-faults or calls MADV_HUGEPAGE again, the mm is
>> >added back to khugepaged.
>>
>> Two things come to my mind:
>>
>> * what happens if we split the huge page under memory pressure?
>
>static unsigned int shrink_folio_list(struct list_head *folio_list,
> struct pglist_data *pgdat, struct scan_control *sc,
> struct reclaim_stat *stat, bool ignore_references,
> struct mem_cgroup *memcg)
>{
> ...
>
> folio = lru_to_folio(folio_list);
>
> ...
>
> references = folio_check_references(folio, sc);
> switch (references) {
> case FOLIOREF_ACTIVATE:
> goto activate_locked;
> case FOLIOREF_KEEP:
> stat->nr_ref_keep += nr_pages;
> goto keep_locked;
> case FOLIOREF_RECLAIM:
> case FOLIOREF_RECLAIM_CLEAN:
> ; /* try to reclaim the folio below */
> }
>
> ...
>
> split_folio_to_list(folio, folio_list);
>}
>
>During memory reclaim above, only inactive folios are split. This also
>implies that the folio is cold, meaning it hasn't been used recently, so
>we do not expect to put the mm back onto the khugepaged scan list to
>continue scanning/collapsing. khugepaged should preferentially scan and
>collapse hot folios to avoid wasting CPU.
>
So we will never put this process back onto the scan list, right?
>> * would this interfere with mTHP collapse?
>
>It has no impact on mTHP collapse; the mm is removed automatically only
>when all memory is either SCAN_PMD_MAPPED or SCAN_PMD_NONE. In all
>other cases it is not removed.
>
>Please let me know if I missed something, thanks!
>
>>
>> >
>> >Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
>> >---
>> > mm/khugepaged.c | 35 +++++++++++++++++++++++++----------
>> > 1 file changed, 25 insertions(+), 10 deletions(-)
>> >
>> >diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> >index 0598a19a98cc..1ec1af5be3c8 100644
>> >--- a/mm/khugepaged.c
>> >+++ b/mm/khugepaged.c
>> >@@ -115,6 +115,7 @@ struct khugepaged_scan {
>> > struct list_head mm_head;
>> > struct mm_slot *mm_slot;
>> > unsigned long address;
>> >+ bool maybe_collapse;
>> > };
>> >
>> > static struct khugepaged_scan khugepaged_scan = {
>> >@@ -1420,22 +1421,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
>> > return result;
>> > }
>> >
>> >-static void collect_mm_slot(struct mm_slot *slot)
>> >+static void collect_mm_slot(struct mm_slot *slot, bool maybe_collapse)
>> > {
>> > struct mm_struct *mm = slot->mm;
>> >
>> > lockdep_assert_held(&khugepaged_mm_lock);
>> >
>> >- if (hpage_collapse_test_exit(mm)) {
>> >+ if (hpage_collapse_test_exit(mm) || !maybe_collapse) {
>> > /* free mm_slot */
>> > hash_del(&slot->hash);
>> > list_del(&slot->mm_node);
>> >
>> >- /*
>> >- * Not strictly needed because the mm exited already.
>> >- *
>> >- * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
>> >- */
>> >+ if (!maybe_collapse)
>> >+ mm_flags_clear(MMF_VM_HUGEPAGE, mm);
>> >
>> > /* khugepaged_mm_lock actually not necessary for the below */
>> > mm_slot_free(mm_slot_cache, slot);
>> >@@ -2397,6 +2395,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
>> > struct mm_slot, mm_node);
>> > khugepaged_scan.address = 0;
>> > khugepaged_scan.mm_slot = slot;
>> >+ khugepaged_scan.maybe_collapse = false;
>> > }
>> > spin_unlock(&khugepaged_mm_lock);
>> >
>> >@@ -2470,8 +2469,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
>> > khugepaged_scan.address, &mmap_locked, cc);
>> > }
>> >
>> >- if (*result == SCAN_SUCCEED)
>> >+ switch (*result) {
>> >+ case SCAN_PMD_NULL:
>> >+ case SCAN_PMD_NONE:
>> >+ case SCAN_PMD_MAPPED:
>> >+ case SCAN_PTE_MAPPED_HUGEPAGE:
>> >+ break;
>> >+ case SCAN_SUCCEED:
>> > ++khugepaged_pages_collapsed;
>> >+ fallthrough;
>>
>> If the collapse succeeds, don't we need to set maybe_collapse to true?
>
>Above "fallthrough" explicitly tells the compiler that when the collapse is
>successful, run below "khugepaged_scan.maybe_collapse = true" :)
>
Got it, thanks.
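(Aside for readers: in the kernel, "fallthrough" is a macro for the
compiler's fall-through attribute, defined in
include/linux/compiler_attributes.h; it documents the intent and silences
-Wimplicit-fallthrough.) A minimal sketch of the effect in the hunk above:

	switch (*result) {
	case SCAN_SUCCEED:
		++khugepaged_pages_collapsed;
		fallthrough;	/* execution continues into the next label */
	default:
		khugepaged_scan.maybe_collapse = true;	/* also runs for SCAN_SUCCEED */
	}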
>> >+ default:
>> >+ khugepaged_scan.maybe_collapse = true;
>> >+ }
>> >
>> > /* move to next address */
>> > khugepaged_scan.address += HPAGE_PMD_SIZE;
>> >@@ -2500,6 +2509,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
>> > * if we scanned all vmas of this mm.
>> > */
>> > if (hpage_collapse_test_exit(mm) || !vma) {
>> >+ bool maybe_collapse = khugepaged_scan.maybe_collapse;
>> >+
>> >+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
>> >+ maybe_collapse = true;
>> >+
>> > /*
>> > * Make sure that if mm_users is reaching zero while
>> > * khugepaged runs here, khugepaged_exit will find
>> >@@ -2508,12 +2522,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
>> > if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
>> > khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
>> > khugepaged_scan.address = 0;
>> >+ khugepaged_scan.maybe_collapse = false;
>> > } else {
>> > khugepaged_scan.mm_slot = NULL;
>> > khugepaged_full_scans++;
>> > }
>> >
>> >- collect_mm_slot(slot);
>> >+ collect_mm_slot(slot, maybe_collapse);
>> > }
>> >
>> > trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
>> >@@ -2616,7 +2631,7 @@ static int khugepaged(void *none)
>> > slot = khugepaged_scan.mm_slot;
>> > khugepaged_scan.mm_slot = NULL;
>> > if (slot)
>> >- collect_mm_slot(slot);
>> >+ collect_mm_slot(slot, true);
>> > spin_unlock(&khugepaged_mm_lock);
>> > return 0;
>> > }
>> >--
>> >2.51.0
>> >
>>
>> --
>> Wei Yang
>> Help you, Help me
>
>--
>Thanks,
>Vernon
--
Wei Yang
Help you, Help me
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-18 3:48 ` Wei Yang
@ 2025-12-18 4:41 ` Vernon Yang
0 siblings, 0 replies; 42+ messages in thread
From: Vernon Yang @ 2025-12-18 4:41 UTC (permalink / raw)
To: Wei Yang
Cc: akpm, david, lorenzo.stoakes, ziy, baohua, lance.yang, linux-mm,
linux-kernel, Vernon Yang
On Thu, Dec 18, 2025 at 03:48:01AM +0000, Wei Yang wrote:
> On Thu, Dec 18, 2025 at 11:27:24AM +0800, Vernon Yang wrote:
> >On Wed, Dec 17, 2025 at 03:31:55AM +0000, Wei Yang wrote:
> >> On Mon, Dec 15, 2025 at 05:04:17PM +0800, Vernon Yang wrote:
> >> >The following data is traced by bpftrace on a desktop system. After
> >> >the system has been left idle for 10 minutes upon booting, a lot of
> >> >SCAN_PMD_MAPPED or SCAN_PMD_NONE are observed during a full scan by
> >> >khugepaged.
> >> >
> >> >@scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> >> >@scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
> >> >@scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
> >> >total progress size: 701 MB
> >> >Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
> >> >
> >> >The khugepaged_scan list saves all tasks that support collapsing into
> >> >hugepages; as long as the task is not destroyed, khugepaged will not
> >> >remove it from the khugepaged_scan list. This leads to a situation where
> >> >a task has already collapsed all of its memory regions into hugepages,
> >> >yet khugepaged keeps scanning it, which wastes CPU time for no benefit,
> >> >and, because of khugepaged_scan_sleep_millisecs (default 10s), scanning
> >> >a large number of such stale tasks delays the scan of tasks that still
> >> >have something to collapse.
> >> >
> >> >After applying this patch, when all memory is either SCAN_PMD_MAPPED or
> >> >SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
> >> >list. If the mm page-faults or calls MADV_HUGEPAGE again, it is added
> >> >back to khugepaged.
> >>
> >> Two things come to my mind:
> >>
> >> * what happens if we split the huge page under memory pressure?
> >
> >static unsigned int shrink_folio_list(struct list_head *folio_list,
> > struct pglist_data *pgdat, struct scan_control *sc,
> > struct reclaim_stat *stat, bool ignore_references,
> > struct mem_cgroup *memcg)
> >{
> > ...
> >
> > folio = lru_to_folio(folio_list);
> >
> > ...
> >
> > references = folio_check_references(folio, sc);
> > switch (references) {
> > case FOLIOREF_ACTIVATE:
> > goto activate_locked;
> > case FOLIOREF_KEEP:
> > stat->nr_ref_keep += nr_pages;
> > goto keep_locked;
> > case FOLIOREF_RECLAIM:
> > case FOLIOREF_RECLAIM_CLEAN:
> > ; /* try to reclaim the folio below */
> > }
> >
> > ...
> >
> > split_folio_to_list(folio, folio_list);
> >}
> >
> >During memory reclaim above, only inactive folios are split. This also
> >implies that the folio is cold, meaning it hasn't been used recently, so
> >we do not expect to put the mm back onto the khugepaged scan list to
> >continue scanning/collapsing. khugepaged should prioritize scanning and
> >collapsing hot folios to avoid wasting CPU.
> >
>
> So we will never put this process back onto the scan list, right?
No. If the task page-faults or calls MADV_HUGEPAGE again, it is added back
to the khugepaged scan list. We just don't actively put it back on the
scan list after a split.
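For reference, a minimal sketch of that re-add path, modelled on
khugepaged_enter_vma() in mm/khugepaged.c as it would look on top of this
series (the exact flag helpers and TVA_* names vary between trees):

	void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags)
	{
		/* Called from the page fault and madvise paths. */
		if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
		    hugepage_pmd_enabled() &&
		    thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
			/* Recreates the mm_slot and queues the mm for scanning. */
			__khugepaged_enter(vma->vm_mm);
	}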
>
> >> * would this interfere with mTHP collapse?
> >
> >It has no impact on mTHP collapse. Only when all memory is either
> >SCAN_PMD_MAPPED or SCAN_PMD_NONE is the mm removed automatically;
> >in other cases it will not be removed.
> >
> >Please let me know if I missed something, thanks!
> >
> >>
> >> >
> >> >Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
> >> >---
> >> > mm/khugepaged.c | 35 +++++++++++++++++++++++++----------
> >> > 1 file changed, 25 insertions(+), 10 deletions(-)
> >> >
> >> >diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >> >index 0598a19a98cc..1ec1af5be3c8 100644
> >> >--- a/mm/khugepaged.c
> >> >+++ b/mm/khugepaged.c
> >> >@@ -115,6 +115,7 @@ struct khugepaged_scan {
> >> > struct list_head mm_head;
> >> > struct mm_slot *mm_slot;
> >> > unsigned long address;
> >> >+ bool maybe_collapse;
> >> > };
> >> >
> >> > static struct khugepaged_scan khugepaged_scan = {
> >> >@@ -1420,22 +1421,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> >> > return result;
> >> > }
> >> >
> >> >-static void collect_mm_slot(struct mm_slot *slot)
> >> >+static void collect_mm_slot(struct mm_slot *slot, bool maybe_collapse)
> >> > {
> >> > struct mm_struct *mm = slot->mm;
> >> >
> >> > lockdep_assert_held(&khugepaged_mm_lock);
> >> >
> >> >- if (hpage_collapse_test_exit(mm)) {
> >> >+ if (hpage_collapse_test_exit(mm) || !maybe_collapse) {
> >> > /* free mm_slot */
> >> > hash_del(&slot->hash);
> >> > list_del(&slot->mm_node);
> >> >
> >> >- /*
> >> >- * Not strictly needed because the mm exited already.
> >> >- *
> >> >- * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
> >> >- */
> >> >+ if (!maybe_collapse)
> >> >+ mm_flags_clear(MMF_VM_HUGEPAGE, mm);
> >> >
> >> > /* khugepaged_mm_lock actually not necessary for the below */
> >> > mm_slot_free(mm_slot_cache, slot);
> >> >@@ -2397,6 +2395,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> >> > struct mm_slot, mm_node);
> >> > khugepaged_scan.address = 0;
> >> > khugepaged_scan.mm_slot = slot;
> >> >+ khugepaged_scan.maybe_collapse = false;
> >> > }
> >> > spin_unlock(&khugepaged_mm_lock);
> >> >
> >> >@@ -2470,8 +2469,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> >> > khugepaged_scan.address, &mmap_locked, cc);
> >> > }
> >> >
> >> >- if (*result == SCAN_SUCCEED)
> >> >+ switch (*result) {
> >> >+ case SCAN_PMD_NULL:
> >> >+ case SCAN_PMD_NONE:
> >> >+ case SCAN_PMD_MAPPED:
> >> >+ case SCAN_PTE_MAPPED_HUGEPAGE:
> >> >+ break;
> >> >+ case SCAN_SUCCEED:
> >> > ++khugepaged_pages_collapsed;
> >> >+ fallthrough;
> >>
> >> If the collapse succeeds, don't we need to set maybe_collapse to true?
> >
> >The "fallthrough" above explicitly tells the compiler that when the
> >collapse succeeds, the "khugepaged_scan.maybe_collapse = true" below also runs :)
> >
>
> Got it, thanks.
>
> >> >+ default:
> >> >+ khugepaged_scan.maybe_collapse = true;
> >> >+ }
> >> >
> >> > /* move to next address */
> >> > khugepaged_scan.address += HPAGE_PMD_SIZE;
> >> >@@ -2500,6 +2509,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> >> > * if we scanned all vmas of this mm.
> >> > */
> >> > if (hpage_collapse_test_exit(mm) || !vma) {
> >> >+ bool maybe_collapse = khugepaged_scan.maybe_collapse;
> >> >+
> >> >+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
> >> >+ maybe_collapse = true;
> >> >+
> >> > /*
> >> > * Make sure that if mm_users is reaching zero while
> >> > * khugepaged runs here, khugepaged_exit will find
> >> >@@ -2508,12 +2522,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> >> > if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
> >> > khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
> >> > khugepaged_scan.address = 0;
> >> >+ khugepaged_scan.maybe_collapse = false;
> >> > } else {
> >> > khugepaged_scan.mm_slot = NULL;
> >> > khugepaged_full_scans++;
> >> > }
> >> >
> >> >- collect_mm_slot(slot);
> >> >+ collect_mm_slot(slot, maybe_collapse);
> >> > }
> >> >
> >> > trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
> >> >@@ -2616,7 +2631,7 @@ static int khugepaged(void *none)
> >> > slot = khugepaged_scan.mm_slot;
> >> > khugepaged_scan.mm_slot = NULL;
> >> > if (slot)
> >> >- collect_mm_slot(slot);
> >> >+ collect_mm_slot(slot, true);
> >> > spin_unlock(&khugepaged_mm_lock);
> >> > return 0;
> >> > }
> >> >--
> >> >2.51.0
> >> >
> >>
> >> --
> >> Wei Yang
> >> Help you, Help me
> >
> >--
> >Thanks,
> >Vernon
>
> --
> Wei Yang
> Help you, Help me
--
Thanks,
Vernon
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-15 9:04 ` [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed Vernon Yang
` (3 preceding siblings ...)
2025-12-17 3:31 ` Wei Yang
@ 2025-12-18 9:29 ` David Hildenbrand (Red Hat)
2025-12-19 5:24 ` Vernon Yang
2025-12-19 8:35 ` Vernon Yang
2025-12-22 19:00 ` kernel test robot
5 siblings, 2 replies; 42+ messages in thread
From: David Hildenbrand (Red Hat) @ 2025-12-18 9:29 UTC (permalink / raw)
To: Vernon Yang, akpm, lorenzo.stoakes
Cc: ziy, npache, baohua, lance.yang, linux-mm, linux-kernel, Vernon Yang
On 12/15/25 10:04, Vernon Yang wrote:
> The following data is traced by bpftrace on a desktop system. After
> the system has been left idle for 10 minutes upon booting, a lot of
> SCAN_PMD_MAPPED or SCAN_PMD_NONE are observed during a full scan by
> khugepaged.
>
> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> @scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
> @scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
> total progress size: 701 MB
> Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
>
> The khugepaged_scan list saves all tasks that support collapsing into
> hugepages; as long as the task is not destroyed, khugepaged will not
> remove it from the khugepaged_scan list. This leads to a situation where
> a task has already collapsed all of its memory regions into hugepages,
> yet khugepaged keeps scanning it, which wastes CPU time for no benefit,
> and, because of khugepaged_scan_sleep_millisecs (default 10s), scanning
> a large number of such stale tasks delays the scan of tasks that still
> have something to collapse.
>
> After applying this patch, when all memory is either SCAN_PMD_MAPPED or
> SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
> list. If the mm page-faults or calls MADV_HUGEPAGE again, it is added
> back to khugepaged.
I don't like that, as it assumes that memory within such a process would
be rather static, which is easily not the case (e.g., allocators just
doing MADV_DONTNEED to free memory).
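(Concretely, the allocator pattern would be something like the following,
where "chunk" is a hypothetical pointer into a still-mapped heap region:

	#include <sys/mman.h>

	/* Give a 2 MiB chunk back without unmapping it: a collapsed PMD
	 * is zapped, and later touches refault base pages that khugepaged
	 * would then want to collapse again. */
	madvise(chunk, 2 * 1024 * 1024, MADV_DONTNEED);

so a region just dismissed as SCAN_PMD_NONE can become collapsible again
at any time.)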
If most stuff is collapsed to PMDs already, can't we just skip over
these regions a bit faster?
--
Cheers
David
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-18 9:29 ` David Hildenbrand (Red Hat)
@ 2025-12-19 5:24 ` Vernon Yang
2025-12-19 9:00 ` David Hildenbrand (Red Hat)
2025-12-19 8:35 ` Vernon Yang
1 sibling, 1 reply; 42+ messages in thread
From: Vernon Yang @ 2025-12-19 5:24 UTC (permalink / raw)
To: David Hildenbrand (Red Hat)
Cc: akpm, lorenzo.stoakes, ziy, baohua, lance.yang, linux-mm,
linux-kernel, Vernon Yang
On Thu, Dec 18, 2025 at 10:29:18AM +0100, David Hildenbrand (Red Hat) wrote:
> On 12/15/25 10:04, Vernon Yang wrote:
> > The following data is traced by bpftrace on a desktop system. After
> > the system has been left idle for 10 minutes upon booting, a lot of
> > SCAN_PMD_MAPPED or SCAN_PMD_NONE are observed during a full scan by
> > khugepaged.
> >
> > @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> > @scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
> > @scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
> > total progress size: 701 MB
> > Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
> >
> > The khugepaged_scan list saves all tasks that support collapsing into
> > hugepages; as long as the task is not destroyed, khugepaged will not
> > remove it from the khugepaged_scan list. This leads to a situation where
> > a task has already collapsed all of its memory regions into hugepages,
> > yet khugepaged keeps scanning it, which wastes CPU time for no benefit,
> > and, because of khugepaged_scan_sleep_millisecs (default 10s), scanning
> > a large number of such stale tasks delays the scan of tasks that still
> > have something to collapse.
> >
> > After applying this patch, when all memory is either SCAN_PMD_MAPPED or
> > SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
> > list. If the mm page-faults or calls MADV_HUGEPAGE again, it is added
> > back to khugepaged.
>
> I don't like that, as it assumes that memory within such a process would be
> rather static, which is easily not the case (e.g., allocators just doing
> MADV_DONTNEED to free memory).
>
> If most stuff is collapsed to PMDs already, can't we just skip over these
> regions a bit faster?
/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly;

The observed phenomenon is that when scanning these regions, the loop is
broken upon reaching khugepaged_pages_to_scan, and therefore khugepaged
enters its 10s sleep. So if we just skip over these regions, it will
break the semantics of khugepaged_pages_to_scan.

I also think this approach is great because it is sufficiently simple.
If we can skip over these regions directly, that's excellent.
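For context, a simplified sketch of the loop in question, modelled on
khugepaged_do_scan() (details vary across versions):

	static void khugepaged_do_scan(struct collapse_control *cc)
	{
		unsigned int progress = 0, pass_through_head = 0;
		unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
		int result = SCAN_SUCCEED;

		while (true) {
			cond_resched();
			if (unlikely(kthread_should_stop()))
				break;

			spin_lock(&khugepaged_mm_lock);
			if (!khugepaged_scan.mm_slot)
				pass_through_head++;
			if (khugepaged_has_work() && pass_through_head < 2)
				progress += khugepaged_scan_mm_slot(pages - progress,
								    &result, cc);
			else
				progress = pages;
			spin_unlock(&khugepaged_mm_lock);

			/* Budget consumed: the caller falls back to
			 * khugepaged_wait_work(), which sleeps for
			 * khugepaged_scan_sleep_millisecs (default 10s). */
			if (progress >= pages)
				break;
		}
	}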
> --
> Cheers
>
> David
--
Thanks,
Vernon
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-19 5:24 ` Vernon Yang
@ 2025-12-19 9:00 ` David Hildenbrand (Red Hat)
0 siblings, 0 replies; 42+ messages in thread
From: David Hildenbrand (Red Hat) @ 2025-12-19 9:00 UTC (permalink / raw)
To: Vernon Yang
Cc: akpm, lorenzo.stoakes, ziy, baohua, lance.yang, linux-mm,
linux-kernel, Vernon Yang
On 12/19/25 06:24, Vernon Yang wrote:
> On Thu, Dec 18, 2025 at 10:29:18AM +0100, David Hildenbrand (Red Hat) wrote:
>> On 12/15/25 10:04, Vernon Yang wrote:
>>> The following data is traced by bpftrace on a desktop system. After
>>> the system has been left idle for 10 minutes upon booting, a lot of
>>> SCAN_PMD_MAPPED or SCAN_PMD_NONE are observed during a full scan by
>>> khugepaged.
>>>
>>> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
>>> @scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
>>> @scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
>>> total progress size: 701 MB
>>> Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
>>>
>>> The khugepaged_scan list saves all tasks that support collapsing into
>>> hugepages; as long as the task is not destroyed, khugepaged will not
>>> remove it from the khugepaged_scan list. This leads to a situation where
>>> a task has already collapsed all of its memory regions into hugepages,
>>> yet khugepaged keeps scanning it, which wastes CPU time for no benefit,
>>> and, because of khugepaged_scan_sleep_millisecs (default 10s), scanning
>>> a large number of such stale tasks delays the scan of tasks that still
>>> have something to collapse.
>>>
>>> After applying this patch, when all memory is either SCAN_PMD_MAPPED or
>>> SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
>>> list. If the mm page-faults or calls MADV_HUGEPAGE again, it is added
>>> back to khugepaged.
>>
>> I don't like that, as it assumes that memory within such a process would be
>> rather static, which is easily not the case (e.g., allocators just doing
>> MADV_DONTNEED to free memory).
>>
>> If most stuff is collapsed to PMDs already, can't we just skip over these
>> regions a bit faster?
>
> /* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 seconds */
> static unsigned int khugepaged_pages_to_scan __read_mostly;
>
> The observed phenomenon is that when scanning these regions, the loop is
> broken upon reaching khugepaged_pages_to_scan, and therefore khugepaged
> enters its 10s sleep.
BTW, the 10s sleep is ridiculous :)
I wonder whether we were more careful in the past regarding scanning
overhead due to the mmap read lock. Nowadays page faults typically use
per-vma locks, so I wonder whether the scanning overhead is still a
problem. (I assume there is more to optimize long-term.)
--
Cheers
David
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-18 9:29 ` David Hildenbrand (Red Hat)
2025-12-19 5:24 ` Vernon Yang
@ 2025-12-19 8:35 ` Vernon Yang
2025-12-19 8:55 ` David Hildenbrand (Red Hat)
2025-12-23 11:18 ` Dev Jain
1 sibling, 2 replies; 42+ messages in thread
From: Vernon Yang @ 2025-12-19 8:35 UTC (permalink / raw)
To: David Hildenbrand (Red Hat)
Cc: akpm, lorenzo.stoakes, ziy, baohua, lance.yang, linux-mm,
linux-kernel, Vernon Yang
On Thu, Dec 18, 2025 at 10:29:18AM +0100, David Hildenbrand (Red Hat) wrote:
> On 12/15/25 10:04, Vernon Yang wrote:
> > The following data is traced by bpftrace on a desktop system. After
> > the system has been left idle for 10 minutes upon booting, a lot of
> > SCAN_PMD_MAPPED or SCAN_PMD_NONE are observed during a full scan by
> > khugepaged.
> >
> > @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> > @scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
> > @scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
> > total progress size: 701 MB
> > Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
> >
> > The khugepaged_scan list saves all tasks that support collapsing into
> > hugepages; as long as the task is not destroyed, khugepaged will not
> > remove it from the khugepaged_scan list. This leads to a situation where
> > a task has already collapsed all of its memory regions into hugepages,
> > yet khugepaged keeps scanning it, which wastes CPU time for no benefit,
> > and, because of khugepaged_scan_sleep_millisecs (default 10s), scanning
> > a large number of such stale tasks delays the scan of tasks that still
> > have something to collapse.
> >
> > After applying this patch, when all memory is either SCAN_PMD_MAPPED or
> > SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
> > list. If the mm page-faults or calls MADV_HUGEPAGE again, it is added
> > back to khugepaged.
>
> I don't like that, as it assumes that memory within such a process would be
> rather static, which is easily not the case (e.g., allocators just doing
> MADV_DONTNEED to free memory).
>
> If most stuff is collapsed to PMDs already, can't we just skip over these
> regions a bit faster?
I had a flash of inspiration and came up with an idea.

If these regions have already been collapsed into hugepages, rechecking
them is very fast. Since khugepaged_pages_to_scan can also represent the
number of VMAs to skip, we can extend its semantics as follows:
/*
* default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
* every 10 seconds.
*/
static unsigned int khugepaged_pages_to_scan __read_mostly;
switch (*result) {
case SCAN_NO_PTE_TABLE:
case SCAN_PMD_MAPPED:
case SCAN_PTE_MAPPED_HUGEPAGE:
progress++; // here
break;
case SCAN_SUCCEED:
++khugepaged_pages_collapsed;
fallthrough;
default:
progress += HPAGE_PMD_NR;
}
This approach achieves our goal. David, do you like it?
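Back-of-the-envelope, using the numbers from the commit message and
assuming the defaults (4K base pages, HPAGE_PMD_NR = 512,
khugepaged_pages_to_scan = 8 * 512 = 4096, 10s sleep): today the
158 + 174 = 332 PMD-mapped/none regions are charged 512 progress units
each, roughly 332 * 512 = 170000 units, which needs about 170000 / 4096
= 42 wakeups, i.e. ~420 seconds of sleeping -- close to the 440 seconds
observed. With the accounting above they cost 1 unit each, so the whole
pass fits within a single wakeup.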
> --
> Cheers
>
> David
--
Thanks,
Vernon
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-19 8:35 ` Vernon Yang
@ 2025-12-19 8:55 ` David Hildenbrand (Red Hat)
2025-12-23 11:18 ` Dev Jain
1 sibling, 0 replies; 42+ messages in thread
From: David Hildenbrand (Red Hat) @ 2025-12-19 8:55 UTC (permalink / raw)
To: Vernon Yang
Cc: akpm, lorenzo.stoakes, ziy, baohua, lance.yang, linux-mm,
linux-kernel, Vernon Yang
On 12/19/25 09:35, Vernon Yang wrote:
> On Thu, Dec 18, 2025 at 10:29:18AM +0100, David Hildenbrand (Red Hat) wrote:
>> On 12/15/25 10:04, Vernon Yang wrote:
>>> The following data is traced by bpftrace on a desktop system. After
>>> the system has been left idle for 10 minutes upon booting, a lot of
>>> SCAN_PMD_MAPPED or SCAN_PMD_NONE are observed during a full scan by
>>> khugepaged.
>>>
>>> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
>>> @scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
>>> @scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
>>> total progress size: 701 MB
>>> Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
>>>
>>> The khugepaged_scan list saves all tasks that support collapsing into
>>> hugepages; as long as the task is not destroyed, khugepaged will not
>>> remove it from the khugepaged_scan list. This leads to a situation where
>>> a task has already collapsed all of its memory regions into hugepages,
>>> yet khugepaged keeps scanning it, which wastes CPU time for no benefit,
>>> and, because of khugepaged_scan_sleep_millisecs (default 10s), scanning
>>> a large number of such stale tasks delays the scan of tasks that still
>>> have something to collapse.
>>>
>>> After applying this patch, when all memory is either SCAN_PMD_MAPPED or
>>> SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
>>> list. If the mm page-faults or calls MADV_HUGEPAGE again, it is added
>>> back to khugepaged.
>>
>> I don't like that, as it assumes that memory within such a process would be
>> rather static, which is easily not the case (e.g., allocators just doing
>> MADV_DONTNEED to free memory).
>>
>> If most stuff is collapsed to PMDs already, can't we just skip over these
>> regions a bit faster?
>
> I had a flash of inspiration and came up with an idea.
>
> If these regions have already been collapsed into hugepages, rechecking
> them is very fast. Since khugepaged_pages_to_scan can also represent the
> number of VMAs to skip, we can extend its semantics as follows:
>
> /*
> * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
> * every 10 seconds.
> */
> static unsigned int khugepaged_pages_to_scan __read_mostly;
>
> switch (*result) {
> case SCAN_NO_PTE_TABLE:
> case SCAN_PMD_MAPPED:
> case SCAN_PTE_MAPPED_HUGEPAGE:
> progress++; // here
> break;
> case SCAN_SUCCEED:
> ++khugepaged_pages_collapsed;
> fallthrough;
> default:
> progress += HPAGE_PMD_NR;
> }
>
> This approach achieves our goal. David, do you like it?
I'd have to see the full patch, but IMHO we should rather focus on
"how many pte/pmd entries did we check" and not "how many PMD areas did
we check".
Maybe there is a history to this, but conceptually I think we wanted to
limit the work we do in one operation to something reasonable. Reading a
single PMD is obviously faster than 512 PTEs.
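To make the distinction concrete, a hypothetical accounting sketch
("nr_entries_checked" does not exist in the patch and would have to be
returned by the scan helpers):

	/* Charge by page-table entries examined, not by PMD areas. */
	if (*result == SCAN_PMD_MAPPED || *result == SCAN_NO_PTE_TABLE)
		progress += 1;			/* one pmd_t read */
	else
		progress += nr_entries_checked;	/* up to HPAGE_PMD_NR pte_t
						 * reads, fewer on early bail */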
--
Cheers
David
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-19 8:35 ` Vernon Yang
2025-12-19 8:55 ` David Hildenbrand (Red Hat)
@ 2025-12-23 11:18 ` Dev Jain
2025-12-25 16:07 ` Vernon Yang
2025-12-29 6:02 ` Vernon Yang
1 sibling, 2 replies; 42+ messages in thread
From: Dev Jain @ 2025-12-23 11:18 UTC (permalink / raw)
To: Vernon Yang, David Hildenbrand (Red Hat)
Cc: akpm, lorenzo.stoakes, ziy, baohua, lance.yang, linux-mm,
linux-kernel, Vernon Yang
On 19/12/25 2:05 pm, Vernon Yang wrote:
> On Thu, Dec 18, 2025 at 10:29:18AM +0100, David Hildenbrand (Red Hat) wrote:
>> On 12/15/25 10:04, Vernon Yang wrote:
>>> The following data is traced by bpftrace on a desktop system. After
>>> the system has been left idle for 10 minutes upon booting, a lot of
>>> SCAN_PMD_MAPPED or SCAN_PMD_NONE are observed during a full scan by
>>> khugepaged.
>>>
>>> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
>>> @scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
>>> @scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
>>> total progress size: 701 MB
>>> Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
>>>
>>> The khugepaged_scan list saves all tasks that support collapsing into
>>> hugepages; as long as the task is not destroyed, khugepaged will not
>>> remove it from the khugepaged_scan list. This leads to a situation where
>>> a task has already collapsed all of its memory regions into hugepages,
>>> yet khugepaged keeps scanning it, which wastes CPU time for no benefit,
>>> and, because of khugepaged_scan_sleep_millisecs (default 10s), scanning
>>> a large number of such stale tasks delays the scan of tasks that still
>>> have something to collapse.
>>>
>>> After applying this patch, when all memory is either SCAN_PMD_MAPPED or
>>> SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
>>> list. If the mm page-faults or calls MADV_HUGEPAGE again, it is added
>>> back to khugepaged.
>> I don't like that, as it assumes that memory within such a process would be
>> rather static, which is easily not the case (e.g., allocators just doing
>> MADV_DONTNEED to free memory).
>>
>> If most stuff is collapsed to PMDs already, can't we just skip over these
>> regions a bit faster?
> I had a flash of inspiration and came up with an idea.
>
> If these regions have already been collapsed into hugepages, rechecking
> them is very fast. Since khugepaged_pages_to_scan can also represent the
> number of VMAs to skip, we can extend its semantics as follows:
>
> /*
> * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
> * every 10 seconds.
> */
> static unsigned int khugepaged_pages_to_scan __read_mostly;
>
> switch (*result) {
> case SCAN_NO_PTE_TABLE:
> case SCAN_PMD_MAPPED:
> case SCAN_PTE_MAPPED_HUGEPAGE:
> progress++; // here
> break;
> case SCAN_SUCCEED:
> ++khugepaged_pages_collapsed;
> fallthrough;
> default:
> progress += HPAGE_PMD_NR;
> }
>
> This approach achieves our goal. David, do you like it?
This looks good. Can you formally test this and see if it comes close to
the optimizations yielded by the current version of the patchset?
>
>> --
>> Cheers
>>
>> David
> --
> Thanks,
> Vernon
>
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-23 11:18 ` Dev Jain
@ 2025-12-25 16:07 ` Vernon Yang
2025-12-29 6:02 ` Vernon Yang
1 sibling, 0 replies; 42+ messages in thread
From: Vernon Yang @ 2025-12-25 16:07 UTC (permalink / raw)
To: Dev Jain
Cc: David Hildenbrand (Red Hat),
akpm, lorenzo.stoakes, ziy, baohua, lance.yang, linux-mm,
linux-kernel, Vernon Yang
On Tue, Dec 23, 2025 at 04:48:57PM +0530, Dev Jain wrote:
>
> On 19/12/25 2:05 pm, Vernon Yang wrote:
> > On Thu, Dec 18, 2025 at 10:29:18AM +0100, David Hildenbrand (Red Hat) wrote:
> >> On 12/15/25 10:04, Vernon Yang wrote:
> >>> The following data is traced by bpftrace on a desktop system. After
> >>> the system has been left idle for 10 minutes upon booting, a lot of
> >>> SCAN_PMD_MAPPED or SCAN_PMD_NONE are observed during a full scan by
> >>> khugepaged.
> >>>
> >>> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> >>> @scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
> >>> @scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
> >>> total progress size: 701 MB
> >>> Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
> >>>
> >>> The khugepaged_scan list saves all tasks that support collapsing into
> >>> hugepages; as long as the task is not destroyed, khugepaged will not
> >>> remove it from the khugepaged_scan list. This leads to a situation where
> >>> a task has already collapsed all of its memory regions into hugepages,
> >>> yet khugepaged keeps scanning it, which wastes CPU time for no benefit,
> >>> and, because of khugepaged_scan_sleep_millisecs (default 10s), scanning
> >>> a large number of such stale tasks delays the scan of tasks that still
> >>> have something to collapse.
> >>>
> >>> After applying this patch, when all memory is either SCAN_PMD_MAPPED or
> >>> SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
> >>> list. If the mm page-faults or calls MADV_HUGEPAGE again, it is added
> >>> back to khugepaged.
> >> I don't like that, as it assumes that memory within such a process would be
> >> rather static, which is easily not the case (e.g., allocators just doing
> >> MADV_DONTNEED to free memory).
> >>
> >> If most stuff is collapsed to PMDs already, can't we just skip over these
> >> regions a bit faster?
> > I had a flash of inspiration and came up with an idea.
> >
> > If these regions have already been collapsed into hugepages, rechecking
> > them is very fast. Since khugepaged_pages_to_scan can also represent the
> > number of VMAs to skip, we can extend its semantics as follows:
> >
> > /*
> > * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
> > * every 10 seconds.
> > */
> > static unsigned int khugepaged_pages_to_scan __read_mostly;
> >
> > switch (*result) {
> > case SCAN_NO_PTE_TABLE:
> > case SCAN_PMD_MAPPED:
> > case SCAN_PTE_MAPPED_HUGEPAGE:
> > progress++; // here
> > break;
> > case SCAN_SUCCEED:
> > ++khugepaged_pages_collapsed;
> > fallthrough;
> > default:
> > progress += HPAGE_PMD_NR;
> > }
> >
> > This approach achieves our goal. David, do you like it?
>
> This looks good. Can you formally test this and see if it comes close to
> the optimizations yielded by the current version of the patchset?
Both approaches achieve the goal of reducing full-scan time; I tested
that previously.

As for the performance numbers, I will test it formally.
--
Merry Christmas,
Vernon
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-23 11:18 ` Dev Jain
2025-12-25 16:07 ` Vernon Yang
@ 2025-12-29 6:02 ` Vernon Yang
1 sibling, 0 replies; 42+ messages in thread
From: Vernon Yang @ 2025-12-29 6:02 UTC (permalink / raw)
To: Dev Jain
Cc: David Hildenbrand (Red Hat),
akpm, lorenzo.stoakes, ziy, baohua, lance.yang, linux-mm,
linux-kernel, Vernon Yang
On Tue, Dec 23, 2025 at 04:48:57PM +0530, Dev Jain wrote:
>
> On 19/12/25 2:05 pm, Vernon Yang wrote:
> > On Thu, Dec 18, 2025 at 10:29:18AM +0100, David Hildenbrand (Red Hat) wrote:
> >> On 12/15/25 10:04, Vernon Yang wrote:
> >>> The following data is traced by bpftrace on a desktop system. After
> >>> the system has been left idle for 10 minutes upon booting, a lot of
> >>> SCAN_PMD_MAPPED or SCAN_PMD_NONE are observed during a full scan by
> >>> khugepaged.
> >>>
> >>> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> >>> @scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
> >>> @scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
> >>> total progress size: 701 MB
> >>> Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
> >>>
> >>> The khugepaged_scan list saves all tasks that support collapsing into
> >>> hugepages; as long as the task is not destroyed, khugepaged will not
> >>> remove it from the khugepaged_scan list. This leads to a situation where
> >>> a task has already collapsed all of its memory regions into hugepages,
> >>> yet khugepaged keeps scanning it, which wastes CPU time for no benefit,
> >>> and, because of khugepaged_scan_sleep_millisecs (default 10s), scanning
> >>> a large number of such stale tasks delays the scan of tasks that still
> >>> have something to collapse.
> >>>
> >>> After applying this patch, when all memory is either SCAN_PMD_MAPPED or
> >>> SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
> >>> list. If the mm page-faults or calls MADV_HUGEPAGE again, it is added
> >>> back to khugepaged.
> >> I don't like that, as it assumes that memory within such a process would be
> >> rather static, which is easily not the case (e.g., allocators just doing
> >> MADV_DONTNEED to free memory).
> >>
> >> If most stuff is collapsed to PMDs already, can't we just skip over these
> >> regions a bit faster?
> > I had a flash of inspiration and came up with an idea.
> >
> > If these regions have already been collapsed into hugepages, rechecking
> > them is very fast. Since khugepaged_pages_to_scan can also represent the
> > number of VMAs to skip, we can extend its semantics as follows:
> >
> > /*
> > * default scan 8*HPAGE_PMD_NR ptes, pmd_mapped, no_pte_table or vmas
> > * every 10 seconds.
> > */
> > static unsigned int khugepaged_pages_to_scan __read_mostly;
> >
> > switch (*result) {
> > case SCAN_NO_PTE_TABLE:
> > case SCAN_PMD_MAPPED:
> > case SCAN_PTE_MAPPED_HUGEPAGE:
> > progress++; // here
> > break;
> > case SCAN_SUCCEED:
> > ++khugepaged_pages_collapsed;
> > fallthrough;
> > default:
> > progress += HPAGE_PMD_NR;
> > }
> >
> > This approach achieves our goal. David, do you like it?
>
> This looks good. Can you formally test this and see if it comes close to
> the optimizations yielded by the current version of the patchset?
Either approach yields the same performance. For detailed data, see v2[1].
[1] https://lore.kernel.org/linux-mm/20251229055151.54887-1-yanglincheng@kylinos.cn/
--
Thanks,
Vernon
* Re: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
2025-12-15 9:04 ` [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed Vernon Yang
` (4 preceding siblings ...)
2025-12-18 9:29 ` David Hildenbrand (Red Hat)
@ 2025-12-22 19:00 ` kernel test robot
5 siblings, 0 replies; 42+ messages in thread
From: kernel test robot @ 2025-12-22 19:00 UTC (permalink / raw)
To: Vernon Yang, akpm, david, lorenzo.stoakes
Cc: oe-kbuild-all, ziy, npache, baohua, lance.yang, linux-mm,
linux-kernel, Vernon Yang
Hi Vernon,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.19-rc2 next-20251219]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Vernon-Yang/mm-khugepaged-add-trace_mm_khugepaged_scan-event/20251215-171046
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20251215090419.174418-3-yanglincheng%40kylinos.cn
patch subject: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
config: x86_64-rhel-9.4 (https://download.01.org/0day-ci/archive/20251222/202512221928.EnLvUgqT-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251222/202512221928.EnLvUgqT-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512221928.EnLvUgqT-lkp@intel.com/
All errors (new ones prefixed by >>):
mm/khugepaged.c: In function 'khugepaged_scan_mm_slot':
>> mm/khugepaged.c:2490:30: error: 'SCAN_PMD_NULL' undeclared (first use in this function); did you mean 'SCAN_VMA_NULL'?
2490 | case SCAN_PMD_NULL:
| ^~~~~~~~~~~~~
| SCAN_VMA_NULL
mm/khugepaged.c:2490:30: note: each undeclared identifier is reported only once for each function it appears in
>> mm/khugepaged.c:2491:30: error: 'SCAN_PMD_NONE' undeclared (first use in this function)
2491 | case SCAN_PMD_NONE:
| ^~~~~~~~~~~~~
vim +2490 mm/khugepaged.c
2392
2393 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
2394 struct collapse_control *cc)
2395 __releases(&khugepaged_mm_lock)
2396 __acquires(&khugepaged_mm_lock)
2397 {
2398 struct vma_iterator vmi;
2399 struct mm_slot *slot;
2400 struct mm_struct *mm;
2401 struct vm_area_struct *vma;
2402 int progress = 0;
2403
2404 VM_BUG_ON(!pages);
2405 lockdep_assert_held(&khugepaged_mm_lock);
2406 *result = SCAN_FAIL;
2407
2408 if (khugepaged_scan.mm_slot) {
2409 slot = khugepaged_scan.mm_slot;
2410 } else {
2411 slot = list_first_entry(&khugepaged_scan.mm_head,
2412 struct mm_slot, mm_node);
2413 khugepaged_scan.address = 0;
2414 khugepaged_scan.mm_slot = slot;
2415 khugepaged_scan.maybe_collapse = false;
2416 }
2417 spin_unlock(&khugepaged_mm_lock);
2418
2419 mm = slot->mm;
2420 /*
2421 * Don't wait for semaphore (to avoid long wait times). Just move to
2422 * the next mm on the list.
2423 */
2424 vma = NULL;
2425 if (unlikely(!mmap_read_trylock(mm)))
2426 goto breakouterloop_mmap_lock;
2427
2428 progress++;
2429 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
2430 goto breakouterloop;
2431
2432 vma_iter_init(&vmi, mm, khugepaged_scan.address);
2433 for_each_vma(vmi, vma) {
2434 unsigned long hstart, hend;
2435
2436 cond_resched();
2437 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
2438 progress++;
2439 break;
2440 }
2441 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
2442 skip:
2443 progress++;
2444 continue;
2445 }
2446 hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
2447 hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
2448 if (khugepaged_scan.address > hend)
2449 goto skip;
2450 if (khugepaged_scan.address < hstart)
2451 khugepaged_scan.address = hstart;
2452 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2453
2454 while (khugepaged_scan.address < hend) {
2455 bool mmap_locked = true;
2456
2457 cond_resched();
2458 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
2459 goto breakouterloop;
2460
2461 VM_BUG_ON(khugepaged_scan.address < hstart ||
2462 khugepaged_scan.address + HPAGE_PMD_SIZE >
2463 hend);
2464 if (!vma_is_anonymous(vma)) {
2465 struct file *file = get_file(vma->vm_file);
2466 pgoff_t pgoff = linear_page_index(vma,
2467 khugepaged_scan.address);
2468
2469 mmap_read_unlock(mm);
2470 mmap_locked = false;
2471 *result = hpage_collapse_scan_file(mm,
2472 khugepaged_scan.address, file, pgoff, cc);
2473 fput(file);
2474 if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
2475 mmap_read_lock(mm);
2476 if (hpage_collapse_test_exit_or_disable(mm))
2477 goto breakouterloop;
2478 *result = collapse_pte_mapped_thp(mm,
2479 khugepaged_scan.address, false);
2480 if (*result == SCAN_PMD_MAPPED)
2481 *result = SCAN_SUCCEED;
2482 mmap_read_unlock(mm);
2483 }
2484 } else {
2485 *result = hpage_collapse_scan_pmd(mm, vma,
2486 khugepaged_scan.address, &mmap_locked, cc);
2487 }
2488
2489 switch (*result) {
> 2490 case SCAN_PMD_NULL:
> 2491 case SCAN_PMD_NONE:
2492 case SCAN_PMD_MAPPED:
2493 case SCAN_PTE_MAPPED_HUGEPAGE:
2494 break;
2495 case SCAN_SUCCEED:
2496 ++khugepaged_pages_collapsed;
2497 fallthrough;
2498 default:
2499 khugepaged_scan.maybe_collapse = true;
2500 }
2501
2502 /* move to next address */
2503 khugepaged_scan.address += HPAGE_PMD_SIZE;
2504 progress += HPAGE_PMD_NR;
2505 if (!mmap_locked)
2506 /*
2507 * We released mmap_lock so break loop. Note
2508 * that we drop mmap_lock before all hugepage
2509 * allocations, so if allocation fails, we are
2510 * guaranteed to break here and report the
2511 * correct result back to caller.
2512 */
2513 goto breakouterloop_mmap_lock;
2514 if (progress >= pages)
2515 goto breakouterloop;
2516 }
2517 }
2518 breakouterloop:
2519 mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
2520 breakouterloop_mmap_lock:
2521
2522 spin_lock(&khugepaged_mm_lock);
2523 VM_BUG_ON(khugepaged_scan.mm_slot != slot);
2524 /*
2525 * Release the current mm_slot if this mm is about to die, or
2526 * if we scanned all vmas of this mm.
2527 */
2528 if (hpage_collapse_test_exit(mm) || !vma) {
2529 bool maybe_collapse = khugepaged_scan.maybe_collapse;
2530
2531 if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
2532 maybe_collapse = true;
2533
2534 /*
2535 * Make sure that if mm_users is reaching zero while
2536 * khugepaged runs here, khugepaged_exit will find
2537 * mm_slot not pointing to the exiting mm.
2538 */
2539 if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
2540 khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
2541 khugepaged_scan.address = 0;
2542 khugepaged_scan.maybe_collapse = false;
2543 } else {
2544 khugepaged_scan.mm_slot = NULL;
2545 khugepaged_full_scans++;
2546 }
2547
2548 collect_mm_slot(slot, maybe_collapse);
2549 }
2550
2551 trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
2552
2553 return progress;
2554 }
2555
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki