* [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
@ 2025-10-14 15:11 Pedro Demarchi Gomes
2025-10-14 15:59 ` David Hildenbrand
` (3 more replies)
0 siblings, 4 replies; 6+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-14 15:11 UTC (permalink / raw)
To: Andrew Morton, David Hildenbrand
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel, Pedro Demarchi Gomes
Currently, scan_get_next_rmap_item() walks every page address in a VMA
to locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions.
This patch replaces the per-address lookup with a range walk using
walk_page_range(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups.
This problem was previously discussed in [1].
Changes since v1 [2]:
- Use pmd_entry to walk page range
- Use cond_resched inside pmd_entry()
- walk_page_range returns page+folio
[1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
[2] https://lore.kernel.org/linux-mm/20251014055828.124522-1-pedrodemargomes@gmail.com/
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
mm/ksm.c | 144 ++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 94 insertions(+), 50 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c
index 3aed0478fdce..adb0267a1b7d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,14 +2455,82 @@ static bool should_skip_rmap_item(struct folio *folio,
return true;
}
+struct ksm_walk_private {
+ struct page *page;
+ struct folio *folio;
+ struct vm_area_struct *vma;
+};
+
+static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+
+ if (!vma->anon_vma || !(vma->vm_flags & VM_MERGEABLE)) {
+ ksm_scan.address = vma->vm_end;
+ return 1;
+ }
+ return 0;
+}
+
+static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct mm_struct *mm = walk->mm;
+ struct vm_area_struct *vma = walk->vma;
+ struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
+ struct folio *folio;
+ pte_t *start_pte, *pte, ptent;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte) {
+ ksm_scan.address = end;
+ return 0;
+ }
+
+ for (; addr < end; pte++, addr += PAGE_SIZE) {
+ ptent = ptep_get(pte);
+ struct page *page = vm_normal_page(vma, addr, ptent);
+ ksm_scan.address = addr;
+
+ if (ksm_test_exit(mm)) {
+ ret = 1;
+ break;
+ }
+
+ if (!page)
+ continue;
+
+ folio = page_folio(page);
+ if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+ continue;
+
+ ret = 1;
+ folio_get(folio);
+ private->page = page;
+ private->folio = folio;
+ private->vma = vma;
+ break;
+ }
+ pte_unmap_unlock(start_pte, ptl);
+
+ cond_resched();
+ return ret;
+}
+
+struct mm_walk_ops walk_ops = {
+ .pmd_entry = ksm_pmd_entry,
+ .test_walk = ksm_walk_test,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
struct ksm_mm_slot *mm_slot;
struct mm_slot *slot;
- struct vm_area_struct *vma;
struct ksm_rmap_item *rmap_item;
- struct vma_iterator vmi;
int nid;
if (list_empty(&ksm_mm_head.slot.mm_node))
@@ -2527,64 +2595,40 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
slot = &mm_slot->slot;
mm = slot->mm;
- vma_iter_init(&vmi, mm, ksm_scan.address);
mmap_read_lock(mm);
if (ksm_test_exit(mm))
goto no_vmas;
- for_each_vma(vmi, vma) {
- if (!(vma->vm_flags & VM_MERGEABLE))
- continue;
- if (ksm_scan.address < vma->vm_start)
- ksm_scan.address = vma->vm_start;
- if (!vma->anon_vma)
- ksm_scan.address = vma->vm_end;
-
- while (ksm_scan.address < vma->vm_end) {
- struct page *tmp_page = NULL;
- struct folio_walk fw;
- struct folio *folio;
+get_page:
+ struct ksm_walk_private walk_private = {
+ .page = NULL,
+ .folio = NULL,
+ .vma = NULL
+ };
- if (ksm_test_exit(mm))
- break;
+ walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
+ if (walk_private.page) {
+ flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
+ flush_dcache_page(walk_private.page);
+ rmap_item = get_next_rmap_item(mm_slot,
+ ksm_scan.rmap_list, ksm_scan.address);
+ if (rmap_item) {
+ ksm_scan.rmap_list =
+ &rmap_item->rmap_list;
- folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
- if (folio) {
- if (!folio_is_zone_device(folio) &&
- folio_test_anon(folio)) {
- folio_get(folio);
- tmp_page = fw.page;
- }
- folio_walk_end(&fw, vma);
+ ksm_scan.address += PAGE_SIZE;
+ if (should_skip_rmap_item(walk_private.folio, rmap_item)) {
+ folio_put(walk_private.folio);
+ goto get_page;
}
- if (tmp_page) {
- flush_anon_page(vma, tmp_page, ksm_scan.address);
- flush_dcache_page(tmp_page);
- rmap_item = get_next_rmap_item(mm_slot,
- ksm_scan.rmap_list, ksm_scan.address);
- if (rmap_item) {
- ksm_scan.rmap_list =
- &rmap_item->rmap_list;
-
- if (should_skip_rmap_item(folio, rmap_item)) {
- folio_put(folio);
- goto next_page;
- }
-
- ksm_scan.address += PAGE_SIZE;
- *page = tmp_page;
- } else {
- folio_put(folio);
- }
- mmap_read_unlock(mm);
- return rmap_item;
- }
-next_page:
- ksm_scan.address += PAGE_SIZE;
- cond_resched();
+ *page = walk_private.page;
+ } else {
+ folio_put(walk_private.folio);
}
+ mmap_read_unlock(mm);
+ return rmap_item;
}
if (ksm_test_exit(mm)) {
--
2.43.0
* Re: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
2025-10-14 15:11 [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
@ 2025-10-14 15:59 ` David Hildenbrand
2025-10-14 21:57 ` Pedro Demarchi Gomes
2025-10-15 3:53 ` kernel test robot
` (2 subsequent siblings)
3 siblings, 1 reply; 6+ messages in thread
From: David Hildenbrand @ 2025-10-14 15:59 UTC (permalink / raw)
To: Pedro Demarchi Gomes, Andrew Morton
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel
On 14.10.25 17:11, Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
>
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
>
> Changes since v1 [2]:
> - Use pmd_entry to walk page range
> - Use cond_resched inside pmd_entry()
> - walk_page_range returns page+folio
>
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> [2] https://lore.kernel.org/linux-mm/20251014055828.124522-1-pedrodemargomes@gmail.com/
>
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> ---
[...]
> +
> +static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
> + unsigned long end, struct mm_walk *walk)
> +{
> + struct mm_struct *mm = walk->mm;
> + struct vm_area_struct *vma = walk->vma;
> + struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
> + struct folio *folio;
> + pte_t *start_pte, *pte, ptent;
> + spinlock_t *ptl;
> + int ret = 0;
> +
> + start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> + if (!start_pte) {
> + ksm_scan.address = end;
> + return 0;
> + }
Please take more time to understand the details. If there is a THP there
you actually have to find the relevant page.
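For readers following the thread, here is a rough sketch of how the THP case might be folded into ksm_pmd_entry() before falling back to the PTE loop quoted above. It is only an illustration, not the posted patch: the helper name is made up, pmd_trans_huge_lock(), pmdp_get() and vm_normal_page_pmd() are existing kernel helpers, and the folio checks simply mirror the PTE path.
static int ksm_pmd_entry_thp_sketch(pmd_t *pmd, unsigned long addr,
				    unsigned long end, struct mm_walk *walk)
{
	struct ksm_walk_private *private = walk->private;
	struct vm_area_struct *vma = walk->vma;
	struct page *page = NULL;
	struct folio *folio;
	spinlock_t *ptl;
	pmd_t pmde;
	int ret = 0;

	ptl = pmd_trans_huge_lock(pmd, vma);	/* non-NULL iff *pmd is huge */
	if (!ptl)
		return 0;	/* regular page table: run the PTE loop from the patch */

	pmde = pmdp_get(pmd);
	if (pmd_present(pmde))
		page = vm_normal_page_pmd(vma, addr, pmde);

	if (page) {
		folio = page_folio(page);
		if (!folio_is_zone_device(folio) && folio_test_anon(folio)) {
			/* select the subpage of the THP that maps addr */
			page += (addr & ~PMD_MASK) >> PAGE_SHIFT;
			folio_get(folio);
			private->page = page;
			private->folio = folio;
			private->vma = vma;
			ret = 1;
		}
	}
	spin_unlock(ptl);
	return ret;
}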
> +
> + for (; addr < end; pte++, addr += PAGE_SIZE) {
> + ptent = ptep_get(pte);
> + struct page *page = vm_normal_page(vma, addr, ptent);
> + ksm_scan.address = addr;
Updating that value from in here is a bit nasty. I wonder if you should
rather make the function also return the address of the found page as well.
In the caller, if we don't find any page, there is no need to update the
address from this function I guess. We iterated the complete MM space in
that case.
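A rough sketch of that direction (illustrative only; the extra field and the small helper below are invented for the example, everything else reuses the names from the patch):
struct ksm_walk_private {
	struct page *page;
	struct folio *folio;
	struct vm_area_struct *vma;
	unsigned long addr;	/* address at which page was found */
};

/* ksm_pmd_entry() would record the address together with the page ... */
static void ksm_walk_record_hit(struct ksm_walk_private *private,
				struct vm_area_struct *vma,
				struct page *page, struct folio *folio,
				unsigned long addr)
{
	folio_get(folio);
	private->page = page;
	private->folio = folio;
	private->vma = vma;
	private->addr = addr;
}

/*
 * ... and scan_get_next_rmap_item() would only advance on success,
 * leaving ksm_scan.address untouched when the walk found nothing:
 *
 *	if (walk_private.page)
 *		ksm_scan.address = walk_private.addr;
 */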
> +
> + if (ksm_test_exit(mm)) {
> + ret = 1;
> + break;
> + }
> +
> + if (!page)
> + continue;
> +
> + folio = page_folio(page);
> + if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> + continue;
> +
> + ret = 1;
> + folio_get(folio);
> + private->page = page;
> + private->folio = folio;
> + private->vma = vma;
> + break;
> + }
> + pte_unmap_unlock(start_pte, ptl);
> +
> + cond_resched();
> + return ret;
> +}
> +
> +struct mm_walk_ops walk_ops = {
> + .pmd_entry = ksm_pmd_entry,
> + .test_walk = ksm_walk_test,
> + .walk_lock = PGWALK_RDLOCK,
> +};
> +
> static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> {
> struct mm_struct *mm;
> struct ksm_mm_slot *mm_slot;
> struct mm_slot *slot;
> - struct vm_area_struct *vma;
> struct ksm_rmap_item *rmap_item;
> - struct vma_iterator vmi;
> int nid;
>
> if (list_empty(&ksm_mm_head.slot.mm_node))
> @@ -2527,64 +2595,40 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>
> slot = &mm_slot->slot;
> mm = slot->mm;
> - vma_iter_init(&vmi, mm, ksm_scan.address);
>
> mmap_read_lock(mm);
> if (ksm_test_exit(mm))
> goto no_vmas;
>
> - for_each_vma(vmi, vma) {
> - if (!(vma->vm_flags & VM_MERGEABLE))
> - continue;
> - if (ksm_scan.address < vma->vm_start)
> - ksm_scan.address = vma->vm_start;
> - if (!vma->anon_vma)
> - ksm_scan.address = vma->vm_end;
> -
> - while (ksm_scan.address < vma->vm_end) {
> - struct page *tmp_page = NULL;
> - struct folio_walk fw;
> - struct folio *folio;
> +get_page:
> + struct ksm_walk_private walk_private = {
> + .page = NULL,
> + .folio = NULL,
> + .vma = NULL
> + };
>
> - if (ksm_test_exit(mm))
> - break;
> + walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
> + if (walk_private.page) {
> + flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
> + flush_dcache_page(walk_private.page);
Keep working on the folio please.
> + rmap_item = get_next_rmap_item(mm_slot,
> + ksm_scan.rmap_list, ksm_scan.address);
> + if (rmap_item) {
> + ksm_scan.rmap_list =
> + &rmap_item->rmap_list;
>
> - folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> - if (folio) {
> - if (!folio_is_zone_device(folio) &&
> - folio_test_anon(folio)) {
> - folio_get(folio);
> - tmp_page = fw.page;
> - }
> - folio_walk_end(&fw, vma);
> + ksm_scan.address += PAGE_SIZE;
> + if (should_skip_rmap_item(walk_private.folio, rmap_item)) {
> + folio_put(walk_private.folio);
> + goto get_page;
Can you make that a while() loop to avoid the label?
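A possible shape, sketched from the hunk above (untested; it keeps the v2 page/folio handling as-is and only removes the label, with walk_private scoped inside the loop):
	while (true) {
		struct ksm_walk_private walk_private = {};

		walk_page_range(mm, ksm_scan.address, -1, &walk_ops, &walk_private);
		if (!walk_private.page)
			break;		/* no more candidates in this mm */

		flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
		flush_dcache_page(walk_private.page);
		rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list,
					       ksm_scan.address);
		if (rmap_item) {
			ksm_scan.rmap_list = &rmap_item->rmap_list;
			ksm_scan.address += PAGE_SIZE;
			if (should_skip_rmap_item(walk_private.folio, rmap_item)) {
				folio_put(walk_private.folio);
				continue;	/* drop this candidate, walk on */
			}
			*page = walk_private.page;
		} else {
			folio_put(walk_private.folio);
		}
		mmap_read_unlock(mm);
		return rmap_item;
	}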
--
Cheers
David / dhildenb
* Re: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
2025-10-14 15:59 ` David Hildenbrand
@ 2025-10-14 21:57 ` Pedro Demarchi Gomes
0 siblings, 0 replies; 6+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-14 21:57 UTC (permalink / raw)
To: David Hildenbrand, Andrew Morton
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel
On 10/14/25 12:59, David Hildenbrand wrote:
> On 14.10.25 17:11, Pedro Demarchi Gomes wrote:
>> Currently, scan_get_next_rmap_item() walks every page address in a VMA
>> to locate mergeable pages. This becomes highly inefficient when scanning
>> large virtual memory areas that contain mostly unmapped regions.
>>
>> This patch replaces the per-address lookup with a range walk using
>> walk_page_range(). The range walker allows KSM to skip over entire
>> unmapped holes in a VMA, avoiding unnecessary lookups.
>> This problem was previously discussed in [1].
>>
>> Changes since v1 [2]:
>> - Use pmd_entry to walk page range
>> - Use cond_resched inside pmd_entry()
>> - walk_page_range returns page+folio
>>
>> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
>> [2] https://lore.kernel.org/linux-mm/20251014055828.124522-1-pedrodemargomes@gmail.com/
>>
>> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
>> ---
>
> [...]
>
>> +
>> +static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
>> + unsigned long end, struct mm_walk *walk)
>> +{
>> + struct mm_struct *mm = walk->mm;
>> + struct vm_area_struct *vma = walk->vma;
>> + struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
>> + struct folio *folio;
>> + pte_t *start_pte, *pte, ptent;
>> + spinlock_t *ptl;
>> + int ret = 0;
>> +
>> + start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
>> + if (!start_pte) {
>> + ksm_scan.address = end;
>> + return 0;
>> + }
>
> Please take more time to understand the details. If there is a THP there
> you actually have to find the relevant page.
>
Ok
>> +
>> + for (; addr < end; pte++, addr += PAGE_SIZE) {
>> + ptent = ptep_get(pte);
>> + struct page *page = vm_normal_page(vma, addr, ptent);
>> + ksm_scan.address = addr;
>
> Updating that value from in here is a bit nasty. I wonder if you should
> rather make the function also return the address of the found page as well.
>
> In the caller, if we don't find any page, there is no need to update the
> address from this function I guess. We iterated the complete MM space in
> that case.
>
Ok
>> +
>> + if (ksm_test_exit(mm)) {
>> + ret = 1;
>> + break;
>> + }
>> +
>> + if (!page)
>> + continue;
>> +
>> + folio = page_folio(page);
>> + if (folio_is_zone_device(folio) || !folio_test_anon(folio))
>> + continue;
>> +
>> + ret = 1;
>> + folio_get(folio);
>> + private->page = page;
>> + private->folio = folio;
>> + private->vma = vma;
>> + break;
>> + }
>> + pte_unmap_unlock(start_pte, ptl);
>> +
>> + cond_resched();
>> + return ret;
>> +}
>> +
>> +struct mm_walk_ops walk_ops = {
>> + .pmd_entry = ksm_pmd_entry,
>> + .test_walk = ksm_walk_test,
>> + .walk_lock = PGWALK_RDLOCK,
>> +};
>> +
>> static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>> {
>> struct mm_struct *mm;
>> struct ksm_mm_slot *mm_slot;
>> struct mm_slot *slot;
>> - struct vm_area_struct *vma;
>> struct ksm_rmap_item *rmap_item;
>> - struct vma_iterator vmi;
>> int nid;
>> if (list_empty(&ksm_mm_head.slot.mm_node))
>> @@ -2527,64 +2595,40 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>> slot = &mm_slot->slot;
>> mm = slot->mm;
>> - vma_iter_init(&vmi, mm, ksm_scan.address);
>> mmap_read_lock(mm);
>> if (ksm_test_exit(mm))
>> goto no_vmas;
>> - for_each_vma(vmi, vma) {
>> - if (!(vma->vm_flags & VM_MERGEABLE))
>> - continue;
>> - if (ksm_scan.address < vma->vm_start)
>> - ksm_scan.address = vma->vm_start;
>> - if (!vma->anon_vma)
>> - ksm_scan.address = vma->vm_end;
>> -
>> - while (ksm_scan.address < vma->vm_end) {
>> - struct page *tmp_page = NULL;
>> - struct folio_walk fw;
>> - struct folio *folio;
>> +get_page:
>> + struct ksm_walk_private walk_private = {
>> + .page = NULL,
>> + .folio = NULL,
>> + .vma = NULL
>> + };
>> - if (ksm_test_exit(mm))
>> - break;
>> + walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
>> + if (walk_private.page) {
>> + flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
>> + flush_dcache_page(walk_private.page);
>
> Keep working on the folio please.
>
Ok
>> + rmap_item = get_next_rmap_item(mm_slot,
>> + ksm_scan.rmap_list, ksm_scan.address);
>> + if (rmap_item) {
>> + ksm_scan.rmap_list =
>> + &rmap_item->rmap_list;
>> - folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
>> - if (folio) {
>> - if (!folio_is_zone_device(folio) &&
>> - folio_test_anon(folio)) {
>> - folio_get(folio);
>> - tmp_page = fw.page;
>> - }
>> - folio_walk_end(&fw, vma);
>> + ksm_scan.address += PAGE_SIZE;
>> + if (should_skip_rmap_item(walk_private.folio, rmap_item)) {
>> + folio_put(walk_private.folio);
>> + goto get_page;
>
> Can you make that a while() loop to avoid the label?
>
Ok, I will make these corrections and send a v3. Thanks!
* Re: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
2025-10-14 15:11 [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
2025-10-14 15:59 ` David Hildenbrand
@ 2025-10-15 3:53 ` kernel test robot
2025-10-15 5:46 ` kernel test robot
2025-10-15 12:22 ` David Hildenbrand
3 siblings, 0 replies; 6+ messages in thread
From: kernel test robot @ 2025-10-15 3:53 UTC (permalink / raw)
To: Pedro Demarchi Gomes, Andrew Morton, David Hildenbrand
Cc: oe-kbuild-all, Linux Memory Management List, Xu Xin,
Chengming Zhou, linux-kernel, Pedro Demarchi Gomes
Hi Pedro,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.18-rc1 next-20251014]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Pedro-Demarchi-Gomes/ksm-use-range-walk-function-to-jump-over-holes-in-scan_get_next_rmap_item/20251014-231721
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20251014151126.87589-1-pedrodemargomes%40gmail.com
patch subject: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
config: m68k-randconfig-r071-20251015 (https://download.01.org/0day-ci/archive/20251015/202510151108.UqsNiDSP-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 8.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251015/202510151108.UqsNiDSP-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510151108.UqsNiDSP-lkp@intel.com/
All errors (new ones prefixed by >>):
mm/ksm.c: In function 'scan_get_next_rmap_item':
>> mm/ksm.c:2604:2: error: a label can only be part of a statement and a declaration is not a statement
struct ksm_walk_private walk_private = {
^~~~~~
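For context, the diagnostic comes from the pre-C23 rule that a label must be followed by a statement, and a declaration is not one. A stripped-down illustration (not kernel code):
int broken(void)
{
	goto out;
out:
	int r = 0;	/* error: declaration directly after a label (pre-C23) */
	return r;
}

int fixed(void)
{
	goto out;
out:
	{		/* a block is a statement, so the label is now legal */
		int r = 0;
		return r;
	}
}
Hoisting the declaration above the label, or dropping the label in favour of a loop as suggested earlier in the thread, avoids it as well.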
vim +2604 mm/ksm.c
2527
2528 static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
2529 {
2530 struct mm_struct *mm;
2531 struct ksm_mm_slot *mm_slot;
2532 struct mm_slot *slot;
2533 struct ksm_rmap_item *rmap_item;
2534 int nid;
2535
2536 if (list_empty(&ksm_mm_head.slot.mm_node))
2537 return NULL;
2538
2539 mm_slot = ksm_scan.mm_slot;
2540 if (mm_slot == &ksm_mm_head) {
2541 advisor_start_scan();
2542 trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);
2543
2544 /*
2545 * A number of pages can hang around indefinitely in per-cpu
2546 * LRU cache, raised page count preventing write_protect_page
2547 * from merging them. Though it doesn't really matter much,
2548 * it is puzzling to see some stuck in pages_volatile until
2549 * other activity jostles them out, and they also prevented
2550 * LTP's KSM test from succeeding deterministically; so drain
2551 * them here (here rather than on entry to ksm_do_scan(),
2552 * so we don't IPI too often when pages_to_scan is set low).
2553 */
2554 lru_add_drain_all();
2555
2556 /*
2557 * Whereas stale stable_nodes on the stable_tree itself
2558 * get pruned in the regular course of stable_tree_search(),
2559 * those moved out to the migrate_nodes list can accumulate:
2560 * so prune them once before each full scan.
2561 */
2562 if (!ksm_merge_across_nodes) {
2563 struct ksm_stable_node *stable_node, *next;
2564 struct folio *folio;
2565
2566 list_for_each_entry_safe(stable_node, next,
2567 &migrate_nodes, list) {
2568 folio = ksm_get_folio(stable_node,
2569 KSM_GET_FOLIO_NOLOCK);
2570 if (folio)
2571 folio_put(folio);
2572 cond_resched();
2573 }
2574 }
2575
2576 for (nid = 0; nid < ksm_nr_node_ids; nid++)
2577 root_unstable_tree[nid] = RB_ROOT;
2578
2579 spin_lock(&ksm_mmlist_lock);
2580 slot = list_entry(mm_slot->slot.mm_node.next,
2581 struct mm_slot, mm_node);
2582 mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
2583 ksm_scan.mm_slot = mm_slot;
2584 spin_unlock(&ksm_mmlist_lock);
2585 /*
2586 * Although we tested list_empty() above, a racing __ksm_exit
2587 * of the last mm on the list may have removed it since then.
2588 */
2589 if (mm_slot == &ksm_mm_head)
2590 return NULL;
2591 next_mm:
2592 ksm_scan.address = 0;
2593 ksm_scan.rmap_list = &mm_slot->rmap_list;
2594 }
2595
2596 slot = &mm_slot->slot;
2597 mm = slot->mm;
2598
2599 mmap_read_lock(mm);
2600 if (ksm_test_exit(mm))
2601 goto no_vmas;
2602
2603 get_page:
> 2604 struct ksm_walk_private walk_private = {
2605 .page = NULL,
2606 .folio = NULL,
2607 .vma = NULL
2608 };
2609
2610 walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
2611 if (walk_private.page) {
2612 flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
2613 flush_dcache_page(walk_private.page);
2614 rmap_item = get_next_rmap_item(mm_slot,
2615 ksm_scan.rmap_list, ksm_scan.address);
2616 if (rmap_item) {
2617 ksm_scan.rmap_list =
2618 &rmap_item->rmap_list;
2619
2620 ksm_scan.address += PAGE_SIZE;
2621 if (should_skip_rmap_item(walk_private.folio, rmap_item)) {
2622 folio_put(walk_private.folio);
2623 goto get_page;
2624 }
2625
2626 *page = walk_private.page;
2627 } else {
2628 folio_put(walk_private.folio);
2629 }
2630 mmap_read_unlock(mm);
2631 return rmap_item;
2632 }
2633
2634 if (ksm_test_exit(mm)) {
2635 no_vmas:
2636 ksm_scan.address = 0;
2637 ksm_scan.rmap_list = &mm_slot->rmap_list;
2638 }
2639 /*
2640 * Nuke all the rmap_items that are above this current rmap:
2641 * because there were no VM_MERGEABLE vmas with such addresses.
2642 */
2643 remove_trailing_rmap_items(ksm_scan.rmap_list);
2644
2645 spin_lock(&ksm_mmlist_lock);
2646 slot = list_entry(mm_slot->slot.mm_node.next,
2647 struct mm_slot, mm_node);
2648 ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
2649 if (ksm_scan.address == 0) {
2650 /*
2651 * We've completed a full scan of all vmas, holding mmap_lock
2652 * throughout, and found no VM_MERGEABLE: so do the same as
2653 * __ksm_exit does to remove this mm from all our lists now.
2654 * This applies either when cleaning up after __ksm_exit
2655 * (but beware: we can reach here even before __ksm_exit),
2656 * or when all VM_MERGEABLE areas have been unmapped (and
2657 * mmap_lock then protects against race with MADV_MERGEABLE).
2658 */
2659 hash_del(&mm_slot->slot.hash);
2660 list_del(&mm_slot->slot.mm_node);
2661 spin_unlock(&ksm_mmlist_lock);
2662
2663 mm_slot_free(mm_slot_cache, mm_slot);
2664 /*
2665 * Only clear MMF_VM_MERGEABLE. We must not clear
2666 * MMF_VM_MERGE_ANY, because for those MMF_VM_MERGE_ANY process,
2667 * perhaps their mm_struct has just been added to ksm_mm_slot
2668 * list, and its process has not yet officially started running
2669 * or has not yet performed mmap/brk to allocate anonymous VMAS.
2670 */
2671 mm_flags_clear(MMF_VM_MERGEABLE, mm);
2672 mmap_read_unlock(mm);
2673 mmdrop(mm);
2674 } else {
2675 mmap_read_unlock(mm);
2676 /*
2677 * mmap_read_unlock(mm) first because after
2678 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
2679 * already have been freed under us by __ksm_exit()
2680 * because the "mm_slot" is still hashed and
2681 * ksm_scan.mm_slot doesn't point to it anymore.
2682 */
2683 spin_unlock(&ksm_mmlist_lock);
2684 }
2685
2686 /* Repeat until we've completed scanning the whole list */
2687 mm_slot = ksm_scan.mm_slot;
2688 if (mm_slot != &ksm_mm_head)
2689 goto next_mm;
2690
2691 advisor_stop_scan();
2692
2693 trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
2694 ksm_scan.seqnr++;
2695 return NULL;
2696 }
2697
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
2025-10-14 15:11 [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
2025-10-14 15:59 ` David Hildenbrand
2025-10-15 3:53 ` kernel test robot
@ 2025-10-15 5:46 ` kernel test robot
2025-10-15 12:22 ` David Hildenbrand
3 siblings, 0 replies; 6+ messages in thread
From: kernel test robot @ 2025-10-15 5:46 UTC (permalink / raw)
To: Pedro Demarchi Gomes, Andrew Morton, David Hildenbrand
Cc: llvm, oe-kbuild-all, Linux Memory Management List, Xu Xin,
Chengming Zhou, linux-kernel, Pedro Demarchi Gomes
Hi Pedro,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on linus/master v6.18-rc1 next-20251014]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Pedro-Demarchi-Gomes/ksm-use-range-walk-function-to-jump-over-holes-in-scan_get_next_rmap_item/20251014-231721
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20251014151126.87589-1-pedrodemargomes%40gmail.com
patch subject: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
config: riscv-randconfig-002-20251015 (https://download.01.org/0day-ci/archive/20251015/202510151358.YFw4KsDG-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 39f292ffa13d7ca0d1edff27ac8fd55024bb4d19)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251015/202510151358.YFw4KsDG-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510151358.YFw4KsDG-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> mm/ksm.c:2604:2: warning: label followed by a declaration is a C23 extension [-Wc23-extensions]
2604 | struct ksm_walk_private walk_private = {
| ^
1 warning generated.
Kconfig warnings: (for reference only)
WARNING: unmet direct dependencies detected for ARCH_HAS_ELF_CORE_EFLAGS
Depends on [n]: BINFMT_ELF [=n] && ELF_CORE [=n]
Selected by [y]:
- RISCV [=y]
vim +2604 mm/ksm.c
2527
2528 static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
2529 {
2530 struct mm_struct *mm;
2531 struct ksm_mm_slot *mm_slot;
2532 struct mm_slot *slot;
2533 struct ksm_rmap_item *rmap_item;
2534 int nid;
2535
2536 if (list_empty(&ksm_mm_head.slot.mm_node))
2537 return NULL;
2538
2539 mm_slot = ksm_scan.mm_slot;
2540 if (mm_slot == &ksm_mm_head) {
2541 advisor_start_scan();
2542 trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);
2543
2544 /*
2545 * A number of pages can hang around indefinitely in per-cpu
2546 * LRU cache, raised page count preventing write_protect_page
2547 * from merging them. Though it doesn't really matter much,
2548 * it is puzzling to see some stuck in pages_volatile until
2549 * other activity jostles them out, and they also prevented
2550 * LTP's KSM test from succeeding deterministically; so drain
2551 * them here (here rather than on entry to ksm_do_scan(),
2552 * so we don't IPI too often when pages_to_scan is set low).
2553 */
2554 lru_add_drain_all();
2555
2556 /*
2557 * Whereas stale stable_nodes on the stable_tree itself
2558 * get pruned in the regular course of stable_tree_search(),
2559 * those moved out to the migrate_nodes list can accumulate:
2560 * so prune them once before each full scan.
2561 */
2562 if (!ksm_merge_across_nodes) {
2563 struct ksm_stable_node *stable_node, *next;
2564 struct folio *folio;
2565
2566 list_for_each_entry_safe(stable_node, next,
2567 &migrate_nodes, list) {
2568 folio = ksm_get_folio(stable_node,
2569 KSM_GET_FOLIO_NOLOCK);
2570 if (folio)
2571 folio_put(folio);
2572 cond_resched();
2573 }
2574 }
2575
2576 for (nid = 0; nid < ksm_nr_node_ids; nid++)
2577 root_unstable_tree[nid] = RB_ROOT;
2578
2579 spin_lock(&ksm_mmlist_lock);
2580 slot = list_entry(mm_slot->slot.mm_node.next,
2581 struct mm_slot, mm_node);
2582 mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
2583 ksm_scan.mm_slot = mm_slot;
2584 spin_unlock(&ksm_mmlist_lock);
2585 /*
2586 * Although we tested list_empty() above, a racing __ksm_exit
2587 * of the last mm on the list may have removed it since then.
2588 */
2589 if (mm_slot == &ksm_mm_head)
2590 return NULL;
2591 next_mm:
2592 ksm_scan.address = 0;
2593 ksm_scan.rmap_list = &mm_slot->rmap_list;
2594 }
2595
2596 slot = &mm_slot->slot;
2597 mm = slot->mm;
2598
2599 mmap_read_lock(mm);
2600 if (ksm_test_exit(mm))
2601 goto no_vmas;
2602
2603 get_page:
> 2604 struct ksm_walk_private walk_private = {
2605 .page = NULL,
2606 .folio = NULL,
2607 .vma = NULL
2608 };
2609
2610 walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
2611 if (walk_private.page) {
2612 flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
2613 flush_dcache_page(walk_private.page);
2614 rmap_item = get_next_rmap_item(mm_slot,
2615 ksm_scan.rmap_list, ksm_scan.address);
2616 if (rmap_item) {
2617 ksm_scan.rmap_list =
2618 &rmap_item->rmap_list;
2619
2620 ksm_scan.address += PAGE_SIZE;
2621 if (should_skip_rmap_item(walk_private.folio, rmap_item)) {
2622 folio_put(walk_private.folio);
2623 goto get_page;
2624 }
2625
2626 *page = walk_private.page;
2627 } else {
2628 folio_put(walk_private.folio);
2629 }
2630 mmap_read_unlock(mm);
2631 return rmap_item;
2632 }
2633
2634 if (ksm_test_exit(mm)) {
2635 no_vmas:
2636 ksm_scan.address = 0;
2637 ksm_scan.rmap_list = &mm_slot->rmap_list;
2638 }
2639 /*
2640 * Nuke all the rmap_items that are above this current rmap:
2641 * because there were no VM_MERGEABLE vmas with such addresses.
2642 */
2643 remove_trailing_rmap_items(ksm_scan.rmap_list);
2644
2645 spin_lock(&ksm_mmlist_lock);
2646 slot = list_entry(mm_slot->slot.mm_node.next,
2647 struct mm_slot, mm_node);
2648 ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
2649 if (ksm_scan.address == 0) {
2650 /*
2651 * We've completed a full scan of all vmas, holding mmap_lock
2652 * throughout, and found no VM_MERGEABLE: so do the same as
2653 * __ksm_exit does to remove this mm from all our lists now.
2654 * This applies either when cleaning up after __ksm_exit
2655 * (but beware: we can reach here even before __ksm_exit),
2656 * or when all VM_MERGEABLE areas have been unmapped (and
2657 * mmap_lock then protects against race with MADV_MERGEABLE).
2658 */
2659 hash_del(&mm_slot->slot.hash);
2660 list_del(&mm_slot->slot.mm_node);
2661 spin_unlock(&ksm_mmlist_lock);
2662
2663 mm_slot_free(mm_slot_cache, mm_slot);
2664 /*
2665 * Only clear MMF_VM_MERGEABLE. We must not clear
2666 * MMF_VM_MERGE_ANY, because for those MMF_VM_MERGE_ANY process,
2667 * perhaps their mm_struct has just been added to ksm_mm_slot
2668 * list, and its process has not yet officially started running
2669 * or has not yet performed mmap/brk to allocate anonymous VMAS.
2670 */
2671 mm_flags_clear(MMF_VM_MERGEABLE, mm);
2672 mmap_read_unlock(mm);
2673 mmdrop(mm);
2674 } else {
2675 mmap_read_unlock(mm);
2676 /*
2677 * mmap_read_unlock(mm) first because after
2678 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
2679 * already have been freed under us by __ksm_exit()
2680 * because the "mm_slot" is still hashed and
2681 * ksm_scan.mm_slot doesn't point to it anymore.
2682 */
2683 spin_unlock(&ksm_mmlist_lock);
2684 }
2685
2686 /* Repeat until we've completed scanning the whole list */
2687 mm_slot = ksm_scan.mm_slot;
2688 if (mm_slot != &ksm_mm_head)
2689 goto next_mm;
2690
2691 advisor_stop_scan();
2692
2693 trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
2694 ksm_scan.seqnr++;
2695 return NULL;
2696 }
2697
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
2025-10-14 15:11 [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
` (2 preceding siblings ...)
2025-10-15 5:46 ` kernel test robot
@ 2025-10-15 12:22 ` David Hildenbrand
3 siblings, 0 replies; 6+ messages in thread
From: David Hildenbrand @ 2025-10-15 12:22 UTC (permalink / raw)
To: Pedro Demarchi Gomes, Andrew Morton
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel
On 14.10.25 17:11, Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
>
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
>
> Changes since v1 [2]:
> - Use pmd_entry to walk page range
> - Use cond_resched inside pmd_entry()
> - walk_page_range returns page+folio
>
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> [2] https://lore.kernel.org/linux-mm/20251014055828.124522-1-pedrodemargomes@gmail.com/
>
Can you also make sure to CC the reporter. So you might want to add
Reported-by: craftfever <craftfever@airmail.cc>
Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
And if it was my suggestion
Suggested-by: David Hildenbrand <david@redhat.com>
Not sure if we want a Fixes: tag ... we could have created gigantic
VMAs with an anon VMA for like ever, so it would date back quite a bit.
Please make sure to thoroughly compile- and runtime-test your changes.
--
Cheers
David / dhildenb