* [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
@ 2025-10-14 15:11 Pedro Demarchi Gomes
  2025-10-14 15:59 ` David Hildenbrand
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-14 15:11 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand
  Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel, Pedro Demarchi Gomes

Currently, scan_get_next_rmap_item() walks every page address in a VMA
to locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions.

This patch replaces the per-address lookup with a range walk using
walk_page_range(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups.
This problem was previously discussed in [1].

Changes since v1 [2]:
- Use pmd_entry to walk page range
- Use cond_resched inside pmd_entry()
- walk_page_range returns page+folio

[1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
[2] https://lore.kernel.org/linux-mm/20251014055828.124522-1-pedrodemargomes@gmail.com/

Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
 mm/ksm.c | 144 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 94 insertions(+), 50 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 3aed0478fdce..adb0267a1b7d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,14 +2455,82 @@ static bool should_skip_rmap_item(struct folio *folio,
 	return true;
 }
 
+struct ksm_walk_private {
+	struct page *page;
+	struct folio *folio;
+	struct vm_area_struct *vma;
+};
+
+static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+
+	if (!vma->anon_vma || !(vma->vm_flags & VM_MERGEABLE)) {
+		ksm_scan.address = vma->vm_end;
+		return 1;
+	}
+	return 0;
+}
+
+static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
+			    unsigned long end, struct mm_walk *walk)
+{
+	struct mm_struct *mm = walk->mm;
+	struct vm_area_struct *vma = walk->vma;
+	struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
+	struct folio *folio;
+	pte_t *start_pte, *pte, ptent;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!start_pte) {
+		ksm_scan.address = end;
+		return 0;
+	}
+
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = ptep_get(pte);
+		struct page *page = vm_normal_page(vma, addr, ptent);
+		ksm_scan.address = addr;
+
+		if (ksm_test_exit(mm)) {
+			ret = 1;
+			break;
+		}
+
+		if (!page)
+			continue;
+
+		folio = page_folio(page);
+		if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+			continue;
+
+		ret = 1;
+		folio_get(folio);
+		private->page = page;
+		private->folio = folio;
+		private->vma = vma;
+		break;
+	}
+	pte_unmap_unlock(start_pte, ptl);
+
+	cond_resched();
+	return ret;
+}
+
+struct mm_walk_ops walk_ops = {
+	.pmd_entry = ksm_pmd_entry,
+	.test_walk = ksm_walk_test,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
 static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 {
 	struct mm_struct *mm;
 	struct ksm_mm_slot *mm_slot;
 	struct mm_slot *slot;
-	struct vm_area_struct *vma;
 	struct ksm_rmap_item *rmap_item;
-	struct vma_iterator vmi;
 	int nid;
 
 	if (list_empty(&ksm_mm_head.slot.mm_node))
@@ -2527,64 +2595,40 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 
 	slot = &mm_slot->slot;
 	mm = slot->mm;
-	vma_iter_init(&vmi, mm, ksm_scan.address);
 
 	mmap_read_lock(mm);
 	if (ksm_test_exit(mm))
 		goto no_vmas;
 
-	for_each_vma(vmi, vma) {
-		if (!(vma->vm_flags & VM_MERGEABLE))
-			continue;
-		if (ksm_scan.address < vma->vm_start)
-			ksm_scan.address = vma->vm_start;
-		if (!vma->anon_vma)
-			ksm_scan.address = vma->vm_end;
-
-		while (ksm_scan.address < vma->vm_end) {
-			struct page *tmp_page = NULL;
-			struct folio_walk fw;
-			struct folio *folio;
+get_page:
+	struct ksm_walk_private walk_private = {
+		.page = NULL,
+		.folio = NULL,
+		.vma = NULL
+	};
 
-			if (ksm_test_exit(mm))
-				break;
+	walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
+	if (walk_private.page) {
+		flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
+		flush_dcache_page(walk_private.page);
+		rmap_item = get_next_rmap_item(mm_slot,
+			ksm_scan.rmap_list, ksm_scan.address);
+		if (rmap_item) {
+			ksm_scan.rmap_list =
+					&rmap_item->rmap_list;
 
-			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
-			if (folio) {
-				if (!folio_is_zone_device(folio) &&
-				     folio_test_anon(folio)) {
-					folio_get(folio);
-					tmp_page = fw.page;
-				}
-				folio_walk_end(&fw, vma);
+			ksm_scan.address += PAGE_SIZE;
+			if (should_skip_rmap_item(walk_private.folio, rmap_item)) {
+				folio_put(walk_private.folio);
+				goto get_page;
 			}
 
-			if (tmp_page) {
-				flush_anon_page(vma, tmp_page, ksm_scan.address);
-				flush_dcache_page(tmp_page);
-				rmap_item = get_next_rmap_item(mm_slot,
-					ksm_scan.rmap_list, ksm_scan.address);
-				if (rmap_item) {
-					ksm_scan.rmap_list =
-							&rmap_item->rmap_list;
-
-					if (should_skip_rmap_item(folio, rmap_item)) {
-						folio_put(folio);
-						goto next_page;
-					}
-
-					ksm_scan.address += PAGE_SIZE;
-					*page = tmp_page;
-				} else {
-					folio_put(folio);
-				}
-				mmap_read_unlock(mm);
-				return rmap_item;
-			}
-next_page:
-			ksm_scan.address += PAGE_SIZE;
-			cond_resched();
+			*page = walk_private.page;
+		} else {
+			folio_put(walk_private.folio);
 		}
+		mmap_read_unlock(mm);
+		return rmap_item;
 	}
 
 	if (ksm_test_exit(mm)) {
-- 
2.43.0




* Re: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
  2025-10-14 15:11 [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
@ 2025-10-14 15:59 ` David Hildenbrand
  2025-10-14 21:57   ` Pedro Demarchi Gomes
  2025-10-15  3:53 ` kernel test robot
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 6+ messages in thread
From: David Hildenbrand @ 2025-10-14 15:59 UTC (permalink / raw)
  To: Pedro Demarchi Gomes, Andrew Morton
  Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel

On 14.10.25 17:11, Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
> 
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
> 
> Changes since v1 [2]:
> - Use pmd_entry to walk page range
> - Use cond_resched inside pmd_entry()
> - walk_page_range returns page+folio
> 
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> [2] https://lore.kernel.org/linux-mm/20251014055828.124522-1-pedrodemargomes@gmail.com/
> 
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> ---

[...]

> +
> +static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
> +			    unsigned long end, struct mm_walk *walk)
> +{
> +	struct mm_struct *mm = walk->mm;
> +	struct vm_area_struct *vma = walk->vma;
> +	struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
> +	struct folio *folio;
> +	pte_t *start_pte, *pte, ptent;
> +	spinlock_t *ptl;
> +	int ret = 0;
> +
> +	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> +	if (!start_pte) {
> +		ksm_scan.address = end;
> +		return 0;
> +	}

Please take more time to understand the details. If there is a THP there 
you actually have to find the relevant page.
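
Something along these lines (completely untested sketch; assuming pmd_leaf()
and vm_normal_page_pmd() are usable here) at the top of ksm_pmd_entry() would
be one way to grab the right subpage:

        if (pmd_leaf(pmdp_get_lockless(pmd))) {
                spinlock_t *ptl = pmd_lock(mm, pmd);
                pmd_t pmde = pmdp_get(pmd);

                if (pmd_present(pmde) && pmd_leaf(pmde)) {
                        /* Head page of the PMD mapping ... */
                        struct page *page = vm_normal_page_pmd(vma, addr, pmde);

                        if (page) {
                                /* ... then the subpage that maps addr. */
                                page += (addr & ~PMD_MASK) >> PAGE_SHIFT;
                                folio = page_folio(page);
                                if (!folio_is_zone_device(folio) &&
                                    folio_test_anon(folio)) {
                                        folio_get(folio);
                                        private->page = page;
                                        private->folio = folio;
                                        private->vma = vma;
                                        ret = 1;
                                }
                        }
                        spin_unlock(ptl);
                        cond_resched();
                        return ret;
                }
                spin_unlock(ptl);
                /* Not (or no longer) a leaf: fall through to the PTE loop. */
        }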

> +
> +	for (; addr < end; pte++, addr += PAGE_SIZE) {
> +		ptent = ptep_get(pte);
> +		struct page *page = vm_normal_page(vma, addr, ptent);
> +		ksm_scan.address = addr;

Updating that value from in here is a bit nasty. I wonder if you should
rather make the function return the address of the found page as well.

In the caller, if we don't find any page, there is no need to update the
address from this function I guess. We iterated the complete MM space in 
that case.
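
For example (untested sketch), ksm_walk_private could hand the address back
and the caller would only consume it when a page was actually found:

        struct ksm_walk_private {
                struct page *page;
                struct folio *folio;
                struct vm_area_struct *vma;
                unsigned long address;  /* address at which page was found */
        };

        /* in ksm_pmd_entry(), next to private->page = page: */
        private->address = addr;

        /* in scan_get_next_rmap_item(), only on success: */
        if (walk_private.page)
                ksm_scan.address = walk_private.address;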

> +
> +		if (ksm_test_exit(mm)) {
> +			ret = 1;
> +			break;
> +		}
> +
> +		if (!page)
> +			continue;
> +
> +		folio = page_folio(page);
> +		if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> +			continue;
> +
> +		ret = 1;
> +		folio_get(folio);
> +		private->page = page;
> +		private->folio = folio;
> +		private->vma = vma;
> +		break;
> +	}
> +	pte_unmap_unlock(start_pte, ptl);
> +
> +	cond_resched();
> +	return ret;
> +}
> +
> +struct mm_walk_ops walk_ops = {
> +	.pmd_entry = ksm_pmd_entry,
> +	.test_walk = ksm_walk_test,
> +	.walk_lock = PGWALK_RDLOCK,
> +};
> +
>   static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>   {
>   	struct mm_struct *mm;
>   	struct ksm_mm_slot *mm_slot;
>   	struct mm_slot *slot;
> -	struct vm_area_struct *vma;
>   	struct ksm_rmap_item *rmap_item;
> -	struct vma_iterator vmi;
>   	int nid;
>   
>   	if (list_empty(&ksm_mm_head.slot.mm_node))
> @@ -2527,64 +2595,40 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>   
>   	slot = &mm_slot->slot;
>   	mm = slot->mm;
> -	vma_iter_init(&vmi, mm, ksm_scan.address);
>   
>   	mmap_read_lock(mm);
>   	if (ksm_test_exit(mm))
>   		goto no_vmas;
>   
> -	for_each_vma(vmi, vma) {
> -		if (!(vma->vm_flags & VM_MERGEABLE))
> -			continue;
> -		if (ksm_scan.address < vma->vm_start)
> -			ksm_scan.address = vma->vm_start;
> -		if (!vma->anon_vma)
> -			ksm_scan.address = vma->vm_end;
> -
> -		while (ksm_scan.address < vma->vm_end) {
> -			struct page *tmp_page = NULL;
> -			struct folio_walk fw;
> -			struct folio *folio;
> +get_page:
> +	struct ksm_walk_private walk_private = {
> +		.page = NULL,
> +		.folio = NULL,
> +		.vma = NULL
> +	};
>   
> -			if (ksm_test_exit(mm))
> -				break;
> +	walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
> +	if (walk_private.page) {
> +		flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
> +		flush_dcache_page(walk_private.page);

Keep working on the folio please.
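
I.e., something like (untested; assuming flush_dcache_folio() is what you
have in mind -- AFAIK there is no folio variant of flush_anon_page()):

        flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
        flush_dcache_folio(walk_private.folio);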

> +		rmap_item = get_next_rmap_item(mm_slot,
> +			ksm_scan.rmap_list, ksm_scan.address);
> +		if (rmap_item) {
> +			ksm_scan.rmap_list =
> +					&rmap_item->rmap_list;
>   
> -			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> -			if (folio) {
> -				if (!folio_is_zone_device(folio) &&
> -				     folio_test_anon(folio)) {
> -					folio_get(folio);
> -					tmp_page = fw.page;
> -				}
> -				folio_walk_end(&fw, vma);
> +			ksm_scan.address += PAGE_SIZE;
> +			if (should_skip_rmap_item(walk_private.folio, rmap_item)) {
> +				folio_put(walk_private.folio);
> +				goto get_page;

Can you make that a while() loop to avoid the label?
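
Untested sketch of one possible shape (control flow only, everything else as
in v2):

        while (true) {
                struct ksm_walk_private walk_private = {};

                walk_page_range(mm, ksm_scan.address, -1, &walk_ops, &walk_private);
                if (!walk_private.page)
                        break;

                flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
                flush_dcache_page(walk_private.page);
                rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list,
                                               ksm_scan.address);
                if (!rmap_item) {
                        folio_put(walk_private.folio);
                        mmap_read_unlock(mm);
                        return NULL;
                }

                ksm_scan.rmap_list = &rmap_item->rmap_list;
                ksm_scan.address += PAGE_SIZE;

                if (!should_skip_rmap_item(walk_private.folio, rmap_item)) {
                        *page = walk_private.page;
                        mmap_read_unlock(mm);
                        return rmap_item;
                }
                folio_put(walk_private.folio);
        }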



-- 
Cheers

David / dhildenb




* Re: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
  2025-10-14 15:59 ` David Hildenbrand
@ 2025-10-14 21:57   ` Pedro Demarchi Gomes
  0 siblings, 0 replies; 6+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-14 21:57 UTC (permalink / raw)
  To: David Hildenbrand, Andrew Morton
  Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel



On 10/14/25 12:59, David Hildenbrand wrote:
> On 14.10.25 17:11, Pedro Demarchi Gomes wrote:
>> Currently, scan_get_next_rmap_item() walks every page address in a VMA
>> to locate mergeable pages. This becomes highly inefficient when scanning
>> large virtual memory areas that contain mostly unmapped regions.
>>
>> This patch replaces the per-address lookup with a range walk using
>> walk_page_range(). The range walker allows KSM to skip over entire
>> unmapped holes in a VMA, avoiding unnecessary lookups.
>> This problem was previously discussed in [1].
>>
>> Changes since v1 [2]:
>> - Use pmd_entry to walk page range
>> - Use cond_resched inside pmd_entry()
>> - walk_page_range returns page+folio
>>
>> [1] https://lore.kernel.org/linux- 
>> mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
>> [2] https://lore.kernel.org/linux-mm/20251014055828.124522-1- 
>> pedrodemargomes@gmail.com/
>>
>> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
>> ---
> 
> [...]
> 
>> +
>> +static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
>> +                unsigned long end, struct mm_walk *walk)
>> +{
>> +    struct mm_struct *mm = walk->mm;
>> +    struct vm_area_struct *vma = walk->vma;
>> +    struct ksm_walk_private *private = (struct ksm_walk_private *) 
>> walk->private;
>> +    struct folio *folio;
>> +    pte_t *start_pte, *pte, ptent;
>> +    spinlock_t *ptl;
>> +    int ret = 0;
>> +
>> +    start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
>> +    if (!start_pte) {
>> +        ksm_scan.address = end;
>> +        return 0;
>> +    }
> 
> Please take more time to understand the details. If there is a THP there 
> you actually have to find the relevant page.
> 

Ok

>> +
>> +    for (; addr < end; pte++, addr += PAGE_SIZE) {
>> +        ptent = ptep_get(pte);
>> +        struct page *page = vm_normal_page(vma, addr, ptent);
>> +        ksm_scan.address = addr;
> 
> Updating that value from in here is a bit nasty. I wonder if you should
> rather make the function return the address of the found page as well.
> 
> In the caller, if we don't find any page, there is no need to update the
> address from this function I guess. We iterated the complete MM space in 
> that case.
> 

Ok

>> +
>> +        if (ksm_test_exit(mm)) {
>> +            ret = 1;
>> +            break;
>> +        }
>> +
>> +        if (!page)
>> +            continue;
>> +
>> +        folio = page_folio(page);
>> +        if (folio_is_zone_device(folio) || !folio_test_anon(folio))
>> +            continue;
>> +
>> +        ret = 1;
>> +        folio_get(folio);
>> +        private->page = page;
>> +        private->folio = folio;
>> +        private->vma = vma;
>> +        break;
>> +    }
>> +    pte_unmap_unlock(start_pte, ptl);
>> +
>> +    cond_resched();
>> +    return ret;
>> +}
>> +
>> +struct mm_walk_ops walk_ops = {
>> +    .pmd_entry = ksm_pmd_entry,
>> +    .test_walk = ksm_walk_test,
>> +    .walk_lock = PGWALK_RDLOCK,
>> +};
>> +
>>   static struct ksm_rmap_item *scan_get_next_rmap_item(struct page 
>> **page)
>>   {
>>       struct mm_struct *mm;
>>       struct ksm_mm_slot *mm_slot;
>>       struct mm_slot *slot;
>> -    struct vm_area_struct *vma;
>>       struct ksm_rmap_item *rmap_item;
>> -    struct vma_iterator vmi;
>>       int nid;
>>       if (list_empty(&ksm_mm_head.slot.mm_node))
>> @@ -2527,64 +2595,40 @@ static struct ksm_rmap_item 
>> *scan_get_next_rmap_item(struct page **page)
>>       slot = &mm_slot->slot;
>>       mm = slot->mm;
>> -    vma_iter_init(&vmi, mm, ksm_scan.address);
>>       mmap_read_lock(mm);
>>       if (ksm_test_exit(mm))
>>           goto no_vmas;
>> -    for_each_vma(vmi, vma) {
>> -        if (!(vma->vm_flags & VM_MERGEABLE))
>> -            continue;
>> -        if (ksm_scan.address < vma->vm_start)
>> -            ksm_scan.address = vma->vm_start;
>> -        if (!vma->anon_vma)
>> -            ksm_scan.address = vma->vm_end;
>> -
>> -        while (ksm_scan.address < vma->vm_end) {
>> -            struct page *tmp_page = NULL;
>> -            struct folio_walk fw;
>> -            struct folio *folio;
>> +get_page:
>> +    struct ksm_walk_private walk_private = {
>> +        .page = NULL,
>> +        .folio = NULL,
>> +        .vma = NULL
>> +    };
>> -            if (ksm_test_exit(mm))
>> -                break;
>> +    walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) 
>> &walk_private);
>> +    if (walk_private.page) {
>> +        flush_anon_page(walk_private.vma, walk_private.page, 
>> ksm_scan.address);
>> +        flush_dcache_page(walk_private.page);
> 
> Keep working on the folio please.
> 

Ok

>> +        rmap_item = get_next_rmap_item(mm_slot,
>> +            ksm_scan.rmap_list, ksm_scan.address);
>> +        if (rmap_item) {
>> +            ksm_scan.rmap_list =
>> +                    &rmap_item->rmap_list;
>> -            folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
>> -            if (folio) {
>> -                if (!folio_is_zone_device(folio) &&
>> -                     folio_test_anon(folio)) {
>> -                    folio_get(folio);
>> -                    tmp_page = fw.page;
>> -                }
>> -                folio_walk_end(&fw, vma);
>> +            ksm_scan.address += PAGE_SIZE;
>> +            if (should_skip_rmap_item(walk_private.folio, rmap_item)) {
>> +                folio_put(walk_private.folio);
>> +                goto get_page;
> 
> Can you make that a while() loop to avoid the label?
> 

Ok, I will make these corrections and send a v3. Thanks!





* Re: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
  2025-10-14 15:11 [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
  2025-10-14 15:59 ` David Hildenbrand
@ 2025-10-15  3:53 ` kernel test robot
  2025-10-15  5:46 ` kernel test robot
  2025-10-15 12:22 ` David Hildenbrand
  3 siblings, 0 replies; 6+ messages in thread
From: kernel test robot @ 2025-10-15  3:53 UTC (permalink / raw)
  To: Pedro Demarchi Gomes, Andrew Morton, David Hildenbrand
  Cc: oe-kbuild-all, Linux Memory Management List, Xu Xin,
	Chengming Zhou, linux-kernel, Pedro Demarchi Gomes

Hi Pedro,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.18-rc1 next-20251014]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Pedro-Demarchi-Gomes/ksm-use-range-walk-function-to-jump-over-holes-in-scan_get_next_rmap_item/20251014-231721
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20251014151126.87589-1-pedrodemargomes%40gmail.com
patch subject: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
config: m68k-randconfig-r071-20251015 (https://download.01.org/0day-ci/archive/20251015/202510151108.UqsNiDSP-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 8.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251015/202510151108.UqsNiDSP-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510151108.UqsNiDSP-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/ksm.c: In function 'scan_get_next_rmap_item':
>> mm/ksm.c:2604:2: error: a label can only be part of a statement and a declaration is not a statement
     struct ksm_walk_private walk_private = {
     ^~~~~~
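
Prior to C23, a label must be followed by a statement, and a declaration is
not one. A minimal fix (untested sketch) is to hoist the declaration above
the label and only (re)initialize it afterwards; the while() rework suggested
in the review avoids the label entirely:

        struct ksm_walk_private walk_private;

        mmap_read_lock(mm);
        if (ksm_test_exit(mm))
                goto no_vmas;

get_page:
        walk_private = (struct ksm_walk_private) {
                .page = NULL,
                .folio = NULL,
                .vma = NULL,
        };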


vim +2604 mm/ksm.c

  2527	
  2528	static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
  2529	{
  2530		struct mm_struct *mm;
  2531		struct ksm_mm_slot *mm_slot;
  2532		struct mm_slot *slot;
  2533		struct ksm_rmap_item *rmap_item;
  2534		int nid;
  2535	
  2536		if (list_empty(&ksm_mm_head.slot.mm_node))
  2537			return NULL;
  2538	
  2539		mm_slot = ksm_scan.mm_slot;
  2540		if (mm_slot == &ksm_mm_head) {
  2541			advisor_start_scan();
  2542			trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);
  2543	
  2544			/*
  2545			 * A number of pages can hang around indefinitely in per-cpu
  2546			 * LRU cache, raised page count preventing write_protect_page
  2547			 * from merging them.  Though it doesn't really matter much,
  2548			 * it is puzzling to see some stuck in pages_volatile until
  2549			 * other activity jostles them out, and they also prevented
  2550			 * LTP's KSM test from succeeding deterministically; so drain
  2551			 * them here (here rather than on entry to ksm_do_scan(),
  2552			 * so we don't IPI too often when pages_to_scan is set low).
  2553			 */
  2554			lru_add_drain_all();
  2555	
  2556			/*
  2557			 * Whereas stale stable_nodes on the stable_tree itself
  2558			 * get pruned in the regular course of stable_tree_search(),
  2559			 * those moved out to the migrate_nodes list can accumulate:
  2560			 * so prune them once before each full scan.
  2561			 */
  2562			if (!ksm_merge_across_nodes) {
  2563				struct ksm_stable_node *stable_node, *next;
  2564				struct folio *folio;
  2565	
  2566				list_for_each_entry_safe(stable_node, next,
  2567							 &migrate_nodes, list) {
  2568					folio = ksm_get_folio(stable_node,
  2569							      KSM_GET_FOLIO_NOLOCK);
  2570					if (folio)
  2571						folio_put(folio);
  2572					cond_resched();
  2573				}
  2574			}
  2575	
  2576			for (nid = 0; nid < ksm_nr_node_ids; nid++)
  2577				root_unstable_tree[nid] = RB_ROOT;
  2578	
  2579			spin_lock(&ksm_mmlist_lock);
  2580			slot = list_entry(mm_slot->slot.mm_node.next,
  2581					  struct mm_slot, mm_node);
  2582			mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
  2583			ksm_scan.mm_slot = mm_slot;
  2584			spin_unlock(&ksm_mmlist_lock);
  2585			/*
  2586			 * Although we tested list_empty() above, a racing __ksm_exit
  2587			 * of the last mm on the list may have removed it since then.
  2588			 */
  2589			if (mm_slot == &ksm_mm_head)
  2590				return NULL;
  2591	next_mm:
  2592			ksm_scan.address = 0;
  2593			ksm_scan.rmap_list = &mm_slot->rmap_list;
  2594		}
  2595	
  2596		slot = &mm_slot->slot;
  2597		mm = slot->mm;
  2598	
  2599		mmap_read_lock(mm);
  2600		if (ksm_test_exit(mm))
  2601			goto no_vmas;
  2602	
  2603	get_page:
> 2604		struct ksm_walk_private walk_private = {
  2605			.page = NULL,
  2606			.folio = NULL,
  2607			.vma = NULL
  2608		};
  2609	
  2610		walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
  2611		if (walk_private.page) {
  2612			flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
  2613			flush_dcache_page(walk_private.page);
  2614			rmap_item = get_next_rmap_item(mm_slot,
  2615				ksm_scan.rmap_list, ksm_scan.address);
  2616			if (rmap_item) {
  2617				ksm_scan.rmap_list =
  2618						&rmap_item->rmap_list;
  2619	
  2620				ksm_scan.address += PAGE_SIZE;
  2621				if (should_skip_rmap_item(walk_private.folio, rmap_item)) {
  2622					folio_put(walk_private.folio);
  2623					goto get_page;
  2624				}
  2625	
  2626				*page = walk_private.page;
  2627			} else {
  2628				folio_put(walk_private.folio);
  2629			}
  2630			mmap_read_unlock(mm);
  2631			return rmap_item;
  2632		}
  2633	
  2634		if (ksm_test_exit(mm)) {
  2635	no_vmas:
  2636			ksm_scan.address = 0;
  2637			ksm_scan.rmap_list = &mm_slot->rmap_list;
  2638		}
  2639		/*
  2640		 * Nuke all the rmap_items that are above this current rmap:
  2641		 * because there were no VM_MERGEABLE vmas with such addresses.
  2642		 */
  2643		remove_trailing_rmap_items(ksm_scan.rmap_list);
  2644	
  2645		spin_lock(&ksm_mmlist_lock);
  2646		slot = list_entry(mm_slot->slot.mm_node.next,
  2647				  struct mm_slot, mm_node);
  2648		ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
  2649		if (ksm_scan.address == 0) {
  2650			/*
  2651			 * We've completed a full scan of all vmas, holding mmap_lock
  2652			 * throughout, and found no VM_MERGEABLE: so do the same as
  2653			 * __ksm_exit does to remove this mm from all our lists now.
  2654			 * This applies either when cleaning up after __ksm_exit
  2655			 * (but beware: we can reach here even before __ksm_exit),
  2656			 * or when all VM_MERGEABLE areas have been unmapped (and
  2657			 * mmap_lock then protects against race with MADV_MERGEABLE).
  2658			 */
  2659			hash_del(&mm_slot->slot.hash);
  2660			list_del(&mm_slot->slot.mm_node);
  2661			spin_unlock(&ksm_mmlist_lock);
  2662	
  2663			mm_slot_free(mm_slot_cache, mm_slot);
  2664			/*
  2665			 * Only clear MMF_VM_MERGEABLE. We must not clear
  2666			 * MMF_VM_MERGE_ANY, because for those MMF_VM_MERGE_ANY process,
  2667			 * perhaps their mm_struct has just been added to ksm_mm_slot
  2668			 * list, and its process has not yet officially started running
  2669			 * or has not yet performed mmap/brk to allocate anonymous VMAS.
  2670			 */
  2671			mm_flags_clear(MMF_VM_MERGEABLE, mm);
  2672			mmap_read_unlock(mm);
  2673			mmdrop(mm);
  2674		} else {
  2675			mmap_read_unlock(mm);
  2676			/*
  2677			 * mmap_read_unlock(mm) first because after
  2678			 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
  2679			 * already have been freed under us by __ksm_exit()
  2680			 * because the "mm_slot" is still hashed and
  2681			 * ksm_scan.mm_slot doesn't point to it anymore.
  2682			 */
  2683			spin_unlock(&ksm_mmlist_lock);
  2684		}
  2685	
  2686		/* Repeat until we've completed scanning the whole list */
  2687		mm_slot = ksm_scan.mm_slot;
  2688		if (mm_slot != &ksm_mm_head)
  2689			goto next_mm;
  2690	
  2691		advisor_stop_scan();
  2692	
  2693		trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
  2694		ksm_scan.seqnr++;
  2695		return NULL;
  2696	}
  2697	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki



* Re: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
  2025-10-14 15:11 [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
  2025-10-14 15:59 ` David Hildenbrand
  2025-10-15  3:53 ` kernel test robot
@ 2025-10-15  5:46 ` kernel test robot
  2025-10-15 12:22 ` David Hildenbrand
  3 siblings, 0 replies; 6+ messages in thread
From: kernel test robot @ 2025-10-15  5:46 UTC (permalink / raw)
  To: Pedro Demarchi Gomes, Andrew Morton, David Hildenbrand
  Cc: llvm, oe-kbuild-all, Linux Memory Management List, Xu Xin,
	Chengming Zhou, linux-kernel, Pedro Demarchi Gomes

Hi Pedro,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on linus/master v6.18-rc1 next-20251014]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Pedro-Demarchi-Gomes/ksm-use-range-walk-function-to-jump-over-holes-in-scan_get_next_rmap_item/20251014-231721
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20251014151126.87589-1-pedrodemargomes%40gmail.com
patch subject: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
config: riscv-randconfig-002-20251015 (https://download.01.org/0day-ci/archive/20251015/202510151358.YFw4KsDG-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 39f292ffa13d7ca0d1edff27ac8fd55024bb4d19)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251015/202510151358.YFw4KsDG-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510151358.YFw4KsDG-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> mm/ksm.c:2604:2: warning: label followed by a declaration is a C23 extension [-Wc23-extensions]
    2604 |         struct ksm_walk_private walk_private = {
         |         ^
   1 warning generated.

Kconfig warnings: (for reference only)
   WARNING: unmet direct dependencies detected for ARCH_HAS_ELF_CORE_EFLAGS
   Depends on [n]: BINFMT_ELF [=n] && ELF_CORE [=n]
   Selected by [y]:
   - RISCV [=y]


vim +2604 mm/ksm.c

  2527	
  2528	static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
  2529	{
  2530		struct mm_struct *mm;
  2531		struct ksm_mm_slot *mm_slot;
  2532		struct mm_slot *slot;
  2533		struct ksm_rmap_item *rmap_item;
  2534		int nid;
  2535	
  2536		if (list_empty(&ksm_mm_head.slot.mm_node))
  2537			return NULL;
  2538	
  2539		mm_slot = ksm_scan.mm_slot;
  2540		if (mm_slot == &ksm_mm_head) {
  2541			advisor_start_scan();
  2542			trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);
  2543	
  2544			/*
  2545			 * A number of pages can hang around indefinitely in per-cpu
  2546			 * LRU cache, raised page count preventing write_protect_page
  2547			 * from merging them.  Though it doesn't really matter much,
  2548			 * it is puzzling to see some stuck in pages_volatile until
  2549			 * other activity jostles them out, and they also prevented
  2550			 * LTP's KSM test from succeeding deterministically; so drain
  2551			 * them here (here rather than on entry to ksm_do_scan(),
  2552			 * so we don't IPI too often when pages_to_scan is set low).
  2553			 */
  2554			lru_add_drain_all();
  2555	
  2556			/*
  2557			 * Whereas stale stable_nodes on the stable_tree itself
  2558			 * get pruned in the regular course of stable_tree_search(),
  2559			 * those moved out to the migrate_nodes list can accumulate:
  2560			 * so prune them once before each full scan.
  2561			 */
  2562			if (!ksm_merge_across_nodes) {
  2563				struct ksm_stable_node *stable_node, *next;
  2564				struct folio *folio;
  2565	
  2566				list_for_each_entry_safe(stable_node, next,
  2567							 &migrate_nodes, list) {
  2568					folio = ksm_get_folio(stable_node,
  2569							      KSM_GET_FOLIO_NOLOCK);
  2570					if (folio)
  2571						folio_put(folio);
  2572					cond_resched();
  2573				}
  2574			}
  2575	
  2576			for (nid = 0; nid < ksm_nr_node_ids; nid++)
  2577				root_unstable_tree[nid] = RB_ROOT;
  2578	
  2579			spin_lock(&ksm_mmlist_lock);
  2580			slot = list_entry(mm_slot->slot.mm_node.next,
  2581					  struct mm_slot, mm_node);
  2582			mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
  2583			ksm_scan.mm_slot = mm_slot;
  2584			spin_unlock(&ksm_mmlist_lock);
  2585			/*
  2586			 * Although we tested list_empty() above, a racing __ksm_exit
  2587			 * of the last mm on the list may have removed it since then.
  2588			 */
  2589			if (mm_slot == &ksm_mm_head)
  2590				return NULL;
  2591	next_mm:
  2592			ksm_scan.address = 0;
  2593			ksm_scan.rmap_list = &mm_slot->rmap_list;
  2594		}
  2595	
  2596		slot = &mm_slot->slot;
  2597		mm = slot->mm;
  2598	
  2599		mmap_read_lock(mm);
  2600		if (ksm_test_exit(mm))
  2601			goto no_vmas;
  2602	
  2603	get_page:
> 2604		struct ksm_walk_private walk_private = {
  2605			.page = NULL,
  2606			.folio = NULL,
  2607			.vma = NULL
  2608		};
  2609	
  2610		walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
  2611		if (walk_private.page) {
  2612			flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
  2613			flush_dcache_page(walk_private.page);
  2614			rmap_item = get_next_rmap_item(mm_slot,
  2615				ksm_scan.rmap_list, ksm_scan.address);
  2616			if (rmap_item) {
  2617				ksm_scan.rmap_list =
  2618						&rmap_item->rmap_list;
  2619	
  2620				ksm_scan.address += PAGE_SIZE;
  2621				if (should_skip_rmap_item(walk_private.folio, rmap_item)) {
  2622					folio_put(walk_private.folio);
  2623					goto get_page;
  2624				}
  2625	
  2626				*page = walk_private.page;
  2627			} else {
  2628				folio_put(walk_private.folio);
  2629			}
  2630			mmap_read_unlock(mm);
  2631			return rmap_item;
  2632		}
  2633	
  2634		if (ksm_test_exit(mm)) {
  2635	no_vmas:
  2636			ksm_scan.address = 0;
  2637			ksm_scan.rmap_list = &mm_slot->rmap_list;
  2638		}
  2639		/*
  2640		 * Nuke all the rmap_items that are above this current rmap:
  2641		 * because there were no VM_MERGEABLE vmas with such addresses.
  2642		 */
  2643		remove_trailing_rmap_items(ksm_scan.rmap_list);
  2644	
  2645		spin_lock(&ksm_mmlist_lock);
  2646		slot = list_entry(mm_slot->slot.mm_node.next,
  2647				  struct mm_slot, mm_node);
  2648		ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
  2649		if (ksm_scan.address == 0) {
  2650			/*
  2651			 * We've completed a full scan of all vmas, holding mmap_lock
  2652			 * throughout, and found no VM_MERGEABLE: so do the same as
  2653			 * __ksm_exit does to remove this mm from all our lists now.
  2654			 * This applies either when cleaning up after __ksm_exit
  2655			 * (but beware: we can reach here even before __ksm_exit),
  2656			 * or when all VM_MERGEABLE areas have been unmapped (and
  2657			 * mmap_lock then protects against race with MADV_MERGEABLE).
  2658			 */
  2659			hash_del(&mm_slot->slot.hash);
  2660			list_del(&mm_slot->slot.mm_node);
  2661			spin_unlock(&ksm_mmlist_lock);
  2662	
  2663			mm_slot_free(mm_slot_cache, mm_slot);
  2664			/*
  2665			 * Only clear MMF_VM_MERGEABLE. We must not clear
  2666			 * MMF_VM_MERGE_ANY, because for those MMF_VM_MERGE_ANY process,
  2667			 * perhaps their mm_struct has just been added to ksm_mm_slot
  2668			 * list, and its process has not yet officially started running
  2669			 * or has not yet performed mmap/brk to allocate anonymous VMAS.
  2670			 */
  2671			mm_flags_clear(MMF_VM_MERGEABLE, mm);
  2672			mmap_read_unlock(mm);
  2673			mmdrop(mm);
  2674		} else {
  2675			mmap_read_unlock(mm);
  2676			/*
  2677			 * mmap_read_unlock(mm) first because after
  2678			 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
  2679			 * already have been freed under us by __ksm_exit()
  2680			 * because the "mm_slot" is still hashed and
  2681			 * ksm_scan.mm_slot doesn't point to it anymore.
  2682			 */
  2683			spin_unlock(&ksm_mmlist_lock);
  2684		}
  2685	
  2686		/* Repeat until we've completed scanning the whole list */
  2687		mm_slot = ksm_scan.mm_slot;
  2688		if (mm_slot != &ksm_mm_head)
  2689			goto next_mm;
  2690	
  2691		advisor_stop_scan();
  2692	
  2693		trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
  2694		ksm_scan.seqnr++;
  2695		return NULL;
  2696	}
  2697	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki



* Re: [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
  2025-10-14 15:11 [PATCH v2] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
                   ` (2 preceding siblings ...)
  2025-10-15  5:46 ` kernel test robot
@ 2025-10-15 12:22 ` David Hildenbrand
  3 siblings, 0 replies; 6+ messages in thread
From: David Hildenbrand @ 2025-10-15 12:22 UTC (permalink / raw)
  To: Pedro Demarchi Gomes, Andrew Morton
  Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel

On 14.10.25 17:11, Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
> 
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
> 
> Changes since v1 [2]:
> - Use pmd_entry to walk page range
> - Use cond_resched inside pmd_entry()
> - walk_page_range returns page+folio
> 
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> [2] https://lore.kernel.org/linux-mm/20251014055828.124522-1-pedrodemargomes@gmail.com/
> 

Can you also make sure to CC the reporter. So you might want to add

Reported-by: craftfever <craftfever@airmail.cc>
Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io

And if it was my suggestion

Suggested-by: David Hildenbrand <david@redhat.com>

Not sure if we want a Fixes: tag ... we could have created gigantic
VMAs with an anon VMA for like ever, so it would date back quite a bit.


Please make sure to thoroughly compile- and runtime-test your changes.

-- 
Cheers

David / dhildenb



