linux-mm.kvack.org archive mirror
* [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
@ 2025-10-14  5:58 Pedro Demarchi Gomes
  2025-10-14  9:26 ` David Hildenbrand
  2025-10-14 11:40 ` [syzbot ci] " syzbot ci
  0 siblings, 2 replies; 4+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-14  5:58 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, craftfever
  Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel, Pedro Demarchi Gomes

Currently, scan_get_next_rmap_item() walks every page address in a VMA
to locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions.

This patch replaces the per-address lookup with a range walk using
walk_page_range(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups.

To evaluate this change, I created a test that maps a 1 TB virtual area
where only the first and last 10 MB are populated with identical data.
With this patch applied, KSM scanned and merged the region approximately
seven times faster.
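
For reference, a minimal sketch of such a test program (not the exact
one used; error handling omitted) would be along these lines:

	#include <string.h>
	#include <unistd.h>
	#include <sys/mman.h>

	#define AREA_SZ	(1UL << 40)	/* 1 TB virtual area */
	#define POP_SZ	(10UL << 20)	/* populate 10 MB at each end */

	int main(void)
	{
		char *area = mmap(NULL, AREA_SZ, PROT_READ | PROT_WRITE,
				  MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
				  -1, 0);

		/* register the whole area with KSM */
		madvise(area, AREA_SZ, MADV_MERGEABLE);
		/* identical data at both ends, a huge hole in between */
		memset(area, 0x5a, POP_SZ);
		memset(area + AREA_SZ - POP_SZ, 0x5a, POP_SZ);
		pause();	/* give ksmd time to scan and merge */
		return 0;
	}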

This problem was previously discussed in [1].

[1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/

Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
 mm/ksm.c | 136 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 79 insertions(+), 57 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 3aed0478fdce..584fd987e8ae 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,15 +2455,80 @@ static bool should_skip_rmap_item(struct folio *folio,
 	return true;
 }
 
+struct ksm_walk_private {
+	struct page *page;
+	struct ksm_rmap_item *rmap_item;
+	struct ksm_mm_slot *mm_slot;
+};
+
+static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+
+	if (!vma || !(vma->vm_flags & VM_MERGEABLE))
+		return 1;
+	return 0;
+}
+
+static int ksm_pte_entry(pte_t *pte, unsigned long addr,
+			    unsigned long end, struct mm_walk *walk)
+{
+	struct mm_struct *mm = walk->mm;
+	struct vm_area_struct *vma = walk->vma;
+	struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
+	struct ksm_mm_slot *mm_slot = private->mm_slot;
+	pte_t ptent = ptep_get(pte);
+	struct page *page = pfn_to_online_page(pte_pfn(ptent));
+	struct ksm_rmap_item *rmap_item;
+	struct folio *folio;
+
+	ksm_scan.address = addr;
+
+	if (ksm_test_exit(mm))
+		return 1;
+
+	if (!page)
+		return 0;
+
+	folio = page_folio(page);
+	if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+		return 0;
+
+	folio_get(folio);
+
+	flush_anon_page(vma, page, ksm_scan.address);
+	flush_dcache_page(page);
+	rmap_item = get_next_rmap_item(mm_slot,
+		ksm_scan.rmap_list, ksm_scan.address);
+	if (rmap_item) {
+		ksm_scan.rmap_list =
+				&rmap_item->rmap_list;
+
+		if (should_skip_rmap_item(folio, rmap_item)) {
+			folio_put(folio);
+			return 0;
+		}
+		ksm_scan.address = end;
+		private->page = page;
+	} else
+		folio_put(folio);
+
+	private->rmap_item = rmap_item;
+	return 1;
+}
+
+struct mm_walk_ops walk_ops = {
+	.pte_entry = ksm_pte_entry,
+	.test_walk = ksm_walk_test,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
 static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 {
 	struct mm_struct *mm;
 	struct ksm_mm_slot *mm_slot;
 	struct mm_slot *slot;
-	struct vm_area_struct *vma;
-	struct ksm_rmap_item *rmap_item;
-	struct vma_iterator vmi;
-	int nid;
+	int nid, ret;
 
 	if (list_empty(&ksm_mm_head.slot.mm_node))
 		return NULL;
@@ -2527,64 +2592,21 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 
 	slot = &mm_slot->slot;
 	mm = slot->mm;
-	vma_iter_init(&vmi, mm, ksm_scan.address);
 
 	mmap_read_lock(mm);
 	if (ksm_test_exit(mm))
 		goto no_vmas;
 
-	for_each_vma(vmi, vma) {
-		if (!(vma->vm_flags & VM_MERGEABLE))
-			continue;
-		if (ksm_scan.address < vma->vm_start)
-			ksm_scan.address = vma->vm_start;
-		if (!vma->anon_vma)
-			ksm_scan.address = vma->vm_end;
-
-		while (ksm_scan.address < vma->vm_end) {
-			struct page *tmp_page = NULL;
-			struct folio_walk fw;
-			struct folio *folio;
-
-			if (ksm_test_exit(mm))
-				break;
-
-			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
-			if (folio) {
-				if (!folio_is_zone_device(folio) &&
-				     folio_test_anon(folio)) {
-					folio_get(folio);
-					tmp_page = fw.page;
-				}
-				folio_walk_end(&fw, vma);
-			}
-
-			if (tmp_page) {
-				flush_anon_page(vma, tmp_page, ksm_scan.address);
-				flush_dcache_page(tmp_page);
-				rmap_item = get_next_rmap_item(mm_slot,
-					ksm_scan.rmap_list, ksm_scan.address);
-				if (rmap_item) {
-					ksm_scan.rmap_list =
-							&rmap_item->rmap_list;
-
-					if (should_skip_rmap_item(folio, rmap_item)) {
-						folio_put(folio);
-						goto next_page;
-					}
-
-					ksm_scan.address += PAGE_SIZE;
-					*page = tmp_page;
-				} else {
-					folio_put(folio);
-				}
-				mmap_read_unlock(mm);
-				return rmap_item;
-			}
-next_page:
-			ksm_scan.address += PAGE_SIZE;
-			cond_resched();
-		}
+	struct ksm_walk_private walk_private = {
+		.page = NULL,
+		.rmap_item = NULL,
+		.mm_slot = ksm_scan.mm_slot
+	};
+	ret = walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
+	*page = walk_private.page;
+	if (ret) {
+		mmap_read_unlock(mm);
+		return walk_private.rmap_item;
 	}
 
 	if (ksm_test_exit(mm)) {
-- 
2.39.5




* Re: [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
  2025-10-14  5:58 [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
@ 2025-10-14  9:26 ` David Hildenbrand
  2025-10-14 13:36   ` Pedro Demarchi Gomes
  2025-10-14 11:40 ` [syzbot ci] " syzbot ci
  1 sibling, 1 reply; 4+ messages in thread
From: David Hildenbrand @ 2025-10-14  9:26 UTC (permalink / raw)
  To: Pedro Demarchi Gomes, Andrew Morton, craftfever
  Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel

On 14.10.25 07:58, Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
> 
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> 
> To evaluate this change, I created a test that maps a 1 TB virtual area
> where only the first and last 10 MB are populated with identical data.
> With this patch applied, KSM scanned and merged the region approximately
> seven times faster.
> 
> This problem was previously discussed in [1].
> 
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> 
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> ---
>   mm/ksm.c | 136 ++++++++++++++++++++++++++++++++-----------------------
>   1 file changed, 79 insertions(+), 57 deletions(-)
> 
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 3aed0478fdce..584fd987e8ae 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -2455,15 +2455,80 @@ static bool should_skip_rmap_item(struct folio *folio,
>   	return true;
>   }
>   
> +struct ksm_walk_private {
> +	struct page *page;
> +	struct ksm_rmap_item *rmap_item;
> +	struct ksm_mm_slot *mm_slot;
> +};
> +
> +static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
> +{
> +	struct vm_area_struct *vma = walk->vma;
> +
> +	if (!vma || !(vma->vm_flags & VM_MERGEABLE))

The anon_vma check should go in here as well.

How can we possibly get !vma?
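
I.e., something like this (untested; test_walk should only ever be
invoked with a VMA):

	static int ksm_walk_test(unsigned long addr, unsigned long next,
				 struct mm_walk *walk)
	{
		struct vm_area_struct *vma = walk->vma;

		/* skip VMAs KSM cannot merge anything in */
		if (!vma->anon_vma || !(vma->vm_flags & VM_MERGEABLE))
			return 1;
		return 0;
	}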

> +		return 1;
> +	return 0;
> +}
> +
> +static int ksm_pte_entry(pte_t *pte, unsigned long addr,
> +			    unsigned long end, struct mm_walk *walk)
> +{
> +	struct mm_struct *mm = walk->mm;
> +	struct vm_area_struct *vma = walk->vma;
> +	struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
> +	struct ksm_mm_slot *mm_slot = private->mm_slot;
> +	pte_t ptent = ptep_get(pte);
> +	struct page *page = pfn_to_online_page(pte_pfn(ptent));

Oh no.

vm_normal_page()

> +	struct ksm_rmap_item *rmap_item;
> +	struct folio *folio;
> +
> +	ksm_scan.address = addr;
> +
> +	if (ksm_test_exit(mm))
> +		return 1;
> +
> +	if (!page)
> +		return 0;
> +
> +	folio = page_folio(page);
> +	if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> +		return 0;
> +
> +	folio_get(folio);
> +
> +	flush_anon_page(vma, page, ksm_scan.address);
> +	flush_dcache_page(page);
> +	rmap_item = get_next_rmap_item(mm_slot,
> +		ksm_scan.rmap_list, ksm_scan.address);
> +	if (rmap_item) {
> +		ksm_scan.rmap_list =
> +				&rmap_item->rmap_list;
> +
> +		if (should_skip_rmap_item(folio, rmap_item)) {
> +			folio_put(folio);
> +			return 0;
> +		}
> +		ksm_scan.address = end;
> +		private->page = page;
> +	} else
> +		folio_put(folio);
> +

You're under PTL, get_next_rmap_item() will perform an allocation, so 
that won't work.

Observe how the original code worked around that by performing all magic 
outside of the PTL (folio_walk_end()).

When you switch to .pmd_entry() (see below) you will be able to handle it.

What you could also try doing is returning page+folio and letting the
caller deal with everything starting at the flush_anon_page().
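
Roughly (untested), keeping only the reference-taking under the PTL:

	/* in the entry handler, under the PTL */
	folio_get(folio);
	private->page = page;
	private->folio = folio;	/* new field */
	private->vma = vma;	/* new field */
	return 1;

and then, after walk_page_range() returns in scan_get_next_rmap_item():

	if (walk_private.folio) {
		flush_anon_page(walk_private.vma, walk_private.page,
				ksm_scan.address);
		flush_dcache_page(walk_private.page);
		rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list,
					       ksm_scan.address);
		/* ... should_skip_rmap_item() etc. as before ... */
	}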

> +	private->rmap_item = rmap_item;
> +	return 1;
> +}
> +
> +struct mm_walk_ops walk_ops = {
> +	.pte_entry = ksm_pte_entry,
> +	.test_walk = ksm_walk_test,
> +	.walk_lock = PGWALK_RDLOCK,
> +};

It's more complicated: you'd be remapping each PMD to be mapped by PTEs 
first, which is not what we want. You'll have to handle pmd_entry 
instead of pte_entry.
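
The skeleton would be something like (untested):

	static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
				 unsigned long next, struct mm_walk *walk)
	{
		spinlock_t *ptl;
		pte_t *start_pte, *pte;

		/* handle pmd_trans_huge(*pmd) here instead of splitting */

		start_pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		if (!start_pte)
			return 0;
		for (pte = start_pte; addr != next; pte++, addr += PAGE_SIZE) {
			/* per-PTE logic, minus anything that might sleep */
		}
		pte_unmap_unlock(start_pte, ptl);
		cond_resched();	/* keep a cond_resched() per PMD */
		return 0;
	}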

> +
>   static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>   {
>   	struct mm_struct *mm;
>   	struct ksm_mm_slot *mm_slot;
>   	struct mm_slot *slot;
> -	struct vm_area_struct *vma;
> -	struct ksm_rmap_item *rmap_item;
> -	struct vma_iterator vmi;
> -	int nid;
> +	int nid, ret;
>   
>   	if (list_empty(&ksm_mm_head.slot.mm_node))
>   		return NULL;
> @@ -2527,64 +2592,21 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>   
>   	slot = &mm_slot->slot;
>   	mm = slot->mm;
> -	vma_iter_init(&vmi, mm, ksm_scan.address);
>   
>   	mmap_read_lock(mm);
>   	if (ksm_test_exit(mm))
>   		goto no_vmas;
>   
> -	for_each_vma(vmi, vma) {
> -		if (!(vma->vm_flags & VM_MERGEABLE))
> -			continue;
> -		if (ksm_scan.address < vma->vm_start)
> -			ksm_scan.address = vma->vm_start;
> -		if (!vma->anon_vma)
> -			ksm_scan.address = vma->vm_end;
> -
> -		while (ksm_scan.address < vma->vm_end) {
> -			struct page *tmp_page = NULL;
> -			struct folio_walk fw;
> -			struct folio *folio;
> -
> -			if (ksm_test_exit(mm))
> -				break;
> -
> -			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> -			if (folio) {
> -				if (!folio_is_zone_device(folio) &&
> -				     folio_test_anon(folio)) {
> -					folio_get(folio);
> -					tmp_page = fw.page;
> -				}
> -				folio_walk_end(&fw, vma);
> -			}
> -
> -			if (tmp_page) {
> -				flush_anon_page(vma, tmp_page, ksm_scan.address);
> -				flush_dcache_page(tmp_page);
> -				rmap_item = get_next_rmap_item(mm_slot,
> -					ksm_scan.rmap_list, ksm_scan.address);
> -				if (rmap_item) {
> -					ksm_scan.rmap_list =
> -							&rmap_item->rmap_list;
> -
> -					if (should_skip_rmap_item(folio, rmap_item)) {
> -						folio_put(folio);
> -						goto next_page;
> -					}
> -
> -					ksm_scan.address += PAGE_SIZE;
> -					*page = tmp_page;
> -				} else {
> -					folio_put(folio);
> -				}
> -				mmap_read_unlock(mm);
> -				return rmap_item;
> -			}
> -next_page:
> -			ksm_scan.address += PAGE_SIZE;
> -			cond_resched();

You're dropping all cond_resched(), which will be a problem.

> -		}
> +	struct ksm_walk_private walk_private = {
> +		.page = NULL,
> +		.rmap_item = NULL,
> +		.mm_slot = ksm_scan.mm_slot
> +	};

empty line missing

> +	ret = walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
> +	*page = walk_private.page;
> +	if (ret) {
> +		mmap_read_unlock(mm);
> +		return walk_private.rmap_item;
>   	}
>   
>   	if (ksm_test_exit(mm)) {


-- 
Cheers

David / dhildenb




* [syzbot ci] Re: ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
  2025-10-14  5:58 [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
  2025-10-14  9:26 ` David Hildenbrand
@ 2025-10-14 11:40 ` syzbot ci
  1 sibling, 0 replies; 4+ messages in thread
From: syzbot ci @ 2025-10-14 11:40 UTC (permalink / raw)
  To: akpm, chengming.zhou, craftfever, david, linux-kernel, linux-mm,
	pedrodemargomes, xu.xin16
  Cc: syzbot, syzkaller-bugs

syzbot ci has tested the following series

[v1] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
https://lore.kernel.org/all/20251014055828.124522-1-pedrodemargomes@gmail.com
* [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

and found the following issue:
possible deadlock in __pte_offset_map_lock

Full report is available here:
https://ci.syzbot.org/series/15a341dd-23e3-4627-9fb9-00b92b7bae3f

***

possible deadlock in __pte_offset_map_lock

tree:      torvalds
URL:       https://kernel.googlesource.com/pub/scm/linux/kernel/git/torvalds/linux
base:      0d97f2067c166eb495771fede9f7b73999c67f66
arch:      amd64
compiler:  Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8
config:    https://ci.syzbot.org/builds/31dfd455-a2ef-4c41-8f8d-172612e161d9/config
C repro:   https://ci.syzbot.org/findings/5bddd702-c879-4aa6-b86d-5be2b4b3d0e2/c_repro
syz repro: https://ci.syzbot.org/findings/5bddd702-c879-4aa6-b86d-5be2b4b3d0e2/syz_repro

======================================================
WARNING: possible circular locking dependency detected
syzkaller #0 Not tainted
------------------------------------------------------
ksmd/41 is trying to acquire lock:
ffffffff8e245b20 (fs_reclaim){+.+.}-{0:0}, at: might_alloc include/linux/sched/mm.h:318 [inline]
ffffffff8e245b20 (fs_reclaim){+.+.}-{0:0}, at: slab_pre_alloc_hook mm/slub.c:4897 [inline]
ffffffff8e245b20 (fs_reclaim){+.+.}-{0:0}, at: slab_alloc_node mm/slub.c:5221 [inline]
ffffffff8e245b20 (fs_reclaim){+.+.}-{0:0}, at: kmem_cache_alloc_noprof+0x45/0x6e0 mm/slub.c:5252

but task is already holding lock:
ffff888112eeb8b8 (ptlock_ptr(ptdesc)#2){+.+.}-{3:3}, at: spin_lock include/linux/spinlock.h:351 [inline]
ffff888112eeb8b8 (ptlock_ptr(ptdesc)#2){+.+.}-{3:3}, at: __pte_offset_map_lock+0x13e/0x210 mm/pgtable-generic.c:401

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #3 (ptlock_ptr(ptdesc)#2){+.+.}-{3:3}:
       lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
       __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline]
       _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154
       spin_lock include/linux/spinlock.h:351 [inline]
       map_pte mm/page_vma_mapped.c:72 [inline]
       page_vma_mapped_walk+0xeea/0x20f0 mm/page_vma_mapped.c:291
       try_to_migrate_one+0x5d8/0x34d0 mm/rmap.c:2339
       rmap_walk_anon+0x553/0x730 mm/rmap.c:2855
       try_to_migrate+0x319/0x3d0 mm/rmap.c:-1
       migrate_folio_unmap mm/migrate.c:1319 [inline]
       migrate_pages_batch+0x1432/0x35e0 mm/migrate.c:1882
       migrate_pages+0x1d5a/0x2930 mm/migrate.c:2103
       migrate_misplaced_folio+0x28c/0x840 mm/migrate.c:2724
       do_numa_page mm/memory.c:6004 [inline]
       handle_pte_fault mm/memory.c:6183 [inline]
       __handle_mm_fault+0x3bab/0x5400 mm/memory.c:6318
       handle_mm_fault+0x40a/0x8e0 mm/memory.c:6487
       do_user_addr_fault+0x764/0x1380 arch/x86/mm/fault.c:1387
       handle_page_fault arch/x86/mm/fault.c:1476 [inline]
       exc_page_fault+0x82/0x100 arch/x86/mm/fault.c:1532
       asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
       __get_user_8+0x14/0x30 arch/x86/lib/getuser.S:100
       rseq_get_rseq_cs_ptr_val kernel/rseq.c:248 [inline]
       rseq_get_rseq_cs kernel/rseq.c:270 [inline]
       rseq_ip_fixup kernel/rseq.c:390 [inline]
       __rseq_handle_notify_resume+0x13d/0x1220 kernel/rseq.c:438
       rseq_handle_notify_resume include/linux/rseq.h:44 [inline]
       resume_user_mode_work include/linux/resume_user_mode.h:62 [inline]
       exit_to_user_mode_loop+0xbf/0x130 kernel/entry/common.c:43
       exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline]
       syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline]
       syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline]
       do_syscall_64+0x2bd/0xfa0 arch/x86/entry/syscall_64.c:100
       entry_SYSCALL_64_after_hwframe+0x77/0x7f

-> #2 (&anon_vma->rwsem){++++}-{4:4}:
       lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
       down_write+0x96/0x1f0 kernel/locking/rwsem.c:1590
       anon_vma_lock_write include/linux/rmap.h:122 [inline]
       vma_prepare+0x25c/0x4b0 mm/vma.c:309
       __split_vma+0x76e/0xa00 mm/vma.c:550
       split_vma mm/vma.c:598 [inline]
       vma_modify+0x13b3/0x1970 mm/vma.c:1631
       vma_modify_flags+0x1e8/0x230 mm/vma.c:1649
       mprotect_fixup+0x407/0x9c0 mm/mprotect.c:816
       do_mprotect_pkey+0x8c5/0xcd0 mm/mprotect.c:990
       __do_sys_mprotect mm/mprotect.c:1011 [inline]
       __se_sys_mprotect mm/mprotect.c:1008 [inline]
       __x64_sys_mprotect+0x80/0x90 mm/mprotect.c:1008
       do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
       do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94
       entry_SYSCALL_64_after_hwframe+0x77/0x7f

-> #1 (&mapping->i_mmap_rwsem){++++}-{4:4}:
       lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
       down_write+0x96/0x1f0 kernel/locking/rwsem.c:1590
       i_mmap_lock_write include/linux/fs.h:548 [inline]
       dma_resv_lockdep+0x2f6/0x5b0 drivers/dma-buf/dma-resv.c:797
       do_one_initcall+0x236/0x820 init/main.c:1283
       do_initcall_level+0x104/0x190 init/main.c:1345
       do_initcalls+0x59/0xa0 init/main.c:1361
       kernel_init_freeable+0x334/0x4b0 init/main.c:1593
       kernel_init+0x1d/0x1d0 init/main.c:1483
       ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158
       ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245

-> #0 (fs_reclaim){+.+.}-{0:0}:
       check_prev_add kernel/locking/lockdep.c:3165 [inline]
       check_prevs_add kernel/locking/lockdep.c:3284 [inline]
       validate_chain+0xb9b/0x2140 kernel/locking/lockdep.c:3908
       __lock_acquire+0xab9/0xd20 kernel/locking/lockdep.c:5237
       lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
       __fs_reclaim_acquire mm/page_alloc.c:4269 [inline]
       fs_reclaim_acquire+0x72/0x100 mm/page_alloc.c:4283
       might_alloc include/linux/sched/mm.h:318 [inline]
       slab_pre_alloc_hook mm/slub.c:4897 [inline]
       slab_alloc_node mm/slub.c:5221 [inline]
       kmem_cache_alloc_noprof+0x45/0x6e0 mm/slub.c:5252
       alloc_rmap_item mm/ksm.c:565 [inline]
       get_next_rmap_item mm/ksm.c:2378 [inline]
       ksm_pte_entry+0x414/0xa60 mm/ksm.c:2501
       walk_pte_range_inner+0x1ba/0x380 mm/pagewalk.c:50
       walk_pte_range mm/pagewalk.c:88 [inline]
       walk_pmd_range mm/pagewalk.c:155 [inline]
       walk_pud_range mm/pagewalk.c:224 [inline]
       walk_p4d_range mm/pagewalk.c:262 [inline]
       walk_pgd_range+0x12bf/0x1d40 mm/pagewalk.c:303
       __walk_page_range+0x14c/0x710 mm/pagewalk.c:410
       walk_page_range_mm+0x454/0x660 mm/pagewalk.c:506
       scan_get_next_rmap_item mm/ksm.c:2605 [inline]
       ksm_do_scan+0x60c/0x5c10 mm/ksm.c:2681
       ksm_scan_thread+0x10b/0x4b0 mm/ksm.c:2706
       kthread+0x711/0x8a0 kernel/kthread.c:463
       ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158
       ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245

other info that might help us debug this:

Chain exists of:
  fs_reclaim --> &anon_vma->rwsem --> ptlock_ptr(ptdesc)#2

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(ptlock_ptr(ptdesc)#2);
                               lock(&anon_vma->rwsem);
                               lock(ptlock_ptr(ptdesc)#2);
  lock(fs_reclaim);

 *** DEADLOCK ***

4 locks held by ksmd/41:
 #0: ffffffff8e259b88 (ksm_thread_mutex){+.+.}-{4:4}, at: ksm_scan_thread+0xc8/0x4b0 mm/ksm.c:2703
 #1: ffff8881703b9760 (&mm->mmap_lock){++++}-{4:4}, at: mmap_read_lock include/linux/mmap_lock.h:368 [inline]
 #1: ffff8881703b9760 (&mm->mmap_lock){++++}-{4:4}, at: scan_get_next_rmap_item mm/ksm.c:2596 [inline]
 #1: ffff8881703b9760 (&mm->mmap_lock){++++}-{4:4}, at: ksm_do_scan+0x578/0x5c10 mm/ksm.c:2681
 #2: ffffffff8e13d320 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:331 [inline]
 #2: ffffffff8e13d320 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:867 [inline]
 #2: ffffffff8e13d320 (rcu_read_lock){....}-{1:3}, at: ___pte_offset_map+0x29/0x250 mm/pgtable-generic.c:286
 #3: ffff888112eeb8b8 (ptlock_ptr(ptdesc)#2){+.+.}-{3:3}, at: spin_lock include/linux/spinlock.h:351 [inline]
 #3: ffff888112eeb8b8 (ptlock_ptr(ptdesc)#2){+.+.}-{3:3}, at: __pte_offset_map_lock+0x13e/0x210 mm/pgtable-generic.c:401

stack backtrace:
CPU: 1 UID: 0 PID: 41 Comm: ksmd Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120
 print_circular_bug+0x2ee/0x310 kernel/locking/lockdep.c:2043
 check_noncircular+0x134/0x160 kernel/locking/lockdep.c:2175
 check_prev_add kernel/locking/lockdep.c:3165 [inline]
 check_prevs_add kernel/locking/lockdep.c:3284 [inline]
 validate_chain+0xb9b/0x2140 kernel/locking/lockdep.c:3908
 __lock_acquire+0xab9/0xd20 kernel/locking/lockdep.c:5237
 lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
 __fs_reclaim_acquire mm/page_alloc.c:4269 [inline]
 fs_reclaim_acquire+0x72/0x100 mm/page_alloc.c:4283
 might_alloc include/linux/sched/mm.h:318 [inline]
 slab_pre_alloc_hook mm/slub.c:4897 [inline]
 slab_alloc_node mm/slub.c:5221 [inline]
 kmem_cache_alloc_noprof+0x45/0x6e0 mm/slub.c:5252
 alloc_rmap_item mm/ksm.c:565 [inline]
 get_next_rmap_item mm/ksm.c:2378 [inline]
 ksm_pte_entry+0x414/0xa60 mm/ksm.c:2501
 walk_pte_range_inner+0x1ba/0x380 mm/pagewalk.c:50
 walk_pte_range mm/pagewalk.c:88 [inline]
 walk_pmd_range mm/pagewalk.c:155 [inline]
 walk_pud_range mm/pagewalk.c:224 [inline]
 walk_p4d_range mm/pagewalk.c:262 [inline]
 walk_pgd_range+0x12bf/0x1d40 mm/pagewalk.c:303
 __walk_page_range+0x14c/0x710 mm/pagewalk.c:410
 walk_page_range_mm+0x454/0x660 mm/pagewalk.c:506
 scan_get_next_rmap_item mm/ksm.c:2605 [inline]
 ksm_do_scan+0x60c/0x5c10 mm/ksm.c:2681
 ksm_scan_thread+0x10b/0x4b0 mm/ksm.c:2706
 kthread+0x711/0x8a0 kernel/kthread.c:463
 ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>
BUG: sleeping function called from invalid context at ./include/linux/sched/mm.h:321
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 41, name: ksmd
preempt_count: 1, expected: 0
RCU nest depth: 1, expected: 0
INFO: lockdep is turned off.
Preemption disabled at:
[<0000000000000000>] 0x0
CPU: 1 UID: 0 PID: 41 Comm: ksmd Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120
 __might_resched+0x495/0x610 kernel/sched/core.c:8925
 might_alloc include/linux/sched/mm.h:321 [inline]
 slab_pre_alloc_hook mm/slub.c:4897 [inline]
 slab_alloc_node mm/slub.c:5221 [inline]
 kmem_cache_alloc_noprof+0x65/0x6e0 mm/slub.c:5252
 alloc_rmap_item mm/ksm.c:565 [inline]
 get_next_rmap_item mm/ksm.c:2378 [inline]
 ksm_pte_entry+0x414/0xa60 mm/ksm.c:2501
 walk_pte_range_inner+0x1ba/0x380 mm/pagewalk.c:50
 walk_pte_range mm/pagewalk.c:88 [inline]
 walk_pmd_range mm/pagewalk.c:155 [inline]
 walk_pud_range mm/pagewalk.c:224 [inline]
 walk_p4d_range mm/pagewalk.c:262 [inline]
 walk_pgd_range+0x12bf/0x1d40 mm/pagewalk.c:303
 __walk_page_range+0x14c/0x710 mm/pagewalk.c:410
 walk_page_range_mm+0x454/0x660 mm/pagewalk.c:506
 scan_get_next_rmap_item mm/ksm.c:2605 [inline]
 ksm_do_scan+0x60c/0x5c10 mm/ksm.c:2681
 ksm_scan_thread+0x10b/0x4b0 mm/ksm.c:2706
 kthread+0x711/0x8a0 kernel/kthread.c:463
 ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>
[ The same BUG splat repeats several more times with identical call traces. ]


***

If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
  Tested-by: syzbot@syzkaller.appspotmail.com

---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.



* Re: [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
  2025-10-14  9:26 ` David Hildenbrand
@ 2025-10-14 13:36   ` Pedro Demarchi Gomes
  0 siblings, 0 replies; 4+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-14 13:36 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Andrew Morton, craftfever, Xu Xin, Chengming Zhou, linux-mm,
	linux-kernel

On Tue, Oct 14, 2025 at 11:26:06AM +0200, David Hildenbrand wrote:
> On 14.10.25 07:58, Pedro Demarchi Gomes wrote:
> > Currently, scan_get_next_rmap_item() walks every page address in a VMA
> > to locate mergeable pages. This becomes highly inefficient when scanning
> > large virtual memory areas that contain mostly unmapped regions.
> > 
> > This patch replaces the per-address lookup with a range walk using
> > walk_page_range(). The range walker allows KSM to skip over entire
> > unmapped holes in a VMA, avoiding unnecessary lookups.
> > 
> > To evaluate this change, I created a test that maps a 1 TB virtual area
> > where only the first and last 10 MB are populated with identical data.
> > With this patch applied, KSM scanned and merged the region approximately
> > seven times faster.
> > 
> > This problem was previously discussed in [1].
> > 
> > [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> > 
> > Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> > ---
> >   mm/ksm.c | 136 ++++++++++++++++++++++++++++++++-----------------------
> >   1 file changed, 79 insertions(+), 57 deletions(-)
> > 
> > diff --git a/mm/ksm.c b/mm/ksm.c
> > index 3aed0478fdce..584fd987e8ae 100644
> > --- a/mm/ksm.c
> > +++ b/mm/ksm.c
> > @@ -2455,15 +2455,80 @@ static bool should_skip_rmap_item(struct folio *folio,
> >   	return true;
> >   }
> > +struct ksm_walk_private {
> > +	struct page *page;
> > +	struct ksm_rmap_item *rmap_item;
> > +	struct ksm_mm_slot *mm_slot;
> > +};
> > +
> > +static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
> > +{
> > +	struct vm_area_struct *vma = walk->vma;
> > +
> > +	if (!vma || !(vma->vm_flags & VM_MERGEABLE))
> 
> The anon_vma check should go in here as well.
> 
> How can we possibly get !vma?
> 
> > +		return 1;
> > +	return 0;
> > +}
> > +
> > +static int ksm_pte_entry(pte_t *pte, unsigned long addr,
> > +			    unsigned long end, struct mm_walk *walk)
> > +{
> > +	struct mm_struct *mm = walk->mm;
> > +	struct vm_area_struct *vma = walk->vma;
> > +	struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
> > +	struct ksm_mm_slot *mm_slot = private->mm_slot;
> > +	pte_t ptent = ptep_get(pte);
> > +	struct page *page = pfn_to_online_page(pte_pfn(ptent));
> 
> Oh no.
> 
> vm_normal_page()
> 
> > +	struct ksm_rmap_item *rmap_item;
> > +	struct folio *folio;
> > +
> > +	ksm_scan.address = addr;
> > +
> > +	if (ksm_test_exit(mm))
> > +		return 1;
> > +
> > +	if (!page)
> > +		return 0;
> > +
> > +	folio = page_folio(page);
> > +	if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> > +		return 0;
> > +
> > +	folio_get(folio);
> > +
> > +	flush_anon_page(vma, page, ksm_scan.address);
> > +	flush_dcache_page(page);
> > +	rmap_item = get_next_rmap_item(mm_slot,
> > +		ksm_scan.rmap_list, ksm_scan.address);
> > +	if (rmap_item) {
> > +		ksm_scan.rmap_list =
> > +				&rmap_item->rmap_list;
> > +
> > +		if (should_skip_rmap_item(folio, rmap_item)) {
> > +			folio_put(folio);
> > +			return 0;
> > +		}
> > +		ksm_scan.address = end;
> > +		private->page = page;
> > +	} else
> > +		folio_put(folio);
> > +
> 
> You're under PTL, get_next_rmap_item() will perform an allocation, so that
> won't work.
> 
> Observe how the original code worked around that by performing all magic
> outside of the PTL (folio_walk_end()).
> 
> When you switch to .pmd_entry() (see below) you will be able to handle it.
> 
> What you could also try doing is returning page+folio and letting the caller
> deal with everything starting at the flush_anon_page().
> 
> > +	private->rmap_item = rmap_item;
> > +	return 1;
> > +}
> > +
> > +struct mm_walk_ops walk_ops = {
> > +	.pte_entry = ksm_pte_entry,
> > +	.test_walk = ksm_walk_test,
> > +	.walk_lock = PGWALK_RDLOCK,
> > +};
> 
> It's more complicated: you'd be remapping each PMD to be mapped by PTEs
> first, which is not what we want. You'll have to handle pmd_entry instead of
> pte_entry.
> 
> > +
> >   static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> >   {
> >   	struct mm_struct *mm;
> >   	struct ksm_mm_slot *mm_slot;
> >   	struct mm_slot *slot;
> > -	struct vm_area_struct *vma;
> > -	struct ksm_rmap_item *rmap_item;
> > -	struct vma_iterator vmi;
> > -	int nid;
> > +	int nid, ret;
> >   	if (list_empty(&ksm_mm_head.slot.mm_node))
> >   		return NULL;
> > @@ -2527,64 +2592,21 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> >   	slot = &mm_slot->slot;
> >   	mm = slot->mm;
> > -	vma_iter_init(&vmi, mm, ksm_scan.address);
> >   	mmap_read_lock(mm);
> >   	if (ksm_test_exit(mm))
> >   		goto no_vmas;
> > -	for_each_vma(vmi, vma) {
> > -		if (!(vma->vm_flags & VM_MERGEABLE))
> > -			continue;
> > -		if (ksm_scan.address < vma->vm_start)
> > -			ksm_scan.address = vma->vm_start;
> > -		if (!vma->anon_vma)
> > -			ksm_scan.address = vma->vm_end;
> > -
> > -		while (ksm_scan.address < vma->vm_end) {
> > -			struct page *tmp_page = NULL;
> > -			struct folio_walk fw;
> > -			struct folio *folio;
> > -
> > -			if (ksm_test_exit(mm))
> > -				break;
> > -
> > -			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> > -			if (folio) {
> > -				if (!folio_is_zone_device(folio) &&
> > -				     folio_test_anon(folio)) {
> > -					folio_get(folio);
> > -					tmp_page = fw.page;
> > -				}
> > -				folio_walk_end(&fw, vma);
> > -			}
> > -
> > -			if (tmp_page) {
> > -				flush_anon_page(vma, tmp_page, ksm_scan.address);
> > -				flush_dcache_page(tmp_page);
> > -				rmap_item = get_next_rmap_item(mm_slot,
> > -					ksm_scan.rmap_list, ksm_scan.address);
> > -				if (rmap_item) {
> > -					ksm_scan.rmap_list =
> > -							&rmap_item->rmap_list;
> > -
> > -					if (should_skip_rmap_item(folio, rmap_item)) {
> > -						folio_put(folio);
> > -						goto next_page;
> > -					}
> > -
> > -					ksm_scan.address += PAGE_SIZE;
> > -					*page = tmp_page;
> > -				} else {
> > -					folio_put(folio);
> > -				}
> > -				mmap_read_unlock(mm);
> > -				return rmap_item;
> > -			}
> > -next_page:
> > -			ksm_scan.address += PAGE_SIZE;
> > -			cond_resched();
> 
> You're dropping all cond_resched(), which will be a problem.
> 
> > -		}
> > +	struct ksm_walk_private walk_private = {
> > +		.page = NULL,
> > +		.rmap_item = NULL,
> > +		.mm_slot = ksm_scan.mm_slot
> > +	};
> 
> empty line missing
> 
> > +	ret = walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
> > +	*page = walk_private.page;
> > +	if (ret) {
> > +		mmap_read_unlock(mm);
> > +		return walk_private.rmap_item;
> >   	}
> >   	if (ksm_test_exit(mm)) {
> 
> 
> -- 
> Cheers
> 
> David / dhildenb
> 

Thanks for the explanations, I will send a v2 shortly.

