* [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
@ 2025-10-14 5:58 Pedro Demarchi Gomes
2025-10-14 9:26 ` David Hildenbrand
2025-10-14 11:40 ` [syzbot ci] " syzbot ci
0 siblings, 2 replies; 4+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-14 5:58 UTC (permalink / raw)
To: Andrew Morton, David Hildenbrand, craftfever
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel, Pedro Demarchi Gomes
Currently, scan_get_next_rmap_item() walks every page address in a VMA
to locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions.
This patch replaces the per-address lookup with a range walk using
walk_page_range(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups.
To evaluate this change, I created a test that maps a 1 TB virtual area
where only the first and last 10 MB are populated with identical data.
With this patch applied, KSM scanned and merged the region approximately
seven times faster.
This problem was previously discussed in [1].
[1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
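The test program itself is not part of this patch; a minimal reproducer along the lines described above (sizes and fill values here are illustrative) looks roughly like:

/* Sketch: 1 TB mergeable anonymous mapping, only the edges populated. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1UL << 40;          /* 1 TB of virtual address space */
	size_t populated = 10UL << 20;   /* 10 MB at each end */
	char *area;

	area = mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
	if (area == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Identical data at both ends; everything in between stays unpopulated. */
	memset(area, 0x5a, populated);
	memset(area + len - populated, 0x5a, populated);

	if (madvise(area, len, MADV_MERGEABLE)) {
		perror("madvise(MADV_MERGEABLE)");
		return 1;
	}
	/* Enable ksmd via /sys/kernel/mm/ksm/run and time how long merging takes. */
	pause();
	return 0;
}

Merging progress can then be observed through /sys/kernel/mm/ksm/pages_sharing while timing the scan.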
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
mm/ksm.c | 136 ++++++++++++++++++++++++++++++++-----------------------
1 file changed, 79 insertions(+), 57 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c
index 3aed0478fdce..584fd987e8ae 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,15 +2455,80 @@ static bool should_skip_rmap_item(struct folio *folio,
return true;
}
+struct ksm_walk_private {
+ struct page *page;
+ struct ksm_rmap_item *rmap_item;
+ struct ksm_mm_slot *mm_slot;
+};
+
+static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+
+ if (!vma || !(vma->vm_flags & VM_MERGEABLE))
+ return 1;
+ return 0;
+}
+
+static int ksm_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct mm_struct *mm = walk->mm;
+ struct vm_area_struct *vma = walk->vma;
+ struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
+ struct ksm_mm_slot *mm_slot = private->mm_slot;
+ pte_t ptent = ptep_get(pte);
+ struct page *page = pfn_to_online_page(pte_pfn(ptent));
+ struct ksm_rmap_item *rmap_item;
+ struct folio *folio;
+
+ ksm_scan.address = addr;
+
+ if (ksm_test_exit(mm))
+ return 1;
+
+ if (!page)
+ return 0;
+
+ folio = page_folio(page);
+ if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+ return 0;
+
+ folio_get(folio);
+
+ flush_anon_page(vma, page, ksm_scan.address);
+ flush_dcache_page(page);
+ rmap_item = get_next_rmap_item(mm_slot,
+ ksm_scan.rmap_list, ksm_scan.address);
+ if (rmap_item) {
+ ksm_scan.rmap_list =
+ &rmap_item->rmap_list;
+
+ if (should_skip_rmap_item(folio, rmap_item)) {
+ folio_put(folio);
+ return 0;
+ }
+ ksm_scan.address = end;
+ private->page = page;
+ } else
+ folio_put(folio);
+
+ private->rmap_item = rmap_item;
+ return 1;
+}
+
+struct mm_walk_ops walk_ops = {
+ .pte_entry = ksm_pte_entry,
+ .test_walk = ksm_walk_test,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
struct ksm_mm_slot *mm_slot;
struct mm_slot *slot;
- struct vm_area_struct *vma;
- struct ksm_rmap_item *rmap_item;
- struct vma_iterator vmi;
- int nid;
+ int nid, ret;
if (list_empty(&ksm_mm_head.slot.mm_node))
return NULL;
@@ -2527,64 +2592,21 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
slot = &mm_slot->slot;
mm = slot->mm;
- vma_iter_init(&vmi, mm, ksm_scan.address);
mmap_read_lock(mm);
if (ksm_test_exit(mm))
goto no_vmas;
- for_each_vma(vmi, vma) {
- if (!(vma->vm_flags & VM_MERGEABLE))
- continue;
- if (ksm_scan.address < vma->vm_start)
- ksm_scan.address = vma->vm_start;
- if (!vma->anon_vma)
- ksm_scan.address = vma->vm_end;
-
- while (ksm_scan.address < vma->vm_end) {
- struct page *tmp_page = NULL;
- struct folio_walk fw;
- struct folio *folio;
-
- if (ksm_test_exit(mm))
- break;
-
- folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
- if (folio) {
- if (!folio_is_zone_device(folio) &&
- folio_test_anon(folio)) {
- folio_get(folio);
- tmp_page = fw.page;
- }
- folio_walk_end(&fw, vma);
- }
-
- if (tmp_page) {
- flush_anon_page(vma, tmp_page, ksm_scan.address);
- flush_dcache_page(tmp_page);
- rmap_item = get_next_rmap_item(mm_slot,
- ksm_scan.rmap_list, ksm_scan.address);
- if (rmap_item) {
- ksm_scan.rmap_list =
- &rmap_item->rmap_list;
-
- if (should_skip_rmap_item(folio, rmap_item)) {
- folio_put(folio);
- goto next_page;
- }
-
- ksm_scan.address += PAGE_SIZE;
- *page = tmp_page;
- } else {
- folio_put(folio);
- }
- mmap_read_unlock(mm);
- return rmap_item;
- }
-next_page:
- ksm_scan.address += PAGE_SIZE;
- cond_resched();
- }
+ struct ksm_walk_private walk_private = {
+ .page = NULL,
+ .rmap_item = NULL,
+ .mm_slot = ksm_scan.mm_slot
+ };
+ ret = walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
+ *page = walk_private.page;
+ if (ret) {
+ mmap_read_unlock(mm);
+ return walk_private.rmap_item;
}
if (ksm_test_exit(mm)) {
--
2.39.5
* Re: [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
2025-10-14 5:58 [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
@ 2025-10-14 9:26 ` David Hildenbrand
2025-10-14 13:36 ` Pedro Demarchi Gomes
2025-10-14 11:40 ` [syzbot ci] " syzbot ci
1 sibling, 1 reply; 4+ messages in thread
From: David Hildenbrand @ 2025-10-14 9:26 UTC (permalink / raw)
To: Pedro Demarchi Gomes, Andrew Morton, craftfever
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel
On 14.10.25 07:58, Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
>
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
>
> To evaluate this change, I created a test that maps a 1 TB virtual area
> where only the first and last 10 MB are populated with identical data.
> With this patch applied, KSM scanned and merged the region approximately
> seven times faster.
>
> This problem was previously discussed in [1].
>
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
>
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> ---
> mm/ksm.c | 136 ++++++++++++++++++++++++++++++++-----------------------
> 1 file changed, 79 insertions(+), 57 deletions(-)
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 3aed0478fdce..584fd987e8ae 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -2455,15 +2455,80 @@ static bool should_skip_rmap_item(struct folio *folio,
> return true;
> }
>
> +struct ksm_walk_private {
> + struct page *page;
> + struct ksm_rmap_item *rmap_item;
> + struct ksm_mm_slot *mm_slot;
> +};
> +
> +static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
> +{
> + struct vm_area_struct *vma = walk->vma;
> +
> + if (!vma || !(vma->vm_flags & VM_MERGEABLE))
The anon_vma check should go in here as well.
How can we possibly get !vma?
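(For illustration only, not from a posted revision: folding the anon_vma check in, and dropping the !vma test since test_walk is only invoked with a VMA, might look like this.)

static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;

	/* Skip VMAs that are not mergeable or have no anon_vma yet. */
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return 1;
	return 0;
}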
> + return 1;
> + return 0;
> +}
> +
> +static int ksm_pte_entry(pte_t *pte, unsigned long addr,
> + unsigned long end, struct mm_walk *walk)
> +{
> + struct mm_struct *mm = walk->mm;
> + struct vm_area_struct *vma = walk->vma;
> + struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
> + struct ksm_mm_slot *mm_slot = private->mm_slot;
> + pte_t ptent = ptep_get(pte);
> + struct page *page = pfn_to_online_page(pte_pfn(ptent));
Oh no.
vm_normal_page()
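(That is, presumably something along these lines, which also filters out special mappings instead of blindly trusting the PFN:)

	pte_t ptent = ptep_get(pte);
	struct page *page = vm_normal_page(vma, addr, ptent);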
> + struct ksm_rmap_item *rmap_item;
> + struct folio *folio;
> +
> + ksm_scan.address = addr;
> +
> + if (ksm_test_exit(mm))
> + return 1;
> +
> + if (!page)
> + return 0;
> +
> + folio = page_folio(page);
> + if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> + return 0;
> +
> + folio_get(folio);
> +
> + flush_anon_page(vma, page, ksm_scan.address);
> + flush_dcache_page(page);
> + rmap_item = get_next_rmap_item(mm_slot,
> + ksm_scan.rmap_list, ksm_scan.address);
> + if (rmap_item) {
> + ksm_scan.rmap_list =
> + &rmap_item->rmap_list;
> +
> + if (should_skip_rmap_item(folio, rmap_item)) {
> + folio_put(folio);
> + return 0;
> + }
> + ksm_scan.address = end;
> + private->page = page;
> + } else
> + folio_put(folio);
> +
You're under PTL, get_next_rmap_item() will perform an allocation, so
that won't work.
Observe how the original code worked around that by performing all magic
outside of the PTL (folio_walk_end()).
When you switch to .pmd_entry() (see below) you will be able to handle it.
What you could also try doing is returning page+folio and letting the
caller deal with everything starting at the flush_anon_page().
> + private->rmap_item = rmap_item;
> + return 1;
> +}
> +
> +struct mm_walk_ops walk_ops = {
> + .pte_entry = ksm_pte_entry,
> + .test_walk = ksm_walk_test,
> + .walk_lock = PGWALK_RDLOCK,
> +};
It's more complicated: you'd be remapping each PMD to be mapped by PTEs
first, which is not what we want. You'll have to handle pmd_entry
instead of pte_entry.
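(A rough sketch of that shape, for illustration only and not the posted v2: the handler takes the PTL itself, only records the page/folio, and leaves flush_anon_page()/get_next_rmap_item() to the caller outside the lock. It assumes ksm_walk_private grows folio and addr fields; THP handling is omitted.)

static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
			 unsigned long next, struct mm_walk *walk)
{
	struct ksm_walk_private *private = walk->private;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_pte, *pte;
	spinlock_t *ptl;
	int ret = 0;

	if (ksm_test_exit(walk->mm))
		return 1;

	/* A complete version would handle pmd_trans_huge(*pmd) here rather
	 * than letting the generic walker split the PMD. */

	start_pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	for (pte = start_pte; addr < next; pte++, addr += PAGE_SIZE) {
		struct page *page = vm_normal_page(vma, addr, ptep_get(pte));
		struct folio *folio;

		if (!page)
			continue;
		folio = page_folio(page);
		if (folio_is_zone_device(folio) || !folio_test_anon(folio))
			continue;
		/* Only take a reference here; everything that may sleep or
		 * allocate runs in the caller, after the PTL is dropped. */
		folio_get(folio);
		private->page = page;
		private->folio = folio;
		private->addr = addr;
		ret = 1;
		break;
	}
	pte_unmap_unlock(start_pte, ptl);
	cond_resched();
	return ret;
}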
> +
> static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> {
> struct mm_struct *mm;
> struct ksm_mm_slot *mm_slot;
> struct mm_slot *slot;
> - struct vm_area_struct *vma;
> - struct ksm_rmap_item *rmap_item;
> - struct vma_iterator vmi;
> - int nid;
> + int nid, ret;
>
> if (list_empty(&ksm_mm_head.slot.mm_node))
> return NULL;
> @@ -2527,64 +2592,21 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>
> slot = &mm_slot->slot;
> mm = slot->mm;
> - vma_iter_init(&vmi, mm, ksm_scan.address);
>
> mmap_read_lock(mm);
> if (ksm_test_exit(mm))
> goto no_vmas;
>
> - for_each_vma(vmi, vma) {
> - if (!(vma->vm_flags & VM_MERGEABLE))
> - continue;
> - if (ksm_scan.address < vma->vm_start)
> - ksm_scan.address = vma->vm_start;
> - if (!vma->anon_vma)
> - ksm_scan.address = vma->vm_end;
> -
> - while (ksm_scan.address < vma->vm_end) {
> - struct page *tmp_page = NULL;
> - struct folio_walk fw;
> - struct folio *folio;
> -
> - if (ksm_test_exit(mm))
> - break;
> -
> - folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> - if (folio) {
> - if (!folio_is_zone_device(folio) &&
> - folio_test_anon(folio)) {
> - folio_get(folio);
> - tmp_page = fw.page;
> - }
> - folio_walk_end(&fw, vma);
> - }
> -
> - if (tmp_page) {
> - flush_anon_page(vma, tmp_page, ksm_scan.address);
> - flush_dcache_page(tmp_page);
> - rmap_item = get_next_rmap_item(mm_slot,
> - ksm_scan.rmap_list, ksm_scan.address);
> - if (rmap_item) {
> - ksm_scan.rmap_list =
> - &rmap_item->rmap_list;
> -
> - if (should_skip_rmap_item(folio, rmap_item)) {
> - folio_put(folio);
> - goto next_page;
> - }
> -
> - ksm_scan.address += PAGE_SIZE;
> - *page = tmp_page;
> - } else {
> - folio_put(folio);
> - }
> - mmap_read_unlock(mm);
> - return rmap_item;
> - }
> -next_page:
> - ksm_scan.address += PAGE_SIZE;
> - cond_resched();
You're dropping all cond_resched(), which will be a problem.
> - }
> + struct ksm_walk_private walk_private = {
> + .page = NULL,
> + .rmap_item = NULL,
> + .mm_slot = ksm_scan.mm_slot
> + };
empty line missing
> + ret = walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
> + *page = walk_private.page;
> + if (ret) {
> + mmap_read_unlock(mm);
> + return walk_private.rmap_item;
> }
>
> if (ksm_test_exit(mm)) {
--
Cheers
David / dhildenb
* [syzbot ci] Re: ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
2025-10-14 5:58 [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item Pedro Demarchi Gomes
2025-10-14 9:26 ` David Hildenbrand
@ 2025-10-14 11:40 ` syzbot ci
1 sibling, 0 replies; 4+ messages in thread
From: syzbot ci @ 2025-10-14 11:40 UTC (permalink / raw)
To: akpm, chengming.zhou, craftfever, david, linux-kernel, linux-mm,
pedrodemargomes, xu.xin16
Cc: syzbot, syzkaller-bugs
syzbot ci has tested the following series
[v1] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
https://lore.kernel.org/all/20251014055828.124522-1-pedrodemargomes@gmail.com
* [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
and found the following issue:
possible deadlock in __pte_offset_map_lock
Full report is available here:
https://ci.syzbot.org/series/15a341dd-23e3-4627-9fb9-00b92b7bae3f
***
possible deadlock in __pte_offset_map_lock
tree: torvalds
URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/torvalds/linux
base: 0d97f2067c166eb495771fede9f7b73999c67f66
arch: amd64
compiler: Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8
config: https://ci.syzbot.org/builds/31dfd455-a2ef-4c41-8f8d-172612e161d9/config
C repro: https://ci.syzbot.org/findings/5bddd702-c879-4aa6-b86d-5be2b4b3d0e2/c_repro
syz repro: https://ci.syzbot.org/findings/5bddd702-c879-4aa6-b86d-5be2b4b3d0e2/syz_repro
======================================================
WARNING: possible circular locking dependency detected
syzkaller #0 Not tainted
------------------------------------------------------
ksmd/41 is trying to acquire lock:
ffffffff8e245b20 (fs_reclaim){+.+.}-{0:0}, at: might_alloc include/linux/sched/mm.h:318 [inline]
ffffffff8e245b20 (fs_reclaim){+.+.}-{0:0}, at: slab_pre_alloc_hook mm/slub.c:4897 [inline]
ffffffff8e245b20 (fs_reclaim){+.+.}-{0:0}, at: slab_alloc_node mm/slub.c:5221 [inline]
ffffffff8e245b20 (fs_reclaim){+.+.}-{0:0}, at: kmem_cache_alloc_noprof+0x45/0x6e0 mm/slub.c:5252
but task is already holding lock:
ffff888112eeb8b8 (ptlock_ptr(ptdesc)#2){+.+.}-{3:3}, at: spin_lock include/linux/spinlock.h:351 [inline]
ffff888112eeb8b8 (ptlock_ptr(ptdesc)#2){+.+.}-{3:3}, at: __pte_offset_map_lock+0x13e/0x210 mm/pgtable-generic.c:401
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #3 (ptlock_ptr(ptdesc)#2){+.+.}-{3:3}:
lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
__raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline]
_raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154
spin_lock include/linux/spinlock.h:351 [inline]
map_pte mm/page_vma_mapped.c:72 [inline]
page_vma_mapped_walk+0xeea/0x20f0 mm/page_vma_mapped.c:291
try_to_migrate_one+0x5d8/0x34d0 mm/rmap.c:2339
rmap_walk_anon+0x553/0x730 mm/rmap.c:2855
try_to_migrate+0x319/0x3d0 mm/rmap.c:-1
migrate_folio_unmap mm/migrate.c:1319 [inline]
migrate_pages_batch+0x1432/0x35e0 mm/migrate.c:1882
migrate_pages+0x1d5a/0x2930 mm/migrate.c:2103
migrate_misplaced_folio+0x28c/0x840 mm/migrate.c:2724
do_numa_page mm/memory.c:6004 [inline]
handle_pte_fault mm/memory.c:6183 [inline]
__handle_mm_fault+0x3bab/0x5400 mm/memory.c:6318
handle_mm_fault+0x40a/0x8e0 mm/memory.c:6487
do_user_addr_fault+0x764/0x1380 arch/x86/mm/fault.c:1387
handle_page_fault arch/x86/mm/fault.c:1476 [inline]
exc_page_fault+0x82/0x100 arch/x86/mm/fault.c:1532
asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623
__get_user_8+0x14/0x30 arch/x86/lib/getuser.S:100
rseq_get_rseq_cs_ptr_val kernel/rseq.c:248 [inline]
rseq_get_rseq_cs kernel/rseq.c:270 [inline]
rseq_ip_fixup kernel/rseq.c:390 [inline]
__rseq_handle_notify_resume+0x13d/0x1220 kernel/rseq.c:438
rseq_handle_notify_resume include/linux/rseq.h:44 [inline]
resume_user_mode_work include/linux/resume_user_mode.h:62 [inline]
exit_to_user_mode_loop+0xbf/0x130 kernel/entry/common.c:43
exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline]
syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline]
syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline]
do_syscall_64+0x2bd/0xfa0 arch/x86/entry/syscall_64.c:100
entry_SYSCALL_64_after_hwframe+0x77/0x7f
-> #2 (&anon_vma->rwsem){++++}-{4:4}:
lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
down_write+0x96/0x1f0 kernel/locking/rwsem.c:1590
anon_vma_lock_write include/linux/rmap.h:122 [inline]
vma_prepare+0x25c/0x4b0 mm/vma.c:309
__split_vma+0x76e/0xa00 mm/vma.c:550
split_vma mm/vma.c:598 [inline]
vma_modify+0x13b3/0x1970 mm/vma.c:1631
vma_modify_flags+0x1e8/0x230 mm/vma.c:1649
mprotect_fixup+0x407/0x9c0 mm/mprotect.c:816
do_mprotect_pkey+0x8c5/0xcd0 mm/mprotect.c:990
__do_sys_mprotect mm/mprotect.c:1011 [inline]
__se_sys_mprotect mm/mprotect.c:1008 [inline]
__x64_sys_mprotect+0x80/0x90 mm/mprotect.c:1008
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
-> #1 (&mapping->i_mmap_rwsem){++++}-{4:4}:
lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
down_write+0x96/0x1f0 kernel/locking/rwsem.c:1590
i_mmap_lock_write include/linux/fs.h:548 [inline]
dma_resv_lockdep+0x2f6/0x5b0 drivers/dma-buf/dma-resv.c:797
do_one_initcall+0x236/0x820 init/main.c:1283
do_initcall_level+0x104/0x190 init/main.c:1345
do_initcalls+0x59/0xa0 init/main.c:1361
kernel_init_freeable+0x334/0x4b0 init/main.c:1593
kernel_init+0x1d/0x1d0 init/main.c:1483
ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158
ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
-> #0 (fs_reclaim){+.+.}-{0:0}:
check_prev_add kernel/locking/lockdep.c:3165 [inline]
check_prevs_add kernel/locking/lockdep.c:3284 [inline]
validate_chain+0xb9b/0x2140 kernel/locking/lockdep.c:3908
__lock_acquire+0xab9/0xd20 kernel/locking/lockdep.c:5237
lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
__fs_reclaim_acquire mm/page_alloc.c:4269 [inline]
fs_reclaim_acquire+0x72/0x100 mm/page_alloc.c:4283
might_alloc include/linux/sched/mm.h:318 [inline]
slab_pre_alloc_hook mm/slub.c:4897 [inline]
slab_alloc_node mm/slub.c:5221 [inline]
kmem_cache_alloc_noprof+0x45/0x6e0 mm/slub.c:5252
alloc_rmap_item mm/ksm.c:565 [inline]
get_next_rmap_item mm/ksm.c:2378 [inline]
ksm_pte_entry+0x414/0xa60 mm/ksm.c:2501
walk_pte_range_inner+0x1ba/0x380 mm/pagewalk.c:50
walk_pte_range mm/pagewalk.c:88 [inline]
walk_pmd_range mm/pagewalk.c:155 [inline]
walk_pud_range mm/pagewalk.c:224 [inline]
walk_p4d_range mm/pagewalk.c:262 [inline]
walk_pgd_range+0x12bf/0x1d40 mm/pagewalk.c:303
__walk_page_range+0x14c/0x710 mm/pagewalk.c:410
walk_page_range_mm+0x454/0x660 mm/pagewalk.c:506
scan_get_next_rmap_item mm/ksm.c:2605 [inline]
ksm_do_scan+0x60c/0x5c10 mm/ksm.c:2681
ksm_scan_thread+0x10b/0x4b0 mm/ksm.c:2706
kthread+0x711/0x8a0 kernel/kthread.c:463
ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158
ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
other info that might help us debug this:
Chain exists of:
fs_reclaim --> &anon_vma->rwsem --> ptlock_ptr(ptdesc)#2
Possible unsafe locking scenario:
       CPU0                    CPU1
       ----                    ----
  lock(ptlock_ptr(ptdesc)#2);
                               lock(&anon_vma->rwsem);
                               lock(ptlock_ptr(ptdesc)#2);
  lock(fs_reclaim);
*** DEADLOCK ***
4 locks held by ksmd/41:
#0: ffffffff8e259b88 (ksm_thread_mutex){+.+.}-{4:4}, at: ksm_scan_thread+0xc8/0x4b0 mm/ksm.c:2703
#1: ffff8881703b9760 (&mm->mmap_lock){++++}-{4:4}, at: mmap_read_lock include/linux/mmap_lock.h:368 [inline]
#1: ffff8881703b9760 (&mm->mmap_lock){++++}-{4:4}, at: scan_get_next_rmap_item mm/ksm.c:2596 [inline]
#1: ffff8881703b9760 (&mm->mmap_lock){++++}-{4:4}, at: ksm_do_scan+0x578/0x5c10 mm/ksm.c:2681
#2: ffffffff8e13d320 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:331 [inline]
#2: ffffffff8e13d320 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:867 [inline]
#2: ffffffff8e13d320 (rcu_read_lock){....}-{1:3}, at: ___pte_offset_map+0x29/0x250 mm/pgtable-generic.c:286
#3: ffff888112eeb8b8 (ptlock_ptr(ptdesc)#2){+.+.}-{3:3}, at: spin_lock include/linux/spinlock.h:351 [inline]
#3: ffff888112eeb8b8 (ptlock_ptr(ptdesc)#2){+.+.}-{3:3}, at: __pte_offset_map_lock+0x13e/0x210 mm/pgtable-generic.c:401
stack backtrace:
CPU: 1 UID: 0 PID: 41 Comm: ksmd Not tainted syzkaller #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120
print_circular_bug+0x2ee/0x310 kernel/locking/lockdep.c:2043
check_noncircular+0x134/0x160 kernel/locking/lockdep.c:2175
check_prev_add kernel/locking/lockdep.c:3165 [inline]
check_prevs_add kernel/locking/lockdep.c:3284 [inline]
validate_chain+0xb9b/0x2140 kernel/locking/lockdep.c:3908
__lock_acquire+0xab9/0xd20 kernel/locking/lockdep.c:5237
lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868
__fs_reclaim_acquire mm/page_alloc.c:4269 [inline]
fs_reclaim_acquire+0x72/0x100 mm/page_alloc.c:4283
might_alloc include/linux/sched/mm.h:318 [inline]
slab_pre_alloc_hook mm/slub.c:4897 [inline]
slab_alloc_node mm/slub.c:5221 [inline]
kmem_cache_alloc_noprof+0x45/0x6e0 mm/slub.c:5252
alloc_rmap_item mm/ksm.c:565 [inline]
get_next_rmap_item mm/ksm.c:2378 [inline]
ksm_pte_entry+0x414/0xa60 mm/ksm.c:2501
walk_pte_range_inner+0x1ba/0x380 mm/pagewalk.c:50
walk_pte_range mm/pagewalk.c:88 [inline]
walk_pmd_range mm/pagewalk.c:155 [inline]
walk_pud_range mm/pagewalk.c:224 [inline]
walk_p4d_range mm/pagewalk.c:262 [inline]
walk_pgd_range+0x12bf/0x1d40 mm/pagewalk.c:303
__walk_page_range+0x14c/0x710 mm/pagewalk.c:410
walk_page_range_mm+0x454/0x660 mm/pagewalk.c:506
scan_get_next_rmap_item mm/ksm.c:2605 [inline]
ksm_do_scan+0x60c/0x5c10 mm/ksm.c:2681
ksm_scan_thread+0x10b/0x4b0 mm/ksm.c:2706
kthread+0x711/0x8a0 kernel/kthread.c:463
ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158
ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
</TASK>
BUG: sleeping function called from invalid context at ./include/linux/sched/mm.h:321
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 41, name: ksmd
preempt_count: 1, expected: 0
RCU nest depth: 1, expected: 0
INFO: lockdep is turned off.
Preemption disabled at:
[<0000000000000000>] 0x0
CPU: 1 UID: 0 PID: 41 Comm: ksmd Not tainted syzkaller #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120
__might_resched+0x495/0x610 kernel/sched/core.c:8925
might_alloc include/linux/sched/mm.h:321 [inline]
slab_pre_alloc_hook mm/slub.c:4897 [inline]
slab_alloc_node mm/slub.c:5221 [inline]
kmem_cache_alloc_noprof+0x65/0x6e0 mm/slub.c:5252
alloc_rmap_item mm/ksm.c:565 [inline]
get_next_rmap_item mm/ksm.c:2378 [inline]
ksm_pte_entry+0x414/0xa60 mm/ksm.c:2501
walk_pte_range_inner+0x1ba/0x380 mm/pagewalk.c:50
walk_pte_range mm/pagewalk.c:88 [inline]
walk_pmd_range mm/pagewalk.c:155 [inline]
walk_pud_range mm/pagewalk.c:224 [inline]
walk_p4d_range mm/pagewalk.c:262 [inline]
walk_pgd_range+0x12bf/0x1d40 mm/pagewalk.c:303
__walk_page_range+0x14c/0x710 mm/pagewalk.c:410
walk_page_range_mm+0x454/0x660 mm/pagewalk.c:506
scan_get_next_rmap_item mm/ksm.c:2605 [inline]
ksm_do_scan+0x60c/0x5c10 mm/ksm.c:2681
ksm_scan_thread+0x10b/0x4b0 mm/ksm.c:2706
kthread+0x711/0x8a0 kernel/kthread.c:463
ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158
ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
</TASK>
[ The same "BUG: sleeping function called from invalid context" splat repeats eight more times with an identical call trace through ksm_pte_entry -> get_next_rmap_item -> kmem_cache_alloc_noprof. ]
***
If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
Tested-by: syzbot@syzkaller.appspotmail.com
---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.
* Re: [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
2025-10-14 9:26 ` David Hildenbrand
@ 2025-10-14 13:36 ` Pedro Demarchi Gomes
0 siblings, 0 replies; 4+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-14 13:36 UTC (permalink / raw)
To: David Hildenbrand
Cc: Andrew Morton, craftfever, Xu Xin, Chengming Zhou, linux-mm,
linux-kernel
On Tue, Oct 14, 2025 at 11:26:06AM +0200, David Hildenbrand wrote:
> On 14.10.25 07:58, Pedro Demarchi Gomes wrote:
> > Currently, scan_get_next_rmap_item() walks every page address in a VMA
> > to locate mergeable pages. This becomes highly inefficient when scanning
> > large virtual memory areas that contain mostly unmapped regions.
> >
> > This patch replaces the per-address lookup with a range walk using
> > walk_page_range(). The range walker allows KSM to skip over entire
> > unmapped holes in a VMA, avoiding unnecessary lookups.
> >
> > To evaluate this change, I created a test that maps a 1 TB virtual area
> > where only the first and last 10 MB are populated with identical data.
> > With this patch applied, KSM scanned and merged the region approximately
> > seven times faster.
> >
> > This problem was previously discussed in [1].
> >
> > [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> >
> > Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> > ---
> > mm/ksm.c | 136 ++++++++++++++++++++++++++++++++-----------------------
> > 1 file changed, 79 insertions(+), 57 deletions(-)
> >
> > diff --git a/mm/ksm.c b/mm/ksm.c
> > index 3aed0478fdce..584fd987e8ae 100644
> > --- a/mm/ksm.c
> > +++ b/mm/ksm.c
> > @@ -2455,15 +2455,80 @@ static bool should_skip_rmap_item(struct folio *folio,
> > return true;
> > }
> > +struct ksm_walk_private {
> > + struct page *page;
> > + struct ksm_rmap_item *rmap_item;
> > + struct ksm_mm_slot *mm_slot;
> > +};
> > +
> > +static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
> > +{
> > + struct vm_area_struct *vma = walk->vma;
> > +
> > + if (!vma || !(vma->vm_flags & VM_MERGEABLE))
>
> The anon_vma check should go in here as well.
>
> How can we possibly get !vma?
>
> > + return 1;
> > + return 0;
> > +}
> > +
> > +static int ksm_pte_entry(pte_t *pte, unsigned long addr,
> > + unsigned long end, struct mm_walk *walk)
> > +{
> > + struct mm_struct *mm = walk->mm;
> > + struct vm_area_struct *vma = walk->vma;
> > + struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
> > + struct ksm_mm_slot *mm_slot = private->mm_slot;
> > + pte_t ptent = ptep_get(pte);
> > + struct page *page = pfn_to_online_page(pte_pfn(ptent));
>
> Oh no.
>
> vm_normal_page()
>
> > + struct ksm_rmap_item *rmap_item;
> > + struct folio *folio;
> > +
> > + ksm_scan.address = addr;
> > +
> > + if (ksm_test_exit(mm))
> > + return 1;
> > +
> > + if (!page)
> > + return 0;
> > +
> > + folio = page_folio(page);
> > + if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> > + return 0;
> > +
> > + folio_get(folio);
> > +
> > + flush_anon_page(vma, page, ksm_scan.address);
> > + flush_dcache_page(page);
> > + rmap_item = get_next_rmap_item(mm_slot,
> > + ksm_scan.rmap_list, ksm_scan.address);
> > + if (rmap_item) {
> > + ksm_scan.rmap_list =
> > + &rmap_item->rmap_list;
> > +
> > + if (should_skip_rmap_item(folio, rmap_item)) {
> > + folio_put(folio);
> > + return 0;
> > + }
> > + ksm_scan.address = end;
> > + private->page = page;
> > + } else
> > + folio_put(folio);
> > +
>
> You're under PTL, get_next_rmap_item() will perform an allocation, so that
> won't work.
>
> Observe how the original code worked around that by performing all magic
> outside of the PTL (folio_walk_end()).
>
> When you switch to .pmd_entry() (see below) you will be able to handle it.
>
> What you could also try doing is returning page+folio and letting the caller
> deal with everything starting at the flush_anon_page().
>
> > + private->rmap_item = rmap_item;
> > + return 1;
> > +}
> > +
> > +struct mm_walk_ops walk_ops = {
> > + .pte_entry = ksm_pte_entry,
> > + .test_walk = ksm_walk_test,
> > + .walk_lock = PGWALK_RDLOCK,
> > +};
>
> It's more complicated: you'd be remapping each PMD to be mapped by PTEs
> first, which is not what we want. You'll have to handle pmd_entry instead of
> pte_entry.
>
> > +
> > static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> > {
> > struct mm_struct *mm;
> > struct ksm_mm_slot *mm_slot;
> > struct mm_slot *slot;
> > - struct vm_area_struct *vma;
> > - struct ksm_rmap_item *rmap_item;
> > - struct vma_iterator vmi;
> > - int nid;
> > + int nid, ret;
> > if (list_empty(&ksm_mm_head.slot.mm_node))
> > return NULL;
> > @@ -2527,64 +2592,21 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> > slot = &mm_slot->slot;
> > mm = slot->mm;
> > - vma_iter_init(&vmi, mm, ksm_scan.address);
> > mmap_read_lock(mm);
> > if (ksm_test_exit(mm))
> > goto no_vmas;
> > - for_each_vma(vmi, vma) {
> > - if (!(vma->vm_flags & VM_MERGEABLE))
> > - continue;
> > - if (ksm_scan.address < vma->vm_start)
> > - ksm_scan.address = vma->vm_start;
> > - if (!vma->anon_vma)
> > - ksm_scan.address = vma->vm_end;
> > -
> > - while (ksm_scan.address < vma->vm_end) {
> > - struct page *tmp_page = NULL;
> > - struct folio_walk fw;
> > - struct folio *folio;
> > -
> > - if (ksm_test_exit(mm))
> > - break;
> > -
> > - folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> > - if (folio) {
> > - if (!folio_is_zone_device(folio) &&
> > - folio_test_anon(folio)) {
> > - folio_get(folio);
> > - tmp_page = fw.page;
> > - }
> > - folio_walk_end(&fw, vma);
> > - }
> > -
> > - if (tmp_page) {
> > - flush_anon_page(vma, tmp_page, ksm_scan.address);
> > - flush_dcache_page(tmp_page);
> > - rmap_item = get_next_rmap_item(mm_slot,
> > - ksm_scan.rmap_list, ksm_scan.address);
> > - if (rmap_item) {
> > - ksm_scan.rmap_list =
> > - &rmap_item->rmap_list;
> > -
> > - if (should_skip_rmap_item(folio, rmap_item)) {
> > - folio_put(folio);
> > - goto next_page;
> > - }
> > -
> > - ksm_scan.address += PAGE_SIZE;
> > - *page = tmp_page;
> > - } else {
> > - folio_put(folio);
> > - }
> > - mmap_read_unlock(mm);
> > - return rmap_item;
> > - }
> > -next_page:
> > - ksm_scan.address += PAGE_SIZE;
> > - cond_resched();
>
> You're dropping all cond_resched(), which will be a problem.
>
> > - }
> > + struct ksm_walk_private walk_private = {
> > + .page = NULL,
> > + .rmap_item = NULL,
> > + .mm_slot = ksm_scan.mm_slot
> > + };
>
> empty line missing
>
> > + ret = walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
> > + *page = walk_private.page;
> > + if (ret) {
> > + mmap_read_unlock(mm);
> > + return walk_private.rmap_item;
> > }
> > if (ksm_test_exit(mm)) {
>
>
> --
> Cheers
>
> David / dhildenb
>
Thanks for the explanations, I will send a v2 shortly.