* [PATCH 1/3] Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk"
2025-10-28 13:19 [PATCH 0/3] ksm: perform a range-walk to jump over holes in break_ksm Pedro Demarchi Gomes
@ 2025-10-28 13:19 ` Pedro Demarchi Gomes
2025-10-29 14:34 ` David Hildenbrand
2025-10-28 13:19 ` [PATCH 2/3] ksm: perform a range-walk in break_ksm Pedro Demarchi Gomes
2025-10-28 13:19 ` [PATCH 3/3] ksm: replace function unmerge_ksm_pages with break_ksm Pedro Demarchi Gomes
2 siblings, 1 reply; 10+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-28 13:19 UTC (permalink / raw)
To: David Hildenbrand, Andrew Morton
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel, Pedro Demarchi Gomes
This reverts commit e317a8d8b4f600fc7ec9725e26417030ee594f52 and changes
PageKsm(page) to folio_test_ksm(page_folio(page)).
This reverts break_ksm() to use walk_page_range_vma() instead of
folio_walk_start().
This will make it easier to later modify break_ksm() to perform a proper
range walk.
Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
mm/ksm.c | 63 ++++++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 47 insertions(+), 16 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c
index 4f672f4f2140..2a9a7fd4c777 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -607,6 +607,47 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
+static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ pte_t *pte;
+ pte_t ptent;
+ int ret;
+
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte)
+ return 0;
+ ptent = ptep_get(pte);
+ if (pte_present(ptent)) {
+ page = vm_normal_page(walk->vma, addr, ptent);
+ } else if (!pte_none(ptent)) {
+ swp_entry_t entry = pte_to_swp_entry(ptent);
+
+ /*
+ * As KSM pages remain KSM pages until freed, no need to wait
+ * here for migration to end.
+ */
+ if (is_migration_entry(entry))
+ page = pfn_swap_entry_to_page(entry);
+ }
+ /* return 1 if the page is a normal ksm page or KSM-placed zero page */
+ ret = (page && folio_test_ksm(page_folio(page))) || is_ksm_zero_pte(ptent);
+ pte_unmap_unlock(pte, ptl);
+ return ret;
+}
+
+static const struct mm_walk_ops break_ksm_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops break_ksm_lock_vma_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK,
+};
+
/*
* We use break_ksm to break COW on a ksm page by triggering unsharing,
* such that the ksm page will get replaced by an exclusive anonymous page.
@@ -623,26 +664,16 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
{
vm_fault_t ret = 0;
-
- if (lock_vma)
- vma_start_write(vma);
+ const struct mm_walk_ops *ops = lock_vma ?
+ &break_ksm_lock_vma_ops : &break_ksm_ops;
do {
- bool ksm_page = false;
- struct folio_walk fw;
- struct folio *folio;
+ int ksm_page;
cond_resched();
- folio = folio_walk_start(&fw, vma, addr,
- FW_MIGRATION | FW_ZEROPAGE);
- if (folio) {
- /* Small folio implies FW_LEVEL_PTE. */
- if (!folio_test_large(folio) &&
- (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte)))
- ksm_page = true;
- folio_walk_end(&fw, vma);
- }
-
+ ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
+ if (WARN_ON_ONCE(ksm_page < 0))
+ return ksm_page;
if (!ksm_page)
return 0;
ret = handle_mm_fault(vma, addr,
--
2.43.0
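A note for readers tracing the control flow above: walk_page_range_vma() stops at the first non-zero return from the .pmd_entry callback and propagates that value to its caller, which is why break_ksm() can treat the walk's return value directly as "no KSM page" (0), "KSM page found" (positive), or an error (negative). A heavily simplified sketch of that pagewalk-core behavior, paraphrased from mm/pagewalk.c rather than quoted verbatim:

	/* walk_pmd_range(), simplified: a non-zero callback return
	 * ends the walk and becomes the walk's overall return value. */
	if (ops->pmd_entry)
		err = ops->pmd_entry(pmd, addr, next, walk);
	if (err)
		break;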
* Re: [PATCH 1/3] Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk"
2025-10-28 13:19 ` [PATCH 1/3] Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk" Pedro Demarchi Gomes
@ 2025-10-29 14:34 ` David Hildenbrand
2025-10-30 11:59 ` Pedro Demarchi Gomes
0 siblings, 1 reply; 10+ messages in thread
From: David Hildenbrand @ 2025-10-29 14:34 UTC (permalink / raw)
To: Pedro Demarchi Gomes, Andrew Morton
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel
On 28.10.25 14:19, Pedro Demarchi Gomes wrote:
> This reverts commit e317a8d8b4f600fc7ec9725e26417030ee594f52 and changes
> PageKsm(page) to folio_test_ksm(page_folio(page)).
>
> This reverts break_ksm() to use walk_page_range_vma() instead of
> folio_walk_start().
> This will make it easier to later modify break_ksm() to perform a proper
> range walk.
>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> ---
> mm/ksm.c | 63 ++++++++++++++++++++++++++++++++++++++++++--------------
> 1 file changed, 47 insertions(+), 16 deletions(-)
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 4f672f4f2140..2a9a7fd4c777 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -607,6 +607,47 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
> return atomic_read(&mm->mm_users) == 0;
> }
>
> +static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
> + struct mm_walk *walk)
> +{
> + struct page *page = NULL;
> + spinlock_t *ptl;
> + pte_t *pte;
> + pte_t ptent;
> + int ret;
> +
> + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
> + if (!pte)
> + return 0;
> + ptent = ptep_get(pte);
> + if (pte_present(ptent)) {
> + page = vm_normal_page(walk->vma, addr, ptent);
folio = vm_normal_folio()
> + } else if (!pte_none(ptent)) {
> + swp_entry_t entry = pte_to_swp_entry(ptent);
> +
> + /*
> + * As KSM pages remain KSM pages until freed, no need to wait
> + * here for migration to end.
> + */
> + if (is_migration_entry(entry))
> + page = pfn_swap_entry_to_page(entry);
folio = pfn_swap_entry_folio()
> + }
> + /* return 1 if the page is a normal ksm page or KSM-placed zero page */
> + ret = (page && folio_test_ksm(page_folio(page))) || is_ksm_zero_pte(ptent);
Then you can directly work with folios here.
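Putting the two substitutions together, the whole helper could then read roughly as follows; this is a minimal sketch assuming vm_normal_folio() and pfn_swap_entry_folio() behave as their page-based counterparts, untested and not the code that was actually merged:

static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
			       struct mm_walk *walk)
{
	struct folio *folio = NULL;	/* folio instead of page throughout */
	spinlock_t *ptl;
	pte_t *pte;
	pte_t ptent;
	int ret;

	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte)
		return 0;
	ptent = ptep_get(pte);
	if (pte_present(ptent)) {
		folio = vm_normal_folio(walk->vma, addr, ptent);
	} else if (!pte_none(ptent)) {
		swp_entry_t entry = pte_to_swp_entry(ptent);

		/*
		 * As KSM pages remain KSM pages until freed, no need to wait
		 * here for migration to end.
		 */
		if (is_migration_entry(entry))
			folio = pfn_swap_entry_folio(entry);
	}
	/* return 1 if the folio is a normal ksm folio or KSM-placed zero page */
	ret = (folio && folio_test_ksm(folio)) || is_ksm_zero_pte(ptent);
	pte_unmap_unlock(pte, ptl);
	return ret;
}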
--
Cheers
David / dhildenb
* Re: [PATCH 1/3] Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk"
2025-10-29 14:34 ` David Hildenbrand
@ 2025-10-30 11:59 ` Pedro Demarchi Gomes
0 siblings, 0 replies; 10+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-30 11:59 UTC (permalink / raw)
To: David Hildenbrand
Cc: Andrew Morton, Xu Xin, Chengming Zhou, linux-mm, linux-kernel
On Wed, Oct 29, 2025 at 03:34:23PM +0100, David Hildenbrand wrote:
> On 28.10.25 14:19, Pedro Demarchi Gomes wrote:
> > This reverts commit e317a8d8b4f600fc7ec9725e26417030ee594f52 and changes
> > PageKsm(page) to folio_test_ksm(page_folio(page)).
> >
> > This reverts break_ksm() to use walk_page_range_vma() instead of
> > folio_walk_start().
> > This will make it easier to later modify break_ksm() to perform a proper
> > range walk.
> >
> > Suggested-by: David Hildenbrand <david@redhat.com>
> > Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> > ---
> > mm/ksm.c | 63 ++++++++++++++++++++++++++++++++++++++++++--------------
> > 1 file changed, 47 insertions(+), 16 deletions(-)
> >
> > diff --git a/mm/ksm.c b/mm/ksm.c
> > index 4f672f4f2140..2a9a7fd4c777 100644
> > --- a/mm/ksm.c
> > +++ b/mm/ksm.c
> > @@ -607,6 +607,47 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
> > return atomic_read(&mm->mm_users) == 0;
> > }
> > +static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
> > + struct mm_walk *walk)
> > +{
> > + struct page *page = NULL;
> > + spinlock_t *ptl;
> > + pte_t *pte;
> > + pte_t ptent;
> > + int ret;
> > +
> > + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
> > + if (!pte)
> > + return 0;
> > + ptent = ptep_get(pte);
> > + if (pte_present(ptent)) {
> > + page = vm_normal_page(walk->vma, addr, ptent);
>
> folio = vm_normal_folio()
>
> > + } else if (!pte_none(ptent)) {
> > + swp_entry_t entry = pte_to_swp_entry(ptent);
> > +
> > + /*
> > + * As KSM pages remain KSM pages until freed, no need to wait
> > + * here for migration to end.
> > + */
> > + if (is_migration_entry(entry))
> > + page = pfn_swap_entry_to_page(entry);
>
> folio = pfn_swap_entry_folio()
>
> > + }
> > + /* return 1 if the page is a normal ksm page or KSM-placed zero page */
> > + ret = (page && folio_test_ksm(page_folio(page))) || is_ksm_zero_pte(ptent);
>
>
> Then you can directly work with folios here.
>
Ack, will do.
> --
> Cheers
>
> David / dhildenb
>
>
* [PATCH 2/3] ksm: perform a range-walk in break_ksm
2025-10-28 13:19 [PATCH 0/3] ksm: perform a range-walk to jump over holes in break_ksm Pedro Demarchi Gomes
2025-10-28 13:19 ` [PATCH 1/3] Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk" Pedro Demarchi Gomes
@ 2025-10-28 13:19 ` Pedro Demarchi Gomes
2025-10-29 14:45 ` David Hildenbrand
2025-10-28 13:19 ` [PATCH 3/3] ksm: replace function unmerge_ksm_pages with break_ksm Pedro Demarchi Gomes
2 siblings, 1 reply; 10+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-28 13:19 UTC (permalink / raw)
To: David Hildenbrand, Andrew Morton
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel, Pedro Demarchi Gomes
Make break_ksm() receive an address range and change
break_ksm_pmd_entry() to perform a range-walk and return the address of
the first ksm page found.
This change allows break_ksm() to skip unmapped regions instead of
iterating every page address. When unmerging large sparse VMAs, this
significantly reduces runtime, as confirmed by a benchmark test (see
cover letter).
Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
mm/ksm.c | 88 +++++++++++++++++++++++++++++++-------------------------
1 file changed, 49 insertions(+), 39 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c
index 2a9a7fd4c777..1d1ef0554c7c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -607,34 +607,54 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
-static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+struct break_ksm_arg {
+ unsigned long addr;
+};
+
+static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct page *page = NULL;
+ struct page *page;
spinlock_t *ptl;
- pte_t *pte;
- pte_t ptent;
- int ret;
+ pte_t *start_ptep = NULL, *ptep, pte;
+ int ret = 0;
+ struct mm_struct *mm = walk->mm;
+ struct break_ksm_arg *private = (struct break_ksm_arg *) walk->private;
- pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte)
+ if (ksm_test_exit(walk->mm))
return 0;
- ptent = ptep_get(pte);
- if (pte_present(ptent)) {
- page = vm_normal_page(walk->vma, addr, ptent);
- } else if (!pte_none(ptent)) {
- swp_entry_t entry = pte_to_swp_entry(ptent);
- /*
- * As KSM pages remain KSM pages until freed, no need to wait
- * here for migration to end.
- */
- if (is_migration_entry(entry))
- page = pfn_swap_entry_to_page(entry);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
+ start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!start_ptep)
+ return 0;
+
+ for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
+ pte = ptep_get(ptep);
+ page = NULL;
+ if (pte_present(pte)) {
+ page = vm_normal_page(walk->vma, addr, pte);
+ } else if (!pte_none(pte)) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ /*
+ * As KSM pages remain KSM pages until freed, no need to wait
+ * here for migration to end.
+ */
+ if (is_migration_entry(entry))
+ page = pfn_swap_entry_to_page(entry);
+ }
+ /* return 1 if the page is a normal ksm page or KSM-placed zero page */
+ ret = (page && folio_test_ksm(page_folio(page))) || is_ksm_zero_pte(pte);
+ if (ret) {
+ private->addr = addr;
+ goto out_unlock;
+ }
}
- /* return 1 if the page is a normal ksm page or KSM-placed zero page */
- ret = (page && folio_test_ksm(page_folio(page))) || is_ksm_zero_pte(ptent);
- pte_unmap_unlock(pte, ptl);
+out_unlock:
+ pte_unmap_unlock(ptep, ptl);
return ret;
}
@@ -661,9 +681,11 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = {
* of the process that owns 'vma'. We also do not want to enforce
* protection keys here anyway.
*/
-static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long end, bool lock_vma)
{
vm_fault_t ret = 0;
+ struct break_ksm_arg break_ksm_arg;
const struct mm_walk_ops *ops = lock_vma ?
&break_ksm_lock_vma_ops : &break_ksm_ops;
@@ -671,11 +693,10 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v
int ksm_page;
cond_resched();
- ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
- if (WARN_ON_ONCE(ksm_page < 0))
+ ksm_page = walk_page_range_vma(vma, addr, end, ops, &break_ksm_arg);
+ if (ksm_page <= 0)
return ksm_page;
- if (!ksm_page)
- return 0;
+ addr = break_ksm_arg.addr;
ret = handle_mm_fault(vma, addr,
FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
NULL);
@@ -761,7 +782,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (vma)
- break_ksm(vma, addr, false);
+ break_ksm(vma, addr, addr + 1, false);
mmap_read_unlock(mm);
}
@@ -1072,18 +1093,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
static int unmerge_ksm_pages(struct vm_area_struct *vma,
unsigned long start, unsigned long end, bool lock_vma)
{
- unsigned long addr;
- int err = 0;
-
- for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
- if (ksm_test_exit(vma->vm_mm))
- break;
- if (signal_pending(current))
- err = -ERESTARTSYS;
- else
- err = break_ksm(vma, addr, lock_vma);
- }
- return err;
+ return break_ksm(vma, start, end, lock_vma);
}
static inline
--
2.43.0
* Re: [PATCH 2/3] ksm: perform a range-walk in break_ksm
2025-10-28 13:19 ` [PATCH 2/3] ksm: perform a range-walk in break_ksm Pedro Demarchi Gomes
@ 2025-10-29 14:45 ` David Hildenbrand
2025-10-30 12:29 ` Pedro Demarchi Gomes
0 siblings, 1 reply; 10+ messages in thread
From: David Hildenbrand @ 2025-10-29 14:45 UTC (permalink / raw)
To: Pedro Demarchi Gomes, Andrew Morton
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel
On 28.10.25 14:19, Pedro Demarchi Gomes wrote:
> Make break_ksm() receive an address range and change
> break_ksm_pmd_entry() to perform a range-walk and return the address of
> the first ksm page found.
>
> This change allows break_ksm() to skip unmapped regions instead of
> iterating every page address. When unmerging large sparse VMAs, this
> significantly reduces runtime, as confirmed by a benchmark test (see
> cover letter).
Instead of referencing the cover letter, directly include the data here.
>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> ---
> mm/ksm.c | 88 +++++++++++++++++++++++++++++++-------------------------
> 1 file changed, 49 insertions(+), 39 deletions(-)
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 2a9a7fd4c777..1d1ef0554c7c 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -607,34 +607,54 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
> return atomic_read(&mm->mm_users) == 0;
> }
>
> -static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
> +struct break_ksm_arg {
> + unsigned long addr;
> +};
You can avoid that by simply passing a pointer to addr.
> +
> +static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
> struct mm_walk *walk)
> {
> - struct page *page = NULL;
> + struct page *page;
> spinlock_t *ptl;
> - pte_t *pte;
> - pte_t ptent;
> - int ret;
> + pte_t *start_ptep = NULL, *ptep, pte;
Is there a need to initialize start_ptep?
> + int ret = 0;
> + struct mm_struct *mm = walk->mm;
> + struct break_ksm_arg *private = (struct break_ksm_arg *) walk->private;
Prefer reverse xmas tree:
struct break_ksm_arg *private = (struct break_ksm_arg *) walk->private;
struct mm_struct *mm = walk->mm;
...
With the break_ksm_arg simplification you'd have
unsigned long *found_addr = (unsigned long *)walk->private;
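A sketch of how the two halves would then connect, assuming the wrapper struct is dropped in favor of a bare address pointer (illustration only, not the posted code):

	/* in break_ksm(): let the walk report the first KSM address back */
	unsigned long found_addr;
	int found;

	found = walk_page_range_vma(vma, addr, end, ops, &found_addr);
	if (found <= 0)
		return found;
	addr = found_addr;

	/* in break_ksm_pmd_entry(), on a hit: */
	*found_addr = addr;	/* found_addr is walk->private */
	return 1;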
>
> - pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
> - if (!pte)
> + if (ksm_test_exit(walk->mm))
> return 0;
> - ptent = ptep_get(pte);
> - if (pte_present(ptent)) {
> - page = vm_normal_page(walk->vma, addr, ptent);
> - } else if (!pte_none(ptent)) {
> - swp_entry_t entry = pte_to_swp_entry(ptent);
>
> - /*
> - * As KSM pages remain KSM pages until freed, no need to wait
> - * here for migration to end.
> - */
> - if (is_migration_entry(entry))
> - page = pfn_swap_entry_to_page(entry);
> + if (signal_pending(current))
> + return -ERESTARTSYS;
I assume that's not a problem for the other callsites that wouldn't
check this before.
> +
> + start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> + if (!start_ptep)
> + return 0;
> +
> + for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
Best to declare pte and folio (see last mail) here in the loop:
pte_t pte = ptep_get(ptep);
struct folio *folio = NULL;
> + pte = ptep_get(ptep);
> + page = NULL;
> + if (pte_present(pte)) {
> + page = vm_normal_page(walk->vma, addr, pte);
> + } else if (!pte_none(pte)) {
> + swp_entry_t entry = pte_to_swp_entry(pte);
> +
> + /*
> + * As KSM pages remain KSM pages until freed, no need to wait
> + * here for migration to end.
> + */
> + if (is_migration_entry(entry))
> + page = pfn_swap_entry_to_page(entry);
> + }
> + /* return 1 if the page is a normal ksm page or KSM-placed zero page */
> + ret = (page && folio_test_ksm(page_folio(page))) || is_ksm_zero_pte(pte);
> + if (ret) {
> + private->addr = addr;
> + goto out_unlock;
> + }
I suggest you call "ret" "found" instead.
> }
> - /* return 1 if the page is a normal ksm page or KSM-placed zero page */
> - ret = (page && folio_test_ksm(page_folio(page))) || is_ksm_zero_pte(ptent);
> - pte_unmap_unlock(pte, ptl);
> +out_unlock:
> + pte_unmap_unlock(ptep, ptl);
> return ret;
> }
>
> @@ -661,9 +681,11 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = {
> * of the process that owns 'vma'. We also do not want to enforce
> * protection keys here anyway.
> */
> -static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
> +static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
> + unsigned long end, bool lock_vma)
> {
> vm_fault_t ret = 0;
> + struct break_ksm_arg break_ksm_arg;
> const struct mm_walk_ops *ops = lock_vma ?
> &break_ksm_lock_vma_ops : &break_ksm_ops;
>
> @@ -671,11 +693,10 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v
> int ksm_page;
>
> cond_resched();
> - ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
> - if (WARN_ON_ONCE(ksm_page < 0))
> + ksm_page = walk_page_range_vma(vma, addr, end, ops, &break_ksm_arg);
> + if (ksm_page <= 0)
> return ksm_page;
> - if (!ksm_page)
> - return 0;
> + addr = break_ksm_arg.addr;
> ret = handle_mm_fault(vma, addr,
> FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
> NULL);
> @@ -761,7 +782,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
> mmap_read_lock(mm);
> vma = find_mergeable_vma(mm, addr);
> if (vma)
> - break_ksm(vma, addr, false);
> + break_ksm(vma, addr, addr + 1, false);
Better to use addr + PAGE_SIZE
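Folding the suggestions above together (folio declared inside the loop, "found" instead of "ret", the hit address reported through a bare walk->private pointer), the inner loop might end up roughly like this; a sketch only, not the posted v2:

	for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
		pte_t pte = ptep_get(ptep);
		struct folio *folio = NULL;

		if (pte_present(pte)) {
			folio = vm_normal_folio(walk->vma, addr, pte);
		} else if (!pte_none(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			/* KSM pages remain KSM pages until freed. */
			if (is_migration_entry(entry))
				folio = pfn_swap_entry_folio(entry);
		}
		/* a normal ksm folio, or the KSM-placed zero page? */
		found = (folio && folio_test_ksm(folio)) || is_ksm_zero_pte(pte);
		if (found) {
			*found_addr = addr;	/* first KSM page in the range */
			break;
		}
	}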
--
Cheers
David / dhildenb
* Re: [PATCH 2/3] ksm: perform a range-walk in break_ksm
2025-10-29 14:45 ` David Hildenbrand
@ 2025-10-30 12:29 ` Pedro Demarchi Gomes
0 siblings, 0 replies; 10+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-30 12:29 UTC (permalink / raw)
To: David Hildenbrand
Cc: Andrew Morton, Xu Xin, Chengming Zhou, linux-mm, linux-kernel
On Wed, Oct 29, 2025 at 03:45:14PM +0100, David Hildenbrand wrote:
> On 28.10.25 14:19, Pedro Demarchi Gomes wrote:
> > Make break_ksm() receive an address range and change
> > break_ksm_pmd_entry() to perform a range-walk and return the address of
> > the first ksm page found.
> >
> > This change allows break_ksm() to skip unmapped regions instead of
> > iterating every page address. When unmerging large sparse VMAs, this
> > significantly reduces runtime, as confirmed by a benchmark test (see
> > cover letter).
>
> Instead of referencing the cover letter, directly include the data here.
>
Ok
> >
> > Suggested-by: David Hildenbrand <david@redhat.com>
> > Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> > ---
> > mm/ksm.c | 88 +++++++++++++++++++++++++++++++-------------------------
> > 1 file changed, 49 insertions(+), 39 deletions(-)
> >
> > diff --git a/mm/ksm.c b/mm/ksm.c
> > index 2a9a7fd4c777..1d1ef0554c7c 100644
> > --- a/mm/ksm.c
> > +++ b/mm/ksm.c
> > @@ -607,34 +607,54 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
> > return atomic_read(&mm->mm_users) == 0;
> > }
> > -static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
> > +struct break_ksm_arg {
> > + unsigned long addr;
> > +};
>
> You can avoid that by simply passing a pointer to addr.
>
> > +
> > +static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
> > struct mm_walk *walk)
> > {
> > - struct page *page = NULL;
> > + struct page *page;
> > spinlock_t *ptl;
> > - pte_t *pte;
> > - pte_t ptent;
> > - int ret;
> > + pte_t *start_ptep = NULL, *ptep, pte;
>
> Is there a need to initialize start_ptep?
>
No, I will fix that.
> > + int ret = 0;
> > + struct mm_struct *mm = walk->mm;
> > + struct break_ksm_arg *private = (struct break_ksm_arg *) walk->private;
>
> Prefer reverse xmas tree:
>
> struct break_ksm_arg *private = (struct break_ksm_arg *) walk->private;
> struct mm_struct *mm = walk->mm;
> ...
>
> With the break_ksm_arg simplification you'd have
>
> unsigned long *found_addr = (unsigned long *)walk->private;
>
>
Ok
> > - pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
> > - if (!pte)
> > + if (ksm_test_exit(walk->mm))
> > return 0;
> > - ptent = ptep_get(pte);
> > - if (pte_present(ptent)) {
> > - page = vm_normal_page(walk->vma, addr, ptent);
> > - } else if (!pte_none(ptent)) {
> > - swp_entry_t entry = pte_to_swp_entry(ptent);
> > - /*
> > - * As KSM pages remain KSM pages until freed, no need to wait
> > - * here for migration to end.
> > - */
> > - if (is_migration_entry(entry))
> > - page = pfn_swap_entry_to_page(entry);
> > + if (signal_pending(current))
> > + return -ERESTARTSYS;
>
> I assume that's not a problem for the other callsites that wouldn't check
> this before.
>
Correct.
> > +
> > + start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> > + if (!start_ptep)
> > + return 0;
> > +
> > + for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
>
> Best to declare pte and folio (see last mail) here in the loop:
>
> pte_t pte = ptep_get(ptep);
> struct folio *folio = NULL;
>
Ok.
> > + pte = ptep_get(ptep);
> > + page = NULL;
> > + if (pte_present(pte)) {
> > + page = vm_normal_page(walk->vma, addr, pte);
> > + } else if (!pte_none(pte)) {
> > + swp_entry_t entry = pte_to_swp_entry(pte);
> > +
> > + /*
> > + * As KSM pages remain KSM pages until freed, no need to wait
> > + * here for migration to end.
> > + */
> > + if (is_migration_entry(entry))
> > + page = pfn_swap_entry_to_page(entry);
> > + }
> > + /* return 1 if the page is a normal ksm page or KSM-placed zero page */
> > + ret = (page && folio_test_ksm(page_folio(page))) || is_ksm_zero_pte(pte);
> > + if (ret) {
> > + private->addr = addr;
> > + goto out_unlock;
> > + }
>
> I suggest you call "ret" "found" instead.
>
Ok.
> > }
> > - /* return 1 if the page is a normal ksm page or KSM-placed zero page */
> > - ret = (page && folio_test_ksm(page_folio(page))) || is_ksm_zero_pte(ptent);
> > - pte_unmap_unlock(pte, ptl);
> > +out_unlock:
> > + pte_unmap_unlock(ptep, ptl);
> > return ret;
> > }
> > @@ -661,9 +681,11 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = {
> > * of the process that owns 'vma'. We also do not want to enforce
> > * protection keys here anyway.
> > */
> > -static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
> > +static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
> > + unsigned long end, bool lock_vma)
> > {
> > vm_fault_t ret = 0;
> > + struct break_ksm_arg break_ksm_arg;
> > const struct mm_walk_ops *ops = lock_vma ?
> > &break_ksm_lock_vma_ops : &break_ksm_ops;
> > @@ -671,11 +693,10 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v
> > int ksm_page;
> > cond_resched();
> > - ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
> > - if (WARN_ON_ONCE(ksm_page < 0))
> > + ksm_page = walk_page_range_vma(vma, addr, end, ops, &break_ksm_arg);
> > + if (ksm_page <= 0)
> > return ksm_page;
> > - if (!ksm_page)
> > - return 0;
> > + addr = break_ksm_arg.addr;
> > ret = handle_mm_fault(vma, addr,
> > FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
> > NULL);
> > @@ -761,7 +782,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
> > mmap_read_lock(mm);
> > vma = find_mergeable_vma(mm, addr);
> > if (vma)
> > - break_ksm(vma, addr, false);
> > + break_ksm(vma, addr, addr + 1, false);
>
> Better to use addr + PAGE_SIZE
>
OK.
> --
> Cheers
>
> David / dhildenb
>
>
* [PATCH 3/3] ksm: replace function unmerge_ksm_pages with break_ksm
2025-10-28 13:19 [PATCH 0/3] ksm: perform a range-walk to jump over holes in break_ksm Pedro Demarchi Gomes
2025-10-28 13:19 ` [PATCH 1/3] Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk" Pedro Demarchi Gomes
2025-10-28 13:19 ` [PATCH 2/3] ksm: perform a range-walk in break_ksm Pedro Demarchi Gomes
@ 2025-10-28 13:19 ` Pedro Demarchi Gomes
2025-10-29 14:46 ` David Hildenbrand
2 siblings, 1 reply; 10+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-28 13:19 UTC (permalink / raw)
To: David Hildenbrand, Andrew Morton
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel, Pedro Demarchi Gomes
Function unmerge_ksm_pages() is unnecessary now that break_ksm() walks
an address range. So replace it with break_ksm().
Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
mm/ksm.c | 39 ++++++++++++++++-----------------------
1 file changed, 16 insertions(+), 23 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c
index 1d1ef0554c7c..18c9e3bda285 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -669,6 +669,18 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = {
};
/*
+ * Though it's very tempting to unmerge rmap_items from stable tree rather
+ * than check every pte of a given vma, the locking doesn't quite work for
+ * that - an rmap_item is assigned to the stable tree after inserting ksm
+ * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
+ * rmap_items from parent to child at fork time (so as not to waste time
+ * if exit comes before the next scan reaches it).
+ *
+ * Similarly, although we'd like to remove rmap_items (so updating counts
+ * and freeing memory) when unmerging an area, it's easier to leave that
+ * to the next pass of ksmd - consider, for example, how ksmd might be
+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
+ *
* We use break_ksm to break COW on a ksm page by triggering unsharing,
* such that the ksm page will get replaced by an exclusive anonymous page.
*
@@ -1077,25 +1089,6 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
}
}
-/*
- * Though it's very tempting to unmerge rmap_items from stable tree rather
- * than check every pte of a given vma, the locking doesn't quite work for
- * that - an rmap_item is assigned to the stable tree after inserting ksm
- * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
- * rmap_items from parent to child at fork time (so as not to waste time
- * if exit comes before the next scan reaches it).
- *
- * Similarly, although we'd like to remove rmap_items (so updating counts
- * and freeing memory) when unmerging an area, it's easier to leave that
- * to the next pass of ksmd - consider, for example, how ksmd might be
- * in cmp_and_merge_page on one of the rmap_items we would be removing.
- */
-static int unmerge_ksm_pages(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, bool lock_vma)
-{
- return break_ksm(vma, start, end, lock_vma);
-}
-
static inline
struct ksm_stable_node *folio_stable_node(const struct folio *folio)
{
@@ -1233,7 +1226,7 @@ static int unmerge_and_remove_all_rmap_items(void)
for_each_vma(vmi, vma) {
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
continue;
- err = unmerge_ksm_pages(vma,
+ err = break_ksm(vma,
vma->vm_start, vma->vm_end, false);
if (err)
goto error;
@@ -2861,7 +2854,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
return 0;
if (vma->anon_vma) {
- err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true);
+ err = break_ksm(vma, vma->vm_start, vma->vm_end, true);
if (err)
return err;
}
@@ -3013,7 +3006,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
return 0; /* just ignore the advice */
if (vma->anon_vma) {
- err = unmerge_ksm_pages(vma, start, end, true);
+ err = break_ksm(vma, start, end, true);
if (err)
return err;
}
@@ -3395,7 +3388,7 @@ static int ksm_memory_callback(struct notifier_block *self,
* Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
* and remove_all_stable_nodes() while memory is going offline:
* it is unsafe for them to touch the stable tree at this time.
- * But unmerge_ksm_pages(), rmap lookups and other entry points
+ * But break_ksm(), rmap lookups and other entry points
* which do not need the ksm_thread_mutex are all safe.
*/
mutex_lock(&ksm_thread_mutex);
--
2.43.0
* Re: [PATCH 3/3] ksm: replace function unmerge_ksm_pages with break_ksm
2025-10-28 13:19 ` [PATCH 3/3] ksm: replace function unmerge_ksm_pages with break_ksm Pedro Demarchi Gomes
@ 2025-10-29 14:46 ` David Hildenbrand
2025-10-30 12:44 ` Pedro Demarchi Gomes
0 siblings, 1 reply; 10+ messages in thread
From: David Hildenbrand @ 2025-10-29 14:46 UTC (permalink / raw)
To: Pedro Demarchi Gomes, Andrew Morton
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel
On 28.10.25 14:19, Pedro Demarchi Gomes wrote:
> Function unmerge_ksm_pages() is unnecessary now that break_ksm() walks
> an address range. So replace it with break_ksm().
>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> ---
> mm/ksm.c | 39 ++++++++++++++++-----------------------
> 1 file changed, 16 insertions(+), 23 deletions(-)
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 1d1ef0554c7c..18c9e3bda285 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -669,6 +669,18 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = {
> };
>
> /*
> + * Though it's very tempting to unmerge rmap_items from stable tree rather
> + * than check every pte of a given vma, the locking doesn't quite work for
> + * that - an rmap_item is assigned to the stable tree after inserting ksm
> + * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
> + * rmap_items from parent to child at fork time (so as not to waste time
> + * if exit comes before the next scan reaches it).
> + *
> + * Similarly, although we'd like to remove rmap_items (so updating counts
> + * and freeing memory) when unmerging an area, it's easier to leave that
> + * to the next pass of ksmd - consider, for example, how ksmd might be
> + * in cmp_and_merge_page on one of the rmap_items we would be removing.
> + *
> * We use break_ksm to break COW on a ksm page by triggering unsharing,
> * such that the ksm page will get replaced by an exclusive anonymous page.
> *
> @@ -1077,25 +1089,6 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
> }
> }
>
> -/*
> - * Though it's very tempting to unmerge rmap_items from stable tree rather
> - * than check every pte of a given vma, the locking doesn't quite work for
> - * that - an rmap_item is assigned to the stable tree after inserting ksm
> - * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
> - * rmap_items from parent to child at fork time (so as not to waste time
> - * if exit comes before the next scan reaches it).
> - *
> - * Similarly, although we'd like to remove rmap_items (so updating counts
> - * and freeing memory) when unmerging an area, it's easier to leave that
> - * to the next pass of ksmd - consider, for example, how ksmd might be
> - * in cmp_and_merge_page on one of the rmap_items we would be removing.
> - */
> -static int unmerge_ksm_pages(struct vm_area_struct *vma,
> - unsigned long start, unsigned long end, bool lock_vma)
> -{
> - return break_ksm(vma, start, end, lock_vma);
> -}
> -
> static inline
> struct ksm_stable_node *folio_stable_node(const struct folio *folio)
> {
> @@ -1233,7 +1226,7 @@ static int unmerge_and_remove_all_rmap_items(void)
> for_each_vma(vmi, vma) {
> if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
> continue;
> - err = unmerge_ksm_pages(vma,
> + err = break_ksm(vma,
> vma->vm_start, vma->vm_end, false);
Move that all into a single line.
With that
Acked-by: David Hildenbrand <david@redhat.com>
Thanks for tackling this!
--
Cheers
David / dhildenb
* Re: [PATCH 3/3] ksm: replace function unmerge_ksm_pages with break_ksm
2025-10-29 14:46 ` David Hildenbrand
@ 2025-10-30 12:44 ` Pedro Demarchi Gomes
0 siblings, 0 replies; 10+ messages in thread
From: Pedro Demarchi Gomes @ 2025-10-30 12:44 UTC (permalink / raw)
To: David Hildenbrand
Cc: Andrew Morton, Xu Xin, Chengming Zhou, linux-mm, linux-kernel
On Wed, Oct 29, 2025 at 03:46:36PM +0100, David Hildenbrand wrote:
> On 28.10.25 14:19, Pedro Demarchi Gomes wrote:
> > Function unmerge_ksm_pages() is unnecessary now that break_ksm() walks
> > an address range. So replace it with break_ksm().
> >
> > Suggested-by: David Hildenbrand <david@redhat.com>
> > Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> > ---
> > mm/ksm.c | 39 ++++++++++++++++-----------------------
> > 1 file changed, 16 insertions(+), 23 deletions(-)
> >
> > diff --git a/mm/ksm.c b/mm/ksm.c
> > index 1d1ef0554c7c..18c9e3bda285 100644
> > --- a/mm/ksm.c
> > +++ b/mm/ksm.c
> > @@ -669,6 +669,18 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = {
> > };
> > /*
> > + * Though it's very tempting to unmerge rmap_items from stable tree rather
> > + * than check every pte of a given vma, the locking doesn't quite work for
> > + * that - an rmap_item is assigned to the stable tree after inserting ksm
> > + * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
> > + * rmap_items from parent to child at fork time (so as not to waste time
> > + * if exit comes before the next scan reaches it).
> > + *
> > + * Similarly, although we'd like to remove rmap_items (so updating counts
> > + * and freeing memory) when unmerging an area, it's easier to leave that
> > + * to the next pass of ksmd - consider, for example, how ksmd might be
> > + * in cmp_and_merge_page on one of the rmap_items we would be removing.
> > + *
> > * We use break_ksm to break COW on a ksm page by triggering unsharing,
> > * such that the ksm page will get replaced by an exclusive anonymous page.
> > *
> > @@ -1077,25 +1089,6 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
> > }
> > }
> > -/*
> > - * Though it's very tempting to unmerge rmap_items from stable tree rather
> > - * than check every pte of a given vma, the locking doesn't quite work for
> > - * that - an rmap_item is assigned to the stable tree after inserting ksm
> > - * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
> > - * rmap_items from parent to child at fork time (so as not to waste time
> > - * if exit comes before the next scan reaches it).
> > - *
> > - * Similarly, although we'd like to remove rmap_items (so updating counts
> > - * and freeing memory) when unmerging an area, it's easier to leave that
> > - * to the next pass of ksmd - consider, for example, how ksmd might be
> > - * in cmp_and_merge_page on one of the rmap_items we would be removing.
> > - */
> > -static int unmerge_ksm_pages(struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end, bool lock_vma)
> > -{
> > - return break_ksm(vma, start, end, lock_vma);
> > -}
> > -
> > static inline
> > struct ksm_stable_node *folio_stable_node(const struct folio *folio)
> > {
> > @@ -1233,7 +1226,7 @@ static int unmerge_and_remove_all_rmap_items(void)
> > for_each_vma(vmi, vma) {
> > if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
> > continue;
> > - err = unmerge_ksm_pages(vma,
> > + err = break_ksm(vma,
> > vma->vm_start, vma->vm_end, false);
>
> Move that all into a single line.
>
Ok.
>
> With that
>
> Acked-by: David Hildenbrand <david@redhat.com>
>
> Thanks for tackling this!
>
Thanks for your comments!
I will send a v2 soon with the corrections.
> --
> Cheers
>
> David / dhildenb
>
>