* [PATCH v4 1/3] Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk"
2025-11-05 18:49 [PATCH v4 0/3] ksm: perform a range-walk to jump over holes in break_ksm Pedro Demarchi Gomes
@ 2025-11-05 18:49 ` Pedro Demarchi Gomes
2025-11-05 18:49 ` [PATCH v4 2/3] ksm: perform a range-walk in break_ksm Pedro Demarchi Gomes
2025-11-05 18:49 ` [PATCH v4 3/3] ksm: replace function unmerge_ksm_pages with break_ksm Pedro Demarchi Gomes
2 siblings, 0 replies; 4+ messages in thread
From: Pedro Demarchi Gomes @ 2025-11-05 18:49 UTC (permalink / raw)
To: David Hildenbrand, Andrew Morton
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel,
Pedro Demarchi Gomes, David Hildenbrand
This reverts commit e317a8d8b4f600fc7ec9725e26417030ee594f52, switching
break_ksm() back from folio_walk_start() to walk_page_range_vma(), and
changes break_ksm_pmd_entry() to use folios.

Unlike a plain revert, break_ksm_pmd_entry() now calls is_ksm_zero_pte()
only when the PTE is known to be present, and the variable ret is
renamed to found.

This will make it easier to later modify break_ksm() to perform a proper
range walk.
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
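Not part of the patch, for context only: a minimal sketch of the pagewalk
convention break_ksm() relies on after this revert (the example_* names are
placeholders, not code from mm/ksm.c). A .pmd_entry callback returning 0
continues the walk, a negative value aborts it with that error, and a
positive value stops the walk and is propagated back by
walk_page_range_vma():

  #include <linux/pagewalk.h>

  static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
                               unsigned long next, struct mm_walk *walk)
  {
          /* Inspect the PTEs covered by this pmd here. */
          return 0;       /* 0: keep walking; >0: stop and report; <0: error */
  }

  static const struct mm_walk_ops example_ops = {
          .pmd_entry = example_pmd_entry,
          .walk_lock = PGWALK_RDLOCK,     /* mmap_lock held for read */
  };

  /* Caller must hold the mmap_lock of vma->vm_mm. */
  static int example_walk(struct vm_area_struct *vma,
                          unsigned long start, unsigned long end)
  {
          /* <0 on error, 0 if the callback never reported anything,
           * otherwise the positive value returned by the callback. */
          return walk_page_range_vma(vma, start, end, &example_ops, NULL);
  }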
mm/ksm.c | 64 ++++++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 48 insertions(+), 16 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c
index 4f672f4f2140..9f74baf01e46 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -607,6 +607,48 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
+static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ struct folio *folio = NULL;
+ spinlock_t *ptl;
+ pte_t *pte;
+ pte_t ptent;
+ int found;
+
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte)
+ return 0;
+ ptent = ptep_get(pte);
+ if (pte_present(ptent)) {
+ folio = vm_normal_folio(walk->vma, addr, ptent);
+ } else if (!pte_none(ptent)) {
+ swp_entry_t entry = pte_to_swp_entry(ptent);
+
+ /*
+ * As KSM pages remain KSM pages until freed, no need to wait
+ * here for migration to end.
+ */
+ if (is_migration_entry(entry))
+ folio = pfn_swap_entry_folio(entry);
+ }
+ /* return 1 if the page is a normal ksm page or KSM-placed zero page */
+ found = (folio && folio_test_ksm(folio)) ||
+ (pte_present(ptent) && is_ksm_zero_pte(ptent));
+ pte_unmap_unlock(pte, ptl);
+ return found;
+}
+
+static const struct mm_walk_ops break_ksm_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops break_ksm_lock_vma_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK,
+};
+
/*
* We use break_ksm to break COW on a ksm page by triggering unsharing,
* such that the ksm page will get replaced by an exclusive anonymous page.
@@ -623,26 +665,16 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
{
vm_fault_t ret = 0;
-
- if (lock_vma)
- vma_start_write(vma);
+ const struct mm_walk_ops *ops = lock_vma ?
+ &break_ksm_lock_vma_ops : &break_ksm_ops;
do {
- bool ksm_page = false;
- struct folio_walk fw;
- struct folio *folio;
+ int ksm_page;
cond_resched();
- folio = folio_walk_start(&fw, vma, addr,
- FW_MIGRATION | FW_ZEROPAGE);
- if (folio) {
- /* Small folio implies FW_LEVEL_PTE. */
- if (!folio_test_large(folio) &&
- (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte)))
- ksm_page = true;
- folio_walk_end(&fw, vma);
- }
-
+ ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
+ if (WARN_ON_ONCE(ksm_page < 0))
+ return ksm_page;
if (!ksm_page)
return 0;
ret = handle_mm_fault(vma, addr,
--
2.43.0
^ permalink raw reply [flat|nested] 4+ messages in thread

* [PATCH v4 2/3] ksm: perform a range-walk in break_ksm
2025-11-05 18:49 [PATCH v4 0/3] ksm: perform a range-walk to jump over holes in break_ksm Pedro Demarchi Gomes
2025-11-05 18:49 ` [PATCH v4 1/3] Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk" Pedro Demarchi Gomes
@ 2025-11-05 18:49 ` Pedro Demarchi Gomes
2025-11-05 18:49 ` [PATCH v4 3/3] ksm: replace function unmerge_ksm_pages with break_ksm Pedro Demarchi Gomes
2 siblings, 0 replies; 4+ messages in thread
From: Pedro Demarchi Gomes @ 2025-11-05 18:49 UTC (permalink / raw)
To: David Hildenbrand, Andrew Morton
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel,
Pedro Demarchi Gomes, David Hildenbrand
Make break_ksm() take an address range and change break_ksm_pmd_entry()
to walk that range, reporting the address of the first KSM page it
finds.

This allows break_ksm() to skip unmapped regions instead of iterating
over every page address, which significantly reduces runtime when
unmerging large sparse VMAs.
In a benchmark unmerging a 32 TiB sparse virtual address space in which
only one page was populated, the runtime dropped from 9 minutes to less
than 5 seconds.
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
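Not part of the patch: a minimal userspace sketch of the sparse-VMA
scenario the benchmark above exercises. The mapping size, timing code
and error handling are illustrative assumptions, not the benchmark
program that produced the numbers in the changelog:

  #define _GNU_SOURCE
  #include <stdio.h>
  #include <sys/mman.h>
  #include <time.h>

  int main(void)
  {
          /* 32 TiB sparse anonymous mapping; only one page is ever touched. */
          size_t len = 32UL << 40;
          char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
          struct timespec t0, t1;

          if (p == MAP_FAILED) {
                  perror("mmap");
                  return 1;
          }
          if (madvise(p, len, MADV_MERGEABLE)) {
                  perror("MADV_MERGEABLE");
                  return 1;
          }
          p[0] = 1;       /* populate a single page */

          clock_gettime(CLOCK_MONOTONIC, &t0);
          if (madvise(p, len, MADV_UNMERGEABLE)) {   /* walks the whole range */
                  perror("MADV_UNMERGEABLE");
                  return 1;
          }
          clock_gettime(CLOCK_MONOTONIC, &t1);

          printf("unmerge took %.2f s\n",
                 (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);
          munmap(p, len);
          return 0;
  }

Before this series the MADV_UNMERGEABLE call above ended up calling
break_ksm() once per page address in the range; with the range walk the
unpopulated holes are skipped.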
mm/ksm.c | 83 +++++++++++++++++++++++++++++---------------------------
1 file changed, 43 insertions(+), 40 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c
index 9f74baf01e46..43be57a6a3fd 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -607,35 +607,50 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
-static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct folio *folio = NULL;
+ unsigned long *found_addr = (unsigned long *) walk->private;
+ struct mm_struct *mm = walk->mm;
+ pte_t *start_ptep, *ptep;
spinlock_t *ptl;
- pte_t *pte;
- pte_t ptent;
- int found;
+ int found = 0;
- pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte)
+ if (ksm_test_exit(walk->mm))
return 0;
- ptent = ptep_get(pte);
- if (pte_present(ptent)) {
- folio = vm_normal_folio(walk->vma, addr, ptent);
- } else if (!pte_none(ptent)) {
- swp_entry_t entry = pte_to_swp_entry(ptent);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
- /*
- * As KSM pages remain KSM pages until freed, no need to wait
- * here for migration to end.
- */
- if (is_migration_entry(entry))
- folio = pfn_swap_entry_folio(entry);
+ start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!start_ptep)
+ return 0;
+
+ for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
+ pte_t pte = ptep_get(ptep);
+ struct folio *folio = NULL;
+
+ if (pte_present(pte)) {
+ folio = vm_normal_folio(walk->vma, addr, pte);
+ } else if (!pte_none(pte)) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ /*
+ * As KSM pages remain KSM pages until freed, no need to wait
+ * here for migration to end.
+ */
+ if (is_migration_entry(entry))
+ folio = pfn_swap_entry_folio(entry);
+ }
+ /* return 1 if the page is a normal ksm page or KSM-placed zero page */
+ found = (folio && folio_test_ksm(folio)) ||
+ (pte_present(pte) && is_ksm_zero_pte(pte));
+ if (found) {
+ *found_addr = addr;
+ goto out_unlock;
+ }
}
- /* return 1 if the page is a normal ksm page or KSM-placed zero page */
- found = (folio && folio_test_ksm(folio)) ||
- (pte_present(ptent) && is_ksm_zero_pte(ptent));
- pte_unmap_unlock(pte, ptl);
+out_unlock:
+ pte_unmap_unlock(ptep, ptl);
return found;
}
@@ -662,7 +677,8 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = {
* of the process that owns 'vma'. We also do not want to enforce
* protection keys here anyway.
*/
-static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long end, bool lock_vma)
{
vm_fault_t ret = 0;
const struct mm_walk_ops *ops = lock_vma ?
@@ -672,11 +688,9 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v
int ksm_page;
cond_resched();
- ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
- if (WARN_ON_ONCE(ksm_page < 0))
+ ksm_page = walk_page_range_vma(vma, addr, end, ops, &addr);
+ if (ksm_page <= 0)
return ksm_page;
- if (!ksm_page)
- return 0;
ret = handle_mm_fault(vma, addr,
FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
NULL);
@@ -762,7 +776,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (vma)
- break_ksm(vma, addr, false);
+ break_ksm(vma, addr, addr + PAGE_SIZE, false);
mmap_read_unlock(mm);
}
@@ -1073,18 +1087,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
static int unmerge_ksm_pages(struct vm_area_struct *vma,
unsigned long start, unsigned long end, bool lock_vma)
{
- unsigned long addr;
- int err = 0;
-
- for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
- if (ksm_test_exit(vma->vm_mm))
- break;
- if (signal_pending(current))
- err = -ERESTARTSYS;
- else
- err = break_ksm(vma, addr, lock_vma);
- }
- return err;
+ return break_ksm(vma, start, end, lock_vma);
}
static inline
--
2.43.0
^ permalink raw reply [flat|nested] 4+ messages in thread

* [PATCH v4 3/3] ksm: replace function unmerge_ksm_pages with break_ksm
2025-11-05 18:49 [PATCH v4 0/3] ksm: perform a range-walk to jump over holes in break_ksm Pedro Demarchi Gomes
2025-11-05 18:49 ` [PATCH v4 1/3] Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk" Pedro Demarchi Gomes
2025-11-05 18:49 ` [PATCH v4 2/3] ksm: perform a range-walk in break_ksm Pedro Demarchi Gomes
@ 2025-11-05 18:49 ` Pedro Demarchi Gomes
2 siblings, 0 replies; 4+ messages in thread
From: Pedro Demarchi Gomes @ 2025-11-05 18:49 UTC (permalink / raw)
To: David Hildenbrand, Andrew Morton
Cc: Xu Xin, Chengming Zhou, linux-mm, linux-kernel,
Pedro Demarchi Gomes, David Hildenbrand
The unmerge_ksm_pages() helper is unnecessary now that break_ksm() walks
an address range, so remove it and call break_ksm() directly.
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
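Not part of the patch: one way to reach the
unmerge_and_remove_all_rmap_items() path touched below is to write 2 to
/sys/kernel/mm/ksm/run (stop ksmd and unmerge all currently merged
pages), as root, e.g. with a sketch like:

  #include <stdio.h>

  int main(void)
  {
          FILE *f = fopen("/sys/kernel/mm/ksm/run", "w");

          if (!f) {
                  perror("/sys/kernel/mm/ksm/run");
                  return 1;
          }
          /* 2 = stop ksmd and unmerge all currently merged pages */
          fputs("2\n", f);
          return fclose(f) ? 1 : 0;
  }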
mm/ksm.c | 40 ++++++++++++++++------------------------
1 file changed, 16 insertions(+), 24 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c
index 43be57a6a3fd..f9a1a3658ead 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -665,6 +665,18 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = {
};
/*
+ * Though it's very tempting to unmerge rmap_items from stable tree rather
+ * than check every pte of a given vma, the locking doesn't quite work for
+ * that - an rmap_item is assigned to the stable tree after inserting ksm
+ * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
+ * rmap_items from parent to child at fork time (so as not to waste time
+ * if exit comes before the next scan reaches it).
+ *
+ * Similarly, although we'd like to remove rmap_items (so updating counts
+ * and freeing memory) when unmerging an area, it's easier to leave that
+ * to the next pass of ksmd - consider, for example, how ksmd might be
+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
+ *
* We use break_ksm to break COW on a ksm page by triggering unsharing,
* such that the ksm page will get replaced by an exclusive anonymous page.
*
@@ -1071,25 +1083,6 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
}
}
-/*
- * Though it's very tempting to unmerge rmap_items from stable tree rather
- * than check every pte of a given vma, the locking doesn't quite work for
- * that - an rmap_item is assigned to the stable tree after inserting ksm
- * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
- * rmap_items from parent to child at fork time (so as not to waste time
- * if exit comes before the next scan reaches it).
- *
- * Similarly, although we'd like to remove rmap_items (so updating counts
- * and freeing memory) when unmerging an area, it's easier to leave that
- * to the next pass of ksmd - consider, for example, how ksmd might be
- * in cmp_and_merge_page on one of the rmap_items we would be removing.
- */
-static int unmerge_ksm_pages(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, bool lock_vma)
-{
- return break_ksm(vma, start, end, lock_vma);
-}
-
static inline
struct ksm_stable_node *folio_stable_node(const struct folio *folio)
{
@@ -1227,8 +1220,7 @@ static int unmerge_and_remove_all_rmap_items(void)
for_each_vma(vmi, vma) {
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
continue;
- err = unmerge_ksm_pages(vma,
- vma->vm_start, vma->vm_end, false);
+ err = break_ksm(vma, vma->vm_start, vma->vm_end, false);
if (err)
goto error;
}
@@ -2855,7 +2847,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
return 0;
if (vma->anon_vma) {
- err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true);
+ err = break_ksm(vma, vma->vm_start, vma->vm_end, true);
if (err)
return err;
}
@@ -3007,7 +2999,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
return 0; /* just ignore the advice */
if (vma->anon_vma) {
- err = unmerge_ksm_pages(vma, start, end, true);
+ err = break_ksm(vma, start, end, true);
if (err)
return err;
}
@@ -3389,7 +3381,7 @@ static int ksm_memory_callback(struct notifier_block *self,
* Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
* and remove_all_stable_nodes() while memory is going offline:
* it is unsafe for them to touch the stable tree at this time.
- * But unmerge_ksm_pages(), rmap lookups and other entry points
+ * But break_ksm(), rmap lookups and other entry points
* which do not need the ksm_thread_mutex are all safe.
*/
mutex_lock(&ksm_thread_mutex);
--
2.43.0
^ permalink raw reply [flat|nested] 4+ messages in thread