* [PATCH 01/10] tools: improve vma test Makefile
2024-08-05 12:13 [PATCH 00/10] mm: remove vma_merge() Lorenzo Stoakes
@ 2024-08-05 12:13 ` Lorenzo Stoakes
2024-08-05 12:13 ` [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters Lorenzo Stoakes
` (8 subsequent siblings)
9 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-05 12:13 UTC (permalink / raw)
To: linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett, Vlastimil Babka
Have vma.o depend explicitly on its source dependencies; previously these
were simply ignored, as an existing object file was considered up to date.
The build is now correctly re-triggered when the mm/ sources change, as
well as when the local source code does.

Also mark clean as a phony target.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
tools/testing/vma/Makefile | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
index bfc905d222cf..860fd2311dcc 100644
--- a/tools/testing/vma/Makefile
+++ b/tools/testing/vma/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-or-later
-.PHONY: default
+.PHONY: default clean
default: vma
@@ -9,7 +9,9 @@ include ../shared/shared.mk
OFILES = $(SHARED_OFILES) vma.o maple-shim.o
TARGETS = vma
-vma: $(OFILES) vma_internal.h ../../../mm/vma.c ../../../mm/vma.h
+vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma.h
+
+vma: $(OFILES)
$(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
clean:
--
2.45.2
* [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters
2024-08-05 12:13 [PATCH 00/10] mm: remove vma_merge() Lorenzo Stoakes
2024-08-05 12:13 ` [PATCH 01/10] tools: improve vma test Makefile Lorenzo Stoakes
@ 2024-08-05 12:13 ` Lorenzo Stoakes
2024-08-06 12:47 ` Petr Tesařík
` (2 more replies)
2024-08-05 12:13 ` [PATCH 03/10] mm: abstract duplicated policy comparison Lorenzo Stoakes
` (7 subsequent siblings)
9 siblings, 3 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-05 12:13 UTC (permalink / raw)
To: linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett, Vlastimil Babka
Rather than passing around huge numbers of parameters to numerous helper
functions, abstract them into a single struct that we thread through the
operation.
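
For instance, the merge predicates shrink from a long parameter list to a
single descriptor (both signatures are taken from the mm/vma.h changes
below):

	/* Before: every caller spells out the full set of fields. */
	bool can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
			struct anon_vma *anon_vma, struct file *file,
			pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
			struct anon_vma_name *anon_name);

	/* After: callers populate a struct vma_merge_struct once and pass it. */
	bool can_vma_merge_before(struct vma_merge_struct *vmg);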
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/mmap.c | 76 ++++++++------
mm/vma.c | 297 ++++++++++++++++++++++++++++++++++++++----------------
mm/vma.h | 92 ++++++++---------
3 files changed, 294 insertions(+), 171 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 4a9c2329b09a..f931000c561f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1369,9 +1369,16 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long end = addr + len;
unsigned long merge_start = addr, merge_end = end;
bool writable_file_mapping = false;
- pgoff_t vm_pgoff;
int error;
VMA_ITERATOR(vmi, mm, addr);
+ struct vma_merge_struct vmg = {
+ .vmi = &vmi,
+ .start = addr,
+ .end = end,
+ .flags = vm_flags,
+ .pgoff = pgoff,
+ .file = file,
+ };
/* Check against address space limit. */
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
@@ -1405,8 +1412,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vm_flags |= VM_ACCOUNT;
}
- next = vma_next(&vmi);
- prev = vma_prev(&vmi);
+ next = vmg.next = vma_next(&vmi);
+ prev = vmg.prev = vma_prev(&vmi);
if (vm_flags & VM_SPECIAL) {
if (prev)
vma_iter_next_range(&vmi);
@@ -1416,29 +1423,30 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/* Attempt to expand an old mapping */
/* Check next */
if (next && next->vm_start == end && !vma_policy(next) &&
- can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
- NULL_VM_UFFD_CTX, NULL)) {
+ can_vma_merge_before(&vmg)) {
merge_end = next->vm_end;
vma = next;
- vm_pgoff = next->vm_pgoff - pglen;
+ vmg.pgoff = next->vm_pgoff - pglen;
+ }
+
+ if (vma) {
+ vmg.anon_vma = vma->anon_vma;
+ vmg.uffd_ctx = vma->vm_userfaultfd_ctx;
}
/* Check prev */
if (prev && prev->vm_end == addr && !vma_policy(prev) &&
- (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
- pgoff, vma->vm_userfaultfd_ctx, NULL) :
- can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
- NULL_VM_UFFD_CTX, NULL))) {
+ can_vma_merge_after(&vmg)) {
merge_start = prev->vm_start;
vma = prev;
- vm_pgoff = prev->vm_pgoff;
+ vmg.pgoff = prev->vm_pgoff;
} else if (prev) {
vma_iter_next_range(&vmi);
}
/* Actually expand, if possible */
if (vma &&
- !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
+ !vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) {
khugepaged_enter_vma(vma, vm_flags);
goto expanded;
}
@@ -1790,25 +1798,31 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
* Expand the existing vma if possible; Note that singular lists do not
* occur after forking, so the expand will only happen on new VMAs.
*/
- if (vma && vma->vm_end == addr && !vma_policy(vma) &&
- can_vma_merge_after(vma, flags, NULL, NULL,
- addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
- vma_iter_config(vmi, vma->vm_start, addr + len);
- if (vma_iter_prealloc(vmi, vma))
- goto unacct_fail;
-
- vma_start_write(vma);
-
- init_vma_prep(&vp, vma);
- vma_prepare(&vp);
- vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
- vma->vm_end = addr + len;
- vm_flags_set(vma, VM_SOFTDIRTY);
- vma_iter_store(vmi, vma);
-
- vma_complete(&vp, vmi, mm);
- khugepaged_enter_vma(vma, flags);
- goto out;
+ if (vma && vma->vm_end == addr && !vma_policy(vma)) {
+ struct vma_merge_struct vmg = {
+ .prev = vma,
+ .flags = flags,
+ .pgoff = addr >> PAGE_SHIFT,
+ };
+
+ if (can_vma_merge_after(&vmg)) {
+ vma_iter_config(vmi, vma->vm_start, addr + len);
+ if (vma_iter_prealloc(vmi, vma))
+ goto unacct_fail;
+
+ vma_start_write(vma);
+
+ init_vma_prep(&vp, vma);
+ vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
+ vma->vm_end = addr + len;
+ vm_flags_set(vma, VM_SOFTDIRTY);
+ vma_iter_store(vmi, vma);
+
+ vma_complete(&vp, vmi, mm);
+ khugepaged_enter_vma(vma, flags);
+ goto out;
+ }
}
if (vma)
diff --git a/mm/vma.c b/mm/vma.c
index bf0546fe6eab..20c4ce7712c0 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -7,16 +7,18 @@
#include "vma_internal.h"
#include "vma.h"
-/*
- * If the vma has a ->close operation then the driver probably needs to release
- * per-vma resources, so we don't attempt to merge those if the caller indicates
- * the current vma may be removed as part of the merge.
- */
-static inline bool is_mergeable_vma(struct vm_area_struct *vma,
- struct file *file, unsigned long vm_flags,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name, bool may_remove_vma)
+static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
+ struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
+ /*
+ * If the vma has a ->close operation then the driver probably needs to
+ * release per-vma resources, so we don't attempt to merge those if the
+ * caller indicates the current vma may be removed as part of the merge,
+ * which is the case if we are attempting to merge the next VMA into
+ * this one.
+ */
+ bool may_remove_vma = merge_next;
+
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
* match the flags but dirty bit -- the caller should mark
@@ -25,15 +27,15 @@ static inline bool is_mergeable_vma(struct vm_area_struct *vma,
* the kernel to generate new VMAs when old one could be
* extended instead.
*/
- if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
+ if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
return false;
- if (vma->vm_file != file)
+ if (vma->vm_file != vmg->file)
return false;
if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
return false;
- if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
+ if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
return false;
- if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
+ if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
return false;
return true;
}
@@ -94,16 +96,16 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
* We assume the vma may be removed as part of the merge.
*/
bool
-can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name)
+can_vma_merge_before(struct vma_merge_struct *vmg)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
- is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
- if (vma->vm_pgoff == vm_pgoff)
+ pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
+
+ if (is_mergeable_vma(vmg, true) &&
+ is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
+ if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
return true;
}
+
return false;
}
@@ -116,18 +118,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
*
* We assume that vma is not removed as part of the merge.
*/
-bool
-can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name)
+bool can_vma_merge_after(struct vma_merge_struct *vmg)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
- is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
- pgoff_t vm_pglen;
-
- vm_pglen = vma_pages(vma);
- if (vma->vm_pgoff + vm_pglen == vm_pgoff)
+ if (is_mergeable_vma(vmg, false) &&
+ is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
+ if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
return true;
}
return false;
@@ -180,7 +175,7 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
* VMA Iterator will point to the end VMA.
*/
static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, int new_below)
+ unsigned long addr, bool new_below)
{
struct vma_prepare vp;
struct vm_area_struct *new;
@@ -261,13 +256,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
* Split a vma into two pieces at address 'addr', a new vma is allocated
* either for the first part or the tail.
*/
-static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, int new_below)
+static int split_vma(struct vma_merge_struct *vmg, bool new_below)
{
- if (vma->vm_mm->map_count >= sysctl_max_map_count)
+ if (vmg->vma->vm_mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
- return __split_vma(vmi, vma, addr, new_below);
+ return __split_vma(vmg->vmi, vmg->vma,
+ new_below ? vmg->start : vmg->end,
+ new_below);
}
/*
@@ -712,7 +708,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
goto map_count_exceeded;
- error = __split_vma(vmi, vma, start, 1);
+ error = __split_vma(vmi, vma, start, true);
if (error)
goto start_split_failed;
}
@@ -725,7 +721,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
do {
/* Does it split the end? */
if (next->vm_end > end) {
- error = __split_vma(vmi, next, end, 0);
+ error = __split_vma(vmi, next, end, false);
if (error)
goto end_split_failed;
}
@@ -934,16 +930,10 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
* **** is not represented - it will be merged and the vma containing the
* area is returned, or the function will return NULL
*/
-static struct vm_area_struct
-*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
- struct vm_area_struct *src, unsigned long addr, unsigned long end,
- unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name)
+static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
{
- struct mm_struct *mm = src->vm_mm;
- struct anon_vma *anon_vma = src->anon_vma;
- struct file *file = src->vm_file;
+ struct mm_struct *mm = container_of(vmg->vmi->mas.tree, struct mm_struct, mm_mt);
+ struct vm_area_struct *prev = vmg->prev;
struct vm_area_struct *curr, *next, *res;
struct vm_area_struct *vma, *adjust, *remove, *remove2;
struct vm_area_struct *anon_dup = NULL;
@@ -953,16 +943,18 @@ static struct vm_area_struct
bool merge_prev = false;
bool merge_next = false;
bool vma_expanded = false;
+ unsigned long addr = vmg->start;
+ unsigned long end = vmg->end;
unsigned long vma_start = addr;
unsigned long vma_end = end;
- pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
+ pgoff_t pglen = PHYS_PFN(end - addr);
long adj_start = 0;
/*
* We later require that vma->vm_flags == vm_flags,
* so this tests vma->vm_flags & VM_SPECIAL, too.
*/
- if (vm_flags & VM_SPECIAL)
+ if (vmg->flags & VM_SPECIAL)
return NULL;
/* Does the input range span an existing VMA? (cases 5 - 8) */
@@ -970,27 +962,26 @@ static struct vm_area_struct
if (!curr || /* cases 1 - 4 */
end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
- next = vma_lookup(mm, end);
+ next = vmg->next = vma_lookup(mm, end);
else
- next = NULL; /* case 5 */
+ next = vmg->next = NULL; /* case 5 */
if (prev) {
vma_start = prev->vm_start;
vma_pgoff = prev->vm_pgoff;
/* Can we merge the predecessor? */
- if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
- && can_vma_merge_after(prev, vm_flags, anon_vma, file,
- pgoff, vm_userfaultfd_ctx, anon_name)) {
+ if (addr == prev->vm_end && mpol_equal(vma_policy(prev), vmg->policy)
+ && can_vma_merge_after(vmg)) {
+
merge_prev = true;
- vma_prev(vmi);
+ vma_prev(vmg->vmi);
}
}
/* Can we merge the successor? */
- if (next && mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
- vm_userfaultfd_ctx, anon_name)) {
+ if (next && mpol_equal(vmg->policy, vma_policy(next)) &&
+ can_vma_merge_before(vmg)) {
merge_next = true;
}
@@ -1041,7 +1032,7 @@ static struct vm_area_struct
remove = curr;
} else { /* case 5 */
adjust = curr;
- adj_start = (end - curr->vm_start);
+ adj_start = end - curr->vm_start;
}
if (!err)
err = dup_anon_vma(prev, curr, &anon_dup);
@@ -1081,13 +1072,13 @@ static struct vm_area_struct
vma_expanded = true;
if (vma_expanded) {
- vma_iter_config(vmi, vma_start, vma_end);
+ vma_iter_config(vmg->vmi, vma_start, vma_end);
} else {
- vma_iter_config(vmi, adjust->vm_start + adj_start,
+ vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
adjust->vm_end);
}
- if (vma_iter_prealloc(vmi, vma))
+ if (vma_iter_prealloc(vmg->vmi, vma))
goto prealloc_fail;
init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
@@ -1099,19 +1090,19 @@ static struct vm_area_struct
vma_set_range(vma, vma_start, vma_end, vma_pgoff);
if (vma_expanded)
- vma_iter_store(vmi, vma);
+ vma_iter_store(vmg->vmi, vma);
if (adj_start) {
adjust->vm_start += adj_start;
adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
if (adj_start < 0) {
WARN_ON(vma_expanded);
- vma_iter_store(vmi, next);
+ vma_iter_store(vmg->vmi, next);
}
}
- vma_complete(&vp, vmi, mm);
- khugepaged_enter_vma(res, vm_flags);
+ vma_complete(&vp, vmg->vmi, mm);
+ khugepaged_enter_vma(res, vmg->flags);
return res;
prealloc_fail:
@@ -1119,8 +1110,8 @@ static struct vm_area_struct
unlink_anon_vmas(anon_dup);
anon_vma_fail:
- vma_iter_set(vmi, addr);
- vma_iter_load(vmi);
+ vma_iter_set(vmg->vmi, addr);
+ vma_iter_load(vmg->vmi);
return NULL;
}
@@ -1137,38 +1128,141 @@ static struct vm_area_struct
* The function returns either the merged VMA, the original VMA if a split was
* required instead, or an error if the split failed.
*/
-struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long vm_flags,
- struct mempolicy *policy,
- struct vm_userfaultfd_ctx uffd_ctx,
- struct anon_vma_name *anon_name)
+static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
{
- pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ struct vm_area_struct *vma = vmg->vma;
struct vm_area_struct *merged;
- merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
- pgoff, policy, uffd_ctx, anon_name);
+ /* First, try to merge. */
+ merged = vma_merge(vmg);
if (merged)
return merged;
- if (vma->vm_start < start) {
- int err = split_vma(vmi, vma, start, 1);
+ /* Split any preceding portion of the VMA. */
+ if (vma->vm_start < vmg->start) {
+ int err = split_vma(vmg, true);
if (err)
return ERR_PTR(err);
}
- if (vma->vm_end > end) {
- int err = split_vma(vmi, vma, end, 0);
+ /* Split any trailing portion of the VMA. */
+ if (vma->vm_end > vmg->end) {
+ int err = split_vma(vmg, false);
if (err)
return ERR_PTR(err);
}
- return vma;
+ return vmg->vma;
+}
+
+/* Assumes addr >= vma->vm_start. */
+static pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr)
+{
+ return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
+}
+
+struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long new_flags)
+{
+ struct vma_merge_struct vmg = {
+ .vmi = vmi,
+ .prev = prev,
+ .vma = vma,
+ .start = start,
+ .end = end,
+ .flags = new_flags,
+ .pgoff = vma_pgoff_offset(vma, start),
+ .file = vma->vm_file,
+ .anon_vma = vma->anon_vma,
+ .policy = vma_policy(vma),
+ .uffd_ctx = vma->vm_userfaultfd_ctx,
+ .anon_name = anon_vma_name(vma),
+ };
+
+ return vma_modify(&vmg);
+}
+
+struct vm_area_struct
+*vma_modify_flags_name(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long new_flags,
+ struct anon_vma_name *new_name)
+{
+ struct vma_merge_struct vmg = {
+ .vmi = vmi,
+ .prev = prev,
+ .vma = vma,
+ .start = start,
+ .end = end,
+ .flags = new_flags,
+ .pgoff = vma_pgoff_offset(vma, start),
+ .file = vma->vm_file,
+ .anon_vma = vma->anon_vma,
+ .policy = vma_policy(vma),
+ .uffd_ctx = vma->vm_userfaultfd_ctx,
+ .anon_name = new_name,
+ };
+
+ return vma_modify(&vmg);
+}
+
+struct vm_area_struct
+*vma_modify_policy(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct mempolicy *new_pol)
+{
+ struct vma_merge_struct vmg = {
+ .vmi = vmi,
+ .prev = prev,
+ .vma = vma,
+ .start = start,
+ .end = end,
+ .flags = vma->vm_flags,
+ .pgoff = vma_pgoff_offset(vma, start),
+ .file = vma->vm_file,
+ .anon_vma = vma->anon_vma,
+ .policy = new_pol,
+ .uffd_ctx = vma->vm_userfaultfd_ctx,
+ .anon_name = anon_vma_name(vma),
+ };
+
+ return vma_modify(&vmg);
+}
+
+struct vm_area_struct
+*vma_modify_flags_uffd(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long new_flags,
+ struct vm_userfaultfd_ctx new_ctx)
+{
+ struct vma_merge_struct vmg = {
+ .vmi = vmi,
+ .prev = prev,
+ .vma = vma,
+ .start = start,
+ .end = end,
+ .flags = new_flags,
+ .file = vma->vm_file,
+ .anon_vma = vma->anon_vma,
+ .pgoff = vma_pgoff_offset(vma, start),
+ .policy = vma_policy(vma),
+ .uffd_ctx = new_ctx,
+ .anon_name = anon_vma_name(vma),
+ };
+
+ return vma_modify(&vmg);
}
/*
@@ -1180,8 +1274,22 @@ struct vm_area_struct
struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff)
{
- return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
- vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ struct vma_merge_struct vmg = {
+ .vmi = vmi,
+ .prev = prev,
+ .vma = vma,
+ .start = start,
+ .end = end,
+ .flags = vma->vm_flags,
+ .file = vma->vm_file,
+ .anon_vma = vma->anon_vma,
+ .pgoff = pgoff,
+ .policy = vma_policy(vma),
+ .uffd_ctx = vma->vm_userfaultfd_ctx,
+ .anon_name = anon_vma_name(vma),
+ };
+
+ return vma_merge(&vmg);
}
/*
@@ -1193,11 +1301,22 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
unsigned long delta)
{
pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
+ struct vma_merge_struct vmg = {
+ .vmi = vmi,
+ .prev = vma,
+ .vma = vma,
+ .start = vma->vm_end,
+ .end = vma->vm_end + delta,
+ .flags = vma->vm_flags,
+ .file = vma->vm_file,
+ .pgoff = pgoff,
+ .policy = vma_policy(vma),
+ .uffd_ctx = vma->vm_userfaultfd_ctx,
+ .anon_name = anon_vma_name(vma),
+ };
/* vma is specified as prev, so case 1 or 2 will apply. */
- return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
- vma->vm_flags, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ return vma_merge(&vmg);
}
void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
diff --git a/mm/vma.h b/mm/vma.h
index 6efdf1768a0a..c31684cc1da6 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -26,6 +26,23 @@ struct unlink_vma_file_batch {
struct vm_area_struct *vmas[8];
};
+/* Represents a VMA merge operation. */
+struct vma_merge_struct {
+ struct vma_iterator *vmi;
+ struct vm_area_struct *prev;
+ struct vm_area_struct *next; /* Modified by vma_merge(). */
+ struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */
+ unsigned long start;
+ unsigned long end;
+ unsigned long flags;
+ pgoff_t pgoff;
+ struct file *file;
+ struct anon_vma *anon_vma;
+ struct mempolicy *policy;
+ struct vm_userfaultfd_ctx uffd_ctx;
+ struct anon_vma_name *anon_name;
+};
+
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
void validate_mm(struct mm_struct *mm);
#else
@@ -72,80 +89,53 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
struct vm_area_struct *next, unsigned long start,
unsigned long end, unsigned long tree_end, bool mm_wr_locked);
-/* Required by mmap_region(). */
-bool
-can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name);
-
-/* Required by mmap_region() and do_brk_flags(). */
-bool
-can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name);
-
-struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long vm_flags,
- struct mempolicy *policy,
- struct vm_userfaultfd_ctx uffd_ctx,
- struct anon_vma_name *anon_name);
+/*
+ * Can we merge the VMA described by vmg into the following VMA vmg->next?
+ *
+ * Required by mmap_region().
+ */
+bool can_vma_merge_before(struct vma_merge_struct *vmg);
+
+/*
+ * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
+ *
+ * Required by mmap_region() and do_brk_flags().
+ */
+bool can_vma_merge_after(struct vma_merge_struct *vmg);
/* We are about to modify the VMA's flags. */
-static inline struct vm_area_struct
-*vma_modify_flags(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long new_flags)
-{
- return vma_modify(vmi, prev, vma, start, end, new_flags,
- vma_policy(vma), vma->vm_userfaultfd_ctx,
- anon_vma_name(vma));
-}
+struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long new_flags);
/* We are about to modify the VMA's flags and/or anon_name. */
-static inline struct vm_area_struct
+struct vm_area_struct
*vma_modify_flags_name(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
unsigned long new_flags,
- struct anon_vma_name *new_name)
-{
- return vma_modify(vmi, prev, vma, start, end, new_flags,
- vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
-}
+ struct anon_vma_name *new_name);
/* We are about to modify the VMA's memory policy. */
-static inline struct vm_area_struct
+struct vm_area_struct
*vma_modify_policy(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- struct mempolicy *new_pol)
-{
- return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
- new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
-}
+ struct mempolicy *new_pol);
/* We are about to modify the VMA's flags and/or uffd context. */
-static inline struct vm_area_struct
+struct vm_area_struct
*vma_modify_flags_uffd(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx)
-{
- return vma_modify(vmi, prev, vma, start, end, new_flags,
- vma_policy(vma), new_ctx, anon_vma_name(vma));
-}
+ struct vm_userfaultfd_ctx new_ctx);
struct vm_area_struct
*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
--
2.45.2
* Re: [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters
2024-08-05 12:13 ` [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters Lorenzo Stoakes
@ 2024-08-06 12:47 ` Petr Tesařík
2024-08-06 13:43 ` Lorenzo Stoakes
2024-08-08 12:49 ` Vlastimil Babka
2024-08-08 20:07 ` Liam R. Howlett
2 siblings, 1 reply; 53+ messages in thread
From: Petr Tesařík @ 2024-08-06 12:47 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
Hi Lorenzo!
On Mon, 5 Aug 2024 13:13:49 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> Rather than passing around huge numbers of parameters to numerous helper
> functions, abstract them into a single struct that we thread through the
> operation.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/mmap.c | 76 ++++++++------
> mm/vma.c | 297 ++++++++++++++++++++++++++++++++++++++----------------
> mm/vma.h | 92 ++++++++---------
> 3 files changed, 294 insertions(+), 171 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 4a9c2329b09a..f931000c561f 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1369,9 +1369,16 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> unsigned long end = addr + len;
> unsigned long merge_start = addr, merge_end = end;
> bool writable_file_mapping = false;
> - pgoff_t vm_pgoff;
> int error;
> VMA_ITERATOR(vmi, mm, addr);
> + struct vma_merge_struct vmg = {
> + .vmi = &vmi,
> + .start = addr,
> + .end = end,
> + .flags = vm_flags,
> + .pgoff = pgoff,
> + .file = file,
> + };
>
> /* Check against address space limit. */
> if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> @@ -1405,8 +1412,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> vm_flags |= VM_ACCOUNT;
> }
>
> - next = vma_next(&vmi);
> - prev = vma_prev(&vmi);
> + next = vmg.next = vma_next(&vmi);
> + prev = vmg.prev = vma_prev(&vmi);
So, next is now a shortcut for vmg.next, and prev is a shortcut for
vmg.prev. ATM there is only one assignment, so no big deal, but I
wonder if next and prev could be removed instead, same as you replaced
vm_pgoff with vmg.pgoff.
Is the resulting code _too_ ugly?
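
Something like this, perhaps (untested, only to illustrate what I mean):

	vmg.next = vma_next(&vmi);
	vmg.prev = vma_prev(&vmi);
	...
	/* Check next */
	if (vmg.next && vmg.next->vm_start == end && !vma_policy(vmg.next) &&
	    can_vma_merge_before(&vmg)) {
		merge_end = vmg.next->vm_end;
		vma = vmg.next;
		vmg.pgoff = vmg.next->vm_pgoff - pglen;
	}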
> if (vm_flags & VM_SPECIAL) {
> if (prev)
> vma_iter_next_range(&vmi);
> @@ -1416,29 +1423,30 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> /* Attempt to expand an old mapping */
> /* Check next */
> if (next && next->vm_start == end && !vma_policy(next) &&
> - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
> - NULL_VM_UFFD_CTX, NULL)) {
> + can_vma_merge_before(&vmg)) {
> merge_end = next->vm_end;
> vma = next;
> - vm_pgoff = next->vm_pgoff - pglen;
> + vmg.pgoff = next->vm_pgoff - pglen;
> + }
> +
> + if (vma) {
> + vmg.anon_vma = vma->anon_vma;
> + vmg.uffd_ctx = vma->vm_userfaultfd_ctx;
> }
>
> /* Check prev */
> if (prev && prev->vm_end == addr && !vma_policy(prev) &&
> - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
> - pgoff, vma->vm_userfaultfd_ctx, NULL) :
> - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
> - NULL_VM_UFFD_CTX, NULL))) {
> + can_vma_merge_after(&vmg)) {
> merge_start = prev->vm_start;
> vma = prev;
> - vm_pgoff = prev->vm_pgoff;
> + vmg.pgoff = prev->vm_pgoff;
> } else if (prev) {
> vma_iter_next_range(&vmi);
> }
>
> /* Actually expand, if possible */
> if (vma &&
> - !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
> + !vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) {
> khugepaged_enter_vma(vma, vm_flags);
> goto expanded;
> }
> @@ -1790,25 +1798,31 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> * Expand the existing vma if possible; Note that singular lists do not
> * occur after forking, so the expand will only happen on new VMAs.
> */
> - if (vma && vma->vm_end == addr && !vma_policy(vma) &&
> - can_vma_merge_after(vma, flags, NULL, NULL,
> - addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
> - vma_iter_config(vmi, vma->vm_start, addr + len);
> - if (vma_iter_prealloc(vmi, vma))
> - goto unacct_fail;
> -
> - vma_start_write(vma);
> -
> - init_vma_prep(&vp, vma);
> - vma_prepare(&vp);
> - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> - vma->vm_end = addr + len;
> - vm_flags_set(vma, VM_SOFTDIRTY);
> - vma_iter_store(vmi, vma);
> -
> - vma_complete(&vp, vmi, mm);
> - khugepaged_enter_vma(vma, flags);
> - goto out;
> + if (vma && vma->vm_end == addr && !vma_policy(vma)) {
> + struct vma_merge_struct vmg = {
> + .prev = vma,
> + .flags = flags,
> + .pgoff = addr >> PAGE_SHIFT,
> + };
> +
> + if (can_vma_merge_after(&vmg)) {
> + vma_iter_config(vmi, vma->vm_start, addr + len);
> + if (vma_iter_prealloc(vmi, vma))
> + goto unacct_fail;
> +
> + vma_start_write(vma);
> +
> + init_vma_prep(&vp, vma);
> + vma_prepare(&vp);
> + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> + vma->vm_end = addr + len;
> + vm_flags_set(vma, VM_SOFTDIRTY);
> + vma_iter_store(vmi, vma);
> +
> + vma_complete(&vp, vmi, mm);
> + khugepaged_enter_vma(vma, flags);
> + goto out;
> + }
> }
>
> if (vma)
> diff --git a/mm/vma.c b/mm/vma.c
> index bf0546fe6eab..20c4ce7712c0 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -7,16 +7,18 @@
> #include "vma_internal.h"
> #include "vma.h"
>
> -/*
> - * If the vma has a ->close operation then the driver probably needs to release
> - * per-vma resources, so we don't attempt to merge those if the caller indicates
> - * the current vma may be removed as part of the merge.
> - */
> -static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> - struct file *file, unsigned long vm_flags,
> - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> - struct anon_vma_name *anon_name, bool may_remove_vma)
> +static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
> {
> + struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
> + /*
> + * If the vma has a ->close operation then the driver probably needs to
> + * release per-vma resources, so we don't attempt to merge those if the
> + * caller indicates the current vma may be removed as part of the merge,
> + * which is the case if we are attempting to merge the next VMA into
> + * this one.
> + */
> + bool may_remove_vma = merge_next;
> +
This variable is used only once. If you want to clarify the double
meaning of the merge_next parameter, consider moving this comment
further down to the conditional and merely renaming the parameter.
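
I.e. something along these lines (untested, if I read the intent right):

	static inline bool is_mergeable_vma(struct vma_merge_struct *vmg,
					    bool may_remove_vma)
	{
		struct vm_area_struct *vma = may_remove_vma ? vmg->next : vmg->prev;
		...
		/*
		 * If the vma has a ->close operation then the driver probably
		 * needs to release per-vma resources, so don't attempt to merge
		 * if this vma may be removed, i.e. when merging the next VMA
		 * into this one.
		 */
		if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
			return false;
		...
	}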
> /*
> * VM_SOFTDIRTY should not prevent from VMA merging, if we
> * match the flags but dirty bit -- the caller should mark
> @@ -25,15 +27,15 @@ static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> * the kernel to generate new VMAs when old one could be
> * extended instead.
> */
> - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
> + if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
> return false;
> - if (vma->vm_file != file)
> + if (vma->vm_file != vmg->file)
> return false;
> if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
AFAICS this is the only place where may_remove_vma is used.
> return false;
> - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
> + if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
> return false;
> - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
> + if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
> return false;
> return true;
> }
> @@ -94,16 +96,16 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> * We assume the vma may be removed as part of the merge.
> */
> bool
> -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> - struct anon_vma *anon_vma, struct file *file,
> - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> - struct anon_vma_name *anon_name)
> +can_vma_merge_before(struct vma_merge_struct *vmg)
> {
> - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
> - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> - if (vma->vm_pgoff == vm_pgoff)
> + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> +
> + if (is_mergeable_vma(vmg, true) &&
> + is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
> + if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
> return true;
> }
> +
> return false;
> }
>
> @@ -116,18 +118,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> *
> * We assume that vma is not removed as part of the merge.
> */
> -bool
> -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> - struct anon_vma *anon_vma, struct file *file,
> - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> - struct anon_vma_name *anon_name)
> +bool can_vma_merge_after(struct vma_merge_struct *vmg)
> {
> - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
> - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> - pgoff_t vm_pglen;
> -
> - vm_pglen = vma_pages(vma);
> - if (vma->vm_pgoff + vm_pglen == vm_pgoff)
> + if (is_mergeable_vma(vmg, false) &&
> + is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> + if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
> return true;
> }
> return false;
> @@ -180,7 +175,7 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> * VMA Iterator will point to the end VMA.
> */
> static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> - unsigned long addr, int new_below)
> + unsigned long addr, bool new_below)
> {
> struct vma_prepare vp;
> struct vm_area_struct *new;
> @@ -261,13 +256,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> * Split a vma into two pieces at address 'addr', a new vma is allocated
> * either for the first part or the tail.
> */
> -static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> - unsigned long addr, int new_below)
> +static int split_vma(struct vma_merge_struct *vmg, bool new_below)
IMHO this patch is already long enough. Maybe the type change from int
to bool could be split out to a separate patch to reduce churn here?
> {
> - if (vma->vm_mm->map_count >= sysctl_max_map_count)
> + if (vmg->vma->vm_mm->map_count >= sysctl_max_map_count)
> return -ENOMEM;
>
> - return __split_vma(vmi, vma, addr, new_below);
> + return __split_vma(vmg->vmi, vmg->vma,
> + new_below ? vmg->start : vmg->end,
> + new_below);
> }
>
> /*
> @@ -712,7 +708,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
> goto map_count_exceeded;
>
> - error = __split_vma(vmi, vma, start, 1);
> + error = __split_vma(vmi, vma, start, true);
> if (error)
> goto start_split_failed;
> }
> @@ -725,7 +721,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> do {
> /* Does it split the end? */
> if (next->vm_end > end) {
> - error = __split_vma(vmi, next, end, 0);
> + error = __split_vma(vmi, next, end, false);
> if (error)
> goto end_split_failed;
> }
> @@ -934,16 +930,10 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> * **** is not represented - it will be merged and the vma containing the
> * area is returned, or the function will return NULL
> */
> -static struct vm_area_struct
> -*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
> - struct vm_area_struct *src, unsigned long addr, unsigned long end,
> - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
> - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> - struct anon_vma_name *anon_name)
> +static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
> {
> - struct mm_struct *mm = src->vm_mm;
> - struct anon_vma *anon_vma = src->anon_vma;
> - struct file *file = src->vm_file;
> + struct mm_struct *mm = container_of(vmg->vmi->mas.tree, struct mm_struct, mm_mt);
> + struct vm_area_struct *prev = vmg->prev;
> struct vm_area_struct *curr, *next, *res;
> struct vm_area_struct *vma, *adjust, *remove, *remove2;
> struct vm_area_struct *anon_dup = NULL;
> @@ -953,16 +943,18 @@ static struct vm_area_struct
> bool merge_prev = false;
> bool merge_next = false;
> bool vma_expanded = false;
> + unsigned long addr = vmg->start;
> + unsigned long end = vmg->end;
> unsigned long vma_start = addr;
> unsigned long vma_end = end;
> - pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
> + pgoff_t pglen = PHYS_PFN(end - addr);
> long adj_start = 0;
>
> /*
> * We later require that vma->vm_flags == vm_flags,
> * so this tests vma->vm_flags & VM_SPECIAL, too.
> */
> - if (vm_flags & VM_SPECIAL)
> + if (vmg->flags & VM_SPECIAL)
> return NULL;
>
> /* Does the input range span an existing VMA? (cases 5 - 8) */
> @@ -970,27 +962,26 @@ static struct vm_area_struct
>
> if (!curr || /* cases 1 - 4 */
> end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
> - next = vma_lookup(mm, end);
> + next = vmg->next = vma_lookup(mm, end);
> else
> - next = NULL; /* case 5 */
> + next = vmg->next = NULL; /* case 5 */
Again, is it worth keeping the "next" variable, or could we replace it
with "vmg->next" everywhere?
No other comments on the rest of this patch.
Petr T
>
> if (prev) {
> vma_start = prev->vm_start;
> vma_pgoff = prev->vm_pgoff;
>
> /* Can we merge the predecessor? */
> - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
> - && can_vma_merge_after(prev, vm_flags, anon_vma, file,
> - pgoff, vm_userfaultfd_ctx, anon_name)) {
> + if (addr == prev->vm_end && mpol_equal(vma_policy(prev), vmg->policy)
> + && can_vma_merge_after(vmg)) {
> +
> merge_prev = true;
> - vma_prev(vmi);
> + vma_prev(vmg->vmi);
> }
> }
>
> /* Can we merge the successor? */
> - if (next && mpol_equal(policy, vma_policy(next)) &&
> - can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
> - vm_userfaultfd_ctx, anon_name)) {
> + if (next && mpol_equal(vmg->policy, vma_policy(next)) &&
> + can_vma_merge_before(vmg)) {
> merge_next = true;
> }
>
> @@ -1041,7 +1032,7 @@ static struct vm_area_struct
> remove = curr;
> } else { /* case 5 */
> adjust = curr;
> - adj_start = (end - curr->vm_start);
> + adj_start = end - curr->vm_start;
> }
> if (!err)
> err = dup_anon_vma(prev, curr, &anon_dup);
> @@ -1081,13 +1072,13 @@ static struct vm_area_struct
> vma_expanded = true;
>
> if (vma_expanded) {
> - vma_iter_config(vmi, vma_start, vma_end);
> + vma_iter_config(vmg->vmi, vma_start, vma_end);
> } else {
> - vma_iter_config(vmi, adjust->vm_start + adj_start,
> + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> adjust->vm_end);
> }
>
> - if (vma_iter_prealloc(vmi, vma))
> + if (vma_iter_prealloc(vmg->vmi, vma))
> goto prealloc_fail;
>
> init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
> @@ -1099,19 +1090,19 @@ static struct vm_area_struct
> vma_set_range(vma, vma_start, vma_end, vma_pgoff);
>
> if (vma_expanded)
> - vma_iter_store(vmi, vma);
> + vma_iter_store(vmg->vmi, vma);
>
> if (adj_start) {
> adjust->vm_start += adj_start;
> adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
> if (adj_start < 0) {
> WARN_ON(vma_expanded);
> - vma_iter_store(vmi, next);
> + vma_iter_store(vmg->vmi, next);
> }
> }
>
> - vma_complete(&vp, vmi, mm);
> - khugepaged_enter_vma(res, vm_flags);
> + vma_complete(&vp, vmg->vmi, mm);
> + khugepaged_enter_vma(res, vmg->flags);
> return res;
>
> prealloc_fail:
> @@ -1119,8 +1110,8 @@ static struct vm_area_struct
> unlink_anon_vmas(anon_dup);
>
> anon_vma_fail:
> - vma_iter_set(vmi, addr);
> - vma_iter_load(vmi);
> + vma_iter_set(vmg->vmi, addr);
> + vma_iter_load(vmg->vmi);
> return NULL;
> }
>
> @@ -1137,38 +1128,141 @@ static struct vm_area_struct
> * The function returns either the merged VMA, the original VMA if a split was
> * required instead, or an error if the split failed.
> */
> -struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
> - struct vm_area_struct *prev,
> - struct vm_area_struct *vma,
> - unsigned long start, unsigned long end,
> - unsigned long vm_flags,
> - struct mempolicy *policy,
> - struct vm_userfaultfd_ctx uffd_ctx,
> - struct anon_vma_name *anon_name)
> +static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
> {
> - pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> + struct vm_area_struct *vma = vmg->vma;
> struct vm_area_struct *merged;
>
> - merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
> - pgoff, policy, uffd_ctx, anon_name);
> + /* First, try to merge. */
> + merged = vma_merge(vmg);
> if (merged)
> return merged;
>
> - if (vma->vm_start < start) {
> - int err = split_vma(vmi, vma, start, 1);
> + /* Split any preceding portion of the VMA. */
> + if (vma->vm_start < vmg->start) {
> + int err = split_vma(vmg, true);
>
> if (err)
> return ERR_PTR(err);
> }
>
> - if (vma->vm_end > end) {
> - int err = split_vma(vmi, vma, end, 0);
> + /* Split any trailing portion of the VMA. */
> + if (vma->vm_end > vmg->end) {
> + int err = split_vma(vmg, false);
>
> if (err)
> return ERR_PTR(err);
> }
>
> - return vma;
> + return vmg->vma;
> +}
> +
> +/* Assumes addr >= vma->vm_start. */
> +static pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr)
> +{
> + return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
> +}
> +
> +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start, unsigned long end,
> + unsigned long new_flags)
> +{
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = new_flags,
> + .pgoff = vma_pgoff_offset(vma, start),
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
> +
> + return vma_modify(&vmg);
> +}
> +
> +struct vm_area_struct
> +*vma_modify_flags_name(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start,
> + unsigned long end,
> + unsigned long new_flags,
> + struct anon_vma_name *new_name)
> +{
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = new_flags,
> + .pgoff = vma_pgoff_offset(vma, start),
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = new_name,
> + };
> +
> + return vma_modify(&vmg);
> +}
> +
> +struct vm_area_struct
> +*vma_modify_policy(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start, unsigned long end,
> + struct mempolicy *new_pol)
> +{
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = vma->vm_flags,
> + .pgoff = vma_pgoff_offset(vma, start),
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = new_pol,
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
> +
> + return vma_modify(&vmg);
> +}
> +
> +struct vm_area_struct
> +*vma_modify_flags_uffd(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start, unsigned long end,
> + unsigned long new_flags,
> + struct vm_userfaultfd_ctx new_ctx)
> +{
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = new_flags,
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .pgoff = vma_pgoff_offset(vma, start),
> + .policy = vma_policy(vma),
> + .uffd_ctx = new_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
> +
> + return vma_modify(&vmg);
> }
>
> /*
> @@ -1180,8 +1274,22 @@ struct vm_area_struct
> struct vm_area_struct *vma, unsigned long start,
> unsigned long end, pgoff_t pgoff)
> {
> - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
> - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = vma->vm_flags,
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .pgoff = pgoff,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
> +
> + return vma_merge(&vmg);
> }
>
> /*
> @@ -1193,11 +1301,22 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> unsigned long delta)
> {
> pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = vma,
> + .vma = vma,
> + .start = vma->vm_end,
> + .end = vma->vm_end + delta,
> + .flags = vma->vm_flags,
> + .file = vma->vm_file,
> + .pgoff = pgoff,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
>
> /* vma is specified as prev, so case 1 or 2 will apply. */
> - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
> - vma->vm_flags, pgoff, vma_policy(vma),
> - vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> + return vma_merge(&vmg);
> }
>
> void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> diff --git a/mm/vma.h b/mm/vma.h
> index 6efdf1768a0a..c31684cc1da6 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -26,6 +26,23 @@ struct unlink_vma_file_batch {
> struct vm_area_struct *vmas[8];
> };
>
> +/* Represents a VMA merge operation. */
> +struct vma_merge_struct {
> + struct vma_iterator *vmi;
> + struct vm_area_struct *prev;
> + struct vm_area_struct *next; /* Modified by vma_merge(). */
> + struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */
> + unsigned long start;
> + unsigned long end;
> + unsigned long flags;
> + pgoff_t pgoff;
> + struct file *file;
> + struct anon_vma *anon_vma;
> + struct mempolicy *policy;
> + struct vm_userfaultfd_ctx uffd_ctx;
> + struct anon_vma_name *anon_name;
> +};
> +
> #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
> void validate_mm(struct mm_struct *mm);
> #else
> @@ -72,80 +89,53 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> struct vm_area_struct *next, unsigned long start,
> unsigned long end, unsigned long tree_end, bool mm_wr_locked);
>
> -/* Required by mmap_region(). */
> -bool
> -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> - struct anon_vma *anon_vma, struct file *file,
> - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> - struct anon_vma_name *anon_name);
> -
> -/* Required by mmap_region() and do_brk_flags(). */
> -bool
> -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> - struct anon_vma *anon_vma, struct file *file,
> - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> - struct anon_vma_name *anon_name);
> -
> -struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
> - struct vm_area_struct *prev,
> - struct vm_area_struct *vma,
> - unsigned long start, unsigned long end,
> - unsigned long vm_flags,
> - struct mempolicy *policy,
> - struct vm_userfaultfd_ctx uffd_ctx,
> - struct anon_vma_name *anon_name);
> +/*
> + * Can we merge the VMA described by vmg into the following VMA vmg->next?
> + *
> + * Required by mmap_region().
> + */
> +bool can_vma_merge_before(struct vma_merge_struct *vmg);
> +
> +/*
> + * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> + *
> + * Required by mmap_region() and do_brk_flags().
> + */
> +bool can_vma_merge_after(struct vma_merge_struct *vmg);
>
> /* We are about to modify the VMA's flags. */
> -static inline struct vm_area_struct
> -*vma_modify_flags(struct vma_iterator *vmi,
> - struct vm_area_struct *prev,
> - struct vm_area_struct *vma,
> - unsigned long start, unsigned long end,
> - unsigned long new_flags)
> -{
> - return vma_modify(vmi, prev, vma, start, end, new_flags,
> - vma_policy(vma), vma->vm_userfaultfd_ctx,
> - anon_vma_name(vma));
> -}
> +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start, unsigned long end,
> + unsigned long new_flags);
>
> /* We are about to modify the VMA's flags and/or anon_name. */
> -static inline struct vm_area_struct
> +struct vm_area_struct
> *vma_modify_flags_name(struct vma_iterator *vmi,
> struct vm_area_struct *prev,
> struct vm_area_struct *vma,
> unsigned long start,
> unsigned long end,
> unsigned long new_flags,
> - struct anon_vma_name *new_name)
> -{
> - return vma_modify(vmi, prev, vma, start, end, new_flags,
> - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
> -}
> + struct anon_vma_name *new_name);
>
> /* We are about to modify the VMA's memory policy. */
> -static inline struct vm_area_struct
> +struct vm_area_struct
> *vma_modify_policy(struct vma_iterator *vmi,
> struct vm_area_struct *prev,
> struct vm_area_struct *vma,
> unsigned long start, unsigned long end,
> - struct mempolicy *new_pol)
> -{
> - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
> - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> -}
> + struct mempolicy *new_pol);
>
> /* We are about to modify the VMA's flags and/or uffd context. */
> -static inline struct vm_area_struct
> +struct vm_area_struct
> *vma_modify_flags_uffd(struct vma_iterator *vmi,
> struct vm_area_struct *prev,
> struct vm_area_struct *vma,
> unsigned long start, unsigned long end,
> unsigned long new_flags,
> - struct vm_userfaultfd_ctx new_ctx)
> -{
> - return vma_modify(vmi, prev, vma, start, end, new_flags,
> - vma_policy(vma), new_ctx, anon_vma_name(vma));
> -}
> + struct vm_userfaultfd_ctx new_ctx);
>
> struct vm_area_struct
> *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
* Re: [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters
2024-08-06 12:47 ` Petr Tesařík
@ 2024-08-06 13:43 ` Lorenzo Stoakes
2024-08-06 14:06 ` Petr Tesařík
0 siblings, 1 reply; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-06 13:43 UTC (permalink / raw)
To: Petr Tesařík
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, Aug 06, 2024 at 02:47:54PM GMT, Petr Tesařík wrote:
> Hi Lorenzo!
>
> On Mon, 5 Aug 2024 13:13:49 +0100
> Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>
> > Rather than passing around huge numbers of parameters to numerous helper
> > functions, abstract them into a single struct that we thread through the
> > operation.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > mm/mmap.c | 76 ++++++++------
> > mm/vma.c | 297 ++++++++++++++++++++++++++++++++++++++----------------
> > mm/vma.h | 92 ++++++++---------
> > 3 files changed, 294 insertions(+), 171 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 4a9c2329b09a..f931000c561f 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1369,9 +1369,16 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > unsigned long end = addr + len;
> > unsigned long merge_start = addr, merge_end = end;
> > bool writable_file_mapping = false;
> > - pgoff_t vm_pgoff;
> > int error;
> > VMA_ITERATOR(vmi, mm, addr);
> > + struct vma_merge_struct vmg = {
> > + .vmi = &vmi,
> > + .start = addr,
> > + .end = end,
> > + .flags = vm_flags,
> > + .pgoff = pgoff,
> > + .file = file,
> > + };
> >
> > /* Check against address space limit. */
> > if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> > @@ -1405,8 +1412,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > vm_flags |= VM_ACCOUNT;
> > }
> >
> > - next = vma_next(&vmi);
> > - prev = vma_prev(&vmi);
> > + next = vmg.next = vma_next(&vmi);
> > + prev = vmg.prev = vma_prev(&vmi);
>
> So, next is now a shortcut for vmg.next, and prev is a shortcut for
> vmg.prev. ATM there is only one assignment, so no big deal, but I
> wonder if next and prev could be removed instead, same as you replaced
> vm_pgoff with vmg.pgoff.
It's simply to avoid repeatedly referencing vmg.xxx / at least reduce
_some_ churn. Also this will get moved shortly, so it's worth looking at in
final form.
>
> Is the resulting code _too_ ugly?
>
> > if (vm_flags & VM_SPECIAL) {
> > if (prev)
> > vma_iter_next_range(&vmi);
> > @@ -1416,29 +1423,30 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > /* Attempt to expand an old mapping */
> > /* Check next */
> > if (next && next->vm_start == end && !vma_policy(next) &&
> > - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
> > - NULL_VM_UFFD_CTX, NULL)) {
> > + can_vma_merge_before(&vmg)) {
> > merge_end = next->vm_end;
> > vma = next;
> > - vm_pgoff = next->vm_pgoff - pglen;
> > + vmg.pgoff = next->vm_pgoff - pglen;
> > + }
> > +
> > + if (vma) {
> > + vmg.anon_vma = vma->anon_vma;
> > + vmg.uffd_ctx = vma->vm_userfaultfd_ctx;
> > }
> >
> > /* Check prev */
> > if (prev && prev->vm_end == addr && !vma_policy(prev) &&
> > - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
> > - pgoff, vma->vm_userfaultfd_ctx, NULL) :
> > - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
> > - NULL_VM_UFFD_CTX, NULL))) {
> > + can_vma_merge_after(&vmg)) {
> > merge_start = prev->vm_start;
> > vma = prev;
> > - vm_pgoff = prev->vm_pgoff;
> > + vmg.pgoff = prev->vm_pgoff;
> > } else if (prev) {
> > vma_iter_next_range(&vmi);
> > }
> >
> > /* Actually expand, if possible */
> > if (vma &&
> > - !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
> > + !vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) {
> > khugepaged_enter_vma(vma, vm_flags);
> > goto expanded;
> > }
> > @@ -1790,25 +1798,31 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > * Expand the existing vma if possible; Note that singular lists do not
> > * occur after forking, so the expand will only happen on new VMAs.
> > */
> > - if (vma && vma->vm_end == addr && !vma_policy(vma) &&
> > - can_vma_merge_after(vma, flags, NULL, NULL,
> > - addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
> > - vma_iter_config(vmi, vma->vm_start, addr + len);
> > - if (vma_iter_prealloc(vmi, vma))
> > - goto unacct_fail;
> > -
> > - vma_start_write(vma);
> > -
> > - init_vma_prep(&vp, vma);
> > - vma_prepare(&vp);
> > - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > - vma->vm_end = addr + len;
> > - vm_flags_set(vma, VM_SOFTDIRTY);
> > - vma_iter_store(vmi, vma);
> > -
> > - vma_complete(&vp, vmi, mm);
> > - khugepaged_enter_vma(vma, flags);
> > - goto out;
> > + if (vma && vma->vm_end == addr && !vma_policy(vma)) {
> > + struct vma_merge_struct vmg = {
> > + .prev = vma,
> > + .flags = flags,
> > + .pgoff = addr >> PAGE_SHIFT,
> > + };
> > +
> > + if (can_vma_merge_after(&vmg)) {
> > + vma_iter_config(vmi, vma->vm_start, addr + len);
> > + if (vma_iter_prealloc(vmi, vma))
> > + goto unacct_fail;
> > +
> > + vma_start_write(vma);
> > +
> > + init_vma_prep(&vp, vma);
> > + vma_prepare(&vp);
> > + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > + vma->vm_end = addr + len;
> > + vm_flags_set(vma, VM_SOFTDIRTY);
> > + vma_iter_store(vmi, vma);
> > +
> > + vma_complete(&vp, vmi, mm);
> > + khugepaged_enter_vma(vma, flags);
> > + goto out;
> > + }
> > }
> >
> > if (vma)
> > diff --git a/mm/vma.c b/mm/vma.c
> > index bf0546fe6eab..20c4ce7712c0 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -7,16 +7,18 @@
> > #include "vma_internal.h"
> > #include "vma.h"
> >
> > -/*
> > - * If the vma has a ->close operation then the driver probably needs to release
> > - * per-vma resources, so we don't attempt to merge those if the caller indicates
> > - * the current vma may be removed as part of the merge.
> > - */
> > -static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> > - struct file *file, unsigned long vm_flags,
> > - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name, bool may_remove_vma)
> > +static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
> > {
> > + struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
> > + /*
> > + * If the vma has a ->close operation then the driver probably needs to
> > + * release per-vma resources, so we don't attempt to merge those if the
> > + * caller indicates the current vma may be removed as part of the merge,
> > + * which is the case if we are attempting to merge the next VMA into
> > + * this one.
> > + */
> > + bool may_remove_vma = merge_next;
> > +
>
> This variable is used only once. If you want to clarify the double
> meaning of the merge_next parameter, consider moving this comment
> further down to the conditional and merely renaming the parameter.
>
> > /*
> > * VM_SOFTDIRTY should not prevent from VMA merging, if we
> > * match the flags but dirty bit -- the caller should mark
> > @@ -25,15 +27,15 @@ static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> > * the kernel to generate new VMAs when old one could be
> > * extended instead.
> > */
> > - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
> > + if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
> > return false;
> > - if (vma->vm_file != file)
> > + if (vma->vm_file != vmg->file)
> > return false;
> > if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
>
> AFAICS this is the only place where may_remove_vma is used.
Yes it is, but the point is to document what we're doing. The compiler
simplifies all this in the generated code.
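To illustrate (a minimal sketch with stand-in names, not the actual
kernel code): the local exists purely for the reader, and any optimising
compiler folds it straight into the branch, so both forms below generate
identical code:

	/* 'has_close' stands in for the vma->vm_ops->close check. */
	bool may_remove_vma = merge_next;

	if (may_remove_vma && has_close)
		return false;

	/* ...compiles exactly as if written: */
	if (merge_next && has_close)
		return false;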
>
> > return false;
> > - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
> > + if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
> > return false;
> > - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
> > + if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
> > return false;
> > return true;
> > }
> > @@ -94,16 +96,16 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> > * We assume the vma may be removed as part of the merge.
> > */
> > bool
> > -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > - struct anon_vma *anon_vma, struct file *file,
> > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name)
> > +can_vma_merge_before(struct vma_merge_struct *vmg)
> > {
> > - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
> > - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> > - if (vma->vm_pgoff == vm_pgoff)
> > + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> > +
> > + if (is_mergeable_vma(vmg, true) &&
> > + is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
> > + if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
> > return true;
> > }
> > +
> > return false;
> > }
> >
> > @@ -116,18 +118,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > *
> > * We assume that vma is not removed as part of the merge.
> > */
> > -bool
> > -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> > - struct anon_vma *anon_vma, struct file *file,
> > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name)
> > +bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > {
> > - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
> > - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> > - pgoff_t vm_pglen;
> > -
> > - vm_pglen = vma_pages(vma);
> > - if (vma->vm_pgoff + vm_pglen == vm_pgoff)
> > + if (is_mergeable_vma(vmg, false) &&
> > + is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> > + if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
> > return true;
> > }
> > return false;
> > @@ -180,7 +175,7 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > * VMA Iterator will point to the end VMA.
> > */
> > static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > - unsigned long addr, int new_below)
> > + unsigned long addr, bool new_below)
> > {
> > struct vma_prepare vp;
> > struct vm_area_struct *new;
> > @@ -261,13 +256,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > * Split a vma into two pieces at address 'addr', a new vma is allocated
> > * either for the first part or the tail.
> > */
> > -static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > - unsigned long addr, int new_below)
> > +static int split_vma(struct vma_merge_struct *vmg, bool new_below)
>
> IMHO this patch is already long enough. Maybe the type change from int
> to bool could be split out to a separate patch to reduce churn here?
I don't really understand this comment. This reduces the number of lines of
code, and it's a line I have to change anyway, so there'd be _more_ churn
to split this out?
I don't think this is really all that important, but it'd be very silly to
split this out in my opinion.
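For the avoidance of doubt, the call sites (per the hunks quoted below)
simply become:

	/* new_below selects which end of the vmg range to split at. */
	err = split_vma(vmg, true);	/* split at vmg->start; new VMA is the lower part */
	err = split_vma(vmg, false);	/* split at vmg->end; new VMA is the tail */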
>
> > {
> > - if (vma->vm_mm->map_count >= sysctl_max_map_count)
> > + if (vmg->vma->vm_mm->map_count >= sysctl_max_map_count)
> > return -ENOMEM;
> >
> > - return __split_vma(vmi, vma, addr, new_below);
> > + return __split_vma(vmg->vmi, vmg->vma,
> > + new_below ? vmg->start : vmg->end,
> > + new_below);
> > }
> >
> > /*
> > @@ -712,7 +708,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
> > goto map_count_exceeded;
> >
> > - error = __split_vma(vmi, vma, start, 1);
> > + error = __split_vma(vmi, vma, start, true);
> > if (error)
> > goto start_split_failed;
> > }
> > @@ -725,7 +721,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > do {
> > /* Does it split the end? */
> > if (next->vm_end > end) {
> > - error = __split_vma(vmi, next, end, 0);
> > + error = __split_vma(vmi, next, end, false);
> > if (error)
> > goto end_split_failed;
> > }
> > @@ -934,16 +930,10 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> > * **** is not represented - it will be merged and the vma containing the
> > * area is returned, or the function will return NULL
> > */
> > -static struct vm_area_struct
> > -*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > - struct vm_area_struct *src, unsigned long addr, unsigned long end,
> > - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
> > - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name)
> > +static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
> > {
> > - struct mm_struct *mm = src->vm_mm;
> > - struct anon_vma *anon_vma = src->anon_vma;
> > - struct file *file = src->vm_file;
> > + struct mm_struct *mm = container_of(vmg->vmi->mas.tree, struct mm_struct, mm_mt);
> > + struct vm_area_struct *prev = vmg->prev;
> > struct vm_area_struct *curr, *next, *res;
> > struct vm_area_struct *vma, *adjust, *remove, *remove2;
> > struct vm_area_struct *anon_dup = NULL;
> > @@ -953,16 +943,18 @@ static struct vm_area_struct
> > bool merge_prev = false;
> > bool merge_next = false;
> > bool vma_expanded = false;
> > + unsigned long addr = vmg->start;
> > + unsigned long end = vmg->end;
> > unsigned long vma_start = addr;
> > unsigned long vma_end = end;
> > - pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
> > + pgoff_t pglen = PHYS_PFN(end - addr);
> > long adj_start = 0;
> >
> > /*
> > * We later require that vma->vm_flags == vm_flags,
> > * so this tests vma->vm_flags & VM_SPECIAL, too.
> > */
> > - if (vm_flags & VM_SPECIAL)
> > + if (vmg->flags & VM_SPECIAL)
> > return NULL;
> >
> > /* Does the input range span an existing VMA? (cases 5 - 8) */
> > @@ -970,27 +962,26 @@ static struct vm_area_struct
> >
> > if (!curr || /* cases 1 - 4 */
> > end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
> > - next = vma_lookup(mm, end);
> > + next = vmg->next = vma_lookup(mm, end);
> > else
> > - next = NULL; /* case 5 */
> > + next = vmg->next = NULL; /* case 5 */
>
> Again, is it worth keeping the "next" variable, or could we replace it
> with "vmg->next" everywhere?
I already responded to this previously, but equally: I'm explicitly using a
local variable to keep the code relatively simple and to avoid constantly
dereferencing vmg.
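As a sketch of the difference (illustrative fragment, not a real hunk),
compare:

	/* Spelling out the indirection at every use: */
	if (vmg->next && vmg->next->vm_start == end)
		vma = vmg->next;

	/* ...versus taking a local once: */
	struct vm_area_struct *next = vmg->next;

	if (next && next->vm_start == end)
		vma = next;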
>
> No other comments to the rest of this patch.
>
> Petr T
>
> >
> > if (prev) {
> > vma_start = prev->vm_start;
> > vma_pgoff = prev->vm_pgoff;
> >
> > /* Can we merge the predecessor? */
> > - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
> > - && can_vma_merge_after(prev, vm_flags, anon_vma, file,
> > - pgoff, vm_userfaultfd_ctx, anon_name)) {
> > + if (addr == prev->vm_end && mpol_equal(vma_policy(prev), vmg->policy)
> > + && can_vma_merge_after(vmg)) {
> > +
> > merge_prev = true;
> > - vma_prev(vmi);
> > + vma_prev(vmg->vmi);
> > }
> > }
> >
> > /* Can we merge the successor? */
> > - if (next && mpol_equal(policy, vma_policy(next)) &&
> > - can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
> > - vm_userfaultfd_ctx, anon_name)) {
> > + if (next && mpol_equal(vmg->policy, vma_policy(next)) &&
> > + can_vma_merge_before(vmg)) {
> > merge_next = true;
> > }
> >
> > @@ -1041,7 +1032,7 @@ static struct vm_area_struct
> > remove = curr;
> > } else { /* case 5 */
> > adjust = curr;
> > - adj_start = (end - curr->vm_start);
> > + adj_start = end - curr->vm_start;
> > }
> > if (!err)
> > err = dup_anon_vma(prev, curr, &anon_dup);
> > @@ -1081,13 +1072,13 @@ static struct vm_area_struct
> > vma_expanded = true;
> >
> > if (vma_expanded) {
> > - vma_iter_config(vmi, vma_start, vma_end);
> > + vma_iter_config(vmg->vmi, vma_start, vma_end);
> > } else {
> > - vma_iter_config(vmi, adjust->vm_start + adj_start,
> > + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> > adjust->vm_end);
> > }
> >
> > - if (vma_iter_prealloc(vmi, vma))
> > + if (vma_iter_prealloc(vmg->vmi, vma))
> > goto prealloc_fail;
> >
> > init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
> > @@ -1099,19 +1090,19 @@ static struct vm_area_struct
> > vma_set_range(vma, vma_start, vma_end, vma_pgoff);
> >
> > if (vma_expanded)
> > - vma_iter_store(vmi, vma);
> > + vma_iter_store(vmg->vmi, vma);
> >
> > if (adj_start) {
> > adjust->vm_start += adj_start;
> > adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
> > if (adj_start < 0) {
> > WARN_ON(vma_expanded);
> > - vma_iter_store(vmi, next);
> > + vma_iter_store(vmg->vmi, next);
> > }
> > }
> >
> > - vma_complete(&vp, vmi, mm);
> > - khugepaged_enter_vma(res, vm_flags);
> > + vma_complete(&vp, vmg->vmi, mm);
> > + khugepaged_enter_vma(res, vmg->flags);
> > return res;
> >
> > prealloc_fail:
> > @@ -1119,8 +1110,8 @@ static struct vm_area_struct
> > unlink_anon_vmas(anon_dup);
> >
> > anon_vma_fail:
> > - vma_iter_set(vmi, addr);
> > - vma_iter_load(vmi);
> > + vma_iter_set(vmg->vmi, addr);
> > + vma_iter_load(vmg->vmi);
> > return NULL;
> > }
> >
> > @@ -1137,38 +1128,141 @@ static struct vm_area_struct
> > * The function returns either the merged VMA, the original VMA if a split was
> > * required instead, or an error if the split failed.
> > */
> > -struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
> > - struct vm_area_struct *prev,
> > - struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end,
> > - unsigned long vm_flags,
> > - struct mempolicy *policy,
> > - struct vm_userfaultfd_ctx uffd_ctx,
> > - struct anon_vma_name *anon_name)
> > +static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
> > {
> > - pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > + struct vm_area_struct *vma = vmg->vma;
> > struct vm_area_struct *merged;
> >
> > - merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
> > - pgoff, policy, uffd_ctx, anon_name);
> > + /* First, try to merge. */
> > + merged = vma_merge(vmg);
> > if (merged)
> > return merged;
> >
> > - if (vma->vm_start < start) {
> > - int err = split_vma(vmi, vma, start, 1);
> > + /* Split any preceding portion of the VMA. */
> > + if (vma->vm_start < vmg->start) {
> > + int err = split_vma(vmg, true);
> >
> > if (err)
> > return ERR_PTR(err);
> > }
> >
> > - if (vma->vm_end > end) {
> > - int err = split_vma(vmi, vma, end, 0);
> > + /* Split any trailing portion of the VMA. */
> > + if (vma->vm_end > vmg->end) {
> > + int err = split_vma(vmg, false);
> >
> > if (err)
> > return ERR_PTR(err);
> > }
> >
> > - return vma;
> > + return vmg->vma;
> > +}
> > +
> > +/* Assumes addr >= vma->vm_start. */
> > +static pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr)
> > +{
> > + return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
> > +}
> > +
> > +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + unsigned long new_flags)
> > +{
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = new_flags,
> > + .pgoff = vma_pgoff_offset(vma, start),
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> > +
> > + return vma_modify(&vmg);
> > +}
> > +
> > +struct vm_area_struct
> > +*vma_modify_flags_name(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start,
> > + unsigned long end,
> > + unsigned long new_flags,
> > + struct anon_vma_name *new_name)
> > +{
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = new_flags,
> > + .pgoff = vma_pgoff_offset(vma, start),
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = new_name,
> > + };
> > +
> > + return vma_modify(&vmg);
> > +}
> > +
> > +struct vm_area_struct
> > +*vma_modify_policy(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + struct mempolicy *new_pol)
> > +{
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = vma->vm_flags,
> > + .pgoff = vma_pgoff_offset(vma, start),
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = new_pol,
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> > +
> > + return vma_modify(&vmg);
> > +}
> > +
> > +struct vm_area_struct
> > +*vma_modify_flags_uffd(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + unsigned long new_flags,
> > + struct vm_userfaultfd_ctx new_ctx)
> > +{
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = new_flags,
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .pgoff = vma_pgoff_offset(vma, start),
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = new_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> > +
> > + return vma_modify(&vmg);
> > }
> >
> > /*
> > @@ -1180,8 +1274,22 @@ struct vm_area_struct
> > struct vm_area_struct *vma, unsigned long start,
> > unsigned long end, pgoff_t pgoff)
> > {
> > - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
> > - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = vma->vm_flags,
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .pgoff = pgoff,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> > +
> > + return vma_merge(&vmg);
> > }
> >
> > /*
> > @@ -1193,11 +1301,22 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> > unsigned long delta)
> > {
> > pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = vma,
> > + .vma = vma,
> > + .start = vma->vm_end,
> > + .end = vma->vm_end + delta,
> > + .flags = vma->vm_flags,
> > + .file = vma->vm_file,
> > + .pgoff = pgoff,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> >
> > /* vma is specified as prev, so case 1 or 2 will apply. */
> > - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
> > - vma->vm_flags, pgoff, vma_policy(vma),
> > - vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > + return vma_merge(&vmg);
> > }
> >
> > void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> > diff --git a/mm/vma.h b/mm/vma.h
> > index 6efdf1768a0a..c31684cc1da6 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -26,6 +26,23 @@ struct unlink_vma_file_batch {
> > struct vm_area_struct *vmas[8];
> > };
> >
> > +/* Represents a VMA merge operation. */
> > +struct vma_merge_struct {
> > + struct vma_iterator *vmi;
> > + struct vm_area_struct *prev;
> > + struct vm_area_struct *next; /* Modified by vma_merge(). */
> > + struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */
> > + unsigned long start;
> > + unsigned long end;
> > + unsigned long flags;
> > + pgoff_t pgoff;
> > + struct file *file;
> > + struct anon_vma *anon_vma;
> > + struct mempolicy *policy;
> > + struct vm_userfaultfd_ctx uffd_ctx;
> > + struct anon_vma_name *anon_name;
> > +};
> > +
> > #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
> > void validate_mm(struct mm_struct *mm);
> > #else
> > @@ -72,80 +89,53 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > struct vm_area_struct *next, unsigned long start,
> > unsigned long end, unsigned long tree_end, bool mm_wr_locked);
> >
> > -/* Required by mmap_region(). */
> > -bool
> > -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > - struct anon_vma *anon_vma, struct file *file,
> > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name);
> > -
> > -/* Required by mmap_region() and do_brk_flags(). */
> > -bool
> > -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> > - struct anon_vma *anon_vma, struct file *file,
> > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name);
> > -
> > -struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
> > - struct vm_area_struct *prev,
> > - struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end,
> > - unsigned long vm_flags,
> > - struct mempolicy *policy,
> > - struct vm_userfaultfd_ctx uffd_ctx,
> > - struct anon_vma_name *anon_name);
> > +/*
> > + * Can we merge the VMA described by vmg into the following VMA vmg->next?
> > + *
> > + * Required by mmap_region().
> > + */
> > +bool can_vma_merge_before(struct vma_merge_struct *vmg);
> > +
> > +/*
> > + * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> > + *
> > + * Required by mmap_region() and do_brk_flags().
> > + */
> > +bool can_vma_merge_after(struct vma_merge_struct *vmg);
> >
> > /* We are about to modify the VMA's flags. */
> > -static inline struct vm_area_struct
> > -*vma_modify_flags(struct vma_iterator *vmi,
> > - struct vm_area_struct *prev,
> > - struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end,
> > - unsigned long new_flags)
> > -{
> > - return vma_modify(vmi, prev, vma, start, end, new_flags,
> > - vma_policy(vma), vma->vm_userfaultfd_ctx,
> > - anon_vma_name(vma));
> > -}
> > +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + unsigned long new_flags);
> >
> > /* We are about to modify the VMA's flags and/or anon_name. */
> > -static inline struct vm_area_struct
> > +struct vm_area_struct
> > *vma_modify_flags_name(struct vma_iterator *vmi,
> > struct vm_area_struct *prev,
> > struct vm_area_struct *vma,
> > unsigned long start,
> > unsigned long end,
> > unsigned long new_flags,
> > - struct anon_vma_name *new_name)
> > -{
> > - return vma_modify(vmi, prev, vma, start, end, new_flags,
> > - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
> > -}
> > + struct anon_vma_name *new_name);
> >
> > /* We are about to modify the VMA's memory policy. */
> > -static inline struct vm_area_struct
> > +struct vm_area_struct
> > *vma_modify_policy(struct vma_iterator *vmi,
> > struct vm_area_struct *prev,
> > struct vm_area_struct *vma,
> > unsigned long start, unsigned long end,
> > - struct mempolicy *new_pol)
> > -{
> > - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
> > - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > -}
> > + struct mempolicy *new_pol);
> >
> > /* We are about to modify the VMA's flags and/or uffd context. */
> > -static inline struct vm_area_struct
> > +struct vm_area_struct
> > *vma_modify_flags_uffd(struct vma_iterator *vmi,
> > struct vm_area_struct *prev,
> > struct vm_area_struct *vma,
> > unsigned long start, unsigned long end,
> > unsigned long new_flags,
> > - struct vm_userfaultfd_ctx new_ctx)
> > -{
> > - return vma_modify(vmi, prev, vma, start, end, new_flags,
> > - vma_policy(vma), new_ctx, anon_vma_name(vma));
> > -}
> > + struct vm_userfaultfd_ctx new_ctx);
> >
> > struct vm_area_struct
> > *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters
2024-08-06 13:43 ` Lorenzo Stoakes
@ 2024-08-06 14:06 ` Petr Tesařík
2024-08-06 14:20 ` Lorenzo Stoakes
0 siblings, 1 reply; 53+ messages in thread
From: Petr Tesařík @ 2024-08-06 14:06 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, 6 Aug 2024 14:43:48 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> On Tue, Aug 06, 2024 at 02:47:54PM GMT, Petr Tesařík wrote:
> > Hi Lorenzo!
> >
> > On Mon, 5 Aug 2024 13:13:49 +0100
> > Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> >
> > > Rather than passing around huge numbers of parameters to numerous helper
> > > functions, abstract them into a single struct that we thread through the
> > > operation.
> > >
> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > ---
> > > mm/mmap.c | 76 ++++++++------
> > > mm/vma.c | 297 ++++++++++++++++++++++++++++++++++++++----------------
> > > mm/vma.h | 92 ++++++++---------
> > > 3 files changed, 294 insertions(+), 171 deletions(-)
> > >
> > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > index 4a9c2329b09a..f931000c561f 100644
> > > --- a/mm/mmap.c
> > > +++ b/mm/mmap.c
> > > @@ -1369,9 +1369,16 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > unsigned long end = addr + len;
> > > unsigned long merge_start = addr, merge_end = end;
> > > bool writable_file_mapping = false;
> > > - pgoff_t vm_pgoff;
> > > int error;
> > > VMA_ITERATOR(vmi, mm, addr);
> > > + struct vma_merge_struct vmg = {
> > > + .vmi = &vmi,
> > > + .start = addr,
> > > + .end = end,
> > > + .flags = vm_flags,
> > > + .pgoff = pgoff,
> > > + .file = file,
> > > + };
> > >
> > > /* Check against address space limit. */
> > > if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> > > @@ -1405,8 +1412,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > vm_flags |= VM_ACCOUNT;
> > > }
> > >
> > > - next = vma_next(&vmi);
> > > - prev = vma_prev(&vmi);
> > > + next = vmg.next = vma_next(&vmi);
> > > + prev = vmg.prev = vma_prev(&vmi);
> >
> > So, next is now a shortcut for vmg.next, and prev is a shortcut for
> > vmg.prev. ATM there is only one assignment, so no big deal, but I
> > wonder if next and prev could be removed instead, same as you replaced
> > vm_pgoff with vmg.pgoff.
>
> It's simply to avoid repeatedly referencing vmg.xxx / at least reduce
> _some_ churn. Also this will get moved shortly, so it's worth looking at in
> final form.
I'm not an MM maintainer, so my comments may not be relevant, but my
experience shows that pointer aliases have the potential to introduce all
kinds of subtle bugs. That's the reason I generally try to avoid them.
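To make that concrete, a hypothetical sketch (nothing from this patch,
and do_something() is a stand-in) of how such an alias bites:

	struct vm_area_struct *next = vmg->next;	/* alias taken here */

	vmg->next = vma_lookup(mm, end);	/* field reassigned later... */

	if (next)			/* ...but the alias still holds the */
		do_something(next);	/* old value; the two have diverged */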
>
> >
> > Is the resulting code _too_ ugly?
> >
> > > if (vm_flags & VM_SPECIAL) {
> > > if (prev)
> > > vma_iter_next_range(&vmi);
> > > @@ -1416,29 +1423,30 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > /* Attempt to expand an old mapping */
> > > /* Check next */
> > > if (next && next->vm_start == end && !vma_policy(next) &&
> > > - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
> > > - NULL_VM_UFFD_CTX, NULL)) {
> > > + can_vma_merge_before(&vmg)) {
> > > merge_end = next->vm_end;
> > > vma = next;
> > > - vm_pgoff = next->vm_pgoff - pglen;
> > > + vmg.pgoff = next->vm_pgoff - pglen;
> > > + }
> > > +
> > > + if (vma) {
> > > + vmg.anon_vma = vma->anon_vma;
> > > + vmg.uffd_ctx = vma->vm_userfaultfd_ctx;
> > > }
> > >
> > > /* Check prev */
> > > if (prev && prev->vm_end == addr && !vma_policy(prev) &&
> > > - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
> > > - pgoff, vma->vm_userfaultfd_ctx, NULL) :
> > > - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
> > > - NULL_VM_UFFD_CTX, NULL))) {
> > > + can_vma_merge_after(&vmg)) {
> > > merge_start = prev->vm_start;
> > > vma = prev;
> > > - vm_pgoff = prev->vm_pgoff;
> > > + vmg.pgoff = prev->vm_pgoff;
> > > } else if (prev) {
> > > vma_iter_next_range(&vmi);
> > > }
> > >
> > > /* Actually expand, if possible */
> > > if (vma &&
> > > - !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
> > > + !vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) {
> > > khugepaged_enter_vma(vma, vm_flags);
> > > goto expanded;
> > > }
> > > @@ -1790,25 +1798,31 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > * Expand the existing vma if possible; Note that singular lists do not
> > > * occur after forking, so the expand will only happen on new VMAs.
> > > */
> > > - if (vma && vma->vm_end == addr && !vma_policy(vma) &&
> > > - can_vma_merge_after(vma, flags, NULL, NULL,
> > > - addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
> > > - vma_iter_config(vmi, vma->vm_start, addr + len);
> > > - if (vma_iter_prealloc(vmi, vma))
> > > - goto unacct_fail;
> > > -
> > > - vma_start_write(vma);
> > > -
> > > - init_vma_prep(&vp, vma);
> > > - vma_prepare(&vp);
> > > - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > > - vma->vm_end = addr + len;
> > > - vm_flags_set(vma, VM_SOFTDIRTY);
> > > - vma_iter_store(vmi, vma);
> > > -
> > > - vma_complete(&vp, vmi, mm);
> > > - khugepaged_enter_vma(vma, flags);
> > > - goto out;
> > > + if (vma && vma->vm_end == addr && !vma_policy(vma)) {
> > > + struct vma_merge_struct vmg = {
> > > + .prev = vma,
> > > + .flags = flags,
> > > + .pgoff = addr >> PAGE_SHIFT,
> > > + };
> > > +
> > > + if (can_vma_merge_after(&vmg)) {
> > > + vma_iter_config(vmi, vma->vm_start, addr + len);
> > > + if (vma_iter_prealloc(vmi, vma))
> > > + goto unacct_fail;
> > > +
> > > + vma_start_write(vma);
> > > +
> > > + init_vma_prep(&vp, vma);
> > > + vma_prepare(&vp);
> > > + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > > + vma->vm_end = addr + len;
> > > + vm_flags_set(vma, VM_SOFTDIRTY);
> > > + vma_iter_store(vmi, vma);
> > > +
> > > + vma_complete(&vp, vmi, mm);
> > > + khugepaged_enter_vma(vma, flags);
> > > + goto out;
> > > + }
> > > }
> > >
> > > if (vma)
> > > diff --git a/mm/vma.c b/mm/vma.c
> > > index bf0546fe6eab..20c4ce7712c0 100644
> > > --- a/mm/vma.c
> > > +++ b/mm/vma.c
> > > @@ -7,16 +7,18 @@
> > > #include "vma_internal.h"
> > > #include "vma.h"
> > >
> > > -/*
> > > - * If the vma has a ->close operation then the driver probably needs to release
> > > - * per-vma resources, so we don't attempt to merge those if the caller indicates
> > > - * the current vma may be removed as part of the merge.
> > > - */
> > > -static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> > > - struct file *file, unsigned long vm_flags,
> > > - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > > - struct anon_vma_name *anon_name, bool may_remove_vma)
> > > +static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
> > > {
> > > + struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
> > > + /*
> > > + * If the vma has a ->close operation then the driver probably needs to
> > > + * release per-vma resources, so we don't attempt to merge those if the
> > > + * caller indicates the current vma may be removed as part of the merge,
> > > + * which is the case if we are attempting to merge the next VMA into
> > > + * this one.
> > > + */
> > > + bool may_remove_vma = merge_next;
> > > +
> >
> > This variable is used only once. If you want to clarify the double
> > meaning of the merge_next parameter, consider moving this comment
> > further down to the conditional and merely renaming the parameter.
> >
> > > /*
> > > * VM_SOFTDIRTY should not prevent from VMA merging, if we
> > > * match the flags but dirty bit -- the caller should mark
> > > @@ -25,15 +27,15 @@ static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> > > * the kernel to generate new VMAs when old one could be
> > > * extended instead.
> > > */
> > > - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
> > > + if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
> > > return false;
> > > - if (vma->vm_file != file)
> > > + if (vma->vm_file != vmg->file)
> > > return false;
> > > if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
> >
> > AFAICS this is the only place where may_remove_vma is used.
>
> Yes it is, but the point is to document what we're doing. The compiler
> simplifies all this in the generated code.
What's wrong with moving the comment for this variable before this
conditional?
> >
> > > return false;
> > > - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
> > > + if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
> > > return false;
> > > - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
> > > + if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
> > > return false;
> > > return true;
> > > }
> > > @@ -94,16 +96,16 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> > > * We assume the vma may be removed as part of the merge.
> > > */
> > > bool
> > > -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > > - struct anon_vma *anon_vma, struct file *file,
> > > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > > - struct anon_vma_name *anon_name)
> > > +can_vma_merge_before(struct vma_merge_struct *vmg)
> > > {
> > > - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
> > > - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> > > - if (vma->vm_pgoff == vm_pgoff)
> > > + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> > > +
> > > + if (is_mergeable_vma(vmg, true) &&
> > > + is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
> > > + if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
> > > return true;
> > > }
> > > +
> > > return false;
> > > }
> > >
> > > @@ -116,18 +118,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > > *
> > > * We assume that vma is not removed as part of the merge.
> > > */
> > > -bool
> > > -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> > > - struct anon_vma *anon_vma, struct file *file,
> > > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > > - struct anon_vma_name *anon_name)
> > > +bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > > {
> > > - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
> > > - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> > > - pgoff_t vm_pglen;
> > > -
> > > - vm_pglen = vma_pages(vma);
> > > - if (vma->vm_pgoff + vm_pglen == vm_pgoff)
> > > + if (is_mergeable_vma(vmg, false) &&
> > > + is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> > > + if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
> > > return true;
> > > }
> > > return false;
> > > @@ -180,7 +175,7 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > > * VMA Iterator will point to the end VMA.
> > > */
> > > static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > - unsigned long addr, int new_below)
> > > + unsigned long addr, bool new_below)
> > > {
> > > struct vma_prepare vp;
> > > struct vm_area_struct *new;
> > > @@ -261,13 +256,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > * Split a vma into two pieces at address 'addr', a new vma is allocated
> > > * either for the first part or the tail.
> > > */
> > > -static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > - unsigned long addr, int new_below)
> > > +static int split_vma(struct vma_merge_struct *vmg, bool new_below)
> >
> > IMHO this patch is already long enough. Maybe the type change from int
> > to bool could be split out to a separate patch to reduce churn here?
>
> I don't really understand this comment. This reduces the number of lines of
> code, and it's a line I have to change anyway, so there'd be _more_ churn
> to split this out?
>
> I don't think this is really all that important, but it'd be very silly to
> split this out in my opinion.
Possibly a matter of taste. The churn is further down:
>
> >
> > > {
> > > - if (vma->vm_mm->map_count >= sysctl_max_map_count)
> > > + if (vmg->vma->vm_mm->map_count >= sysctl_max_map_count)
> > > return -ENOMEM;
> > >
> > > - return __split_vma(vmi, vma, addr, new_below);
> > > + return __split_vma(vmg->vmi, vmg->vma,
> > > + new_below ? vmg->start : vmg->end,
> > > + new_below);
> > > }
> > >
> > > /*
> > > @@ -712,7 +708,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
> > > goto map_count_exceeded;
> > >
> > > - error = __split_vma(vmi, vma, start, 1);
> > > + error = __split_vma(vmi, vma, start, true);
Churn here.
> > > if (error)
> > > goto start_split_failed;
> > > }
> > > @@ -725,7 +721,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > do {
> > > /* Does it split the end? */
> > > if (next->vm_end > end) {
> > > - error = __split_vma(vmi, next, end, 0);
> > > + error = __split_vma(vmi, next, end, false);
Churn here.
But you're right, no big deal.
> > > if (error)
> > > goto end_split_failed;
> > > }
> > > @@ -934,16 +930,10 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> > > * **** is not represented - it will be merged and the vma containing the
> > > * area is returned, or the function will return NULL
> > > */
> > > -static struct vm_area_struct
> > > -*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > > - struct vm_area_struct *src, unsigned long addr, unsigned long end,
> > > - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
> > > - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > > - struct anon_vma_name *anon_name)
> > > +static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
> > > {
> > > - struct mm_struct *mm = src->vm_mm;
> > > - struct anon_vma *anon_vma = src->anon_vma;
> > > - struct file *file = src->vm_file;
> > > + struct mm_struct *mm = container_of(vmg->vmi->mas.tree, struct mm_struct, mm_mt);
> > > + struct vm_area_struct *prev = vmg->prev;
> > > struct vm_area_struct *curr, *next, *res;
> > > struct vm_area_struct *vma, *adjust, *remove, *remove2;
> > > struct vm_area_struct *anon_dup = NULL;
> > > @@ -953,16 +943,18 @@ static struct vm_area_struct
> > > bool merge_prev = false;
> > > bool merge_next = false;
> > > bool vma_expanded = false;
> > > + unsigned long addr = vmg->start;
> > > + unsigned long end = vmg->end;
> > > unsigned long vma_start = addr;
> > > unsigned long vma_end = end;
> > > - pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
> > > + pgoff_t pglen = PHYS_PFN(end - addr);
> > > long adj_start = 0;
> > >
> > > /*
> > > * We later require that vma->vm_flags == vm_flags,
> > > * so this tests vma->vm_flags & VM_SPECIAL, too.
> > > */
> > > - if (vm_flags & VM_SPECIAL)
> > > + if (vmg->flags & VM_SPECIAL)
> > > return NULL;
> > >
> > > /* Does the input range span an existing VMA? (cases 5 - 8) */
> > > @@ -970,27 +962,26 @@ static struct vm_area_struct
> > >
> > > if (!curr || /* cases 1 - 4 */
> > > end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
> > > - next = vma_lookup(mm, end);
> > > + next = vmg->next = vma_lookup(mm, end);
> > > else
> > > - next = NULL; /* case 5 */
> > > + next = vmg->next = NULL; /* case 5 */
> >
> > Again, is it worth keeping the "next" variable, or could we replace it
> > with "vmg->next" everywhere?
>
> I already responded to this previously, but equally: I'm explicitly using a
> local variable to keep the code relatively simple and to avoid constantly
> dereferencing vmg.
Yeah, sure. OTOH whoever looks at the code may ask why there is both
"vmg->next" and "next" and if they're really (supposed to be) the same
thing or if there's a subtle difference.
Petr T
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters
2024-08-06 14:06 ` Petr Tesařík
@ 2024-08-06 14:20 ` Lorenzo Stoakes
2024-08-06 14:32 ` Petr Tesařík
0 siblings, 1 reply; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-06 14:20 UTC (permalink / raw)
To: Petr Tesařík
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, Aug 06, 2024 at 04:06:50PM GMT, Petr Tesařík wrote:
> On Tue, 6 Aug 2024 14:43:48 +0100
> Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>
> > On Tue, Aug 06, 2024 at 02:47:54PM GMT, Petr Tesařík wrote:
> > > Hi Lorenzo!
> > >
> > > On Mon, 5 Aug 2024 13:13:49 +0100
> > > Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> > >
> > > > Rather than passing around huge numbers of parameters to numerous helper
> > > > functions, abstract them into a single struct that we thread through the
> > > > operation.
> > > >
> > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > > ---
> > > > mm/mmap.c | 76 ++++++++------
> > > > mm/vma.c | 297 ++++++++++++++++++++++++++++++++++++++----------------
> > > > mm/vma.h | 92 ++++++++---------
> > > > 3 files changed, 294 insertions(+), 171 deletions(-)
> > > >
> > > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > > index 4a9c2329b09a..f931000c561f 100644
> > > > --- a/mm/mmap.c
> > > > +++ b/mm/mmap.c
> > > > @@ -1369,9 +1369,16 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > > unsigned long end = addr + len;
> > > > unsigned long merge_start = addr, merge_end = end;
> > > > bool writable_file_mapping = false;
> > > > - pgoff_t vm_pgoff;
> > > > int error;
> > > > VMA_ITERATOR(vmi, mm, addr);
> > > > + struct vma_merge_struct vmg = {
> > > > + .vmi = &vmi,
> > > > + .start = addr,
> > > > + .end = end,
> > > > + .flags = vm_flags,
> > > > + .pgoff = pgoff,
> > > > + .file = file,
> > > > + };
> > > >
> > > > /* Check against address space limit. */
> > > > if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> > > > @@ -1405,8 +1412,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > > vm_flags |= VM_ACCOUNT;
> > > > }
> > > >
> > > > - next = vma_next(&vmi);
> > > > - prev = vma_prev(&vmi);
> > > > + next = vmg.next = vma_next(&vmi);
> > > > + prev = vmg.prev = vma_prev(&vmi);
> > >
> > > So, next is now a shortcut for vmg.next, and prev is a shortcut for
> > > vmg.prev. ATM there is only one assignment, so no big deal, but I
> > > wonder if next and prev could be removed instead, same as you replaced
> > > vm_pgoff with vmg.pgoff.
> >
> > It's simply to avoid repeatedly referencing vmg.xxx / at least reduce
> > _some_ churn. Also this will get moved shortly, so it's worth looking at in
> > final form.
>
> > I'm not an MM maintainer, so my comments may not be relevant, but my
> > experience shows that pointer aliases have the potential to introduce all
> > kinds of subtle bugs. That's the reason I generally try to avoid them.
Right, I understand; I don't want to get too deep into a distracting bike
shed when this series is doing something quite major.
If you feel this is absolutely critical, I can adjust this code (which I
later delete); if not, I suggest leaving it as it is.
>
> >
> > >
> > > Is the resulting code _too_ ugly?
> > >
> > > > if (vm_flags & VM_SPECIAL) {
> > > > if (prev)
> > > > vma_iter_next_range(&vmi);
> > > > @@ -1416,29 +1423,30 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > > /* Attempt to expand an old mapping */
> > > > /* Check next */
> > > > if (next && next->vm_start == end && !vma_policy(next) &&
> > > > - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
> > > > - NULL_VM_UFFD_CTX, NULL)) {
> > > > + can_vma_merge_before(&vmg)) {
> > > > merge_end = next->vm_end;
> > > > vma = next;
> > > > - vm_pgoff = next->vm_pgoff - pglen;
> > > > + vmg.pgoff = next->vm_pgoff - pglen;
> > > > + }
> > > > +
> > > > + if (vma) {
> > > > + vmg.anon_vma = vma->anon_vma;
> > > > + vmg.uffd_ctx = vma->vm_userfaultfd_ctx;
> > > > }
> > > >
> > > > /* Check prev */
> > > > if (prev && prev->vm_end == addr && !vma_policy(prev) &&
> > > > - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
> > > > - pgoff, vma->vm_userfaultfd_ctx, NULL) :
> > > > - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
> > > > - NULL_VM_UFFD_CTX, NULL))) {
> > > > + can_vma_merge_after(&vmg)) {
> > > > merge_start = prev->vm_start;
> > > > vma = prev;
> > > > - vm_pgoff = prev->vm_pgoff;
> > > > + vmg.pgoff = prev->vm_pgoff;
> > > > } else if (prev) {
> > > > vma_iter_next_range(&vmi);
> > > > }
> > > >
> > > > /* Actually expand, if possible */
> > > > if (vma &&
> > > > - !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
> > > > + !vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) {
> > > > khugepaged_enter_vma(vma, vm_flags);
> > > > goto expanded;
> > > > }
> > > > @@ -1790,25 +1798,31 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > > * Expand the existing vma if possible; Note that singular lists do not
> > > > * occur after forking, so the expand will only happen on new VMAs.
> > > > */
> > > > - if (vma && vma->vm_end == addr && !vma_policy(vma) &&
> > > > - can_vma_merge_after(vma, flags, NULL, NULL,
> > > > - addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
> > > > - vma_iter_config(vmi, vma->vm_start, addr + len);
> > > > - if (vma_iter_prealloc(vmi, vma))
> > > > - goto unacct_fail;
> > > > -
> > > > - vma_start_write(vma);
> > > > -
> > > > - init_vma_prep(&vp, vma);
> > > > - vma_prepare(&vp);
> > > > - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > > > - vma->vm_end = addr + len;
> > > > - vm_flags_set(vma, VM_SOFTDIRTY);
> > > > - vma_iter_store(vmi, vma);
> > > > -
> > > > - vma_complete(&vp, vmi, mm);
> > > > - khugepaged_enter_vma(vma, flags);
> > > > - goto out;
> > > > + if (vma && vma->vm_end == addr && !vma_policy(vma)) {
> > > > + struct vma_merge_struct vmg = {
> > > > + .prev = vma,
> > > > + .flags = flags,
> > > > + .pgoff = addr >> PAGE_SHIFT,
> > > > + };
> > > > +
> > > > + if (can_vma_merge_after(&vmg)) {
> > > > + vma_iter_config(vmi, vma->vm_start, addr + len);
> > > > + if (vma_iter_prealloc(vmi, vma))
> > > > + goto unacct_fail;
> > > > +
> > > > + vma_start_write(vma);
> > > > +
> > > > + init_vma_prep(&vp, vma);
> > > > + vma_prepare(&vp);
> > > > + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > > > + vma->vm_end = addr + len;
> > > > + vm_flags_set(vma, VM_SOFTDIRTY);
> > > > + vma_iter_store(vmi, vma);
> > > > +
> > > > + vma_complete(&vp, vmi, mm);
> > > > + khugepaged_enter_vma(vma, flags);
> > > > + goto out;
> > > > + }
> > > > }
> > > >
> > > > if (vma)
> > > > diff --git a/mm/vma.c b/mm/vma.c
> > > > index bf0546fe6eab..20c4ce7712c0 100644
> > > > --- a/mm/vma.c
> > > > +++ b/mm/vma.c
> > > > @@ -7,16 +7,18 @@
> > > > #include "vma_internal.h"
> > > > #include "vma.h"
> > > >
> > > > -/*
> > > > - * If the vma has a ->close operation then the driver probably needs to release
> > > > - * per-vma resources, so we don't attempt to merge those if the caller indicates
> > > > - * the current vma may be removed as part of the merge.
> > > > - */
> > > > -static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> > > > - struct file *file, unsigned long vm_flags,
> > > > - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > > > - struct anon_vma_name *anon_name, bool may_remove_vma)
> > > > +static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
> > > > {
> > > > + struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
> > > > + /*
> > > > + * If the vma has a ->close operation then the driver probably needs to
> > > > + * release per-vma resources, so we don't attempt to merge those if the
> > > > + * caller indicates the current vma may be removed as part of the merge,
> > > > + * which is the case if we are attempting to merge the next VMA into
> > > > + * this one.
> > > > + */
> > > > + bool may_remove_vma = merge_next;
> > > > +
> > >
> > > This variable is used only once. If you want to clarify the double
> > > meaning of the merge_next parameter, consider moving this comment
> > > further down to the conditional and merely renaming the parameter.
> > >
> > > > /*
> > > > * VM_SOFTDIRTY should not prevent from VMA merging, if we
> > > > * match the flags but dirty bit -- the caller should mark
> > > > @@ -25,15 +27,15 @@ static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> > > > * the kernel to generate new VMAs when old one could be
> > > > * extended instead.
> > > > */
> > > > - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
> > > > + if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
> > > > return false;
> > > > - if (vma->vm_file != file)
> > > > + if (vma->vm_file != vmg->file)
> > > > return false;
> > > > if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
> > >
> > > AFAICS this is the only place where may_remove_vma is used.
> >
> > Yes it is, but the point is to document what we're doing. The compiler
> > simplifies all this in the generated code.
>
> What's wrong with moving the comment for this variable before this
> conditional?
Because in kernel-style C you have to put declarations at the top; the
parameter was originally called may_remove_vma, and keeping that name makes
the check self-documenting.
Do note that I ultimately remove this code in patch 10.
This feels very bike-sheddy.
>
> > >
> > > > return false;
> > > > - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
> > > > + if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
> > > > return false;
> > > > - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
> > > > + if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
> > > > return false;
> > > > return true;
> > > > }
> > > > @@ -94,16 +96,16 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> > > > * We assume the vma may be removed as part of the merge.
> > > > */
> > > > bool
> > > > -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > > > - struct anon_vma *anon_vma, struct file *file,
> > > > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > > > - struct anon_vma_name *anon_name)
> > > > +can_vma_merge_before(struct vma_merge_struct *vmg)
> > > > {
> > > > - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
> > > > - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> > > > - if (vma->vm_pgoff == vm_pgoff)
> > > > + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> > > > +
> > > > + if (is_mergeable_vma(vmg, true) &&
> > > > + is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
> > > > + if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
> > > > return true;
> > > > }
> > > > +
> > > > return false;
> > > > }
> > > >
> > > > @@ -116,18 +118,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > > > *
> > > > * We assume that vma is not removed as part of the merge.
> > > > */
> > > > -bool
> > > > -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> > > > - struct anon_vma *anon_vma, struct file *file,
> > > > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > > > - struct anon_vma_name *anon_name)
> > > > +bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > > > {
> > > > - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
> > > > - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> > > > - pgoff_t vm_pglen;
> > > > -
> > > > - vm_pglen = vma_pages(vma);
> > > > - if (vma->vm_pgoff + vm_pglen == vm_pgoff)
> > > > + if (is_mergeable_vma(vmg, false) &&
> > > > + is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> > > > + if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
> > > > return true;
> > > > }
> > > > return false;
> > > > @@ -180,7 +175,7 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > > > * VMA Iterator will point to the end VMA.
> > > > */
> > > > static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > > - unsigned long addr, int new_below)
> > > > + unsigned long addr, bool new_below)
> > > > {
> > > > struct vma_prepare vp;
> > > > struct vm_area_struct *new;
> > > > @@ -261,13 +256,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > > * Split a vma into two pieces at address 'addr', a new vma is allocated
> > > > * either for the first part or the tail.
> > > > */
> > > > -static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > > - unsigned long addr, int new_below)
> > > > +static int split_vma(struct vma_merge_struct *vmg, bool new_below)
> > >
> > > IMHO this patch is already long enough. Maybe the type change from int
> > > to bool could be split out to a separate patch to reduce churn here?
> >
> > I don't really understand this comment. This reduces the number of lines of
> > code, and it's a line I have to change anyway, so there'd be _more_ churn
> > to split this out?
> >
> > I don't think this is really all that important, but it'd be very silly to
> > split this out in my opinion.
>
> Possibly a matter of taste. The churn is further down:
>
> >
> > >
> > > > {
> > > > - if (vma->vm_mm->map_count >= sysctl_max_map_count)
> > > > + if (vmg->vma->vm_mm->map_count >= sysctl_max_map_count)
> > > > return -ENOMEM;
> > > >
> > > > - return __split_vma(vmi, vma, addr, new_below);
> > > > + return __split_vma(vmg->vmi, vmg->vma,
> > > > + new_below ? vmg->start : vmg->end,
> > > > + new_below);
> > > > }
> > > >
> > > > /*
> > > > @@ -712,7 +708,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > > if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
> > > > goto map_count_exceeded;
> > > >
> > > > - error = __split_vma(vmi, vma, start, 1);
> > > > + error = __split_vma(vmi, vma, start, true);
>
> Churn here.
>
> > > > if (error)
> > > > goto start_split_failed;
> > > > }
> > > > @@ -725,7 +721,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > > > do {
> > > > /* Does it split the end? */
> > > > if (next->vm_end > end) {
> > > > - error = __split_vma(vmi, next, end, 0);
> > > > + error = __split_vma(vmi, next, end, false);
>
> Churn here.
>
Right, this is extremely silly. Churn isn't a synonym for 'literally any
change that you don't think has immediate value'. It implies _significant_
changes made for little to no value.
This is an absolutely tiny change, made while already changing the
signature, and it improves the code quality.
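Compare the call sites from the hunks above:

	error = __split_vma(vmi, vma, start, 1);	/* 1 of what? */
	error = __split_vma(vmi, vma, start, true);	/* clearly a yes/no flag */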
> But you're right, no big deal.
>
I'm glad we agree on that :)
> > > > if (error)
> > > > goto end_split_failed;
> > > > }
> > > > @@ -934,16 +930,10 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> > > > * **** is not represented - it will be merged and the vma containing the
> > > > * area is returned, or the function will return NULL
> > > > */
> > > > -static struct vm_area_struct
> > > > -*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > > > - struct vm_area_struct *src, unsigned long addr, unsigned long end,
> > > > - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
> > > > - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > > > - struct anon_vma_name *anon_name)
> > > > +static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
> > > > {
> > > > - struct mm_struct *mm = src->vm_mm;
> > > > - struct anon_vma *anon_vma = src->anon_vma;
> > > > - struct file *file = src->vm_file;
> > > > + struct mm_struct *mm = container_of(vmg->vmi->mas.tree, struct mm_struct, mm_mt);
> > > > + struct vm_area_struct *prev = vmg->prev;
> > > > struct vm_area_struct *curr, *next, *res;
> > > > struct vm_area_struct *vma, *adjust, *remove, *remove2;
> > > > struct vm_area_struct *anon_dup = NULL;
> > > > @@ -953,16 +943,18 @@ static struct vm_area_struct
> > > > bool merge_prev = false;
> > > > bool merge_next = false;
> > > > bool vma_expanded = false;
> > > > + unsigned long addr = vmg->start;
> > > > + unsigned long end = vmg->end;
> > > > unsigned long vma_start = addr;
> > > > unsigned long vma_end = end;
> > > > - pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
> > > > + pgoff_t pglen = PHYS_PFN(end - addr);
> > > > long adj_start = 0;
> > > >
> > > > /*
> > > > * We later require that vma->vm_flags == vm_flags,
> > > > * so this tests vma->vm_flags & VM_SPECIAL, too.
> > > > */
> > > > - if (vm_flags & VM_SPECIAL)
> > > > + if (vmg->flags & VM_SPECIAL)
> > > > return NULL;
> > > >
> > > > /* Does the input range span an existing VMA? (cases 5 - 8) */
> > > > @@ -970,27 +962,26 @@ static struct vm_area_struct
> > > >
> > > > if (!curr || /* cases 1 - 4 */
> > > > end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
> > > > - next = vma_lookup(mm, end);
> > > > + next = vmg->next = vma_lookup(mm, end);
> > > > else
> > > > - next = NULL; /* case 5 */
> > > > + next = vmg->next = NULL; /* case 5 */
> > >
> > > Again, is it worth keeping the "next" variable, or could we replace it
> > > with "vmg->next" everywhere?
> >
> > I already responded to this previously, but equally: I'm explicitly using a
> > local variable to keep the code relatively simple and to avoid constantly
> > dereferencing vmg.
>
> Yeah, sure. OTOH whoever looks at the code may ask why there is both
> "vmg->next" and "next" and if they're really (supposed to be) the same
> thing or if there's a subtle difference.
Again, I ultimately delete this code, so this is not really worth spending
much time on.
>
> Petr T
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters
2024-08-06 14:20 ` Lorenzo Stoakes
@ 2024-08-06 14:32 ` Petr Tesařík
0 siblings, 0 replies; 53+ messages in thread
From: Petr Tesařík @ 2024-08-06 14:32 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, 6 Aug 2024 15:20:49 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> On Tue, Aug 06, 2024 at 04:06:50PM GMT, Petr Tesařík wrote:
> > On Tue, 6 Aug 2024 14:43:48 +0100
> > Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> >
> > > On Tue, Aug 06, 2024 at 02:47:54PM GMT, Petr Tesařík wrote:
> > > > Hi Lorenzo!
> > > >
> > > > On Mon, 5 Aug 2024 13:13:49 +0100
> > > > Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> > > >
> > > > > Rather than passing around huge numbers of parameters to numerous helper
> > > > > functions, abstract them into a single struct that we thread through the
> > > > > operation.
> > > > >
> > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > > > ---
> > > > > mm/mmap.c | 76 ++++++++------
> > > > > mm/vma.c | 297 ++++++++++++++++++++++++++++++++++++++----------------
> > > > > mm/vma.h | 92 ++++++++---------
> > > > > 3 files changed, 294 insertions(+), 171 deletions(-)
> > > > >
> > > > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > > > index 4a9c2329b09a..f931000c561f 100644
> > > > > --- a/mm/mmap.c
> > > > > +++ b/mm/mmap.c
> > > > > @@ -1369,9 +1369,16 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > > > unsigned long end = addr + len;
> > > > > unsigned long merge_start = addr, merge_end = end;
> > > > > bool writable_file_mapping = false;
> > > > > - pgoff_t vm_pgoff;
> > > > > int error;
> > > > > VMA_ITERATOR(vmi, mm, addr);
> > > > > + struct vma_merge_struct vmg = {
> > > > > + .vmi = &vmi,
> > > > > + .start = addr,
> > > > > + .end = end,
> > > > > + .flags = vm_flags,
> > > > > + .pgoff = pgoff,
> > > > > + .file = file,
> > > > > + };
> > > > >
> > > > > /* Check against address space limit. */
> > > > > if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> > > > > @@ -1405,8 +1412,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > > > vm_flags |= VM_ACCOUNT;
> > > > > }
> > > > >
> > > > > - next = vma_next(&vmi);
> > > > > - prev = vma_prev(&vmi);
> > > > > + next = vmg.next = vma_next(&vmi);
> > > > > + prev = vmg.prev = vma_prev(&vmi);
> > > >
> > > > So, next is now a shortcut for vmg.next, and prev is a shortcut for
> > > > vmg.prev. ATM there is only one assignment, so no big deal, but I
> > > > wonder if next and prev could be removed instead, same as you replaced
> > > > vm_pgoff with vmg.pgoff.
> > >
> > > It's simply to avoid repeatedly referencing vmg.xxx / at least reduce
> > > _some_ churn. Also this will get moved shortly, so it's worth looking at in
> > > final form.
> >
> > I'm not an MM maintainer, so my comments may not be relevant, but my
> > experience shows that pointer aliases have the potential to introduce all
> > kinds of subtle bugs. That's the reason I generally try to avoid them.
>
> Right, I understand, I don't want to get too deep into a distracting bike
> shed when this series is doing something quite major.
>
> If you feel this is absolutely critical, I can adjust this code that I
> later delete, if not I suggest leaving it as it is.
Fair enough. I missed that _both_ occurrences of the pointer aliases are
deleted later.
Then you're right, it's fine as is. No more bike shedding.
Petr T
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters
2024-08-05 12:13 ` [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters Lorenzo Stoakes
2024-08-06 12:47 ` Petr Tesařík
@ 2024-08-08 12:49 ` Vlastimil Babka
2024-08-08 17:18 ` Lorenzo Stoakes
2024-08-08 20:07 ` Liam R. Howlett
2 siblings, 1 reply; 53+ messages in thread
From: Vlastimil Babka @ 2024-08-08 12:49 UTC (permalink / raw)
To: Lorenzo Stoakes, linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett
On 8/5/24 14:13, Lorenzo Stoakes wrote:
> Rather than passing around huge numbers of parameters to numerous helper
> functions, abstract them into a single struct that we thread through the
> operation.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
In general,
Acked-by: Vlastimil Babka <vbabka@suse.cz>
It would be great to have the tests already at this point but I understand
they depend on this. At least the result can be tested later in the series...
I haven't seen the final form yet, so some suggestions may become moot.
> +static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
> {
> - struct mm_struct *mm = src->vm_mm;
> - struct anon_vma *anon_vma = src->anon_vma;
> - struct file *file = src->vm_file;
> + struct mm_struct *mm = container_of(vmg->vmi->mas.tree, struct mm_struct, mm_mt);
This feels weird, but I'll leave it to Liam. Can't we just pass the mm? Hope
it's one of the things that will disappear in a later patch :)
> + struct vm_area_struct *prev = vmg->prev;
> struct vm_area_struct *curr, *next, *res;
> struct vm_area_struct *vma, *adjust, *remove, *remove2;
> struct vm_area_struct *anon_dup = NULL;
<snip>
> +/* Assumes addr >= vma->vm_start. */
> +static pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr)
> +{
> + return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
> +}
> +
> +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start, unsigned long end,
> + unsigned long new_flags)
> +{
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = new_flags,
> + .pgoff = vma_pgoff_offset(vma, start),
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
> +
> + return vma_modify(&vmg);
> +}
> +
> +struct vm_area_struct
> +*vma_modify_flags_name(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start,
> + unsigned long end,
> + unsigned long new_flags,
> + struct anon_vma_name *new_name)
> +{
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = new_flags,
> + .pgoff = vma_pgoff_offset(vma, start),
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = new_name,
> + };
> +
> + return vma_modify(&vmg);
> +}
> +
> +struct vm_area_struct
> +*vma_modify_policy(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start, unsigned long end,
> + struct mempolicy *new_pol)
> +{
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = vma->vm_flags,
> + .pgoff = vma_pgoff_offset(vma, start),
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = new_pol,
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
> +
> + return vma_modify(&vmg);
> +}
> +
> +struct vm_area_struct
> +*vma_modify_flags_uffd(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start, unsigned long end,
> + unsigned long new_flags,
> + struct vm_userfaultfd_ctx new_ctx)
> +{
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = new_flags,
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .pgoff = vma_pgoff_offset(vma, start),
> + .policy = vma_policy(vma),
> + .uffd_ctx = new_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
> +
> + return vma_modify(&vmg);
> }
>
> /*
> @@ -1180,8 +1274,22 @@ struct vm_area_struct
> struct vm_area_struct *vma, unsigned long start,
> unsigned long end, pgoff_t pgoff)
> {
> - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
> - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = vma->vm_flags,
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .pgoff = pgoff,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
> +
> + return vma_merge(&vmg);
> }
>
> /*
> @@ -1193,11 +1301,22 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> unsigned long delta)
> {
> pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = vma,
> + .vma = vma,
> + .start = vma->vm_end,
> + .end = vma->vm_end + delta,
> + .flags = vma->vm_flags,
> + .file = vma->vm_file,
> + .pgoff = pgoff,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
Wonder if there's a way to initialize a "standard" vmg and then apply the
usage-specific differences on top, instead of needing many repeated but
subtly different blocks like above.
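Something like the below, say — just a sketch reusing this patch's
vma_pgoff_offset() helper, with a made-up vmg_from_vma() name — where a
constructor fills in the common defaults and each caller overrides only
what differs:

	/* Hypothetical: default-populate a vmg from the VMA being modified. */
	static struct vma_merge_struct vmg_from_vma(struct vma_iterator *vmi,
			struct vm_area_struct *prev, struct vm_area_struct *vma,
			unsigned long start, unsigned long end)
	{
		struct vma_merge_struct vmg = {
			.vmi = vmi,
			.prev = prev,
			.vma = vma,
			.start = start,
			.end = end,
			.flags = vma->vm_flags,
			.pgoff = vma_pgoff_offset(vma, start),
			.file = vma->vm_file,
			.anon_vma = vma->anon_vma,
			.policy = vma_policy(vma),
			.uffd_ctx = vma->vm_userfaultfd_ctx,
			.anon_name = anon_vma_name(vma),
		};

		return vmg;
	}

Then e.g. vma_modify_policy() would reduce to:

	struct vma_merge_struct vmg = vmg_from_vma(vmi, prev, vma, start, end);

	vmg.policy = new_pol;	/* the one usage-specific difference */
	return vma_modify(&vmg);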
>
> /* vma is specified as prev, so case 1 or 2 will apply. */
> - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
> - vma->vm_flags, pgoff, vma_policy(vma),
> - vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> + return vma_merge(&vmg);
> }
>
> void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> diff --git a/mm/vma.h b/mm/vma.h
> index 6efdf1768a0a..c31684cc1da6 100644
^ permalink raw reply [flat|nested] 53+ messages in thread

* Re: [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters
2024-08-08 12:49 ` Vlastimil Babka
@ 2024-08-08 17:18 ` Lorenzo Stoakes
0 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-08 17:18 UTC (permalink / raw)
To: Vlastimil Babka; +Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett
On Thu, Aug 08, 2024 at 02:49:03PM GMT, Vlastimil Babka wrote:
> On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > Rather than passing around huge numbers of parameters to numerous helper
> > functions, abstract them into a single struct that we thread through the
> > operation.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> In general,
>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
>
> It would be great to have the tests already at this point but I understand
> they depend on this. At least the result can be tested later in the series...
Yeah it felt like too much churn (even for me ;) to do a before vmg/after
vmg version, but I could also still do this. At that point it might be
worth adding benchmarks too to assess impact...
>
> I haven't seen the final form yet, so some suggestions may become moot.
Umm...
>
> > +static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
> > {
> > - struct mm_struct *mm = src->vm_mm;
> > - struct anon_vma *anon_vma = src->anon_vma;
> > - struct file *file = src->vm_file;
> > + struct mm_struct *mm = container_of(vmg->vmi->mas.tree, struct mm_struct, mm_mt);
>
> This feels weird, but I'll leave it to Liam. Can't we just pass the mm? Hope
> it's one of the things that will disappear in a later patch :)
Yeah it is weird, I felt dirty and ashamed writing this, so I fully
understand if Liam wouldn't like it.
Previously we'd actually dictate the need for a vma here, but that made it
trickier to write the tests, I think.
Anyway, maybe we just want to thread an mm?
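E.g. (sketch only — whether mm belongs in the struct at all, and where, is
a guess):

	struct vma_merge_struct {
		struct mm_struct *mm;	/* hypothetical: set once by the caller */
		struct vma_iterator *vmi;
		/* ... rest as in this patch ... */
	};

with initializers doing .mm = current->mm (or whatever mm the caller
already holds), so vma_merge() reads vmg->mm directly rather than digging
it out of the iterator via container_of().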
>
> > + struct vm_area_struct *prev = vmg->prev;
> > struct vm_area_struct *curr, *next, *res;
> > struct vm_area_struct *vma, *adjust, *remove, *remove2;
> > struct vm_area_struct *anon_dup = NULL;
>
> <snip>
>
> > +/* Assumes addr >= vma->vm_start. */
> > +static pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr)
> > +{
> > + return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
> > +}
> > +
> > +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + unsigned long new_flags)
> > +{
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = new_flags,
> > + .pgoff = vma_pgoff_offset(vma, start),
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> > +
> > + return vma_modify(&vmg);
> > +}
> > +
> > +struct vm_area_struct
> > +*vma_modify_flags_name(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start,
> > + unsigned long end,
> > + unsigned long new_flags,
> > + struct anon_vma_name *new_name)
> > +{
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = new_flags,
> > + .pgoff = vma_pgoff_offset(vma, start),
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = new_name,
> > + };
> > +
> > + return vma_modify(&vmg);
> > +}
> > +
> > +struct vm_area_struct
> > +*vma_modify_policy(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + struct mempolicy *new_pol)
> > +{
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = vma->vm_flags,
> > + .pgoff = vma_pgoff_offset(vma, start),
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = new_pol,
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> > +
> > + return vma_modify(&vmg);
> > +}
> > +
> > +struct vm_area_struct
> > +*vma_modify_flags_uffd(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + unsigned long new_flags,
> > + struct vm_userfaultfd_ctx new_ctx)
> > +{
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = new_flags,
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .pgoff = vma_pgoff_offset(vma, start),
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = new_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> > +
> > + return vma_modify(&vmg);
> > }
> >
> > /*
> > @@ -1180,8 +1274,22 @@ struct vm_area_struct
> > struct vm_area_struct *vma, unsigned long start,
> > unsigned long end, pgoff_t pgoff)
> > {
> > - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
> > - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = vma->vm_flags,
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .pgoff = pgoff,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> > +
> > + return vma_merge(&vmg);
> > }
> >
> > /*
> > @@ -1193,11 +1301,22 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> > unsigned long delta)
> > {
> > pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = vma,
> > + .vma = vma,
> > + .start = vma->vm_end,
> > + .end = vma->vm_end + delta,
> > + .flags = vma->vm_flags,
> > + .file = vma->vm_file,
> > + .pgoff = pgoff,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
>
> Wonder if there's a way to initialize a "standard" vmg and then apply the
> usage-specific differences on top, instead of needing many repeated but
> subtly different blocks like above.
Yes that'd be nice, I had the same thought but just hadn't got round to
doing it yet.
Will look at it on the next respin.
>
> >
> > /* vma is specified as prev, so case 1 or 2 will apply. */
> > - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
> > - vma->vm_flags, pgoff, vma_policy(vma),
> > - vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > + return vma_merge(&vmg);
> > }
> >
> > void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> > diff --git a/mm/vma.h b/mm/vma.h
> > index 6efdf1768a0a..c31684cc1da6 100644
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters
2024-08-05 12:13 ` [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters Lorenzo Stoakes
2024-08-06 12:47 ` Petr Tesařík
2024-08-08 12:49 ` Vlastimil Babka
@ 2024-08-08 20:07 ` Liam R. Howlett
2024-08-09 10:11 ` Lorenzo Stoakes
2 siblings, 1 reply; 53+ messages in thread
From: Liam R. Howlett @ 2024-08-08 20:07 UTC (permalink / raw)
To: Lorenzo Stoakes; +Cc: linux-mm, linux-kernel, Andrew Morton, Vlastimil Babka
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240805 08:14]:
> Rather than passing around huge numbers of parameters to numerous helper
> functions, abstract them into a single struct that we thread through the
> operation.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/mmap.c | 76 ++++++++------
> mm/vma.c | 297 ++++++++++++++++++++++++++++++++++++++----------------
> mm/vma.h | 92 ++++++++---------
> 3 files changed, 294 insertions(+), 171 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 4a9c2329b09a..f931000c561f 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1369,9 +1369,16 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> unsigned long end = addr + len;
> unsigned long merge_start = addr, merge_end = end;
> bool writable_file_mapping = false;
> - pgoff_t vm_pgoff;
> int error;
> VMA_ITERATOR(vmi, mm, addr);
> + struct vma_merge_struct vmg = {
> + .vmi = &vmi,
> + .start = addr,
> + .end = end,
> + .flags = vm_flags,
> + .pgoff = pgoff,
> + .file = file,
> + };
>
> /* Check against address space limit. */
> if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> @@ -1405,8 +1412,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> vm_flags |= VM_ACCOUNT;
> }
>
> - next = vma_next(&vmi);
> - prev = vma_prev(&vmi);
> + next = vmg.next = vma_next(&vmi);
> + prev = vmg.prev = vma_prev(&vmi);
> if (vm_flags & VM_SPECIAL) {
> if (prev)
> vma_iter_next_range(&vmi);
> @@ -1416,29 +1423,30 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> /* Attempt to expand an old mapping */
> /* Check next */
> if (next && next->vm_start == end && !vma_policy(next) &&
> - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
> - NULL_VM_UFFD_CTX, NULL)) {
> + can_vma_merge_before(&vmg)) {
> merge_end = next->vm_end;
> vma = next;
> - vm_pgoff = next->vm_pgoff - pglen;
> + vmg.pgoff = next->vm_pgoff - pglen;
> + }
> +
> + if (vma) {
> + vmg.anon_vma = vma->anon_vma;
> + vmg.uffd_ctx = vma->vm_userfaultfd_ctx;
> }
>
> /* Check prev */
> if (prev && prev->vm_end == addr && !vma_policy(prev) &&
> - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
> - pgoff, vma->vm_userfaultfd_ctx, NULL) :
> - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
> - NULL_VM_UFFD_CTX, NULL))) {
> + can_vma_merge_after(&vmg)) {
> merge_start = prev->vm_start;
> vma = prev;
> - vm_pgoff = prev->vm_pgoff;
> + vmg.pgoff = prev->vm_pgoff;
> } else if (prev) {
> vma_iter_next_range(&vmi);
> }
>
> /* Actually expand, if possible */
> if (vma &&
> - !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
> + !vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) {
> khugepaged_enter_vma(vma, vm_flags);
> goto expanded;
> }
> @@ -1790,25 +1798,31 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> * Expand the existing vma if possible; Note that singular lists do not
> * occur after forking, so the expand will only happen on new VMAs.
> */
> - if (vma && vma->vm_end == addr && !vma_policy(vma) &&
> - can_vma_merge_after(vma, flags, NULL, NULL,
> - addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
> - vma_iter_config(vmi, vma->vm_start, addr + len);
> - if (vma_iter_prealloc(vmi, vma))
> - goto unacct_fail;
> -
> - vma_start_write(vma);
> -
> - init_vma_prep(&vp, vma);
> - vma_prepare(&vp);
> - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> - vma->vm_end = addr + len;
> - vm_flags_set(vma, VM_SOFTDIRTY);
> - vma_iter_store(vmi, vma);
> -
> - vma_complete(&vp, vmi, mm);
> - khugepaged_enter_vma(vma, flags);
> - goto out;
> + if (vma && vma->vm_end == addr && !vma_policy(vma)) {
> + struct vma_merge_struct vmg = {
> + .prev = vma,
> + .flags = flags,
> + .pgoff = addr >> PAGE_SHIFT,
> + };
I see that this struct here makes sense later.
> +
> + if (can_vma_merge_after(&vmg)) {
> + vma_iter_config(vmi, vma->vm_start, addr + len);
> + if (vma_iter_prealloc(vmi, vma))
> + goto unacct_fail;
> +
> + vma_start_write(vma);
> +
> + init_vma_prep(&vp, vma);
> + vma_prepare(&vp);
> + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> + vma->vm_end = addr + len;
> + vm_flags_set(vma, VM_SOFTDIRTY);
> + vma_iter_store(vmi, vma);
> +
> + vma_complete(&vp, vmi, mm);
> + khugepaged_enter_vma(vma, flags);
> + goto out;
> + }
> }
>
> if (vma)
> diff --git a/mm/vma.c b/mm/vma.c
> index bf0546fe6eab..20c4ce7712c0 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -7,16 +7,18 @@
> #include "vma_internal.h"
> #include "vma.h"
>
> -/*
> - * If the vma has a ->close operation then the driver probably needs to release
> - * per-vma resources, so we don't attempt to merge those if the caller indicates
> - * the current vma may be removed as part of the merge.
> - */
> -static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> - struct file *file, unsigned long vm_flags,
> - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> - struct anon_vma_name *anon_name, bool may_remove_vma)
> +static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
> {
> + struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
> + /*
> + * If the vma has a ->close operation then the driver probably needs to
> + * release per-vma resources, so we don't attempt to merge those if the
> + * caller indicates the current vma may be removed as part of the merge,
> + * which is the case if we are attempting to merge the next VMA into
> + * this one.
> + */
> + bool may_remove_vma = merge_next;
> +
> /*
> * VM_SOFTDIRTY should not prevent from VMA merging, if we
> * match the flags but dirty bit -- the caller should mark
> @@ -25,15 +27,15 @@ static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> * the kernel to generate new VMAs when old one could be
> * extended instead.
> */
> - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
> + if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
> return false;
> - if (vma->vm_file != file)
> + if (vma->vm_file != vmg->file)
> return false;
> if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
> return false;
> - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
> + if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
> return false;
> - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
> + if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
> return false;
> return true;
> }
> @@ -94,16 +96,16 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> * We assume the vma may be removed as part of the merge.
> */
> bool
> -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> - struct anon_vma *anon_vma, struct file *file,
> - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> - struct anon_vma_name *anon_name)
> +can_vma_merge_before(struct vma_merge_struct *vmg)
> {
> - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
> - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> - if (vma->vm_pgoff == vm_pgoff)
> + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> +
> + if (is_mergeable_vma(vmg, true) &&
> + is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
> + if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
> return true;
> }
> +
> return false;
> }
>
> @@ -116,18 +118,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> *
> * We assume that vma is not removed as part of the merge.
> */
> -bool
> -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> - struct anon_vma *anon_vma, struct file *file,
> - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> - struct anon_vma_name *anon_name)
> +bool can_vma_merge_after(struct vma_merge_struct *vmg)
> {
> - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
> - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> - pgoff_t vm_pglen;
> -
> - vm_pglen = vma_pages(vma);
> - if (vma->vm_pgoff + vm_pglen == vm_pgoff)
> + if (is_mergeable_vma(vmg, false) &&
> + is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> + if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
> return true;
> }
> return false;
> @@ -180,7 +175,7 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> * VMA Iterator will point to the end VMA.
> */
> static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> - unsigned long addr, int new_below)
> + unsigned long addr, bool new_below)
> {
> struct vma_prepare vp;
> struct vm_area_struct *new;
> @@ -261,13 +256,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> * Split a vma into two pieces at address 'addr', a new vma is allocated
> * either for the first part or the tail.
> */
> -static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> - unsigned long addr, int new_below)
> +static int split_vma(struct vma_merge_struct *vmg, bool new_below)
> {
> - if (vma->vm_mm->map_count >= sysctl_max_map_count)
> + if (vmg->vma->vm_mm->map_count >= sysctl_max_map_count)
> return -ENOMEM;
>
> - return __split_vma(vmi, vma, addr, new_below);
> + return __split_vma(vmg->vmi, vmg->vma,
> + new_below ? vmg->start : vmg->end,
> + new_below);
Why did this get worse?
Maybe split the __split_vma changes out of this patch? I mean, split is
used for more than just merging; it's used for unmapping as well.
Changing argument types in __split_vma() seems unrelated to the main
focus of this patch.
> }
>
> /*
> @@ -712,7 +708,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
> goto map_count_exceeded;
>
> - error = __split_vma(vmi, vma, start, 1);
> + error = __split_vma(vmi, vma, start, true);
> if (error)
> goto start_split_failed;
> }
> @@ -725,7 +721,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> do {
> /* Does it split the end? */
> if (next->vm_end > end) {
> - error = __split_vma(vmi, next, end, 0);
> + error = __split_vma(vmi, next, end, false);
> if (error)
> goto end_split_failed;
> }
> @@ -934,16 +930,10 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> * **** is not represented - it will be merged and the vma containing the
> * area is returned, or the function will return NULL
> */
> -static struct vm_area_struct
> -*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
> - struct vm_area_struct *src, unsigned long addr, unsigned long end,
> - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
> - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> - struct anon_vma_name *anon_name)
> +static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
> {
> - struct mm_struct *mm = src->vm_mm;
> - struct anon_vma *anon_vma = src->anon_vma;
> - struct file *file = src->vm_file;
> + struct mm_struct *mm = container_of(vmg->vmi->mas.tree, struct mm_struct, mm_mt);
No. current->mm is way better, or put it in that vmg thing.
> + struct vm_area_struct *prev = vmg->prev;
> struct vm_area_struct *curr, *next, *res;
> struct vm_area_struct *vma, *adjust, *remove, *remove2;
> struct vm_area_struct *anon_dup = NULL;
> @@ -953,16 +943,18 @@ static struct vm_area_struct
> bool merge_prev = false;
> bool merge_next = false;
> bool vma_expanded = false;
> + unsigned long addr = vmg->start;
> + unsigned long end = vmg->end;
> unsigned long vma_start = addr;
> unsigned long vma_end = end;
> - pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
> + pgoff_t pglen = PHYS_PFN(end - addr);
> long adj_start = 0;
>
> /*
> * We later require that vma->vm_flags == vm_flags,
> * so this tests vma->vm_flags & VM_SPECIAL, too.
> */
> - if (vm_flags & VM_SPECIAL)
> + if (vmg->flags & VM_SPECIAL)
> return NULL;
>
> /* Does the input range span an existing VMA? (cases 5 - 8) */
> @@ -970,27 +962,26 @@ static struct vm_area_struct
>
> if (!curr || /* cases 1 - 4 */
> end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
> - next = vma_lookup(mm, end);
> + next = vmg->next = vma_lookup(mm, end);
> else
> - next = NULL; /* case 5 */
> + next = vmg->next = NULL; /* case 5 */
>
> if (prev) {
> vma_start = prev->vm_start;
> vma_pgoff = prev->vm_pgoff;
>
> /* Can we merge the predecessor? */
> - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
> - && can_vma_merge_after(prev, vm_flags, anon_vma, file,
> - pgoff, vm_userfaultfd_ctx, anon_name)) {
> + if (addr == prev->vm_end && mpol_equal(vma_policy(prev), vmg->policy)
> + && can_vma_merge_after(vmg)) {
> +
> merge_prev = true;
> - vma_prev(vmi);
> + vma_prev(vmg->vmi);
> }
> }
>
> /* Can we merge the successor? */
> - if (next && mpol_equal(policy, vma_policy(next)) &&
> - can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
> - vm_userfaultfd_ctx, anon_name)) {
> + if (next && mpol_equal(vmg->policy, vma_policy(next)) &&
> + can_vma_merge_before(vmg)) {
> merge_next = true;
> }
>
> @@ -1041,7 +1032,7 @@ static struct vm_area_struct
> remove = curr;
> } else { /* case 5 */
> adjust = curr;
> - adj_start = (end - curr->vm_start);
> + adj_start = end - curr->vm_start;
> }
> if (!err)
> err = dup_anon_vma(prev, curr, &anon_dup);
> @@ -1081,13 +1072,13 @@ static struct vm_area_struct
> vma_expanded = true;
>
> if (vma_expanded) {
> - vma_iter_config(vmi, vma_start, vma_end);
> + vma_iter_config(vmg->vmi, vma_start, vma_end);
> } else {
> - vma_iter_config(vmi, adjust->vm_start + adj_start,
> + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> adjust->vm_end);
> }
>
> - if (vma_iter_prealloc(vmi, vma))
> + if (vma_iter_prealloc(vmg->vmi, vma))
> goto prealloc_fail;
>
> init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
> @@ -1099,19 +1090,19 @@ static struct vm_area_struct
> vma_set_range(vma, vma_start, vma_end, vma_pgoff);
>
> if (vma_expanded)
> - vma_iter_store(vmi, vma);
> + vma_iter_store(vmg->vmi, vma);
>
> if (adj_start) {
> adjust->vm_start += adj_start;
> adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
> if (adj_start < 0) {
> WARN_ON(vma_expanded);
> - vma_iter_store(vmi, next);
> + vma_iter_store(vmg->vmi, next);
> }
> }
>
> - vma_complete(&vp, vmi, mm);
> - khugepaged_enter_vma(res, vm_flags);
> + vma_complete(&vp, vmg->vmi, mm);
> + khugepaged_enter_vma(res, vmg->flags);
> return res;
>
> prealloc_fail:
> @@ -1119,8 +1110,8 @@ static struct vm_area_struct
> unlink_anon_vmas(anon_dup);
>
> anon_vma_fail:
> - vma_iter_set(vmi, addr);
> - vma_iter_load(vmi);
> + vma_iter_set(vmg->vmi, addr);
> + vma_iter_load(vmg->vmi);
> return NULL;
> }
>
> @@ -1137,38 +1128,141 @@ static struct vm_area_struct
> * The function returns either the merged VMA, the original VMA if a split was
> * required instead, or an error if the split failed.
> */
> -struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
> - struct vm_area_struct *prev,
> - struct vm_area_struct *vma,
> - unsigned long start, unsigned long end,
> - unsigned long vm_flags,
> - struct mempolicy *policy,
> - struct vm_userfaultfd_ctx uffd_ctx,
> - struct anon_vma_name *anon_name)
> +static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
> {
> - pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> + struct vm_area_struct *vma = vmg->vma;
> struct vm_area_struct *merged;
>
> - merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
> - pgoff, policy, uffd_ctx, anon_name);
> + /* First, try to merge. */
> + merged = vma_merge(vmg);
> if (merged)
> return merged;
>
> - if (vma->vm_start < start) {
> - int err = split_vma(vmi, vma, start, 1);
> + /* Split any preceding portion of the VMA. */
> + if (vma->vm_start < vmg->start) {
> + int err = split_vma(vmg, true);
>
> if (err)
> return ERR_PTR(err);
> }
>
> - if (vma->vm_end > end) {
> - int err = split_vma(vmi, vma, end, 0);
> + /* Split any trailing portion of the VMA. */
> + if (vma->vm_end > vmg->end) {
> + int err = split_vma(vmg, false);
>
> if (err)
> return ERR_PTR(err);
> }
>
> - return vma;
> + return vmg->vma;
> +}
> +
> +/* Assumes addr >= vma->vm_start. */
> +static pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr)
> +{
> + return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
> +}
> +
Could we do something like the MA_STATE() in the maple_tree.h for the
below?
> +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start, unsigned long end,
> + unsigned long new_flags)
> +{
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = new_flags,
> + .pgoff = vma_pgoff_offset(vma, start),
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
> +
> + return vma_modify(&vmg);
> +}
> +
> +struct vm_area_struct
> +*vma_modify_flags_name(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start,
> + unsigned long end,
> + unsigned long new_flags,
> + struct anon_vma_name *new_name)
> +{
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = new_flags,
> + .pgoff = vma_pgoff_offset(vma, start),
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = new_name,
> + };
> +
> + return vma_modify(&vmg);
> +}
> +
> +struct vm_area_struct
> +*vma_modify_policy(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start, unsigned long end,
> + struct mempolicy *new_pol)
> +{
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = vma->vm_flags,
> + .pgoff = vma_pgoff_offset(vma, start),
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = new_pol,
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
> +
> + return vma_modify(&vmg);
> +}
> +
> +struct vm_area_struct
> +*vma_modify_flags_uffd(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start, unsigned long end,
> + unsigned long new_flags,
> + struct vm_userfaultfd_ctx new_ctx)
> +{
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = new_flags,
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .pgoff = vma_pgoff_offset(vma, start),
> + .policy = vma_policy(vma),
> + .uffd_ctx = new_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
> +
> + return vma_modify(&vmg);
> }
>
> /*
> @@ -1180,8 +1274,22 @@ struct vm_area_struct
> struct vm_area_struct *vma, unsigned long start,
> unsigned long end, pgoff_t pgoff)
> {
> - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
> - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = prev,
> + .vma = vma,
> + .start = start,
> + .end = end,
> + .flags = vma->vm_flags,
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .pgoff = pgoff,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
> +
> + return vma_merge(&vmg);
> }
>
> /*
> @@ -1193,11 +1301,22 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> unsigned long delta)
> {
> pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
> + struct vma_merge_struct vmg = {
> + .vmi = vmi,
> + .prev = vma,
> + .vma = vma,
> + .start = vma->vm_end,
> + .end = vma->vm_end + delta,
> + .flags = vma->vm_flags,
> + .file = vma->vm_file,
> + .pgoff = pgoff,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
>
> /* vma is specified as prev, so case 1 or 2 will apply. */
> - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
> - vma->vm_flags, pgoff, vma_policy(vma),
> - vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> + return vma_merge(&vmg);
> }
>
> void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> diff --git a/mm/vma.h b/mm/vma.h
> index 6efdf1768a0a..c31684cc1da6 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -26,6 +26,23 @@ struct unlink_vma_file_batch {
> struct vm_area_struct *vmas[8];
> };
>
> +/* Represents a VMA merge operation. */
> +struct vma_merge_struct {
> + struct vma_iterator *vmi;
> + struct vm_area_struct *prev;
> + struct vm_area_struct *next; /* Modified by vma_merge(). */
> + struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */
> + unsigned long start;
> + unsigned long end;
> + unsigned long flags;
> + pgoff_t pgoff;
> + struct file *file;
> + struct anon_vma *anon_vma;
> + struct mempolicy *policy;
> + struct vm_userfaultfd_ctx uffd_ctx;
> + struct anon_vma_name *anon_name;
> +};
> +
> #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
> void validate_mm(struct mm_struct *mm);
> #else
> @@ -72,80 +89,53 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> struct vm_area_struct *next, unsigned long start,
> unsigned long end, unsigned long tree_end, bool mm_wr_locked);
>
> -/* Required by mmap_region(). */
> -bool
> -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> - struct anon_vma *anon_vma, struct file *file,
> - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> - struct anon_vma_name *anon_name);
> -
> -/* Required by mmap_region() and do_brk_flags(). */
> -bool
> -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> - struct anon_vma *anon_vma, struct file *file,
> - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> - struct anon_vma_name *anon_name);
> -
> -struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
> - struct vm_area_struct *prev,
> - struct vm_area_struct *vma,
> - unsigned long start, unsigned long end,
> - unsigned long vm_flags,
> - struct mempolicy *policy,
> - struct vm_userfaultfd_ctx uffd_ctx,
> - struct anon_vma_name *anon_name);
> +/*
> + * Can we merge the VMA described by vmg into the following VMA vmg->next?
> + *
> + * Required by mmap_region().
> + */
> +bool can_vma_merge_before(struct vma_merge_struct *vmg);
> +
> +/*
> + * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> + *
> + * Required by mmap_region() and do_brk_flags().
> + */
> +bool can_vma_merge_after(struct vma_merge_struct *vmg);
>
> /* We are about to modify the VMA's flags. */
> -static inline struct vm_area_struct
> -*vma_modify_flags(struct vma_iterator *vmi,
> - struct vm_area_struct *prev,
> - struct vm_area_struct *vma,
> - unsigned long start, unsigned long end,
> - unsigned long new_flags)
> -{
> - return vma_modify(vmi, prev, vma, start, end, new_flags,
> - vma_policy(vma), vma->vm_userfaultfd_ctx,
> - anon_vma_name(vma));
> -}
> +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> + struct vm_area_struct *prev,
> + struct vm_area_struct *vma,
> + unsigned long start, unsigned long end,
> + unsigned long new_flags);
Functions with an insane number of arguments can be written like this:
struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
struct vm_area_struct *prev, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags);
The advantage is that more of the insane arguments are on the same line,
and if the name changes it will limit the impact.
>
> /* We are about to modify the VMA's flags and/or anon_name. */
> -static inline struct vm_area_struct
> +struct vm_area_struct
> *vma_modify_flags_name(struct vma_iterator *vmi,
> struct vm_area_struct *prev,
> struct vm_area_struct *vma,
> unsigned long start,
> unsigned long end,
> unsigned long new_flags,
> - struct anon_vma_name *new_name)
> -{
> - return vma_modify(vmi, prev, vma, start, end, new_flags,
> - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
> -}
> + struct anon_vma_name *new_name);
>
> /* We are about to modify the VMA's memory policy. */
> -static inline struct vm_area_struct
> +struct vm_area_struct
> *vma_modify_policy(struct vma_iterator *vmi,
> struct vm_area_struct *prev,
> struct vm_area_struct *vma,
> unsigned long start, unsigned long end,
> - struct mempolicy *new_pol)
> -{
> - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
> - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> -}
> + struct mempolicy *new_pol);
>
> /* We are about to modify the VMA's flags and/or uffd context. */
> -static inline struct vm_area_struct
> +struct vm_area_struct
> *vma_modify_flags_uffd(struct vma_iterator *vmi,
> struct vm_area_struct *prev,
> struct vm_area_struct *vma,
> unsigned long start, unsigned long end,
> unsigned long new_flags,
> - struct vm_userfaultfd_ctx new_ctx)
> -{
> - return vma_modify(vmi, prev, vma, start, end, new_flags,
> - vma_policy(vma), new_ctx, anon_vma_name(vma));
> -}
> + struct vm_userfaultfd_ctx new_ctx);
>
> struct vm_area_struct
> *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
> --
> 2.45.2
>
^ permalink raw reply [flat|nested] 53+ messages in thread

* Re: [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters
2024-08-08 20:07 ` Liam R. Howlett
@ 2024-08-09 10:11 ` Lorenzo Stoakes
0 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-09 10:11 UTC (permalink / raw)
To: Liam R. Howlett, linux-mm, linux-kernel, Andrew Morton, Vlastimil Babka
On Thu, Aug 08, 2024 at 04:07:15PM GMT, Liam R. Howlett wrote:
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240805 08:14]:
> > Rather than passing around huge numbers of parameters to numerous helper
> > functions, abstract them into a single struct that we thread through the
> > operation.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > mm/mmap.c | 76 ++++++++------
> > mm/vma.c | 297 ++++++++++++++++++++++++++++++++++++++----------------
> > mm/vma.h | 92 ++++++++---------
> > 3 files changed, 294 insertions(+), 171 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 4a9c2329b09a..f931000c561f 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1369,9 +1369,16 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > unsigned long end = addr + len;
> > unsigned long merge_start = addr, merge_end = end;
> > bool writable_file_mapping = false;
> > - pgoff_t vm_pgoff;
> > int error;
> > VMA_ITERATOR(vmi, mm, addr);
> > + struct vma_merge_struct vmg = {
> > + .vmi = &vmi,
> > + .start = addr,
> > + .end = end,
> > + .flags = vm_flags,
> > + .pgoff = pgoff,
> > + .file = file,
> > + };
> >
> > /* Check against address space limit. */
> > if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> > @@ -1405,8 +1412,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > vm_flags |= VM_ACCOUNT;
> > }
> >
> > - next = vma_next(&vmi);
> > - prev = vma_prev(&vmi);
> > + next = vmg.next = vma_next(&vmi);
> > + prev = vmg.prev = vma_prev(&vmi);
> > if (vm_flags & VM_SPECIAL) {
> > if (prev)
> > vma_iter_next_range(&vmi);
> > @@ -1416,29 +1423,30 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > /* Attempt to expand an old mapping */
> > /* Check next */
> > if (next && next->vm_start == end && !vma_policy(next) &&
> > - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
> > - NULL_VM_UFFD_CTX, NULL)) {
> > + can_vma_merge_before(&vmg)) {
> > merge_end = next->vm_end;
> > vma = next;
> > - vm_pgoff = next->vm_pgoff - pglen;
> > + vmg.pgoff = next->vm_pgoff - pglen;
> > + }
> > +
> > + if (vma) {
> > + vmg.anon_vma = vma->anon_vma;
> > + vmg.uffd_ctx = vma->vm_userfaultfd_ctx;
> > }
> >
> > /* Check prev */
> > if (prev && prev->vm_end == addr && !vma_policy(prev) &&
> > - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
> > - pgoff, vma->vm_userfaultfd_ctx, NULL) :
> > - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
> > - NULL_VM_UFFD_CTX, NULL))) {
> > + can_vma_merge_after(&vmg)) {
> > merge_start = prev->vm_start;
> > vma = prev;
> > - vm_pgoff = prev->vm_pgoff;
> > + vmg.pgoff = prev->vm_pgoff;
> > } else if (prev) {
> > vma_iter_next_range(&vmi);
> > }
> >
> > /* Actually expand, if possible */
> > if (vma &&
> > - !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
> > + !vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) {
> > khugepaged_enter_vma(vma, vm_flags);
> > goto expanded;
> > }
> > @@ -1790,25 +1798,31 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > * Expand the existing vma if possible; Note that singular lists do not
> > * occur after forking, so the expand will only happen on new VMAs.
> > */
> > - if (vma && vma->vm_end == addr && !vma_policy(vma) &&
> > - can_vma_merge_after(vma, flags, NULL, NULL,
> > - addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
> > - vma_iter_config(vmi, vma->vm_start, addr + len);
> > - if (vma_iter_prealloc(vmi, vma))
> > - goto unacct_fail;
> > -
> > - vma_start_write(vma);
> > -
> > - init_vma_prep(&vp, vma);
> > - vma_prepare(&vp);
> > - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > - vma->vm_end = addr + len;
> > - vm_flags_set(vma, VM_SOFTDIRTY);
> > - vma_iter_store(vmi, vma);
> > -
> > - vma_complete(&vp, vmi, mm);
> > - khugepaged_enter_vma(vma, flags);
> > - goto out;
> > + if (vma && vma->vm_end == addr && !vma_policy(vma)) {
> > + struct vma_merge_struct vmg = {
> > + .prev = vma,
> > + .flags = flags,
> > + .pgoff = addr >> PAGE_SHIFT,
> > + };
>
> I see that this struct here makes sense later.
>
> > +
> > + if (can_vma_merge_after(&vmg)) {
> > + vma_iter_config(vmi, vma->vm_start, addr + len);
> > + if (vma_iter_prealloc(vmi, vma))
> > + goto unacct_fail;
> > +
> > + vma_start_write(vma);
> > +
> > + init_vma_prep(&vp, vma);
> > + vma_prepare(&vp);
> > + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > + vma->vm_end = addr + len;
> > + vm_flags_set(vma, VM_SOFTDIRTY);
> > + vma_iter_store(vmi, vma);
> > +
> > + vma_complete(&vp, vmi, mm);
> > + khugepaged_enter_vma(vma, flags);
> > + goto out;
> > + }
> > }
> >
> > if (vma)
> > diff --git a/mm/vma.c b/mm/vma.c
> > index bf0546fe6eab..20c4ce7712c0 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -7,16 +7,18 @@
> > #include "vma_internal.h"
> > #include "vma.h"
> >
> > -/*
> > - * If the vma has a ->close operation then the driver probably needs to release
> > - * per-vma resources, so we don't attempt to merge those if the caller indicates
> > - * the current vma may be removed as part of the merge.
> > - */
> > -static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> > - struct file *file, unsigned long vm_flags,
> > - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name, bool may_remove_vma)
> > +static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
> > {
> > + struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
> > + /*
> > + * If the vma has a ->close operation then the driver probably needs to
> > + * release per-vma resources, so we don't attempt to merge those if the
> > + * caller indicates the current vma may be removed as part of the merge,
> > + * which is the case if we are attempting to merge the next VMA into
> > + * this one.
> > + */
> > + bool may_remove_vma = merge_next;
> > +
> > /*
> > * VM_SOFTDIRTY should not prevent from VMA merging, if we
> > * match the flags but dirty bit -- the caller should mark
> > @@ -25,15 +27,15 @@ static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> > * the kernel to generate new VMAs when old one could be
> > * extended instead.
> > */
> > - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
> > + if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
> > return false;
> > - if (vma->vm_file != file)
> > + if (vma->vm_file != vmg->file)
> > return false;
> > if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
> > return false;
> > - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
> > + if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
> > return false;
> > - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
> > + if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
> > return false;
> > return true;
> > }
> > @@ -94,16 +96,16 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> > * We assume the vma may be removed as part of the merge.
> > */
> > bool
> > -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > - struct anon_vma *anon_vma, struct file *file,
> > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name)
> > +can_vma_merge_before(struct vma_merge_struct *vmg)
> > {
> > - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
> > - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> > - if (vma->vm_pgoff == vm_pgoff)
> > + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> > +
> > + if (is_mergeable_vma(vmg, true) &&
> > + is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
> > + if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
> > return true;
> > }
> > +
> > return false;
> > }
> >
> > @@ -116,18 +118,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > *
> > * We assume that vma is not removed as part of the merge.
> > */
> > -bool
> > -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> > - struct anon_vma *anon_vma, struct file *file,
> > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name)
> > +bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > {
> > - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
> > - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> > - pgoff_t vm_pglen;
> > -
> > - vm_pglen = vma_pages(vma);
> > - if (vma->vm_pgoff + vm_pglen == vm_pgoff)
> > + if (is_mergeable_vma(vmg, false) &&
> > + is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> > + if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
> > return true;
> > }
> > return false;
> > @@ -180,7 +175,7 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > * VMA Iterator will point to the end VMA.
> > */
> > static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > - unsigned long addr, int new_below)
> > + unsigned long addr, bool new_below)
> > {
> > struct vma_prepare vp;
> > struct vm_area_struct *new;
> > @@ -261,13 +256,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > * Split a vma into two pieces at address 'addr', a new vma is allocated
> > * either for the first part or the tail.
> > */
> > -static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > - unsigned long addr, int new_below)
> > +static int split_vma(struct vma_merge_struct *vmg, bool new_below)
> > {
> > - if (vma->vm_mm->map_count >= sysctl_max_map_count)
> > + if (vmg->vma->vm_mm->map_count >= sysctl_max_map_count)
> > return -ENOMEM;
> >
> > - return __split_vma(vmi, vma, addr, new_below);
> > + return __split_vma(vmg->vmi, vmg->vma,
> > + new_below ? vmg->start : vmg->end,
> > + new_below);
>
> Why did this get worse?
> Maybe split the __split_vma changes out of this patch? I mean, split is
> used for more than just merging; it's used for unmapping as well.
> Changing argument types in __split_vma() seems unrelated to the main
> focus of this patch.
>
Yeah, on second thoughts this isn't an improvement; I'll revert this part
of the change.
This was mostly me trying to thread it through as much as possible, but in
the split_vma() case (note I don't touch __split_vma()) it doesn't make as
much sense.
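So roughly back to (modulo whether the int -> bool tweak to new_below
stays — the code below is lifted straight from the removed lines in the
diff, with only that tweak kept):

	static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
			     unsigned long addr, bool new_below)
	{
		if (vma->vm_mm->map_count >= sysctl_max_map_count)
			return -ENOMEM;

		return __split_vma(vmi, vma, addr, new_below);
	}

with callers passing the relevant vmg->start / vmg->end address
explicitly again.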
> > }
> >
> > /*
> > @@ -712,7 +708,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
> > goto map_count_exceeded;
> >
> > - error = __split_vma(vmi, vma, start, 1);
> > + error = __split_vma(vmi, vma, start, true);
> > if (error)
> > goto start_split_failed;
> > }
> > @@ -725,7 +721,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > do {
> > /* Does it split the end? */
> > if (next->vm_end > end) {
> > - error = __split_vma(vmi, next, end, 0);
> > + error = __split_vma(vmi, next, end, false);
> > if (error)
> > goto end_split_failed;
> > }
> > @@ -934,16 +930,10 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> > * **** is not represented - it will be merged and the vma containing the
> > * area is returned, or the function will return NULL
> > */
> > -static struct vm_area_struct
> > -*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > - struct vm_area_struct *src, unsigned long addr, unsigned long end,
> > - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
> > - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name)
> > +static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
> > {
> > - struct mm_struct *mm = src->vm_mm;
> > - struct anon_vma *anon_vma = src->anon_vma;
> > - struct file *file = src->vm_file;
> > + struct mm_struct *mm = container_of(vmg->vmi->mas.tree, struct mm_struct, mm_mt);
>
> No. current->mm is way better, or put it in that vmg thing.
Ack, will use current->mm. This is in any case temporary, as I drop
vma_merge() altogether later.
>
> > + struct vm_area_struct *prev = vmg->prev;
> > struct vm_area_struct *curr, *next, *res;
> > struct vm_area_struct *vma, *adjust, *remove, *remove2;
> > struct vm_area_struct *anon_dup = NULL;
> > @@ -953,16 +943,18 @@ static struct vm_area_struct
> > bool merge_prev = false;
> > bool merge_next = false;
> > bool vma_expanded = false;
> > + unsigned long addr = vmg->start;
> > + unsigned long end = vmg->end;
> > unsigned long vma_start = addr;
> > unsigned long vma_end = end;
> > - pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
> > + pgoff_t pglen = PHYS_PFN(end - addr);
> > long adj_start = 0;
> >
> > /*
> > * We later require that vma->vm_flags == vm_flags,
> > * so this tests vma->vm_flags & VM_SPECIAL, too.
> > */
> > - if (vm_flags & VM_SPECIAL)
> > + if (vmg->flags & VM_SPECIAL)
> > return NULL;
> >
> > /* Does the input range span an existing VMA? (cases 5 - 8) */
> > @@ -970,27 +962,26 @@ static struct vm_area_struct
> >
> > if (!curr || /* cases 1 - 4 */
> > end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
> > - next = vma_lookup(mm, end);
> > + next = vmg->next = vma_lookup(mm, end);
> > else
> > - next = NULL; /* case 5 */
> > + next = vmg->next = NULL; /* case 5 */
> >
> > if (prev) {
> > vma_start = prev->vm_start;
> > vma_pgoff = prev->vm_pgoff;
> >
> > /* Can we merge the predecessor? */
> > - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
> > - && can_vma_merge_after(prev, vm_flags, anon_vma, file,
> > - pgoff, vm_userfaultfd_ctx, anon_name)) {
> > + if (addr == prev->vm_end && mpol_equal(vma_policy(prev), vmg->policy)
> > + && can_vma_merge_after(vmg)) {
> > +
> > merge_prev = true;
> > - vma_prev(vmi);
> > + vma_prev(vmg->vmi);
> > }
> > }
> >
> > /* Can we merge the successor? */
> > - if (next && mpol_equal(policy, vma_policy(next)) &&
> > - can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
> > - vm_userfaultfd_ctx, anon_name)) {
> > + if (next && mpol_equal(vmg->policy, vma_policy(next)) &&
> > + can_vma_merge_before(vmg)) {
> > merge_next = true;
> > }
> >
> > @@ -1041,7 +1032,7 @@ static struct vm_area_struct
> > remove = curr;
> > } else { /* case 5 */
> > adjust = curr;
> > - adj_start = (end - curr->vm_start);
> > + adj_start = end - curr->vm_start;
> > }
> > if (!err)
> > err = dup_anon_vma(prev, curr, &anon_dup);
> > @@ -1081,13 +1072,13 @@ static struct vm_area_struct
> > vma_expanded = true;
> >
> > if (vma_expanded) {
> > - vma_iter_config(vmi, vma_start, vma_end);
> > + vma_iter_config(vmg->vmi, vma_start, vma_end);
> > } else {
> > - vma_iter_config(vmi, adjust->vm_start + adj_start,
> > + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> > adjust->vm_end);
> > }
> >
> > - if (vma_iter_prealloc(vmi, vma))
> > + if (vma_iter_prealloc(vmg->vmi, vma))
> > goto prealloc_fail;
> >
> > init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
> > @@ -1099,19 +1090,19 @@ static struct vm_area_struct
> > vma_set_range(vma, vma_start, vma_end, vma_pgoff);
> >
> > if (vma_expanded)
> > - vma_iter_store(vmi, vma);
> > + vma_iter_store(vmg->vmi, vma);
> >
> > if (adj_start) {
> > adjust->vm_start += adj_start;
> > adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
> > if (adj_start < 0) {
> > WARN_ON(vma_expanded);
> > - vma_iter_store(vmi, next);
> > + vma_iter_store(vmg->vmi, next);
> > }
> > }
> >
> > - vma_complete(&vp, vmi, mm);
> > - khugepaged_enter_vma(res, vm_flags);
> > + vma_complete(&vp, vmg->vmi, mm);
> > + khugepaged_enter_vma(res, vmg->flags);
> > return res;
> >
> > prealloc_fail:
> > @@ -1119,8 +1110,8 @@ static struct vm_area_struct
> > unlink_anon_vmas(anon_dup);
> >
> > anon_vma_fail:
> > - vma_iter_set(vmi, addr);
> > - vma_iter_load(vmi);
> > + vma_iter_set(vmg->vmi, addr);
> > + vma_iter_load(vmg->vmi);
> > return NULL;
> > }
> >
> > @@ -1137,38 +1128,141 @@ static struct vm_area_struct
> > * The function returns either the merged VMA, the original VMA if a split was
> > * required instead, or an error if the split failed.
> > */
> > -struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
> > - struct vm_area_struct *prev,
> > - struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end,
> > - unsigned long vm_flags,
> > - struct mempolicy *policy,
> > - struct vm_userfaultfd_ctx uffd_ctx,
> > - struct anon_vma_name *anon_name)
> > +static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
> > {
> > - pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > + struct vm_area_struct *vma = vmg->vma;
> > struct vm_area_struct *merged;
> >
> > - merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
> > - pgoff, policy, uffd_ctx, anon_name);
> > + /* First, try to merge. */
> > + merged = vma_merge(vmg);
> > if (merged)
> > return merged;
> >
> > - if (vma->vm_start < start) {
> > - int err = split_vma(vmi, vma, start, 1);
> > + /* Split any preceding portion of the VMA. */
> > + if (vma->vm_start < vmg->start) {
> > + int err = split_vma(vmg, true);
> >
> > if (err)
> > return ERR_PTR(err);
> > }
> >
> > - if (vma->vm_end > end) {
> > - int err = split_vma(vmi, vma, end, 0);
> > + /* Split any trailing portion of the VMA. */
> > + if (vma->vm_end > vmg->end) {
> > + int err = split_vma(vmg, false);
> >
> > if (err)
> > return ERR_PTR(err);
> > }
> >
> > - return vma;
> > + return vmg->vma;
> > +}
> > +
> > +/* Assumes addr >= vma->vm_start. */
> > +static pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr)
> > +{
> > + return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
> > +}
> > +
>
> Could we do something like the MA_STATE() in the maple_tree.h for the
> below?
Yeah, Vlasta suggested the same thing; the main sticking point is that the
parameters vary slightly, but I think we can probably rely on the compiler
to handle something like:
struct vma_merge_struct vmg = VMG_STATE(vmi, prev, vma);
vmg.flags = new_flags;
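To sketch what I mean (purely illustrative - no such macro exists yet, and
the field list is just the vma_merge_struct one this series introduces):

#define VMG_STATE(_vmi, _prev, _vma)				\
	(struct vma_merge_struct) {				\
		.vmi = (_vmi),					\
		.prev = (_prev),				\
		.vma = (_vma),					\
		.start = (_vma)->vm_start,			\
		.end = (_vma)->vm_end,				\
		.flags = (_vma)->vm_flags,			\
		.pgoff = (_vma)->vm_pgoff,			\
		.file = (_vma)->vm_file,			\
		.anon_vma = (_vma)->anon_vma,			\
		.policy = vma_policy(_vma),			\
		.uffd_ctx = (_vma)->vm_userfaultfd_ctx,		\
		.anon_name = anon_vma_name(_vma),		\
	}

Callers would then override whichever fields differ (start/end/pgoff/flags
etc.), relying on the compiler to elide the redundant stores.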
>
> > +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + unsigned long new_flags)
> > +{
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = new_flags,
> > + .pgoff = vma_pgoff_offset(vma, start),
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> > +
> > + return vma_modify(&vmg);
> > +}
> > +
> > +struct vm_area_struct
> > +*vma_modify_flags_name(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start,
> > + unsigned long end,
> > + unsigned long new_flags,
> > + struct anon_vma_name *new_name)
> > +{
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = new_flags,
> > + .pgoff = vma_pgoff_offset(vma, start),
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = new_name,
> > + };
> > +
> > + return vma_modify(&vmg);
> > +}
> > +
> > +struct vm_area_struct
> > +*vma_modify_policy(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + struct mempolicy *new_pol)
> > +{
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = vma->vm_flags,
> > + .pgoff = vma_pgoff_offset(vma, start),
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = new_pol,
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> > +
> > + return vma_modify(&vmg);
> > +}
> > +
> > +struct vm_area_struct
> > +*vma_modify_flags_uffd(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + unsigned long new_flags,
> > + struct vm_userfaultfd_ctx new_ctx)
> > +{
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = new_flags,
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .pgoff = vma_pgoff_offset(vma, start),
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = new_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> > +
> > + return vma_modify(&vmg);
> > }
> >
> > /*
> > @@ -1180,8 +1274,22 @@ struct vm_area_struct
> > struct vm_area_struct *vma, unsigned long start,
> > unsigned long end, pgoff_t pgoff)
> > {
> > - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
> > - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = prev,
> > + .vma = vma,
> > + .start = start,
> > + .end = end,
> > + .flags = vma->vm_flags,
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .pgoff = pgoff,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> > +
> > + return vma_merge(&vmg);
> > }
> >
> > /*
> > @@ -1193,11 +1301,22 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> > unsigned long delta)
> > {
> > pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
> > + struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > + .prev = vma,
> > + .vma = vma,
> > + .start = vma->vm_end,
> > + .end = vma->vm_end + delta,
> > + .flags = vma->vm_flags,
> > + .file = vma->vm_file,
> > + .pgoff = pgoff,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> >
> > /* vma is specified as prev, so case 1 or 2 will apply. */
> > - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
> > - vma->vm_flags, pgoff, vma_policy(vma),
> > - vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > + return vma_merge(&vmg);
> > }
> >
> > void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> > diff --git a/mm/vma.h b/mm/vma.h
> > index 6efdf1768a0a..c31684cc1da6 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -26,6 +26,23 @@ struct unlink_vma_file_batch {
> > struct vm_area_struct *vmas[8];
> > };
> >
> > +/* Represents a VMA merge operation. */
> > +struct vma_merge_struct {
> > + struct vma_iterator *vmi;
> > + struct vm_area_struct *prev;
> > + struct vm_area_struct *next; /* Modified by vma_merge(). */
> > + struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */
> > + unsigned long start;
> > + unsigned long end;
> > + unsigned long flags;
> > + pgoff_t pgoff;
> > + struct file *file;
> > + struct anon_vma *anon_vma;
> > + struct mempolicy *policy;
> > + struct vm_userfaultfd_ctx uffd_ctx;
> > + struct anon_vma_name *anon_name;
> > +};
> > +
> > #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
> > void validate_mm(struct mm_struct *mm);
> > #else
> > @@ -72,80 +89,53 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > struct vm_area_struct *next, unsigned long start,
> > unsigned long end, unsigned long tree_end, bool mm_wr_locked);
> >
> > -/* Required by mmap_region(). */
> > -bool
> > -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > - struct anon_vma *anon_vma, struct file *file,
> > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name);
> > -
> > -/* Required by mmap_region() and do_brk_flags(). */
> > -bool
> > -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> > - struct anon_vma *anon_vma, struct file *file,
> > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name);
> > -
> > -struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
> > - struct vm_area_struct *prev,
> > - struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end,
> > - unsigned long vm_flags,
> > - struct mempolicy *policy,
> > - struct vm_userfaultfd_ctx uffd_ctx,
> > - struct anon_vma_name *anon_name);
> > +/*
> > + * Can we merge the VMA described by vmg into the following VMA vmg->next?
> > + *
> > + * Required by mmap_region().
> > + */
> > +bool can_vma_merge_before(struct vma_merge_struct *vmg);
> > +
> > +/*
> > + * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> > + *
> > + * Required by mmap_region() and do_brk_flags().
> > + */
> > +bool can_vma_merge_after(struct vma_merge_struct *vmg);
> >
> > /* We are about to modify the VMA's flags. */
> > -static inline struct vm_area_struct
> > -*vma_modify_flags(struct vma_iterator *vmi,
> > - struct vm_area_struct *prev,
> > - struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end,
> > - unsigned long new_flags)
> > -{
> > - return vma_modify(vmi, prev, vma, start, end, new_flags,
> > - vma_policy(vma), vma->vm_userfaultfd_ctx,
> > - anon_vma_name(vma));
> > -}
> > +struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + unsigned long new_flags);
>
> Functions with an insane number of arguments can be written like this:
>
> struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> struct vm_area_struct *prev, struct vm_area_struct *vma,
> unsigned long start, unsigned long end,
> unsigned long new_flags);
>
> The advantage is that more of the insane arguments are on the same line,
> and if the name changes it will limit the impact.
Ack will adjust
>
> >
> > /* We are about to modify the VMA's flags and/or anon_name. */
> > -static inline struct vm_area_struct
> > +struct vm_area_struct
> > *vma_modify_flags_name(struct vma_iterator *vmi,
> > struct vm_area_struct *prev,
> > struct vm_area_struct *vma,
> > unsigned long start,
> > unsigned long end,
> > unsigned long new_flags,
> > - struct anon_vma_name *new_name)
> > -{
> > - return vma_modify(vmi, prev, vma, start, end, new_flags,
> > - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
> > -}
> > + struct anon_vma_name *new_name);
> >
> > /* We are about to modify the VMA's memory policy. */
> > -static inline struct vm_area_struct
> > +struct vm_area_struct
> > *vma_modify_policy(struct vma_iterator *vmi,
> > struct vm_area_struct *prev,
> > struct vm_area_struct *vma,
> > unsigned long start, unsigned long end,
> > - struct mempolicy *new_pol)
> > -{
> > - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
> > - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > -}
> > + struct mempolicy *new_pol);
> >
> > /* We are about to modify the VMA's flags and/or uffd context. */
> > -static inline struct vm_area_struct
> > +struct vm_area_struct
> > *vma_modify_flags_uffd(struct vma_iterator *vmi,
> > struct vm_area_struct *prev,
> > struct vm_area_struct *vma,
> > unsigned long start, unsigned long end,
> > unsigned long new_flags,
> > - struct vm_userfaultfd_ctx new_ctx)
> > -{
> > - return vma_modify(vmi, prev, vma, start, end, new_flags,
> > - vma_policy(vma), new_ctx, anon_vma_name(vma));
> > -}
> > + struct vm_userfaultfd_ctx new_ctx);
> >
> > struct vm_area_struct
> > *vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > --
> > 2.45.2
> >
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 03/10] mm: abstract duplicated policy comparison
2024-08-05 12:13 [PATCH 00/10] mm: remove vma_merge() Lorenzo Stoakes
2024-08-05 12:13 ` [PATCH 01/10] tools: improve vma test Makefile Lorenzo Stoakes
2024-08-05 12:13 ` [PATCH 02/10] mm: introduce vma_merge_struct and abstract merge parameters Lorenzo Stoakes
@ 2024-08-05 12:13 ` Lorenzo Stoakes
2024-08-06 12:50 ` Petr Tesařík
2024-08-05 12:13 ` [PATCH 04/10] mm: abstract parameters for vma_expand/shrink() Lorenzo Stoakes
` (6 subsequent siblings)
9 siblings, 1 reply; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-05 12:13 UTC (permalink / raw)
To: linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett, Vlastimil Babka
Both can_vma_merge_before() and can_vma_merge_after() are invoked after
checking for compatible VMA NUMA policy, so we can simply move this check
into is_mergeable_vma() and abstract it altogether.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/mmap.c | 8 +++-----
mm/vma.c | 9 ++++-----
2 files changed, 7 insertions(+), 10 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index f931000c561f..721ced6e37b0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1422,8 +1422,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/* Attempt to expand an old mapping */
/* Check next */
- if (next && next->vm_start == end && !vma_policy(next) &&
- can_vma_merge_before(&vmg)) {
+ if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
merge_end = next->vm_end;
vma = next;
vmg.pgoff = next->vm_pgoff - pglen;
@@ -1435,8 +1434,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}
/* Check prev */
- if (prev && prev->vm_end == addr && !vma_policy(prev) &&
- can_vma_merge_after(&vmg)) {
+ if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
merge_start = prev->vm_start;
vma = prev;
vmg.pgoff = prev->vm_pgoff;
@@ -1798,7 +1796,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
* Expand the existing vma if possible; Note that singular lists do not
* occur after forking, so the expand will only happen on new VMAs.
*/
- if (vma && vma->vm_end == addr && !vma_policy(vma)) {
+ if (vma && vma->vm_end == addr) {
struct vma_merge_struct vmg = {
.prev = vma,
.flags = flags,
diff --git a/mm/vma.c b/mm/vma.c
index 20c4ce7712c0..b452b472a085 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -19,6 +19,8 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
*/
bool may_remove_vma = merge_next;
+ if (!mpol_equal(vmg->policy, vma_policy(vma)))
+ return false;
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
* match the flags but dirty bit -- the caller should mark
@@ -971,17 +973,14 @@ static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
vma_pgoff = prev->vm_pgoff;
/* Can we merge the predecessor? */
- if (addr == prev->vm_end && mpol_equal(vma_policy(prev), vmg->policy)
- && can_vma_merge_after(vmg)) {
-
+ if (addr == prev->vm_end && can_vma_merge_after(vmg)) {
merge_prev = true;
vma_prev(vmg->vmi);
}
}
/* Can we merge the successor? */
- if (next && mpol_equal(vmg->policy, vma_policy(next)) &&
- can_vma_merge_before(vmg)) {
+ if (next && can_vma_merge_before(vmg)) {
merge_next = true;
}
--
2.45.2
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 03/10] mm: abstract duplicated policy comparison
2024-08-05 12:13 ` [PATCH 03/10] mm: abstract duplicated policy comparison Lorenzo Stoakes
@ 2024-08-06 12:50 ` Petr Tesařík
0 siblings, 0 replies; 53+ messages in thread
From: Petr Tesařík @ 2024-08-06 12:50 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Mon, 5 Aug 2024 13:13:50 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> Both can_vma_merge_before() and can_vma_merge_after() are invoked after
> checking for compatible VMA NUMA policy, so we can simply move this check
> into is_mergeable_vma() and abstract it altogether.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/mmap.c | 8 +++-----
> mm/vma.c | 9 ++++-----
> 2 files changed, 7 insertions(+), 10 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index f931000c561f..721ced6e37b0 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1422,8 +1422,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>
> /* Attempt to expand an old mapping */
> /* Check next */
> - if (next && next->vm_start == end && !vma_policy(next) &&
> - can_vma_merge_before(&vmg)) {
> + if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> merge_end = next->vm_end;
> vma = next;
> vmg.pgoff = next->vm_pgoff - pglen;
> @@ -1435,8 +1434,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> }
>
> /* Check prev */
> - if (prev && prev->vm_end == addr && !vma_policy(prev) &&
> - can_vma_merge_after(&vmg)) {
> + if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> merge_start = prev->vm_start;
> vma = prev;
> vmg.pgoff = prev->vm_pgoff;
> @@ -1798,7 +1796,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> * Expand the existing vma if possible; Note that singular lists do not
> * occur after forking, so the expand will only happen on new VMAs.
> */
> - if (vma && vma->vm_end == addr && !vma_policy(vma)) {
> + if (vma && vma->vm_end == addr) {
> struct vma_merge_struct vmg = {
> .prev = vma,
> .flags = flags,
> diff --git a/mm/vma.c b/mm/vma.c
> index 20c4ce7712c0..b452b472a085 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -19,6 +19,8 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
> */
> bool may_remove_vma = merge_next;
>
> + if (!mpol_equal(vmg->policy, vma_policy(vma)))
> + return false;
> /*
> * VM_SOFTDIRTY should not prevent from VMA merging, if we
> * match the flags but dirty bit -- the caller should mark
> @@ -971,17 +973,14 @@ static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
> vma_pgoff = prev->vm_pgoff;
>
> /* Can we merge the predecessor? */
> - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), vmg->policy)
> - && can_vma_merge_after(vmg)) {
> -
> + if (addr == prev->vm_end && can_vma_merge_after(vmg)) {
> merge_prev = true;
> vma_prev(vmg->vmi);
> }
> }
>
> /* Can we merge the successor? */
> - if (next && mpol_equal(vmg->policy, vma_policy(next)) &&
> - can_vma_merge_before(vmg)) {
> + if (next && can_vma_merge_before(vmg)) {
> merge_next = true;
> }
>
Looks good.
Petr T
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 04/10] mm: abstract parameters for vma_expand/shrink()
2024-08-05 12:13 [PATCH 00/10] mm: remove vma_merge() Lorenzo Stoakes
` (2 preceding siblings ...)
2024-08-05 12:13 ` [PATCH 03/10] mm: abstract duplicated policy comparison Lorenzo Stoakes
@ 2024-08-05 12:13 ` Lorenzo Stoakes
2024-08-06 12:54 ` Petr Tesařík
[not found] ` <f12608ec-9c40-4977-a5a6-479f86b44e80@kernel.org>
2024-08-05 12:13 ` [PATCH 05/10] mm: abstract vma_merge_new_vma() to use vma_merge_struct Lorenzo Stoakes
` (5 subsequent siblings)
9 siblings, 2 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-05 12:13 UTC (permalink / raw)
To: linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett, Vlastimil Babka
Equally use struct vma_merge_struct to abstract parameters for VMA
expansion and shrinking.
This leads the way to further refactoring and de-duplication by
standardising the interface.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/mmap.c | 30 +++++++++++--------
mm/vma.c | 66 ++++++++++++++++++-----------------------
mm/vma.h | 8 ++---
tools/testing/vma/vma.c | 18 +++++++++--
4 files changed, 65 insertions(+), 57 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 721ced6e37b0..04145347c245 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1367,7 +1367,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
pgoff_t pglen = len >> PAGE_SHIFT;
unsigned long charged = 0;
unsigned long end = addr + len;
- unsigned long merge_start = addr, merge_end = end;
bool writable_file_mapping = false;
int error;
VMA_ITERATOR(vmi, mm, addr);
@@ -1423,28 +1422,26 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/* Attempt to expand an old mapping */
/* Check next */
if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
- merge_end = next->vm_end;
- vma = next;
+ /* We can adjust this as can_vma_merge_after() doesn't touch */
+ vmg.end = next->vm_end;
+ vma = vmg.vma = next;
vmg.pgoff = next->vm_pgoff - pglen;
- }
- if (vma) {
+ /* We may merge our NULL anon_vma with non-NULL in next. */
vmg.anon_vma = vma->anon_vma;
- vmg.uffd_ctx = vma->vm_userfaultfd_ctx;
}
/* Check prev */
if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
- merge_start = prev->vm_start;
- vma = prev;
+ vmg.start = prev->vm_start;
+ vma = vmg.vma = prev;
vmg.pgoff = prev->vm_pgoff;
} else if (prev) {
vma_iter_next_range(&vmi);
}
/* Actually expand, if possible */
- if (vma &&
- !vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) {
+ if (vma && !vma_expand(&vmg)) {
khugepaged_enter_vma(vma, vm_flags);
goto expanded;
}
@@ -2359,6 +2356,13 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
VMA_ITERATOR(vmi, mm, new_start);
struct vm_area_struct *next;
struct mmu_gather tlb;
+ struct vma_merge_struct vmg = {
+ .vmi = &vmi,
+ .vma = vma,
+ .start = new_start,
+ .end = old_end,
+ .pgoff = vma->vm_pgoff,
+ };
BUG_ON(new_start > new_end);
@@ -2373,7 +2377,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
/*
* cover the whole range: [new_start, old_end)
*/
- if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL))
+ if (vma_expand(&vmg))
return -ENOMEM;
/*
@@ -2406,6 +2410,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
tlb_finish_mmu(&tlb);
vma_prev(&vmi);
+ vmg.end = new_end;
+
/* Shrink the vma to just the new range */
- return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
+ return vma_shrink(&vmg);
}
diff --git a/mm/vma.c b/mm/vma.c
index b452b472a085..3d6ce04f1b9c 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -489,30 +489,25 @@ void validate_mm(struct mm_struct *mm)
/*
* vma_expand - Expand an existing VMA
*
- * @vmi: The vma iterator
- * @vma: The vma to expand
- * @start: The start of the vma
- * @end: The exclusive end of the vma
- * @pgoff: The page offset of vma
- * @next: The current of next vma.
+ * @vmg: Describes a VMA expansion operation.
*
- * Expand @vma to @start and @end. Can expand off the start and end. Will
- * expand over @next if it's different from @vma and @end == @next->vm_end.
- * Checking if the @vma can expand and merge with @next needs to be handled by
- * the caller.
+ * Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
+ * Will expand over vmg->next if it's different from vmg->vma and vmg->end ==
+ * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
+ * vmg->next needs to be handled by the caller.
*
* Returns: 0 on success
*/
-int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff,
- struct vm_area_struct *next)
+int vma_expand(struct vma_merge_struct *vmg)
{
struct vm_area_struct *anon_dup = NULL;
bool remove_next = false;
+ struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *next = vmg->next;
struct vma_prepare vp;
vma_start_write(vma);
- if (next && (vma != next) && (end == next->vm_end)) {
+ if (next && (vma != next) && (vmg->end == next->vm_end)) {
int ret;
remove_next = true;
@@ -525,21 +520,21 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
/* Not merging but overwriting any part of next is not handled. */
VM_WARN_ON(next && !vp.remove &&
- next != vma && end > next->vm_start);
+ next != vma && vmg->end > next->vm_start);
/* Only handles expanding */
- VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);
+ VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
/* Note: vma iterator must be pointing to 'start' */
- vma_iter_config(vmi, start, end);
- if (vma_iter_prealloc(vmi, vma))
+ vma_iter_config(vmg->vmi, vmg->start, vmg->end);
+ if (vma_iter_prealloc(vmg->vmi, vma))
goto nomem;
vma_prepare(&vp);
- vma_adjust_trans_huge(vma, start, end, 0);
- vma_set_range(vma, start, end, pgoff);
- vma_iter_store(vmi, vma);
+ vma_adjust_trans_huge(vma, vmg->start, vmg->end, 0);
+ vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
+ vma_iter_store(vmg->vmi, vma);
- vma_complete(&vp, vmi, vma->vm_mm);
+ vma_complete(&vp, vmg->vmi, vma->vm_mm);
return 0;
nomem:
@@ -550,37 +545,34 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
/*
* vma_shrink() - Reduce an existing VMAs memory area
- * @vmi: The vma iterator
- * @vma: The VMA to modify
- * @start: The new start
- * @end: The new end
+ * @vmg: Describes a VMA shrink operation.
*
* Returns: 0 on success, -ENOMEM otherwise
*/
-int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff)
+int vma_shrink(struct vma_merge_struct *vmg)
{
+ struct vm_area_struct *vma = vmg->vma;
struct vma_prepare vp;
- WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
+ WARN_ON((vma->vm_start != vmg->start) && (vma->vm_end != vmg->end));
- if (vma->vm_start < start)
- vma_iter_config(vmi, vma->vm_start, start);
+ if (vma->vm_start < vmg->start)
+ vma_iter_config(vmg->vmi, vma->vm_start, vmg->start);
else
- vma_iter_config(vmi, end, vma->vm_end);
+ vma_iter_config(vmg->vmi, vmg->end, vma->vm_end);
- if (vma_iter_prealloc(vmi, NULL))
+ if (vma_iter_prealloc(vmg->vmi, NULL))
return -ENOMEM;
vma_start_write(vma);
init_vma_prep(&vp, vma);
vma_prepare(&vp);
- vma_adjust_trans_huge(vma, start, end, 0);
+ vma_adjust_trans_huge(vma, vmg->start, vmg->end, 0);
- vma_iter_clear(vmi);
- vma_set_range(vma, start, end, pgoff);
- vma_complete(&vp, vmi, vma->vm_mm);
+ vma_iter_clear(vmg->vmi);
+ vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
+ vma_complete(&vp, vmg->vmi, vma->vm_mm);
return 0;
}
diff --git a/mm/vma.h b/mm/vma.h
index c31684cc1da6..c464d25da120 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -66,12 +66,8 @@ void init_vma_prep(struct vma_prepare *vp,
void vma_complete(struct vma_prepare *vp,
struct vma_iterator *vmi, struct mm_struct *mm);
-int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff,
- struct vm_area_struct *next);
-
-int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff);
+int vma_expand(struct vma_merge_struct *vmg);
+int vma_shrink(struct vma_merge_struct *vmg);
int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
index 48e033c60d87..d216e51206c1 100644
--- a/tools/testing/vma/vma.c
+++ b/tools/testing/vma/vma.c
@@ -142,10 +142,17 @@ static bool test_simple_expand(void)
struct mm_struct mm = {};
struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, flags);
VMA_ITERATOR(vmi, &mm, 0);
+ struct vma_merge_struct vmg = {
+ .vmi = &vmi,
+ .vma = vma,
+ .start = 0,
+ .end = 0x3000,
+ .pgoff = 0,
+ };
ASSERT_FALSE(vma_link(&mm, vma));
- ASSERT_FALSE(vma_expand(&vmi, vma, 0, 0x3000, 0, NULL));
+ ASSERT_FALSE(vma_expand(&vmg));
ASSERT_EQ(vma->vm_start, 0);
ASSERT_EQ(vma->vm_end, 0x3000);
@@ -163,10 +170,17 @@ static bool test_simple_shrink(void)
struct mm_struct mm = {};
struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, flags);
VMA_ITERATOR(vmi, &mm, 0);
+ struct vma_merge_struct vmg = {
+ .vmi = &vmi,
+ .vma = vma,
+ .start = 0,
+ .end = 0x1000,
+ .pgoff = 0,
+ };
ASSERT_FALSE(vma_link(&mm, vma));
- ASSERT_FALSE(vma_shrink(&vmi, vma, 0, 0x1000, 0));
+ ASSERT_FALSE(vma_shrink(&vmg));
ASSERT_EQ(vma->vm_start, 0);
ASSERT_EQ(vma->vm_end, 0x1000);
--
2.45.2
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 04/10] mm: abstract parameters for vma_expand/shrink()
2024-08-05 12:13 ` [PATCH 04/10] mm: abstract parameters for vma_expand/shrink() Lorenzo Stoakes
@ 2024-08-06 12:54 ` Petr Tesařík
[not found] ` <f12608ec-9c40-4977-a5a6-479f86b44e80@kernel.org>
1 sibling, 0 replies; 53+ messages in thread
From: Petr Tesařík @ 2024-08-06 12:54 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Mon, 5 Aug 2024 13:13:51 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> Equally use struct vma_merge_struct to abstract parameters for VMA
> expansion and shrinking.
>
> This leads the way to further refactoring and de-duplication by
> standardising the interface.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/mmap.c | 30 +++++++++++--------
> mm/vma.c | 66 ++++++++++++++++++-----------------------
> mm/vma.h | 8 ++---
> tools/testing/vma/vma.c | 18 +++++++++--
> 4 files changed, 65 insertions(+), 57 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 721ced6e37b0..04145347c245 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1367,7 +1367,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> pgoff_t pglen = len >> PAGE_SHIFT;
> unsigned long charged = 0;
> unsigned long end = addr + len;
> - unsigned long merge_start = addr, merge_end = end;
> bool writable_file_mapping = false;
> int error;
> VMA_ITERATOR(vmi, mm, addr);
> @@ -1423,28 +1422,26 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> /* Attempt to expand an old mapping */
> /* Check next */
> if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> - merge_end = next->vm_end;
> - vma = next;
> + /* We can adjust this as can_vma_merge_after() doesn't touch */
> + vmg.end = next->vm_end;
> + vma = vmg.vma = next;
> vmg.pgoff = next->vm_pgoff - pglen;
> - }
>
> - if (vma) {
> + /* We may merge our NULL anon_vma with non-NULL in next. */
> vmg.anon_vma = vma->anon_vma;
> - vmg.uffd_ctx = vma->vm_userfaultfd_ctx;
> }
>
> /* Check prev */
> if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> - merge_start = prev->vm_start;
> - vma = prev;
> + vmg.start = prev->vm_start;
> + vma = vmg.vma = prev;
> vmg.pgoff = prev->vm_pgoff;
> } else if (prev) {
> vma_iter_next_range(&vmi);
> }
>
> /* Actually expand, if possible */
> - if (vma &&
> - !vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) {
> + if (vma && !vma_expand(&vmg)) {
See? One more use of "next" that has gone away in the end...
Petr T
> khugepaged_enter_vma(vma, vm_flags);
> goto expanded;
> }
> @@ -2359,6 +2356,13 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> VMA_ITERATOR(vmi, mm, new_start);
> struct vm_area_struct *next;
> struct mmu_gather tlb;
> + struct vma_merge_struct vmg = {
> + .vmi = &vmi,
> + .vma = vma,
> + .start = new_start,
> + .end = old_end,
> + .pgoff = vma->vm_pgoff,
> + };
>
> BUG_ON(new_start > new_end);
>
> @@ -2373,7 +2377,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> /*
> * cover the whole range: [new_start, old_end)
> */
> - if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL))
> + if (vma_expand(&vmg))
> return -ENOMEM;
>
> /*
> @@ -2406,6 +2410,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> tlb_finish_mmu(&tlb);
>
> vma_prev(&vmi);
> + vmg.end = new_end;
> +
> /* Shrink the vma to just the new range */
> - return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> + return vma_shrink(&vmg);
> }
> diff --git a/mm/vma.c b/mm/vma.c
> index b452b472a085..3d6ce04f1b9c 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -489,30 +489,25 @@ void validate_mm(struct mm_struct *mm)
> /*
> * vma_expand - Expand an existing VMA
> *
> - * @vmi: The vma iterator
> - * @vma: The vma to expand
> - * @start: The start of the vma
> - * @end: The exclusive end of the vma
> - * @pgoff: The page offset of vma
> - * @next: The current of next vma.
> + * @vmg: Describes a VMA expansion operation.
> *
> - * Expand @vma to @start and @end. Can expand off the start and end. Will
> - * expand over @next if it's different from @vma and @end == @next->vm_end.
> - * Checking if the @vma can expand and merge with @next needs to be handled by
> - * the caller.
> + * Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
> + * Will expand over vmg->next if it's different from vmg->vma and vmg->end ==
> + * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
> + * vmg->next needs to be handled by the caller.
> *
> * Returns: 0 on success
> */
> -int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> - unsigned long start, unsigned long end, pgoff_t pgoff,
> - struct vm_area_struct *next)
> +int vma_expand(struct vma_merge_struct *vmg)
> {
> struct vm_area_struct *anon_dup = NULL;
> bool remove_next = false;
> + struct vm_area_struct *vma = vmg->vma;
> + struct vm_area_struct *next = vmg->next;
> struct vma_prepare vp;
>
> vma_start_write(vma);
> - if (next && (vma != next) && (end == next->vm_end)) {
> + if (next && (vma != next) && (vmg->end == next->vm_end)) {
> int ret;
>
> remove_next = true;
> @@ -525,21 +520,21 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
> /* Not merging but overwriting any part of next is not handled. */
> VM_WARN_ON(next && !vp.remove &&
> - next != vma && end > next->vm_start);
> + next != vma && vmg->end > next->vm_start);
> /* Only handles expanding */
> - VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);
> + VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
>
> /* Note: vma iterator must be pointing to 'start' */
> - vma_iter_config(vmi, start, end);
> - if (vma_iter_prealloc(vmi, vma))
> + vma_iter_config(vmg->vmi, vmg->start, vmg->end);
> + if (vma_iter_prealloc(vmg->vmi, vma))
> goto nomem;
>
> vma_prepare(&vp);
> - vma_adjust_trans_huge(vma, start, end, 0);
> - vma_set_range(vma, start, end, pgoff);
> - vma_iter_store(vmi, vma);
> + vma_adjust_trans_huge(vma, vmg->start, vmg->end, 0);
> + vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
> + vma_iter_store(vmg->vmi, vma);
>
> - vma_complete(&vp, vmi, vma->vm_mm);
> + vma_complete(&vp, vmg->vmi, vma->vm_mm);
> return 0;
>
> nomem:
> @@ -550,37 +545,34 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
>
> /*
> * vma_shrink() - Reduce an existing VMAs memory area
> - * @vmi: The vma iterator
> - * @vma: The VMA to modify
> - * @start: The new start
> - * @end: The new end
> + * @vmg: Describes a VMA shrink operation.
> *
> * Returns: 0 on success, -ENOMEM otherwise
> */
> -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
> - unsigned long start, unsigned long end, pgoff_t pgoff)
> +int vma_shrink(struct vma_merge_struct *vmg)
> {
> + struct vm_area_struct *vma = vmg->vma;
> struct vma_prepare vp;
>
> - WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
> + WARN_ON((vma->vm_start != vmg->start) && (vma->vm_end != vmg->end));
>
> - if (vma->vm_start < start)
> - vma_iter_config(vmi, vma->vm_start, start);
> + if (vma->vm_start < vmg->start)
> + vma_iter_config(vmg->vmi, vma->vm_start, vmg->start);
> else
> - vma_iter_config(vmi, end, vma->vm_end);
> + vma_iter_config(vmg->vmi, vmg->end, vma->vm_end);
>
> - if (vma_iter_prealloc(vmi, NULL))
> + if (vma_iter_prealloc(vmg->vmi, NULL))
> return -ENOMEM;
>
> vma_start_write(vma);
>
> init_vma_prep(&vp, vma);
> vma_prepare(&vp);
> - vma_adjust_trans_huge(vma, start, end, 0);
> + vma_adjust_trans_huge(vma, vmg->start, vmg->end, 0);
>
> - vma_iter_clear(vmi);
> - vma_set_range(vma, start, end, pgoff);
> - vma_complete(&vp, vmi, vma->vm_mm);
> + vma_iter_clear(vmg->vmi);
> + vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
> + vma_complete(&vp, vmg->vmi, vma->vm_mm);
> return 0;
> }
>
> diff --git a/mm/vma.h b/mm/vma.h
> index c31684cc1da6..c464d25da120 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -66,12 +66,8 @@ void init_vma_prep(struct vma_prepare *vp,
> void vma_complete(struct vma_prepare *vp,
> struct vma_iterator *vmi, struct mm_struct *mm);
>
> -int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> - unsigned long start, unsigned long end, pgoff_t pgoff,
> - struct vm_area_struct *next);
> -
> -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
> - unsigned long start, unsigned long end, pgoff_t pgoff);
> +int vma_expand(struct vma_merge_struct *vmg);
> +int vma_shrink(struct vma_merge_struct *vmg);
>
> int
> do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> index 48e033c60d87..d216e51206c1 100644
> --- a/tools/testing/vma/vma.c
> +++ b/tools/testing/vma/vma.c
> @@ -142,10 +142,17 @@ static bool test_simple_expand(void)
> struct mm_struct mm = {};
> struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, flags);
> VMA_ITERATOR(vmi, &mm, 0);
> + struct vma_merge_struct vmg = {
> + .vmi = &vmi,
> + .vma = vma,
> + .start = 0,
> + .end = 0x3000,
> + .pgoff = 0,
> + };
>
> ASSERT_FALSE(vma_link(&mm, vma));
>
> - ASSERT_FALSE(vma_expand(&vmi, vma, 0, 0x3000, 0, NULL));
> + ASSERT_FALSE(vma_expand(&vmg));
>
> ASSERT_EQ(vma->vm_start, 0);
> ASSERT_EQ(vma->vm_end, 0x3000);
> @@ -163,10 +170,17 @@ static bool test_simple_shrink(void)
> struct mm_struct mm = {};
> struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, flags);
> VMA_ITERATOR(vmi, &mm, 0);
> + struct vma_merge_struct vmg = {
> + .vmi = &vmi,
> + .vma = vma,
> + .start = 0,
> + .end = 0x1000,
> + .pgoff = 0,
> + };
>
> ASSERT_FALSE(vma_link(&mm, vma));
>
> - ASSERT_FALSE(vma_shrink(&vmi, vma, 0, 0x1000, 0));
> + ASSERT_FALSE(vma_shrink(&vmg));
>
> ASSERT_EQ(vma->vm_start, 0);
> ASSERT_EQ(vma->vm_end, 0x1000);
^ permalink raw reply [flat|nested] 53+ messages in thread
[parent not found: <f12608ec-9c40-4977-a5a6-479f86b44e80@kernel.org>]
* Re: [PATCH 04/10] mm: abstract parameters for vma_expand/shrink()
[not found] ` <f12608ec-9c40-4977-a5a6-479f86b44e80@kernel.org>
@ 2024-08-08 15:45 ` Lorenzo Stoakes
2024-08-08 20:20 ` Liam R. Howlett
2024-08-14 13:53 ` Lorenzo Stoakes
0 siblings, 2 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-08 15:45 UTC (permalink / raw)
To: Vlastimil Babka (SUSE)
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett
On Thu, Aug 08, 2024 at 04:20:26PM GMT, Vlastimil Babka (SUSE) wrote:
> On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > Equally use struct vma_merge_struct to abstract parameters for VMA
> > expansion and shrinking.
> >
> > This leads the way to further refactoring and de-duplication by
> > standardising the interface.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > mm/mmap.c | 30 +++++++++++--------
> > mm/vma.c | 66 ++++++++++++++++++-----------------------
> > mm/vma.h | 8 ++---
> > tools/testing/vma/vma.c | 18 +++++++++--
> > 4 files changed, 65 insertions(+), 57 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 721ced6e37b0..04145347c245 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1367,7 +1367,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > pgoff_t pglen = len >> PAGE_SHIFT;
> > unsigned long charged = 0;
> > unsigned long end = addr + len;
> > - unsigned long merge_start = addr, merge_end = end;
> > bool writable_file_mapping = false;
> > int error;
> > VMA_ITERATOR(vmi, mm, addr);
> > @@ -1423,28 +1422,26 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > /* Attempt to expand an old mapping */
> > /* Check next */
> > if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> > - merge_end = next->vm_end;
> > - vma = next;
> > + /* We can adjust this as can_vma_merge_after() doesn't touch */
> > + vmg.end = next->vm_end;
>
> Ugh, ok but wonder how fragile that is.
Yeah, you're right, this is a bit horrid; I'll find a way to make this less
brittle.
>
> > + vma = vmg.vma = next;
> > vmg.pgoff = next->vm_pgoff - pglen;
> > - }
> >
> > - if (vma) {
> > + /* We may merge our NULL anon_vma with non-NULL in next. */
>
> Hm now I realize the if (vma) block probably didn't need to be added in
> patch 2 only to be removed here; it could have been part of the if (next &&
> ...) block above already? Which is not that important, but...
You're right, will fix.
>
> > vmg.anon_vma = vma->anon_vma;
> > - vmg.uffd_ctx = vma->vm_userfaultfd_ctx;
>
> I don't see why it's now ok to remove this line? Was it intended? In patch 2
> it made sense to me to add it so the can_vma_merge_after() still has the
> right ctx for comparing, and this didn't change?
Yeah, yikes, I think I was lost in the maelstrom of considering edge cases,
and now this is broken for the whole prev vs. next uffd thing.
The fact the mmap stuff is not directly testable is a factor here.
TL;DR: I'll fix this, you're right.
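(Concretely, I think the fix is just to keep populating the field in the
next-merge branch too, i.e. retain something along the lines of:

	vmg.uffd_ctx = vma->vm_userfaultfd_ctx;

alongside the anon_vma assignment, so can_vma_merge_after() still compares
against the right uffd context - I'll settle the exact shape in the respin.)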
>
> > }
> >
> > /* Check prev */
> > if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> > - merge_start = prev->vm_start;
> > - vma = prev;
> > + vmg.start = prev->vm_start;
> > + vma = vmg.vma = prev;
> > vmg.pgoff = prev->vm_pgoff;
> > } else if (prev) {
> > vma_iter_next_range(&vmi);
> > }
> >
> > /* Actually expand, if possible */
> > - if (vma &&
> > - !vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) {
> > + if (vma && !vma_expand(&vmg)) {
> > khugepaged_enter_vma(vma, vm_flags);
> > goto expanded;
> > }
> > @@ -2359,6 +2356,13 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > VMA_ITERATOR(vmi, mm, new_start);
> > struct vm_area_struct *next;
> > struct mmu_gather tlb;
> > + struct vma_merge_struct vmg = {
> > + .vmi = &vmi,
> > + .vma = vma,
> > + .start = new_start,
> > + .end = old_end,
> > + .pgoff = vma->vm_pgoff,
> > + };
> >
> > BUG_ON(new_start > new_end);
> >
> > @@ -2373,7 +2377,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > /*
> > * cover the whole range: [new_start, old_end)
> > */
> > - if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL))
> > + if (vma_expand(&vmg))
> > return -ENOMEM;
> >
> > /*
> > @@ -2406,6 +2410,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > tlb_finish_mmu(&tlb);
> >
> > vma_prev(&vmi);
> > + vmg.end = new_end;
> > +
> > /* Shrink the vma to just the new range */
> > - return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> > + return vma_shrink(&vmg);
>
> The vma_shrink() doesn't seem to benefit that much from vmg conversion but I
> guess why not. Maybe this will further change anyway...
>
No it doesn't, but it's more about being consistent with vma_expand(). We
may want to find a way to unite them.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 04/10] mm: abstract parameters for vma_expand/shrink()
2024-08-08 15:45 ` Lorenzo Stoakes
@ 2024-08-08 20:20 ` Liam R. Howlett
2024-08-09 10:18 ` Lorenzo Stoakes
2024-08-14 13:53 ` Lorenzo Stoakes
1 sibling, 1 reply; 53+ messages in thread
From: Liam R. Howlett @ 2024-08-08 20:20 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Vlastimil Babka (SUSE), linux-mm, linux-kernel, Andrew Morton
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240808 11:46]:
> On Thu, Aug 08, 2024 at 04:20:26PM GMT, Vlastimil Babka (SUSE) wrote:
> > On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > > Equally use struct vma_merge_struct to abstract parameters for VMA
> > > expansion and shrinking.
> > >
> > > This leads the way to further refactoring and de-duplication by
> > > standardising the interface.
> > >
> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > ---
> > > mm/mmap.c | 30 +++++++++++--------
> > > mm/vma.c | 66 ++++++++++++++++++-----------------------
> > > mm/vma.h | 8 ++---
> > > tools/testing/vma/vma.c | 18 +++++++++--
> > > 4 files changed, 65 insertions(+), 57 deletions(-)
> > >
> > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > index 721ced6e37b0..04145347c245 100644
> > > --- a/mm/mmap.c
> > > +++ b/mm/mmap.c
> > > @@ -1367,7 +1367,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > pgoff_t pglen = len >> PAGE_SHIFT;
> > > unsigned long charged = 0;
> > > unsigned long end = addr + len;
> > > - unsigned long merge_start = addr, merge_end = end;
> > > bool writable_file_mapping = false;
> > > int error;
> > > VMA_ITERATOR(vmi, mm, addr);
> > > @@ -1423,28 +1422,26 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > /* Attempt to expand an old mapping */
> > > /* Check next */
> > > if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> > > - merge_end = next->vm_end;
> > > - vma = next;
> > > + /* We can adjust this as can_vma_merge_after() doesn't touch */
> > > + vmg.end = next->vm_end;
> >
> > Ugh, ok but wonder how fragile that is.
>
> Yeah, you're right, this is a bit horrid; I'll find a way to make this less
> brittle.
>
> >
> > > + vma = vmg.vma = next;
> > > vmg.pgoff = next->vm_pgoff - pglen;
> > > - }
> > >
> > > - if (vma) {
> > > + /* We may merge our NULL anon_vma with non-NULL in next. */
> >
> > Hm now I realize the if (vma) block probably didn't need to be added in
> > patch 2 only to be removed here; it could have been part of the if (next &&
> > ...) block above already? Which is not that important, but...
>
> You're right, will fix.
>
> >
> > > vmg.anon_vma = vma->anon_vma;
> > > - vmg.uffd_ctx = vma->vm_userfaultfd_ctx;
> >
> > I don't see why it's now ok to remove this line? Was it intended? In patch 2
> > it made sense to me to add it so the can_vma_merge_after() still has the
> > right ctx for comparing, and this didn't change?
>
> Yeah, yikes, I think I was lost in the maelstrom of considering edge cases,
> and now this is broken for the whole prev vs. next uffd thing.
>
> The fact the mmap stuff is not directly testable is a factor here.
>
> TL;DR: I'll fix this, you're right.
>
> >
> > > }
> > >
> > > /* Check prev */
> > > if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> > > - merge_start = prev->vm_start;
> > > - vma = prev;
> > > + vmg.start = prev->vm_start;
> > > + vma = vmg.vma = prev;
> > > vmg.pgoff = prev->vm_pgoff;
> > > } else if (prev) {
> > > vma_iter_next_range(&vmi);
> > > }
> > >
> > > /* Actually expand, if possible */
> > > - if (vma &&
> > > - !vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) {
> > > + if (vma && !vma_expand(&vmg)) {
> > > khugepaged_enter_vma(vma, vm_flags);
> > > goto expanded;
> > > }
> > > @@ -2359,6 +2356,13 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > > VMA_ITERATOR(vmi, mm, new_start);
> > > struct vm_area_struct *next;
> > > struct mmu_gather tlb;
> > > + struct vma_merge_struct vmg = {
> > > + .vmi = &vmi,
> > > + .vma = vma,
> > > + .start = new_start,
> > > + .end = old_end,
> > > + .pgoff = vma->vm_pgoff,
> > > + };
> > >
> > > BUG_ON(new_start > new_end);
> > >
> > > @@ -2373,7 +2377,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > > /*
> > > * cover the whole range: [new_start, old_end)
> > > */
> > > - if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL))
> > > + if (vma_expand(&vmg))
> > > return -ENOMEM;
> > >
> > > /*
> > > @@ -2406,6 +2410,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > > tlb_finish_mmu(&tlb);
> > >
> > > vma_prev(&vmi);
> > > + vmg.end = new_end;
> > > +
> > > /* Shrink the vma to just the new range */
> > > - return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> > > + return vma_shrink(&vmg);
> >
> > The vma_shrink() doesn't seem to benefit that much from vmg conversion but I
> > guess why not. Maybe this will further change anyway...
> >
>
> No it doesn't, but it's more about being consistent with vma_expand(). We
> may want to find a way to unite them.
No, we probably should not unite them - the shrink happens in a single
place on setup.
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 04/10] mm: abstract parameters for vma_expand/shrink()
2024-08-08 20:20 ` Liam R. Howlett
@ 2024-08-09 10:18 ` Lorenzo Stoakes
0 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-09 10:18 UTC (permalink / raw)
To: Liam R. Howlett, Vlastimil Babka (SUSE),
linux-mm, linux-kernel, Andrew Morton
On Thu, Aug 08, 2024 at 04:20:57PM GMT, Liam R. Howlett wrote:
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240808 11:46]:
> > On Thu, Aug 08, 2024 at 04:20:26PM GMT, Vlastimil Babka (SUSE) wrote:
> > > On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > > > Equally use struct vma_merge_struct to abstract parameters for VMA
> > > > expansion and shrinking.
> > > >
> > > > This leads the way to further refactoring and de-duplication by
> > > > standardising the interface.
> > > >
> > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > > ---
> > > > mm/mmap.c | 30 +++++++++++--------
> > > > mm/vma.c | 66 ++++++++++++++++++-----------------------
> > > > mm/vma.h | 8 ++---
> > > > tools/testing/vma/vma.c | 18 +++++++++--
> > > > 4 files changed, 65 insertions(+), 57 deletions(-)
> > > >
> > > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > > index 721ced6e37b0..04145347c245 100644
> > > > --- a/mm/mmap.c
> > > > +++ b/mm/mmap.c
> > > > @@ -1367,7 +1367,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > > pgoff_t pglen = len >> PAGE_SHIFT;
> > > > unsigned long charged = 0;
> > > > unsigned long end = addr + len;
> > > > - unsigned long merge_start = addr, merge_end = end;
> > > > bool writable_file_mapping = false;
> > > > int error;
> > > > VMA_ITERATOR(vmi, mm, addr);
> > > > @@ -1423,28 +1422,26 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > > /* Attempt to expand an old mapping */
> > > > /* Check next */
> > > > if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> > > > - merge_end = next->vm_end;
> > > > - vma = next;
> > > > + /* We can adjust this as can_vma_merge_after() doesn't touch */
> > > > + vmg.end = next->vm_end;
> > >
> > > Ugh, ok but wonder how fragile that is.
> >
> > Yeah, you're right, this is a bit horrid; I'll find a way to make this less
> > brittle.
> >
> > >
> > > > + vma = vmg.vma = next;
> > > > vmg.pgoff = next->vm_pgoff - pglen;
> > > > - }
> > > >
> > > > - if (vma) {
> > > > + /* We may merge our NULL anon_vma with non-NULL in next. */
> > >
> > > Hm now I realize the if (vma) block probably didn't need to be added in
> > > patch 2 only to be removed here; it could have been part of the if (next &&
> > > ...) block above already? Which is not that important, but...
> >
> > You're right, will fix.
> >
> > >
> > > > vmg.anon_vma = vma->anon_vma;
> > > > - vmg.uffd_ctx = vma->vm_userfaultfd_ctx;
> > >
> > > I don't see why it's now ok to remove this line? Was it intended? In patch 2
> > > it made sense to me to add it so the can_vma_merge_after() still has the
> > > right ctx for comparing, and this didn't change?
> >
> > Yeah, yikes, I think I was lost in the maelstrom of considering edge cases,
> > and now this is broken for the whole prev vs. next uffd thing.
> >
> > The fact the mmap stuff is not directly testable is a factor here.
> >
> > TL;DR: I'll fix this, you're right.
> >
> > >
> > > > }
> > > >
> > > > /* Check prev */
> > > > if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> > > > - merge_start = prev->vm_start;
> > > > - vma = prev;
> > > > + vmg.start = prev->vm_start;
> > > > + vma = vmg.vma = prev;
> > > > vmg.pgoff = prev->vm_pgoff;
> > > > } else if (prev) {
> > > > vma_iter_next_range(&vmi);
> > > > }
> > > >
> > > > /* Actually expand, if possible */
> > > > - if (vma &&
> > > > - !vma_expand(&vmi, vma, merge_start, merge_end, vmg.pgoff, next)) {
> > > > + if (vma && !vma_expand(&vmg)) {
> > > > khugepaged_enter_vma(vma, vm_flags);
> > > > goto expanded;
> > > > }
> > > > @@ -2359,6 +2356,13 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > > > VMA_ITERATOR(vmi, mm, new_start);
> > > > struct vm_area_struct *next;
> > > > struct mmu_gather tlb;
> > > > + struct vma_merge_struct vmg = {
> > > > + .vmi = &vmi,
> > > > + .vma = vma,
> > > > + .start = new_start,
> > > > + .end = old_end,
> > > > + .pgoff = vma->vm_pgoff,
> > > > + };
> > > >
> > > > BUG_ON(new_start > new_end);
> > > >
> > > > @@ -2373,7 +2377,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > > > /*
> > > > * cover the whole range: [new_start, old_end)
> > > > */
> > > > - if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL))
> > > > + if (vma_expand(&vmg))
> > > > return -ENOMEM;
> > > >
> > > > /*
> > > > @@ -2406,6 +2410,8 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > > > tlb_finish_mmu(&tlb);
> > > >
> > > > vma_prev(&vmi);
> > > > + vmg.end = new_end;
> > > > +
> > > > /* Shrink the vma to just the new range */
> > > > - return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> > > > + return vma_shrink(&vmg);
> > >
> > > The vma_shrink() doesn't seem to benefit that much from vmg conversion but I
> > > guess why not. Maybe this will further change anyway...
> > >
> >
> > No it doesn't, but it's more about being consistent with vma_expand(). We
> > may want to find a way to unite them.
>
> No, we probably should not unite them - the shrink happens in a single
> place on setup.
>
Ack will in that case un-vmg vma_shrink().
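(That is, restoring the pre-series signature shown in the hunk above:

	int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
		       unsigned long start, unsigned long end, pgoff_t pgoff);

rather than the vmg-based variant this patch introduced.)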
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 04/10] mm: abstract parameters for vma_expand/shrink()
2024-08-08 15:45 ` Lorenzo Stoakes
2024-08-08 20:20 ` Liam R. Howlett
@ 2024-08-14 13:53 ` Lorenzo Stoakes
1 sibling, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-14 13:53 UTC (permalink / raw)
To: Vlastimil Babka (SUSE)
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett
On Thu, Aug 08, 2024 at 04:45:53PM GMT, Lorenzo Stoakes wrote:
> On Thu, Aug 08, 2024 at 04:20:26PM GMT, Vlastimil Babka (SUSE) wrote:
> > On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > > Equally use struct vma_merge_struct to abstract parameters for VMA
> > > expansion and shrinking.
> > >
> > > This leads the way to further refactoring and de-duplication by
> > > standardising the interface.
> > >
> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > ---
> > > mm/mmap.c | 30 +++++++++++--------
> > > mm/vma.c | 66 ++++++++++++++++++-----------------------
> > > mm/vma.h | 8 ++---
> > > tools/testing/vma/vma.c | 18 +++++++++--
> > > 4 files changed, 65 insertions(+), 57 deletions(-)
> > >
> > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > index 721ced6e37b0..04145347c245 100644
> > > --- a/mm/mmap.c
> > > +++ b/mm/mmap.c
> > > @@ -1367,7 +1367,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > pgoff_t pglen = len >> PAGE_SHIFT;
> > > unsigned long charged = 0;
> > > unsigned long end = addr + len;
> > > - unsigned long merge_start = addr, merge_end = end;
> > > bool writable_file_mapping = false;
> > > int error;
> > > VMA_ITERATOR(vmi, mm, addr);
> > > @@ -1423,28 +1422,26 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > > /* Attempt to expand an old mapping */
> > > /* Check next */
> > > if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> > > - merge_end = next->vm_end;
> > > - vma = next;
> > > + /* We can adjust this as can_vma_merge_after() doesn't touch */
> > > + vmg.end = next->vm_end;
> >
> > Ugh, ok but wonder how fragile that is.
>
> Yeah, you're right, this is a bit horrid; I'll find a way to make this less
> brittle.
FYI for when I send out the v2 respin:
Actually, as I work through it now, I think this is OK as-is (I'll remove
the comment as it's confusing though).
The next block checks prev, so the end of the VMA doesn't really matter,
and in any case isn't checked by can_vma_merge_after(), but rather by the
prev->vm_end == addr conditional below.
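To illustrate with a simplified sketch of the flow in question (not the
literal mmap_region() code):

	/* Check next: may adjust vmg.end, which only this case consults. */
	if (next && next->vm_start == end && can_vma_merge_before(&vmg))
		vmg.end = next->vm_end;

	/*
	 * Check prev: gated on prev->vm_end == addr, and
	 * can_vma_merge_after() never looks at vmg.end, so the adjustment
	 * above cannot affect this check.
	 */
	if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg))
		vmg.start = prev->vm_start;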
I've addressed your other comments.
[snip]
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 05/10] mm: abstract vma_merge_new_vma() to use vma_merge_struct
2024-08-05 12:13 [PATCH 00/10] mm: remove vma_merge() Lorenzo Stoakes
` (3 preceding siblings ...)
2024-08-05 12:13 ` [PATCH 04/10] mm: abstract parameters for vma_expand/shrink() Lorenzo Stoakes
@ 2024-08-05 12:13 ` Lorenzo Stoakes
[not found] ` <82b802e0-94fd-4cca-ad8f-ea2d85bcae64@kernel.org>
2024-08-05 12:13 ` [PATCH 06/10] tools: add VMA merge tests Lorenzo Stoakes
` (4 subsequent siblings)
9 siblings, 1 reply; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-05 12:13 UTC (permalink / raw)
To: linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett, Vlastimil Babka
Abstract this function so we can write tests which use the newly abstracted
interface and maintain a stable interface for tests before/after
refactoring.
We introduce a temporary wrapper, vma_merge_new_vma_wrapper(), to minimise
the code changes; in a subsequent commit we will entirely refactor this
function.
We also introduce a temporary implementation of vma_merge_modified() for the
same reason - maintaining a common interface for the tests. This will be
removed when vma_merge_modified() is correctly implemented in a subsequent
commit.
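To illustrate the call-site impact (a hedged sketch assuming the wrapper as
added in the diff below):

	/* Before: every caller threads the full parameter list. */
	merge = vma_merge_new_vma(&vmi, prev, vma, start, end, pgoff);

	/* After: existing callers switch to the temporary wrapper... */
	merge = vma_merge_new_vma_wrapper(&vmi, prev, vma, start, end, pgoff);

	/* ...while the function itself now takes a single struct: */
	struct vma_merge_struct vmg = {
		.vmi = &vmi,
		.prev = prev,
		.start = start,
		.end = end,
		/* remaining fields derived from vma, as in the wrapper */
	};
	merge = vma_merge_new_vma(&vmg);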
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/mmap.c | 6 +++---
mm/vma.c | 33 ++++++++++++---------------------
mm/vma.h | 33 ++++++++++++++++++++++++++++++---
tools/testing/vma/vma.c | 12 ++++++++----
4 files changed, 53 insertions(+), 31 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 04145347c245..f6593a81f73d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1494,9 +1494,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* vma again as we may succeed this time.
*/
if (unlikely(vm_flags != vma->vm_flags && prev)) {
- merge = vma_merge_new_vma(&vmi, prev, vma,
- vma->vm_start, vma->vm_end,
- vma->vm_pgoff);
+ merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
+ vma->vm_start, vma->vm_end,
+ vma->vm_pgoff);
if (merge) {
/*
* ->mmap() can change vma->vm_file and fput
diff --git a/mm/vma.c b/mm/vma.c
index 3d6ce04f1b9c..55615392e8d2 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -1106,6 +1106,11 @@ static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
return NULL;
}
+struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
+{
+ return vma_merge(vmg);
+}
+
/*
* We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
* context and anonymous VMA name within the range [start, end).
@@ -1260,27 +1265,14 @@ struct vm_area_struct
* Attempt to merge a newly mapped VMA with those adjacent to it. The caller
* must ensure that [start, end) does not overlap any existing VMA.
*/
-struct vm_area_struct
-*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
- struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgoff_t pgoff)
+struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
{
- struct vma_merge_struct vmg = {
- .vmi = vmi,
- .prev = prev,
- .vma = vma,
- .start = start,
- .end = end,
- .flags = vma->vm_flags,
- .file = vma->vm_file,
- .anon_vma = vma->anon_vma,
- .pgoff = pgoff,
- .policy = vma_policy(vma),
- .uffd_ctx = vma->vm_userfaultfd_ctx,
- .anon_name = anon_vma_name(vma),
- };
+ if (!vmg->prev) {
+ vmg->prev = vma_prev(vmg->vmi);
+ vma_iter_set(vmg->vmi, vmg->start);
+ }
- return vma_merge(&vmg);
+ return vma_merge(vmg);
}
/*
@@ -1295,7 +1287,6 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
struct vma_merge_struct vmg = {
.vmi = vmi,
.prev = vma,
- .vma = vma,
.start = vma->vm_end,
.end = vma->vm_end + delta,
.flags = vma->vm_flags,
@@ -1425,7 +1416,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
- new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff);
+ new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/vma.h b/mm/vma.h
index c464d25da120..50459f9e4c7f 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -134,9 +134,36 @@ struct vm_area_struct
struct vm_userfaultfd_ctx new_ctx);
struct vm_area_struct
-*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
- struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgoff_t pgoff);
+*vma_merge_new_vma(struct vma_merge_struct *vmg);
+
+/* Temporary convenience wrapper. */
+static inline struct vm_area_struct
+*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgoff_t pgoff)
+{
+ struct vma_merge_struct vmg = {
+ .vmi = vmi,
+ .prev = prev,
+ .start = start,
+ .end = end,
+ .flags = vma->vm_flags,
+ .file = vma->vm_file,
+ .anon_vma = vma->anon_vma,
+ .pgoff = pgoff,
+ .policy = vma_policy(vma),
+ .uffd_ctx = vma->vm_userfaultfd_ctx,
+ .anon_name = anon_vma_name(vma),
+ };
+
+ return vma_merge_new_vma(&vmg);
+}
+
+/*
+ * Temporary wrapper around vma_merge() so we can have a common interface for
+ * tests.
+ */
+struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg);
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
struct vm_area_struct *vma,
diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
index d216e51206c1..4416cfa93056 100644
--- a/tools/testing/vma/vma.c
+++ b/tools/testing/vma/vma.c
@@ -53,16 +53,20 @@ static bool test_simple_merge(void)
unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
struct mm_struct mm = {};
struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, flags);
- struct vm_area_struct *vma_middle = alloc_vma(&mm, 0x1000, 0x2000, 1, flags);
struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, flags);
VMA_ITERATOR(vmi, &mm, 0x1000);
+ struct vma_merge_struct vmg = {
+ .vmi = &vmi,
+ .start = 0x1000,
+ .end = 0x2000,
+ .flags = flags,
+ .pgoff = 1,
+ };
ASSERT_FALSE(vma_link(&mm, vma_left));
- ASSERT_FALSE(vma_link(&mm, vma_middle));
ASSERT_FALSE(vma_link(&mm, vma_right));
- vma = vma_merge_new_vma(&vmi, vma_left, vma_middle, 0x1000,
- 0x2000, 1);
+ vma = vma_merge_new_vma(&vmg);
ASSERT_NE(vma, NULL);
ASSERT_EQ(vma->vm_start, 0);
--
2.45.2
^ permalink raw reply	[flat|nested] 53+ messages in thread
* [PATCH 06/10] tools: add VMA merge tests
2024-08-05 12:13 [PATCH 00/10] mm: remove vma_merge() Lorenzo Stoakes
` (4 preceding siblings ...)
2024-08-05 12:13 ` [PATCH 05/10] mm: abstract vma_merge_new_vma() to use vma_merge_struct Lorenzo Stoakes
@ 2024-08-05 12:13 ` Lorenzo Stoakes
2024-08-05 12:13 ` [PATCH 07/10] mm: avoid using vma_merge() for new VMAs Lorenzo Stoakes
` (3 subsequent siblings)
9 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-05 12:13 UTC (permalink / raw)
To: linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett, Vlastimil Babka
Add a variety of VMA merge unit tests to assert that the behaviour of VMA
merge is correct at an abstract level and VMAs are merged or not merged as
expected.
These are added intentionally in advance of heavily refactoring the VMA
merge functionality in order that we can compare the test results before
and after the refactoring to ensure that it functions correctly.
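Each test follows the same arrange/act/assert shape (a condensed sketch of
the pattern used below; all helpers referenced are introduced by this
patch):

	static bool test_example(void)
	{
		unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
		struct mm_struct mm = {};
		VMA_ITERATOR(vmi, &mm, 0);
		struct vma_merge_struct vmg = { .vmi = &vmi, };

		/* Arrange: populate the tree via alloc_and_link_vma(). */
		/* Act: vmg_set_range(), then vma_merge_new_vma() or
		 * vma_merge_modified(). */
		/* Assert: ASSERT_EQ()/ASSERT_TRUE() on ranges, anon_vma,
		 * vma_write_started() and mm.map_count. */

		return cleanup_mm(&mm, &vmi) == 1;
	}

Each such function is then registered in main() via the TEST() macro.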
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
tools/testing/vma/vma.c | 814 +++++++++++++++++++++++++++++++
tools/testing/vma/vma_internal.h | 4 +-
2 files changed, 817 insertions(+), 1 deletion(-)
diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
index 4416cfa93056..e465dc22e2d0 100644
--- a/tools/testing/vma/vma.c
+++ b/tools/testing/vma/vma.c
@@ -14,6 +14,7 @@
#include "../../../mm/vma.c"
const struct vm_operations_struct vma_dummy_vm_ops;
+static struct anon_vma dummy_anon_vma;
#define ASSERT_TRUE(_expr) \
do { \
@@ -28,6 +29,7 @@ const struct vm_operations_struct vma_dummy_vm_ops;
#define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2))
#define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2))
+/* Helper function to simply allocate a VMA. */
static struct vm_area_struct *alloc_vma(struct mm_struct *mm,
unsigned long start,
unsigned long end,
@@ -47,6 +49,115 @@ static struct vm_area_struct *alloc_vma(struct mm_struct *mm,
return ret;
}
+/* Helper function to allocate a VMA and link it to the tree. */
+static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ pgoff_t pgoff,
+ vm_flags_t flags)
+{
+ struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, flags);
+
+ if (vma == NULL)
+ return NULL;
+
+ if (vma_link(mm, vma)) {
+ vm_area_free(vma);
+ return NULL;
+ }
+
+ /*
+ * Reset this counter which we use to track whether writes have
+ * begun. Linking to the tree will have caused this to be incremented,
+ * which means we will get a false positive otherwise.
+ */
+ vma->vm_lock_seq = -1;
+
+ return vma;
+}
+
+/*
+ * Helper function to reset merge state and the associated VMA iterator to a
+ * specified new range.
+ */
+static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start,
+ unsigned long end, pgoff_t pgoff, vm_flags_t flags)
+{
+ vma_iter_set(vmg->vmi, start);
+
+ vmg->prev = NULL;
+ vmg->next = NULL;
+ vmg->vma = NULL;
+
+ vmg->start = start;
+ vmg->end = end;
+ vmg->pgoff = pgoff;
+ vmg->flags = flags;
+}
+
+/*
+ * Helper function to try to merge a new VMA.
+ *
+ * Update vmg and the iterator for it and try to merge, otherwise allocate a new
+ * VMA, link it to the maple tree and return it.
+ */
+static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm,
+ struct vma_merge_struct *vmg,
+ unsigned long start, unsigned long end,
+ pgoff_t pgoff, vm_flags_t flags,
+ bool *was_merged)
+{
+ struct vm_area_struct *merged;
+
+ vmg_set_range(vmg, start, end, pgoff, flags);
+
+ merged = vma_merge_new_vma(vmg);
+ if (merged) {
+ *was_merged = true;
+ return merged;
+ }
+
+ *was_merged = false;
+ return alloc_and_link_vma(mm, start, end, pgoff, flags);
+}
+
+/*
+ * Helper function to remove all VMAs and destroy the maple tree associated with
+ * a virtual address space. Returns a count of VMAs in the tree.
+ */
+static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
+{
+ struct vm_area_struct *vma;
+ int count = 0;
+
+ vma_iter_set(vmi, 0);
+ for_each_vma(*vmi, vma) {
+ vm_area_free(vma);
+ count++;
+ }
+
+ mtree_destroy(&mm->mm_mt);
+ mm->map_count = 0;
+ return count;
+}
+
+/* Helper function to determine if VMA has had vma_start_write() performed. */
+static bool vma_write_started(struct vm_area_struct *vma)
+{
+ int seq = vma->vm_lock_seq;
+
+ /* We reset after each check. */
+ vma->vm_lock_seq = -1;
+
+ /* The vma_start_write() stub simply increments this value. */
+ return seq > -1;
+}
+
+/* Helper function providing a dummy vm_ops->close() method. */
+static void dummy_close(struct vm_area_struct *)
+{
+}
+
static bool test_simple_merge(void)
{
struct vm_area_struct *vma;
@@ -196,6 +307,702 @@ static bool test_simple_shrink(void)
return true;
}
+static bool test_vma_merge_new_vma(void)
+{
+ unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+ struct mm_struct mm = {};
+ VMA_ITERATOR(vmi, &mm, 0);
+ struct vma_merge_struct vmg = {
+ .vmi = &vmi,
+ };
+ struct anon_vma_chain dummy_anon_vma_chain_a = {
+ .anon_vma = &dummy_anon_vma,
+ };
+ struct anon_vma_chain dummy_anon_vma_chain_b = {
+ .anon_vma = &dummy_anon_vma,
+ };
+ struct anon_vma_chain dummy_anon_vma_chain_c = {
+ .anon_vma = &dummy_anon_vma,
+ };
+ struct anon_vma_chain dummy_anon_vma_chain_d = {
+ .anon_vma = &dummy_anon_vma,
+ };
+ int count;
+ struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d;
+ bool merged;
+
+ /*
+ * 0123456789abc
+ * AA B CC
+ */
+ vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags);
+ ASSERT_NE(vma_a, NULL);
+ /* We give each VMA a single avc so we can test anon_vma duplication. */
+ INIT_LIST_HEAD(&vma_a->anon_vma_chain);
+ list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain);
+
+ vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags);
+ ASSERT_NE(vma_b, NULL);
+ INIT_LIST_HEAD(&vma_b->anon_vma_chain);
+ list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain);
+
+ vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, flags);
+ ASSERT_NE(vma_c, NULL);
+ INIT_LIST_HEAD(&vma_c->anon_vma_chain);
+ list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain);
+
+ /*
+ * NO merge.
+ *
+ * 0123456789abc
+ * AA B ** CC
+ */
+ vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, flags, &merged);
+ ASSERT_NE(vma_d, NULL);
+ INIT_LIST_HEAD(&vma_d->anon_vma_chain);
+ list_add(&dummy_anon_vma_chain_d.same_vma, &vma_d->anon_vma_chain);
+ ASSERT_FALSE(merged);
+ ASSERT_EQ(mm.map_count, 4);
+
+ /*
+ * Merge BOTH sides.
+ *
+ * 0123456789abc
+ * AA*B DD CC
+ */
+ vma_b->anon_vma = &dummy_anon_vma;
+ vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, flags, &merged);
+ ASSERT_EQ(vma, vma_a);
+ /* Merge with A, delete B. */
+ ASSERT_TRUE(merged);
+ ASSERT_EQ(vma->vm_start, 0);
+ ASSERT_EQ(vma->vm_end, 0x4000);
+ ASSERT_EQ(vma->vm_pgoff, 0);
+ ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+ ASSERT_TRUE(vma_write_started(vma));
+ ASSERT_EQ(mm.map_count, 3);
+
+ /*
+ * Merge to PREVIOUS VMA.
+ *
+ * 0123456789abc
+ * AAAA* DD CC
+ */
+ vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, flags, &merged);
+ ASSERT_EQ(vma, vma_a);
+ /* Extend A. */
+ ASSERT_TRUE(merged);
+ ASSERT_EQ(vma->vm_start, 0);
+ ASSERT_EQ(vma->vm_end, 0x5000);
+ ASSERT_EQ(vma->vm_pgoff, 0);
+ ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+ ASSERT_TRUE(vma_write_started(vma));
+ ASSERT_EQ(mm.map_count, 3);
+
+ /*
+ * Merge to NEXT VMA.
+ *
+ * 0123456789abc
+ * AAAAA *DD CC
+ */
+ vma_d->anon_vma = &dummy_anon_vma;
+ vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, flags, &merged);
+ ASSERT_EQ(vma, vma_d);
+ /* Prepend. */
+ ASSERT_TRUE(merged);
+ ASSERT_EQ(vma->vm_start, 0x6000);
+ ASSERT_EQ(vma->vm_end, 0x9000);
+ ASSERT_EQ(vma->vm_pgoff, 6);
+ ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+ ASSERT_TRUE(vma_write_started(vma));
+ ASSERT_EQ(mm.map_count, 3);
+
+ /*
+ * Merge BOTH sides.
+ *
+ * 0123456789abc
+ * AAAAA*DDD CC
+ */
+ vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, flags, &merged);
+ ASSERT_EQ(vma, vma_a);
+ /* Merge with A, delete D. */
+ ASSERT_TRUE(merged);
+ ASSERT_EQ(vma->vm_start, 0);
+ ASSERT_EQ(vma->vm_end, 0x9000);
+ ASSERT_EQ(vma->vm_pgoff, 0);
+ ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+ ASSERT_TRUE(vma_write_started(vma));
+ ASSERT_EQ(mm.map_count, 2);
+
+ /*
+ * Merge to NEXT VMA.
+ *
+ * 0123456789abc
+ * AAAAAAAAA *CC
+ */
+ vma_c->anon_vma = &dummy_anon_vma;
+ vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, flags, &merged);
+ ASSERT_EQ(vma, vma_c);
+ /* Prepend C. */
+ ASSERT_TRUE(merged);
+ ASSERT_EQ(vma->vm_start, 0xa000);
+ ASSERT_EQ(vma->vm_end, 0xc000);
+ ASSERT_EQ(vma->vm_pgoff, 0xa);
+ ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+ ASSERT_TRUE(vma_write_started(vma));
+ ASSERT_EQ(mm.map_count, 2);
+
+ /*
+ * Merge BOTH sides.
+ *
+ * 0123456789abc
+ * AAAAAAAAA*CCC
+ */
+ vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, flags, &merged);
+ ASSERT_EQ(vma, vma_a);
+ /* Extend A and delete C. */
+ ASSERT_TRUE(merged);
+ ASSERT_EQ(vma->vm_start, 0);
+ ASSERT_EQ(vma->vm_end, 0xc000);
+ ASSERT_EQ(vma->vm_pgoff, 0);
+ ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+ ASSERT_TRUE(vma_write_started(vma));
+ ASSERT_EQ(mm.map_count, 1);
+
+ /*
+ * Final state.
+ *
+ * 0123456789abc
+ * AAAAAAAAAAAAA
+ */
+
+ count = 0;
+ vma_iter_set(&vmi, 0);
+ for_each_vma(vmi, vma) {
+ ASSERT_NE(vma, NULL);
+ ASSERT_EQ(vma->vm_start, 0);
+ ASSERT_EQ(vma->vm_end, 0xc000);
+ ASSERT_EQ(vma->vm_pgoff, 0);
+ ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
+
+ vm_area_free(vma);
+ count++;
+ }
+
+	/* Should only have one VMA left (though freed) after all is done. */
+ ASSERT_EQ(count, 1);
+
+ mtree_destroy(&mm.mm_mt);
+ return true;
+}
+
+static bool test_vma_merge_special_flags(void)
+{
+ unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+ struct mm_struct mm = {};
+ VMA_ITERATOR(vmi, &mm, 0);
+ struct vma_merge_struct vmg = {
+ .vmi = &vmi,
+ };
+ vm_flags_t special_flags[] = { VM_IO, VM_DONTEXPAND, VM_PFNMAP, VM_MIXEDMAP };
+ vm_flags_t all_special_flags = 0;
+ int i;
+ struct vm_area_struct *vma_left, *vma;
+
+ /* Make sure there aren't new VM_SPECIAL flags. */
+ for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
+ all_special_flags |= special_flags[i];
+ }
+ ASSERT_EQ(all_special_flags, VM_SPECIAL);
+
+ /*
+ * 01234
+ * AAA
+ */
+ vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ ASSERT_NE(vma_left, NULL);
+
+ /* 1. Set up new VMA with special flag that would otherwise merge. */
+
+ /*
+ * 01234
+ * AAA*
+ *
+ * This should merge if not for the VM_SPECIAL flag.
+ */
+ vmg_set_range(&vmg, 0x3000, 0x4000, 3, flags);
+ for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
+ vm_flags_t special_flag = special_flags[i];
+
+ vma_left->__vm_flags = flags | special_flag;
+ vmg.flags = flags | special_flag;
+ vma = vma_merge_new_vma(&vmg);
+ ASSERT_EQ(vma, NULL);
+ }
+
+ /* 2. Modify VMA with special flag that would otherwise merge. */
+
+ /*
+ * 01234
+ * AAAB
+ *
+ * Create a VMA to modify.
+ */
+ vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags);
+ ASSERT_NE(vma, NULL);
+ vmg.vma = vma;
+
+ for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
+ vm_flags_t special_flag = special_flags[i];
+
+ vma_left->__vm_flags = flags | special_flag;
+ vmg.flags = flags | special_flag;
+ vma = vma_merge_modified(&vmg);
+ ASSERT_EQ(vma, NULL);
+ }
+
+ cleanup_mm(&mm, &vmi);
+ return true;
+}
+
+static bool test_vma_merge_with_close(void)
+{
+ unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+ struct mm_struct mm = {};
+ VMA_ITERATOR(vmi, &mm, 0);
+ struct vma_merge_struct vmg = {
+ .vmi = &vmi,
+ };
+ struct vm_operations_struct vm_ops = {};
+ struct vm_area_struct *vma_next =
+ alloc_and_link_vma(&mm, 0x2000, 0x3000, 2, flags);
+ struct vm_area_struct *vma;
+
+ /*
+ * When we merge VMAs we sometimes have to delete others as part of the
+ * operation.
+ *
+ * Considering the two possible adjacent VMAs to which a VMA can be
+ * merged:
+ *
+ * [ prev ][ vma ][ next ]
+ *
+ * In no case will we need to delete prev. If the operation is
+ * mergeable, then prev will be extended with one or both of vma and
+ * next deleted.
+ *
+ * As a result, during initial mergeability checks, only
+ * can_vma_merge_before() (which implies the VMA being merged with is
+ * 'next' as shown above) bothers to check to see whether the next VMA
+ * has a vm_ops->close() callback that will need to be called when
+ * removed.
+ *
+ * If it does, then we cannot merge as the resources that the close()
+ * operation potentially clears down are tied only to the existing VMA
+	 * range and we have no way of extending those to the newly merged one.
+ *
+ * We must consider two scenarios:
+ *
+ * A.
+ *
+ * vm_ops->close: - - !NULL
+ * [ prev ][ vma ][ next ]
+ *
+ * Where prev may or may not be present/mergeable.
+ *
+ * This is picked up by a specific check in can_vma_merge_before().
+ *
+ * B.
+ *
+ * vm_ops->close: - !NULL
+ * [ prev ][ vma ]
+ *
+ * Where prev and vma are present and mergeable.
+ *
+ * This is picked up by a specific check in vma_merge_modified().
+ *
+ * IMPORTANT NOTE: We make the assumption that the following case:
+ *
+ * - !NULL NULL
+ * [ prev ][ vma ][ next ]
+ *
+ * Cannot occur, because vma->vm_ops being the same implies the same
+ * vma->vm_file, and therefore this would mean that next->vm_ops->close
+ * would be set too, and thus scenario A would pick this up.
+ */
+
+ ASSERT_NE(vma_next, NULL);
+
+ /*
+ * SCENARIO A
+ *
+ * 0123
+ * *N
+ */
+
+ /* Make the next VMA have a close() callback. */
+ vm_ops.close = dummy_close;
+ vma_next->vm_ops = (const struct vm_operations_struct *)&vm_ops;
+
+ /* Our proposed VMA has characteristics that would otherwise be merged. */
+ vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags);
+
+	/* The next VMA having a close() callback should cause the merge to fail. */
+ ASSERT_EQ(vma_merge_new_vma(&vmg), NULL);
+
+ /* Now create the VMA so we can merge via modified flags */
+ vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags);
+ vma = alloc_and_link_vma(&mm, 0x1000, 0x2000, 1, flags);
+ vmg.vma = vma;
+
+ /*
+ * The VMA being modified in a way that would otherwise merge should
+ * also fail.
+ */
+ ASSERT_EQ(vma_merge_modified(&vmg), NULL);
+
+ /* SCENARIO B
+ *
+ * 0123
+ * P*
+ *
+ * In order for this scenario to trigger, the VMA currently being
+ * modified must also have a .close().
+ */
+
+ /* Reset VMG state. */
+ vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags);
+ /*
+	 * Make next unmergeable, and don't let the scenario A check pick this
+	 * up; we want to reproduce scenario B only.
+ */
+ vma_next->vm_ops = NULL;
+ vma_next->__vm_flags &= ~VM_MAYWRITE;
+ /* Allocate prev. */
+ vmg.prev = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags);
+ /* Assign a vm_ops->close() function to VMA explicitly. */
+ vma->vm_ops = (const struct vm_operations_struct *)&vm_ops;
+ vmg.vma = vma;
+ /* Make sure merge does not occur. */
+ ASSERT_EQ(vma_merge_modified(&vmg), NULL);
+
+ cleanup_mm(&mm, &vmi);
+ return true;
+}
+
+static bool test_vma_merge_modified(void)
+{
+ unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+ struct mm_struct mm = {};
+ VMA_ITERATOR(vmi, &mm, 0);
+ struct vm_area_struct *vma, *vma_prev, *vma_next;
+ struct vma_merge_struct vmg = {
+ .vmi = &vmi,
+ };
+
+ /*
+ * Merge right case - partial span.
+ *
+ * <->
+ * 0123456789
+ * VVVVNNN
+ * ->
+ * 0123456789
+ * VNNNNNN
+ */
+ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags);
+ vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags);
+ vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags);
+ vmg.vma = vma;
+ vmg.prev = vma;
+ vma->anon_vma = &dummy_anon_vma;
+ ASSERT_EQ(vma_merge_modified(&vmg), vma_next);
+ ASSERT_EQ(vma_next->vm_start, 0x3000);
+ ASSERT_EQ(vma_next->vm_end, 0x9000);
+ ASSERT_EQ(vma_next->vm_pgoff, 3);
+ ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma);
+ ASSERT_EQ(vma->vm_start, 0x2000);
+ ASSERT_EQ(vma->vm_end, 0x3000);
+ ASSERT_EQ(vma->vm_pgoff, 2);
+ ASSERT_TRUE(vma_write_started(vma));
+ ASSERT_TRUE(vma_write_started(vma_next));
+ ASSERT_EQ(mm.map_count, 2);
+
+ /* Clear down and reset. */
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+ /*
+ * Merge right case - full span.
+ *
+ * <-->
+ * 0123456789
+ * VVVVNNN
+ * ->
+ * 0123456789
+ * NNNNNNN
+ */
+ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags);
+ vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags);
+ vmg_set_range(&vmg, 0x2000, 0x6000, 2, flags);
+ vmg.vma = vma;
+ vma->anon_vma = &dummy_anon_vma;
+ ASSERT_EQ(vma_merge_modified(&vmg), vma_next);
+ ASSERT_EQ(vma_next->vm_start, 0x2000);
+ ASSERT_EQ(vma_next->vm_end, 0x9000);
+ ASSERT_EQ(vma_next->vm_pgoff, 2);
+ ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma);
+ ASSERT_TRUE(vma_write_started(vma_next));
+ ASSERT_EQ(mm.map_count, 1);
+
+ /* Clear down and reset. We should have deleted vma. */
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);
+
+ /*
+ * Merge left case - partial span.
+ *
+ * <->
+ * 0123456789
+ * PPPVVVV
+ * ->
+ * 0123456789
+ * PPPPPPV
+ */
+ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
+ vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags);
+ vmg.prev = vma_prev;
+ vmg.vma = vma;
+ vma->anon_vma = &dummy_anon_vma;
+
+ ASSERT_EQ(vma_merge_modified(&vmg), vma_prev);
+ ASSERT_EQ(vma_prev->vm_start, 0);
+ ASSERT_EQ(vma_prev->vm_end, 0x6000);
+ ASSERT_EQ(vma_prev->vm_pgoff, 0);
+ ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
+ ASSERT_EQ(vma->vm_start, 0x6000);
+ ASSERT_EQ(vma->vm_end, 0x7000);
+ ASSERT_EQ(vma->vm_pgoff, 6);
+ ASSERT_TRUE(vma_write_started(vma_prev));
+ ASSERT_TRUE(vma_write_started(vma));
+ ASSERT_EQ(mm.map_count, 2);
+
+ /* Clear down and reset. */
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+ /*
+ * Merge left case - full span.
+ *
+ * <-->
+ * 0123456789
+ * PPPVVVV
+ * ->
+ * 0123456789
+ * PPPPPPP
+ */
+ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
+ vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags);
+ vmg.prev = vma_prev;
+ vmg.vma = vma;
+ vma->anon_vma = &dummy_anon_vma;
+ ASSERT_EQ(vma_merge_modified(&vmg), vma_prev);
+ ASSERT_EQ(vma_prev->vm_start, 0);
+ ASSERT_EQ(vma_prev->vm_end, 0x7000);
+ ASSERT_EQ(vma_prev->vm_pgoff, 0);
+ ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
+ ASSERT_TRUE(vma_write_started(vma_prev));
+ ASSERT_EQ(mm.map_count, 1);
+
+ /* Clear down and reset. We should have deleted vma. */
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);
+
+ /*
+ * Merge both case.
+ *
+ * <-->
+ * 0123456789
+ * PPPVVVVNNN
+ * ->
+ * 0123456789
+ * PPPPPPPPPP
+ */
+ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
+ vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags);
+ vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags);
+ vmg.prev = vma_prev;
+ vmg.vma = vma;
+ vma->anon_vma = &dummy_anon_vma;
+ ASSERT_EQ(vma_merge_modified(&vmg), vma_prev);
+ ASSERT_EQ(vma_prev->vm_start, 0);
+ ASSERT_EQ(vma_prev->vm_end, 0x9000);
+ ASSERT_EQ(vma_prev->vm_pgoff, 0);
+ ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma);
+ ASSERT_TRUE(vma_write_started(vma_prev));
+ ASSERT_EQ(mm.map_count, 1);
+
+ /* Clear down and reset. We should have deleted prev and next. */
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1);
+
+ /*
+ * Non-merge ranges. vma_merge_modified() assumes that the caller always
+ * specifies ranges within the input VMA so we need only examine these
+ * cases.
+ *
+ * -
+ * -
+ * -
+ * <->
+ * <>
+ * <>
+ * 0123456789a
+ * PPPVVVVVNNN
+ */
+
+ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags);
+ vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, flags);
+
+ vmg_set_range(&vmg, 0x4000, 0x5000, 4, flags);
+ vmg.prev = vma;
+ vmg.vma = vma;
+ ASSERT_EQ(vma_merge_modified(&vmg), NULL);
+
+ vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags);
+ vmg.prev = vma;
+ vmg.vma = vma;
+ ASSERT_EQ(vma_merge_modified(&vmg), NULL);
+
+ vmg_set_range(&vmg, 0x6000, 0x7000, 6, flags);
+ vmg.prev = vma;
+ vmg.vma = vma;
+ ASSERT_EQ(vma_merge_modified(&vmg), NULL);
+
+ vmg_set_range(&vmg, 0x4000, 0x7000, 4, flags);
+ vmg.prev = vma;
+ vmg.vma = vma;
+ ASSERT_EQ(vma_merge_modified(&vmg), NULL);
+
+ vmg_set_range(&vmg, 0x4000, 0x6000, 4, flags);
+ vmg.prev = vma;
+ vmg.vma = vma;
+ ASSERT_EQ(vma_merge_modified(&vmg), NULL);
+
+ vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags);
+ vmg.prev = vma;
+ vmg.vma = vma;
+ ASSERT_EQ(vma_merge_modified(&vmg), NULL);
+
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 3);
+
+ return true;
+}
+
+static bool test_anon_vma_non_mergeable(void)
+{
+ unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+ struct mm_struct mm = {};
+ VMA_ITERATOR(vmi, &mm, 0);
+ struct vm_area_struct *vma, *vma_prev, *vma_next;
+ struct vma_merge_struct vmg = {
+ .vmi = &vmi,
+ };
+ struct anon_vma_chain dummy_anon_vma_chain1 = {
+ .anon_vma = &dummy_anon_vma,
+ };
+ struct anon_vma_chain dummy_anon_vma_chain2 = {
+ .anon_vma = &dummy_anon_vma,
+ };
+
+ /*
+ * In the case of vma_merge_modified() merging both left and right VMAs
+ * but where prev and next have incompatible anon_vma objects, we revert
+ * to a merge of prev and VMA:
+ *
+ * <-->
+ * 0123456789
+ * PPPVVVVNNN
+ * ->
+ * 0123456789
+ * PPPPPPPNNN
+ */
+ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
+ vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags);
+
+ /*
+ * Give both prev and next single anon_vma_chain fields, so they will
+ * merge with the NULL vmg->anon_vma.
+ *
+ * However, when prev is compared to next, the merge should fail.
+ */
+
+ INIT_LIST_HEAD(&vma_prev->anon_vma_chain);
+ list_add(&dummy_anon_vma_chain1.same_vma, &vma_prev->anon_vma_chain);
+ ASSERT_TRUE(list_is_singular(&vma_prev->anon_vma_chain));
+ vma_prev->anon_vma = &dummy_anon_vma;
+ ASSERT_TRUE(is_mergeable_anon_vma(NULL, vma_prev->anon_vma, vma_prev));
+
+ INIT_LIST_HEAD(&vma_next->anon_vma_chain);
+ list_add(&dummy_anon_vma_chain2.same_vma, &vma_next->anon_vma_chain);
+ ASSERT_TRUE(list_is_singular(&vma_next->anon_vma_chain));
+ vma_next->anon_vma = (struct anon_vma *)2;
+ ASSERT_TRUE(is_mergeable_anon_vma(NULL, vma_next->anon_vma, vma_next));
+
+ ASSERT_FALSE(is_mergeable_anon_vma(vma_prev->anon_vma, vma_next->anon_vma, NULL));
+
+ vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags);
+ vmg.prev = vma_prev;
+ vmg.vma = vma;
+
+ ASSERT_EQ(vma_merge_modified(&vmg), vma_prev);
+ ASSERT_EQ(vma_prev->vm_start, 0);
+ ASSERT_EQ(vma_prev->vm_end, 0x7000);
+ ASSERT_EQ(vma_prev->vm_pgoff, 0);
+ ASSERT_TRUE(vma_write_started(vma_prev));
+ ASSERT_FALSE(vma_write_started(vma_next));
+
+ /* Clear down and reset. */
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+ /*
+ * Now consider the new VMA case. This is equivalent, only adding a new
+ * VMA in a gap between prev and next.
+ *
+ * <-->
+ * 0123456789
+ * PPP****NNN
+ * ->
+ * 0123456789
+ * PPPPPPPNNN
+ */
+ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags);
+
+ INIT_LIST_HEAD(&vma_prev->anon_vma_chain);
+ list_add(&dummy_anon_vma_chain1.same_vma, &vma_prev->anon_vma_chain);
+ vma_prev->anon_vma = (struct anon_vma *)1;
+
+ INIT_LIST_HEAD(&vma_next->anon_vma_chain);
+ list_add(&dummy_anon_vma_chain2.same_vma, &vma_next->anon_vma_chain);
+ vma_next->anon_vma = (struct anon_vma *)2;
+
+ vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags);
+ vmg.prev = vma_prev;
+
+ ASSERT_EQ(vma_merge_new_vma(&vmg), vma_prev);
+ ASSERT_EQ(vma_prev->vm_start, 0);
+ ASSERT_EQ(vma_prev->vm_end, 0x7000);
+ ASSERT_EQ(vma_prev->vm_pgoff, 0);
+ ASSERT_TRUE(vma_write_started(vma_prev));
+ ASSERT_FALSE(vma_write_started(vma_next));
+
+ /* Final cleanup. */
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+ return true;
+}
+
int main(void)
{
int num_tests = 0, num_fail = 0;
@@ -211,11 +1018,18 @@ int main(void)
} \
} while (0)
+ /* Very simple tests to kick the tyres. */
TEST(simple_merge);
TEST(simple_modify);
TEST(simple_expand);
TEST(simple_shrink);
+ TEST(vma_merge_new_vma);
+ TEST(vma_merge_special_flags);
+ TEST(vma_merge_with_close);
+ TEST(vma_merge_modified);
+ TEST(anon_vma_non_mergeable);
+
#undef TEST
printf("%d tests run, %d passed, %d failed.\n",
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 093560e5b2ac..40797a819d3d 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -687,8 +687,10 @@ static inline int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct
return 0;
}
-static inline void vma_start_write(struct vm_area_struct *)
+static inline void vma_start_write(struct vm_area_struct *vma)
{
+ /* Used to indicate to tests that a write operation has begun. */
+ vma->vm_lock_seq++;
}
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
--
2.45.2
^ permalink raw reply	[flat|nested] 53+ messages in thread
* [PATCH 07/10] mm: avoid using vma_merge() for new VMAs
2024-08-05 12:13 [PATCH 00/10] mm: remove vma_merge() Lorenzo Stoakes
` (5 preceding siblings ...)
2024-08-05 12:13 ` [PATCH 06/10] tools: add VMA merge tests Lorenzo Stoakes
@ 2024-08-05 12:13 ` Lorenzo Stoakes
2024-08-06 13:04 ` Petr Tesařík
` (2 more replies)
2024-08-05 12:13 ` [PATCH 08/10] mm: introduce commit_merge(), abstracting merge operation Lorenzo Stoakes
` (2 subsequent siblings)
9 siblings, 3 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-05 12:13 UTC (permalink / raw)
To: linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett, Vlastimil Babka
In mmap_region() and do_brk_flags() we open code scenarios where we prefer
to use vma_expand() rather than invoke a full vma_merge() operation.
Abstract this logic and eliminate all of the open-coding, and also use the
same logic for all cases where we add new VMAs, so that rather than
ultimately invoking a full vma_merge() we use vma_expand().
We implement this by replacing vma_merge_new_vma() with this newly
abstracted logic.
Doing so removes duplication and simplifies VMA merging in all such cases,
laying the ground for us to eliminate the merging of new VMAs in
vma_merge() altogether.
This makes it far easier to understand what is happening in these cases,
avoiding confusion and bugs, and allowing for future optimisation.
As a result of this change we are also able to make vma_prepare(),
init_vma_prep(), vma_complete(), can_vma_merge_before() and
can_vma_merge_after() static and internal to vma.c.
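Call sites then reduce to a single helper invocation, along these lines (a
sketch of the shape mmap_region() takes with this change; see the diff for
the exact code):

	vma = vma_merge_new_vma(&vmg);	/* expands prev and/or next if possible */
	if (vma)
		goto expanded;

	/* Otherwise fall through and map a new VMA as before. */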
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/mmap.c | 79 ++---
mm/vma.c | 482 +++++++++++++++++++------------
mm/vma.h | 51 +---
tools/testing/vma/vma_internal.h | 6 +
4 files changed, 324 insertions(+), 294 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index f6593a81f73d..c03f50f46396 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
- struct vm_area_struct *next, *prev, *merge;
- pgoff_t pglen = len >> PAGE_SHIFT;
+ struct vm_area_struct *merge;
unsigned long charged = 0;
unsigned long end = addr + len;
bool writable_file_mapping = false;
@@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vm_flags |= VM_ACCOUNT;
}
- next = vmg.next = vma_next(&vmi);
- prev = vmg.prev = vma_prev(&vmi);
- if (vm_flags & VM_SPECIAL) {
- if (prev)
- vma_iter_next_range(&vmi);
- goto cannot_expand;
- }
-
- /* Attempt to expand an old mapping */
- /* Check next */
- if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
- /* We can adjust this as can_vma_merge_after() doesn't touch */
- vmg.end = next->vm_end;
- vma = vmg.vma = next;
- vmg.pgoff = next->vm_pgoff - pglen;
-
- /* We may merge our NULL anon_vma with non-NULL in next. */
- vmg.anon_vma = vma->anon_vma;
- }
-
- /* Check prev */
- if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
- vmg.start = prev->vm_start;
- vma = vmg.vma = prev;
- vmg.pgoff = prev->vm_pgoff;
- } else if (prev) {
- vma_iter_next_range(&vmi);
- }
-
- /* Actually expand, if possible */
- if (vma && !vma_expand(&vmg)) {
- khugepaged_enter_vma(vma, vm_flags);
+ vma = vma_merge_new_vma(&vmg);
+ if (vma)
goto expanded;
- }
-
- if (vma == prev)
- vma_iter_set(&vmi, addr);
-cannot_expand:
/*
* Determine the object being mapped and call the appropriate
@@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* If vm_flags changed after call_mmap(), we should try merge
* vma again as we may succeed this time.
*/
- if (unlikely(vm_flags != vma->vm_flags && prev)) {
- merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
- vma->vm_start, vma->vm_end,
- vma->vm_pgoff);
+ if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
+ merge = vma_merge_new_vma(&vmg);
+
if (merge) {
/*
* ->mmap() can change vma->vm_file and fput
@@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma_iter_set(&vmi, vma->vm_end);
/* Undo any partial mapping done by a device driver. */
- unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
+ unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start,
vma->vm_end, vma->vm_end, true);
}
if (writable_file_mapping)
@@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long addr, unsigned long len, unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vma_prepare vp;
/*
* Check against address space limits by the changed size
@@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
*/
if (vma && vma->vm_end == addr) {
struct vma_merge_struct vmg = {
+ .vmi = vmi,
.prev = vma,
+ .next = NULL,
+ .start = addr,
+ .end = addr + len,
.flags = flags,
.pgoff = addr >> PAGE_SHIFT,
+ .file = vma->vm_file,
+ .anon_vma = vma->anon_vma,
+ .policy = vma_policy(vma),
+ .uffd_ctx = vma->vm_userfaultfd_ctx,
+ .anon_name = anon_vma_name(vma),
};
- if (can_vma_merge_after(&vmg)) {
- vma_iter_config(vmi, vma->vm_start, addr + len);
- if (vma_iter_prealloc(vmi, vma))
- goto unacct_fail;
-
- vma_start_write(vma);
-
- init_vma_prep(&vp, vma);
- vma_prepare(&vp);
- vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
- vma->vm_end = addr + len;
- vm_flags_set(vma, VM_SOFTDIRTY);
- vma_iter_store(vmi, vma);
-
- vma_complete(&vp, vmi, mm);
- khugepaged_enter_vma(vma, flags);
+ if (vma_merge_new_vma(&vmg))
goto out;
- }
}
if (vma)
diff --git a/mm/vma.c b/mm/vma.c
index 55615392e8d2..a404cf718f9e 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
*
* We assume the vma may be removed as part of the merge.
*/
-bool
-can_vma_merge_before(struct vma_merge_struct *vmg)
+static bool can_vma_merge_before(struct vma_merge_struct *vmg)
{
pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
@@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg)
*
* We assume that vma is not removed as part of the merge.
*/
-bool can_vma_merge_after(struct vma_merge_struct *vmg)
+static bool can_vma_merge_after(struct vma_merge_struct *vmg)
{
if (is_mergeable_vma(vmg, false) &&
is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
@@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg)
return false;
}
+static void __vma_link_file(struct vm_area_struct *vma,
+ struct address_space *mapping)
+{
+ if (vma_is_shared_maywrite(vma))
+ mapping_allow_writable(mapping);
+
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+}
+
+/*
+ * Requires inode->i_mapping->i_mmap_rwsem
+ */
+static void __remove_shared_vm_struct(struct vm_area_struct *vma,
+ struct address_space *mapping)
+{
+ if (vma_is_shared_maywrite(vma))
+ mapping_unmap_writable(mapping);
+
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+}
+
+/*
+ * vma_prepare() - Helper function for handling locking VMAs prior to altering
+ * @vp: The initialized vma_prepare struct
+ */
+static void vma_prepare(struct vma_prepare *vp)
+{
+ if (vp->file) {
+ uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
+
+ if (vp->adj_next)
+ uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
+ vp->adj_next->vm_end);
+
+ i_mmap_lock_write(vp->mapping);
+ if (vp->insert && vp->insert->vm_file) {
+ /*
+ * Put into interval tree now, so instantiated pages
+ * are visible to arm/parisc __flush_dcache_page
+ * throughout; but we cannot insert into address
+ * space until vma start or end is updated.
+ */
+ __vma_link_file(vp->insert,
+ vp->insert->vm_file->f_mapping);
+ }
+ }
+
+ if (vp->anon_vma) {
+ anon_vma_lock_write(vp->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vp->vma);
+ if (vp->adj_next)
+ anon_vma_interval_tree_pre_update_vma(vp->adj_next);
+ }
+
+ if (vp->file) {
+ flush_dcache_mmap_lock(vp->mapping);
+ vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
+ if (vp->adj_next)
+ vma_interval_tree_remove(vp->adj_next,
+ &vp->mapping->i_mmap);
+ }
+
+}
+
+/*
+ * vma_complete() - Helper function for handling the unlocking after altering VMAs,
+ * or for inserting a VMA.
+ *
+ * @vp: The vma_prepare struct
+ * @vmi: The vma iterator
+ * @mm: The mm_struct
+ */
+static void vma_complete(struct vma_prepare *vp,
+ struct vma_iterator *vmi, struct mm_struct *mm)
+{
+ if (vp->file) {
+ if (vp->adj_next)
+ vma_interval_tree_insert(vp->adj_next,
+ &vp->mapping->i_mmap);
+ vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
+ flush_dcache_mmap_unlock(vp->mapping);
+ }
+
+ if (vp->remove && vp->file) {
+ __remove_shared_vm_struct(vp->remove, vp->mapping);
+ if (vp->remove2)
+ __remove_shared_vm_struct(vp->remove2, vp->mapping);
+ } else if (vp->insert) {
+ /*
+ * split_vma has split insert from vma, and needs
+ * us to insert it before dropping the locks
+ * (it may either follow vma or precede it).
+ */
+ vma_iter_store(vmi, vp->insert);
+ mm->map_count++;
+ }
+
+ if (vp->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vp->vma);
+ if (vp->adj_next)
+ anon_vma_interval_tree_post_update_vma(vp->adj_next);
+ anon_vma_unlock_write(vp->anon_vma);
+ }
+
+ if (vp->file) {
+ i_mmap_unlock_write(vp->mapping);
+ uprobe_mmap(vp->vma);
+
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
+ }
+
+ if (vp->remove) {
+again:
+ vma_mark_detached(vp->remove, true);
+ if (vp->file) {
+ uprobe_munmap(vp->remove, vp->remove->vm_start,
+ vp->remove->vm_end);
+ fput(vp->file);
+ }
+ if (vp->remove->anon_vma)
+ anon_vma_merge(vp->vma, vp->remove);
+ mm->map_count--;
+ mpol_put(vma_policy(vp->remove));
+ if (!vp->remove2)
+ WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
+ vm_area_free(vp->remove);
+
+ /*
+ * In mprotect's case 6 (see comments on vma_merge),
+ * we are removing both mid and next vmas
+ */
+ if (vp->remove2) {
+ vp->remove = vp->remove2;
+ vp->remove2 = NULL;
+ goto again;
+ }
+ }
+ if (vp->insert && vp->file)
+ uprobe_mmap(vp->insert);
+ validate_mm(mm);
+}
+
+/*
+ * init_vma_prep() - Initializer wrapper for vma_prepare struct
+ * @vp: The vma_prepare struct
+ * @vma: The vma that will be altered once locked
+ */
+static void init_vma_prep(struct vma_prepare *vp,
+ struct vm_area_struct *vma)
+{
+ init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
+}
+
/*
* Close a vm structure and free it.
*/
@@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
vm_unacct_memory(nr_accounted);
}
-/*
- * init_vma_prep() - Initializer wrapper for vma_prepare struct
- * @vp: The vma_prepare struct
- * @vma: The vma that will be altered once locked
- */
-void init_vma_prep(struct vma_prepare *vp,
- struct vm_area_struct *vma)
-{
- init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
-}
-
-/*
- * Requires inode->i_mapping->i_mmap_rwsem
- */
-static void __remove_shared_vm_struct(struct vm_area_struct *vma,
- struct address_space *mapping)
-{
- if (vma_is_shared_maywrite(vma))
- mapping_unmap_writable(mapping);
-
- flush_dcache_mmap_lock(mapping);
- vma_interval_tree_remove(vma, &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
-}
-
/*
* vma has some anon_vma assigned, and is already inserted on that
* anon_vma's interval trees.
@@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}
-static void __vma_link_file(struct vm_area_struct *vma,
- struct address_space *mapping)
-{
- if (vma_is_shared_maywrite(vma))
- mapping_allow_writable(mapping);
-
- flush_dcache_mmap_lock(mapping);
- vma_interval_tree_insert(vma, &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
-}
-
-/*
- * vma_prepare() - Helper function for handling locking VMAs prior to altering
- * @vp: The initialized vma_prepare struct
- */
-void vma_prepare(struct vma_prepare *vp)
-{
- if (vp->file) {
- uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
-
- if (vp->adj_next)
- uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
- vp->adj_next->vm_end);
-
- i_mmap_lock_write(vp->mapping);
- if (vp->insert && vp->insert->vm_file) {
- /*
- * Put into interval tree now, so instantiated pages
- * are visible to arm/parisc __flush_dcache_page
- * throughout; but we cannot insert into address
- * space until vma start or end is updated.
- */
- __vma_link_file(vp->insert,
- vp->insert->vm_file->f_mapping);
- }
- }
-
- if (vp->anon_vma) {
- anon_vma_lock_write(vp->anon_vma);
- anon_vma_interval_tree_pre_update_vma(vp->vma);
- if (vp->adj_next)
- anon_vma_interval_tree_pre_update_vma(vp->adj_next);
- }
-
- if (vp->file) {
- flush_dcache_mmap_lock(vp->mapping);
- vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
- if (vp->adj_next)
- vma_interval_tree_remove(vp->adj_next,
- &vp->mapping->i_mmap);
- }
-
-}
-
/*
* dup_anon_vma() - Helper function to duplicate anon_vma
* @dst: The destination VMA
@@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm)
}
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
+/*
+ * vma_merge_new_vma - Attempt to merge a new VMA into address space
+ *
+ * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
+ * (exclusive), which we try to merge with any adjacent VMAs if possible.
+ *
+ * We are about to add a VMA to the address space starting at @vmg->start and
+ * ending at @vmg->end. There are three different possible scenarios:
+ *
+ * 1. There is a VMA with identical properties immediately adjacent to the
+ * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
+ * EXPAND that VMA:
+ *
+ * Proposed: |-----| or |-----|
+ * Existing: |----| |----|
+ *
+ * 2. There are VMAs with identical properties immediately adjacent to the
+ * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
+ * EXPAND the former and REMOVE the latter:
+ *
+ * Proposed: |-----|
+ * Existing: |----| |----|
+ *
+ * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
+ * VMAs do not have identical attributes - NO MERGE POSSIBLE.
+ *
+ * In instances where we can merge, this function returns the expanded VMA which
+ * will have its range adjusted accordingly and the underlying maple tree also
+ * adjusted.
+ *
+ * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
+ * to the VMA we expanded.
+ *
+ * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
+ * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
+ * expanded range.
+ *
+ * ASSUMPTIONS:
+ * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
+ * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
+ */
+struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
+{
+ bool is_special = vmg->flags & VM_SPECIAL;
+ struct vm_area_struct *prev = vmg->prev;
+ struct vm_area_struct *next = vmg->next;
+ unsigned long start = vmg->start;
+ unsigned long end = vmg->end;
+ pgoff_t pgoff = vmg->pgoff;
+ pgoff_t pglen = PHYS_PFN(end - start);
+
+ VM_WARN_ON(vmg->vma);
+
+ if (!prev && !next) {
+ /*
+ * Since the caller must have determined that the requested
+ * range is empty, vmg->vmi will be left pointing at the VMA
+ * immediately prior.
+ */
+ next = vmg->next = vma_next(vmg->vmi);
+ prev = vmg->prev = vma_prev(vmg->vmi);
+
+ /* Avoid maple tree re-walk. */
+ if (is_special && prev)
+ vma_iter_next_range(vmg->vmi);
+ }
+
+ /* If special mapping or no adjacent VMAs, nothing to merge. */
+ if (is_special || (!prev && !next))
+ return NULL;
+
+ /* If we can merge with the following VMA, adjust vmg accordingly. */
+ if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
+ /*
+ * We can adjust this here as can_vma_merge_after() doesn't
+ * touch vmg->end.
+ */
+ vmg->end = next->vm_end;
+ vmg->vma = next;
+ vmg->pgoff = next->vm_pgoff - pglen;
+
+ vmg->anon_vma = next->anon_vma;
+ }
+
+ /* If we can merge with the previous VMA, adjust vmg accordingly. */
+ if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
+ vmg->start = prev->vm_start;
+ vmg->vma = prev;
+ vmg->pgoff = prev->vm_pgoff;
+ } else if (prev) {
+ vma_iter_next_range(vmg->vmi);
+ }
+
+ /*
+ * Now try to expand adjacent VMA(s). This takes care of removing the
+ * following VMA if we have VMAs on both sides.
+ */
+ if (vmg->vma && !vma_expand(vmg)) {
+ khugepaged_enter_vma(vmg->vma, vmg->flags);
+ return vmg->vma;
+ }
+
+	/* If expansion failed, reset state. Allows us to retry merge later. */
+	if (vmg->vma == prev)
+		vma_iter_set(vmg->vmi, start);
+	vmg->vma = NULL;
+	vmg->anon_vma = NULL;
+	vmg->start = start;
+	vmg->end = end;
+	vmg->pgoff = pgoff;
+
+ return NULL;
+}
+
/*
* vma_expand - Expand an existing VMA
*
@@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm)
* vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
* vmg->next needs to be handled by the caller.
*
- * Returns: 0 on success
+ * Returns: 0 on success.
+ *
+ * ASSUMPTIONS:
+ * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
+ * - The caller must have set @vmg->prev and @vmg->next.
*/
int vma_expand(struct vma_merge_struct *vmg)
{
@@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg)
return 0;
}
-/*
- * vma_complete- Helper function for handling the unlocking after altering VMAs,
- * or for inserting a VMA.
- *
- * @vp: The vma_prepare struct
- * @vmi: The vma iterator
- * @mm: The mm_struct
- */
-void vma_complete(struct vma_prepare *vp,
- struct vma_iterator *vmi, struct mm_struct *mm)
-{
- if (vp->file) {
- if (vp->adj_next)
- vma_interval_tree_insert(vp->adj_next,
- &vp->mapping->i_mmap);
- vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
- flush_dcache_mmap_unlock(vp->mapping);
- }
-
- if (vp->remove && vp->file) {
- __remove_shared_vm_struct(vp->remove, vp->mapping);
- if (vp->remove2)
- __remove_shared_vm_struct(vp->remove2, vp->mapping);
- } else if (vp->insert) {
- /*
- * split_vma has split insert from vma, and needs
- * us to insert it before dropping the locks
- * (it may either follow vma or precede it).
- */
- vma_iter_store(vmi, vp->insert);
- mm->map_count++;
- }
-
- if (vp->anon_vma) {
- anon_vma_interval_tree_post_update_vma(vp->vma);
- if (vp->adj_next)
- anon_vma_interval_tree_post_update_vma(vp->adj_next);
- anon_vma_unlock_write(vp->anon_vma);
- }
-
- if (vp->file) {
- i_mmap_unlock_write(vp->mapping);
- uprobe_mmap(vp->vma);
-
- if (vp->adj_next)
- uprobe_mmap(vp->adj_next);
- }
-
- if (vp->remove) {
-again:
- vma_mark_detached(vp->remove, true);
- if (vp->file) {
- uprobe_munmap(vp->remove, vp->remove->vm_start,
- vp->remove->vm_end);
- fput(vp->file);
- }
- if (vp->remove->anon_vma)
- anon_vma_merge(vp->vma, vp->remove);
- mm->map_count--;
- mpol_put(vma_policy(vp->remove));
- if (!vp->remove2)
- WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
- vm_area_free(vp->remove);
-
- /*
- * In mprotect's case 6 (see comments on vma_merge),
- * we are removing both mid and next vmas
- */
- if (vp->remove2) {
- vp->remove = vp->remove2;
- vp->remove2 = NULL;
- goto again;
- }
- }
- if (vp->insert && vp->file)
- uprobe_mmap(vp->insert);
- validate_mm(mm);
-}
-
/*
* do_vmi_align_munmap() - munmap the aligned region from @start to @end.
* @vmi: The vma iterator
@@ -1261,20 +1378,6 @@ struct vm_area_struct
return vma_modify(&vmg);
}
-/*
- * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
- * must ensure that [start, end) does not overlap any existing VMA.
- */
-struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
-{
- if (!vmg->prev) {
- vmg->prev = vma_prev(vmg->vmi);
- vma_iter_set(vmg->vmi, vmg->start);
- }
-
- return vma_merge(vmg);
-}
-
/*
* Expand vma by delta bytes, potentially merging with an immediately adjacent
* VMA with identical properties.
@@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
.anon_name = anon_vma_name(vma),
};
- /* vma is specified as prev, so case 1 or 2 will apply. */
- return vma_merge(&vmg);
+ return vma_merge_new_vma(&vmg);
}
void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
@@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
struct vm_area_struct *vma = *vmap;
unsigned long vma_start = vma->vm_start;
struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *new_vma, *prev;
+ struct vm_area_struct *new_vma;
bool faulted_in_anon_vma = true;
VMA_ITERATOR(vmi, mm, addr);
+ struct vma_merge_struct vmg = {
+ .vmi = &vmi,
+ .start = addr,
+ .end = addr + len,
+ .flags = vma->vm_flags,
+ .pgoff = pgoff,
+ .file = vma->vm_file,
+ .anon_vma = vma->anon_vma,
+ .policy = vma_policy(vma),
+ .uffd_ctx = vma->vm_userfaultfd_ctx,
+ .anon_name = anon_vma_name(vma),
+ };
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
*/
if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
- pgoff = addr >> PAGE_SHIFT;
+ pgoff = vmg.pgoff = addr >> PAGE_SHIFT;
faulted_in_anon_vma = false;
}
- new_vma = find_vma_prev(mm, addr, &prev);
+ new_vma = find_vma_prev(mm, addr, &vmg.prev);
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
- new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
+ vmg.next = vma_next(&vmi);
+ vma_prev(&vmi);
+
+ new_vma = vma_merge_new_vma(&vmg);
+
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/vma.h b/mm/vma.h
index 50459f9e4c7f..bbb173053f34 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
/* Required for expand_downwards(). */
void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
-/* Required for do_brk_flags(). */
-void vma_prepare(struct vma_prepare *vp);
-
-/* Required for do_brk_flags(). */
-void init_vma_prep(struct vma_prepare *vp,
- struct vm_area_struct *vma);
-
-/* Required for do_brk_flags(). */
-void vma_complete(struct vma_prepare *vp,
- struct vma_iterator *vmi, struct mm_struct *mm);
-
int vma_expand(struct vma_merge_struct *vmg);
int vma_shrink(struct vma_merge_struct *vmg);
@@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
struct vm_area_struct *next, unsigned long start,
unsigned long end, unsigned long tree_end, bool mm_wr_locked);
-/*
- * Can we merge the VMA described by vmg into the following VMA vmg->next?
- *
- * Required by mmap_region().
- */
-bool can_vma_merge_before(struct vma_merge_struct *vmg);
-
-/*
- * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
- *
- * Required by mmap_region() and do_brk_flags().
- */
-bool can_vma_merge_after(struct vma_merge_struct *vmg);
-
/* We are about to modify the VMA's flags. */
struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
struct vm_area_struct *prev,
@@ -133,31 +108,7 @@ struct vm_area_struct
unsigned long new_flags,
struct vm_userfaultfd_ctx new_ctx);
-struct vm_area_struct
-*vma_merge_new_vma(struct vma_merge_struct *vmg);
-
-/* Temporary convenience wrapper. */
-static inline struct vm_area_struct
-*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
- struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgoff_t pgoff)
-{
- struct vma_merge_struct vmg = {
- .vmi = vmi,
- .prev = prev,
- .start = start,
- .end = end,
- .flags = vma->vm_flags,
- .file = vma->vm_file,
- .anon_vma = vma->anon_vma,
- .pgoff = pgoff,
- .policy = vma_policy(vma),
- .uffd_ctx = vma->vm_userfaultfd_ctx,
- .anon_name = anon_vma_name(vma),
- };
-
- return vma_merge_new_vma(&vmg);
-}
+struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
/*
* Temporary wrapper around vma_merge() so we can have a common interface for
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 40797a819d3d..a39a734282d0 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi)
mas_destroy(&vmi->mas);
}
+static inline
+struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
+{
+ return mas_next_range(&vmi->mas, ULONG_MAX);
+}
+
static inline void vm_acct_memory(long pages)
{
}
--
2.45.2
^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH 07/10] mm: avoid using vma_merge() for new VMAs
2024-08-05 12:13 ` [PATCH 07/10] mm: avoid using vma_merge() for new VMAs Lorenzo Stoakes
@ 2024-08-06 13:04 ` Petr Tesařík
2024-08-06 13:44 ` Lorenzo Stoakes
2024-08-08 16:45 ` Vlastimil Babka
2024-08-09 15:23 ` Liam R. Howlett
2 siblings, 1 reply; 53+ messages in thread
From: Petr Tesařík @ 2024-08-06 13:04 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Mon, 5 Aug 2024 13:13:54 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> to use vma_expand() rather than invoke a full vma_merge() operation.
>
> Abstract this logic and eliminate all of the open-coding, and also use the
> same logic for all cases where we add new VMAs, so that rather than
> ultimately invoking a full vma_merge() we use vma_expand().
>
> We implement this by replacing vma_merge_new_vma() with this newly
> abstracted logic.
>
> Doing so removes duplication and simplifies VMA merging in all such cases,
> laying the ground for us to eliminate the merging of new VMAs in
> vma_merge() altogether.
>
> This makes it far easier to understand what is happening in these cases,
> avoiding confusion and bugs, and allowing for future optimisation.
>
> As a result of this change we are also able to make vma_prepare(),
> init_vma_prep(), vma_complete(), can_vma_merge_before() and
> can_vma_merge_after() static and internal to vma.c.
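
For anyone skimming the thread, the calling convention this converges on
looks roughly like the below - a minimal sketch pieced together from the
hunks quoted in this mail, not a verbatim excerpt (vmi, addr, len and
vm_flags are assumed to be in scope; the allocation fallback and error
handling are elided):

	struct vma_merge_struct vmg = {
		.vmi	= &vmi,
		.start	= addr,
		.end	= addr + len,
		.flags	= vm_flags,
		.pgoff	= addr >> PAGE_SHIFT,
	};
	struct vm_area_struct *vma;

	/* Try to expand an adjacent VMA over [addr, addr + len). */
	vma = vma_merge_new_vma(&vmg);
	if (!vma) {
		/* No merge possible - fall back to allocating a new VMA. */
	}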
This patch truly rocks. Let me just say: Wow!
Petr T
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/mmap.c | 79 ++---
> mm/vma.c | 482 +++++++++++++++++++------------
> mm/vma.h | 51 +---
> tools/testing/vma/vma_internal.h | 6 +
> 4 files changed, 324 insertions(+), 294 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index f6593a81f73d..c03f50f46396 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> {
> struct mm_struct *mm = current->mm;
> struct vm_area_struct *vma = NULL;
> - struct vm_area_struct *next, *prev, *merge;
> - pgoff_t pglen = len >> PAGE_SHIFT;
> + struct vm_area_struct *merge;
> unsigned long charged = 0;
> unsigned long end = addr + len;
> bool writable_file_mapping = false;
> @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> vm_flags |= VM_ACCOUNT;
> }
>
> - next = vmg.next = vma_next(&vmi);
> - prev = vmg.prev = vma_prev(&vmi);
> - if (vm_flags & VM_SPECIAL) {
> - if (prev)
> - vma_iter_next_range(&vmi);
> - goto cannot_expand;
> - }
> -
> - /* Attempt to expand an old mapping */
> - /* Check next */
> - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> - /* We can adjust this as can_vma_merge_after() doesn't touch */
> - vmg.end = next->vm_end;
> - vma = vmg.vma = next;
> - vmg.pgoff = next->vm_pgoff - pglen;
> -
> - /* We may merge our NULL anon_vma with non-NULL in next. */
> - vmg.anon_vma = vma->anon_vma;
> - }
> -
> - /* Check prev */
> - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> - vmg.start = prev->vm_start;
> - vma = vmg.vma = prev;
> - vmg.pgoff = prev->vm_pgoff;
> - } else if (prev) {
> - vma_iter_next_range(&vmi);
> - }
> -
> - /* Actually expand, if possible */
> - if (vma && !vma_expand(&vmg)) {
> - khugepaged_enter_vma(vma, vm_flags);
> + vma = vma_merge_new_vma(&vmg);
> + if (vma)
> goto expanded;
> - }
> -
> - if (vma == prev)
> - vma_iter_set(&vmi, addr);
> -cannot_expand:
>
> /*
> * Determine the object being mapped and call the appropriate
> @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> * If vm_flags changed after call_mmap(), we should try merge
> * vma again as we may succeed this time.
> */
> - if (unlikely(vm_flags != vma->vm_flags && prev)) {
> - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> - vma->vm_start, vma->vm_end,
> - vma->vm_pgoff);
> + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> + merge = vma_merge_new_vma(&vmg);
> +
> if (merge) {
> /*
> * ->mmap() can change vma->vm_file and fput
> @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>
> vma_iter_set(&vmi, vma->vm_end);
> /* Undo any partial mapping done by a device driver. */
> - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
> + unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start,
> vma->vm_end, vma->vm_end, true);
> }
> if (writable_file_mapping)
> @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> unsigned long addr, unsigned long len, unsigned long flags)
> {
> struct mm_struct *mm = current->mm;
> - struct vma_prepare vp;
>
> /*
> * Check against address space limits by the changed size
> @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> */
> if (vma && vma->vm_end == addr) {
> struct vma_merge_struct vmg = {
> + .vmi = vmi,
> .prev = vma,
> + .next = NULL,
> + .start = addr,
> + .end = addr + len,
> .flags = flags,
> .pgoff = addr >> PAGE_SHIFT,
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> };
>
> - if (can_vma_merge_after(&vmg)) {
> - vma_iter_config(vmi, vma->vm_start, addr + len);
> - if (vma_iter_prealloc(vmi, vma))
> - goto unacct_fail;
> -
> - vma_start_write(vma);
> -
> - init_vma_prep(&vp, vma);
> - vma_prepare(&vp);
> - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> - vma->vm_end = addr + len;
> - vm_flags_set(vma, VM_SOFTDIRTY);
> - vma_iter_store(vmi, vma);
> -
> - vma_complete(&vp, vmi, mm);
> - khugepaged_enter_vma(vma, flags);
> + if (vma_merge_new_vma(&vmg))
> goto out;
> - }
> }
>
> if (vma)
> diff --git a/mm/vma.c b/mm/vma.c
> index 55615392e8d2..a404cf718f9e 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> *
> * We assume the vma may be removed as part of the merge.
> */
> -bool
> -can_vma_merge_before(struct vma_merge_struct *vmg)
> +static bool can_vma_merge_before(struct vma_merge_struct *vmg)
> {
> pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
>
> @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg)
> *
> * We assume that vma is not removed as part of the merge.
> */
> -bool can_vma_merge_after(struct vma_merge_struct *vmg)
> +static bool can_vma_merge_after(struct vma_merge_struct *vmg)
> {
> if (is_mergeable_vma(vmg, false) &&
> is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg)
> return false;
> }
>
> +static void __vma_link_file(struct vm_area_struct *vma,
> + struct address_space *mapping)
> +{
> + if (vma_is_shared_maywrite(vma))
> + mapping_allow_writable(mapping);
> +
> + flush_dcache_mmap_lock(mapping);
> + vma_interval_tree_insert(vma, &mapping->i_mmap);
> + flush_dcache_mmap_unlock(mapping);
> +}
> +
> +/*
> + * Requires inode->i_mapping->i_mmap_rwsem
> + */
> +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> + struct address_space *mapping)
> +{
> + if (vma_is_shared_maywrite(vma))
> + mapping_unmap_writable(mapping);
> +
> + flush_dcache_mmap_lock(mapping);
> + vma_interval_tree_remove(vma, &mapping->i_mmap);
> + flush_dcache_mmap_unlock(mapping);
> +}
> +
> +/*
> + * vma_prepare() - Helper function for handling locking VMAs prior to altering
> + * @vp: The initialized vma_prepare struct
> + */
> +static void vma_prepare(struct vma_prepare *vp)
> +{
> + if (vp->file) {
> + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> +
> + if (vp->adj_next)
> + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> + vp->adj_next->vm_end);
> +
> + i_mmap_lock_write(vp->mapping);
> + if (vp->insert && vp->insert->vm_file) {
> + /*
> + * Put into interval tree now, so instantiated pages
> + * are visible to arm/parisc __flush_dcache_page
> + * throughout; but we cannot insert into address
> + * space until vma start or end is updated.
> + */
> + __vma_link_file(vp->insert,
> + vp->insert->vm_file->f_mapping);
> + }
> + }
> +
> + if (vp->anon_vma) {
> + anon_vma_lock_write(vp->anon_vma);
> + anon_vma_interval_tree_pre_update_vma(vp->vma);
> + if (vp->adj_next)
> + anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> + }
> +
> + if (vp->file) {
> + flush_dcache_mmap_lock(vp->mapping);
> + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> + if (vp->adj_next)
> + vma_interval_tree_remove(vp->adj_next,
> + &vp->mapping->i_mmap);
> + }
> +
> +}
> +
> +/*
> + * vma_complete- Helper function for handling the unlocking after altering VMAs,
> + * or for inserting a VMA.
> + *
> + * @vp: The vma_prepare struct
> + * @vmi: The vma iterator
> + * @mm: The mm_struct
> + */
> +static void vma_complete(struct vma_prepare *vp,
> + struct vma_iterator *vmi, struct mm_struct *mm)
> +{
> + if (vp->file) {
> + if (vp->adj_next)
> + vma_interval_tree_insert(vp->adj_next,
> + &vp->mapping->i_mmap);
> + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> + flush_dcache_mmap_unlock(vp->mapping);
> + }
> +
> + if (vp->remove && vp->file) {
> + __remove_shared_vm_struct(vp->remove, vp->mapping);
> + if (vp->remove2)
> + __remove_shared_vm_struct(vp->remove2, vp->mapping);
> + } else if (vp->insert) {
> + /*
> + * split_vma has split insert from vma, and needs
> + * us to insert it before dropping the locks
> + * (it may either follow vma or precede it).
> + */
> + vma_iter_store(vmi, vp->insert);
> + mm->map_count++;
> + }
> +
> + if (vp->anon_vma) {
> + anon_vma_interval_tree_post_update_vma(vp->vma);
> + if (vp->adj_next)
> + anon_vma_interval_tree_post_update_vma(vp->adj_next);
> + anon_vma_unlock_write(vp->anon_vma);
> + }
> +
> + if (vp->file) {
> + i_mmap_unlock_write(vp->mapping);
> + uprobe_mmap(vp->vma);
> +
> + if (vp->adj_next)
> + uprobe_mmap(vp->adj_next);
> + }
> +
> + if (vp->remove) {
> +again:
> + vma_mark_detached(vp->remove, true);
> + if (vp->file) {
> + uprobe_munmap(vp->remove, vp->remove->vm_start,
> + vp->remove->vm_end);
> + fput(vp->file);
> + }
> + if (vp->remove->anon_vma)
> + anon_vma_merge(vp->vma, vp->remove);
> + mm->map_count--;
> + mpol_put(vma_policy(vp->remove));
> + if (!vp->remove2)
> + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> + vm_area_free(vp->remove);
> +
> + /*
> + * In mprotect's case 6 (see comments on vma_merge),
> + * we are removing both mid and next vmas
> + */
> + if (vp->remove2) {
> + vp->remove = vp->remove2;
> + vp->remove2 = NULL;
> + goto again;
> + }
> + }
> + if (vp->insert && vp->file)
> + uprobe_mmap(vp->insert);
> + validate_mm(mm);
> +}
> +
> +/*
> + * init_vma_prep() - Initializer wrapper for vma_prepare struct
> + * @vp: The vma_prepare struct
> + * @vma: The vma that will be altered once locked
> + */
> +static void init_vma_prep(struct vma_prepare *vp,
> + struct vm_area_struct *vma)
> +{
> + init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> +}
> +
> /*
> * Close a vm structure and free it.
> */
> @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
> vm_unacct_memory(nr_accounted);
> }
>
> -/*
> - * init_vma_prep() - Initializer wrapper for vma_prepare struct
> - * @vp: The vma_prepare struct
> - * @vma: The vma that will be altered once locked
> - */
> -void init_vma_prep(struct vma_prepare *vp,
> - struct vm_area_struct *vma)
> -{
> - init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> -}
> -
> -/*
> - * Requires inode->i_mapping->i_mmap_rwsem
> - */
> -static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> - struct address_space *mapping)
> -{
> - if (vma_is_shared_maywrite(vma))
> - mapping_unmap_writable(mapping);
> -
> - flush_dcache_mmap_lock(mapping);
> - vma_interval_tree_remove(vma, &mapping->i_mmap);
> - flush_dcache_mmap_unlock(mapping);
> -}
> -
> /*
> * vma has some anon_vma assigned, and is already inserted on that
> * anon_vma's interval trees.
> @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
> }
>
> -static void __vma_link_file(struct vm_area_struct *vma,
> - struct address_space *mapping)
> -{
> - if (vma_is_shared_maywrite(vma))
> - mapping_allow_writable(mapping);
> -
> - flush_dcache_mmap_lock(mapping);
> - vma_interval_tree_insert(vma, &mapping->i_mmap);
> - flush_dcache_mmap_unlock(mapping);
> -}
> -
> -/*
> - * vma_prepare() - Helper function for handling locking VMAs prior to altering
> - * @vp: The initialized vma_prepare struct
> - */
> -void vma_prepare(struct vma_prepare *vp)
> -{
> - if (vp->file) {
> - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> -
> - if (vp->adj_next)
> - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> - vp->adj_next->vm_end);
> -
> - i_mmap_lock_write(vp->mapping);
> - if (vp->insert && vp->insert->vm_file) {
> - /*
> - * Put into interval tree now, so instantiated pages
> - * are visible to arm/parisc __flush_dcache_page
> - * throughout; but we cannot insert into address
> - * space until vma start or end is updated.
> - */
> - __vma_link_file(vp->insert,
> - vp->insert->vm_file->f_mapping);
> - }
> - }
> -
> - if (vp->anon_vma) {
> - anon_vma_lock_write(vp->anon_vma);
> - anon_vma_interval_tree_pre_update_vma(vp->vma);
> - if (vp->adj_next)
> - anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> - }
> -
> - if (vp->file) {
> - flush_dcache_mmap_lock(vp->mapping);
> - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> - if (vp->adj_next)
> - vma_interval_tree_remove(vp->adj_next,
> - &vp->mapping->i_mmap);
> - }
> -
> -}
> -
> /*
> * dup_anon_vma() - Helper function to duplicate anon_vma
> * @dst: The destination VMA
> @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm)
> }
> #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
>
> +/*
> + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> + *
> + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> + * (exclusive), which we try to merge with any adjacent VMAs if possible.
> + *
> + * We are about to add a VMA to the address space starting at @vmg->start and
> + * ending at @vmg->end. There are three different possible scenarios:
> + *
> + * 1. There is a VMA with identical properties immediately adjacent to the
> + * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> + * EXPAND that VMA:
> + *
> + * Proposed: |-----| or |-----|
> + * Existing: |----| |----|
> + *
> + * 2. There are VMAs with identical properties immediately adjacent to the
> + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> + * EXPAND the former and REMOVE the latter:
> + *
> + * Proposed: |-----|
> + * Existing: |----| |----|
> + *
> + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> + * VMAs do not have identical attributes - NO MERGE POSSIBLE.
> + *
> + * In instances where we can merge, this function returns the expanded VMA which
> + * will have its range adjusted accordingly and the underlying maple tree also
> + * adjusted.
> + *
> + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> + * to the VMA we expanded.
> + *
> + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> + * expanded range.
> + *
> + * ASSUMPTIONS:
> + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
> + */
> +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> +{
> + bool is_special = vmg->flags & VM_SPECIAL;
> + struct vm_area_struct *prev = vmg->prev;
> + struct vm_area_struct *next = vmg->next;
> + unsigned long start = vmg->start;
> + unsigned long end = vmg->end;
> + pgoff_t pgoff = vmg->pgoff;
> + pgoff_t pglen = PHYS_PFN(end - start);
> +
> + VM_WARN_ON(vmg->vma);
> +
> + if (!prev && !next) {
> + /*
> + * Since the caller must have determined that the requested
> + * range is empty, vmg->vmi will be left pointing at the VMA
> + * immediately prior.
> + */
> + next = vmg->next = vma_next(vmg->vmi);
> + prev = vmg->prev = vma_prev(vmg->vmi);
> +
> + /* Avoid maple tree re-walk. */
> + if (is_special && prev)
> + vma_iter_next_range(vmg->vmi);
> + }
> +
> + /* If special mapping or no adjacent VMAs, nothing to merge. */
> + if (is_special || (!prev && !next))
> + return NULL;
> +
> + /* If we can merge with the following VMA, adjust vmg accordingly. */
> + if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> + /*
> + * We can adjust this here as can_vma_merge_after() doesn't
> + * touch vmg->end.
> + */
> + vmg->end = next->vm_end;
> + vmg->vma = next;
> + vmg->pgoff = next->vm_pgoff - pglen;
> +
> + vmg->anon_vma = next->anon_vma;
> + }
> +
> + /* If we can merge with the previous VMA, adjust vmg accordingly. */
> + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> + vmg->start = prev->vm_start;
> + vmg->vma = prev;
> + vmg->pgoff = prev->vm_pgoff;
> + } else if (prev) {
> + vma_iter_next_range(vmg->vmi);
> + }
> +
> + /*
> + * Now try to expand adjacent VMA(s). This takes care of removing the
> + * following VMA if we have VMAs on both sides.
> + */
> + if (vmg->vma && !vma_expand(vmg)) {
> + khugepaged_enter_vma(vmg->vma, vmg->flags);
> + return vmg->vma;
> + }
> +
> + /* If expansion failed, reset state. Allows us to retry merge later. */
> + vmg->vma = NULL;
> + vmg->anon_vma = NULL;
> + vmg->start = start;
> + vmg->end = end;
> + vmg->pgoff = pgoff;
> + if (vmg->vma == prev)
> + vma_iter_set(vmg->vmi, start);
> +
> + return NULL;
> +}
> +
> /*
> * vma_expand - Expand an existing VMA
> *
> @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm)
> * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
> * vmg->next needs to be handled by the caller.
> *
> - * Returns: 0 on success
> + * Returns: 0 on success.
> + *
> + * ASSUMPTIONS:
> + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
> + * - The caller must have set @vmg->prev and @vmg->next.
> */
> int vma_expand(struct vma_merge_struct *vmg)
> {
> @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg)
> return 0;
> }
>
> -/*
> - * vma_complete- Helper function for handling the unlocking after altering VMAs,
> - * or for inserting a VMA.
> - *
> - * @vp: The vma_prepare struct
> - * @vmi: The vma iterator
> - * @mm: The mm_struct
> - */
> -void vma_complete(struct vma_prepare *vp,
> - struct vma_iterator *vmi, struct mm_struct *mm)
> -{
> - if (vp->file) {
> - if (vp->adj_next)
> - vma_interval_tree_insert(vp->adj_next,
> - &vp->mapping->i_mmap);
> - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> - flush_dcache_mmap_unlock(vp->mapping);
> - }
> -
> - if (vp->remove && vp->file) {
> - __remove_shared_vm_struct(vp->remove, vp->mapping);
> - if (vp->remove2)
> - __remove_shared_vm_struct(vp->remove2, vp->mapping);
> - } else if (vp->insert) {
> - /*
> - * split_vma has split insert from vma, and needs
> - * us to insert it before dropping the locks
> - * (it may either follow vma or precede it).
> - */
> - vma_iter_store(vmi, vp->insert);
> - mm->map_count++;
> - }
> -
> - if (vp->anon_vma) {
> - anon_vma_interval_tree_post_update_vma(vp->vma);
> - if (vp->adj_next)
> - anon_vma_interval_tree_post_update_vma(vp->adj_next);
> - anon_vma_unlock_write(vp->anon_vma);
> - }
> -
> - if (vp->file) {
> - i_mmap_unlock_write(vp->mapping);
> - uprobe_mmap(vp->vma);
> -
> - if (vp->adj_next)
> - uprobe_mmap(vp->adj_next);
> - }
> -
> - if (vp->remove) {
> -again:
> - vma_mark_detached(vp->remove, true);
> - if (vp->file) {
> - uprobe_munmap(vp->remove, vp->remove->vm_start,
> - vp->remove->vm_end);
> - fput(vp->file);
> - }
> - if (vp->remove->anon_vma)
> - anon_vma_merge(vp->vma, vp->remove);
> - mm->map_count--;
> - mpol_put(vma_policy(vp->remove));
> - if (!vp->remove2)
> - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> - vm_area_free(vp->remove);
> -
> - /*
> - * In mprotect's case 6 (see comments on vma_merge),
> - * we are removing both mid and next vmas
> - */
> - if (vp->remove2) {
> - vp->remove = vp->remove2;
> - vp->remove2 = NULL;
> - goto again;
> - }
> - }
> - if (vp->insert && vp->file)
> - uprobe_mmap(vp->insert);
> - validate_mm(mm);
> -}
> -
> /*
> * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
> * @vmi: The vma iterator
> @@ -1261,20 +1378,6 @@ struct vm_area_struct
> return vma_modify(&vmg);
> }
>
> -/*
> - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
> - * must ensure that [start, end) does not overlap any existing VMA.
> - */
> -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> -{
> - if (!vmg->prev) {
> - vmg->prev = vma_prev(vmg->vmi);
> - vma_iter_set(vmg->vmi, vmg->start);
> - }
> -
> - return vma_merge(vmg);
> -}
> -
> /*
> * Expand vma by delta bytes, potentially merging with an immediately adjacent
> * VMA with identical properties.
> @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> .anon_name = anon_vma_name(vma),
> };
>
> - /* vma is specified as prev, so case 1 or 2 will apply. */
> - return vma_merge(&vmg);
> + return vma_merge_new_vma(&vmg);
> }
>
> void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> struct vm_area_struct *vma = *vmap;
> unsigned long vma_start = vma->vm_start;
> struct mm_struct *mm = vma->vm_mm;
> - struct vm_area_struct *new_vma, *prev;
> + struct vm_area_struct *new_vma;
> bool faulted_in_anon_vma = true;
> VMA_ITERATOR(vmi, mm, addr);
> + struct vma_merge_struct vmg = {
> + .vmi = &vmi,
> + .start = addr,
> + .end = addr + len,
> + .flags = vma->vm_flags,
> + .pgoff = pgoff,
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
>
> /*
> * If anonymous vma has not yet been faulted, update new pgoff
> * to match new location, to increase its chance of merging.
> */
> if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> - pgoff = addr >> PAGE_SHIFT;
> + pgoff = vmg.pgoff = addr >> PAGE_SHIFT;
> faulted_in_anon_vma = false;
> }
>
> - new_vma = find_vma_prev(mm, addr, &prev);
> + new_vma = find_vma_prev(mm, addr, &vmg.prev);
> if (new_vma && new_vma->vm_start < addr + len)
> return NULL; /* should never get here */
>
> - new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
> + vmg.next = vma_next(&vmi);
> + vma_prev(&vmi);
> +
> + new_vma = vma_merge_new_vma(&vmg);
> +
> if (new_vma) {
> /*
> * Source vma may have been merged into new_vma
> diff --git a/mm/vma.h b/mm/vma.h
> index 50459f9e4c7f..bbb173053f34 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
> /* Required for expand_downwards(). */
> void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
>
> -/* Required for do_brk_flags(). */
> -void vma_prepare(struct vma_prepare *vp);
> -
> -/* Required for do_brk_flags(). */
> -void init_vma_prep(struct vma_prepare *vp,
> - struct vm_area_struct *vma);
> -
> -/* Required for do_brk_flags(). */
> -void vma_complete(struct vma_prepare *vp,
> - struct vma_iterator *vmi, struct mm_struct *mm);
> -
> int vma_expand(struct vma_merge_struct *vmg);
> int vma_shrink(struct vma_merge_struct *vmg);
>
> @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> struct vm_area_struct *next, unsigned long start,
> unsigned long end, unsigned long tree_end, bool mm_wr_locked);
>
> -/*
> - * Can we merge the VMA described by vmg into the following VMA vmg->next?
> - *
> - * Required by mmap_region().
> - */
> -bool can_vma_merge_before(struct vma_merge_struct *vmg);
> -
> -/*
> - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> - *
> - * Required by mmap_region() and do_brk_flags().
> - */
> -bool can_vma_merge_after(struct vma_merge_struct *vmg);
> -
> /* We are about to modify the VMA's flags. */
> struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> struct vm_area_struct *prev,
> @@ -133,31 +108,7 @@ struct vm_area_struct
> unsigned long new_flags,
> struct vm_userfaultfd_ctx new_ctx);
>
> -struct vm_area_struct
> -*vma_merge_new_vma(struct vma_merge_struct *vmg);
> -
> -/* Temporary convenience wrapper. */
> -static inline struct vm_area_struct
> -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
> - struct vm_area_struct *vma, unsigned long start,
> - unsigned long end, pgoff_t pgoff)
> -{
> - struct vma_merge_struct vmg = {
> - .vmi = vmi,
> - .prev = prev,
> - .start = start,
> - .end = end,
> - .flags = vma->vm_flags,
> - .file = vma->vm_file,
> - .anon_vma = vma->anon_vma,
> - .pgoff = pgoff,
> - .policy = vma_policy(vma),
> - .uffd_ctx = vma->vm_userfaultfd_ctx,
> - .anon_name = anon_vma_name(vma),
> - };
> -
> - return vma_merge_new_vma(&vmg);
> -}
> +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
>
> /*
> * Temporary wrapper around vma_merge() so we can have a common interface for
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 40797a819d3d..a39a734282d0 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi)
> mas_destroy(&vmi->mas);
> }
>
> +static inline
> +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
> +{
> + return mas_next_range(&vmi->mas, ULONG_MAX);
> +}
> +
> static inline void vm_acct_memory(long pages)
> {
> }
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 07/10] mm: avoid using vma_merge() for new VMAs
2024-08-06 13:04 ` Petr Tesařík
@ 2024-08-06 13:44 ` Lorenzo Stoakes
0 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-06 13:44 UTC (permalink / raw)
To: Petr Tesařík
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, Aug 06, 2024 at 03:04:22PM GMT, Petr Tesařík wrote:
> On Mon, 5 Aug 2024 13:13:54 +0100
> Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>
> > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > to use vma_expand() rather than invoke a full vma_merge() operation.
> >
> > Abstract this logic and eliminate all of the open-coding, and also use the
> > same logic for all cases where we add new VMAs - that is, use vma_expand()
> > rather than ultimately invoking vma_merge().
> >
> > We implement this by replacing vma_merge_new_vma() with this newly
> > abstracted logic.
> >
> > Doing so removes duplication and simplifies VMA merging in all such cases,
> > laying the ground for us to eliminate the merging of new VMAs in
> > vma_merge() altogether.
> >
> > This makes it far easier to understand what is happening in these cases,
> > avoiding confusion and bugs, and allowing for future optimisation.
> >
> > As a result of this change we are also able to make vma_prepare(),
> > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > can_vma_merge_after() static and internal to vma.c.
>
> This patch truly rocks. Let me just say: Wow!
Thanks!
>
> Petr T
>
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > mm/mmap.c | 79 ++---
> > mm/vma.c | 482 +++++++++++++++++++------------
> > mm/vma.h | 51 +---
> > tools/testing/vma/vma_internal.h | 6 +
> > 4 files changed, 324 insertions(+), 294 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index f6593a81f73d..c03f50f46396 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > {
> > struct mm_struct *mm = current->mm;
> > struct vm_area_struct *vma = NULL;
> > - struct vm_area_struct *next, *prev, *merge;
> > - pgoff_t pglen = len >> PAGE_SHIFT;
> > + struct vm_area_struct *merge;
> > unsigned long charged = 0;
> > unsigned long end = addr + len;
> > bool writable_file_mapping = false;
> > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > vm_flags |= VM_ACCOUNT;
> > }
> >
> > - next = vmg.next = vma_next(&vmi);
> > - prev = vmg.prev = vma_prev(&vmi);
> > - if (vm_flags & VM_SPECIAL) {
> > - if (prev)
> > - vma_iter_next_range(&vmi);
> > - goto cannot_expand;
> > - }
> > -
> > - /* Attempt to expand an old mapping */
> > - /* Check next */
> > - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> > - /* We can adjust this as can_vma_merge_after() doesn't touch */
> > - vmg.end = next->vm_end;
> > - vma = vmg.vma = next;
> > - vmg.pgoff = next->vm_pgoff - pglen;
> > -
> > - /* We may merge our NULL anon_vma with non-NULL in next. */
> > - vmg.anon_vma = vma->anon_vma;
> > - }
> > -
> > - /* Check prev */
> > - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> > - vmg.start = prev->vm_start;
> > - vma = vmg.vma = prev;
> > - vmg.pgoff = prev->vm_pgoff;
> > - } else if (prev) {
> > - vma_iter_next_range(&vmi);
> > - }
> > -
> > - /* Actually expand, if possible */
> > - if (vma && !vma_expand(&vmg)) {
> > - khugepaged_enter_vma(vma, vm_flags);
> > + vma = vma_merge_new_vma(&vmg);
> > + if (vma)
> > goto expanded;
> > - }
> > -
> > - if (vma == prev)
> > - vma_iter_set(&vmi, addr);
> > -cannot_expand:
> >
> > /*
> > * Determine the object being mapped and call the appropriate
> > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > * If vm_flags changed after call_mmap(), we should try merge
> > * vma again as we may succeed this time.
> > */
> > - if (unlikely(vm_flags != vma->vm_flags && prev)) {
> > - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> > - vma->vm_start, vma->vm_end,
> > - vma->vm_pgoff);
> > + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> > + merge = vma_merge_new_vma(&vmg);
> > +
> > if (merge) {
> > /*
> > * ->mmap() can change vma->vm_file and fput
> > @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >
> > vma_iter_set(&vmi, vma->vm_end);
> > /* Undo any partial mapping done by a device driver. */
> > - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
> > + unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start,
> > vma->vm_end, vma->vm_end, true);
> > }
> > if (writable_file_mapping)
> > @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > unsigned long addr, unsigned long len, unsigned long flags)
> > {
> > struct mm_struct *mm = current->mm;
> > - struct vma_prepare vp;
> >
> > /*
> > * Check against address space limits by the changed size
> > @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > */
> > if (vma && vma->vm_end == addr) {
> > struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > .prev = vma,
> > + .next = NULL,
> > + .start = addr,
> > + .end = addr + len,
> > .flags = flags,
> > .pgoff = addr >> PAGE_SHIFT,
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > };
> >
> > - if (can_vma_merge_after(&vmg)) {
> > - vma_iter_config(vmi, vma->vm_start, addr + len);
> > - if (vma_iter_prealloc(vmi, vma))
> > - goto unacct_fail;
> > -
> > - vma_start_write(vma);
> > -
> > - init_vma_prep(&vp, vma);
> > - vma_prepare(&vp);
> > - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > - vma->vm_end = addr + len;
> > - vm_flags_set(vma, VM_SOFTDIRTY);
> > - vma_iter_store(vmi, vma);
> > -
> > - vma_complete(&vp, vmi, mm);
> > - khugepaged_enter_vma(vma, flags);
> > + if (vma_merge_new_vma(&vmg))
> > goto out;
> > - }
> > }
> >
> > if (vma)
> > diff --git a/mm/vma.c b/mm/vma.c
> > index 55615392e8d2..a404cf718f9e 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> > *
> > * We assume the vma may be removed as part of the merge.
> > */
> > -bool
> > -can_vma_merge_before(struct vma_merge_struct *vmg)
> > +static bool can_vma_merge_before(struct vma_merge_struct *vmg)
> > {
> > pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> >
> > @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg)
> > *
> > * We assume that vma is not removed as part of the merge.
> > */
> > -bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > +static bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > {
> > if (is_mergeable_vma(vmg, false) &&
> > is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> > @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > return false;
> > }
> >
> > +static void __vma_link_file(struct vm_area_struct *vma,
> > + struct address_space *mapping)
> > +{
> > + if (vma_is_shared_maywrite(vma))
> > + mapping_allow_writable(mapping);
> > +
> > + flush_dcache_mmap_lock(mapping);
> > + vma_interval_tree_insert(vma, &mapping->i_mmap);
> > + flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * Requires inode->i_mapping->i_mmap_rwsem
> > + */
> > +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > + struct address_space *mapping)
> > +{
> > + if (vma_is_shared_maywrite(vma))
> > + mapping_unmap_writable(mapping);
> > +
> > + flush_dcache_mmap_lock(mapping);
> > + vma_interval_tree_remove(vma, &mapping->i_mmap);
> > + flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > + * @vp: The initialized vma_prepare struct
> > + */
> > +static void vma_prepare(struct vma_prepare *vp)
> > +{
> > + if (vp->file) {
> > + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > +
> > + if (vp->adj_next)
> > + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > + vp->adj_next->vm_end);
> > +
> > + i_mmap_lock_write(vp->mapping);
> > + if (vp->insert && vp->insert->vm_file) {
> > + /*
> > + * Put into interval tree now, so instantiated pages
> > + * are visible to arm/parisc __flush_dcache_page
> > + * throughout; but we cannot insert into address
> > + * space until vma start or end is updated.
> > + */
> > + __vma_link_file(vp->insert,
> > + vp->insert->vm_file->f_mapping);
> > + }
> > + }
> > +
> > + if (vp->anon_vma) {
> > + anon_vma_lock_write(vp->anon_vma);
> > + anon_vma_interval_tree_pre_update_vma(vp->vma);
> > + if (vp->adj_next)
> > + anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > + }
> > +
> > + if (vp->file) {
> > + flush_dcache_mmap_lock(vp->mapping);
> > + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > + if (vp->adj_next)
> > + vma_interval_tree_remove(vp->adj_next,
> > + &vp->mapping->i_mmap);
> > + }
> > +
> > +}
> > +
> > +/*
> > + * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > + * or for inserting a VMA.
> > + *
> > + * @vp: The vma_prepare struct
> > + * @vmi: The vma iterator
> > + * @mm: The mm_struct
> > + */
> > +static void vma_complete(struct vma_prepare *vp,
> > + struct vma_iterator *vmi, struct mm_struct *mm)
> > +{
> > + if (vp->file) {
> > + if (vp->adj_next)
> > + vma_interval_tree_insert(vp->adj_next,
> > + &vp->mapping->i_mmap);
> > + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > + flush_dcache_mmap_unlock(vp->mapping);
> > + }
> > +
> > + if (vp->remove && vp->file) {
> > + __remove_shared_vm_struct(vp->remove, vp->mapping);
> > + if (vp->remove2)
> > + __remove_shared_vm_struct(vp->remove2, vp->mapping);
> > + } else if (vp->insert) {
> > + /*
> > + * split_vma has split insert from vma, and needs
> > + * us to insert it before dropping the locks
> > + * (it may either follow vma or precede it).
> > + */
> > + vma_iter_store(vmi, vp->insert);
> > + mm->map_count++;
> > + }
> > +
> > + if (vp->anon_vma) {
> > + anon_vma_interval_tree_post_update_vma(vp->vma);
> > + if (vp->adj_next)
> > + anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > + anon_vma_unlock_write(vp->anon_vma);
> > + }
> > +
> > + if (vp->file) {
> > + i_mmap_unlock_write(vp->mapping);
> > + uprobe_mmap(vp->vma);
> > +
> > + if (vp->adj_next)
> > + uprobe_mmap(vp->adj_next);
> > + }
> > +
> > + if (vp->remove) {
> > +again:
> > + vma_mark_detached(vp->remove, true);
> > + if (vp->file) {
> > + uprobe_munmap(vp->remove, vp->remove->vm_start,
> > + vp->remove->vm_end);
> > + fput(vp->file);
> > + }
> > + if (vp->remove->anon_vma)
> > + anon_vma_merge(vp->vma, vp->remove);
> > + mm->map_count--;
> > + mpol_put(vma_policy(vp->remove));
> > + if (!vp->remove2)
> > + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > + vm_area_free(vp->remove);
> > +
> > + /*
> > + * In mprotect's case 6 (see comments on vma_merge),
> > + * we are removing both mid and next vmas
> > + */
> > + if (vp->remove2) {
> > + vp->remove = vp->remove2;
> > + vp->remove2 = NULL;
> > + goto again;
> > + }
> > + }
> > + if (vp->insert && vp->file)
> > + uprobe_mmap(vp->insert);
> > + validate_mm(mm);
> > +}
> > +
> > +/*
> > + * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > + * @vp: The vma_prepare struct
> > + * @vma: The vma that will be altered once locked
> > + */
> > +static void init_vma_prep(struct vma_prepare *vp,
> > + struct vm_area_struct *vma)
> > +{
> > + init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > +}
> > +
> > /*
> > * Close a vm structure and free it.
> > */
> > @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
> > vm_unacct_memory(nr_accounted);
> > }
> >
> > -/*
> > - * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > - * @vp: The vma_prepare struct
> > - * @vma: The vma that will be altered once locked
> > - */
> > -void init_vma_prep(struct vma_prepare *vp,
> > - struct vm_area_struct *vma)
> > -{
> > - init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > -}
> > -
> > -/*
> > - * Requires inode->i_mapping->i_mmap_rwsem
> > - */
> > -static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > - struct address_space *mapping)
> > -{
> > - if (vma_is_shared_maywrite(vma))
> > - mapping_unmap_writable(mapping);
> > -
> > - flush_dcache_mmap_lock(mapping);
> > - vma_interval_tree_remove(vma, &mapping->i_mmap);
> > - flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> > /*
> > * vma has some anon_vma assigned, and is already inserted on that
> > * anon_vma's interval trees.
> > @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> > anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
> > }
> >
> > -static void __vma_link_file(struct vm_area_struct *vma,
> > - struct address_space *mapping)
> > -{
> > - if (vma_is_shared_maywrite(vma))
> > - mapping_allow_writable(mapping);
> > -
> > - flush_dcache_mmap_lock(mapping);
> > - vma_interval_tree_insert(vma, &mapping->i_mmap);
> > - flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> > -/*
> > - * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > - * @vp: The initialized vma_prepare struct
> > - */
> > -void vma_prepare(struct vma_prepare *vp)
> > -{
> > - if (vp->file) {
> > - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > -
> > - if (vp->adj_next)
> > - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > - vp->adj_next->vm_end);
> > -
> > - i_mmap_lock_write(vp->mapping);
> > - if (vp->insert && vp->insert->vm_file) {
> > - /*
> > - * Put into interval tree now, so instantiated pages
> > - * are visible to arm/parisc __flush_dcache_page
> > - * throughout; but we cannot insert into address
> > - * space until vma start or end is updated.
> > - */
> > - __vma_link_file(vp->insert,
> > - vp->insert->vm_file->f_mapping);
> > - }
> > - }
> > -
> > - if (vp->anon_vma) {
> > - anon_vma_lock_write(vp->anon_vma);
> > - anon_vma_interval_tree_pre_update_vma(vp->vma);
> > - if (vp->adj_next)
> > - anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > - }
> > -
> > - if (vp->file) {
> > - flush_dcache_mmap_lock(vp->mapping);
> > - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > - if (vp->adj_next)
> > - vma_interval_tree_remove(vp->adj_next,
> > - &vp->mapping->i_mmap);
> > - }
> > -
> > -}
> > -
> > /*
> > * dup_anon_vma() - Helper function to duplicate anon_vma
> > * @dst: The destination VMA
> > @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm)
> > }
> > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> >
> > +/*
> > + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> > + *
> > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> > + * (exclusive), which we try to merge with any adjacent VMAs if possible.
> > + *
> > + * We are about to add a VMA to the address space starting at @vmg->start and
> > + * ending at @vmg->end. There are three different possible scenarios:
> > + *
> > + * 1. There is a VMA with identical properties immediately adjacent to the
> > + * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> > + * EXPAND that VMA:
> > + *
> > + * Proposed: |-----| or |-----|
> > + * Existing: |----| |----|
> > + *
> > + * 2. There are VMAs with identical properties immediately adjacent to the
> > + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> > + * EXPAND the former and REMOVE the latter:
> > + *
> > + * Proposed: |-----|
> > + * Existing: |----| |----|
> > + *
> > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> > + * VMAs do not have identical attributes - NO MERGE POSSIBLE.
> > + *
> > + * In instances where we can merge, this function returns the expanded VMA which
> > + * will have its range adjusted accordingly and the underlying maple tree also
> > + * adjusted.
> > + *
> > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> > + * to the VMA we expanded.
> > + *
> > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> > + * expanded range.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
> > + */
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > +{
> > + bool is_special = vmg->flags & VM_SPECIAL;
> > + struct vm_area_struct *prev = vmg->prev;
> > + struct vm_area_struct *next = vmg->next;
> > + unsigned long start = vmg->start;
> > + unsigned long end = vmg->end;
> > + pgoff_t pgoff = vmg->pgoff;
> > + pgoff_t pglen = PHYS_PFN(end - start);
> > +
> > + VM_WARN_ON(vmg->vma);
> > +
> > + if (!prev && !next) {
> > + /*
> > + * Since the caller must have determined that the requested
> > + * range is empty, vmg->vmi will be left pointing at the VMA
> > + * immediately prior.
> > + */
> > + next = vmg->next = vma_next(vmg->vmi);
> > + prev = vmg->prev = vma_prev(vmg->vmi);
> > +
> > + /* Avoid maple tree re-walk. */
> > + if (is_special && prev)
> > + vma_iter_next_range(vmg->vmi);
> > + }
> > +
> > + /* If special mapping or no adjacent VMAs, nothing to merge. */
> > + if (is_special || (!prev && !next))
> > + return NULL;
> > +
> > + /* If we can merge with the following VMA, adjust vmg accordingly. */
> > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> > + /*
> > + * We can adjust this here as can_vma_merge_after() doesn't
> > + * touch vmg->end.
> > + */
> > + vmg->end = next->vm_end;
> > + vmg->vma = next;
> > + vmg->pgoff = next->vm_pgoff - pglen;
> > +
> > + vmg->anon_vma = next->anon_vma;
> > + }
> > +
> > + /* If we can merge with the previous VMA, adjust vmg accordingly. */
> > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> > + vmg->start = prev->vm_start;
> > + vmg->vma = prev;
> > + vmg->pgoff = prev->vm_pgoff;
> > + } else if (prev) {
> > + vma_iter_next_range(vmg->vmi);
> > + }
> > +
> > + /*
> > + * Now try to expand adjacent VMA(s). This takes care of removing the
> > + * following VMA if we have VMAs on both sides.
> > + */
> > + if (vmg->vma && !vma_expand(vmg)) {
> > + khugepaged_enter_vma(vmg->vma, vmg->flags);
> > + return vmg->vma;
> > + }
> > +
> > + /* If expansion failed, reset state. Allows us to retry merge later. */
> > + vmg->vma = NULL;
> > + vmg->anon_vma = NULL;
> > + vmg->start = start;
> > + vmg->end = end;
> > + vmg->pgoff = pgoff;
> > + if (vmg->vma == prev)
> > + vma_iter_set(vmg->vmi, start);
> > +
> > + return NULL;
> > +}
> > +
> > /*
> > * vma_expand - Expand an existing VMA
> > *
> > @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm)
> > * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
> > * vmg->next needs to be handled by the caller.
> > *
> > - * Returns: 0 on success
> > + * Returns: 0 on success.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
> > + * - The caller must have set @vmg->prev and @vmg->next.
> > */
> > int vma_expand(struct vma_merge_struct *vmg)
> > {
> > @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg)
> > return 0;
> > }
> >
> > -/*
> > - * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > - * or for inserting a VMA.
> > - *
> > - * @vp: The vma_prepare struct
> > - * @vmi: The vma iterator
> > - * @mm: The mm_struct
> > - */
> > -void vma_complete(struct vma_prepare *vp,
> > - struct vma_iterator *vmi, struct mm_struct *mm)
> > -{
> > - if (vp->file) {
> > - if (vp->adj_next)
> > - vma_interval_tree_insert(vp->adj_next,
> > - &vp->mapping->i_mmap);
> > - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > - flush_dcache_mmap_unlock(vp->mapping);
> > - }
> > -
> > - if (vp->remove && vp->file) {
> > - __remove_shared_vm_struct(vp->remove, vp->mapping);
> > - if (vp->remove2)
> > - __remove_shared_vm_struct(vp->remove2, vp->mapping);
> > - } else if (vp->insert) {
> > - /*
> > - * split_vma has split insert from vma, and needs
> > - * us to insert it before dropping the locks
> > - * (it may either follow vma or precede it).
> > - */
> > - vma_iter_store(vmi, vp->insert);
> > - mm->map_count++;
> > - }
> > -
> > - if (vp->anon_vma) {
> > - anon_vma_interval_tree_post_update_vma(vp->vma);
> > - if (vp->adj_next)
> > - anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > - anon_vma_unlock_write(vp->anon_vma);
> > - }
> > -
> > - if (vp->file) {
> > - i_mmap_unlock_write(vp->mapping);
> > - uprobe_mmap(vp->vma);
> > -
> > - if (vp->adj_next)
> > - uprobe_mmap(vp->adj_next);
> > - }
> > -
> > - if (vp->remove) {
> > -again:
> > - vma_mark_detached(vp->remove, true);
> > - if (vp->file) {
> > - uprobe_munmap(vp->remove, vp->remove->vm_start,
> > - vp->remove->vm_end);
> > - fput(vp->file);
> > - }
> > - if (vp->remove->anon_vma)
> > - anon_vma_merge(vp->vma, vp->remove);
> > - mm->map_count--;
> > - mpol_put(vma_policy(vp->remove));
> > - if (!vp->remove2)
> > - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > - vm_area_free(vp->remove);
> > -
> > - /*
> > - * In mprotect's case 6 (see comments on vma_merge),
> > - * we are removing both mid and next vmas
> > - */
> > - if (vp->remove2) {
> > - vp->remove = vp->remove2;
> > - vp->remove2 = NULL;
> > - goto again;
> > - }
> > - }
> > - if (vp->insert && vp->file)
> > - uprobe_mmap(vp->insert);
> > - validate_mm(mm);
> > -}
> > -
> > /*
> > * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
> > * @vmi: The vma iterator
> > @@ -1261,20 +1378,6 @@ struct vm_area_struct
> > return vma_modify(&vmg);
> > }
> >
> > -/*
> > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
> > - * must ensure that [start, end) does not overlap any existing VMA.
> > - */
> > -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > -{
> > - if (!vmg->prev) {
> > - vmg->prev = vma_prev(vmg->vmi);
> > - vma_iter_set(vmg->vmi, vmg->start);
> > - }
> > -
> > - return vma_merge(vmg);
> > -}
> > -
> > /*
> > * Expand vma by delta bytes, potentially merging with an immediately adjacent
> > * VMA with identical properties.
> > @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> > .anon_name = anon_vma_name(vma),
> > };
> >
> > - /* vma is specified as prev, so case 1 or 2 will apply. */
> > - return vma_merge(&vmg);
> > + return vma_merge_new_vma(&vmg);
> > }
> >
> > void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> > @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> > struct vm_area_struct *vma = *vmap;
> > unsigned long vma_start = vma->vm_start;
> > struct mm_struct *mm = vma->vm_mm;
> > - struct vm_area_struct *new_vma, *prev;
> > + struct vm_area_struct *new_vma;
> > bool faulted_in_anon_vma = true;
> > VMA_ITERATOR(vmi, mm, addr);
> > + struct vma_merge_struct vmg = {
> > + .vmi = &vmi,
> > + .start = addr,
> > + .end = addr + len,
> > + .flags = vma->vm_flags,
> > + .pgoff = pgoff,
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> >
> > /*
> > * If anonymous vma has not yet been faulted, update new pgoff
> > * to match new location, to increase its chance of merging.
> > */
> > if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> > - pgoff = addr >> PAGE_SHIFT;
> > + pgoff = vmg.pgoff = addr >> PAGE_SHIFT;
> > faulted_in_anon_vma = false;
> > }
> >
> > - new_vma = find_vma_prev(mm, addr, &prev);
> > + new_vma = find_vma_prev(mm, addr, &vmg.prev);
> > if (new_vma && new_vma->vm_start < addr + len)
> > return NULL; /* should never get here */
> >
> > - new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
> > + vmg.next = vma_next(&vmi);
> > + vma_prev(&vmi);
> > +
> > + new_vma = vma_merge_new_vma(&vmg);
> > +
> > if (new_vma) {
> > /*
> > * Source vma may have been merged into new_vma
> > diff --git a/mm/vma.h b/mm/vma.h
> > index 50459f9e4c7f..bbb173053f34 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
> > /* Required for expand_downwards(). */
> > void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
> >
> > -/* Required for do_brk_flags(). */
> > -void vma_prepare(struct vma_prepare *vp);
> > -
> > -/* Required for do_brk_flags(). */
> > -void init_vma_prep(struct vma_prepare *vp,
> > - struct vm_area_struct *vma);
> > -
> > -/* Required for do_brk_flags(). */
> > -void vma_complete(struct vma_prepare *vp,
> > - struct vma_iterator *vmi, struct mm_struct *mm);
> > -
> > int vma_expand(struct vma_merge_struct *vmg);
> > int vma_shrink(struct vma_merge_struct *vmg);
> >
> > @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > struct vm_area_struct *next, unsigned long start,
> > unsigned long end, unsigned long tree_end, bool mm_wr_locked);
> >
> > -/*
> > - * Can we merge the VMA described by vmg into the following VMA vmg->next?
> > - *
> > - * Required by mmap_region().
> > - */
> > -bool can_vma_merge_before(struct vma_merge_struct *vmg);
> > -
> > -/*
> > - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> > - *
> > - * Required by mmap_region() and do_brk_flags().
> > - */
> > -bool can_vma_merge_after(struct vma_merge_struct *vmg);
> > -
> > /* We are about to modify the VMA's flags. */
> > struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> > struct vm_area_struct *prev,
> > @@ -133,31 +108,7 @@ struct vm_area_struct
> > unsigned long new_flags,
> > struct vm_userfaultfd_ctx new_ctx);
> >
> > -struct vm_area_struct
> > -*vma_merge_new_vma(struct vma_merge_struct *vmg);
> > -
> > -/* Temporary convenience wrapper. */
> > -static inline struct vm_area_struct
> > -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > - struct vm_area_struct *vma, unsigned long start,
> > - unsigned long end, pgoff_t pgoff)
> > -{
> > - struct vma_merge_struct vmg = {
> > - .vmi = vmi,
> > - .prev = prev,
> > - .start = start,
> > - .end = end,
> > - .flags = vma->vm_flags,
> > - .file = vma->vm_file,
> > - .anon_vma = vma->anon_vma,
> > - .pgoff = pgoff,
> > - .policy = vma_policy(vma),
> > - .uffd_ctx = vma->vm_userfaultfd_ctx,
> > - .anon_name = anon_vma_name(vma),
> > - };
> > -
> > - return vma_merge_new_vma(&vmg);
> > -}
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
> >
> > /*
> > * Temporary wrapper around vma_merge() so we can have a common interface for
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index 40797a819d3d..a39a734282d0 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi)
> > mas_destroy(&vmi->mas);
> > }
> >
> > +static inline
> > +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
> > +{
> > + return mas_next_range(&vmi->mas, ULONG_MAX);
> > +}
> > +
> > static inline void vm_acct_memory(long pages)
> > {
> > }
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 07/10] mm: avoid using vma_merge() for new VMAs
2024-08-05 12:13 ` [PATCH 07/10] mm: avoid using vma_merge() for new VMAs Lorenzo Stoakes
2024-08-06 13:04 ` Petr Tesařík
@ 2024-08-08 16:45 ` Vlastimil Babka
2024-08-08 18:02 ` Lorenzo Stoakes
2024-08-09 15:23 ` Liam R. Howlett
2 siblings, 1 reply; 53+ messages in thread
From: Vlastimil Babka @ 2024-08-08 16:45 UTC (permalink / raw)
To: Lorenzo Stoakes, linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett
On 8/5/24 14:13, Lorenzo Stoakes wrote:
> In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> to use vma_expand() rather than invoke a full vma_merge() operation.
>
> Abstract this logic and eliminate all of the open-coding, and also use the
> same logic for all cases where we add new VMAs - that is, use vma_expand()
> rather than ultimately invoking vma_merge().
>
> We implement this by replacing vma_merge_new_vma() with this newly
> abstracted logic.
>
> Doing so removes duplication and simplifies VMA merging in all such cases,
> laying the ground for us to eliminate the merging of new VMAs in
> vma_merge() altogether.
>
> This makes it far easier to understand what is happening in these cases,
> avoiding confusion and bugs, and allowing for future optimisation.
>
> As a result of this change we are also able to make vma_prepare(),
> init_vma_prep(), vma_complete(), can_vma_merge_before() and
> can_vma_merge_after() static and internal to vma.c.
That's really great, but it would be even better if these code moves could
be a separate patch as it would make reviewing so much easier. But with git
diff's --color-moved to the rescue, let me try...
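
For reference, something like the below does the trick here (zebra is one of
several --color-moved modes; pick whichever reads best):

	git show --color-moved=zebra <commit>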
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/mmap.c | 79 ++---
> mm/vma.c | 482 +++++++++++++++++++------------
> mm/vma.h | 51 +---
> tools/testing/vma/vma_internal.h | 6 +
> 4 files changed, 324 insertions(+), 294 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index f6593a81f73d..c03f50f46396 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> {
> struct mm_struct *mm = current->mm;
> struct vm_area_struct *vma = NULL;
> - struct vm_area_struct *next, *prev, *merge;
> - pgoff_t pglen = len >> PAGE_SHIFT;
> + struct vm_area_struct *merge;
> unsigned long charged = 0;
> unsigned long end = addr + len;
> bool writable_file_mapping = false;
> @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> vm_flags |= VM_ACCOUNT;
> }
>
> - next = vmg.next = vma_next(&vmi);
> - prev = vmg.prev = vma_prev(&vmi);
> - if (vm_flags & VM_SPECIAL) {
> - if (prev)
> - vma_iter_next_range(&vmi);
> - goto cannot_expand;
> - }
> -
> - /* Attempt to expand an old mapping */
> - /* Check next */
> - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> - /* We can adjust this as can_vma_merge_after() doesn't touch */
> - vmg.end = next->vm_end;
> - vma = vmg.vma = next;
> - vmg.pgoff = next->vm_pgoff - pglen;
> -
> - /* We may merge our NULL anon_vma with non-NULL in next. */
> - vmg.anon_vma = vma->anon_vma;
> - }
> -
> - /* Check prev */
> - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> - vmg.start = prev->vm_start;
> - vma = vmg.vma = prev;
> - vmg.pgoff = prev->vm_pgoff;
> - } else if (prev) {
> - vma_iter_next_range(&vmi);
> - }
> -
> - /* Actually expand, if possible */
> - if (vma && !vma_expand(&vmg)) {
> - khugepaged_enter_vma(vma, vm_flags);
> + vma = vma_merge_new_vma(&vmg);
> + if (vma)
> goto expanded;
> - }
> -
> - if (vma == prev)
> - vma_iter_set(&vmi, addr);
> -cannot_expand:
>
> /*
> * Determine the object being mapped and call the appropriate
> @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> * If vm_flags changed after call_mmap(), we should try merge
> * vma again as we may succeed this time.
> */
> - if (unlikely(vm_flags != vma->vm_flags && prev)) {
> - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> - vma->vm_start, vma->vm_end,
> - vma->vm_pgoff);
> + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> + merge = vma_merge_new_vma(&vmg);
Can this even succeed if we don't update vmg->flags? Previously the
wrapper would take them from vma.
> +
> if (merge) {
> /*
> * ->mmap() can change vma->vm_file and fput
<snip>
> +/*
> + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> + *
> + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> + * (exclusive), which we try to merge with any adjacent VMAs if possible.
> + *
> + * We are about to add a VMA to the address space starting at @vmg->start and
> + * ending at @vmg->end. There are three different possible scenarios:
> + *
> + * 1. There is a VMA with identical properties immediately adjacent to the
> + * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> + * EXPAND that VMA:
> + *
> + * Proposed: |-----| or |-----|
> + * Existing: |----| |----|
> + *
> + * 2. There are VMAs with identical properties immediately adjacent to the
> + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> + * EXPAND the former and REMOVE the latter:
> + *
> + * Proposed: |-----|
> + * Existing: |----| |----|
> + *
> + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> + * VMAs do not have identical attributes - NO MERGE POSSIBLE.
> + *
> + * In instances where we can merge, this function returns the expanded VMA which
> + * will have its range adjusted accordingly and the underlying maple tree also
> + * adjusted.
> + *
> + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> + * to the VMA we expanded.
> + *
> + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> + * expanded range.
> + *
> + * ASSUMPTIONS:
> + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
Should we be paranoid and assert something?
> + */
> +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> +{
> + bool is_special = vmg->flags & VM_SPECIAL;
> + struct vm_area_struct *prev = vmg->prev;
> + struct vm_area_struct *next = vmg->next;
> + unsigned long start = vmg->start;
> + unsigned long end = vmg->end;
> + pgoff_t pgoff = vmg->pgoff;
> + pgoff_t pglen = PHYS_PFN(end - start);
> +
> + VM_WARN_ON(vmg->vma);
> +
> + if (!prev && !next) {
> + /*
> + * Since the caller must have determined that the requested
> + * range is empty, vmg->vmi will be left pointing at the VMA
> + * immediately prior.
> + */
OK that's perhaps not that obvious, as it seems copy_vma() is doing some
special dance to ensure this. Should we add it to the ASSUMPTIONS and assert
it, or is there a maple tree operation we can do to ensure it, ideally one
that's very cheap if the iterator is already set the way we want it to be?
> + next = vmg->next = vma_next(vmg->vmi);
> + prev = vmg->prev = vma_prev(vmg->vmi);
> +
> + /* Avoid maple tree re-walk. */
> + if (is_special && prev)
> + vma_iter_next_range(vmg->vmi);
I wish I knew what this did, but it seems to be the same as what the old
code did, so hopefully that's fine.
> + }
> +
> + /* If special mapping or no adjacent VMAs, nothing to merge. */
> + if (is_special || (!prev && !next))
> + return NULL;
> +
> + /* If we can merge with the following VMA, adjust vmg accordingly. */
> + if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> + /*
> + * We can adjust this here as can_vma_merge_after() doesn't
> + * touch vmg->end.
> + */
> + vmg->end = next->vm_end;
> + vmg->vma = next;
> + vmg->pgoff = next->vm_pgoff - pglen;
> +
> + vmg->anon_vma = next->anon_vma;
> + }
> +
> + /* If we can merge with the previous VMA, adjust vmg accordingly. */
> + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> + vmg->start = prev->vm_start;
> + vmg->vma = prev;
> + vmg->pgoff = prev->vm_pgoff;
> + } else if (prev) {
> + vma_iter_next_range(vmg->vmi);
> + }
Sigh... ditto.
^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH 07/10] mm: avoid using vma_merge() for new VMAs
2024-08-08 16:45 ` Vlastimil Babka
@ 2024-08-08 18:02 ` Lorenzo Stoakes
2024-08-08 18:34 ` Liam R. Howlett
0 siblings, 1 reply; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-08 18:02 UTC (permalink / raw)
To: Vlastimil Babka; +Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett
On Thu, Aug 08, 2024 at 06:45:43PM GMT, Vlastimil Babka wrote:
> On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > to use vma_expand() rather than invoke a full vma_merge() operation.
> >
> > Abstract this logic and eliminate all of the open-coding, and also use the
> > same logic for all cases where we add new VMAs, so that rather than
> > ultimately using vma_merge(), we use vma_expand().
> >
> > We implement this by replacing vma_merge_new_vma() with this newly
> > abstracted logic.
> >
> > Doing so removes duplication and simplifies VMA merging in all such cases,
> > laying the ground for us to eliminate the merging of new VMAs in
> > vma_merge() altogether.
> >
> > This makes it far easier to understand what is happening in these cases,
> > avoiding confusion and bugs and allowing for future optimisation.
> >
> > As a result of this change we are also able to make vma_prepare(),
> > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > can_vma_merge_after() static and internal to vma.c.
>
> That's really great, but it would be even better if these code moves could
> be a separate patch as it would make reviewing so much easier. But with git
> diff's --color-moved to the rescue, let me try...
Will separate out on respin.
>
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > mm/mmap.c | 79 ++---
> > mm/vma.c | 482 +++++++++++++++++++------------
> > mm/vma.h | 51 +---
> > tools/testing/vma/vma_internal.h | 6 +
> > 4 files changed, 324 insertions(+), 294 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index f6593a81f73d..c03f50f46396 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > {
> > struct mm_struct *mm = current->mm;
> > struct vm_area_struct *vma = NULL;
> > - struct vm_area_struct *next, *prev, *merge;
> > - pgoff_t pglen = len >> PAGE_SHIFT;
> > + struct vm_area_struct *merge;
> > unsigned long charged = 0;
> > unsigned long end = addr + len;
> > bool writable_file_mapping = false;
> > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > vm_flags |= VM_ACCOUNT;
> > }
> >
> > - next = vmg.next = vma_next(&vmi);
> > - prev = vmg.prev = vma_prev(&vmi);
> > - if (vm_flags & VM_SPECIAL) {
> > - if (prev)
> > - vma_iter_next_range(&vmi);
> > - goto cannot_expand;
> > - }
> > -
> > - /* Attempt to expand an old mapping */
> > - /* Check next */
> > - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> > - /* We can adjust this as can_vma_merge_after() doesn't touch */
> > - vmg.end = next->vm_end;
> > - vma = vmg.vma = next;
> > - vmg.pgoff = next->vm_pgoff - pglen;
> > -
> > - /* We may merge our NULL anon_vma with non-NULL in next. */
> > - vmg.anon_vma = vma->anon_vma;
> > - }
> > -
> > - /* Check prev */
> > - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> > - vmg.start = prev->vm_start;
> > - vma = vmg.vma = prev;
> > - vmg.pgoff = prev->vm_pgoff;
> > - } else if (prev) {
> > - vma_iter_next_range(&vmi);
> > - }
> > -
> > - /* Actually expand, if possible */
> > - if (vma && !vma_expand(&vmg)) {
> > - khugepaged_enter_vma(vma, vm_flags);
> > + vma = vma_merge_new_vma(&vmg);
> > + if (vma)
> > goto expanded;
> > - }
> > -
> > - if (vma == prev)
> > - vma_iter_set(&vmi, addr);
> > -cannot_expand:
> >
> > /*
> > * Determine the object being mapped and call the appropriate
> > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > * If vm_flags changed after call_mmap(), we should try merge
> > * vma again as we may succeed this time.
> > */
> > - if (unlikely(vm_flags != vma->vm_flags && prev)) {
> > - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> > - vma->vm_start, vma->vm_end,
> > - vma->vm_pgoff);
> > + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> > + merge = vma_merge_new_vma(&vmg);
>
> Can this even succeed if we don't update vmg->flags? Previously the
> wrapper would take them from vma.
You're right... ugh. Will fix.
This is yet another example of how having this _not_ be under test is
problematic, as that'd have picked this up.
I will try to move at least VMA merge invocation logic over in a later
series.
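For the record, presumably just something like this (untested) before the
retry, refreshing the flags that ->mmap() may have changed, as the old
wrapper implicitly did by reading them from vma:

	vmg.flags = vma->vm_flags;
	merge = vma_merge_new_vma(&vmg);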
>
> > +
> > if (merge) {
> > /*
> > * ->mmap() can change vma->vm_file and fput
>
> <snip>
>
> > +/*
> > + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> > + *
> > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> > + * (exclusive), which we try to merge with any adjacent VMAs if possible.
> > + *
> > + * We are about to add a VMA to the address space starting at @vmg->start and
> > + * ending at @vmg->end. There are three different possible scenarios:
> > + *
> > + * 1. There is a VMA with identical properties immediately adjacent to the
> > + * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> > + * EXPAND that VMA:
> > + *
> > + * Proposed: |-----| or |-----|
> > + * Existing: |----| |----|
> > + *
> > + * 2. There are VMAs with identical properties immediately adjacent to the
> > + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> > + * EXPAND the former and REMOVE the latter:
> > + *
> > + * Proposed: |-----|
> > + * Existing: |----| |----|
> > + *
> > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> > + * VMAs do not have identical attributes - NO MERGE POSSIBLE.
> > + *
> > + * In instances where we can merge, this function returns the expanded VMA which
> > + * will have its range adjusted accordingly and the underlying maple tree also
> > + * adjusted.
> > + *
> > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> > + * to the VMA we expanded.
> > + *
> > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> > + * expanded range.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
>
> Should we be paranoid and assert something?
This will have a performance impact; if we do that we'll want something
like an #ifdef CONFIG_DEBUG_VM around that.
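Though thinking about it, a cheap sanity check of the iterator position
(rather than a full emptiness check) might do, e.g. something like:

	/* Caller must have ensured [vmg->start, vmg->end) is empty. */
	VM_WARN_ON(vma_iter_end(vmg->vmi) > vmg->start);

And VM_WARN_ON() already compiles away without CONFIG_DEBUG_VM, so that
wouldn't even need the explicit #ifdef.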
>
> > + */
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > +{
> > + bool is_special = vmg->flags & VM_SPECIAL;
> > + struct vm_area_struct *prev = vmg->prev;
> > + struct vm_area_struct *next = vmg->next;
> > + unsigned long start = vmg->start;
> > + unsigned long end = vmg->end;
> > + pgoff_t pgoff = vmg->pgoff;
> > + pgoff_t pglen = PHYS_PFN(end - start);
> > +
> > + VM_WARN_ON(vmg->vma);
> > +
> > + if (!prev && !next) {
> > + /*
> > + * Since the caller must have determined that the requested
> > + * range is empty, vmg->vmi will be left pointing at the VMA
> > + * immediately prior.
> > + */
>
> OK that's perhaps not that obvious, as it seems copy_vma() is doing some
> special dance to ensure this. Should we add it to the ASSUMPTIONS and assert
> it, or is there a maple tree operation we can do to ensure it, ideally one
> that's very cheap if the iterator is already set the way we want it to be?
>
To be fair this is something that was previously assumed, and I just added
a comment.
Will add to assumptions, and again I think any assert should be done in
such a way that under non-CONFIG_DEBUG_VM nothing happens, maybe
VM_WARN_ON()?
Will try to come up with something.
> > + next = vmg->next = vma_next(vmg->vmi);
> > + prev = vmg->prev = vma_prev(vmg->vmi);
> > +
> > + /* Avoid maple tree re-walk. */
> > + if (is_special && prev)
> > + vma_iter_next_range(vmg->vmi);
>
> I wish I knew what this did, but it seems to be the same as what the old
> code did, so hopefully that's fine.
I think the point is that we are about to exit, so we'd be left pointing at
prev. But since we're exiting in just a second, we want to be pointing at
the next vma, which will become the prev of the next merge attempt.
Liam can maybe elucidate further.
>
> > + }
> > +
> > + /* If special mapping or no adjacent VMAs, nothing to merge. */
> > + if (is_special || (!prev && !next))
> > + return NULL;
> > +
> > + /* If we can merge with the following VMA, adjust vmg accordingly. */
> > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> > + /*
> > + * We can adjust this here as can_vma_merge_after() doesn't
> > + * touch vmg->end.
> > + */
> > + vmg->end = next->vm_end;
> > + vmg->vma = next;
> > + vmg->pgoff = next->vm_pgoff - pglen;
> > +
> > + vmg->anon_vma = next->anon_vma;
> > + }
> > +
> > + /* If we can merge with the previous VMA, adjust vmg accordingly. */
> > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> > + vmg->start = prev->vm_start;
> > + vmg->vma = prev;
> > + vmg->pgoff = prev->vm_pgoff;
> > + } else if (prev) {
> > + vma_iter_next_range(vmg->vmi);
> > + }
>
> Sigh... ditto.
>
(Liam can correct me) I think this is just setting up the vmi similarly to
the other case, such that if expansion fails we are positioned correctly
for the next merge attempt.
Yes it's fiddly, maybe needs a comment...
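Something along these lines maybe (wording to be refined):

	} else if (prev) {
		/*
		 * We are positioned on prev but cannot merge with it, so
		 * step the iterator forward to keep it correctly
		 * positioned for the insert / any later merge attempt.
		 */
		vma_iter_next_range(vmg->vmi);
	}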
^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH 07/10] mm: avoid using vma_merge() for new VMAs
2024-08-08 18:02 ` Lorenzo Stoakes
@ 2024-08-08 18:34 ` Liam R. Howlett
2024-08-08 19:06 ` Liam R. Howlett
0 siblings, 1 reply; 53+ messages in thread
From: Liam R. Howlett @ 2024-08-08 18:34 UTC (permalink / raw)
To: Lorenzo Stoakes; +Cc: Vlastimil Babka, linux-mm, linux-kernel, Andrew Morton
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240808 14:02]:
> On Thu, Aug 08, 2024 at 06:45:43PM GMT, Vlastimil Babka wrote:
> > On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > > to use vma_expand() rather than invoke a full vma_merge() operation.
> > >
> > > Abstract this logic and eliminate all of the open-coding, and also use the
> > > same logic for all cases where we add new VMAs, so that rather than
> > > ultimately using vma_merge(), we use vma_expand().
> > >
> > > We implement this by replacing vma_merge_new_vma() with this newly
> > > abstracted logic.
> > >
> > > Doing so removes duplication and simplifies VMA merging in all such cases,
> > > laying the ground for us to eliminate the merging of new VMAs in
> > > vma_merge() altogether.
> > >
> > > This makes it far easier to understand what is happening in these cases,
> > > avoiding confusion and bugs and allowing for future optimisation.
> > >
> > > As a result of this change we are also able to make vma_prepare(),
> > > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > > can_vma_merge_after() static and internal to vma.c.
> >
> > That's really great, but it would be even better if these code moves could
> > be a separate patch as it would make reviewing so much easier. But with git
> > diff's --color-moved to the rescue, let me try...
>
> Will separate out on respin.
>
> >
> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > ---
> > > mm/mmap.c | 79 ++---
> > > mm/vma.c | 482 +++++++++++++++++++------------
> > > mm/vma.h | 51 +---
> > > tools/testing/vma/vma_internal.h | 6 +
> > > 4 files changed, 324 insertions(+), 294 deletions(-)
...
> > > + */
> > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > > +{
> > > + bool is_special = vmg->flags & VM_SPECIAL;
> > > + struct vm_area_struct *prev = vmg->prev;
> > > + struct vm_area_struct *next = vmg->next;
> > > + unsigned long start = vmg->start;
> > > + unsigned long end = vmg->end;
> > > + pgoff_t pgoff = vmg->pgoff;
> > > + pgoff_t pglen = PHYS_PFN(end - start);
> > > +
> > > + VM_WARN_ON(vmg->vma);
> > > +
> > > + if (!prev && !next) {
> > > + /*
> > > + * Since the caller must have determined that the requested
> > > + * range is empty, vmg->vmi will be left pointing at the VMA
> > > + * immediately prior.
> > > + */
> >
> > OK that's perhaps not that obvious, as it seems copy_vma() is doing some
> > special dance to ensure this. Should we add it to the ASSUMPTIONS and assert
> > it, or is there a maple tree operation we can do to ensure it, ideally one
> > that's very cheap if the iterator is already set the way we want it to be?
> >
>
> To be fair this is something that was previously assumed, and I just added
> a comment.
>
> Will add to assumptions, and again I think any assert should be done in
> such a way that under non-CONFIG_DEBUG_VM nothing happens, maybe
> VM_WARN_ON()?
>
> Will try to come up with something.
>
> > > + next = vmg->next = vma_next(vmg->vmi);
> > > + prev = vmg->prev = vma_prev(vmg->vmi);
> > > +
> > > + /* Avoid maple tree re-walk. */
> > > + if (is_special && prev)
> > > + vma_iter_next_range(vmg->vmi);
> >
> > I wish I knew what this did, but it seems to be the same as what the old
> > code did, so hopefully that's fine.
>
> I think the point is that we are about to exit, so we'd be left pointing at
> prev. But since we're exiting in just a second, we want to be pointing at
> the next vma, which will become the prev of the next merge attempt.
>
> Liam can maybe elucidate further.
What you have to remember is that the vma iterator (vmg->vmi above)
contains (or, basically is) a maple state (usually written as mas). We
keep state of the maple tree walker so that we don't have to keep
re-walking to find the same thing. We move around the tree with this
maple state because going prev/next is faster from leaves (almost always
just the next thing in the node's array of pointers).
We use the maple state to write as well, so the maple state needs to
point to the correct location in the tree for a write.
The maple tree is a range-based tree, so each entry exists for a span of
values. A write happens at the lowest index and can overwrite
subsequent values. This means that the maple state needs to point to
the range containing the lowest index for the write (if it's pointing to
a node - it could walk from the top).
A side effect of writing to the lowest index is that we need to point to
the previous vma if we are going to 'expand' the vma. The range is
essentially going to be from prev->start to "whatever we are expanding
over".
In the old code, the vm_flags & VM_SPECIAL code meant there was no way
an expansion was going to happen, but we've moved the maple state to the
wrong location for a write of a new vma - so this vma_iter_next_range()
just moves it back. Then we "goto cannot_expand".
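If it helps, here is a toy model of the "write at the lowest index" rule
in plain C - emphatically not the real maple tree, just the positioning
logic:

	#include <stdio.h>

	struct range { unsigned long start, end; };

	int main(void)
	{
		/* Two "VMAs" with an empty gap [0x2000, 0x5000). */
		struct range tree[] = {
			{ 0x1000, 0x2000 },	/* prev */
			{ 0x5000, 0x6000 },	/* next */
		};
		int mas = 1;	/* vma_next() left the state on next */

		mas--;		/* vma_prev(): state now on prev */

		/* Expanding prev over the gap: the write spans from the
		 * lowest index involved, i.e. prev's start. */
		printf("expand writes from 0x%lx\n", tree[mas].start);

		/* VM_SPECIAL: no expansion will happen, and the state is
		 * now in the wrong place for inserting a brand-new VMA
		 * into the gap, so step forward again - that is the
		 * vma_iter_next_range() above. */
		mas++;
		printf("insert lands in the gap below 0x%lx\n",
		       tree[mas].start);
		return 0;
	}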
>
> >
> > > + }
> > > +
> > > + /* If special mapping or no adjacent VMAs, nothing to merge. */
> > > + if (is_special || (!prev && !next))
> > > + return NULL;
> > > +
> > > + /* If we can merge with the following VMA, adjust vmg accordingly. */
> > > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> > > + /*
> > > + * We can adjust this here as can_vma_merge_after() doesn't
> > > + * touch vmg->end.
> > > + */
> > > + vmg->end = next->vm_end;
> > > + vmg->vma = next;
> > > + vmg->pgoff = next->vm_pgoff - pglen;
> > > +
> > > + vmg->anon_vma = next->anon_vma;
> > > + }
> > > +
> > > + /* If we can merge with the previous VMA, adjust vmg accordingly. */
> > > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> > > + vmg->start = prev->vm_start;
> > > + vmg->vma = prev;
> > > + vmg->pgoff = prev->vm_pgoff;
> > > + } else if (prev) {
> > > + vma_iter_next_range(vmg->vmi);
> > > + }
> >
> > Sigh... ditto.
> >
>
> (Liam can correct me) I think this is just setting up the vmi similar to
> the other case such that if expansion fails we can positioned correctly for
> the next merge attempt.
>
> Yes it's fiddly, maybe needs a comment...
Yes, ditto.
^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH 07/10] mm: avoid using vma_merge() for new VMAs
2024-08-08 18:34 ` Liam R. Howlett
@ 2024-08-08 19:06 ` Liam R. Howlett
2024-08-09 10:14 ` Lorenzo Stoakes
0 siblings, 1 reply; 53+ messages in thread
From: Liam R. Howlett @ 2024-08-08 19:06 UTC (permalink / raw)
To: Lorenzo Stoakes, Vlastimil Babka, linux-mm, linux-kernel, Andrew Morton
* Liam R. Howlett <Liam.Howlett@oracle.com> [240808 14:34]:
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240808 14:02]:
> > On Thu, Aug 08, 2024 at 06:45:43PM GMT, Vlastimil Babka wrote:
> > > On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > > > to use vma_expand() rather than invoke a full vma_merge() operation.
> > > >
> > > > Abstract this logic and eliminate all of the open-coding, and also use the
> > > > same logic for all cases where we add new VMAs, so that rather than
> > > > ultimately using vma_merge(), we use vma_expand().
> > > >
> > > > We implement this by replacing vma_merge_new_vma() with this newly
> > > > abstracted logic.
> > > >
> > > > Doing so removes duplication and simplifies VMA merging in all such cases,
> > > > laying the ground for us to eliminate the merging of new VMAs in
> > > > vma_merge() altogether.
> > > >
> > > > This makes it far easier to understand what is happening in these cases,
> > > > avoiding confusion and bugs and allowing for future optimisation.
> > > >
> > > > As a result of this change we are also able to make vma_prepare(),
> > > > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > > > can_vma_merge_after() static and internal to vma.c.
> > >
> > > That's really great, but it would be even better if these code moves could
> > > be a separate patch as it would make reviewing so much easier. But with git
> > > diff's --color-moved to the rescue, let me try...
> >
> > Will separate out on respin.
> >
> > >
> > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > > ---
> > > > mm/mmap.c | 79 ++---
> > > > mm/vma.c | 482 +++++++++++++++++++------------
> > > > mm/vma.h | 51 +---
> > > > tools/testing/vma/vma_internal.h | 6 +
> > > > 4 files changed, 324 insertions(+), 294 deletions(-)
>
> ...
> > > > + */
> > > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > > > +{
> > > > + bool is_special = vmg->flags & VM_SPECIAL;
> > > > + struct vm_area_struct *prev = vmg->prev;
> > > > + struct vm_area_struct *next = vmg->next;
> > > > + unsigned long start = vmg->start;
> > > > + unsigned long end = vmg->end;
> > > > + pgoff_t pgoff = vmg->pgoff;
> > > > + pgoff_t pglen = PHYS_PFN(end - start);
> > > > +
> > > > + VM_WARN_ON(vmg->vma);
> > > > +
> > > > + if (!prev && !next) {
> > > > + /*
> > > > + * Since the caller must have determined that the requested
> > > > + * range is empty, vmg->vmi will be left pointing at the VMA
> > > > + * immediately prior.
> > > > + */
> > >
> > > OK that's perhaps not that obvious, as it seems copy_vma() is doing some
> > > special dance to ensure this. Should we add it to the ASSUMPTIONS and assert
> > > it, or is there a maple tree operation we can do to ensure it, ideally one
> > > that's very cheap if the iterator is already set the way we want it to be?
> > >
> >
> > To be fair this is something that was previously assumed, and I just added
> > a comment.
> >
> > Will add to assumptions, and again I think any assert should be done in
> > such a way that under non-CONFIG_DEBUG_VM nothing happens, maybe
> > VM_WARN_ON()?
> >
> > Will try to come up with something.
Something like:
VM_BUG_ON(vma_iter_end(vmg->vmi) > start);
> >
> > > > + next = vmg->next = vma_next(vmg->vmi);
and:
VM_BUG_ON(vma_iter_addr(vmg->vmi) < end);
> > > > + prev = vmg->prev = vma_prev(vmg->vmi);
> > > > +
> > > > + /* Avoid maple tree re-walk. */
> > > > + if (is_special && prev)
> > > > + vma_iter_next_range(vmg->vmi);
> > >
> > > I wish I knew what this did, but it seems to be the same as what the old
> > > code did, so hopefully that's fine.
> >
> > I think the point is that we are about to exit, so we'd be left pointing at
> > prev. But since we're exiting in just a second, we want to be pointing at
> > the next vma, which will become the prev of the next merge attempt.
> >
> > Liam can maybe elucidate further.
>
> What you have to remember is that the vma iterator (vmg->vmi above)
> contains (or, basically is) a maple state (usually written as mas). We
> keep state of the maple tree walker so that we don't have to keep
> re-walking to find the same thing. We move around the tree with this
> maple state because going prev/next is faster from leaves (almost always
> just the next thing in the node's array of pointers).
>
> We use the maple state to write as well, so the maple state needs to
> point to the correct location in the tree for a write.
>
> The maple tree is a range-based tree, so each entry exists for a span of
> values. A write happens at the lowest index and can overwrite
> subsequent values. This means that the maple state needs to point to
> the range containing the lowest index for the write (if it's pointing to
> a node - it could walk from the top).
>
> A side effect of writing to the lowest index is that we need to point to
> the previous vma if we are going to 'expand' the vma. The range is
> essentially going to be from prev->start to "whatever we are expanding
> over".
>
> In the old code, the vm_flags & VM_SPECIAL code meant there was no way
> an expansion was going to happen, but we've moved the maple state to the
> wrong location for a write of a new vma - so this vma_iter_next_range()
> just moves it back. Then we "goto cannot_expand".
>
^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH 07/10] mm: avoid using vma_merge() for new VMAs
2024-08-08 19:06 ` Liam R. Howlett
@ 2024-08-09 10:14 ` Lorenzo Stoakes
0 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-09 10:14 UTC (permalink / raw)
To: Liam R. Howlett, Vlastimil Babka, linux-mm, linux-kernel, Andrew Morton
On Thu, Aug 08, 2024 at 03:06:14PM GMT, Liam R. Howlett wrote:
> * Liam R. Howlett <Liam.Howlett@oracle.com> [240808 14:34]:
> > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240808 14:02]:
> > > On Thu, Aug 08, 2024 at 06:45:43PM GMT, Vlastimil Babka wrote:
> > > > On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > > > > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > > > > to use vma_expand() rather than invoke a full vma_merge() operation.
> > > > >
> > > > > Abstract this logic and eliminate all of the open-coding, and also use the
> > > > > same logic for all cases where we add new VMAs, so that rather than
> > > > > ultimately using vma_merge(), we use vma_expand().
> > > > >
> > > > > We implement this by replacing vma_merge_new_vma() with this newly
> > > > > abstracted logic.
> > > > >
> > > > > Doing so removes duplication and simplifies VMA merging in all such cases,
> > > > > laying the ground for us to eliminate the merging of new VMAs in
> > > > > vma_merge() altogether.
> > > > >
> > > > > This makes it far easier to understand what is happening in these cases,
> > > > > avoiding confusion and bugs and allowing for future optimisation.
> > > > >
> > > > > As a result of this change we are also able to make vma_prepare(),
> > > > > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > > > > can_vma_merge_after() static and internal to vma.c.
> > > >
> > > > That's really great, but it would be even better if these code moves could
> > > > be a separate patch as it would make reviewing so much easier. But with git
> > > > diff's --color-moved to the rescue, let me try...
> > >
> > > Will separate out on respin.
> > >
> > > >
> > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > > > ---
> > > > > mm/mmap.c | 79 ++---
> > > > > mm/vma.c | 482 +++++++++++++++++++------------
> > > > > mm/vma.h | 51 +---
> > > > > tools/testing/vma/vma_internal.h | 6 +
> > > > > 4 files changed, 324 insertions(+), 294 deletions(-)
> >
> > ...
> > > > > + */
> > > > > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > > > > +{
> > > > > + bool is_special = vmg->flags & VM_SPECIAL;
> > > > > + struct vm_area_struct *prev = vmg->prev;
> > > > > + struct vm_area_struct *next = vmg->next;
> > > > > + unsigned long start = vmg->start;
> > > > > + unsigned long end = vmg->end;
> > > > > + pgoff_t pgoff = vmg->pgoff;
> > > > > + pgoff_t pglen = PHYS_PFN(end - start);
> > > > > +
> > > > > + VM_WARN_ON(vmg->vma);
> > > > > +
> > > > > + if (!prev && !next) {
> > > > > + /*
> > > > > + * Since the caller must have determined that the requested
> > > > > + * range is empty, vmg->vmi will be left pointing at the VMA
> > > > > + * immediately prior.
> > > > > + */
> > > >
> > > > OK that's perhaps not that obvious, as it seems copy_vma() is doing some
> > > > special dance to ensure this. Should we add it to the ASSUMPTIONS and assert
> > > > it, or is there a maple tree operation we can do to ensure it, ideally one
> > > > that's very cheap if the iterator is already set the way we want it to be?
> > > >
> > >
> > > To be fair this is something that was previously assumed, and I just added
> > > a comment.
> > >
> > > Will add to assumptions, and again I think any assert should be done in
> > > such a way that under non-CONFIG_DEBUG_VM nothing happens, maybe
> > > VM_WARN_ON()?
> > >
> > > Will try to come up with something.
>
> Something like:
>
> VM_BUG_ON(vma_iter_end(vmg->vmi) > start);
>
> > >
> > > > > + next = vmg->next = vma_next(vmg->vmi);
>
> and:
>
> VM_BUG_ON(vma_iter_addr(vmg->vmi) < end);
>
Ack x2.
I thought VM_BUG_ON() was 'not done' these days though... but checkpatch.pl
has become rather hit or miss as to what should be given attention or
not.
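Maybe the warning variants then, e.g. (equally untested):

	VM_WARN_ON_ONCE(vma_iter_end(vmg->vmi) > start);

which at least won't take the machine down if the assumption turns out
wrong somewhere.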
> > > > > + prev = vmg->prev = vma_prev(vmg->vmi);
> > > > > +
> > > > > + /* Avoid maple tree re-walk. */
> > > > > + if (is_special && prev)
> > > > > + vma_iter_next_range(vmg->vmi);
> > > >
> > > > I wish I knew what this did, but it seems to be the same as what the old
> > > > code did, so hopefully that's fine.
> > >
> > > I think the point is that we are about to exit, so we'd be left pointing at
> > > prev. But since we're exiting in just a second, we want to be pointing at
> > > the next vma, which will become the prev of the next merge attempt.
> > >
> > > Liam can maybe elucidate further.
> >
> > What you have to remember is that the vma iterator (vmg->vmi above)
> > contains (or, basically is) a maple state (usually written as mas). We
> > keep state of the maple tree walker so that we don't have to keep
> > re-walking to find the same thing. We move around the tree with this
> > maple state because going prev/next is faster from leaves (almost always
> > just the next thing in the node's array of pointers).
> >
> > We use the maple state to write as well, so the maple state needs to
> > point to the correct location in the tree for a write.
> >
> > The maple tree is a range-based tree, so each entry exists for a span of
> > values. A write happens at the lowest index and can overwrite
> > subsequent values. This means that the maple state needs to point to
> > the range containing the lowest index for the write (if it's pointing to
> > a node - it could walk from the top).
> >
> > A side effect of writing to the lowest index is that we need to point to
> > the previous vma if we are going to 'expand' the vma. The range is
> > essentially going to be from prev->start to "whatever we are expanding
> > over".
> >
> > In the old code, the vm_flags & VM_SPECIAL code meant there was no way
> > an expansion was going to happen, but we've moved the maple state to the
> > wrong location for a write of a new vma - so this vma_iter_next_range()
> > just moves it back. Then we "goto cannot_expand".
> >
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 07/10] mm: avoid using vma_merge() for new VMAs
2024-08-05 12:13 ` [PATCH 07/10] mm: avoid using vma_merge() for new VMAs Lorenzo Stoakes
2024-08-06 13:04 ` Petr Tesařík
2024-08-08 16:45 ` Vlastimil Babka
@ 2024-08-09 15:23 ` Liam R. Howlett
2024-08-09 17:20 ` Lorenzo Stoakes
2 siblings, 1 reply; 53+ messages in thread
From: Liam R. Howlett @ 2024-08-09 15:23 UTC (permalink / raw)
To: Lorenzo Stoakes; +Cc: linux-mm, linux-kernel, Andrew Morton, Vlastimil Babka
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240805 08:14]:
> In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> to use vma_expand() rather than invoke a full vma_merge() operation.
>
> Abstract this logic and eliminate all of the open-coding, and also use the
> same logic for all cases where we add new VMAs, so that rather than
> ultimately using vma_merge(), we use vma_expand().
>
> We implement this by replacing vma_merge_new_vma() with this newly
> abstracted logic.
>
> Doing so removes duplication and simplifies VMA merging in all such cases,
> laying the ground for us to eliminate the merging of new VMAs in
> vma_merge() altogether.
>
> This makes it far easier to understand what is happening in these cases,
> avoiding confusion and bugs and allowing for future optimisation.
>
> As a result of this change we are also able to make vma_prepare(),
> init_vma_prep(), vma_complete(), can_vma_merge_before() and
> can_vma_merge_after() static and internal to vma.c.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/mmap.c | 79 ++---
> mm/vma.c | 482 +++++++++++++++++++------------
> mm/vma.h | 51 +---
> tools/testing/vma/vma_internal.h | 6 +
> 4 files changed, 324 insertions(+), 294 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index f6593a81f73d..c03f50f46396 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> {
> struct mm_struct *mm = current->mm;
> struct vm_area_struct *vma = NULL;
> - struct vm_area_struct *next, *prev, *merge;
> - pgoff_t pglen = len >> PAGE_SHIFT;
> + struct vm_area_struct *merge;
> unsigned long charged = 0;
> unsigned long end = addr + len;
> bool writable_file_mapping = false;
> @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> vm_flags |= VM_ACCOUNT;
> }
>
> - next = vmg.next = vma_next(&vmi);
> - prev = vmg.prev = vma_prev(&vmi);
> - if (vm_flags & VM_SPECIAL) {
> - if (prev)
> - vma_iter_next_range(&vmi);
> - goto cannot_expand;
> - }
> -
> - /* Attempt to expand an old mapping */
> - /* Check next */
> - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> - /* We can adjust this as can_vma_merge_after() doesn't touch */
> - vmg.end = next->vm_end;
> - vma = vmg.vma = next;
> - vmg.pgoff = next->vm_pgoff - pglen;
> -
> - /* We may merge our NULL anon_vma with non-NULL in next. */
> - vmg.anon_vma = vma->anon_vma;
> - }
> -
> - /* Check prev */
> - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> - vmg.start = prev->vm_start;
> - vma = vmg.vma = prev;
> - vmg.pgoff = prev->vm_pgoff;
> - } else if (prev) {
> - vma_iter_next_range(&vmi);
> - }
> -
> - /* Actually expand, if possible */
> - if (vma && !vma_expand(&vmg)) {
> - khugepaged_enter_vma(vma, vm_flags);
> + vma = vma_merge_new_vma(&vmg);
> + if (vma)
> goto expanded;
> - }
> -
> - if (vma == prev)
> - vma_iter_set(&vmi, addr);
> -cannot_expand:
>
> /*
> * Determine the object being mapped and call the appropriate
> @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> * If vm_flags changed after call_mmap(), we should try merge
> * vma again as we may succeed this time.
> */
> - if (unlikely(vm_flags != vma->vm_flags && prev)) {
> - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> - vma->vm_start, vma->vm_end,
> - vma->vm_pgoff);
> + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> + merge = vma_merge_new_vma(&vmg);
> +
> if (merge) {
> /*
> * ->mmap() can change vma->vm_file and fput
> @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>
> vma_iter_set(&vmi, vma->vm_end);
> /* Undo any partial mapping done by a device driver. */
> - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
> + unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start,
> vma->vm_end, vma->vm_end, true);
> }
> if (writable_file_mapping)
> @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> unsigned long addr, unsigned long len, unsigned long flags)
> {
> struct mm_struct *mm = current->mm;
> - struct vma_prepare vp;
>
> /*
> * Check against address space limits by the changed size
> @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> */
> if (vma && vma->vm_end == addr) {
> struct vma_merge_struct vmg = {
> + .vmi = vmi,
> .prev = vma,
> + .next = NULL,
> + .start = addr,
> + .end = addr + len,
> .flags = flags,
> .pgoff = addr >> PAGE_SHIFT,
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> };
>
> - if (can_vma_merge_after(&vmg)) {
> - vma_iter_config(vmi, vma->vm_start, addr + len);
> - if (vma_iter_prealloc(vmi, vma))
> - goto unacct_fail;
> -
> - vma_start_write(vma);
> -
> - init_vma_prep(&vp, vma);
> - vma_prepare(&vp);
> - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> - vma->vm_end = addr + len;
> - vm_flags_set(vma, VM_SOFTDIRTY);
> - vma_iter_store(vmi, vma);
> -
> - vma_complete(&vp, vmi, mm);
> - khugepaged_enter_vma(vma, flags);
> + if (vma_merge_new_vma(&vmg))
> goto out;
This is very convoluted to follow. It seems vma_merge_new_vma() will do
what is necessary by finding out that it can merge after, then calls
vma_expand(), which calls commit_merge(), which sets the iterator to
vmg->start - but vmg->start isn't set to vma->vm_start, it is set to addr
here... actually, it's set to prev->vm_start in vma_merge_new_vma().
This is getting really hard to trace. I'm also concerned that the
overhead of following all these checks will cost performance on the brk
system call.
Maybe we can have a way to set up the vmg and call the right function to
just make the above happen? We know from can_vma_merge_after() that
it is going to work, so could we just call vma_start_write() and
commit_merge()?
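i.e., handwaving the exact commit_merge() arguments, something like:

	if (can_vma_merge_after(&vmg)) {
		vma_start_write(vma);
		if (commit_merge(&vmg, ...))
			goto unacct_fail;
		khugepaged_enter_vma(vma, flags);
		goto out;
	}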
Also, vma_merge_new_vma() could fail because it's out of memory, in which
case it should goto unacct_fail... but we now don't know if it's because
the merge wasn't allowed or if we are out of memory.
> - }
> }
>
> if (vma)
> diff --git a/mm/vma.c b/mm/vma.c
> index 55615392e8d2..a404cf718f9e 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> *
> * We assume the vma may be removed as part of the merge.
> */
> -bool
> -can_vma_merge_before(struct vma_merge_struct *vmg)
> +static bool can_vma_merge_before(struct vma_merge_struct *vmg)
> {
> pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
>
> @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg)
> *
> * We assume that vma is not removed as part of the merge.
> */
> -bool can_vma_merge_after(struct vma_merge_struct *vmg)
> +static bool can_vma_merge_after(struct vma_merge_struct *vmg)
> {
> if (is_mergeable_vma(vmg, false) &&
> is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg)
> return false;
> }
>
> +static void __vma_link_file(struct vm_area_struct *vma,
> + struct address_space *mapping)
> +{
> + if (vma_is_shared_maywrite(vma))
> + mapping_allow_writable(mapping);
> +
> + flush_dcache_mmap_lock(mapping);
> + vma_interval_tree_insert(vma, &mapping->i_mmap);
> + flush_dcache_mmap_unlock(mapping);
> +}
> +
> +/*
> + * Requires inode->i_mapping->i_mmap_rwsem
> + */
> +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> + struct address_space *mapping)
> +{
> + if (vma_is_shared_maywrite(vma))
> + mapping_unmap_writable(mapping);
> +
> + flush_dcache_mmap_lock(mapping);
> + vma_interval_tree_remove(vma, &mapping->i_mmap);
> + flush_dcache_mmap_unlock(mapping);
> +}
> +
> +/*
> + * vma_prepare() - Helper function for handling locking VMAs prior to altering
> + * @vp: The initialized vma_prepare struct
> + */
> +static void vma_prepare(struct vma_prepare *vp)
> +{
> + if (vp->file) {
> + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> +
> + if (vp->adj_next)
> + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> + vp->adj_next->vm_end);
> +
> + i_mmap_lock_write(vp->mapping);
> + if (vp->insert && vp->insert->vm_file) {
> + /*
> + * Put into interval tree now, so instantiated pages
> + * are visible to arm/parisc __flush_dcache_page
> + * throughout; but we cannot insert into address
> + * space until vma start or end is updated.
> + */
> + __vma_link_file(vp->insert,
> + vp->insert->vm_file->f_mapping);
> + }
> + }
> +
> + if (vp->anon_vma) {
> + anon_vma_lock_write(vp->anon_vma);
> + anon_vma_interval_tree_pre_update_vma(vp->vma);
> + if (vp->adj_next)
> + anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> + }
> +
> + if (vp->file) {
> + flush_dcache_mmap_lock(vp->mapping);
> + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> + if (vp->adj_next)
> + vma_interval_tree_remove(vp->adj_next,
> + &vp->mapping->i_mmap);
> + }
> +
> +}
> +
> +/*
> + * vma_complete- Helper function for handling the unlocking after altering VMAs,
> + * or for inserting a VMA.
> + *
> + * @vp: The vma_prepare struct
> + * @vmi: The vma iterator
> + * @mm: The mm_struct
> + */
> +static void vma_complete(struct vma_prepare *vp,
> + struct vma_iterator *vmi, struct mm_struct *mm)
> +{
> + if (vp->file) {
> + if (vp->adj_next)
> + vma_interval_tree_insert(vp->adj_next,
> + &vp->mapping->i_mmap);
> + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> + flush_dcache_mmap_unlock(vp->mapping);
> + }
> +
> + if (vp->remove && vp->file) {
> + __remove_shared_vm_struct(vp->remove, vp->mapping);
> + if (vp->remove2)
> + __remove_shared_vm_struct(vp->remove2, vp->mapping);
> + } else if (vp->insert) {
> + /*
> + * split_vma has split insert from vma, and needs
> + * us to insert it before dropping the locks
> + * (it may either follow vma or precede it).
> + */
> + vma_iter_store(vmi, vp->insert);
> + mm->map_count++;
> + }
> +
> + if (vp->anon_vma) {
> + anon_vma_interval_tree_post_update_vma(vp->vma);
> + if (vp->adj_next)
> + anon_vma_interval_tree_post_update_vma(vp->adj_next);
> + anon_vma_unlock_write(vp->anon_vma);
> + }
> +
> + if (vp->file) {
> + i_mmap_unlock_write(vp->mapping);
> + uprobe_mmap(vp->vma);
> +
> + if (vp->adj_next)
> + uprobe_mmap(vp->adj_next);
> + }
> +
> + if (vp->remove) {
> +again:
> + vma_mark_detached(vp->remove, true);
> + if (vp->file) {
> + uprobe_munmap(vp->remove, vp->remove->vm_start,
> + vp->remove->vm_end);
> + fput(vp->file);
> + }
> + if (vp->remove->anon_vma)
> + anon_vma_merge(vp->vma, vp->remove);
> + mm->map_count--;
> + mpol_put(vma_policy(vp->remove));
> + if (!vp->remove2)
> + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> + vm_area_free(vp->remove);
> +
> + /*
> + * In mprotect's case 6 (see comments on vma_merge),
> + * we are removing both mid and next vmas
> + */
> + if (vp->remove2) {
> + vp->remove = vp->remove2;
> + vp->remove2 = NULL;
> + goto again;
> + }
> + }
> + if (vp->insert && vp->file)
> + uprobe_mmap(vp->insert);
> + validate_mm(mm);
> +}
> +
> +/*
> + * init_vma_prep() - Initializer wrapper for vma_prepare struct
> + * @vp: The vma_prepare struct
> + * @vma: The vma that will be altered once locked
> + */
> +static void init_vma_prep(struct vma_prepare *vp,
> + struct vm_area_struct *vma)
> +{
> + init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> +}
> +
> /*
> * Close a vm structure and free it.
> */
> @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
> vm_unacct_memory(nr_accounted);
> }
>
> -/*
> - * init_vma_prep() - Initializer wrapper for vma_prepare struct
> - * @vp: The vma_prepare struct
> - * @vma: The vma that will be altered once locked
> - */
> -void init_vma_prep(struct vma_prepare *vp,
> - struct vm_area_struct *vma)
> -{
> - init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> -}
> -
> -/*
> - * Requires inode->i_mapping->i_mmap_rwsem
> - */
> -static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> - struct address_space *mapping)
> -{
> - if (vma_is_shared_maywrite(vma))
> - mapping_unmap_writable(mapping);
> -
> - flush_dcache_mmap_lock(mapping);
> - vma_interval_tree_remove(vma, &mapping->i_mmap);
> - flush_dcache_mmap_unlock(mapping);
> -}
> -
> /*
> * vma has some anon_vma assigned, and is already inserted on that
> * anon_vma's interval trees.
> @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
> }
>
> -static void __vma_link_file(struct vm_area_struct *vma,
> - struct address_space *mapping)
> -{
> - if (vma_is_shared_maywrite(vma))
> - mapping_allow_writable(mapping);
> -
> - flush_dcache_mmap_lock(mapping);
> - vma_interval_tree_insert(vma, &mapping->i_mmap);
> - flush_dcache_mmap_unlock(mapping);
> -}
> -
> -/*
> - * vma_prepare() - Helper function for handling locking VMAs prior to altering
> - * @vp: The initialized vma_prepare struct
> - */
> -void vma_prepare(struct vma_prepare *vp)
> -{
> - if (vp->file) {
> - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> -
> - if (vp->adj_next)
> - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> - vp->adj_next->vm_end);
> -
> - i_mmap_lock_write(vp->mapping);
> - if (vp->insert && vp->insert->vm_file) {
> - /*
> - * Put into interval tree now, so instantiated pages
> - * are visible to arm/parisc __flush_dcache_page
> - * throughout; but we cannot insert into address
> - * space until vma start or end is updated.
> - */
> - __vma_link_file(vp->insert,
> - vp->insert->vm_file->f_mapping);
> - }
> - }
> -
> - if (vp->anon_vma) {
> - anon_vma_lock_write(vp->anon_vma);
> - anon_vma_interval_tree_pre_update_vma(vp->vma);
> - if (vp->adj_next)
> - anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> - }
> -
> - if (vp->file) {
> - flush_dcache_mmap_lock(vp->mapping);
> - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> - if (vp->adj_next)
> - vma_interval_tree_remove(vp->adj_next,
> - &vp->mapping->i_mmap);
> - }
> -
> -}
> -
> /*
> * dup_anon_vma() - Helper function to duplicate anon_vma
> * @dst: The destination VMA
> @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm)
> }
> #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
>
> +/*
> + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> + *
> + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> + * (exclusive), which we try to merge with any adjacent VMAs if possible.
> + *
> + * We are about to add a VMA to the address space starting at @vmg->start and
> + * ending at @vmg->end. There are three different possible scenarios:
> + *
> + * 1. There is a VMA with identical properties immediately adjacent to the
> + * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> + * EXPAND that VMA:
> + *
> + * Proposed: |-----| or |-----|
> + * Existing: |----| |----|
> + *
> + * 2. There are VMAs with identical properties immediately adjacent to the
> + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> + * EXPAND the former and REMOVE the latter:
> + *
> + * Proposed: |-----|
> + * Existing: |----| |----|
> + *
> + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> + * VMAs do not have identical attributes - NO MERGE POSSIBLE.
We still have diagrams; that's too bad.
> + *
> + * In instances where we can merge, this function returns the expanded VMA which
> + * will have its range adjusted accordingly and the underlying maple tree also
> + * adjusted.
> + *
> + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> + * to the VMA we expanded.
> + *
> + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> + * expanded range.
> + *
> + * ASSUMPTIONS:
> + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
> + */
> +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> +{
> + bool is_special = vmg->flags & VM_SPECIAL;
> + struct vm_area_struct *prev = vmg->prev;
> + struct vm_area_struct *next = vmg->next;
> + unsigned long start = vmg->start;
> + unsigned long end = vmg->end;
> + pgoff_t pgoff = vmg->pgoff;
> + pgoff_t pglen = PHYS_PFN(end - start);
> +
> + VM_WARN_ON(vmg->vma);
> +
> + if (!prev && !next) {
> + /*
> + * Since the caller must have determined that the requested
> + * range is empty, vmg->vmi will be left pointing at the VMA
> + * immediately prior.
> + */
> + next = vmg->next = vma_next(vmg->vmi);
> + prev = vmg->prev = vma_prev(vmg->vmi);
> +
> + /* Avoid maple tree re-walk. */
> + if (is_special && prev)
> + vma_iter_next_range(vmg->vmi);
> + }
> +
> + /* If special mapping or no adjacent VMAs, nothing to merge. */
> + if (is_special || (!prev && !next))
> + return NULL;
> +
> + /* If we can merge with the following VMA, adjust vmg accordingly. */
> + if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> + /*
> + * We can adjust this here as can_vma_merge_after() doesn't
> + * touch vmg->end.
> + */
> + vmg->end = next->vm_end;
> + vmg->vma = next;
> + vmg->pgoff = next->vm_pgoff - pglen;
> +
> + vmg->anon_vma = next->anon_vma;
> + }
> +
> + /* If we can merge with the previous VMA, adjust vmg accordingly. */
> + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> + vmg->start = prev->vm_start;
> + vmg->vma = prev;
> + vmg->pgoff = prev->vm_pgoff;
> + } else if (prev) {
> + vma_iter_next_range(vmg->vmi);
> + }
> +
> + /*
> + * Now try to expand adjacent VMA(s). This takes care of removing the
> + * following VMA if we have VMAs on both sides.
> + */
> + if (vmg->vma && !vma_expand(vmg)) {
> + khugepaged_enter_vma(vmg->vma, vmg->flags);
> + return vmg->vma;
> + }
> +
> + /* If expansion failed, reset state. Allows us to retry merge later. */
> + vmg->vma = NULL;
> + vmg->anon_vma = NULL;
> + vmg->start = start;
> + vmg->end = end;
> + vmg->pgoff = pgoff;
> + if (vmg->vma == prev)
> + vma_iter_set(vmg->vmi, start);
> +
> + return NULL;
> +}
Can we split this up a bit? I was thinking that, for the brk() case, we
need to know if we can merge prev, and whether that merge fails. I was
thinking of something where you create a vmg with whatever, then call
can_merge_prev, and that'd do the block above and change the vmg as
required. We could have a can_merge_next that does the same, then we
need to prepare the change (dup anon vma, preallocate for maple tree,
locking, whatever), then commit.
There could still be the function above, but with smaller widgets to do
what we need so we gain flexibility in what we decide to check - prev
only in brk().
I'm not sure if we'd need one for expanding vs existing or if we could
check !vmg->vma to figure that out...
This would also have the effect of self-documenting what is going on.
For brk, it would look like this:
if (vmg_expand_prev()) {
if (vmg_prepare())
goto no_mem;
vmg_commit();
}
I think this would change your exposed interface, at least for brk() -
or a wrapper for this, but small widgets may gain us some
self-documented code?
If you really don't like the exposure of the interface, the vmg could
have a return so we can see if we ran out of memory?
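Sketch of what I mean (field name invented):

	struct vma_merge_struct {
		...
		int error;	/* 0, or -ENOMEM if an allocation failed */
	};

then do_brk_flags() could distinguish the two failure modes:

	if (vma_merge_new_vma(&vmg))
		goto out;
	if (vmg.error == -ENOMEM)
		goto unacct_fail;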
> +
> /*
> * vma_expand - Expand an existing VMA
> *
> @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm)
> * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
> * vmg->next needs to be handled by the caller.
> *
> - * Returns: 0 on success
> + * Returns: 0 on success.
> + *
> + * ASSUMPTIONS:
> + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
> + * - The caller must have set @vmg->prev and @vmg->next.
> */
> int vma_expand(struct vma_merge_struct *vmg)
> {
> @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg)
> return 0;
> }
>
> -/*
> - * vma_complete- Helper function for handling the unlocking after altering VMAs,
> - * or for inserting a VMA.
> - *
> - * @vp: The vma_prepare struct
> - * @vmi: The vma iterator
> - * @mm: The mm_struct
> - */
> -void vma_complete(struct vma_prepare *vp,
> - struct vma_iterator *vmi, struct mm_struct *mm)
> -{
> - if (vp->file) {
> - if (vp->adj_next)
> - vma_interval_tree_insert(vp->adj_next,
> - &vp->mapping->i_mmap);
> - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> - flush_dcache_mmap_unlock(vp->mapping);
> - }
> -
> - if (vp->remove && vp->file) {
> - __remove_shared_vm_struct(vp->remove, vp->mapping);
> - if (vp->remove2)
> - __remove_shared_vm_struct(vp->remove2, vp->mapping);
> - } else if (vp->insert) {
> - /*
> - * split_vma has split insert from vma, and needs
> - * us to insert it before dropping the locks
> - * (it may either follow vma or precede it).
> - */
> - vma_iter_store(vmi, vp->insert);
> - mm->map_count++;
> - }
> -
> - if (vp->anon_vma) {
> - anon_vma_interval_tree_post_update_vma(vp->vma);
> - if (vp->adj_next)
> - anon_vma_interval_tree_post_update_vma(vp->adj_next);
> - anon_vma_unlock_write(vp->anon_vma);
> - }
> -
> - if (vp->file) {
> - i_mmap_unlock_write(vp->mapping);
> - uprobe_mmap(vp->vma);
> -
> - if (vp->adj_next)
> - uprobe_mmap(vp->adj_next);
> - }
> -
> - if (vp->remove) {
> -again:
> - vma_mark_detached(vp->remove, true);
> - if (vp->file) {
> - uprobe_munmap(vp->remove, vp->remove->vm_start,
> - vp->remove->vm_end);
> - fput(vp->file);
> - }
> - if (vp->remove->anon_vma)
> - anon_vma_merge(vp->vma, vp->remove);
> - mm->map_count--;
> - mpol_put(vma_policy(vp->remove));
> - if (!vp->remove2)
> - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> - vm_area_free(vp->remove);
> -
> - /*
> - * In mprotect's case 6 (see comments on vma_merge),
> - * we are removing both mid and next vmas
> - */
> - if (vp->remove2) {
> - vp->remove = vp->remove2;
> - vp->remove2 = NULL;
> - goto again;
> - }
> - }
> - if (vp->insert && vp->file)
> - uprobe_mmap(vp->insert);
> - validate_mm(mm);
> -}
> -
> /*
> * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
> * @vmi: The vma iterator
> @@ -1261,20 +1378,6 @@ struct vm_area_struct
> return vma_modify(&vmg);
> }
>
> -/*
> - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
> - * must ensure that [start, end) does not overlap any existing VMA.
> - */
> -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> -{
> - if (!vmg->prev) {
> - vmg->prev = vma_prev(vmg->vmi);
> - vma_iter_set(vmg->vmi, vmg->start);
> - }
> -
> - return vma_merge(vmg);
> -}
> -
> /*
> * Expand vma by delta bytes, potentially merging with an immediately adjacent
> * VMA with identical properties.
> @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> .anon_name = anon_vma_name(vma),
> };
>
> - /* vma is specified as prev, so case 1 or 2 will apply. */
> - return vma_merge(&vmg);
> + return vma_merge_new_vma(&vmg);
> }
>
> void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> struct vm_area_struct *vma = *vmap;
> unsigned long vma_start = vma->vm_start;
> struct mm_struct *mm = vma->vm_mm;
> - struct vm_area_struct *new_vma, *prev;
> + struct vm_area_struct *new_vma;
> bool faulted_in_anon_vma = true;
> VMA_ITERATOR(vmi, mm, addr);
> + struct vma_merge_struct vmg = {
> + .vmi = &vmi,
> + .start = addr,
> + .end = addr + len,
> + .flags = vma->vm_flags,
> + .pgoff = pgoff,
> + .file = vma->vm_file,
> + .anon_vma = vma->anon_vma,
> + .policy = vma_policy(vma),
> + .uffd_ctx = vma->vm_userfaultfd_ctx,
> + .anon_name = anon_vma_name(vma),
> + };
>
> /*
> * If anonymous vma has not yet been faulted, update new pgoff
> * to match new location, to increase its chance of merging.
> */
> if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> - pgoff = addr >> PAGE_SHIFT;
> + pgoff = vmg.pgoff = addr >> PAGE_SHIFT;
> faulted_in_anon_vma = false;
> }
>
> - new_vma = find_vma_prev(mm, addr, &prev);
> + new_vma = find_vma_prev(mm, addr, &vmg.prev);
> if (new_vma && new_vma->vm_start < addr + len)
> return NULL; /* should never get here */
>
> - new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
> + vmg.next = vma_next(&vmi);
> + vma_prev(&vmi);
> +
> + new_vma = vma_merge_new_vma(&vmg);
> +
> if (new_vma) {
> /*
> * Source vma may have been merged into new_vma
> diff --git a/mm/vma.h b/mm/vma.h
> index 50459f9e4c7f..bbb173053f34 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
> /* Required for expand_downwards(). */
> void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
>
> -/* Required for do_brk_flags(). */
> -void vma_prepare(struct vma_prepare *vp);
> -
> -/* Required for do_brk_flags(). */
> -void init_vma_prep(struct vma_prepare *vp,
> - struct vm_area_struct *vma);
> -
> -/* Required for do_brk_flags(). */
> -void vma_complete(struct vma_prepare *vp,
> - struct vma_iterator *vmi, struct mm_struct *mm);
> -
> int vma_expand(struct vma_merge_struct *vmg);
> int vma_shrink(struct vma_merge_struct *vmg);
>
> @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> struct vm_area_struct *next, unsigned long start,
> unsigned long end, unsigned long tree_end, bool mm_wr_locked);
>
> -/*
> - * Can we merge the VMA described by vmg into the following VMA vmg->next?
> - *
> - * Required by mmap_region().
> - */
> -bool can_vma_merge_before(struct vma_merge_struct *vmg);
> -
> -/*
> - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> - *
> - * Required by mmap_region() and do_brk_flags().
> - */
> -bool can_vma_merge_after(struct vma_merge_struct *vmg);
> -
> /* We are about to modify the VMA's flags. */
> struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> struct vm_area_struct *prev,
> @@ -133,31 +108,7 @@ struct vm_area_struct
> unsigned long new_flags,
> struct vm_userfaultfd_ctx new_ctx);
>
> -struct vm_area_struct
> -*vma_merge_new_vma(struct vma_merge_struct *vmg);
> -
> -/* Temporary convenience wrapper. */
> -static inline struct vm_area_struct
> -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
> - struct vm_area_struct *vma, unsigned long start,
> - unsigned long end, pgoff_t pgoff)
> -{
> - struct vma_merge_struct vmg = {
> - .vmi = vmi,
> - .prev = prev,
> - .start = start,
> - .end = end,
> - .flags = vma->vm_flags,
> - .file = vma->vm_file,
> - .anon_vma = vma->anon_vma,
> - .pgoff = pgoff,
> - .policy = vma_policy(vma),
> - .uffd_ctx = vma->vm_userfaultfd_ctx,
> - .anon_name = anon_vma_name(vma),
> - };
> -
> - return vma_merge_new_vma(&vmg);
> -}
> +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
>
> /*
> * Temporary wrapper around vma_merge() so we can have a common interface for
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 40797a819d3d..a39a734282d0 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi)
> mas_destroy(&vmi->mas);
> }
>
> +static inline
> +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
> +{
> + return mas_next_range(&vmi->mas, ULONG_MAX);
> +}
> +
> static inline void vm_acct_memory(long pages)
> {
> }
> --
> 2.45.2
>
^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH 07/10] mm: avoid using vma_merge() for new VMAs
2024-08-09 15:23 ` Liam R. Howlett
@ 2024-08-09 17:20 ` Lorenzo Stoakes
0 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-09 17:20 UTC (permalink / raw)
To: Liam R. Howlett, linux-mm, linux-kernel, Andrew Morton, Vlastimil Babka
On Fri, Aug 09, 2024 at 11:23:30AM GMT, Liam R. Howlett wrote:
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240805 08:14]:
> > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > to use vma_expand() rather than invoke a full vma_merge() operation.
> >
> > Abstract this logic and eliminate all of the open-coding, and also use the
> > same logic for all cases where we add new VMAs so that, rather than
> > ultimately using vma_merge(), we use vma_expand().
> >
> > We implement this by replacing vma_merge_new_vma() with this newly
> > abstracted logic.
> >
> > Doing so removes duplication and simplifies VMA merging in all such cases,
> > laying the ground for us to eliminate the merging of new VMAs in
> > vma_merge() altogether.
> >
> > This makes it far easier to understand what is happening in these cases,
> > avoiding confusion and bugs, and allowing for future optimisation.
> >
> > As a result of this change we are also able to make vma_prepare(),
> > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > can_vma_merge_after() static and internal to vma.c.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > mm/mmap.c | 79 ++---
> > mm/vma.c | 482 +++++++++++++++++++------------
> > mm/vma.h | 51 +---
> > tools/testing/vma/vma_internal.h | 6 +
> > 4 files changed, 324 insertions(+), 294 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index f6593a81f73d..c03f50f46396 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > {
> > struct mm_struct *mm = current->mm;
> > struct vm_area_struct *vma = NULL;
> > - struct vm_area_struct *next, *prev, *merge;
> > - pgoff_t pglen = len >> PAGE_SHIFT;
> > + struct vm_area_struct *merge;
> > unsigned long charged = 0;
> > unsigned long end = addr + len;
> > bool writable_file_mapping = false;
> > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > vm_flags |= VM_ACCOUNT;
> > }
> >
> > - next = vmg.next = vma_next(&vmi);
> > - prev = vmg.prev = vma_prev(&vmi);
> > - if (vm_flags & VM_SPECIAL) {
> > - if (prev)
> > - vma_iter_next_range(&vmi);
> > - goto cannot_expand;
> > - }
> > -
> > - /* Attempt to expand an old mapping */
> > - /* Check next */
> > - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> > - /* We can adjust this as can_vma_merge_after() doesn't touch */
> > - vmg.end = next->vm_end;
> > - vma = vmg.vma = next;
> > - vmg.pgoff = next->vm_pgoff - pglen;
> > -
> > - /* We may merge our NULL anon_vma with non-NULL in next. */
> > - vmg.anon_vma = vma->anon_vma;
> > - }
> > -
> > - /* Check prev */
> > - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> > - vmg.start = prev->vm_start;
> > - vma = vmg.vma = prev;
> > - vmg.pgoff = prev->vm_pgoff;
> > - } else if (prev) {
> > - vma_iter_next_range(&vmi);
> > - }
> > -
> > - /* Actually expand, if possible */
> > - if (vma && !vma_expand(&vmg)) {
> > - khugepaged_enter_vma(vma, vm_flags);
> > + vma = vma_merge_new_vma(&vmg);
> > + if (vma)
> > goto expanded;
> > - }
> > -
> > - if (vma == prev)
> > - vma_iter_set(&vmi, addr);
> > -cannot_expand:
> >
> > /*
> > * Determine the object being mapped and call the appropriate
> > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > * If vm_flags changed after call_mmap(), we should try merge
> > * vma again as we may succeed this time.
> > */
> > - if (unlikely(vm_flags != vma->vm_flags && prev)) {
> > - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> > - vma->vm_start, vma->vm_end,
> > - vma->vm_pgoff);
> > + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> > + merge = vma_merge_new_vma(&vmg);
> > +
> > if (merge) {
> > /*
> > * ->mmap() can change vma->vm_file and fput
> > @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >
> > vma_iter_set(&vmi, vma->vm_end);
> > /* Undo any partial mapping done by a device driver. */
> > - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
> > + unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start,
> > vma->vm_end, vma->vm_end, true);
> > }
> > if (writable_file_mapping)
> > @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > unsigned long addr, unsigned long len, unsigned long flags)
> > {
> > struct mm_struct *mm = current->mm;
> > - struct vma_prepare vp;
> >
> > /*
> > * Check against address space limits by the changed size
> > @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > */
> > if (vma && vma->vm_end == addr) {
> > struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > .prev = vma,
> > + .next = NULL,
> > + .start = addr,
> > + .end = addr + len,
> > .flags = flags,
> > .pgoff = addr >> PAGE_SHIFT,
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > };
> >
> > - if (can_vma_merge_after(&vmg)) {
> > - vma_iter_config(vmi, vma->vm_start, addr + len);
> > - if (vma_iter_prealloc(vmi, vma))
> > - goto unacct_fail;
> > -
> > - vma_start_write(vma);
> > -
> > - init_vma_prep(&vp, vma);
> > - vma_prepare(&vp);
> > - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > - vma->vm_end = addr + len;
> > - vm_flags_set(vma, VM_SOFTDIRTY);
> > - vma_iter_store(vmi, vma);
> > -
> > - vma_complete(&vp, vmi, mm);
> > - khugepaged_enter_vma(vma, flags);
> > + if (vma_merge_new_vma(&vmg))
> > goto out;
>
> This is very convoluted to follow. It seems vma_merge_new_vma() will do
> what is necessary by finding out that it can merge after, then calls
> vma_expand(), which calls commit_merge(), which sets the iterator to
> vmg->start - but vmg->start isn't set to vma->vm_start, it is set to addr
> here... it's actually set to prev->vm_start in vma_merge_new_vma().
Sorry, it's kind of hard to make a change like this all that lovely to
follow.
The only extra checks before it checks mergeability are the prev check -
prev is set to vma, so it is not NULL (except in the case of the first vma,
which is wasteful, but a one-off) - and the is_special and next checks.
So this isn't _hugely_ terrible.
As to the vmi positioning... I thought there might be some things that we
could improve here :)
However, we set prev == vma here, so vmg->start = vma->vm_start and
vmg->end = addr + len, which is the same range as before, right?
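Concretely, with made-up numbers: say vma spans [0x1000, 0x3000) and brk()
grows the mapping by one page, so addr == 0x3000 and len == 0x1000. We
initialise vmg.start = 0x3000 and vmg.end = 0x4000 with vmg.prev = vma. If
can_vma_merge_after() succeeds, vma_merge_new_vma() sets vmg->start =
prev->vm_start == 0x1000 and vmg->vma = prev, so vma_expand() widens prev
to [0x1000, 0x4000) - i.e. exactly the [vma->vm_start, addr + len) range
the old open-coded path passed to vma_iter_config().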
I do notice that we've incorrectly removed the vm_flags_set(VM_SOFTDIRTY)
though... will add that back in. Ugh.
Again, so frustrating to not have these functions testable. I'd like to
find a way to move things around if possible at some point. But if we're
worried about call stack size, that may not be feasible.
>
> This is getting really hard to trace through. I'm also concerned
> that the overhead of following all these checks will cost performance on
> the brk system call?
I'll take a look at mm-tests results.
>
> Maybe we can have a way to set up the vmg and call the right function to
> just make the above happen? We know with the can_vma_merge_after() that
> it is going to work, so could we just call vma_start_write() and
> commit_merge()?
I'm happy to add an enum or something to set a specific mode if we want,
but maybe it's worth looking at the scalability results first to see if
there's really a regression?
I mean, from our discussions on IRC, it sounds like this is very possible,
so we could figure something out.
>
> Also, vma_merge_new_vma() could fail because it's out of memory, so it
> should goto unacct_fail... but we now don't know if it failed because the
> merge wasn't allowed or because we are out of memory.
Yes this is a mistake, damn it. Will fix. Grumble about untestability of
these functions x2.
As per your comment below, I think the simplest way may be to have an
error or outcome field or some such that we can check to see _why_ things
failed.
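Something like the below, perhaps - to be clear, these names are invented
purely to illustrate the shape of the thing, nothing here exists yet:

        /* Hypothetical - illustrative only. */
        enum vma_merge_state {
                VMA_MERGE_START,        /* No merge attempted yet. */
                VMA_MERGE_ERROR_NOMEM,  /* Failed due to -ENOMEM. */
                VMA_MERGE_NOMERGE,      /* Merge simply not possible. */
                VMA_MERGE_SUCCESS,      /* Merge succeeded. */
        };

        /* New field in struct vma_merge_struct: */
        enum vma_merge_state state;

Then do_brk_flags() could distinguish the two failure modes:

        if (vma_merge_new_vma(&vmg))
                goto out;
        if (vmg.state == VMA_MERGE_ERROR_NOMEM)
                goto unacct_fail;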
>
> > - }
> > }
> >
> > if (vma)
> > diff --git a/mm/vma.c b/mm/vma.c
> > index 55615392e8d2..a404cf718f9e 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> > *
> > * We assume the vma may be removed as part of the merge.
> > */
> > -bool
> > -can_vma_merge_before(struct vma_merge_struct *vmg)
> > +static bool can_vma_merge_before(struct vma_merge_struct *vmg)
> > {
> > pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> >
> > @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg)
> > *
> > * We assume that vma is not removed as part of the merge.
> > */
> > -bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > +static bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > {
> > if (is_mergeable_vma(vmg, false) &&
> > is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> > @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > return false;
> > }
> >
> > +static void __vma_link_file(struct vm_area_struct *vma,
> > + struct address_space *mapping)
> > +{
> > + if (vma_is_shared_maywrite(vma))
> > + mapping_allow_writable(mapping);
> > +
> > + flush_dcache_mmap_lock(mapping);
> > + vma_interval_tree_insert(vma, &mapping->i_mmap);
> > + flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * Requires inode->i_mapping->i_mmap_rwsem
> > + */
> > +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > + struct address_space *mapping)
> > +{
> > + if (vma_is_shared_maywrite(vma))
> > + mapping_unmap_writable(mapping);
> > +
> > + flush_dcache_mmap_lock(mapping);
> > + vma_interval_tree_remove(vma, &mapping->i_mmap);
> > + flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > + * @vp: The initialized vma_prepare struct
> > + */
> > +static void vma_prepare(struct vma_prepare *vp)
> > +{
> > + if (vp->file) {
> > + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > +
> > + if (vp->adj_next)
> > + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > + vp->adj_next->vm_end);
> > +
> > + i_mmap_lock_write(vp->mapping);
> > + if (vp->insert && vp->insert->vm_file) {
> > + /*
> > + * Put into interval tree now, so instantiated pages
> > + * are visible to arm/parisc __flush_dcache_page
> > + * throughout; but we cannot insert into address
> > + * space until vma start or end is updated.
> > + */
> > + __vma_link_file(vp->insert,
> > + vp->insert->vm_file->f_mapping);
> > + }
> > + }
> > +
> > + if (vp->anon_vma) {
> > + anon_vma_lock_write(vp->anon_vma);
> > + anon_vma_interval_tree_pre_update_vma(vp->vma);
> > + if (vp->adj_next)
> > + anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > + }
> > +
> > + if (vp->file) {
> > + flush_dcache_mmap_lock(vp->mapping);
> > + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > + if (vp->adj_next)
> > + vma_interval_tree_remove(vp->adj_next,
> > + &vp->mapping->i_mmap);
> > + }
> > +
> > +}
> > +
> > +/*
> > + * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > + * or for inserting a VMA.
> > + *
> > + * @vp: The vma_prepare struct
> > + * @vmi: The vma iterator
> > + * @mm: The mm_struct
> > + */
> > +static void vma_complete(struct vma_prepare *vp,
> > + struct vma_iterator *vmi, struct mm_struct *mm)
> > +{
> > + if (vp->file) {
> > + if (vp->adj_next)
> > + vma_interval_tree_insert(vp->adj_next,
> > + &vp->mapping->i_mmap);
> > + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > + flush_dcache_mmap_unlock(vp->mapping);
> > + }
> > +
> > + if (vp->remove && vp->file) {
> > + __remove_shared_vm_struct(vp->remove, vp->mapping);
> > + if (vp->remove2)
> > + __remove_shared_vm_struct(vp->remove2, vp->mapping);
> > + } else if (vp->insert) {
> > + /*
> > + * split_vma has split insert from vma, and needs
> > + * us to insert it before dropping the locks
> > + * (it may either follow vma or precede it).
> > + */
> > + vma_iter_store(vmi, vp->insert);
> > + mm->map_count++;
> > + }
> > +
> > + if (vp->anon_vma) {
> > + anon_vma_interval_tree_post_update_vma(vp->vma);
> > + if (vp->adj_next)
> > + anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > + anon_vma_unlock_write(vp->anon_vma);
> > + }
> > +
> > + if (vp->file) {
> > + i_mmap_unlock_write(vp->mapping);
> > + uprobe_mmap(vp->vma);
> > +
> > + if (vp->adj_next)
> > + uprobe_mmap(vp->adj_next);
> > + }
> > +
> > + if (vp->remove) {
> > +again:
> > + vma_mark_detached(vp->remove, true);
> > + if (vp->file) {
> > + uprobe_munmap(vp->remove, vp->remove->vm_start,
> > + vp->remove->vm_end);
> > + fput(vp->file);
> > + }
> > + if (vp->remove->anon_vma)
> > + anon_vma_merge(vp->vma, vp->remove);
> > + mm->map_count--;
> > + mpol_put(vma_policy(vp->remove));
> > + if (!vp->remove2)
> > + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > + vm_area_free(vp->remove);
> > +
> > + /*
> > + * In mprotect's case 6 (see comments on vma_merge),
> > + * we are removing both mid and next vmas
> > + */
> > + if (vp->remove2) {
> > + vp->remove = vp->remove2;
> > + vp->remove2 = NULL;
> > + goto again;
> > + }
> > + }
> > + if (vp->insert && vp->file)
> > + uprobe_mmap(vp->insert);
> > + validate_mm(mm);
> > +}
> > +
> > +/*
> > + * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > + * @vp: The vma_prepare struct
> > + * @vma: The vma that will be altered once locked
> > + */
> > +static void init_vma_prep(struct vma_prepare *vp,
> > + struct vm_area_struct *vma)
> > +{
> > + init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > +}
> > +
> > /*
> > * Close a vm structure and free it.
> > */
> > @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
> > vm_unacct_memory(nr_accounted);
> > }
> >
> > -/*
> > - * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > - * @vp: The vma_prepare struct
> > - * @vma: The vma that will be altered once locked
> > - */
> > -void init_vma_prep(struct vma_prepare *vp,
> > - struct vm_area_struct *vma)
> > -{
> > - init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > -}
> > -
> > -/*
> > - * Requires inode->i_mapping->i_mmap_rwsem
> > - */
> > -static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > - struct address_space *mapping)
> > -{
> > - if (vma_is_shared_maywrite(vma))
> > - mapping_unmap_writable(mapping);
> > -
> > - flush_dcache_mmap_lock(mapping);
> > - vma_interval_tree_remove(vma, &mapping->i_mmap);
> > - flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> > /*
> > * vma has some anon_vma assigned, and is already inserted on that
> > * anon_vma's interval trees.
> > @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> > anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
> > }
> >
> > -static void __vma_link_file(struct vm_area_struct *vma,
> > - struct address_space *mapping)
> > -{
> > - if (vma_is_shared_maywrite(vma))
> > - mapping_allow_writable(mapping);
> > -
> > - flush_dcache_mmap_lock(mapping);
> > - vma_interval_tree_insert(vma, &mapping->i_mmap);
> > - flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> > -/*
> > - * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > - * @vp: The initialized vma_prepare struct
> > - */
> > -void vma_prepare(struct vma_prepare *vp)
> > -{
> > - if (vp->file) {
> > - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > -
> > - if (vp->adj_next)
> > - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > - vp->adj_next->vm_end);
> > -
> > - i_mmap_lock_write(vp->mapping);
> > - if (vp->insert && vp->insert->vm_file) {
> > - /*
> > - * Put into interval tree now, so instantiated pages
> > - * are visible to arm/parisc __flush_dcache_page
> > - * throughout; but we cannot insert into address
> > - * space until vma start or end is updated.
> > - */
> > - __vma_link_file(vp->insert,
> > - vp->insert->vm_file->f_mapping);
> > - }
> > - }
> > -
> > - if (vp->anon_vma) {
> > - anon_vma_lock_write(vp->anon_vma);
> > - anon_vma_interval_tree_pre_update_vma(vp->vma);
> > - if (vp->adj_next)
> > - anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > - }
> > -
> > - if (vp->file) {
> > - flush_dcache_mmap_lock(vp->mapping);
> > - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > - if (vp->adj_next)
> > - vma_interval_tree_remove(vp->adj_next,
> > - &vp->mapping->i_mmap);
> > - }
> > -
> > -}
> > -
> > /*
> > * dup_anon_vma() - Helper function to duplicate anon_vma
> > * @dst: The destination VMA
> > @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm)
> > }
> > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> >
> > +/*
> > + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> > + *
> > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> > + * (exclusive), which we try to merge with any adjacent VMAs if possible.
> > + *
> > + * We are about to add a VMA to the address space starting at @vmg->start and
> > + * ending at @vmg->end. There are three different possible scenarios:
> > + *
> > + * 1. There is a VMA with identical properties immediately adjacent to the
> > + * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> > + * EXPAND that VMA:
> > + *
> > + * Proposed: |-----| or |-----|
> > + * Existing: |----| |----|
> > + *
> > + * 2. There are VMAs with identical properties immediately adjacent to the
> > + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> > + * EXPAND the former and REMOVE the latter:
> > + *
> > + * Proposed: |-----|
> > + * Existing: |----| |----|
> > + *
> > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> > + * VMAs do not have identical attributes - NO MERGE POSSIBLE.
>
> We still have diagrams, that's too bad.
But they're cute ones! Upgrade, right?
>
> > + *
> > + * In instances where we can merge, this function returns the expanded VMA which
> > + * will have its range adjusted accordingly and the underlying maple tree also
> > + * adjusted.
> > + *
> > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> > + * to the VMA we expanded.
> > + *
> > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> > + * expanded range.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
> > + */
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > +{
> > + bool is_special = vmg->flags & VM_SPECIAL;
> > + struct vm_area_struct *prev = vmg->prev;
> > + struct vm_area_struct *next = vmg->next;
> > + unsigned long start = vmg->start;
> > + unsigned long end = vmg->end;
> > + pgoff_t pgoff = vmg->pgoff;
> > + pgoff_t pglen = PHYS_PFN(end - start);
> > +
> > + VM_WARN_ON(vmg->vma);
> > +
> > + if (!prev && !next) {
> > + /*
> > + * Since the caller must have determined that the requested
> > + * range is empty, vmg->vmi will be left pointing at the VMA
> > + * immediately prior.
> > + */
> > + next = vmg->next = vma_next(vmg->vmi);
> > + prev = vmg->prev = vma_prev(vmg->vmi);
> > +
> > + /* Avoid maple tree re-walk. */
> > + if (is_special && prev)
> > + vma_iter_next_range(vmg->vmi);
> > + }
> > +
> > + /* If special mapping or no adjacent VMAs, nothing to merge. */
> > + if (is_special || (!prev && !next))
> > + return NULL;
> > +
> > + /* If we can merge with the following VMA, adjust vmg accordingly. */
> > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> > + /*
> > + * We can adjust this here as can_vma_merge_after() doesn't
> > + * touch vmg->end.
> > + */
> > + vmg->end = next->vm_end;
> > + vmg->vma = next;
> > + vmg->pgoff = next->vm_pgoff - pglen;
> > +
> > + vmg->anon_vma = next->anon_vma;
> > + }
> > +
> > + /* If we can merge with the previous VMA, adjust vmg accordingly. */
> > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> > + vmg->start = prev->vm_start;
> > + vmg->vma = prev;
> > + vmg->pgoff = prev->vm_pgoff;
> > + } else if (prev) {
> > + vma_iter_next_range(vmg->vmi);
> > + }
> > +
> > + /*
> > + * Now try to expand adjacent VMA(s). This takes care of removing the
> > + * following VMA if we have VMAs on both sides.
> > + */
> > + if (vmg->vma && !vma_expand(vmg)) {
> > + khugepaged_enter_vma(vmg->vma, vmg->flags);
> > + return vmg->vma;
> > + }
> > +
> > + /* If expansion failed, reset state. Allows us to retry merge later. */
> > + vmg->vma = NULL;
> > + vmg->anon_vma = NULL;
> > + vmg->start = start;
> > + vmg->end = end;
> > + vmg->pgoff = pgoff;
> > + if (vmg->vma == prev)
> > + vma_iter_set(vmg->vmi, start);
> > +
> > + return NULL;
> > +}
>
> Can we split this up a bit? I was thinking that, for the brk() case, we
> need to know if we can merge prev and if that merge fails. I was
> thinking of something where you create a vmg with whatever, then call
> can_merge_prev, and that'd do the block above and change the vmg as
> required. We could have a can_merge_next that does the same, then we
> need to prepare the change (dup anon vma, preallocate for maple tree,
> locking, whatever), then commit.
Yeah, that's not a bad idea - that could actually really help clarify
what's going on.
Then we could have a sort of state machine that indicates that we've
already adjusted the vmg parameters for the merge.
I'm thinking though of a vma_merge_new_vma() / vma_merge_modified_vma()
that invokes different code to figure out how to expand.
I will have a fiddle around and see what I can figure out that makes sense.
>
> There could still be the function above, but with smaller widgets to do
> what we need so we gain flexibility in what we decide to check - prev
> only in brk().
>
> I'm not sure if we'd need one for expanding vs existing or if we could
> check !vmg->vma to figure that out...
>
> This would also have the effect of self-documenting what is going on.
> For brk, it would look like this:
>
> if (vmg_expand_prev()) {
> if (vmg_prepare())
> goto no_mem;
> vmg_commit();
> }
>
> I think this would change your exposed interface, at least for brk() -
> or a wrapper for this, but small widgets may gain us some
> self-documented code?
>
> If you really don't like the exposure of the interface, the vmg could
> have a return so we can see if we ran out of memory?
I really don't like can_vma_merge_xxx() being exposed - it's very clearly
an internal interface.
As mentioned above, we can have some kind of way of passing back an error
code.
Obviously if testing indicates stack size/perf is a problem we can
begrudgingly accept the interface leak :'(. Will check that.
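For the record, I read your suggested brk() flow as roughly this shape -
vmg_expand_prev(), vmg_prepare() and vmg_commit() being your hypothetical
helper names, not anything that exists today:

        if (vma && vma->vm_end == addr) {
                struct vma_merge_struct vmg = {
                        /* ... exactly as in this patch ... */
                };

                /* Checks can_vma_merge_after() and adjusts vmg. */
                if (vmg_expand_prev(&vmg)) {
                        /* Locking + maple tree preallocation; can fail. */
                        if (vmg_prepare(&vmg))
                                goto unacct_fail;
                        /* commit_merge() plus khugepaged etc. */
                        vmg_commit(&vmg);
                        goto out;
                }
        }

That would also address the unacct_fail problem directly, since the only
-ENOMEM case is the vmg_prepare() failure.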
>
> > +
> > /*
> > * vma_expand - Expand an existing VMA
> > *
> > @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm)
> > * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
> > * vmg->next needs to be handled by the caller.
> > *
> > - * Returns: 0 on success
> > + * Returns: 0 on success.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
> > + * - The caller must have set @vmg->prev and @vmg->next.
> > */
> > int vma_expand(struct vma_merge_struct *vmg)
> > {
> > @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg)
> > return 0;
> > }
> >
> > -/*
> > - * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > - * or for inserting a VMA.
> > - *
> > - * @vp: The vma_prepare struct
> > - * @vmi: The vma iterator
> > - * @mm: The mm_struct
> > - */
> > -void vma_complete(struct vma_prepare *vp,
> > - struct vma_iterator *vmi, struct mm_struct *mm)
> > -{
> > - if (vp->file) {
> > - if (vp->adj_next)
> > - vma_interval_tree_insert(vp->adj_next,
> > - &vp->mapping->i_mmap);
> > - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > - flush_dcache_mmap_unlock(vp->mapping);
> > - }
> > -
> > - if (vp->remove && vp->file) {
> > - __remove_shared_vm_struct(vp->remove, vp->mapping);
> > - if (vp->remove2)
> > - __remove_shared_vm_struct(vp->remove2, vp->mapping);
> > - } else if (vp->insert) {
> > - /*
> > - * split_vma has split insert from vma, and needs
> > - * us to insert it before dropping the locks
> > - * (it may either follow vma or precede it).
> > - */
> > - vma_iter_store(vmi, vp->insert);
> > - mm->map_count++;
> > - }
> > -
> > - if (vp->anon_vma) {
> > - anon_vma_interval_tree_post_update_vma(vp->vma);
> > - if (vp->adj_next)
> > - anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > - anon_vma_unlock_write(vp->anon_vma);
> > - }
> > -
> > - if (vp->file) {
> > - i_mmap_unlock_write(vp->mapping);
> > - uprobe_mmap(vp->vma);
> > -
> > - if (vp->adj_next)
> > - uprobe_mmap(vp->adj_next);
> > - }
> > -
> > - if (vp->remove) {
> > -again:
> > - vma_mark_detached(vp->remove, true);
> > - if (vp->file) {
> > - uprobe_munmap(vp->remove, vp->remove->vm_start,
> > - vp->remove->vm_end);
> > - fput(vp->file);
> > - }
> > - if (vp->remove->anon_vma)
> > - anon_vma_merge(vp->vma, vp->remove);
> > - mm->map_count--;
> > - mpol_put(vma_policy(vp->remove));
> > - if (!vp->remove2)
> > - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > - vm_area_free(vp->remove);
> > -
> > - /*
> > - * In mprotect's case 6 (see comments on vma_merge),
> > - * we are removing both mid and next vmas
> > - */
> > - if (vp->remove2) {
> > - vp->remove = vp->remove2;
> > - vp->remove2 = NULL;
> > - goto again;
> > - }
> > - }
> > - if (vp->insert && vp->file)
> > - uprobe_mmap(vp->insert);
> > - validate_mm(mm);
> > -}
> > -
> > /*
> > * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
> > * @vmi: The vma iterator
> > @@ -1261,20 +1378,6 @@ struct vm_area_struct
> > return vma_modify(&vmg);
> > }
> >
> > -/*
> > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
> > - * must ensure that [start, end) does not overlap any existing VMA.
> > - */
> > -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > -{
> > - if (!vmg->prev) {
> > - vmg->prev = vma_prev(vmg->vmi);
> > - vma_iter_set(vmg->vmi, vmg->start);
> > - }
> > -
> > - return vma_merge(vmg);
> > -}
> > -
> > /*
> > * Expand vma by delta bytes, potentially merging with an immediately adjacent
> > * VMA with identical properties.
> > @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> > .anon_name = anon_vma_name(vma),
> > };
> >
> > - /* vma is specified as prev, so case 1 or 2 will apply. */
> > - return vma_merge(&vmg);
> > + return vma_merge_new_vma(&vmg);
> > }
> >
> > void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> > @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> > struct vm_area_struct *vma = *vmap;
> > unsigned long vma_start = vma->vm_start;
> > struct mm_struct *mm = vma->vm_mm;
> > - struct vm_area_struct *new_vma, *prev;
> > + struct vm_area_struct *new_vma;
> > bool faulted_in_anon_vma = true;
> > VMA_ITERATOR(vmi, mm, addr);
> > + struct vma_merge_struct vmg = {
> > + .vmi = &vmi,
> > + .start = addr,
> > + .end = addr + len,
> > + .flags = vma->vm_flags,
> > + .pgoff = pgoff,
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> >
> > /*
> > * If anonymous vma has not yet been faulted, update new pgoff
> > * to match new location, to increase its chance of merging.
> > */
> > if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> > - pgoff = addr >> PAGE_SHIFT;
> > + pgoff = vmg.pgoff = addr >> PAGE_SHIFT;
> > faulted_in_anon_vma = false;
> > }
> >
> > - new_vma = find_vma_prev(mm, addr, &prev);
> > + new_vma = find_vma_prev(mm, addr, &vmg.prev);
> > if (new_vma && new_vma->vm_start < addr + len)
> > return NULL; /* should never get here */
> >
> > - new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
> > + vmg.next = vma_next(&vmi);
> > + vma_prev(&vmi);
> > +
> > + new_vma = vma_merge_new_vma(&vmg);
> > +
> > if (new_vma) {
> > /*
> > * Source vma may have been merged into new_vma
> > diff --git a/mm/vma.h b/mm/vma.h
> > index 50459f9e4c7f..bbb173053f34 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
> > /* Required for expand_downwards(). */
> > void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
> >
> > -/* Required for do_brk_flags(). */
> > -void vma_prepare(struct vma_prepare *vp);
> > -
> > -/* Required for do_brk_flags(). */
> > -void init_vma_prep(struct vma_prepare *vp,
> > - struct vm_area_struct *vma);
> > -
> > -/* Required for do_brk_flags(). */
> > -void vma_complete(struct vma_prepare *vp,
> > - struct vma_iterator *vmi, struct mm_struct *mm);
> > -
> > int vma_expand(struct vma_merge_struct *vmg);
> > int vma_shrink(struct vma_merge_struct *vmg);
> >
> > @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > struct vm_area_struct *next, unsigned long start,
> > unsigned long end, unsigned long tree_end, bool mm_wr_locked);
> >
> > -/*
> > - * Can we merge the VMA described by vmg into the following VMA vmg->next?
> > - *
> > - * Required by mmap_region().
> > - */
> > -bool can_vma_merge_before(struct vma_merge_struct *vmg);
> > -
> > -/*
> > - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> > - *
> > - * Required by mmap_region() and do_brk_flags().
> > - */
> > -bool can_vma_merge_after(struct vma_merge_struct *vmg);
> > -
> > /* We are about to modify the VMA's flags. */
> > struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> > struct vm_area_struct *prev,
> > @@ -133,31 +108,7 @@ struct vm_area_struct
> > unsigned long new_flags,
> > struct vm_userfaultfd_ctx new_ctx);
> >
> > -struct vm_area_struct
> > -*vma_merge_new_vma(struct vma_merge_struct *vmg);
> > -
> > -/* Temporary convenience wrapper. */
> > -static inline struct vm_area_struct
> > -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > - struct vm_area_struct *vma, unsigned long start,
> > - unsigned long end, pgoff_t pgoff)
> > -{
> > - struct vma_merge_struct vmg = {
> > - .vmi = vmi,
> > - .prev = prev,
> > - .start = start,
> > - .end = end,
> > - .flags = vma->vm_flags,
> > - .file = vma->vm_file,
> > - .anon_vma = vma->anon_vma,
> > - .pgoff = pgoff,
> > - .policy = vma_policy(vma),
> > - .uffd_ctx = vma->vm_userfaultfd_ctx,
> > - .anon_name = anon_vma_name(vma),
> > - };
> > -
> > - return vma_merge_new_vma(&vmg);
> > -}
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
> >
> > /*
> > * Temporary wrapper around vma_merge() so we can have a common interface for
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index 40797a819d3d..a39a734282d0 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi)
> > mas_destroy(&vmi->mas);
> > }
> >
> > +static inline
> > +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
> > +{
> > + return mas_next_range(&vmi->mas, ULONG_MAX);
> > +}
> > +
> > static inline void vm_acct_memory(long pages)
> > {
> > }
> > --
> > 2.45.2
> >
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 08/10] mm: introduce commit_merge(), abstracting merge operation
2024-08-05 12:13 [PATCH 00/10] mm: remove vma_merge() Lorenzo Stoakes
` (6 preceding siblings ...)
2024-08-05 12:13 ` [PATCH 07/10] mm: avoid using vma_merge() for new VMAs Lorenzo Stoakes
@ 2024-08-05 12:13 ` Lorenzo Stoakes
2024-08-06 13:41 ` Petr Tesařík
2024-08-09 10:15 ` Vlastimil Babka
2024-08-05 12:13 ` [PATCH 09/10] mm: refactor vma_merge() into modify-only vma_merge_modified() Lorenzo Stoakes
2024-08-05 12:13 ` [PATCH 10/10] mm: rework vm_ops->close() handling on VMA merge Lorenzo Stoakes
9 siblings, 2 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-05 12:13 UTC (permalink / raw)
To: linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett, Vlastimil Babka
Pull this operation into its own function and have vma_expand() call
commit_merge() instead.
This lays the groundwork for a subsequent patch which replaces vma_merge()
with a simpler function which can share the same code.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/vma.c | 57 ++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 45 insertions(+), 12 deletions(-)
diff --git a/mm/vma.c b/mm/vma.c
index a404cf718f9e..b7e3c64d5d68 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -564,6 +564,49 @@ void validate_mm(struct mm_struct *mm)
}
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
+/* Actually perform the VMA merge operation. */
+static int commit_merge(struct vma_merge_struct *vmg,
+ struct vm_area_struct *adjust,
+ struct vm_area_struct *remove,
+ struct vm_area_struct *remove2,
+ long adj_start,
+ bool expanded)
+{
+ struct vma_prepare vp;
+
+ init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
+
+ if (expanded) {
+ vma_iter_config(vmg->vmi, vmg->start, vmg->end);
+ } else {
+ vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
+ adjust->vm_end);
+ }
+
+ if (vma_iter_prealloc(vmg->vmi, vmg->vma))
+ return -ENOMEM;
+
+ vma_prepare(&vp);
+ vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
+ vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
+
+ if (expanded)
+ vma_iter_store(vmg->vmi, vmg->vma);
+
+ if (adj_start) {
+ adjust->vm_start += adj_start;
+ adjust->vm_pgoff += PHYS_PFN(adj_start);
+ if (adj_start < 0) {
+ WARN_ON(expanded);
+ vma_iter_store(vmg->vmi, adjust);
+ }
+ }
+
+ vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
+
+ return 0;
+}
+
/*
* vma_merge_new_vma - Attempt to merge a new VMA into address space
*
@@ -700,7 +743,6 @@ int vma_expand(struct vma_merge_struct *vmg)
bool remove_next = false;
struct vm_area_struct *vma = vmg->vma;
struct vm_area_struct *next = vmg->next;
- struct vma_prepare vp;
vma_start_write(vma);
if (next && (vma != next) && (vmg->end == next->vm_end)) {
@@ -713,24 +755,15 @@ int vma_expand(struct vma_merge_struct *vmg)
return ret;
}
- init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
/* Not merging but overwriting any part of next is not handled. */
- VM_WARN_ON(next && !vp.remove &&
+ VM_WARN_ON(next && !remove_next &&
next != vma && vmg->end > next->vm_start);
/* Only handles expanding */
VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
- /* Note: vma iterator must be pointing to 'start' */
- vma_iter_config(vmg->vmi, vmg->start, vmg->end);
- if (vma_iter_prealloc(vmg->vmi, vma))
+ if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
goto nomem;
- vma_prepare(&vp);
- vma_adjust_trans_huge(vma, vmg->start, vmg->end, 0);
- vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
- vma_iter_store(vmg->vmi, vma);
-
- vma_complete(&vp, vmg->vmi, vma->vm_mm);
return 0;
nomem:
--
2.45.2
^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH 08/10] mm: introduce commit_merge(), abstracting merge operation
2024-08-05 12:13 ` [PATCH 08/10] mm: introduce commit_merge(), abstracting merge operation Lorenzo Stoakes
@ 2024-08-06 13:41 ` Petr Tesařík
2024-08-06 13:48 ` Lorenzo Stoakes
2024-08-09 10:15 ` Vlastimil Babka
1 sibling, 1 reply; 53+ messages in thread
From: Petr Tesařík @ 2024-08-06 13:41 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Mon, 5 Aug 2024 13:13:55 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> Pull this operation into its own function and have vma_expand() call
> commit_merge() instead.
>
> This lays the groundwork for a subsequent patch which replaces vma_merge()
> with a simpler function which can share the same code.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/vma.c | 57 ++++++++++++++++++++++++++++++++++++++++++++------------
> 1 file changed, 45 insertions(+), 12 deletions(-)
>
> diff --git a/mm/vma.c b/mm/vma.c
> index a404cf718f9e..b7e3c64d5d68 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -564,6 +564,49 @@ void validate_mm(struct mm_struct *mm)
> }
> #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
>
> +/* Actually perform the VMA merge operation. */
> +static int commit_merge(struct vma_merge_struct *vmg,
> + struct vm_area_struct *adjust,
> + struct vm_area_struct *remove,
> + struct vm_area_struct *remove2,
> + long adj_start,
> + bool expanded)
> +{
> + struct vma_prepare vp;
> +
> + init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
> +
> + if (expanded) {
> + vma_iter_config(vmg->vmi, vmg->start, vmg->end);
> + } else {
> + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> + adjust->vm_end);
> + }
It's hard to follow the logic if the "expanded" parameter is always
true. I have to look at PATCH 09/10 first to see how it is expected to
be used. Is there no other way?
Note that this is not needed for adjust and adj_start, because they are
merely moved here from vma_expand() and passed down as parameters to
other functions.
Petr T
> +
> + if (vma_iter_prealloc(vmg->vmi, vmg->vma))
> + return -ENOMEM;
> +
> + vma_prepare(&vp);
> + vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
> + vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
> +
> + if (expanded)
> + vma_iter_store(vmg->vmi, vmg->vma);
> +
> + if (adj_start) {
> + adjust->vm_start += adj_start;
> + adjust->vm_pgoff += PHYS_PFN(adj_start);
> + if (adj_start < 0) {
> + WARN_ON(expanded);
> + vma_iter_store(vmg->vmi, adjust);
> + }
> + }
> +
> + vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
> +
> + return 0;
> +}
> +
> /*
> * vma_merge_new_vma - Attempt to merge a new VMA into address space
> *
> @@ -700,7 +743,6 @@ int vma_expand(struct vma_merge_struct *vmg)
> bool remove_next = false;
> struct vm_area_struct *vma = vmg->vma;
> struct vm_area_struct *next = vmg->next;
> - struct vma_prepare vp;
>
> vma_start_write(vma);
> if (next && (vma != next) && (vmg->end == next->vm_end)) {
> @@ -713,24 +755,15 @@ int vma_expand(struct vma_merge_struct *vmg)
> return ret;
> }
>
> - init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
> /* Not merging but overwriting any part of next is not handled. */
> - VM_WARN_ON(next && !vp.remove &&
> + VM_WARN_ON(next && !remove_next &&
> next != vma && vmg->end > next->vm_start);
> /* Only handles expanding */
> VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
>
> - /* Note: vma iterator must be pointing to 'start' */
> - vma_iter_config(vmg->vmi, vmg->start, vmg->end);
> - if (vma_iter_prealloc(vmg->vmi, vma))
> + if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
> goto nomem;
>
> - vma_prepare(&vp);
> - vma_adjust_trans_huge(vma, vmg->start, vmg->end, 0);
> - vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
> - vma_iter_store(vmg->vmi, vma);
> -
> - vma_complete(&vp, vmg->vmi, vma->vm_mm);
> return 0;
>
> nomem:
^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH 08/10] mm: introduce commit_merge(), abstracting merge operation
2024-08-06 13:41 ` Petr Tesařík
@ 2024-08-06 13:48 ` Lorenzo Stoakes
2024-08-06 14:13 ` Petr Tesařík
0 siblings, 1 reply; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-06 13:48 UTC (permalink / raw)
To: Petr Tesařík
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, Aug 06, 2024 at 03:41:16PM GMT, Petr Tesařík wrote:
> On Mon, 5 Aug 2024 13:13:55 +0100
> Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>
> > Pull this operation into its own function and have vma_expand() call
> > commit_merge() instead.
> >
> > This lays the groundwork for a subsequent patch which replaces vma_merge()
> > with a simpler function which can share the same code.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > mm/vma.c | 57 ++++++++++++++++++++++++++++++++++++++++++++------------
> > 1 file changed, 45 insertions(+), 12 deletions(-)
> >
> > diff --git a/mm/vma.c b/mm/vma.c
> > index a404cf718f9e..b7e3c64d5d68 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -564,6 +564,49 @@ void validate_mm(struct mm_struct *mm)
> > }
> > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> >
> > +/* Actually perform the VMA merge operation. */
> > +static int commit_merge(struct vma_merge_struct *vmg,
> > + struct vm_area_struct *adjust,
> > + struct vm_area_struct *remove,
> > + struct vm_area_struct *remove2,
> > + long adj_start,
> > + bool expanded)
> > +{
> > + struct vma_prepare vp;
> > +
> > + init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
> > +
> > + if (expanded) {
> > + vma_iter_config(vmg->vmi, vmg->start, vmg->end);
> > + } else {
> > + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> > + adjust->vm_end);
> > + }
>
> It's hard to follow the logic if the "expanded" parameter is always
> true. I have to look at PATCH 09/10 first to see how it is expected to
> be used. Is there no other way?
>
> Note that this is not needed for adjust and adj_start, because they are
> merely moved here from vma_expand() and passed down as parameters to
> other functions.
See the next patch to understand how these are used; as the commit message
says, this lays the groundwork for the next patch, which actually uses both
of these.
I have tried hard to clarify how these are used; however, there is some
unavoidable and inherent complexity in this logic. If you don't believe me,
I suggest trying to follow the logic of the existing code :)
And if you want to _really_ have fun, I suggest you try to understand the
logic around v6.0 prior to Liam's interventions.
We might be able to try to improve the logic flow further, but it's one
step at a time with this.
>
> Petr T
>
> > +
> > + if (vma_iter_prealloc(vmg->vmi, vmg->vma))
> > + return -ENOMEM;
> > +
> > + vma_prepare(&vp);
> > + vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
> > + vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
> > +
> > + if (expanded)
> > + vma_iter_store(vmg->vmi, vmg->vma);
> > +
> > + if (adj_start) {
> > + adjust->vm_start += adj_start;
> > + adjust->vm_pgoff += PHYS_PFN(adj_start);
> > + if (adj_start < 0) {
> > + WARN_ON(expanded);
> > + vma_iter_store(vmg->vmi, adjust);
> > + }
> > + }
> > +
> > + vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
> > +
> > + return 0;
> > +}
> > +
> > /*
> > * vma_merge_new_vma - Attempt to merge a new VMA into address space
> > *
> > @@ -700,7 +743,6 @@ int vma_expand(struct vma_merge_struct *vmg)
> > bool remove_next = false;
> > struct vm_area_struct *vma = vmg->vma;
> > struct vm_area_struct *next = vmg->next;
> > - struct vma_prepare vp;
> >
> > vma_start_write(vma);
> > if (next && (vma != next) && (vmg->end == next->vm_end)) {
> > @@ -713,24 +755,15 @@ int vma_expand(struct vma_merge_struct *vmg)
> > return ret;
> > }
> >
> > - init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
> > /* Not merging but overwriting any part of next is not handled. */
> > - VM_WARN_ON(next && !vp.remove &&
> > + VM_WARN_ON(next && !remove_next &&
> > next != vma && vmg->end > next->vm_start);
> > /* Only handles expanding */
> > VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
> >
> > - /* Note: vma iterator must be pointing to 'start' */
> > - vma_iter_config(vmg->vmi, vmg->start, vmg->end);
> > - if (vma_iter_prealloc(vmg->vmi, vma))
> > + if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
> > goto nomem;
> >
> > - vma_prepare(&vp);
> > - vma_adjust_trans_huge(vma, vmg->start, vmg->end, 0);
> > - vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
> > - vma_iter_store(vmg->vmi, vma);
> > -
> > - vma_complete(&vp, vmg->vmi, vma->vm_mm);
> > return 0;
> >
> > nomem:
>
^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH 08/10] mm: introduce commit_merge(), abstracting merge operation
2024-08-06 13:48 ` Lorenzo Stoakes
@ 2024-08-06 14:13 ` Petr Tesařík
2024-08-06 14:30 ` Lorenzo Stoakes
0 siblings, 1 reply; 53+ messages in thread
From: Petr Tesařík @ 2024-08-06 14:13 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, 6 Aug 2024 14:48:33 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> On Tue, Aug 06, 2024 at 03:41:16PM GMT, Petr Tesařík wrote:
> > On Mon, 5 Aug 2024 13:13:55 +0100
> > Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> >
> > > Pull this operation into its own function and have vma_expand() call
> > > commit_merge() instead.
> > >
> > > This lays the groundwork for a subsequent patch which replaces vma_merge()
> > > with a simpler function which can share the same code.
> > >
> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > ---
> > > mm/vma.c | 57 ++++++++++++++++++++++++++++++++++++++++++++------------
> > > 1 file changed, 45 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/mm/vma.c b/mm/vma.c
> > > index a404cf718f9e..b7e3c64d5d68 100644
> > > --- a/mm/vma.c
> > > +++ b/mm/vma.c
> > > @@ -564,6 +564,49 @@ void validate_mm(struct mm_struct *mm)
> > > }
> > > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> > >
> > > +/* Actually perform the VMA merge operation. */
> > > +static int commit_merge(struct vma_merge_struct *vmg,
> > > + struct vm_area_struct *adjust,
> > > + struct vm_area_struct *remove,
> > > + struct vm_area_struct *remove2,
> > > + long adj_start,
> > > + bool expanded)
> > > +{
> > > + struct vma_prepare vp;
> > > +
> > > + init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
> > > +
> > > + if (expanded) {
> > > + vma_iter_config(vmg->vmi, vmg->start, vmg->end);
> > > + } else {
> > > + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> > > + adjust->vm_end);
> > > + }
> >
> > It's hard to follow the logic if the "expanded" parameter is always
> > true. I have to look at PATCH 09/10 first to see how it is expected to
> > be used. Is there no other way?
> >
> > Note that this is not needed for adjust and adj_start, because they are
> > merely moved here from vma_expand() and passed down as parameters to
> > other functions.
>
> See the next patch to understand how these are used; as the commit message
> says, this lays the groundwork for the next patch which actually uses both
> of these.
>
> I have tried hard to clarify how these are used; however, there is some
> unavoidable and inherent complexity in this logic. If you don't believe me,
> I suggest trying to follow the logic of the existing code :)
>
> And if you want to _really_ have fun, I suggest you try to understand the
> logic around v6.0 prior to Liam's interventions.
>
> We might be able to try to improve the logic flow further, but it's one
> step at a time with this.
What I mean is: Is there no way to arrange the patch series so that I
don't have to look at PATCH 09/10 before I can understand the code in
PATCH 08/10?
This PATCH 08/10 adds only one call to commit_merge() and that one
always sets expanded to true. Maybe you could introduce commit_merge()
here without the parameter and add it in PATCH 09/10?
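I.e. in this patch it would only need the path that is actually exercised
here - a sketch, reusing only code that already appears in this patch, with
the "expanded" branch hardwired:

        static int commit_merge(struct vma_merge_struct *vmg,
                                struct vm_area_struct *adjust,
                                struct vm_area_struct *remove,
                                struct vm_area_struct *remove2,
                                long adj_start)
        {
                struct vma_prepare vp;

                init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);

                /* This patch only ever has the 'expanded' case. */
                vma_iter_config(vmg->vmi, vmg->start, vmg->end);

                if (vma_iter_prealloc(vmg->vmi, vmg->vma))
                        return -ENOMEM;

                vma_prepare(&vp);
                vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
                vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
                vma_iter_store(vmg->vmi, vmg->vma);

                if (adj_start) {
                        adjust->vm_start += adj_start;
                        adjust->vm_pgoff += PHYS_PFN(adj_start);
                }

                vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);

                return 0;
        }

PATCH 09/10 could then widen the signature when the non-expanded path
actually appears.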
Petr T
^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH 08/10] mm: introduce commit_merge(), abstracting merge operation
2024-08-06 14:13 ` Petr Tesařík
@ 2024-08-06 14:30 ` Lorenzo Stoakes
2024-08-06 14:39 ` Petr Tesařík
0 siblings, 1 reply; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-06 14:30 UTC (permalink / raw)
To: Petr Tesařík
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, Aug 06, 2024 at 04:13:21PM GMT, Petr Tesařík wrote:
> On Tue, 6 Aug 2024 14:48:33 +0100
> Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>
> > On Tue, Aug 06, 2024 at 03:41:16PM GMT, Petr Tesařík wrote:
> > > On Mon, 5 Aug 2024 13:13:55 +0100
> > > Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> > >
> > > > Pull this operation into its own function and have vma_expand() call
> > > > commit_merge() instead.
> > > >
> > > > This lays the groundwork for a subsequent patch which replaces vma_merge()
> > > > with a simpler function which can share the same code.
> > > >
> > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > > ---
> > > > mm/vma.c | 57 ++++++++++++++++++++++++++++++++++++++++++++------------
> > > > 1 file changed, 45 insertions(+), 12 deletions(-)
> > > >
> > > > diff --git a/mm/vma.c b/mm/vma.c
> > > > index a404cf718f9e..b7e3c64d5d68 100644
> > > > --- a/mm/vma.c
> > > > +++ b/mm/vma.c
> > > > @@ -564,6 +564,49 @@ void validate_mm(struct mm_struct *mm)
> > > > }
> > > > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> > > >
> > > > +/* Actually perform the VMA merge operation. */
> > > > +static int commit_merge(struct vma_merge_struct *vmg,
> > > > + struct vm_area_struct *adjust,
> > > > + struct vm_area_struct *remove,
> > > > + struct vm_area_struct *remove2,
> > > > + long adj_start,
> > > > + bool expanded)
> > > > +{
> > > > + struct vma_prepare vp;
> > > > +
> > > > + init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
> > > > +
> > > > + if (expanded) {
> > > > + vma_iter_config(vmg->vmi, vmg->start, vmg->end);
> > > > + } else {
> > > > + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> > > > + adjust->vm_end);
> > > > + }
> > >
> > > It's hard to follow the logic if the "expanded" parameter is always
> > > true. I have to look at PATCH 09/10 first to see how it is expected to
> > > be used. Is there no other way?
> > >
> > > Note that this is not needed for adjust and adj_start, because they are
> > > merely moved here from vma_expand() and passed down as parameters to
> > > other functions.
> >
> > See the next patch to understand how these are used; as the commit message
> > says, this lays the groundwork for the next patch which actually uses both
> > of these.
> >
> > I have tried hard to clarify how these are used; however, there is some
> > unavoidable and inherent complexity in this logic. If you don't believe me,
> > I suggest trying to follow the logic of the existing code :)
> >
> > And if you want to _really_ have fun, I suggest you try to understand the
> > logic around v6.0 prior to Liam's interventions.
> >
> > We might be able to try to improve the logic flow further, but it's one
> > step at a time with this.
>
> What I mean is: Is there no way to arrange the patch series so that I
> don't have to look at PATCH 09/10 before I can understand the code in
> PATCH 08/10?
No.
>
> This PATCH 08/10 adds only one call to commit_merge() and that one
> always sets expanded to true. Maybe you could introduce commit_merge()
> here without the parameter and add it in PATCH 09/10?
No, I won't do that, you haven't made a case for it.
>
> Petr T
I appreciate you are doing a drive-by review on code you aren't familiar
with, but it's worth appreciating that there is some context here - this is
intentionally isolating _existing_ logic from vma_expand() and vma_merge()
in such a way that we have a _generic_ function we can use for this
operation.
I think it'd be _more_ confusing and (surprising given your rather pedantic
interpretation of churn elsewhere) churny to rewrite this again with a
bunch of added logic in the next commit.
I think this is highly subjective, and I'm not sure it's a great use of
either of our time to get too stuck in the weeds on this kind of thing.
Of course, if you or others can present a more compelling argument for
reworking this, I'm happy to hear it.
^ permalink raw reply	[flat|nested] 53+ messages in thread
* Re: [PATCH 08/10] mm: introduce commit_merge(), abstracting merge operation
2024-08-06 14:30 ` Lorenzo Stoakes
@ 2024-08-06 14:39 ` Petr Tesařík
0 siblings, 0 replies; 53+ messages in thread
From: Petr Tesařík @ 2024-08-06 14:39 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, 6 Aug 2024 15:30:49 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> On Tue, Aug 06, 2024 at 04:13:21PM GMT, Petr Tesařík wrote:
> > On Tue, 6 Aug 2024 14:48:33 +0100
> > Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> >
> > > On Tue, Aug 06, 2024 at 03:41:16PM GMT, Petr Tesařík wrote:
> > > > On Mon, 5 Aug 2024 13:13:55 +0100
> > > > Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> > > >
> > > > > Pull this operation into its own function and have vma_expand() call
> > > > > commit_merge() instead.
> > > > >
> > > > > This lays the groundwork for a subsequent patch which replaces vma_merge()
> > > > > with a simpler function which can share the same code.
> > > > >
> > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > > > ---
> > > > > mm/vma.c | 57 ++++++++++++++++++++++++++++++++++++++++++++------------
> > > > > 1 file changed, 45 insertions(+), 12 deletions(-)
> > > > >
> > > > > diff --git a/mm/vma.c b/mm/vma.c
> > > > > index a404cf718f9e..b7e3c64d5d68 100644
> > > > > --- a/mm/vma.c
> > > > > +++ b/mm/vma.c
> > > > > @@ -564,6 +564,49 @@ void validate_mm(struct mm_struct *mm)
> > > > > }
> > > > > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> > > > >
> > > > > +/* Actually perform the VMA merge operation. */
> > > > > +static int commit_merge(struct vma_merge_struct *vmg,
> > > > > + struct vm_area_struct *adjust,
> > > > > + struct vm_area_struct *remove,
> > > > > + struct vm_area_struct *remove2,
> > > > > + long adj_start,
> > > > > + bool expanded)
> > > > > +{
> > > > > + struct vma_prepare vp;
> > > > > +
> > > > > + init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
> > > > > +
> > > > > + if (expanded) {
> > > > > + vma_iter_config(vmg->vmi, vmg->start, vmg->end);
> > > > > + } else {
> > > > > + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> > > > > + adjust->vm_end);
> > > > > + }
> > > >
> > > > It's hard to follow the logic if the "expanded" parameter is always
> > > > true. I have to look at PATCH 09/10 first to see how it is expected to
> > > > be used. Is there no other way?
> > > >
> > > > Note that this is not needed for adjust and adj_start, because they are
> > > > merely moved here from vma_expand() and passed down as parameters to
> > > > other functions.
> > >
> > > See the next patch to understand how these are used; as the commit message
> > > says, this lays the groundwork for the next patch, which actually uses both
> > > of these.
> > >
> > > I have tried hard to clarify how these are used, however there is some
> > > unavoidable and inherent complexity in this logic. If you don't believe me,
> > > I suggest trying to follow the logic of the existing code :)
> > >
> > > And if you want to _really_ have fun, I suggest you try to understand the
> > > logic around v6.0 prior to Liam's interventions.
> > >
> > > We might be able to try to improve the logic flow further, but it's one
> > > step at a time with this.
> >
> > What I mean is: Is there no way to arrange the patch series so that I
> > don't have to look at PATCH 09/10 before I can understand the code in patch
> > 08/10?
>
> No.
>
> >
> > This PATCH 08/10 adds only one call to commit_merge() and that one
> > always sets expanded to true. Maybe you could introduce commit_merge()
> > here without the parameter and add it in PATCH 09/10?
>
> No, I won't do that, you haven't made a case for it.
>
> >
> > Petr T
>
> I appreciate you are doing a drive-by review on code you aren't familiar
> with, but it's worth appreciating that there is some context here - this is
> intentionally isolating _existing_ logic from vma_expand() and vma_merge()
> in such a way that we have a _generic_ function we can use for this
> operation.
The history you make today becomes the learning material for the next
generation of kernel hackers (who will also lack a lot of context).
> I think it'd be _more_ confusing and (surprising given your rather pedantic
> interpretation of churn elsewhere) churny to rewrite this again with a
> bunch of added logic in the next commit.
>
> I think this is highly subjective, and I'm not sure it's a great use of
> either of our time to get too stuck in the weeds on this kind of thing.
Yep. We can all agree this is the best way to convey the idea behind the
changes. Don't get me wrong; this whole series does a lot of good in
terms of code readability, even for a bystander like myself.
Petr T
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 08/10] mm: introduce commit_merge(), abstracting merge operation
2024-08-05 12:13 ` [PATCH 08/10] mm: introduce commit_merge(), abstracting merge operation Lorenzo Stoakes
2024-08-06 13:41 ` Petr Tesařík
@ 2024-08-09 10:15 ` Vlastimil Babka
2024-08-09 10:53 ` Lorenzo Stoakes
1 sibling, 1 reply; 53+ messages in thread
From: Vlastimil Babka @ 2024-08-09 10:15 UTC (permalink / raw)
To: Lorenzo Stoakes, linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett
On 8/5/24 14:13, Lorenzo Stoakes wrote:
> Pull this operation into its own function and have vma_expand() call
> commit_merge() instead.
>
> This lays the groundwork for a subsequent patch which replaces vma_merge()
> with a simpler function which can share the same code.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
In general,
Acked-by: Vlastimil Babka <vbabka@suse.cz>
If you consider the following suggestions, great:
> ---
> mm/vma.c | 57 ++++++++++++++++++++++++++++++++++++++++++++------------
> 1 file changed, 45 insertions(+), 12 deletions(-)
>
> diff --git a/mm/vma.c b/mm/vma.c
> index a404cf718f9e..b7e3c64d5d68 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -564,6 +564,49 @@ void validate_mm(struct mm_struct *mm)
> }
> #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
>
> +/* Actually perform the VMA merge operation. */
> +static int commit_merge(struct vma_merge_struct *vmg,
> + struct vm_area_struct *adjust,
> + struct vm_area_struct *remove,
> + struct vm_area_struct *remove2,
> + long adj_start,
> + bool expanded)
I've read the subthread with Petr. I understand it's hard to organize such
big changes in self-contained units. But maybe it would still be possible to
introduce this function now without the parameters, and as part of the
next patch add the two parameters and the code using them. Maybe it would
even make git detect the added code as a code move from where it is now, so it
would be more obvious.
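Concretely, such a reduced helper might look something like the sketch
below - this is not code from the series, just the hunk above with
expanded == true, adjust == NULL and adj_start == 0 folded in:

	/* Actually perform the VMA merge operation. */
	static int commit_merge(struct vma_merge_struct *vmg,
				struct vm_area_struct *remove)
	{
		struct vma_prepare vp;

		init_multi_vma_prep(&vp, vmg->vma, NULL, remove, NULL);

		/* Note: vma iterator must be pointing to 'start' */
		vma_iter_config(vmg->vmi, vmg->start, vmg->end);
		if (vma_iter_prealloc(vmg->vmi, vmg->vma))
			return -ENOMEM;

		vma_prepare(&vp);
		vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, 0);
		vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
		vma_iter_store(vmg->vmi, vmg->vma);

		vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);

		return 0;
	}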
> +{
> + struct vma_prepare vp;
> +
> + init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
> +
> + if (expanded) {
> + vma_iter_config(vmg->vmi, vmg->start, vmg->end);
This originally had a comment
/* Note: vma iterator must be pointing to 'start' */
and now it's gone.
> + } else {
> + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> + adjust->vm_end);
And this less obvious one has none either :(
> + }
> +
> + if (vma_iter_prealloc(vmg->vmi, vmg->vma))
> + return -ENOMEM;
> +
> + vma_prepare(&vp);
> + vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
> + vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
> +
> + if (expanded)
> + vma_iter_store(vmg->vmi, vmg->vma);
> +
> + if (adj_start) {
> + adjust->vm_start += adj_start;
> + adjust->vm_pgoff += PHYS_PFN(adj_start);
> + if (adj_start < 0) {
> + WARN_ON(expanded);
> + vma_iter_store(vmg->vmi, adjust);
> + }
> + }
> +
> + vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
> +
> + return 0;
> +}
> +
> /*
> * vma_merge_new_vma - Attempt to merge a new VMA into address space
> *
> @@ -700,7 +743,6 @@ int vma_expand(struct vma_merge_struct *vmg)
> bool remove_next = false;
> struct vm_area_struct *vma = vmg->vma;
> struct vm_area_struct *next = vmg->next;
> - struct vma_prepare vp;
>
> vma_start_write(vma);
> if (next && (vma != next) && (vmg->end == next->vm_end)) {
> @@ -713,24 +755,15 @@ int vma_expand(struct vma_merge_struct *vmg)
> return ret;
> }
>
> - init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
> /* Not merging but overwriting any part of next is not handled. */
> - VM_WARN_ON(next && !vp.remove &&
> + VM_WARN_ON(next && !remove_next &&
> next != vma && vmg->end > next->vm_start);
> /* Only handles expanding */
> VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
>
> - /* Note: vma iterator must be pointing to 'start' */
> - vma_iter_config(vmg->vmi, vmg->start, vmg->end);
> - if (vma_iter_prealloc(vmg->vmi, vma))
> + if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
> goto nomem;
>
> - vma_prepare(&vp);
> - vma_adjust_trans_huge(vma, vmg->start, vmg->end, 0);
> - vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
> - vma_iter_store(vmg->vmi, vma);
> -
> - vma_complete(&vp, vmg->vmi, vma->vm_mm);
> return 0;
>
> nomem:
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 08/10] mm: introduce commit_merge(), abstracting merge operation
2024-08-09 10:15 ` Vlastimil Babka
@ 2024-08-09 10:53 ` Lorenzo Stoakes
0 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-09 10:53 UTC (permalink / raw)
To: Vlastimil Babka
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett,
Petr Tesařík
On Fri, Aug 09, 2024 at 12:15:24PM GMT, Vlastimil Babka wrote:
> On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > Pull this operation into its own function and have vma_expand() call
> > commit_merge() instead.
> >
> > This lays the groundwork for a subsequent patch which replaces vma_merge()
> > with a simpler function which can share the same code.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> In general,
>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
>
> If you consider the following suggestions, great:
>
> > ---
> > mm/vma.c | 57 ++++++++++++++++++++++++++++++++++++++++++++------------
> > 1 file changed, 45 insertions(+), 12 deletions(-)
> >
> > diff --git a/mm/vma.c b/mm/vma.c
> > index a404cf718f9e..b7e3c64d5d68 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -564,6 +564,49 @@ void validate_mm(struct mm_struct *mm)
> > }
> > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> >
> > +/* Actually perform the VMA merge operation. */
> > +static int commit_merge(struct vma_merge_struct *vmg,
> > + struct vm_area_struct *adjust,
> > + struct vm_area_struct *remove,
> > + struct vm_area_struct *remove2,
> > + long adj_start,
> > + bool expanded)
>
> I've read the subthread with Petr. I understand it's hard to organize such
> big changes in self-contained units. But maybe it would still be possible to
> introduce this function now without the parameters, and as part of the
> next patch add the two parameters and the code using them. Maybe it would
> even make git detect the added code as a code move from where it is now, so it
> would be more obvious.
Since both you and Petr make the same point (sorry Petr, I should have
perhaps been a little less resistant to this), I will do this.
As discussed on IRC my position on this is that we're introducing a _really
fundamental_ and important bit of the logic here, intentionally broken out
as a separate commit, and this is why I preferred to introduce it 'fully
formed'.
This function is absolutely fundamental to eliminating the duplication in
do_brk_flags() + mmap_region() and to maintaining two separate new/modified
vma versions of vma_merge().
HOWEVER, I totally accept that this makes review much more of a pain in the
arse, and in practice reviewability is almost certainly the only thing that
matters in how I structure this.
So TL;DR: I'll do what you both ask and introduce new params only when we
use them.
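For illustration (hypothetical, pending the respin), the vma_expand() call
in patch 8 would then presumably reduce to something like:

	if (commit_merge(vmg, remove_next ? next : NULL))
		goto nomem;

with the adjust / adj_start / expanded parameters only growing back onto the
signature in patch 9, at the point vma_merge_modified() first needs them.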
>
> > +{
> > + struct vma_prepare vp;
> > +
> > + init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
> > +
> > + if (expanded) {
> > + vma_iter_config(vmg->vmi, vmg->start, vmg->end);
>
> This originally had a comment
>
> /* Note: vma iterator must be pointing to 'start' */
>
> and now it's gone.
Will check and re-add if it makes sense. I mean we're now setting the iterator
to start anyway so I don't see that this has value? Maybe I'm missing something
and Liam has thoughts...
>
> > + } else {
> > + vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> > + adjust->vm_end);
>
> And this less obvious one has none either :(
I will add a comment.
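Perhaps something roughly along these lines (illustrative wording only, not
the final comment):

	} else {
		/*
		 * Part of 'adjust' survives the merge - point the iterator
		 * at the range 'adjust' will span once its start has been
		 * shifted by adj_start.
		 */
		vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
				adjust->vm_end);
	}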
>
> > + }
> > +
> > + if (vma_iter_prealloc(vmg->vmi, vmg->vma))
> > + return -ENOMEM;
> > +
> > + vma_prepare(&vp);
> > + vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
> > + vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
> > +
> > + if (expanded)
> > + vma_iter_store(vmg->vmi, vmg->vma);
> > +
> > + if (adj_start) {
> > + adjust->vm_start += adj_start;
> > + adjust->vm_pgoff += PHYS_PFN(adj_start);
> > + if (adj_start < 0) {
> > + WARN_ON(expanded);
> > + vma_iter_store(vmg->vmi, adjust);
> > + }
> > + }
> > +
> > + vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
> > +
> > + return 0;
> > +}
> > +
> > /*
> > * vma_merge_new_vma - Attempt to merge a new VMA into address space
> > *
> > @@ -700,7 +743,6 @@ int vma_expand(struct vma_merge_struct *vmg)
> > bool remove_next = false;
> > struct vm_area_struct *vma = vmg->vma;
> > struct vm_area_struct *next = vmg->next;
> > - struct vma_prepare vp;
> >
> > vma_start_write(vma);
> > if (next && (vma != next) && (vmg->end == next->vm_end)) {
> > @@ -713,24 +755,15 @@ int vma_expand(struct vma_merge_struct *vmg)
> > return ret;
> > }
> >
> > - init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
> > /* Not merging but overwriting any part of next is not handled. */
> > - VM_WARN_ON(next && !vp.remove &&
> > + VM_WARN_ON(next && !remove_next &&
> > next != vma && vmg->end > next->vm_start);
> > /* Only handles expanding */
> > VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
> >
> > - /* Note: vma iterator must be pointing to 'start' */
> > - vma_iter_config(vmg->vmi, vmg->start, vmg->end);
> > - if (vma_iter_prealloc(vmg->vmi, vma))
> > + if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
> > goto nomem;
> >
> > - vma_prepare(&vp);
> > - vma_adjust_trans_huge(vma, vmg->start, vmg->end, 0);
> > - vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
> > - vma_iter_store(vmg->vmi, vma);
> > -
> > - vma_complete(&vp, vmg->vmi, vma->vm_mm);
> > return 0;
> >
> > nomem:
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 09/10] mm: refactor vma_merge() into modify-only vma_merge_modified()
2024-08-05 12:13 [PATCH 00/10] mm: remove vma_merge() Lorenzo Stoakes
` (7 preceding siblings ...)
2024-08-05 12:13 ` [PATCH 08/10] mm: introduce commit_merge(), abstracting merge operation Lorenzo Stoakes
@ 2024-08-05 12:13 ` Lorenzo Stoakes
2024-08-06 13:42 ` Petr Tesařík
2024-08-09 13:44 ` Vlastimil Babka
2024-08-05 12:13 ` [PATCH 10/10] mm: rework vm_ops->close() handling on VMA merge Lorenzo Stoakes
9 siblings, 2 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-05 12:13 UTC (permalink / raw)
To: linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett, Vlastimil Babka
The existing vma_merge() function is no longer required to handle what were
previously referred to as cases 1-3 (i.e. the merging of a new VMA), as
this is now handled by vma_merge_new_vma().
Additionally, we simplify the convoluted control flow of the original,
maintaining identical logic only expressed more clearly, doing away with
a complicated set of cases and instead logically examining each possible
outcome - merging of both the previous and subsequent VMA, merging of the
previous VMA alone, and merging of the subsequent VMA alone.
We now utilise the previously implemented commit_merge() function to share
logic with vma_expand(), deduplicating code and providing less surface area
for bugs and confusion.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/vma.c | 474 +++++++++++++++++++++++++++----------------------------
mm/vma.h | 6 -
2 files changed, 232 insertions(+), 248 deletions(-)
diff --git a/mm/vma.c b/mm/vma.c
index b7e3c64d5d68..c55ae035f5d6 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -569,8 +569,7 @@ static int commit_merge(struct vma_merge_struct *vmg,
struct vm_area_struct *adjust,
struct vm_area_struct *remove,
struct vm_area_struct *remove2,
- long adj_start,
- bool expanded)
+ long adj_start, bool expanded)
{
struct vma_prepare vp;
@@ -607,6 +606,236 @@ static int commit_merge(struct vma_merge_struct *vmg,
return 0;
}
+/*
+ * vma_merge_modified - Attempt to merge VMAs based on a VMA having its
+ * attributes modified.
+ *
+ * @vmg: Describes the modifications being made to a VMA and associated
+ * metadata.
+ *
+ * When the attributes of a range within a VMA change, then it might be possible
+ * for immediately adjacent VMAs to be merged into that VMA due to having
+ * identical properties.
+ *
+ * This function checks for the existence of any such mergeable VMAs and updates
+ * the maple tree describing the @vmg->vma->vm_mm address space to account for
+ * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
+ *
+ * As part of this operation, if a merge occurs, the @vmg object will have its
+ * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
+ * calls to this function should reset these fields.
+ *
+ * Returns: The merged VMA if merge succeeds, or NULL otherwise.
+ *
+ * ASSUMPTIONS:
+ * - The caller must assign the VMA to be modified to vmg->vma.
+ * - The caller must have set vmg->prev to the previous VMA, if there is one.
+ * - The caller does not need to set vmg->next, as we determine this.
+ * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
+ */
+static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
+{
+ struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *prev = vmg->prev;
+ struct vm_area_struct *next, *res;
+ struct vm_area_struct *anon_dup = NULL;
+ struct vm_area_struct *adjust = NULL;
+ unsigned long start = vmg->start;
+ unsigned long end = vmg->end;
+ bool left_side = vma && start == vma->vm_start;
+ bool right_side = vma && end == vma->vm_end;
+ bool merge_will_delete_vma, merge_will_delete_next;
+ bool merge_left, merge_right;
+ bool merge_both = false;
+ int err = 0;
+ long adj_start = 0;
+
+ VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
+ VM_WARN_ON(vmg->next); /* We set this. */
+ VM_WARN_ON(prev && start <= prev->vm_start);
+ VM_WARN_ON(start >= end);
+ /*
+ * If vma == prev, then we are offset into a VMA. Otherwise, if we are
+ * not, we must span a portion of the VMA.
+ */
+ VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
+ vmg->end > vma->vm_end));
+
+ /*
+ * If a special mapping or neither at the furthermost left or right side
+ * of the VMA, then we have no chance of merging and should abort.
+ *
+ * We later require that vma->vm_flags == vm_flags, so this tests
+ * vma->vm_flags & VM_SPECIAL, too.
+ */
+ if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
+ return NULL;
+
+ if (left_side && prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
+ merge_left = true;
+ vma_prev(vmg->vmi);
+ } else {
+ merge_left = false;
+ }
+
+ if (right_side) {
+ next = vmg->next = vma_lookup(vma->vm_mm, end);
+
+ /*
+ * We can merge right if there is a subsequent VMA, if it is
+ * immediately adjacent, and if it is compatible with vma.
+ */
+ merge_right = next && end == next->vm_start &&
+ can_vma_merge_before(vmg);
+
+ /*
+ * We can only merge both if the anonymous VMA of the previous
+ * VMA is compatible with the anonymous VMA of the subsequent
+ * VMA.
+ *
+ * Otherwise, we default to merging only the left.
+ */
+ if (merge_left && merge_right)
+ merge_right = merge_both =
+ is_mergeable_anon_vma(prev->anon_vma,
+ next->anon_vma, NULL);
+ } else {
+ merge_right = false;
+ next = NULL;
+ }
+
+ /* If we have nothing to merge, abort. */
+ if (!merge_left && !merge_right)
+ return NULL;
+
+ /* If we span the entire VMA, a merge implies it will be deleted. */
+ merge_will_delete_vma = left_side && right_side;
+ /* If we merge both VMAs, then next is also deleted. */
+ merge_will_delete_next = merge_both;
+
+ /* No matter what happens, we will be adjusting vma. */
+ vma_start_write(vma);
+
+ if (merge_left)
+ vma_start_write(prev);
+
+ if (merge_right)
+ vma_start_write(next);
+
+ if (merge_both) {
+ /*
+ * |<----->|
+ * |-------*********-------|
+ * prev vma next
+ * extend delete delete
+ */
+
+ vmg->vma = prev;
+ vmg->start = prev->vm_start;
+ vmg->end = next->vm_end;
+ vmg->pgoff = prev->vm_pgoff;
+
+ /*
+ * We already ensured anon_vma compatibility above, so now it's
+ * simply a case of, if prev has no anon_vma object, which of
+ * next or vma contains the anon_vma we must duplicate.
+ */
+ err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
+ } else if (merge_left) {
+ /*
+ * |<----->| OR
+ * |<--------->|
+ * |-------*************
+ * prev vma
+ * extend shrink/delete
+ */
+
+ unsigned long end = vmg->end;
+
+ vmg->vma = prev;
+ vmg->start = prev->vm_start;
+ vmg->pgoff = prev->vm_pgoff;
+
+ if (merge_will_delete_vma) {
+ /*
+ * can_vma_merge_after() assumed we would not be
+ * removing vma, so it skipped the check for
+ * vm_ops->close, but we are removing vma.
+ */
+ if (vma->vm_ops && vma->vm_ops->close)
+ err = -EINVAL;
+ } else {
+ adjust = vma;
+ adj_start = end - vma->vm_start;
+ }
+
+ if (!err)
+ err = dup_anon_vma(prev, vma, &anon_dup);
+ } else { /* merge_right */
+ /*
+ * |<----->| OR
+ * |<--------->|
+ * *************-------|
+ * vma next
+ * shrink/delete extend
+ */
+
+ pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
+
+ VM_WARN_ON(!merge_right);
+ /* If we are offset into a VMA, then prev must be vma. */
+ VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
+
+ if (merge_will_delete_vma) {
+ vmg->vma = next;
+ vmg->end = next->vm_end;
+ vmg->pgoff = next->vm_pgoff - pglen;
+ } else {
+ /*
+ * We shrink vma and expand next.
+ *
+ * IMPORTANT: This is the ONLY case where the final
+ * merged VMA is NOT vmg->vma, but rather vmg->next.
+ */
+
+ vmg->start = vma->vm_start;
+ vmg->end = start;
+ vmg->pgoff = vma->vm_pgoff;
+
+ adjust = next;
+ adj_start = -(vma->vm_end - start);
+ }
+
+ err = dup_anon_vma(next, vma, &anon_dup);
+ }
+
+ if (err)
+ goto abort;
+
+ if (commit_merge(vmg, adjust,
+ merge_will_delete_vma ? vma : NULL,
+ merge_will_delete_next ? next : NULL,
+ adj_start,
+ /*
+ * In nearly all cases, we expand vmg->vma. There is
+ * one exception - merge_right where we partially span
+ * the VMA. In this case we shrink the end of vmg->vma
+ * and adjust the start of vmg->next accordingly.
+ */
+ !merge_right || merge_will_delete_vma))
+ return NULL;
+
+ res = merge_left ? prev : next;
+ khugepaged_enter_vma(res, vmg->flags);
+
+ return res;
+
+abort:
+ vma_iter_set(vmg->vmi, start);
+ vma_iter_load(vmg->vmi);
+ return NULL;
+}
+
/*
* vma_merge_new_vma - Attempt to merge a new VMA into address space
*
@@ -1022,245 +1251,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
}
-/*
- * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
- * figure out whether that can be merged with its predecessor or its
- * successor. Or both (it neatly fills a hole).
- *
- * In most cases - when called for mmap, brk or mremap - [addr,end) is
- * certain not to be mapped by the time vma_merge is called; but when
- * called for mprotect, it is certain to be already mapped (either at
- * an offset within prev, or at the start of next), and the flags of
- * this area are about to be changed to vm_flags - and the no-change
- * case has already been eliminated.
- *
- * The following mprotect cases have to be considered, where **** is
- * the area passed down from mprotect_fixup, never extending beyond one
- * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
- * at the same address as **** and is of the same or larger span, and
- * NNNN the next vma after ****:
- *
- * **** **** ****
- * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC
- * cannot merge might become might become
- * PPNNNNNNNNNN PPPPPPPPPPCC
- * mmap, brk or case 4 below case 5 below
- * mremap move:
- * **** ****
- * PPPP NNNN PPPPCCCCNNNN
- * might become might become
- * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
- * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or
- * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8
- *
- * It is important for case 8 that the vma CCCC overlapping the
- * region **** is never going to extended over NNNN. Instead NNNN must
- * be extended in region **** and CCCC must be removed. This way in
- * all cases where vma_merge succeeds, the moment vma_merge drops the
- * rmap_locks, the properties of the merged vma will be already
- * correct for the whole merged range. Some of those properties like
- * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
- * be correct for the whole merged range immediately after the
- * rmap_locks are released. Otherwise if NNNN would be removed and
- * CCCC would be extended over the NNNN range, remove_migration_ptes
- * or other rmap walkers (if working on addresses beyond the "end"
- * parameter) may establish ptes with the wrong permissions of CCCC
- * instead of the right permissions of NNNN.
- *
- * In the code below:
- * PPPP is represented by *prev
- * CCCC is represented by *curr or not represented at all (NULL)
- * NNNN is represented by *next or not represented at all (NULL)
- * **** is not represented - it will be merged and the vma containing the
- * area is returned, or the function will return NULL
- */
-static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
-{
- struct mm_struct *mm = container_of(vmg->vmi->mas.tree, struct mm_struct, mm_mt);
- struct vm_area_struct *prev = vmg->prev;
- struct vm_area_struct *curr, *next, *res;
- struct vm_area_struct *vma, *adjust, *remove, *remove2;
- struct vm_area_struct *anon_dup = NULL;
- struct vma_prepare vp;
- pgoff_t vma_pgoff;
- int err = 0;
- bool merge_prev = false;
- bool merge_next = false;
- bool vma_expanded = false;
- unsigned long addr = vmg->start;
- unsigned long end = vmg->end;
- unsigned long vma_start = addr;
- unsigned long vma_end = end;
- pgoff_t pglen = PHYS_PFN(end - addr);
- long adj_start = 0;
-
- /*
- * We later require that vma->vm_flags == vm_flags,
- * so this tests vma->vm_flags & VM_SPECIAL, too.
- */
- if (vmg->flags & VM_SPECIAL)
- return NULL;
-
- /* Does the input range span an existing VMA? (cases 5 - 8) */
- curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);
-
- if (!curr || /* cases 1 - 4 */
- end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
- next = vmg->next = vma_lookup(mm, end);
- else
- next = vmg->next = NULL; /* case 5 */
-
- if (prev) {
- vma_start = prev->vm_start;
- vma_pgoff = prev->vm_pgoff;
-
- /* Can we merge the predecessor? */
- if (addr == prev->vm_end && can_vma_merge_after(vmg)) {
- merge_prev = true;
- vma_prev(vmg->vmi);
- }
- }
-
- /* Can we merge the successor? */
- if (next && can_vma_merge_before(vmg)) {
- merge_next = true;
- }
-
- /* Verify some invariant that must be enforced by the caller. */
- VM_WARN_ON(prev && addr <= prev->vm_start);
- VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
- VM_WARN_ON(addr >= end);
-
- if (!merge_prev && !merge_next)
- return NULL; /* Not mergeable. */
-
- if (merge_prev)
- vma_start_write(prev);
-
- res = vma = prev;
- remove = remove2 = adjust = NULL;
-
- /* Can we merge both the predecessor and the successor? */
- if (merge_prev && merge_next &&
- is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
- vma_start_write(next);
- remove = next; /* case 1 */
- vma_end = next->vm_end;
- err = dup_anon_vma(prev, next, &anon_dup);
- if (curr) { /* case 6 */
- vma_start_write(curr);
- remove = curr;
- remove2 = next;
- /*
- * Note that the dup_anon_vma below cannot overwrite err
- * since the first caller would do nothing unless next
- * has an anon_vma.
- */
- if (!next->anon_vma)
- err = dup_anon_vma(prev, curr, &anon_dup);
- }
- } else if (merge_prev) { /* case 2 */
- if (curr) {
- vma_start_write(curr);
- if (end == curr->vm_end) { /* case 7 */
- /*
- * can_vma_merge_after() assumed we would not be
- * removing prev vma, so it skipped the check
- * for vm_ops->close, but we are removing curr
- */
- if (curr->vm_ops && curr->vm_ops->close)
- err = -EINVAL;
- remove = curr;
- } else { /* case 5 */
- adjust = curr;
- adj_start = end - curr->vm_start;
- }
- if (!err)
- err = dup_anon_vma(prev, curr, &anon_dup);
- }
- } else { /* merge_next */
- vma_start_write(next);
- res = next;
- if (prev && addr < prev->vm_end) { /* case 4 */
- vma_start_write(prev);
- vma_end = addr;
- adjust = next;
- adj_start = -(prev->vm_end - addr);
- err = dup_anon_vma(next, prev, &anon_dup);
- } else {
- /*
- * Note that cases 3 and 8 are the ONLY ones where prev
- * is permitted to be (but is not necessarily) NULL.
- */
- vma = next; /* case 3 */
- vma_start = addr;
- vma_end = next->vm_end;
- vma_pgoff = next->vm_pgoff - pglen;
- if (curr) { /* case 8 */
- vma_pgoff = curr->vm_pgoff;
- vma_start_write(curr);
- remove = curr;
- err = dup_anon_vma(next, curr, &anon_dup);
- }
- }
- }
-
- /* Error in anon_vma clone. */
- if (err)
- goto anon_vma_fail;
-
- if (vma_start < vma->vm_start || vma_end > vma->vm_end)
- vma_expanded = true;
-
- if (vma_expanded) {
- vma_iter_config(vmg->vmi, vma_start, vma_end);
- } else {
- vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
- adjust->vm_end);
- }
-
- if (vma_iter_prealloc(vmg->vmi, vma))
- goto prealloc_fail;
-
- init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
- VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
- vp.anon_vma != adjust->anon_vma);
-
- vma_prepare(&vp);
- vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
- vma_set_range(vma, vma_start, vma_end, vma_pgoff);
-
- if (vma_expanded)
- vma_iter_store(vmg->vmi, vma);
-
- if (adj_start) {
- adjust->vm_start += adj_start;
- adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
- if (adj_start < 0) {
- WARN_ON(vma_expanded);
- vma_iter_store(vmg->vmi, next);
- }
- }
-
- vma_complete(&vp, vmg->vmi, mm);
- khugepaged_enter_vma(res, vmg->flags);
- return res;
-
-prealloc_fail:
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
-anon_vma_fail:
- vma_iter_set(vmg->vmi, addr);
- vma_iter_load(vmg->vmi);
- return NULL;
-}
-
-struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
-{
- return vma_merge(vmg);
-}
-
/*
* We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
* context and anonymous VMA name within the range [start, end).
@@ -1280,7 +1270,7 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
struct vm_area_struct *merged;
/* First, try to merge. */
- merged = vma_merge(vmg);
+ merged = vma_merge_modified(vmg);
if (merged)
return merged;
diff --git a/mm/vma.h b/mm/vma.h
index bbb173053f34..bf29ff569a3d 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -110,12 +110,6 @@ struct vm_area_struct
struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
-/*
- * Temporary wrapper around vma_merge() so we can have a common interface for
- * tests.
- */
-struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg);
-
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
struct vm_area_struct *vma,
unsigned long delta);
--
2.45.2
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 09/10] mm: refactor vma_merge() into modify-only vma_merge_modified()
2024-08-05 12:13 ` [PATCH 09/10] mm: refactor vma_merge() into modify-only vma_merge_modified() Lorenzo Stoakes
@ 2024-08-06 13:42 ` Petr Tesařík
2024-08-06 13:52 ` Lorenzo Stoakes
2024-08-09 13:44 ` Vlastimil Babka
1 sibling, 1 reply; 53+ messages in thread
From: Petr Tesařík @ 2024-08-06 13:42 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Mon, 5 Aug 2024 13:13:56 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> The existing vma_merge() function is no longer required to handle what were
> previously referred to as cases 1-3 (i.e. the merging of a new VMA), as
> this is now handled by vma_merge_new_vma().
>
> Additionally, we simplify the convoluted control flow of the original,
> maintaining identical logic only expressed more clearly, doing away with
> a complicated set of cases and instead logically examining each possible
> outcome - merging of both the previous and subsequent VMA, merging of the
> previous VMA alone, and merging of the subsequent VMA alone.
>
> We now utilise the previously implemented commit_merge() function to share
> logic with vma_expand(), deduplicating code and providing less surface area
> for bugs and confusion.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/vma.c | 474 +++++++++++++++++++++++++++----------------------------
> mm/vma.h | 6 -
> 2 files changed, 232 insertions(+), 248 deletions(-)
>
> diff --git a/mm/vma.c b/mm/vma.c
> index b7e3c64d5d68..c55ae035f5d6 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -569,8 +569,7 @@ static int commit_merge(struct vma_merge_struct *vmg,
> struct vm_area_struct *adjust,
> struct vm_area_struct *remove,
> struct vm_area_struct *remove2,
> - long adj_start,
> - bool expanded)
> + long adj_start, bool expanded)
Um. Oops? ;-)
Otherwise LGTM.
Petr T
> {
> struct vma_prepare vp;
>
> @@ -607,6 +606,236 @@ static int commit_merge(struct vma_merge_struct *vmg,
> return 0;
> }
>
> +/*
> + * vma_merge_modified - Attempt to merge VMAs based on a VMA having its
> + * attributes modified.
> + *
> + * @vmg: Describes the modifications being made to a VMA and associated
> + * metadata.
> + *
> + * When the attributes of a range within a VMA change, then it might be possible
> + * for immediately adjacent VMAs to be merged into that VMA due to having
> + * identical properties.
> + *
> + * This function checks for the existence of any such mergeable VMAs and updates
> + * the maple tree describing the @vmg->vma->vm_mm address space to account for
> + * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
> + *
> + * As part of this operation, if a merge occurs, the @vmg object will have its
> + * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
> + * calls to this function should reset these fields.
> + *
> + * Returns: The merged VMA if merge succeeds, or NULL otherwise.
> + *
> + * ASSUMPTIONS:
> > + * - The caller must assign the VMA to be modified to vmg->vma.
> + * - The caller must have set vmg->prev to the previous VMA, if there is one.
> + * - The caller does not need to set vmg->next, as we determine this.
> + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> + */
> +static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
> +{
> + struct vm_area_struct *vma = vmg->vma;
> + struct vm_area_struct *prev = vmg->prev;
> + struct vm_area_struct *next, *res;
> + struct vm_area_struct *anon_dup = NULL;
> + struct vm_area_struct *adjust = NULL;
> + unsigned long start = vmg->start;
> + unsigned long end = vmg->end;
> + bool left_side = vma && start == vma->vm_start;
> + bool right_side = vma && end == vma->vm_end;
> + bool merge_will_delete_vma, merge_will_delete_next;
> + bool merge_left, merge_right;
> + bool merge_both = false;
> + int err = 0;
> + long adj_start = 0;
> +
> + VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
> + VM_WARN_ON(vmg->next); /* We set this. */
> + VM_WARN_ON(prev && start <= prev->vm_start);
> + VM_WARN_ON(start >= end);
> + /*
> + * If vma == prev, then we are offset into a VMA. Otherwise, if we are
> + * not, we must span a portion of the VMA.
> + */
> + VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
> + vmg->end > vma->vm_end));
> +
> + /*
> + * If a special mapping or neither at the furthermost left or right side
> + * of the VMA, then we have no chance of merging and should abort.
> + *
> + * We later require that vma->vm_flags == vm_flags, so this tests
> + * vma->vm_flags & VM_SPECIAL, too.
> + */
> + if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
> + return NULL;
> +
> + if (left_side && prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> + merge_left = true;
> + vma_prev(vmg->vmi);
> + } else {
> + merge_left = false;
> + }
> +
> + if (right_side) {
> + next = vmg->next = vma_lookup(vma->vm_mm, end);
> +
> + /*
> + * We can merge right if there is a subsequent VMA, if it is
> + * immediately adjacent, and if it is compatible with vma.
> + */
> + merge_right = next && end == next->vm_start &&
> + can_vma_merge_before(vmg);
> +
> + /*
> + * We can only merge both if the anonymous VMA of the previous
> + * VMA is compatible with the anonymous VMA of the subsequent
> + * VMA.
> + *
> + * Otherwise, we default to merging only the left.
> + */
> + if (merge_left && merge_right)
> + merge_right = merge_both =
> + is_mergeable_anon_vma(prev->anon_vma,
> + next->anon_vma, NULL);
> + } else {
> + merge_right = false;
> + next = NULL;
> + }
> +
> + /* If we have nothing to merge, abort. */
> + if (!merge_left && !merge_right)
> + return NULL;
> +
> + /* If we span the entire VMA, a merge implies it will be deleted. */
> + merge_will_delete_vma = left_side && right_side;
> + /* If we merge both VMAs, then next is also deleted. */
> + merge_will_delete_next = merge_both;
> +
> + /* No matter what happens, we will be adjusting vma. */
> + vma_start_write(vma);
> +
> + if (merge_left)
> + vma_start_write(prev);
> +
> + if (merge_right)
> + vma_start_write(next);
> +
> + if (merge_both) {
> + /*
> + * |<----->|
> + * |-------*********-------|
> + * prev vma next
> + * extend delete delete
> + */
> +
> + vmg->vma = prev;
> + vmg->start = prev->vm_start;
> + vmg->end = next->vm_end;
> + vmg->pgoff = prev->vm_pgoff;
> +
> + /*
> + * We already ensured anon_vma compatibility above, so now it's
> + * simply a case of, if prev has no anon_vma object, which of
> + * next or vma contains the anon_vma we must duplicate.
> + */
> + err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
> + } else if (merge_left) {
> + /*
> + * |<----->| OR
> + * |<--------->|
> + * |-------*************
> + * prev vma
> + * extend shrink/delete
> + */
> +
> + unsigned long end = vmg->end;
> +
> + vmg->vma = prev;
> + vmg->start = prev->vm_start;
> + vmg->pgoff = prev->vm_pgoff;
> +
> + if (merge_will_delete_vma) {
> + /*
> + * can_vma_merge_after() assumed we would not be
> + * removing vma, so it skipped the check for
> + * vm_ops->close, but we are removing vma.
> + */
> + if (vma->vm_ops && vma->vm_ops->close)
> + err = -EINVAL;
> + } else {
> + adjust = vma;
> + adj_start = end - vma->vm_start;
> + }
> +
> + if (!err)
> + err = dup_anon_vma(prev, vma, &anon_dup);
> + } else { /* merge_right */
> + /*
> + * |<----->| OR
> + * |<--------->|
> + * *************-------|
> + * vma next
> + * shrink/delete extend
> + */
> +
> + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> +
> + VM_WARN_ON(!merge_right);
> + /* If we are offset into a VMA, then prev must be vma. */
> + VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
> +
> + if (merge_will_delete_vma) {
> + vmg->vma = next;
> + vmg->end = next->vm_end;
> + vmg->pgoff = next->vm_pgoff - pglen;
> + } else {
> + /*
> + * We shrink vma and expand next.
> + *
> + * IMPORTANT: This is the ONLY case where the final
> + * merged VMA is NOT vmg->vma, but rather vmg->next.
> + */
> +
> + vmg->start = vma->vm_start;
> + vmg->end = start;
> + vmg->pgoff = vma->vm_pgoff;
> +
> + adjust = next;
> + adj_start = -(vma->vm_end - start);
> + }
> +
> + err = dup_anon_vma(next, vma, &anon_dup);
> + }
> +
> + if (err)
> + goto abort;
> +
> + if (commit_merge(vmg, adjust,
> + merge_will_delete_vma ? vma : NULL,
> + merge_will_delete_next ? next : NULL,
> + adj_start,
> + /*
> + * In nearly all cases, we expand vmg->vma. There is
> + * one exception - merge_right where we partially span
> + * the VMA. In this case we shrink the end of vmg->vma
> + * and adjust the start of vmg->next accordingly.
> + */
> + !merge_right || merge_will_delete_vma))
> + return NULL;
> +
> + res = merge_left ? prev : next;
> + khugepaged_enter_vma(res, vmg->flags);
> +
> + return res;
> +
> +abort:
> + vma_iter_set(vmg->vmi, start);
> + vma_iter_load(vmg->vmi);
> + return NULL;
> +}
> +
> /*
> * vma_merge_new_vma - Attempt to merge a new VMA into address space
> *
> @@ -1022,245 +1251,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
> }
>
> -/*
> - * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
> - * figure out whether that can be merged with its predecessor or its
> - * successor. Or both (it neatly fills a hole).
> - *
> - * In most cases - when called for mmap, brk or mremap - [addr,end) is
> - * certain not to be mapped by the time vma_merge is called; but when
> - * called for mprotect, it is certain to be already mapped (either at
> - * an offset within prev, or at the start of next), and the flags of
> - * this area are about to be changed to vm_flags - and the no-change
> - * case has already been eliminated.
> - *
> - * The following mprotect cases have to be considered, where **** is
> - * the area passed down from mprotect_fixup, never extending beyond one
> - * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
> - * at the same address as **** and is of the same or larger span, and
> - * NNNN the next vma after ****:
> - *
> - * **** **** ****
> - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC
> - * cannot merge might become might become
> - * PPNNNNNNNNNN PPPPPPPPPPCC
> - * mmap, brk or case 4 below case 5 below
> - * mremap move:
> - * **** ****
> - * PPPP NNNN PPPPCCCCNNNN
> - * might become might become
> - * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
> - * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or
> - * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8
> - *
> - * It is important for case 8 that the vma CCCC overlapping the
> - * region **** is never going to extended over NNNN. Instead NNNN must
> - * be extended in region **** and CCCC must be removed. This way in
> - * all cases where vma_merge succeeds, the moment vma_merge drops the
> - * rmap_locks, the properties of the merged vma will be already
> - * correct for the whole merged range. Some of those properties like
> - * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
> - * be correct for the whole merged range immediately after the
> - * rmap_locks are released. Otherwise if NNNN would be removed and
> - * CCCC would be extended over the NNNN range, remove_migration_ptes
> - * or other rmap walkers (if working on addresses beyond the "end"
> - * parameter) may establish ptes with the wrong permissions of CCCC
> - * instead of the right permissions of NNNN.
> - *
> - * In the code below:
> - * PPPP is represented by *prev
> - * CCCC is represented by *curr or not represented at all (NULL)
> - * NNNN is represented by *next or not represented at all (NULL)
> - * **** is not represented - it will be merged and the vma containing the
> - * area is returned, or the function will return NULL
> - */
> -static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
> -{
> - struct mm_struct *mm = container_of(vmg->vmi->mas.tree, struct mm_struct, mm_mt);
> - struct vm_area_struct *prev = vmg->prev;
> - struct vm_area_struct *curr, *next, *res;
> - struct vm_area_struct *vma, *adjust, *remove, *remove2;
> - struct vm_area_struct *anon_dup = NULL;
> - struct vma_prepare vp;
> - pgoff_t vma_pgoff;
> - int err = 0;
> - bool merge_prev = false;
> - bool merge_next = false;
> - bool vma_expanded = false;
> - unsigned long addr = vmg->start;
> - unsigned long end = vmg->end;
> - unsigned long vma_start = addr;
> - unsigned long vma_end = end;
> - pgoff_t pglen = PHYS_PFN(end - addr);
> - long adj_start = 0;
> -
> - /*
> - * We later require that vma->vm_flags == vm_flags,
> - * so this tests vma->vm_flags & VM_SPECIAL, too.
> - */
> - if (vmg->flags & VM_SPECIAL)
> - return NULL;
> -
> - /* Does the input range span an existing VMA? (cases 5 - 8) */
> - curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);
> -
> - if (!curr || /* cases 1 - 4 */
> - end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
> - next = vmg->next = vma_lookup(mm, end);
> - else
> - next = vmg->next = NULL; /* case 5 */
> -
> - if (prev) {
> - vma_start = prev->vm_start;
> - vma_pgoff = prev->vm_pgoff;
> -
> - /* Can we merge the predecessor? */
> - if (addr == prev->vm_end && can_vma_merge_after(vmg)) {
> - merge_prev = true;
> - vma_prev(vmg->vmi);
> - }
> - }
> -
> - /* Can we merge the successor? */
> - if (next && can_vma_merge_before(vmg)) {
> - merge_next = true;
> - }
> -
> - /* Verify some invariant that must be enforced by the caller. */
> - VM_WARN_ON(prev && addr <= prev->vm_start);
> - VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
> - VM_WARN_ON(addr >= end);
> -
> - if (!merge_prev && !merge_next)
> - return NULL; /* Not mergeable. */
> -
> - if (merge_prev)
> - vma_start_write(prev);
> -
> - res = vma = prev;
> - remove = remove2 = adjust = NULL;
> -
> - /* Can we merge both the predecessor and the successor? */
> - if (merge_prev && merge_next &&
> - is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
> - vma_start_write(next);
> - remove = next; /* case 1 */
> - vma_end = next->vm_end;
> - err = dup_anon_vma(prev, next, &anon_dup);
> - if (curr) { /* case 6 */
> - vma_start_write(curr);
> - remove = curr;
> - remove2 = next;
> - /*
> - * Note that the dup_anon_vma below cannot overwrite err
> - * since the first caller would do nothing unless next
> - * has an anon_vma.
> - */
> - if (!next->anon_vma)
> - err = dup_anon_vma(prev, curr, &anon_dup);
> - }
> - } else if (merge_prev) { /* case 2 */
> - if (curr) {
> - vma_start_write(curr);
> - if (end == curr->vm_end) { /* case 7 */
> - /*
> - * can_vma_merge_after() assumed we would not be
> - * removing prev vma, so it skipped the check
> - * for vm_ops->close, but we are removing curr
> - */
> - if (curr->vm_ops && curr->vm_ops->close)
> - err = -EINVAL;
> - remove = curr;
> - } else { /* case 5 */
> - adjust = curr;
> - adj_start = end - curr->vm_start;
> - }
> - if (!err)
> - err = dup_anon_vma(prev, curr, &anon_dup);
> - }
> - } else { /* merge_next */
> - vma_start_write(next);
> - res = next;
> - if (prev && addr < prev->vm_end) { /* case 4 */
> - vma_start_write(prev);
> - vma_end = addr;
> - adjust = next;
> - adj_start = -(prev->vm_end - addr);
> - err = dup_anon_vma(next, prev, &anon_dup);
> - } else {
> - /*
> - * Note that cases 3 and 8 are the ONLY ones where prev
> - * is permitted to be (but is not necessarily) NULL.
> - */
> - vma = next; /* case 3 */
> - vma_start = addr;
> - vma_end = next->vm_end;
> - vma_pgoff = next->vm_pgoff - pglen;
> - if (curr) { /* case 8 */
> - vma_pgoff = curr->vm_pgoff;
> - vma_start_write(curr);
> - remove = curr;
> - err = dup_anon_vma(next, curr, &anon_dup);
> - }
> - }
> - }
> -
> - /* Error in anon_vma clone. */
> - if (err)
> - goto anon_vma_fail;
> -
> - if (vma_start < vma->vm_start || vma_end > vma->vm_end)
> - vma_expanded = true;
> -
> - if (vma_expanded) {
> - vma_iter_config(vmg->vmi, vma_start, vma_end);
> - } else {
> - vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> - adjust->vm_end);
> - }
> -
> - if (vma_iter_prealloc(vmg->vmi, vma))
> - goto prealloc_fail;
> -
> - init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
> - VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
> - vp.anon_vma != adjust->anon_vma);
> -
> - vma_prepare(&vp);
> - vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
> - vma_set_range(vma, vma_start, vma_end, vma_pgoff);
> -
> - if (vma_expanded)
> - vma_iter_store(vmg->vmi, vma);
> -
> - if (adj_start) {
> - adjust->vm_start += adj_start;
> - adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
> - if (adj_start < 0) {
> - WARN_ON(vma_expanded);
> - vma_iter_store(vmg->vmi, next);
> - }
> - }
> -
> - vma_complete(&vp, vmg->vmi, mm);
> - khugepaged_enter_vma(res, vmg->flags);
> - return res;
> -
> -prealloc_fail:
> - if (anon_dup)
> - unlink_anon_vmas(anon_dup);
> -
> -anon_vma_fail:
> - vma_iter_set(vmg->vmi, addr);
> - vma_iter_load(vmg->vmi);
> - return NULL;
> -}
> -
> -struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
> -{
> - return vma_merge(vmg);
> -}
> -
> /*
> * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
> * context and anonymous VMA name within the range [start, end).
> @@ -1280,7 +1270,7 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
> struct vm_area_struct *merged;
>
> /* First, try to merge. */
> - merged = vma_merge(vmg);
> + merged = vma_merge_modified(vmg);
> if (merged)
> return merged;
>
> diff --git a/mm/vma.h b/mm/vma.h
> index bbb173053f34..bf29ff569a3d 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -110,12 +110,6 @@ struct vm_area_struct
>
> struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
>
> -/*
> - * Temporary wrapper around vma_merge() so we can have a common interface for
> - * tests.
> - */
> -struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg);
> -
> struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> struct vm_area_struct *vma,
> unsigned long delta);
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 09/10] mm: refactor vma_merge() into modify-only vma_merge_modified()
2024-08-06 13:42 ` Petr Tesařík
@ 2024-08-06 13:52 ` Lorenzo Stoakes
0 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-06 13:52 UTC (permalink / raw)
To: Petr Tesařík
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, Aug 06, 2024 at 03:42:44PM GMT, Petr Tesařík wrote:
> On Mon, 5 Aug 2024 13:13:56 +0100
> Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>
> > The existing vma_merge() function is no longer required to handle what were
> > previously referred to as cases 1-3 (i.e. the merging of a new VMA), as
> > this is now handled by vma_merge_new_vma().
> >
> > Additionally, we simplify the convoluted control flow of the original,
> > maintaining identical logic only expressed more clearly, doing away with
> > a complicated set of cases and instead logically examining each possible
> > outcome - merging of both the previous and subsequent VMA, merging of the
> > previous VMA alone, and merging of the subsequent VMA alone.
> >
> > We now utilise the previously implemented commit_merge() function to share
> > logic with vma_expand(), deduplicating code and providing less surface area
> > for bugs and confusion.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > mm/vma.c | 474 +++++++++++++++++++++++++++----------------------------
> > mm/vma.h | 6 -
> > 2 files changed, 232 insertions(+), 248 deletions(-)
> >
> > diff --git a/mm/vma.c b/mm/vma.c
> > index b7e3c64d5d68..c55ae035f5d6 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -569,8 +569,7 @@ static int commit_merge(struct vma_merge_struct *vmg,
> > struct vm_area_struct *adjust,
> > struct vm_area_struct *remove,
> > struct vm_area_struct *remove2,
> > - long adj_start,
> > - bool expanded)
> > + long adj_start, bool expanded)
>
> Um. Oops? ;-)
Yup minor oops there :) will fix up and put in patch 8 if/when respun!
>
> Otherwise LGTM.
Thanks!
It's worth reviewing the use of commit_merge() here, which answers your
questions on patch 8 as to the use of the adj_start / expanded params.
Keep in mind this is trying to retain the existing logic as much as possible
to (somewhat!) minimise the delta.
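To save digging back through the diffs, the two call sites as of this series
look roughly as follows (paraphrased from patches 8 and 9, not new code):

	/* vma_expand() - pure expansion of vmg->vma, nothing to adjust: */
	commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true);

	/* vma_merge_modified() - adjust/adj_start carry the sole case in
	 * which vmg->vma shrinks and vmg->next grows, and in which
	 * expanded is therefore false: */
	commit_merge(vmg, adjust,
		     merge_will_delete_vma ? vma : NULL,
		     merge_will_delete_next ? next : NULL,
		     adj_start,
		     !merge_right || merge_will_delete_vma);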
>
> Petr T
>
> > {
> > struct vma_prepare vp;
> >
> > @@ -607,6 +606,236 @@ static int commit_merge(struct vma_merge_struct *vmg,
> > return 0;
> > }
> >
> > +/*
> > + * vma_merge_modified - Attempt to merge VMAs based on a VMA having its
> > + * attributes modified.
> > + *
> > + * @vmg: Describes the modifications being made to a VMA and associated
> > + * metadata.
> > + *
> > + * When the attributes of a range within a VMA change, then it might be possible
> > + * for immediately adjacent VMAs to be merged into that VMA due to having
> > + * identical properties.
> > + *
> > + * This function checks for the existence of any such mergeable VMAs and updates
> > + * the maple tree describing the @vmg->vma->vm_mm address space to account for
> > + * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
> > + *
> > + * As part of this operation, if a merge occurs, the @vmg object will have its
> > + * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
> > + * calls to this function should reset these fields.
> > + *
> > + * Returns: The merged VMA if merge succeeds, or NULL otherwise.
> > + *
> > + * ASSUMPTIONS:
> > > + * - The caller must assign the VMA to be modified to vmg->vma.
> > + * - The caller must have set vmg->prev to the previous VMA, if there is one.
> > + * - The caller does not need to set vmg->next, as we determine this.
> > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> > + */
> > +static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
> > +{
> > + struct vm_area_struct *vma = vmg->vma;
> > + struct vm_area_struct *prev = vmg->prev;
> > + struct vm_area_struct *next, *res;
> > + struct vm_area_struct *anon_dup = NULL;
> > + struct vm_area_struct *adjust = NULL;
> > + unsigned long start = vmg->start;
> > + unsigned long end = vmg->end;
> > + bool left_side = vma && start == vma->vm_start;
> > + bool right_side = vma && end == vma->vm_end;
> > + bool merge_will_delete_vma, merge_will_delete_next;
> > + bool merge_left, merge_right;
> > + bool merge_both = false;
> > + int err = 0;
> > + long adj_start = 0;
> > +
> > + VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
> > + VM_WARN_ON(vmg->next); /* We set this. */
> > + VM_WARN_ON(prev && start <= prev->vm_start);
> > + VM_WARN_ON(start >= end);
> > + /*
> > + * If vma == prev, then we are offset into a VMA. Otherwise, if we are
> > + * not, we must span a portion of the VMA.
> > + */
> > + VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
> > + vmg->end > vma->vm_end));
> > +
> > + /*
> > + * If a special mapping or neither at the furthermost left or right side
> > + * of the VMA, then we have no chance of merging and should abort.
> > + *
> > + * We later require that vma->vm_flags == vm_flags, so this tests
> > + * vma->vm_flags & VM_SPECIAL, too.
> > + */
> > + if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
> > + return NULL;
> > +
> > + if (left_side && prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> > + merge_left = true;
> > + vma_prev(vmg->vmi);
> > + } else {
> > + merge_left = false;
> > + }
> > +
> > + if (right_side) {
> > + next = vmg->next = vma_lookup(vma->vm_mm, end);
> > +
> > + /*
> > + * We can merge right if there is a subsequent VMA, if it is
> > + * immediately adjacent, and if it is compatible with vma.
> > + */
> > + merge_right = next && end == next->vm_start &&
> > + can_vma_merge_before(vmg);
> > +
> > + /*
> > + * We can only merge both if the anonymous VMA of the previous
> > + * VMA is compatible with the anonymous VMA of the subsequent
> > + * VMA.
> > + *
> > + * Otherwise, we default to merging only the left.
> > + */
> > + if (merge_left && merge_right)
> > + merge_right = merge_both =
> > + is_mergeable_anon_vma(prev->anon_vma,
> > + next->anon_vma, NULL);
> > + } else {
> > + merge_right = false;
> > + next = NULL;
> > + }
> > +
> > + /* If we have nothing to merge, abort. */
> > + if (!merge_left && !merge_right)
> > + return NULL;
> > +
> > + /* If we span the entire VMA, a merge implies it will be deleted. */
> > + merge_will_delete_vma = left_side && right_side;
> > + /* If we merge both VMAs, then next is also deleted. */
> > + merge_will_delete_next = merge_both;
> > +
> > + /* No matter what happens, we will be adjusting vma. */
> > + vma_start_write(vma);
> > +
> > + if (merge_left)
> > + vma_start_write(prev);
> > +
> > + if (merge_right)
> > + vma_start_write(next);
> > +
> > + if (merge_both) {
> > + /*
> > + * |<----->|
> > + * |-------*********-------|
> > + * prev vma next
> > + * extend delete delete
> > + */
> > +
> > + vmg->vma = prev;
> > + vmg->start = prev->vm_start;
> > + vmg->end = next->vm_end;
> > + vmg->pgoff = prev->vm_pgoff;
> > +
> > + /*
> > + * We already ensured anon_vma compatibility above, so now it's
> > + * simply a case of, if prev has no anon_vma object, which of
> > + * next or vma contains the anon_vma we must duplicate.
> > + */
> > + err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
> > + } else if (merge_left) {
> > + /*
> > + * |<----->| OR
> > + * |<--------->|
> > + * |-------*************
> > + * prev vma
> > + * extend shrink/delete
> > + */
> > +
> > + unsigned long end = vmg->end;
> > +
> > + vmg->vma = prev;
> > + vmg->start = prev->vm_start;
> > + vmg->pgoff = prev->vm_pgoff;
> > +
> > + if (merge_will_delete_vma) {
> > + /*
> > + * can_vma_merge_after() assumed we would not be
> > + * removing vma, so it skipped the check for
> > + * vm_ops->close, but we are removing vma.
> > + */
> > + if (vma->vm_ops && vma->vm_ops->close)
> > + err = -EINVAL;
> > + } else {
> > + adjust = vma;
> > + adj_start = end - vma->vm_start;
> > + }
> > +
> > + if (!err)
> > + err = dup_anon_vma(prev, vma, &anon_dup);
> > + } else { /* merge_right */
> > + /*
> > + * |<----->| OR
> > + * |<--------->|
> > + * *************-------|
> > + * vma next
> > + * shrink/delete extend
> > + */
> > +
> > + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> > +
> > + VM_WARN_ON(!merge_right);
> > + /* If we are offset into a VMA, then prev must be vma. */
> > + VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
> > +
> > + if (merge_will_delete_vma) {
> > + vmg->vma = next;
> > + vmg->end = next->vm_end;
> > + vmg->pgoff = next->vm_pgoff - pglen;
> > + } else {
> > + /*
> > + * We shrink vma and expand next.
> > + *
> > + * IMPORTANT: This is the ONLY case where the final
> > + * merged VMA is NOT vmg->vma, but rather vmg->next.
> > + */
> > +
> > + vmg->start = vma->vm_start;
> > + vmg->end = start;
> > + vmg->pgoff = vma->vm_pgoff;
> > +
> > + adjust = next;
> > + adj_start = -(vma->vm_end - start);
> > + }
> > +
> > + err = dup_anon_vma(next, vma, &anon_dup);
> > + }
> > +
> > + if (err)
> > + goto abort;
> > +
> > + if (commit_merge(vmg, adjust,
> > + merge_will_delete_vma ? vma : NULL,
> > + merge_will_delete_next ? next : NULL,
> > + adj_start,
> > + /*
> > + * In nearly all cases, we expand vmg->vma. There is
> > + * one exception - merge_right where we partially span
> > + * the VMA. In this case we shrink the end of vmg->vma
> > + * and adjust the start of vmg->next accordingly.
> > + */
> > + !merge_right || merge_will_delete_vma))
> > + return NULL;
> > +
> > + res = merge_left ? prev : next;
> > + khugepaged_enter_vma(res, vmg->flags);
> > +
> > + return res;
> > +
> > +abort:
> > + vma_iter_set(vmg->vmi, start);
> > + vma_iter_load(vmg->vmi);
> > + return NULL;
> > +}
> > +
> > /*
> > * vma_merge_new_vma - Attempt to merge a new VMA into address space
> > *
> > @@ -1022,245 +1251,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> > return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
> > }
> >
> > -/*
> > - * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
> > - * figure out whether that can be merged with its predecessor or its
> > - * successor. Or both (it neatly fills a hole).
> > - *
> > - * In most cases - when called for mmap, brk or mremap - [addr,end) is
> > - * certain not to be mapped by the time vma_merge is called; but when
> > - * called for mprotect, it is certain to be already mapped (either at
> > - * an offset within prev, or at the start of next), and the flags of
> > - * this area are about to be changed to vm_flags - and the no-change
> > - * case has already been eliminated.
> > - *
> > - * The following mprotect cases have to be considered, where **** is
> > - * the area passed down from mprotect_fixup, never extending beyond one
> > - * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
> > - * at the same address as **** and is of the same or larger span, and
> > - * NNNN the next vma after ****:
> > - *
> > - * **** **** ****
> > - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC
> > - * cannot merge might become might become
> > - * PPNNNNNNNNNN PPPPPPPPPPCC
> > - * mmap, brk or case 4 below case 5 below
> > - * mremap move:
> > - * **** ****
> > - * PPPP NNNN PPPPCCCCNNNN
> > - * might become might become
> > - * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
> > - * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or
> > - * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8
> > - *
> > - * It is important for case 8 that the vma CCCC overlapping the
> > - * region **** is never going to extended over NNNN. Instead NNNN must
> > - * be extended in region **** and CCCC must be removed. This way in
> > - * all cases where vma_merge succeeds, the moment vma_merge drops the
> > - * rmap_locks, the properties of the merged vma will be already
> > - * correct for the whole merged range. Some of those properties like
> > - * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
> > - * be correct for the whole merged range immediately after the
> > - * rmap_locks are released. Otherwise if NNNN would be removed and
> > - * CCCC would be extended over the NNNN range, remove_migration_ptes
> > - * or other rmap walkers (if working on addresses beyond the "end"
> > - * parameter) may establish ptes with the wrong permissions of CCCC
> > - * instead of the right permissions of NNNN.
> > - *
> > - * In the code below:
> > - * PPPP is represented by *prev
> > - * CCCC is represented by *curr or not represented at all (NULL)
> > - * NNNN is represented by *next or not represented at all (NULL)
> > - * **** is not represented - it will be merged and the vma containing the
> > - * area is returned, or the function will return NULL
> > - */
> > -static struct vm_area_struct *vma_merge(struct vma_merge_struct *vmg)
> > -{
> > - struct mm_struct *mm = container_of(vmg->vmi->mas.tree, struct mm_struct, mm_mt);
> > - struct vm_area_struct *prev = vmg->prev;
> > - struct vm_area_struct *curr, *next, *res;
> > - struct vm_area_struct *vma, *adjust, *remove, *remove2;
> > - struct vm_area_struct *anon_dup = NULL;
> > - struct vma_prepare vp;
> > - pgoff_t vma_pgoff;
> > - int err = 0;
> > - bool merge_prev = false;
> > - bool merge_next = false;
> > - bool vma_expanded = false;
> > - unsigned long addr = vmg->start;
> > - unsigned long end = vmg->end;
> > - unsigned long vma_start = addr;
> > - unsigned long vma_end = end;
> > - pgoff_t pglen = PHYS_PFN(end - addr);
> > - long adj_start = 0;
> > -
> > - /*
> > - * We later require that vma->vm_flags == vm_flags,
> > - * so this tests vma->vm_flags & VM_SPECIAL, too.
> > - */
> > - if (vmg->flags & VM_SPECIAL)
> > - return NULL;
> > -
> > - /* Does the input range span an existing VMA? (cases 5 - 8) */
> > - curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);
> > -
> > - if (!curr || /* cases 1 - 4 */
> > - end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
> > - next = vmg->next = vma_lookup(mm, end);
> > - else
> > - next = vmg->next = NULL; /* case 5 */
> > -
> > - if (prev) {
> > - vma_start = prev->vm_start;
> > - vma_pgoff = prev->vm_pgoff;
> > -
> > - /* Can we merge the predecessor? */
> > - if (addr == prev->vm_end && can_vma_merge_after(vmg)) {
> > - merge_prev = true;
> > - vma_prev(vmg->vmi);
> > - }
> > - }
> > -
> > - /* Can we merge the successor? */
> > - if (next && can_vma_merge_before(vmg)) {
> > - merge_next = true;
> > - }
> > -
> > - /* Verify some invariant that must be enforced by the caller. */
> > - VM_WARN_ON(prev && addr <= prev->vm_start);
> > - VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
> > - VM_WARN_ON(addr >= end);
> > -
> > - if (!merge_prev && !merge_next)
> > - return NULL; /* Not mergeable. */
> > -
> > - if (merge_prev)
> > - vma_start_write(prev);
> > -
> > - res = vma = prev;
> > - remove = remove2 = adjust = NULL;
> > -
> > - /* Can we merge both the predecessor and the successor? */
> > - if (merge_prev && merge_next &&
> > - is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
> > - vma_start_write(next);
> > - remove = next; /* case 1 */
> > - vma_end = next->vm_end;
> > - err = dup_anon_vma(prev, next, &anon_dup);
> > - if (curr) { /* case 6 */
> > - vma_start_write(curr);
> > - remove = curr;
> > - remove2 = next;
> > - /*
> > - * Note that the dup_anon_vma below cannot overwrite err
> > - * since the first caller would do nothing unless next
> > - * has an anon_vma.
> > - */
> > - if (!next->anon_vma)
> > - err = dup_anon_vma(prev, curr, &anon_dup);
> > - }
> > - } else if (merge_prev) { /* case 2 */
> > - if (curr) {
> > - vma_start_write(curr);
> > - if (end == curr->vm_end) { /* case 7 */
> > - /*
> > - * can_vma_merge_after() assumed we would not be
> > - * removing prev vma, so it skipped the check
> > - * for vm_ops->close, but we are removing curr
> > - */
> > - if (curr->vm_ops && curr->vm_ops->close)
> > - err = -EINVAL;
> > - remove = curr;
> > - } else { /* case 5 */
> > - adjust = curr;
> > - adj_start = end - curr->vm_start;
> > - }
> > - if (!err)
> > - err = dup_anon_vma(prev, curr, &anon_dup);
> > - }
> > - } else { /* merge_next */
> > - vma_start_write(next);
> > - res = next;
> > - if (prev && addr < prev->vm_end) { /* case 4 */
> > - vma_start_write(prev);
> > - vma_end = addr;
> > - adjust = next;
> > - adj_start = -(prev->vm_end - addr);
> > - err = dup_anon_vma(next, prev, &anon_dup);
> > - } else {
> > - /*
> > - * Note that cases 3 and 8 are the ONLY ones where prev
> > - * is permitted to be (but is not necessarily) NULL.
> > - */
> > - vma = next; /* case 3 */
> > - vma_start = addr;
> > - vma_end = next->vm_end;
> > - vma_pgoff = next->vm_pgoff - pglen;
> > - if (curr) { /* case 8 */
> > - vma_pgoff = curr->vm_pgoff;
> > - vma_start_write(curr);
> > - remove = curr;
> > - err = dup_anon_vma(next, curr, &anon_dup);
> > - }
> > - }
> > - }
> > -
> > - /* Error in anon_vma clone. */
> > - if (err)
> > - goto anon_vma_fail;
> > -
> > - if (vma_start < vma->vm_start || vma_end > vma->vm_end)
> > - vma_expanded = true;
> > -
> > - if (vma_expanded) {
> > - vma_iter_config(vmg->vmi, vma_start, vma_end);
> > - } else {
> > - vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
> > - adjust->vm_end);
> > - }
> > -
> > - if (vma_iter_prealloc(vmg->vmi, vma))
> > - goto prealloc_fail;
> > -
> > - init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
> > - VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
> > - vp.anon_vma != adjust->anon_vma);
> > -
> > - vma_prepare(&vp);
> > - vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
> > - vma_set_range(vma, vma_start, vma_end, vma_pgoff);
> > -
> > - if (vma_expanded)
> > - vma_iter_store(vmg->vmi, vma);
> > -
> > - if (adj_start) {
> > - adjust->vm_start += adj_start;
> > - adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
> > - if (adj_start < 0) {
> > - WARN_ON(vma_expanded);
> > - vma_iter_store(vmg->vmi, next);
> > - }
> > - }
> > -
> > - vma_complete(&vp, vmg->vmi, mm);
> > - khugepaged_enter_vma(res, vmg->flags);
> > - return res;
> > -
> > -prealloc_fail:
> > - if (anon_dup)
> > - unlink_anon_vmas(anon_dup);
> > -
> > -anon_vma_fail:
> > - vma_iter_set(vmg->vmi, addr);
> > - vma_iter_load(vmg->vmi);
> > - return NULL;
> > -}
> > -
> > -struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
> > -{
> > - return vma_merge(vmg);
> > -}
> > -
> > /*
> > * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
> > * context and anonymous VMA name within the range [start, end).
> > @@ -1280,7 +1270,7 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
> > struct vm_area_struct *merged;
> >
> > /* First, try to merge. */
> > - merged = vma_merge(vmg);
> > + merged = vma_merge_modified(vmg);
> > if (merged)
> > return merged;
> >
> > diff --git a/mm/vma.h b/mm/vma.h
> > index bbb173053f34..bf29ff569a3d 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -110,12 +110,6 @@ struct vm_area_struct
> >
> > struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
> >
> > -/*
> > - * Temporary wrapper around vma_merge() so we can have a common interface for
> > - * tests.
> > - */
> > -struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg);
> > -
> > struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> > struct vm_area_struct *vma,
> > unsigned long delta);
>
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 09/10] mm: refactor vma_merge() into modify-only vma_merge_modified()
2024-08-05 12:13 ` [PATCH 09/10] mm: refactor vma_merge() into modify-only vma_merge_modified() Lorenzo Stoakes
2024-08-06 13:42 ` Petr Tesařík
@ 2024-08-09 13:44 ` Vlastimil Babka
2024-08-09 13:57 ` Lorenzo Stoakes
1 sibling, 1 reply; 53+ messages in thread
From: Vlastimil Babka @ 2024-08-09 13:44 UTC (permalink / raw)
To: Lorenzo Stoakes, linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett
On 8/5/24 14:13, Lorenzo Stoakes wrote:
> The existing vma_merge() function is no longer required to handle what were
> previously referred to as cases 1-3 (i.e. the merging of a new VMA), as
> this is now handled by vma_merge_new_vma().
>
> Additionally, we simplify the convoluted control flow of the original,
> maintaining identical logic but expressing it more clearly, doing away with
> the complicated set of cases and instead logically examining each possible
> outcome - merging of both the previous and subsequent VMAs, merging of the
> previous VMA alone, and merging of the subsequent VMA alone.
>
> We now utilise the previously implemented commit_merge() function to share
> logic with vma_expand(), deduplicating code and providing less surface area
> for bugs and confusion.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/vma.c | 474 +++++++++++++++++++++++++++----------------------------
> mm/vma.h | 6 -
> 2 files changed, 232 insertions(+), 248 deletions(-)
>
> diff --git a/mm/vma.c b/mm/vma.c
> index b7e3c64d5d68..c55ae035f5d6 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -569,8 +569,7 @@ static int commit_merge(struct vma_merge_struct *vmg,
> struct vm_area_struct *adjust,
> struct vm_area_struct *remove,
> struct vm_area_struct *remove2,
> - long adj_start,
> - bool expanded)
> + long adj_start, bool expanded)
> {
> struct vma_prepare vp;
>
> @@ -607,6 +606,236 @@ static int commit_merge(struct vma_merge_struct *vmg,
> return 0;
> }
>
> +/*
> + * vma_merge_modified - Attempt to merge VMAs based on a VMA having its
> + * attributes modified.
> + *
> + * @vmg: Describes the modifications being made to a VMA and associated
> + * metadata.
> + *
> + * When the attributes of a range within a VMA change, then it might be possible
> + * for immediately adjacent VMAs to be merged into that VMA due to having
> + * identical properties.
> + *
> + * This function checks for the existence of any such mergeable VMAs and updates
> + * the maple tree describing the @vmg->vma->vm_mm address space to account for
> + * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
> + *
> + * As part of this operation, if a merge occurs, the @vmg object will have its
> + * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
> + * calls to this function should reset these fields.
> + *
> + * Returns: The merged VMA if merge succeeds, or NULL otherwise.
> + *
> + * ASSUMPTIONS:
> + * - The caller must assign the VMA to be modified to vmg->vma.
> + * - The caller must have set vmg->prev to the previous VMA, if there is one.
> + * - The caller does not need to set vmg->next, as we determine this.
> + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
Also there's again some assumption about vmi? :)
> + */
> +static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
> +{
> + struct vm_area_struct *vma = vmg->vma;
> + struct vm_area_struct *prev = vmg->prev;
> + struct vm_area_struct *next, *res;
> + struct vm_area_struct *anon_dup = NULL;
> + struct vm_area_struct *adjust = NULL;
> + unsigned long start = vmg->start;
> + unsigned long end = vmg->end;
> + bool left_side = vma && start == vma->vm_start;
> + bool right_side = vma && end == vma->vm_end;
> + bool merge_will_delete_vma, merge_will_delete_next;
> + bool merge_left, merge_right;
> + bool merge_both = false;
> + int err = 0;
> + long adj_start = 0;
> +
> + VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
> + VM_WARN_ON(vmg->next); /* We set this. */
> + VM_WARN_ON(prev && start <= prev->vm_start);
> + VM_WARN_ON(start >= end);
> + /*
> + * If vma == prev, then we are offset into a VMA. Otherwise, we must
> + * span a portion of the VMA.
> + */
> + VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
> + vmg->end > vma->vm_end));
> +
> + /*
> + * If this is a special mapping, or if the range is at neither the
> + * furthermost left nor right side of the VMA, then we have no chance of
> + * merging and should abort.
> + *
> + * We later require that vma->vm_flags == vm_flags, so this tests
> + * vma->vm_flags & VM_SPECIAL, too.
> + */
> + if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
> + return NULL;
> +
> + if (left_side && prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> + merge_left = true;
> + vma_prev(vmg->vmi);
> + } else {
> + merge_left = false;
> + }
> +
> + if (right_side) {
> + next = vmg->next = vma_lookup(vma->vm_mm, end);
> +
> + /*
> + * We can merge right if there is a subsequent VMA, if it is
> + * immediately adjacent, and if it is compatible with vma.
> + */
> + merge_right = next && end == next->vm_start &&
> + can_vma_merge_before(vmg);
> +
> + /*
> + * We can only merge both if the anonymous VMA of the previous
> + * VMA is compatible with the anonymous VMA of the subsequent
> + * VMA.
> + *
> + * Otherwise, we default to merging only the left.
> + */
> + if (merge_left && merge_right)
> + merge_right = merge_both =
> + is_mergeable_anon_vma(prev->anon_vma,
> + next->anon_vma, NULL);
> + } else {
> + merge_right = false;
> + next = NULL;
> + }
> +
> + /* If we have nothing to merge, abort. */
> + if (!merge_left && !merge_right)
> + return NULL;
> +
> + /* If we span the entire VMA, a merge implies it will be deleted. */
> + merge_will_delete_vma = left_side && right_side;
> + /* If we merge both VMAs, then next is also deleted. */
> + merge_will_delete_next = merge_both;
> +
> + /* No matter what happens, we will be adjusting vma. */
> + vma_start_write(vma);
> +
> + if (merge_left)
> + vma_start_write(prev);
> +
> + if (merge_right)
> + vma_start_write(next);
> +
> + if (merge_both) {
> + /*
> + * |<----->|
> + * |-------*********-------|
> + * prev vma next
> + * extend delete delete
> + */
> +
> + vmg->vma = prev;
> + vmg->start = prev->vm_start;
> + vmg->end = next->vm_end;
> + vmg->pgoff = prev->vm_pgoff;
> +
> + /*
> + * We already ensured anon_vma compatibility above, so now it's
> + * simply a case of, if prev has no anon_vma object, which of
> + * next or vma contains the anon_vma we must duplicate.
> + */
> + err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
> + } else if (merge_left) {
> + /*
> + * |<----->| OR
> + * |<--------->|
> + * |-------*************
> + * prev vma
> + * extend shrink/delete
> + */
> +
> + unsigned long end = vmg->end;
Nit: This is only used once below, thus could be used directly?
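
i.e. something like (sketch only):

	adj_start = vmg->end - vma->vm_start;

since vmg->end is not modified in this branch before that point.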
> +
> + vmg->vma = prev;
> + vmg->start = prev->vm_start;
> + vmg->pgoff = prev->vm_pgoff;
> +
> + if (merge_will_delete_vma) {
> + /*
> + * can_vma_merge_after() assumed we would not be
> + * removing vma, so it skipped the check for
> + * vm_ops->close, but we are removing vma.
> + */
> + if (vma->vm_ops && vma->vm_ops->close)
> + err = -EINVAL;
> + } else {
> + adjust = vma;
> + adj_start = end - vma->vm_start;
> + }
> +
> + if (!err)
> + err = dup_anon_vma(prev, vma, &anon_dup);
> + } else { /* merge_right */
> + /*
> + * |<----->| OR
> + * |<--------->|
> + * *************-------|
> + * vma next
> + * shrink/delete extend
> + */
> +
> + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> +
> + VM_WARN_ON(!merge_right);
> + /* If we are offset into a VMA, then prev must be vma. */
> + VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
> +
> + if (merge_will_delete_vma) {
> + vmg->vma = next;
> + vmg->end = next->vm_end;
> + vmg->pgoff = next->vm_pgoff - pglen;
> + } else {
> + /*
> + * We shrink vma and expand next.
> + *
> + * IMPORTANT: This is the ONLY case where the final
> + * merged VMA is NOT vmg->vma, but rather vmg->next.
> + */
> +
> + vmg->start = vma->vm_start;
> + vmg->end = start;
> + vmg->pgoff = vma->vm_pgoff;
> +
> + adjust = next;
> + adj_start = -(vma->vm_end - start);
> + }
> +
> + err = dup_anon_vma(next, vma, &anon_dup);
> + }
> +
> + if (err)
> + goto abort;
> +
> + if (commit_merge(vmg, adjust,
> + merge_will_delete_vma ? vma : NULL,
> + merge_will_delete_next ? next : NULL,
> + adj_start,
> + /*
> + * In nearly all cases, we expand vmg->vma. There is
> + * one exception - merge_right where we partially span
> + * the VMA. In this case we shrink the end of vmg->vma
> + * and adjust the start of vmg->next accordingly.
> + */
> + !merge_right || merge_will_delete_vma))
> + return NULL;
If this fails, you need to unlink_anon_vmas()? The old code did.
> + res = merge_left ? prev : next;
> + khugepaged_enter_vma(res, vmg->flags);
> +
> + return res;
> +
> +abort:
> + vma_iter_set(vmg->vmi, start);
> + vma_iter_load(vmg->vmi);
> + return NULL;
> +}
> +
> /*
> * vma_merge_new_vma - Attempt to merge a new VMA into address space
> *
> @@ -1022,245 +1251,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
> }
>
> -/*
> - * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
> - * figure out whether that can be merged with its predecessor or its
> - * successor. Or both (it neatly fills a hole).
> - *
> - * In most cases - when called for mmap, brk or mremap - [addr,end) is
> - * certain not to be mapped by the time vma_merge is called; but when
> - * called for mprotect, it is certain to be already mapped (either at
> - * an offset within prev, or at the start of next), and the flags of
> - * this area are about to be changed to vm_flags - and the no-change
> - * case has already been eliminated.
> - *
> - * The following mprotect cases have to be considered, where **** is
> - * the area passed down from mprotect_fixup, never extending beyond one
> - * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
> - * at the same address as **** and is of the same or larger span, and
> - * NNNN the next vma after ****:
> - *
> - * **** **** ****
> - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC
> - * cannot merge might become might become
> - * PPNNNNNNNNNN PPPPPPPPPPCC
> - * mmap, brk or case 4 below case 5 below
> - * mremap move:
> - * **** ****
> - * PPPP NNNN PPPPCCCCNNNN
> - * might become might become
> - * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
> - * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or
> - * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8
> - *
> - * It is important for case 8 that the vma CCCC overlapping the
> - * region **** is never going to extended over NNNN. Instead NNNN must
> - * be extended in region **** and CCCC must be removed. This way in
> - * all cases where vma_merge succeeds, the moment vma_merge drops the
> - * rmap_locks, the properties of the merged vma will be already
> - * correct for the whole merged range. Some of those properties like
> - * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
> - * be correct for the whole merged range immediately after the
> - * rmap_locks are released. Otherwise if NNNN would be removed and
> - * CCCC would be extended over the NNNN range, remove_migration_ptes
> - * or other rmap walkers (if working on addresses beyond the "end"
> - * parameter) may establish ptes with the wrong permissions of CCCC
> - * instead of the right permissions of NNNN.
> - *
> - * In the code below:
> - * PPPP is represented by *prev
> - * CCCC is represented by *curr or not represented at all (NULL)
> - * NNNN is represented by *next or not represented at all (NULL)
> - * **** is not represented - it will be merged and the vma containing the
> - * area is returned, or the function will return NULL
> - */
RIP our precious diagrams.
^ permalink raw reply [flat|nested] 53+ messages in thread* Re: [PATCH 09/10] mm: refactor vma_merge() into modify-only vma_merge_modified()
2024-08-09 13:44 ` Vlastimil Babka
@ 2024-08-09 13:57 ` Lorenzo Stoakes
0 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-09 13:57 UTC (permalink / raw)
To: Vlastimil Babka; +Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett
On Fri, Aug 09, 2024 at 03:44:00PM GMT, Vlastimil Babka wrote:
> On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > The existing vma_merge() function is no longer required to handle what were
> > previously referred to as cases 1-3 (i.e. the merging of a new VMA), as
> > this is now handled by vma_merge_new_vma().
> >
> > Additionally, we simplify the convoluted control flow of the original,
> > maintaining identical logic but expressing it more clearly, doing away with
> > the complicated set of cases and instead logically examining each possible
> > outcome - merging of both the previous and subsequent VMAs, merging of the
> > previous VMA alone, and merging of the subsequent VMA alone.
> >
> > We now utilise the previously implemented commit_merge() function to share
> > logic with vma_expand(), deduplicating code and providing less surface area
> > for bugs and confusion.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > mm/vma.c | 474 +++++++++++++++++++++++++++----------------------------
> > mm/vma.h | 6 -
> > 2 files changed, 232 insertions(+), 248 deletions(-)
> >
> > diff --git a/mm/vma.c b/mm/vma.c
> > index b7e3c64d5d68..c55ae035f5d6 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -569,8 +569,7 @@ static int commit_merge(struct vma_merge_struct *vmg,
> > struct vm_area_struct *adjust,
> > struct vm_area_struct *remove,
> > struct vm_area_struct *remove2,
> > - long adj_start,
> > - bool expanded)
> > + long adj_start, bool expanded)
> > {
> > struct vma_prepare vp;
> >
> > @@ -607,6 +606,236 @@ static int commit_merge(struct vma_merge_struct *vmg,
> > return 0;
> > }
> >
> > +/*
> > + * vma_merge_modified - Attempt to merge VMAs based on a VMA having its
> > + * attributes modified.
> > + *
> > + * @vmg: Describes the modifications being made to a VMA and associated
> > + * metadata.
> > + *
> > + * When the attributes of a range within a VMA change, then it might be possible
> > + * for immediately adjacent VMAs to be merged into that VMA due to having
> > + * identical properties.
> > + *
> > + * This function checks for the existence of any such mergeable VMAs and updates
> > + * the maple tree describing the @vmg->vma->vm_mm address space to account for
> > + * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
> > + *
> > + * As part of this operation, if a merge occurs, the @vmg object will have its
> > + * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
> > + * calls to this function should reset these fields.
> > + *
> > + * Returns: The merged VMA if merge succeeds, or NULL otherwise.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must assign the VMA to be modified to vmg->vma.
> > + * - The caller must have set vmg->prev to the previous VMA, if there is one.
> > + * - The caller does not need to set vmg->next, as we determine this.
> > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
>
> Also there's again some assumption about vmi? :)
Yeah I will add.
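
Something along these lines perhaps (a sketch only, exact wording TBD):

 * - The caller must have positioned vmg->vmi at the VMA to be modified,
 *   consistent with vmg->start.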
>
> > + */
> > +static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
> > +{
> > + struct vm_area_struct *vma = vmg->vma;
> > + struct vm_area_struct *prev = vmg->prev;
> > + struct vm_area_struct *next, *res;
> > + struct vm_area_struct *anon_dup = NULL;
> > + struct vm_area_struct *adjust = NULL;
> > + unsigned long start = vmg->start;
> > + unsigned long end = vmg->end;
> > + bool left_side = vma && start == vma->vm_start;
> > + bool right_side = vma && end == vma->vm_end;
> > + bool merge_will_delete_vma, merge_will_delete_next;
> > + bool merge_left, merge_right;
> > + bool merge_both = false;
> > + int err = 0;
> > + long adj_start = 0;
> > +
> > + VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
> > + VM_WARN_ON(vmg->next); /* We set this. */
> > + VM_WARN_ON(prev && start <= prev->vm_start);
> > + VM_WARN_ON(start >= end);
> > + /*
> > + * If vma == prev, then we are offset into a VMA. Otherwise, we must
> > + * span a portion of the VMA.
> > + */
> > + VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
> > + vmg->end > vma->vm_end));
> > +
> > + /*
> > + * If this is a special mapping, or if the range is at neither the
> > + * furthermost left nor right side of the VMA, then we have no chance
> > + * of merging and should abort.
> > + *
> > + * We later require that vma->vm_flags == vm_flags, so this tests
> > + * vma->vm_flags & VM_SPECIAL, too.
> > + */
> > + if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
> > + return NULL;
> > +
> > + if (left_side && prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> > + merge_left = true;
> > + vma_prev(vmg->vmi);
> > + } else {
> > + merge_left = false;
> > + }
> > +
> > + if (right_side) {
> > + next = vmg->next = vma_lookup(vma->vm_mm, end);
> > +
> > + /*
> > + * We can merge right if there is a subsequent VMA, if it is
> > + * immediately adjacent, and if it is compatible with vma.
> > + */
> > + merge_right = next && end == next->vm_start &&
> > + can_vma_merge_before(vmg);
> > +
> > + /*
> > + * We can only merge both if the anonymous VMA of the previous
> > + * VMA is compatible with the anonymous VMA of the subsequent
> > + * VMA.
> > + *
> > + * Otherwise, we default to merging only the left.
> > + */
> > + if (merge_left && merge_right)
> > + merge_right = merge_both =
> > + is_mergeable_anon_vma(prev->anon_vma,
> > + next->anon_vma, NULL);
> > + } else {
> > + merge_right = false;
> > + next = NULL;
> > + }
> > +
> > + /* If we have nothing to merge, abort. */
> > + if (!merge_left && !merge_right)
> > + return NULL;
> > +
> > + /* If we span the entire VMA, a merge implies it will be deleted. */
> > + merge_will_delete_vma = left_side && right_side;
> > + /* If we merge both VMAs, then next is also deleted. */
> > + merge_will_delete_next = merge_both;
> > +
> > + /* No matter what happens, we will be adjusting vma. */
> > + vma_start_write(vma);
> > +
> > + if (merge_left)
> > + vma_start_write(prev);
> > +
> > + if (merge_right)
> > + vma_start_write(next);
> > +
> > + if (merge_both) {
> > + /*
> > + * |<----->|
> > + * |-------*********-------|
> > + * prev vma next
> > + * extend delete delete
> > + */
> > +
> > + vmg->vma = prev;
> > + vmg->start = prev->vm_start;
> > + vmg->end = next->vm_end;
> > + vmg->pgoff = prev->vm_pgoff;
> > +
> > + /*
> > + * We already ensured anon_vma compatibility above, so now it's
> > + * simply a case of, if prev has no anon_vma object, which of
> > + * next or vma contains the anon_vma we must duplicate.
> > + */
> > + err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
> > + } else if (merge_left) {
> > + /*
> > + * |<----->| OR
> > + * |<--------->|
> > + * |-------*************
> > + * prev vma
> > + * extend shrink/delete
> > + */
> > +
> > + unsigned long end = vmg->end;
>
> Nit: This is only used once below, thus could be used directly?
Yeah, this is probably a holdover from a previous (maybe buggy, before I
fixed it) version of this code which used it more than once.
Will fix.
>
> > +
> > + vmg->vma = prev;
> > + vmg->start = prev->vm_start;
> > + vmg->pgoff = prev->vm_pgoff;
> > +
> > + if (merge_will_delete_vma) {
> > + /*
> > + * can_vma_merge_after() assumed we would not be
> > + * removing vma, so it skipped the check for
> > + * vm_ops->close, but we are removing vma.
> > + */
> > + if (vma->vm_ops && vma->vm_ops->close)
> > + err = -EINVAL;
> > + } else {
> > + adjust = vma;
> > + adj_start = end - vma->vm_start;
> > + }
> > +
> > + if (!err)
> > + err = dup_anon_vma(prev, vma, &anon_dup);
> > + } else { /* merge_right */
> > + /*
> > + * |<----->| OR
> > + * |<--------->|
> > + * *************-------|
> > + * vma next
> > + * shrink/delete extend
> > + */
> > +
> > + pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> > +
> > + VM_WARN_ON(!merge_right);
> > + /* If we are offset into a VMA, then prev must be vma. */
> > + VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
> > +
> > + if (merge_will_delete_vma) {
> > + vmg->vma = next;
> > + vmg->end = next->vm_end;
> > + vmg->pgoff = next->vm_pgoff - pglen;
> > + } else {
> > + /*
> > + * We shrink vma and expand next.
> > + *
> > + * IMPORTANT: This is the ONLY case where the final
> > + * merged VMA is NOT vmg->vma, but rather vmg->next.
> > + */
> > +
> > + vmg->start = vma->vm_start;
> > + vmg->end = start;
> > + vmg->pgoff = vma->vm_pgoff;
> > +
> > + adjust = next;
> > + adj_start = -(vma->vm_end - start);
> > + }
> > +
> > + err = dup_anon_vma(next, vma, &anon_dup);
> > + }
> > +
> > + if (err)
> > + goto abort;
> > +
> > + if (commit_merge(vmg, adjust,
> > + merge_will_delete_vma ? vma : NULL,
> > + merge_will_delete_next ? next : NULL,
> > + adj_start,
> > + /*
> > + * In nearly all cases, we expand vmg->vma. There is
> > + * one exception - merge_right where we partially span
> > + * the VMA. In this case we shrink the end of vmg->vma
> > + * and adjust the start of vmg->next accordingly.
> > + */
> > + !merge_right || merge_will_delete_vma))
> > + return NULL;
>
> If this fails, you need to unlink_anon_vmas()? The old code did.
You're right, good spot - this is a subtle one...
Will fix and I'll add a test for this too. The preallocation would have to
fail, but we can simulate that now...
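
Roughly this, mirroring what the old prealloc_fail path did (sketch only):

	if (commit_merge(vmg, adjust,
			 merge_will_delete_vma ? vma : NULL,
			 merge_will_delete_next ? next : NULL,
			 adj_start,
			 !merge_right || merge_will_delete_vma)) {
		if (anon_dup)
			unlink_anon_vmas(anon_dup);
		goto abort;
	}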
>
>
> > + res = merge_left ? prev : next;
> > + khugepaged_enter_vma(res, vmg->flags);
> > +
> > + return res;
> > +
> > +abort:
> > + vma_iter_set(vmg->vmi, start);
> > + vma_iter_load(vmg->vmi);
> > + return NULL;
> > +}
> > +
> > /*
> > * vma_merge_new_vma - Attempt to merge a new VMA into address space
> > *
> > @@ -1022,245 +1251,6 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> > return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
> > }
> >
> > -/*
> > - * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
> > - * figure out whether that can be merged with its predecessor or its
> > - * successor. Or both (it neatly fills a hole).
> > - *
> > - * In most cases - when called for mmap, brk or mremap - [addr,end) is
> > - * certain not to be mapped by the time vma_merge is called; but when
> > - * called for mprotect, it is certain to be already mapped (either at
> > - * an offset within prev, or at the start of next), and the flags of
> > - * this area are about to be changed to vm_flags - and the no-change
> > - * case has already been eliminated.
> > - *
> > - * The following mprotect cases have to be considered, where **** is
> > - * the area passed down from mprotect_fixup, never extending beyond one
> > - * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
> > - * at the same address as **** and is of the same or larger span, and
> > - * NNNN the next vma after ****:
> > - *
> > - * **** **** ****
> > - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC
> > - * cannot merge might become might become
> > - * PPNNNNNNNNNN PPPPPPPPPPCC
> > - * mmap, brk or case 4 below case 5 below
> > - * mremap move:
> > - * **** ****
> > - * PPPP NNNN PPPPCCCCNNNN
> > - * might become might become
> > - * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
> > - * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or
> > - * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8
> > - *
> > - * It is important for case 8 that the vma CCCC overlapping the
> > - * region **** is never going to extended over NNNN. Instead NNNN must
> > - * be extended in region **** and CCCC must be removed. This way in
> > - * all cases where vma_merge succeeds, the moment vma_merge drops the
> > - * rmap_locks, the properties of the merged vma will be already
> > - * correct for the whole merged range. Some of those properties like
> > - * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
> > - * be correct for the whole merged range immediately after the
> > - * rmap_locks are released. Otherwise if NNNN would be removed and
> > - * CCCC would be extended over the NNNN range, remove_migration_ptes
> > - * or other rmap walkers (if working on addresses beyond the "end"
> > - * parameter) may establish ptes with the wrong permissions of CCCC
> > - * instead of the right permissions of NNNN.
> > - *
> > - * In the code below:
> > - * PPPP is represented by *prev
> > - * CCCC is represented by *curr or not represented at all (NULL)
> > - * NNNN is represented by *next or not represented at all (NULL)
> > - * **** is not represented - it will be merged and the vma containing the
> > - * area is returned, or the function will return NULL
> > - */
>
> RIP our precious diagrams.
>
I always disliked these... and I can say so because I was involved in
changing them so it's self-criticism too :)
They very much represent the overwrought complexity of trying to do
everything in one function.
^ permalink raw reply [flat|nested] 53+ messages in thread
* [PATCH 10/10] mm: rework vm_ops->close() handling on VMA merge
2024-08-05 12:13 [PATCH 00/10] mm: remove vma_merge() Lorenzo Stoakes
` (8 preceding siblings ...)
2024-08-05 12:13 ` [PATCH 09/10] mm: refactor vma_merge() into modify-only vma_merge_modified() Lorenzo Stoakes
@ 2024-08-05 12:13 ` Lorenzo Stoakes
2024-08-06 13:55 ` Petr Tesařík
2024-08-09 14:25 ` Vlastimil Babka
9 siblings, 2 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-05 12:13 UTC (permalink / raw)
To: linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett, Vlastimil Babka
In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
removed in mergeability test") we relaxed the VMA merge rules for VMAs
possessing a vm_ops->close() hook, permitting this operation in instances
where we wouldn't delete the VMA as part of the merge operation.
This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix vma_merge()
case 7 with vma_ops->close") to account for a subtle case that the previous
commit had not taken into account.
In both instances, we first rely on is_mergeable_vma() to determine whether
we might be dealing with a VMA that could be removed, taking advantage of
the fact that a 'previous' VMA will never be deleted, only VMAs that follow
it.
The second patch corrects the instance where a merge of the previous VMA
into a subsequent one did not correctly check whether the subsequent VMA
had a vm_ops->close() handler.
Both changes prevent merge cases that are actually permissible (for
instance a merge of a VMA into a following VMA with a vm_ops->close(), but
with no previous VMA, which would result in the next VMA being extended,
not deleted).
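For instance (illustrative):

	vm_ops->close:     -      !NULL
	                [ vma ][  next  ]

Here only vma, which has no close() hook, would be deleted, with next
extended leftwards, so the merge is in fact safe.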
In addition, both changes fail to consider the case where a VMA that would
otherwise be merged with the previous and next VMA might have
vm_ops->close(), on the assumption that for this to be the case, all three
would have to have the same vma->vm_file to be mergeable and thus the same
vm_ops.
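That is, a configuration such as:

	vm_ops->close:     -     !NULL    NULL
	                [ prev ][ vma ][ next ]

was assumed not to occur, since vma and next being mergeable would imply a
shared vm_ops, meaning next->vm_ops->close would be set too and the existing
check would catch it.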
Additionally, both changes operate at 50,000 feet, trying to guess
whether a VMA will be deleted.
As we have majorly refactored the VMA merge operation and de-duplicated
code to the point where we know precisely where deletions will occur, this
patch removes the aforementioned checks altogether and instead explicitly
checks whether a VMA will be deleted.
In cases where a reduced merge is still possible (where we merge both
previous and next VMA but the next VMA has a vm_ops->close hook, meaning we
could just merge the previous and current VMA), we do so, otherwise the
merge is not permitted.
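For instance (illustrative, mirroring the userland tests below):

	0123456789        0123456789
	PPPVVNNNN    ->   PPPPPNNNN

Here next possesses a close() hook, so prev and vma still merge (deleting
vma, which is permitted), while next is left in place.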
We take advantage of our userland testing to assert that this functions
correctly - replacing the previous limited vm_ops->close() tests with tests
for every single case where we delete a VMA.
We also update all testing for both new and modified VMAs to set
vma->vm_ops->close() in every single instance where this would not prevent
the merge, to assert that we indeed never do so.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/vma.c | 69 ++++++++-----
tools/testing/vma/vma.c | 213 ++++++++++++++++++++++++----------------
2 files changed, 173 insertions(+), 109 deletions(-)
diff --git a/mm/vma.c b/mm/vma.c
index c55ae035f5d6..9c779fc65ba8 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -10,14 +10,6 @@
static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
- /*
- * If the vma has a ->close operation then the driver probably needs to
- * release per-vma resources, so we don't attempt to merge those if the
- * caller indicates the current vma may be removed as part of the merge,
- * which is the case if we are attempting to merge the next VMA into
- * this one.
- */
- bool may_remove_vma = merge_next;
if (!mpol_equal(vmg->policy, vma_policy(vma)))
return false;
@@ -33,8 +25,6 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
return false;
if (vma->vm_file != vmg->file)
return false;
- if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
- return false;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
return false;
if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
@@ -606,6 +596,12 @@ static int commit_merge(struct vma_merge_struct *vmg,
return 0;
}
+/* We can only remove VMAs when merging if they do not have a close hook. */
+static bool can_merge_remove_vma(struct vm_area_struct *vma)
+{
+ return !vma->vm_ops || !vma->vm_ops->close;
+}
+
/*
* vma_merge_modified - Attempt to merge VMAs based on a VMA having its
* attributes modified.
@@ -710,9 +706,30 @@ static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
/* If we span the entire VMA, a merge implies it will be deleted. */
merge_will_delete_vma = left_side && right_side;
- /* If we merge both VMAs, then next is also deleted. */
+
+ /*
+ * If we need to remove vma in its entirety but are unable to do so,
+ * we have no sensible recourse but to abort the merge.
+ */
+ if (merge_will_delete_vma && !can_merge_remove_vma(vma))
+ return NULL;
+
+ /*
+ * If we merge both VMAs, then next is also deleted. This implies
+ * merge_will_delete_vma also.
+ */
merge_will_delete_next = merge_both;
+ /*
+ * If we cannot delete next, then we can reduce the operation to merging
+ * prev and vma (thereby deleting vma).
+ */
+ if (merge_will_delete_next && !can_merge_remove_vma(next)) {
+ merge_will_delete_next = false;
+ merge_right = false;
+ merge_both = false;
+ }
+
/* No matter what happens, we will be adjusting vma. */
vma_start_write(vma);
@@ -756,21 +773,12 @@ static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
vmg->start = prev->vm_start;
vmg->pgoff = prev->vm_pgoff;
- if (merge_will_delete_vma) {
- /*
- * can_vma_merge_after() assumed we would not be
- * removing vma, so it skipped the check for
- * vm_ops->close, but we are removing vma.
- */
- if (vma->vm_ops && vma->vm_ops->close)
- err = -EINVAL;
- } else {
+ if (!merge_will_delete_vma) {
adjust = vma;
adj_start = end - vma->vm_start;
}
- if (!err)
- err = dup_anon_vma(prev, vma, &anon_dup);
+ err = dup_anon_vma(prev, vma, &anon_dup);
} else { /* merge_right */
/*
* |<----->| OR
@@ -886,6 +894,8 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
unsigned long end = vmg->end;
pgoff_t pgoff = vmg->pgoff;
pgoff_t pglen = PHYS_PFN(end - start);
+ bool merge_next = false;
+ struct anon_vma *anon_vma = vmg->anon_vma;
VM_WARN_ON(vmg->vma);
@@ -916,8 +926,9 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
vmg->end = next->vm_end;
vmg->vma = next;
vmg->pgoff = next->vm_pgoff - pglen;
-
vmg->anon_vma = next->anon_vma;
+
+ merge_next = true;
}
/* If we can merge with the previous VMA, adjust vmg accordingly. */
@@ -925,6 +936,16 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
vmg->start = prev->vm_start;
vmg->vma = prev;
vmg->pgoff = prev->vm_pgoff;
+
+ /*
+ * If this merge would result in removal of the next VMA but we
+ * are not permitted to do so, reduce the operation to merging
+ * prev and vma.
+ */
+ if (merge_next && !can_merge_remove_vma(next)) {
+ vmg->end = end;
+ vmg->anon_vma = anon_vma;
+ }
} else if (prev) {
vma_iter_next_range(vmg->vmi);
}
@@ -978,6 +999,8 @@ int vma_expand(struct vma_merge_struct *vmg)
int ret;
remove_next = true;
+ /* This should already have been checked by this point. */
+ VM_WARN_ON(!can_merge_remove_vma(next));
vma_start_write(next);
ret = dup_anon_vma(vma, next, &anon_dup);
if (ret)
diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
index e465dc22e2d0..0c0a6ffcfc98 100644
--- a/tools/testing/vma/vma.c
+++ b/tools/testing/vma/vma.c
@@ -327,6 +327,9 @@ static bool test_vma_merge_new_vma(void)
struct anon_vma_chain dummy_anon_vma_chain_d = {
.anon_vma = &dummy_anon_vma,
};
+ const struct vm_operations_struct vm_ops = {
+ .close = dummy_close,
+ };
int count;
struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d;
bool merged;
@@ -370,6 +373,7 @@ static bool test_vma_merge_new_vma(void)
* 0123456789abc
* AA*B DD CC
*/
+ vma_a->vm_ops = &vm_ops; /* This should have no impact. */
vma_b->anon_vma = &dummy_anon_vma;
vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, flags, &merged);
ASSERT_EQ(vma, vma_a);
@@ -406,6 +410,7 @@ static bool test_vma_merge_new_vma(void)
* AAAAA *DD CC
*/
vma_d->anon_vma = &dummy_anon_vma;
+ vma_d->vm_ops = &vm_ops; /* This should have no impact. */
vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, flags, &merged);
ASSERT_EQ(vma, vma_d);
/* Prepend. */
@@ -423,6 +428,7 @@ static bool test_vma_merge_new_vma(void)
* 0123456789abc
* AAAAA*DDD CC
*/
+ vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */
vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, flags, &merged);
ASSERT_EQ(vma, vma_a);
/* Merge with A, delete D. */
@@ -573,120 +579,145 @@ static bool test_vma_merge_with_close(void)
struct vma_merge_struct vmg = {
.vmi = &vmi,
};
- struct vm_operations_struct vm_ops = {};
- struct vm_area_struct *vma_next =
- alloc_and_link_vma(&mm, 0x2000, 0x3000, 2, flags);
- struct vm_area_struct *vma;
+ const struct vm_operations_struct vm_ops = {
+ .close = dummy_close,
+ };
+ struct vm_area_struct *vma_prev, *vma_next, *vma;
/*
- * When we merge VMAs we sometimes have to delete others as part of the
- * operation.
- *
- * Considering the two possible adjacent VMAs to which a VMA can be
- * merged:
- *
- * [ prev ][ vma ][ next ]
- *
- * In no case will we need to delete prev. If the operation is
- * mergeable, then prev will be extended with one or both of vma and
- * next deleted.
- *
- * As a result, during initial mergeability checks, only
- * can_vma_merge_before() (which implies the VMA being merged with is
- * 'next' as shown above) bothers to check to see whether the next VMA
- * has a vm_ops->close() callback that will need to be called when
- * removed.
- *
- * If it does, then we cannot merge as the resources that the close()
- * operation potentially clears down are tied only to the existing VMA
- * range and we have no way of extending those to the nearly merged one.
- *
- * We must consider two scenarios:
- *
- * A.
+ * When merging VMAs we are not permitted to remove any VMA that has a
+ * vm_ops->close() hook.
*
- * vm_ops->close: - - !NULL
- * [ prev ][ vma ][ next ]
- *
- * Where prev may or may not be present/mergeable.
- *
- * This is picked up by a specific check in can_vma_merge_before().
- *
- * B.
- *
- * vm_ops->close: - !NULL
- * [ prev ][ vma ]
- *
- * Where prev and vma are present and mergeable.
- *
- * This is picked up by a specific check in vma_merge_modified().
- *
- * IMPORTANT NOTE: We make the assumption that the following case:
+ * This is because executing this hook may clear state that is pertinent
+ * to the VMA range as a whole.
+ */
+
+ /*
+ * The only case of a new VMA merge that results in a VMA being deleted
+ * is one where both the previous and next VMAs are merged - in this
+ * instance the next VMA is deleted, and the previous VMA is extended.
*
- * - !NULL NULL
- * [ prev ][ vma ][ next ]
+ * If we are unable to do so, we reduce the operation to simply
+ * extending the prev VMA and not merging next.
*
- * Cannot occur, because vma->vm_ops being the same implies the same
- * vma->vm_file, and therefore this would mean that next->vm_ops->close
- * would be set too, and thus scenario A would pick this up.
+ * 0123456789
+ * PPP**NNNN
+ * ->
+ * 0123456789
+ * PPPPPNNNN
*/
- ASSERT_NE(vma_next, NULL);
+ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
+ vma_next->vm_ops = &vm_ops;
+
+ vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+ ASSERT_EQ(vma_merge_new_vma(&vmg), vma_prev);
+ ASSERT_EQ(vma_prev->vm_start, 0);
+ ASSERT_EQ(vma_prev->vm_end, 0x5000);
+ ASSERT_EQ(vma_prev->vm_pgoff, 0);
+
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
/*
- * SCENARIO A
+ * When modifying an existing VMA there are further cases where we
+ * delete VMAs.
+ *
+ * <>
+ * 0123456789
+ * PPPVV
*
- * 0123
- * *N
+ * In this instance, if vma has a close hook, the merge simply cannot
+ * proceed.
*/
- /* Make the next VMA have a close() callback. */
- vm_ops.close = dummy_close;
- vma_next->vm_ops = (const struct vm_operations_struct *)&vm_ops;
+ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+ vma->vm_ops = &vm_ops;
- /* Our proposed VMA has characteristics that would otherwise be merged. */
- vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags);
+ vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+ vmg.prev = vma_prev;
+ vmg.vma = vma;
- /* The next VMA having a close() operator should cause the merge to fail.*/
- ASSERT_EQ(vma_merge_new_vma(&vmg), NULL);
+ ASSERT_EQ(vma_merge_modified(&vmg), NULL);
- /* Now create the VMA so we can merge via modified flags */
- vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags);
- vma = alloc_and_link_vma(&mm, 0x1000, 0x2000, 1, flags);
- vmg.vma = vma;
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
/*
- * The VMA being modified in a way that would otherwise merge should
- * also fail.
+ * This case is mirrored if merging with next.
+ *
+ * <>
+ * 0123456789
+ * VVNNNN
+ *
+ * In this instance, if vma has a close hook, the merge simply cannot
+ * proceed.
*/
+
+ vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+ vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
+ vma->vm_ops = &vm_ops;
+
+ vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+ vmg.vma = vma;
+
ASSERT_EQ(vma_merge_modified(&vmg), NULL);
- /* SCENARIO B
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
+ /*
+ * Finally, we consider two variants of the case where we modify a VMA
+ * to merge with both the previous and next VMAs.
*
- * 0123
- * P*
+ * The first variant is where vma has a close hook. In this instance, no
+ * merge can proceed.
*
- * In order for this scenario to trigger, the VMA currently being
- * modified must also have a .close().
+ * <>
+ * 0123456789
+ * PPPVVNNNN
*/
- /* Reset VMG state. */
- vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags);
+ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+ vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
+ vma->vm_ops = &vm_ops;
+
+ vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+ vmg.prev = vma_prev;
+ vmg.vma = vma;
+
+ ASSERT_EQ(vma_merge_modified(&vmg), NULL);
+
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 3);
+
/*
- * Make next unmergeable, and don't let the scenario A check pick this
- * up, we want to reproduce scenario B only.
+ * The second variant is where next has a close hook. In this instance,
+ * we reduce the operation to a merge between prev and vma.
+ *
+ * <>
+ * 0123456789
+ * PPPVVNNNN
+ * ->
+ * 0123456789
+ * PPPPPNNNN
*/
- vma_next->vm_ops = NULL;
- vma_next->__vm_flags &= ~VM_MAYWRITE;
- /* Allocate prev. */
- vmg.prev = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags);
- /* Assign a vm_ops->close() function to VMA explicitly. */
- vma->vm_ops = (const struct vm_operations_struct *)&vm_ops;
+
+ vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+ vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
+ vma_next->vm_ops = &vm_ops;
+
+ vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+ vmg.prev = vma_prev;
vmg.vma = vma;
- /* Make sure merge does not occur. */
- ASSERT_EQ(vma_merge_modified(&vmg), NULL);
- cleanup_mm(&mm, &vmi);
+ ASSERT_EQ(vma_merge_modified(&vmg), vma_prev);
+ ASSERT_EQ(vma_prev->vm_start, 0);
+ ASSERT_EQ(vma_prev->vm_end, 0x5000);
+ ASSERT_EQ(vma_prev->vm_pgoff, 0);
+
+ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
+
return true;
}
@@ -699,6 +730,9 @@ static bool test_vma_merge_modified(void)
struct vma_merge_struct vmg = {
.vmi = &vmi,
};
+ const struct vm_operations_struct vm_ops = {
+ .close = dummy_close,
+ };
/*
* Merge right case - partial span.
@@ -711,7 +745,9 @@ static bool test_vma_merge_modified(void)
* VNNNNNN
*/
vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags);
+ vma->vm_ops = &vm_ops; /* This should have no impact. */
vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags);
+ vma_next->vm_ops = &vm_ops; /* This should have no impact. */
vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags);
vmg.vma = vma;
vmg.prev = vma;
@@ -743,6 +779,7 @@ static bool test_vma_merge_modified(void)
*/
vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags);
vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags);
+ vma_next->vm_ops = &vm_ops; /* This should have no impact. */
vmg_set_range(&vmg, 0x2000, 0x6000, 2, flags);
vmg.vma = vma;
vma->anon_vma = &dummy_anon_vma;
@@ -768,7 +805,9 @@ static bool test_vma_merge_modified(void)
* PPPPPPV
*/
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
+ vma->vm_ops = &vm_ops; /* This should have no impact. */
vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags);
vmg.prev = vma_prev;
vmg.vma = vma;
@@ -800,6 +839,7 @@ static bool test_vma_merge_modified(void)
* PPPPPPP
*/
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags);
vmg.prev = vma_prev;
@@ -827,6 +867,7 @@ static bool test_vma_merge_modified(void)
* PPPPPPPPPP
*/
vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+ vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags);
vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags);
--
2.45.2
^ permalink raw reply [flat|nested] 53+ messages in thread* Re: [PATCH 10/10] mm: rework vm_ops->close() handling on VMA merge
2024-08-05 12:13 ` [PATCH 10/10] mm: rework vm_ops->close() handling on VMA merge Lorenzo Stoakes
@ 2024-08-06 13:55 ` Petr Tesařík
2024-08-06 14:08 ` Lorenzo Stoakes
2024-08-09 14:25 ` Vlastimil Babka
1 sibling, 1 reply; 53+ messages in thread
From: Petr Tesařík @ 2024-08-06 13:55 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Mon, 5 Aug 2024 13:13:57 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
> removed in mergeability test") we relaxed the VMA merge rules for VMAs
> possessing a vm_ops->close() hook, permitting this operation in instances
> where we wouldn't delete the VMA as part of the merge operation.
>
> This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix vma_merge()
> case 7 with vma_ops->close") to account for a subtle case that the previous
> commit had not taken into account.
>
> In both instances, we first rely on is_mergeable_vma() to determine whether
> we might be dealing with a VMA that could be removed, taking advantage of
> the fact that a 'previous' VMA will never be deleted, only VMAs that follow
> it.
>
> The second patch corrects the instance where a merge of the previous VMA
> into a subsequent one did not correctly check whether the subsequent VMA
> had a vm_ops->close() handler.
>
> Both changes prevent merge cases that are actually permissible (for
> instance a merge of a VMA into a following VMA with a vm_ops->close(), but
> with no previous VMA, which would result in the next VMA being extended,
> not deleted).
>
> In addition, both changes fail to consider the case where a VMA that would
> otherwise be merged with the previous and next VMA might have
> vm_ops->close(), on the assumption that for this to be the case, all three
> would have to have the same vma->vm_file to be mergeable and thus the same
> vm_ops.
>
> Additionally, both changes operate at 50,000 feet, trying to guess
> whether a VMA will be deleted.
>
> As we have majorly refactored the VMA merge operation and de-duplicated
> code to the point where we know precisely where deletions will occur, this
> patch removes the aforementioned checks altogether and instead explicitly
> checks whether a VMA will be deleted.
>
> In cases where a reduced merge is still possible (where we merge both
> previous and next VMA but the next VMA has a vm_ops->close hook, meaning we
> could just merge the previous and current VMA), we do so, otherwise the
> merge is not permitted.
>
> We take advantage of our userland testing to assert that this functions
> correctly - replacing the previous limited vm_ops->close() tests with tests
> for every single case where we delete a VMA.
>
> We also update all testing for both new and modified VMAs to set
> vma->vm_ops->close() in every single instance where this would not prevent
> the merge, to assert that we never do so.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/vma.c | 69 ++++++++-----
> tools/testing/vma/vma.c | 213 ++++++++++++++++++++++++----------------
> 2 files changed, 173 insertions(+), 109 deletions(-)
>
> diff --git a/mm/vma.c b/mm/vma.c
> index c55ae035f5d6..9c779fc65ba8 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -10,14 +10,6 @@
> static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
> {
> struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
> - /*
> - * If the vma has a ->close operation then the driver probably needs to
> - * release per-vma resources, so we don't attempt to merge those if the
> - * caller indicates the current vma may be removed as part of the merge,
> - * which is the case if we are attempting to merge the next VMA into
> - * this one.
> - */
> - bool may_remove_vma = merge_next;
See my comment on PATCH 02/10. You're removing the local variable here,
so maybe it need not be introduced in the first place?
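For reference, a minimal sketch contrasting the two approaches (both
conditions are taken from hunks in this patch; the juxtaposition is
illustrative):

	/* Old heuristic in is_mergeable_vma(), removed here: refuse the
	 * merge whenever the current vma *might* be deleted. */
	if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
		return false;

	/* New explicit check in vma_merge_modified(), applied only once
	 * we *know* the vma will be deleted. */
	if (merge_will_delete_vma && !can_merge_remove_vma(vma))
		return NULL;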
> if (!mpol_equal(vmg->policy, vma_policy(vma)))
> return false;
> @@ -33,8 +25,6 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
> return false;
> if (vma->vm_file != vmg->file)
> return false;
> - if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
> - return false;
> if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
> return false;
> if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
> @@ -606,6 +596,12 @@ static int commit_merge(struct vma_merge_struct *vmg,
> return 0;
> }
>
> +/* We can only remove VMAs when merging if they do not have a close hook. */
> +static bool can_merge_remove_vma(struct vm_area_struct *vma)
> +{
> + return !vma->vm_ops || !vma->vm_ops->close;
> +}
> +
> /*
> * vma_merge_modified - Attempt to merge VMAs based on a VMA having its
> * attributes modified.
> @@ -710,9 +706,30 @@ static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
>
> /* If we span the entire VMA, a merge implies it will be deleted. */
> merge_will_delete_vma = left_side && right_side;
> - /* If we merge both VMAs, then next is also deleted. */
> +
> + /*
> + * If we need to remove vma in its entirety but are unable to do so,
> + * we have no sensible recourse but to abort the merge.
> + */
> + if (merge_will_delete_vma && !can_merge_remove_vma(vma))
> + return NULL;
> +
> + /*
> + * If we merge both VMAs, then next is also deleted. This implies
> + * merge_will_delete_vma also.
> + */
> merge_will_delete_next = merge_both;
>
> + /*
> + * If we cannot delete next, then we can reduce the operation to merging
> + * prev and vma (thereby deleting vma).
> + */
> + if (merge_will_delete_next && !can_merge_remove_vma(next)) {
> + merge_will_delete_next = false;
> + merge_right = false;
> + merge_both = false;
> + }
> +
> /* No matter what happens, we will be adjusting vma. */
> vma_start_write(vma);
>
> @@ -756,21 +773,12 @@ static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
> vmg->start = prev->vm_start;
> vmg->pgoff = prev->vm_pgoff;
>
> - if (merge_will_delete_vma) {
> - /*
> - * can_vma_merge_after() assumed we would not be
> - * removing vma, so it skipped the check for
> - * vm_ops->close, but we are removing vma.
> - */
> - if (vma->vm_ops && vma->vm_ops->close)
> - err = -EINVAL;
> - } else {
> + if (!merge_will_delete_vma) {
> adjust = vma;
> adj_start = end - vma->vm_start;
> }
>
> - if (!err)
> - err = dup_anon_vma(prev, vma, &anon_dup);
> + err = dup_anon_vma(prev, vma, &anon_dup);
> } else { /* merge_right */
> /*
> * |<----->| OR
> @@ -886,6 +894,8 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> unsigned long end = vmg->end;
> pgoff_t pgoff = vmg->pgoff;
> pgoff_t pglen = PHYS_PFN(end - start);
> + bool merge_next = false;
> + struct anon_vma *anon_vma = vmg->anon_vma;
Calling this "anon_vma" feels a bit too generic. IIUC you want to save
the original vmg->anon_vma in case the VMA turns out to be unmergeable
with the next VMA after vmg->anon_vma has already been modified.
What about calling it "orig_anon_vma"?
Petr T
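As a minimal sketch of the save/restore pattern being proposed here
(orig_anon_vma is the suggested name, not what the patch uses):

	struct anon_vma *orig_anon_vma = vmg->anon_vma;	/* saved on entry */
	...
	vmg->anon_vma = next->anon_vma;	/* speculative: assumes next merges */
	...
	if (merge_next && !can_merge_remove_vma(next))
		vmg->anon_vma = orig_anon_vma;	/* merge reduced: restore */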
>
> VM_WARN_ON(vmg->vma);
>
> @@ -916,8 +926,9 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> vmg->end = next->vm_end;
> vmg->vma = next;
> vmg->pgoff = next->vm_pgoff - pglen;
> -
> vmg->anon_vma = next->anon_vma;
> +
> + merge_next = true;
> }
>
> /* If we can merge with the previous VMA, adjust vmg accordingly. */
> @@ -925,6 +936,16 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> vmg->start = prev->vm_start;
> vmg->vma = prev;
> vmg->pgoff = prev->vm_pgoff;
> +
> + /*
> + * If this merge would result in removal of the next VMA but we
> + * are not permitted to do so, reduce the operation to merging
> + * prev and vma.
> + */
> + if (merge_next && !can_merge_remove_vma(next)) {
> + vmg->end = end;
> + vmg->anon_vma = anon_vma;
> + }
> } else if (prev) {
> vma_iter_next_range(vmg->vmi);
> }
> @@ -978,6 +999,8 @@ int vma_expand(struct vma_merge_struct *vmg)
> int ret;
>
> remove_next = true;
> + /* This should already have been checked by this point. */
> + VM_WARN_ON(!can_merge_remove_vma(next));
> vma_start_write(next);
> ret = dup_anon_vma(vma, next, &anon_dup);
> if (ret)
> diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> index e465dc22e2d0..0c0a6ffcfc98 100644
> --- a/tools/testing/vma/vma.c
> +++ b/tools/testing/vma/vma.c
> @@ -327,6 +327,9 @@ static bool test_vma_merge_new_vma(void)
> struct anon_vma_chain dummy_anon_vma_chain_d = {
> .anon_vma = &dummy_anon_vma,
> };
> + const struct vm_operations_struct vm_ops = {
> + .close = dummy_close,
> + };
> int count;
> struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d;
> bool merged;
> @@ -370,6 +373,7 @@ static bool test_vma_merge_new_vma(void)
> * 0123456789abc
> * AA*B DD CC
> */
> + vma_a->vm_ops = &vm_ops; /* This should have no impact. */
> vma_b->anon_vma = &dummy_anon_vma;
> vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, flags, &merged);
> ASSERT_EQ(vma, vma_a);
> @@ -406,6 +410,7 @@ static bool test_vma_merge_new_vma(void)
> * AAAAA *DD CC
> */
> vma_d->anon_vma = &dummy_anon_vma;
> + vma_d->vm_ops = &vm_ops; /* This should have no impact. */
> vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, flags, &merged);
> ASSERT_EQ(vma, vma_d);
> /* Prepend. */
> @@ -423,6 +428,7 @@ static bool test_vma_merge_new_vma(void)
> * 0123456789abc
> * AAAAA*DDD CC
> */
> + vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */
> vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, flags, &merged);
> ASSERT_EQ(vma, vma_a);
> /* Merge with A, delete D. */
> @@ -573,120 +579,145 @@ static bool test_vma_merge_with_close(void)
> struct vma_merge_struct vmg = {
> .vmi = &vmi,
> };
> - struct vm_operations_struct vm_ops = {};
> - struct vm_area_struct *vma_next =
> - alloc_and_link_vma(&mm, 0x2000, 0x3000, 2, flags);
> - struct vm_area_struct *vma;
> + const struct vm_operations_struct vm_ops = {
> + .close = dummy_close,
> + };
> + struct vm_area_struct *vma_prev, *vma_next, *vma;
>
> /*
> - * When we merge VMAs we sometimes have to delete others as part of the
> - * operation.
> - *
> - * Considering the two possible adjacent VMAs to which a VMA can be
> - * merged:
> - *
> - * [ prev ][ vma ][ next ]
> - *
> - * In no case will we need to delete prev. If the operation is
> - * mergeable, then prev will be extended with one or both of vma and
> - * next deleted.
> - *
> - * As a result, during initial mergeability checks, only
> - * can_vma_merge_before() (which implies the VMA being merged with is
> - * 'next' as shown above) bothers to check to see whether the next VMA
> - * has a vm_ops->close() callback that will need to be called when
> - * removed.
> - *
> - * If it does, then we cannot merge as the resources that the close()
> - * operation potentially clears down are tied only to the existing VMA
> - * range and we have no way of extending those to the nearly merged one.
> - *
> - * We must consider two scenarios:
> - *
> - * A.
> + * When merging VMAs we are not permitted to remove any VMA that has a
> + * vm_ops->close() hook.
> *
> - * vm_ops->close: - - !NULL
> - * [ prev ][ vma ][ next ]
> - *
> - * Where prev may or may not be present/mergeable.
> - *
> - * This is picked up by a specific check in can_vma_merge_before().
> - *
> - * B.
> - *
> - * vm_ops->close: - !NULL
> - * [ prev ][ vma ]
> - *
> - * Where prev and vma are present and mergeable.
> - *
> - * This is picked up by a specific check in vma_merge_modified().
> - *
> - * IMPORTANT NOTE: We make the assumption that the following case:
> + * This is because executing this hook may clear state that is pertinent
> + * to the VMA range as a whole.
> + */
> +
> + /*
> + * The only case of a new VMA merge that results in a VMA being deleted
> + * is one where both the previous and next VMAs are merged - in this
> + * instance the next VMA is deleted, and the previous VMA is extended.
> *
> - * - !NULL NULL
> - * [ prev ][ vma ][ next ]
> + * If we are unable to do so, we reduce the operation to simply
> + * extending the prev VMA and not merging next.
> *
> - * Cannot occur, because vma->vm_ops being the same implies the same
> - * vma->vm_file, and therefore this would mean that next->vm_ops->close
> - * would be set too, and thus scenario A would pick this up.
> + * 0123456789
> + * PPP**NNNN
> + * ->
> + * 0123456789
> + * PPPPPPNNN
> */
>
> - ASSERT_NE(vma_next, NULL);
> + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
> + vma_next->vm_ops = &vm_ops;
> +
> + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
> + ASSERT_EQ(vma_merge_new_vma(&vmg), vma_prev);
> + ASSERT_EQ(vma_prev->vm_start, 0);
> + ASSERT_EQ(vma_prev->vm_end, 0x5000);
> + ASSERT_EQ(vma_prev->vm_pgoff, 0);
> +
> + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
>
> /*
> - * SCENARIO A
> + * When modifying an existing VMA there are further cases where we
> + * delete VMAs.
> + *
> + * <>
> + * 0123456789
> + * PPPVV
> *
> - * 0123
> - * *N
> + * In this instance, if vma has a close hook, the merge simply cannot
> + * proceed.
> */
>
> - /* Make the next VMA have a close() callback. */
> - vm_ops.close = dummy_close;
> - vma_next->vm_ops = (const struct vm_operations_struct *)&vm_ops;
> + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
> + vma->vm_ops = &vm_ops;
>
> - /* Our proposed VMA has characteristics that would otherwise be merged. */
> - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags);
> + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
> + vmg.prev = vma_prev;
> + vmg.vma = vma;
>
> - /* The next VMA having a close() operator should cause the merge to fail.*/
> - ASSERT_EQ(vma_merge_new_vma(&vmg), NULL);
> + ASSERT_EQ(vma_merge_modified(&vmg), NULL);
>
> - /* Now create the VMA so we can merge via modified flags */
> - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags);
> - vma = alloc_and_link_vma(&mm, 0x1000, 0x2000, 1, flags);
> - vmg.vma = vma;
> + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
>
> /*
> - * The VMA being modified in a way that would otherwise merge should
> - * also fail.
> + * This case is mirrored if merging with next.
> + *
> + * <>
> + * 0123456789
> + * VVNNNN
> + *
> + * In this instance, if vma has a close hook, the merge simply cannot
> + * proceed.
> */
> +
> + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
> + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
> + vma->vm_ops = &vm_ops;
> +
> + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
> + vmg.vma = vma;
> +
> ASSERT_EQ(vma_merge_modified(&vmg), NULL);
>
> - /* SCENARIO B
> + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
> +
> + /*
> + * Finally, we consider two variants of the case where we modify a VMA
> + * to merge with both the previous and next VMAs.
> *
> - * 0123
> - * P*
> + * The first variant is where vma has a close hook. In this instance, no
> + * merge can proceed.
> *
> - * In order for this scenario to trigger, the VMA currently being
> - * modified must also have a .close().
> + * <>
> + * 0123456789
> + * PPPVVNNNN
> */
>
> - /* Reset VMG state. */
> - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags);
> + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
> + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
> + vma->vm_ops = &vm_ops;
> +
> + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
> + vmg.prev = vma_prev;
> + vmg.vma = vma;
> +
> + ASSERT_EQ(vma_merge_modified(&vmg), NULL);
> +
> + ASSERT_EQ(cleanup_mm(&mm, &vmi), 3);
> +
> /*
> - * Make next unmergeable, and don't let the scenario A check pick this
> - * up, we want to reproduce scenario B only.
> + * The second variant is where next has a close hook. In this instance,
> + * we reduce the operation to a merge between prev and vma.
> + *
> + * <>
> + * 0123456789
> + * PPPVVNNNN
> + * ->
> + * 0123456789
> + * PPPPPNNNN
> */
> - vma_next->vm_ops = NULL;
> - vma_next->__vm_flags &= ~VM_MAYWRITE;
> - /* Allocate prev. */
> - vmg.prev = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags);
> - /* Assign a vm_ops->close() function to VMA explicitly. */
> - vma->vm_ops = (const struct vm_operations_struct *)&vm_ops;
> +
> + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
> + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
> + vma_next->vm_ops = &vm_ops;
> +
> + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
> + vmg.prev = vma_prev;
> vmg.vma = vma;
> - /* Make sure merge does not occur. */
> - ASSERT_EQ(vma_merge_modified(&vmg), NULL);
>
> - cleanup_mm(&mm, &vmi);
> + ASSERT_EQ(vma_merge_modified(&vmg), vma_prev);
> + ASSERT_EQ(vma_prev->vm_start, 0);
> + ASSERT_EQ(vma_prev->vm_end, 0x5000);
> + ASSERT_EQ(vma_prev->vm_pgoff, 0);
> +
> + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
> +
> return true;
> }
>
> @@ -699,6 +730,9 @@ static bool test_vma_merge_modified(void)
> struct vma_merge_struct vmg = {
> .vmi = &vmi,
> };
> + const struct vm_operations_struct vm_ops = {
> + .close = dummy_close,
> + };
>
> /*
> * Merge right case - partial span.
> @@ -711,7 +745,9 @@ static bool test_vma_merge_modified(void)
> * VNNNNNN
> */
> vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags);
> + vma->vm_ops = &vm_ops; /* This should have no impact. */
> vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags);
> + vma_next->vm_ops = &vm_ops; /* This should have no impact. */
> vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags);
> vmg.vma = vma;
> vmg.prev = vma;
> @@ -743,6 +779,7 @@ static bool test_vma_merge_modified(void)
> */
> vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags);
> vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags);
> + vma_next->vm_ops = &vm_ops; /* This should have no impact. */
> vmg_set_range(&vmg, 0x2000, 0x6000, 2, flags);
> vmg.vma = vma;
> vma->anon_vma = &dummy_anon_vma;
> @@ -768,7 +805,9 @@ static bool test_vma_merge_modified(void)
> * PPPPPPV
> */
> vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
> vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
> + vma->vm_ops = &vm_ops; /* This should have no impact. */
> vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags);
> vmg.prev = vma_prev;
> vmg.vma = vma;
> @@ -800,6 +839,7 @@ static bool test_vma_merge_modified(void)
> * PPPPPPP
> */
> vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
> vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
> vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags);
> vmg.prev = vma_prev;
> @@ -827,6 +867,7 @@ static bool test_vma_merge_modified(void)
> * PPPPPPPPPP
> */
> vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
> vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
> vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags);
> vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags);
> --
> 2.45.2
>
^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 10/10] mm: rework vm_ops->close() handling on VMA merge
2024-08-06 13:55 ` Petr Tesařík
@ 2024-08-06 14:08 ` Lorenzo Stoakes
2024-08-06 14:21 ` Petr Tesařík
0 siblings, 1 reply; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-06 14:08 UTC (permalink / raw)
To: Petr Tesařík
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, Aug 06, 2024 at 03:55:55PM GMT, Petr Tesařík wrote:
> On Mon, 5 Aug 2024 13:13:57 +0100
> Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>
> > In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
> > removed in mergeability test") we relaxed the VMA merge rules for VMAs
> > possessing a vm_ops->close() hook, permitting this operation in instances
> > where we wouldn't delete the VMA as part of the merge operation.
> >
> > This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix vma_merge()
> > case 7 with vma_ops->close") to account for a subtle case that the previous
> > commit had not taken into account.
> >
> > In both instances, we first rely on is_mergeable_vma() to determine whether
> > we might be dealing with a VMA that might be removed, taking advantage of
> > the fact that a 'previous' VMA will never be deleted, only VMAs that follow
> > it.
> >
> > The second patch corrects the instance where a merge of the previous VMA
> > into a subsequent one did not correctly check whether the subsequent VMA
> > had a vm_ops->close() handler.
> >
> > Both changes prevent merge cases that are actually permissible (for
> > instance a merge of a VMA into a following VMA with a vm_ops->close(), but
> > with no previous VMA, which would result in the next VMA being extended,
> > not deleted).
> >
> > In addition, both changes fail to consider the case where a VMA that would
> > otherwise be merged with the previous and next VMA might have
> > vm_ops->close(), on the assumption that for this to be the case, all three
> > would have to have the same vma->vm_file to be mergeable and thus the same
> > vm_ops.
> >
> > And in addition both changes operate at 50,000 feet, trying to guess
> > whether a VMA will be deleted.
> >
> > As we have majorly refactored the VMA merge operation and de-duplicated
> > code to the point where we know precisely where deletions will occur, this
> > patch removes the aforementioned checks altogether and instead explicitly
> > checks whether a VMA will be deleted.
> >
> > In cases where a reduced merge is still possible (where we merge both
> > previous and next VMA but the next VMA has a vm_ops->close hook, meaning we
> > could just merge the previous and current VMA), we do so, otherwise the
> > merge is not permitted.
> >
> > We take advantage of our userland testing to assert that this functions
> > correctly - replacing the previous limited vm_ops->close() tests with tests
> > for every single case where we delete a VMA.
> >
> > We also update all testing for both new and modified VMAs to set
> > vma->vm_ops->close() in every single instance where this would not prevent
> > the merge, to assert that we never do so.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > mm/vma.c | 69 ++++++++-----
> > tools/testing/vma/vma.c | 213 ++++++++++++++++++++++++----------------
> > 2 files changed, 173 insertions(+), 109 deletions(-)
> >
> > diff --git a/mm/vma.c b/mm/vma.c
> > index c55ae035f5d6..9c779fc65ba8 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -10,14 +10,6 @@
> > static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
> > {
> > struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
> > - /*
> > - * If the vma has a ->close operation then the driver probably needs to
> > - * release per-vma resources, so we don't attempt to merge those if the
> > - * caller indicates the current vma may be removed as part of the merge,
> > - * which is the case if we are attempting to merge the next VMA into
> > - * this one.
> > - */
> > - bool may_remove_vma = merge_next;
>
> See my comment on PATCH 02/10. You're removing the local variable here,
> so maybe it need not be introduced in the first place?
>
> > if (!mpol_equal(vmg->policy, vma_policy(vma)))
> > return false;
> > @@ -33,8 +25,6 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
> > return false;
> > if (vma->vm_file != vmg->file)
> > return false;
> > - if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
> > - return false;
> > if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
> > return false;
> > if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
> > @@ -606,6 +596,12 @@ static int commit_merge(struct vma_merge_struct *vmg,
> > return 0;
> > }
> >
> > +/* We can only remove VMAs when merging if they do not have a close hook. */
> > +static bool can_merge_remove_vma(struct vm_area_struct *vma)
> > +{
> > + return !vma->vm_ops || !vma->vm_ops->close;
> > +}
> > +
> > /*
> > * vma_merge_modified - Attempt to merge VMAs based on a VMA having its
> > * attributes modified.
> > @@ -710,9 +706,30 @@ static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
> >
> > /* If we span the entire VMA, a merge implies it will be deleted. */
> > merge_will_delete_vma = left_side && right_side;
> > - /* If we merge both VMAs, then next is also deleted. */
> > +
> > + /*
> > + * If we need to remove vma in its entirety but are unable to do so,
> > + * we have no sensible recourse but to abort the merge.
> > + */
> > + if (merge_will_delete_vma && !can_merge_remove_vma(vma))
> > + return NULL;
> > +
> > + /*
> > + * If we merge both VMAs, then next is also deleted. This implies
> > + * merge_will_delete_vma also.
> > + */
> > merge_will_delete_next = merge_both;
> >
> > + /*
> > + * If we cannot delete next, then we can reduce the operation to merging
> > + * prev and vma (thereby deleting vma).
> > + */
> > + if (merge_will_delete_next && !can_merge_remove_vma(next)) {
> > + merge_will_delete_next = false;
> > + merge_right = false;
> > + merge_both = false;
> > + }
> > +
> > /* No matter what happens, we will be adjusting vma. */
> > vma_start_write(vma);
> >
> > @@ -756,21 +773,12 @@ static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
> > vmg->start = prev->vm_start;
> > vmg->pgoff = prev->vm_pgoff;
> >
> > - if (merge_will_delete_vma) {
> > - /*
> > - * can_vma_merge_after() assumed we would not be
> > - * removing vma, so it skipped the check for
> > - * vm_ops->close, but we are removing vma.
> > - */
> > - if (vma->vm_ops && vma->vm_ops->close)
> > - err = -EINVAL;
> > - } else {
> > + if (!merge_will_delete_vma) {
> > adjust = vma;
> > adj_start = end - vma->vm_start;
> > }
> >
> > - if (!err)
> > - err = dup_anon_vma(prev, vma, &anon_dup);
> > + err = dup_anon_vma(prev, vma, &anon_dup);
> > } else { /* merge_right */
> > /*
> > * |<----->| OR
> > @@ -886,6 +894,8 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > unsigned long end = vmg->end;
> > pgoff_t pgoff = vmg->pgoff;
> > pgoff_t pglen = PHYS_PFN(end - start);
> > + bool merge_next = false;
> > + struct anon_vma *anon_vma = vmg->anon_vma;
>
> Calling this "anon_vma" feels a bit too generic. IIUC you want to save
> the original vmg->anon_vma in case the VMA turns out to be ummergeable
> with the next VMA after vmg->anon_vma has already been modified.
>
> What about calling it "orig_anon_vma"?
I disagree, that'd be unnecessary noise (and this is applicable to _all_
the fields).
Again we come to some trade-off between readability and inherent
complexity. I am not a fan of making variable names unnecessarily
overwrought.
In this case it's just a short-hand: in the only instance where we'd retry
the operation, anon_vma would be NULL (from mmap_region()), so we reset it
to NULL; strictly, however, we should reset it to the saved anon_vma.
I'll change that on the next respin just to be strict.
>
> Petr T
>
> >
> > VM_WARN_ON(vmg->vma);
> >
> > @@ -916,8 +926,9 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > vmg->end = next->vm_end;
> > vmg->vma = next;
> > vmg->pgoff = next->vm_pgoff - pglen;
> > -
> > vmg->anon_vma = next->anon_vma;
> > +
> > + merge_next = true;
> > }
> >
> > /* If we can merge with the previous VMA, adjust vmg accordingly. */
> > @@ -925,6 +936,16 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > vmg->start = prev->vm_start;
> > vmg->vma = prev;
> > vmg->pgoff = prev->vm_pgoff;
> > +
> > + /*
> > + * If this merge would result in removal of the next VMA but we
> > + * are not permitted to do so, reduce the operation to merging
> > + * prev and vma.
> > + */
> > + if (merge_next && !can_merge_remove_vma(next)) {
> > + vmg->end = end;
> > + vmg->anon_vma = anon_vma;
> > + }
> > } else if (prev) {
> > vma_iter_next_range(vmg->vmi);
> > }
> > @@ -978,6 +999,8 @@ int vma_expand(struct vma_merge_struct *vmg)
> > int ret;
> >
> > remove_next = true;
> > + /* This should already have been checked by this point. */
> > + VM_WARN_ON(!can_merge_remove_vma(next));
> > vma_start_write(next);
> > ret = dup_anon_vma(vma, next, &anon_dup);
> > if (ret)
> > diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> > index e465dc22e2d0..0c0a6ffcfc98 100644
> > --- a/tools/testing/vma/vma.c
> > +++ b/tools/testing/vma/vma.c
> > @@ -327,6 +327,9 @@ static bool test_vma_merge_new_vma(void)
> > struct anon_vma_chain dummy_anon_vma_chain_d = {
> > .anon_vma = &dummy_anon_vma,
> > };
> > + const struct vm_operations_struct vm_ops = {
> > + .close = dummy_close,
> > + };
> > int count;
> > struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d;
> > bool merged;
> > @@ -370,6 +373,7 @@ static bool test_vma_merge_new_vma(void)
> > * 0123456789abc
> > * AA*B DD CC
> > */
> > + vma_a->vm_ops = &vm_ops; /* This should have no impact. */
> > vma_b->anon_vma = &dummy_anon_vma;
> > vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, flags, &merged);
> > ASSERT_EQ(vma, vma_a);
> > @@ -406,6 +410,7 @@ static bool test_vma_merge_new_vma(void)
> > * AAAAA *DD CC
> > */
> > vma_d->anon_vma = &dummy_anon_vma;
> > + vma_d->vm_ops = &vm_ops; /* This should have no impact. */
> > vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, flags, &merged);
> > ASSERT_EQ(vma, vma_d);
> > /* Prepend. */
> > @@ -423,6 +428,7 @@ static bool test_vma_merge_new_vma(void)
> > * 0123456789abc
> > * AAAAA*DDD CC
> > */
> > + vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */
> > vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, flags, &merged);
> > ASSERT_EQ(vma, vma_a);
> > /* Merge with A, delete D. */
> > @@ -573,120 +579,145 @@ static bool test_vma_merge_with_close(void)
> > struct vma_merge_struct vmg = {
> > .vmi = &vmi,
> > };
> > - struct vm_operations_struct vm_ops = {};
> > - struct vm_area_struct *vma_next =
> > - alloc_and_link_vma(&mm, 0x2000, 0x3000, 2, flags);
> > - struct vm_area_struct *vma;
> > + const struct vm_operations_struct vm_ops = {
> > + .close = dummy_close,
> > + };
> > + struct vm_area_struct *vma_prev, *vma_next, *vma;
> >
> > /*
> > - * When we merge VMAs we sometimes have to delete others as part of the
> > - * operation.
> > - *
> > - * Considering the two possible adjacent VMAs to which a VMA can be
> > - * merged:
> > - *
> > - * [ prev ][ vma ][ next ]
> > - *
> > - * In no case will we need to delete prev. If the operation is
> > - * mergeable, then prev will be extended with one or both of vma and
> > - * next deleted.
> > - *
> > - * As a result, during initial mergeability checks, only
> > - * can_vma_merge_before() (which implies the VMA being merged with is
> > - * 'next' as shown above) bothers to check to see whether the next VMA
> > - * has a vm_ops->close() callback that will need to be called when
> > - * removed.
> > - *
> > - * If it does, then we cannot merge as the resources that the close()
> > - * operation potentially clears down are tied only to the existing VMA
> > - * range and we have no way of extending those to the nearly merged one.
> > - *
> > - * We must consider two scenarios:
> > - *
> > - * A.
> > + * When merging VMAs we are not permitted to remove any VMA that has a
> > + * vm_ops->close() hook.
> > *
> > - * vm_ops->close: - - !NULL
> > - * [ prev ][ vma ][ next ]
> > - *
> > - * Where prev may or may not be present/mergeable.
> > - *
> > - * This is picked up by a specific check in can_vma_merge_before().
> > - *
> > - * B.
> > - *
> > - * vm_ops->close: - !NULL
> > - * [ prev ][ vma ]
> > - *
> > - * Where prev and vma are present and mergeable.
> > - *
> > - * This is picked up by a specific check in vma_merge_modified().
> > - *
> > - * IMPORTANT NOTE: We make the assumption that the following case:
> > + * This is because executing this hook may clear state that is pertinent
> > + * to the VMA range as a whole.
> > + */
> > +
> > + /*
> > + * The only case of a new VMA merge that results in a VMA being deleted
> > + * is one where both the previous and next VMAs are merged - in this
> > + * instance the next VMA is deleted, and the previous VMA is extended.
> > *
> > - * - !NULL NULL
> > - * [ prev ][ vma ][ next ]
> > + * If we are unable to do so, we reduce the operation to simply
> > + * extending the prev VMA and not merging next.
> > *
> > - * Cannot occur, because vma->vm_ops being the same implies the same
> > - * vma->vm_file, and therefore this would mean that next->vm_ops->close
> > - * would be set too, and thus scenario A would pick this up.
> > + * 0123456789
> > + * PPP**NNNN
> > + * ->
> > + * 0123456789
> > + * PPPPPPNNN
> > */
> >
> > - ASSERT_NE(vma_next, NULL);
> > + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> > + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
> > + vma_next->vm_ops = &vm_ops;
> > +
> > + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
> > + ASSERT_EQ(vma_merge_new_vma(&vmg), vma_prev);
> > + ASSERT_EQ(vma_prev->vm_start, 0);
> > + ASSERT_EQ(vma_prev->vm_end, 0x5000);
> > + ASSERT_EQ(vma_prev->vm_pgoff, 0);
> > +
> > + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
> >
> > /*
> > - * SCENARIO A
> > + * When modifying an existing VMA there are further cases where we
> > + * delete VMAs.
> > + *
> > + * <>
> > + * 0123456789
> > + * PPPVV
> > *
> > - * 0123
> > - * *N
> > + * In this instance, if vma has a close hook, the merge simply cannot
> > + * proceed.
> > */
> >
> > - /* Make the next VMA have a close() callback. */
> > - vm_ops.close = dummy_close;
> > - vma_next->vm_ops = (const struct vm_operations_struct *)&vm_ops;
> > + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> > + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
> > + vma->vm_ops = &vm_ops;
> >
> > - /* Our proposed VMA has characteristics that would otherwise be merged. */
> > - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags);
> > + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
> > + vmg.prev = vma_prev;
> > + vmg.vma = vma;
> >
> > - /* The next VMA having a close() operator should cause the merge to fail.*/
> > - ASSERT_EQ(vma_merge_new_vma(&vmg), NULL);
> > + ASSERT_EQ(vma_merge_modified(&vmg), NULL);
> >
> > - /* Now create the VMA so we can merge via modified flags */
> > - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags);
> > - vma = alloc_and_link_vma(&mm, 0x1000, 0x2000, 1, flags);
> > - vmg.vma = vma;
> > + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
> >
> > /*
> > - * The VMA being modified in a way that would otherwise merge should
> > - * also fail.
> > + * This case is mirrored if merging with next.
> > + *
> > + * <>
> > + * 0123456789
> > + * VVNNNN
> > + *
> > + * In this instance, if vma has a close hook, the merge simply cannot
> > + * proceed.
> > */
> > +
> > + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
> > + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
> > + vma->vm_ops = &vm_ops;
> > +
> > + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
> > + vmg.vma = vma;
> > +
> > ASSERT_EQ(vma_merge_modified(&vmg), NULL);
> >
> > - /* SCENARIO B
> > + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
> > +
> > + /*
> > + * Finally, we consider two variants of the case where we modify a VMA
> > + * to merge with both the previous and next VMAs.
> > *
> > - * 0123
> > - * P*
> > + * The first variant is where vma has a close hook. In this instance, no
> > + * merge can proceed.
> > *
> > - * In order for this scenario to trigger, the VMA currently being
> > - * modified must also have a .close().
> > + * <>
> > + * 0123456789
> > + * PPPVVNNNN
> > */
> >
> > - /* Reset VMG state. */
> > - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags);
> > + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> > + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
> > + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
> > + vma->vm_ops = &vm_ops;
> > +
> > + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
> > + vmg.prev = vma_prev;
> > + vmg.vma = vma;
> > +
> > + ASSERT_EQ(vma_merge_modified(&vmg), NULL);
> > +
> > + ASSERT_EQ(cleanup_mm(&mm, &vmi), 3);
> > +
> > /*
> > - * Make next unmergeable, and don't let the scenario A check pick this
> > - * up, we want to reproduce scenario B only.
> > + * The second variant is where next has a close hook. In this instance,
> > + * we reduce the operation to a merge between prev and vma.
> > + *
> > + * <>
> > + * 0123456789
> > + * PPPVVNNNN
> > + * ->
> > + * 0123456789
> > + * PPPPPNNNN
> > */
> > - vma_next->vm_ops = NULL;
> > - vma_next->__vm_flags &= ~VM_MAYWRITE;
> > - /* Allocate prev. */
> > - vmg.prev = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags);
> > - /* Assign a vm_ops->close() function to VMA explicitly. */
> > - vma->vm_ops = (const struct vm_operations_struct *)&vm_ops;
> > +
> > + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> > + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
> > + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
> > + vma_next->vm_ops = &vm_ops;
> > +
> > + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
> > + vmg.prev = vma_prev;
> > vmg.vma = vma;
> > - /* Make sure merge does not occur. */
> > - ASSERT_EQ(vma_merge_modified(&vmg), NULL);
> >
> > - cleanup_mm(&mm, &vmi);
> > + ASSERT_EQ(vma_merge_modified(&vmg), vma_prev);
> > + ASSERT_EQ(vma_prev->vm_start, 0);
> > + ASSERT_EQ(vma_prev->vm_end, 0x5000);
> > + ASSERT_EQ(vma_prev->vm_pgoff, 0);
> > +
> > + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2);
> > +
> > return true;
> > }
> >
> > @@ -699,6 +730,9 @@ static bool test_vma_merge_modified(void)
> > struct vma_merge_struct vmg = {
> > .vmi = &vmi,
> > };
> > + const struct vm_operations_struct vm_ops = {
> > + .close = dummy_close,
> > + };
> >
> > /*
> > * Merge right case - partial span.
> > @@ -711,7 +745,9 @@ static bool test_vma_merge_modified(void)
> > * VNNNNNN
> > */
> > vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags);
> > + vma->vm_ops = &vm_ops; /* This should have no impact. */
> > vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags);
> > + vma_next->vm_ops = &vm_ops; /* This should have no impact. */
> > vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags);
> > vmg.vma = vma;
> > vmg.prev = vma;
> > @@ -743,6 +779,7 @@ static bool test_vma_merge_modified(void)
> > */
> > vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags);
> > vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags);
> > + vma_next->vm_ops = &vm_ops; /* This should have no impact. */
> > vmg_set_range(&vmg, 0x2000, 0x6000, 2, flags);
> > vmg.vma = vma;
> > vma->anon_vma = &dummy_anon_vma;
> > @@ -768,7 +805,9 @@ static bool test_vma_merge_modified(void)
> > * PPPPPPV
> > */
> > vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> > + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
> > vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
> > + vma->vm_ops = &vm_ops; /* This should have no impact. */
> > vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags);
> > vmg.prev = vma_prev;
> > vmg.vma = vma;
> > @@ -800,6 +839,7 @@ static bool test_vma_merge_modified(void)
> > * PPPPPPP
> > */
> > vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> > + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
> > vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
> > vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags);
> > vmg.prev = vma_prev;
> > @@ -827,6 +867,7 @@ static bool test_vma_merge_modified(void)
> > * PPPPPPPPPP
> > */
> > vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
> > + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
> > vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
> > vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags);
> > vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags);
> > --
> > 2.45.2
> >
>
>
^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 10/10] mm: rework vm_ops->close() handling on VMA merge
2024-08-06 14:08 ` Lorenzo Stoakes
@ 2024-08-06 14:21 ` Petr Tesařík
2024-08-06 14:42 ` Lorenzo Stoakes
0 siblings, 1 reply; 53+ messages in thread
From: Petr Tesařík @ 2024-08-06 14:21 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, 6 Aug 2024 15:08:33 +0100
Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> On Tue, Aug 06, 2024 at 03:55:55PM GMT, Petr Tesařík wrote:
> > On Mon, 5 Aug 2024 13:13:57 +0100
> > Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>[...]
> > > @@ -886,6 +894,8 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > > unsigned long end = vmg->end;
> > > pgoff_t pgoff = vmg->pgoff;
> > > pgoff_t pglen = PHYS_PFN(end - start);
> > > + bool merge_next = false;
> > > + struct anon_vma *anon_vma = vmg->anon_vma;
> >
> > Calling this "anon_vma" feels a bit too generic. IIUC you want to save
> > the original vmg->anon_vma in case the VMA turns out to be unmergeable
> > with the next VMA after vmg->anon_vma has already been modified.
> >
> > What about calling it "orig_anon_vma"?
>
> I disagree, that'd be unnecessary noise (and this is applicable to _all_
> the fields).
I'm afraid I don't understand what you mean by _all_ fields. FWIW my
comment concerns a local variable called "anon_vma", not a struct
member called "anon_vma".
>
> Again we come to some trade-off between readability and inherent
> complexity. I am not a fan of making variable names unnecessarily
> overwrought.
Then call it "a". ;-)
See additional comments below:
>
> In this case it's just a short-hand: in the only instance where we'd retry
> the operation, anon_vma would be NULL (from mmap_region()), so we reset it
> to NULL; strictly, however, we should reset it to the saved anon_vma.
>
> I'll change that on the next respin just to be strict.
>
> >
> > Petr T
> >
> > >
> > > VM_WARN_ON(vmg->vma);
> > >
> > > @@ -916,8 +926,9 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > > vmg->end = next->vm_end;
> > > vmg->vma = next;
> > > vmg->pgoff = next->vm_pgoff - pglen;
> > > -
> > > vmg->anon_vma = next->anon_vma;
Here, vmg->anon_vma is modified. The original value is lost.
> > > +
> > > + merge_next = true;
> > > }
> > >
> > > /* If we can merge with the previous VMA, adjust vmg accordingly. */
> > > @@ -925,6 +936,16 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > > vmg->start = prev->vm_start;
> > > vmg->vma = prev;
> > > vmg->pgoff = prev->vm_pgoff;
> > > +
> > > + /*
> > > + * If this merge would result in removal of the next VMA but we
> > > + * are not permitted to do so, reduce the operation to merging
> > > + * prev and vma.
> > > + */
> > > + if (merge_next && !can_merge_remove_vma(next)) {
> > > + vmg->end = end;
> > > + vmg->anon_vma = anon_vma;
But here you need to restore the original value of vmg->anon_vma.
Isn't this why you introduced the local variable "anon_vma"? I believe
it would be easier to understand its purpose if it includes the "orig_"
prefix.
Just my two eurocents.
Petr T
^ permalink raw reply [flat|nested] 53+ messages in thread* Re: [PATCH 10/10] mm: rework vm_ops->close() handling on VMA merge
2024-08-06 14:21 ` Petr Tesařík
@ 2024-08-06 14:42 ` Lorenzo Stoakes
0 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-06 14:42 UTC (permalink / raw)
To: Petr Tesařík
Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett, Vlastimil Babka
On Tue, Aug 06, 2024 at 04:21:49PM GMT, Petr Tesařík wrote:
> On Tue, 6 Aug 2024 15:08:33 +0100
> Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>
> > On Tue, Aug 06, 2024 at 03:55:55PM GMT, Petr Tesařík wrote:
> > > On Mon, 5 Aug 2024 13:13:57 +0100
> > > Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> >[...]
> > > > @@ -886,6 +894,8 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > > > unsigned long end = vmg->end;
> > > > pgoff_t pgoff = vmg->pgoff;
> > > > pgoff_t pglen = PHYS_PFN(end - start);
> > > > + bool merge_next = false;
> > > > + struct anon_vma *anon_vma = vmg->anon_vma;
> > >
> > > Calling this "anon_vma" feels a bit too generic. IIUC you want to save
> > > the original vmg->anon_vma in case the VMA turns out to be unmergeable
> > > with the next VMA after vmg->anon_vma has already been modified.
> > >
> > > What about calling it "orig_anon_vma"?
> >
> > I disagree, that'd be unnecessary noise (and this is applicable to _all_
> > the fields).
>
> I'm afraid I don't understand what you mean by _all_ fields. FWIW my
> comment concerns a local variable called "anon_vma", not a struct
> member called "anon_vma".
At the risk of sounding a little rude, it'd be courteous to take the time
to read through and understand the function before reviewing, especially
when doing a drive-by.
We use other fields like start, end, pgoff in a similar way to reset things
if expansion fails.
>
> >
> > Again we come to some trade-off between readability and inherent
> > complexity. I am not a fan of making variable names unnecessarily
> > overwrought.
>
> Then call it "a". ;-)
I don't find these kind of sarcastic comments hugely helpful.
>
> See additional comments below:
>
> >
> > In this case it's just a short-hand: in the only instance where we'd retry
> > the operation, anon_vma would be NULL (from mmap_region()), so we reset it
> > to NULL; strictly, however, we should reset it to the saved anon_vma.
> >
> > I'll change that on the next respin just to be strict.
> >
> > >
> > > Petr T
> > >
> > > >
> > > > VM_WARN_ON(vmg->vma);
> > > >
> > > > @@ -916,8 +926,9 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > > > vmg->end = next->vm_end;
> > > > vmg->vma = next;
> > > > vmg->pgoff = next->vm_pgoff - pglen;
> > > > -
> > > > vmg->anon_vma = next->anon_vma;
>
> Here, vmg->anon_vma is modified. The original value is lost.
Yes, that's intentional, and a product of how anon_vma objects
function. This may be worth a comment actually.
By this point, is_mergeable_anon_vma() would have been checked between the
VMAs, so either they'd be identical, or (due to the intricacies of
anon_vma) it'd be permitted to overwrite the new VMA's anon_vma.
This is fiddly so I'll add a comment on respin.
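Roughly, the check referred to here looks like the following sketch
(paraphrased from is_mergeable_anon_vma() in mm/mmap.c, details elided):

	/* Mergeable if the anon_vmas are identical, or if one side has
	 * none and reuse is not complicated by a cloned chain. */
	if ((!anon_vma1 || !anon_vma2) &&
	    (!vma || list_is_singular(&vma->anon_vma_chain)))
		return true;
	return anon_vma1 == anon_vma2;

So once the check has passed, overwriting vmg->anon_vma with
next->anon_vma either changes nothing or is explicitly permitted.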
>
> > > > +
> > > > + merge_next = true;
> > > > }
> > > >
> > > > /* If we can merge with the previous VMA, adjust vmg accordingly. */
> > > > @@ -925,6 +936,16 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > > > vmg->start = prev->vm_start;
> > > > vmg->vma = prev;
> > > > vmg->pgoff = prev->vm_pgoff;
> > > > +
> > > > + /*
> > > > + * If this merge would result in removal of the next VMA but we
> > > > + * are not permitted to do so, reduce the operation to merging
> > > > + * prev and vma.
> > > > + */
> > > > + if (merge_next && !can_merge_remove_vma(next)) {
> > > > + vmg->end = end;
> > > > + vmg->anon_vma = anon_vma;
>
> But here you need to restore the original value of vmg->anon_vma.
>
> Isn't this why you introduced the local variable "anon_vma"? I believe
> it would be easier to understand its purpose if it includes the "orig_"
> prefix.
I think at some point in a review, when a bikeshed point is simply being
repeated, a reviewee just has to say 'sorry, no'.
So, sorry, no.
I don't mean to be rude, but I just don't think it's productive to go in a
loop.
>
> Just my two eurocents.
>
> Petr T
^ permalink raw reply [flat|nested] 53+ messages in thread
* Re: [PATCH 10/10] mm: rework vm_ops->close() handling on VMA merge
2024-08-05 12:13 ` [PATCH 10/10] mm: rework vm_ops->close() handling on VMA merge Lorenzo Stoakes
2024-08-06 13:55 ` Petr Tesařík
@ 2024-08-09 14:25 ` Vlastimil Babka
2024-08-09 14:37 ` Lorenzo Stoakes
1 sibling, 1 reply; 53+ messages in thread
From: Vlastimil Babka @ 2024-08-09 14:25 UTC (permalink / raw)
To: Lorenzo Stoakes, linux-mm, linux-kernel, Andrew Morton; +Cc: Liam R . Howlett
On 8/5/24 14:13, Lorenzo Stoakes wrote:
> In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
> removed in mergeability test") we relaxed the VMA merge rules for VMAs
> possessing a vm_ops->close() hook, permitting this operation in instances
> where we wouldn't delete the VMA as part of the merge operation.
>
> This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix vma_merge()
> case 7 with vma_ops->close") to account for a subtle case that the previous
> commit had not taken into account.
>
> In both instances, we first rely on is_mergeable_vma() to determine whether
> we might be dealing with a VMA that might be removed, taking advantage of
> the fact that a 'previous' VMA will never be deleted, only VMAs that follow
> it.
>
> The second patch corrects the instance where a merge of the previous VMA
> into a subsequent one did not correctly check whether the subsequent VMA
> had a vm_ops->close() handler.
>
> Both changes prevent merge cases that are actually permissible (for
> instance a merge of a VMA into a following VMA with a vm_ops->close(), but
> with no previous VMA, which would result in the next VMA being extended,
> not deleted).
>
> In addition, both changes fail to consider the case where a VMA that would
> otherwise be merged with the previous and next VMA might have
> vm_ops->close(), on the assumption that for this to be the case, all three
> would have to have the same vma->vm_file to be mergeable and thus the same
> vm_ops.
>
> And in addition both changes operate at 50,000 feet, trying to guess
> whether a VMA will be deleted.
>
> As we have majorly refactored the VMA merge operation and de-duplicated
> code to the point where we know precisely where deletions will occur, this
> patch removes the aforementioned checks altogether and instead explicitly
> checks whether a VMA will be deleted.
>
> In cases where a reduced merge is still possible (where we merge both
> previous and next VMA but the next VMA has a vm_ops->close hook, meaning we
> could just merge the previous and current VMA), we do so, otherwise the
> merge is not permitted.
>
> We take advantage of our userland testing to assert that this functions
> correctly - replacing the previous limited vm_ops->close() tests with tests
> for every single case where we delete a VMA.
>
> We also update all testing for both new and modified VMAs to set
> vma->vm_ops->close() in every single instance where this would not prevent
> the merge, to assert that we never do so.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Amazing!
Acked-by: Vlastimil Babka <vbabka@suse.cz>
> @@ -710,9 +706,30 @@ static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
>
> /* If we span the entire VMA, a merge implies it will be deleted. */
> merge_will_delete_vma = left_side && right_side;
> - /* If we merge both VMAs, then next is also deleted. */
Nit: This comment ...
> +
> + /*
> + * If we need to remove vma in its entirety but are unable to do so,
> + * we have no sensible recourse but to abort the merge.
> + */
> + if (merge_will_delete_vma && !can_merge_remove_vma(vma))
> + return NULL;
> +
> + /*
> + * If we merge both VMAs, then next is also deleted. This implies
> + * merge_will_delete_vma also.
> + */
... changed to this comment. Seems spurious, could have been like that
before already? I don't see how the new "This implies" part became relevant
now? We already tested merge_will_delete_vma above.
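For reference, a sketch of the implication itself, assuming (as the new
comment asserts) that merge_both can only be set when both left_side and
right_side hold:

	merge_will_delete_vma = left_side && right_side;
	/* merge_both requires left_side && right_side, so setting
	 * merge_will_delete_next below implies merge_will_delete_vma: */
	merge_will_delete_next = merge_both;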
> merge_will_delete_next = merge_both;
>
> + /*
> + * If we cannot delete next, then we can reduce the operation to merging
> + * prev and vma (thereby deleting vma).
> + */
> + if (merge_will_delete_next && !can_merge_remove_vma(next)) {
> + merge_will_delete_next = false;
> + merge_right = false;
> + merge_both = false;
> + }
> +
> /* No matter what happens, we will be adjusting vma. */
> vma_start_write(vma);
>
^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 10/10] mm: rework vm_ops->close() handling on VMA merge
2024-08-09 14:25 ` Vlastimil Babka
@ 2024-08-09 14:37 ` Lorenzo Stoakes
0 siblings, 0 replies; 53+ messages in thread
From: Lorenzo Stoakes @ 2024-08-09 14:37 UTC (permalink / raw)
To: Vlastimil Babka; +Cc: linux-mm, linux-kernel, Andrew Morton, Liam R . Howlett
On Fri, Aug 09, 2024 at 04:25:53PM GMT, Vlastimil Babka wrote:
> On 8/5/24 14:13, Lorenzo Stoakes wrote:
> > In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be
> > removed in mergeability test") we relaxed the VMA merge rules for VMAs
> > possessing a vm_ops->close() hook, permitting this operation in instances
> > where we wouldn't delete the VMA as part of the merge operation.
> >
> > This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix vma_merge()
> > case 7 with vma_ops->close") to account for a subtle case that the previous
> > commit had not taken into account.
> >
> > In both instances, we first rely on is_mergeable_vma() to determine whether
> > we might be dealing with a VMA that might be removed, taking advantage of
> > the fact that a 'previous' VMA will never be deleted, only VMAs that follow
> > it.
> >
> > The second patch corrects the instance where a merge of the previous VMA
> > into a subsequent one did not correctly check whether the subsequent VMA
> > had a vm_ops->close() handler.
> >
> > Both changes prevent merge cases that are actually permissible (for
> > instance a merge of a VMA into a following VMA with a vm_ops->close(), but
> > with no previous VMA, which would result in the next VMA being extended,
> > not deleted).
> >
> > In addition, both changes fail to consider the case where a VMA that would
> > otherwise be merged with the previous and next VMA might have
> > vm_ops->close(), on the assumption that for this to be the case, all three
> > would have to have the same vma->vm_file to be mergeable and thus the same
> > vm_ops.
> >
> > And in addition both changes operate at 50,000 feet, trying to guess
> > whether a VMA will be deleted.
> >
> > As we have majorly refactored the VMA merge operation and de-duplicated
> > code to the point where we know precisely where deletions will occur, this
> > patch removes the aforementioned checks altogether and instead explicitly
> > checks whether a VMA will be deleted.
> >
> > In cases where a reduced merge is still possible (where we merge both
> > previous and next VMA but the next VMA has a vm_ops->close hook, meaning we
> > could just merge the previous and current VMA), we do so, otherwise the
> > merge is not permitted.
> >
> > We take advantage of our userland testing to assert that this functions
> > correctly - replacing the previous limited vm_ops->close() tests with tests
> > for every single case where we delete a VMA.
> >
> > We also update all testing for both new and modified VMAs to set
> > vma->vm_ops->close() in every single instance where this would not prevent
> > the merge, to assert that we never do so.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> Amazing!
>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
>
Thanks! :)
> > @@ -710,9 +706,30 @@ static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg)
> >
> > /* If we span the entire VMA, a merge implies it will be deleted. */
> > merge_will_delete_vma = left_side && right_side;
> > - /* If we merge both VMAs, then next is also deleted. */
>
> Nit: This comment ...
>
> > +
> > + /*
> > + * If we need to remove vma in its entirety but are unable to do so,
> > + * we have no sensible recourse but to abort the merge.
> > + */
> > + if (merge_will_delete_vma && !can_merge_remove_vma(vma))
> > + return NULL;
> > +
> > + /*
> > + * If we merge both VMAs, then next is also deleted. This implies
> > + * merge_will_delete_vma also.
> > + */
>
> ... changed to this comment. Seems spurious, could have been like that
> before already? I don't see how the new "This implies" part became relevant
> now? We already tested merge_will_delete_vma above.
Will move to previous commit.
>
> > merge_will_delete_next = merge_both;
> >
> > + /*
> > + * If we cannot delete next, then we can reduce the operation to merging
> > + * prev and vma (thereby deleting vma).
> > + */
> > + if (merge_will_delete_next && !can_merge_remove_vma(next)) {
> > + merge_will_delete_next = false;
> > + merge_right = false;
> > + merge_both = false;
> > + }
> > +
> > /* No matter what happens, we will be adjusting vma. */
> > vma_start_write(vma);
> >
>
^ permalink raw reply [flat|nested] 53+ messages in thread