linux-mm.kvack.org archive mirror
* [RFC] mm: add a new vector based madvise syscall
@ 2015-10-29 21:55 Shaohua Li
  2015-10-30 12:48 ` Rik van Riel
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Shaohua Li @ 2015-10-29 21:55 UTC (permalink / raw)
  To: linux-mm; +Cc: akpm, riel, mgorman, hughd, hannes, aarcange, je, Kernel-team

[-- Attachment #1: Type: text/plain, Size: 7336 bytes --]

In jemalloc, free(3) doesn't immediately return memory to the OS even when
the freed region is page aligned and a multiple of the page size, in the
hope that it can be reused soon. Over time the virtual address space
becomes fragmented and more and more free memory accumulates. Once the
amount of free memory is large, jemalloc uses madvise(MADV_DONTNEED) to
actually return the memory to the OS.

These madvise calls have significant overhead, particularly because of the
TLB flush. jemalloc typically purges several virtual address ranges at a
time. Instead of calling madvise once per range, we introduce a new
syscall that purges memory for several ranges in a single call. This way
the per-range TLB flushes can be merged into one big TLB flush, and
mmap_sem locking is reduced as well.
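
To illustrate the intended usage, here is a minimal userspace sketch. There
is no libc wrapper, so the syscall is invoked through syscall(2); the number
325 is the x86-64 entry added by this patch (the attached jemalloc patch
invokes it the same way), and the madvisev()/purge_two_ranges() helper names
are made up for this sketch only:

#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

#define __NR_madvisev 325	/* x86-64 number added by this patch */

/* thin wrapper; glibc has no wrapper for the new syscall */
static int madvisev(struct iovec *iov, unsigned long nr_segs, int behavior)
{
	return syscall(__NR_madvisev, iov, nr_segs, behavior);
}

/* purge two page-aligned ranges with (at most) one TLB flush */
static int purge_two_ranges(void *a, size_t a_len, void *b, size_t b_len)
{
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = a_len },
		{ .iov_base = b, .iov_len = b_len },
	};

	return madvisev(iov, 2, MADV_DONTNEED);
}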

I ran a simple memory allocation benchmark in which 32 threads do random
malloc/free/realloc. A corresponding jemalloc patch that uses this API is
attached.
Without patch:
real    0m18.923s
user    1m11.819s
sys     7m44.626s
Each CPU gets around 3000K/s TLB flush interrupts. perf shows TLB flushing
is among the hottest functions. mmap_sem read locking (because of page
faults) is also heavy.

With patch:
real    0m15.026s
user    0m48.548s
sys     6m41.153s
Each CPU gets around 140k/s TLB flush interrupts. TLB flushing isn't hot
at all. mmap_sem read locking (still because of page faults) becomes the
sole hot spot.

Another test mallocs a bunch of memory in 48 threads, then all threads
free the memory. I measured the time of the free phase.
Without patch: 34.332s
With patch:    17.429s
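
The benchmark source is not included here; purely as an illustration, the
second test could look like the sketch below. The 48-thread count comes
from the description above, while the allocation count and size are
arbitrary assumptions of this sketch:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NTHREADS	48		/* from the description above */
#define NALLOCS		100000		/* arbitrary */
#define ALLOC_SIZE	4096		/* arbitrary */

static void *bufs[NTHREADS][NALLOCS];
static pthread_barrier_t barrier;

static void *worker(void *arg)
{
	long id = (long)arg;
	int i;

	for (i = 0; i < NALLOCS; i++)
		bufs[id][i] = malloc(ALLOC_SIZE);

	/* wait until every thread has finished allocating */
	pthread_barrier_wait(&barrier);

	for (i = 0; i < NALLOCS; i++)
		free(bufs[id][i]);
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];
	struct timespec t0, t1;
	long i;

	pthread_barrier_init(&barrier, NULL, NTHREADS + 1);
	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, (void *)i);

	pthread_barrier_wait(&barrier);		/* all allocations done */
	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);	/* covers the free phase */
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("free phase: %.3fs\n", (t1.tv_sec - t0.tv_sec) +
	       (t1.tv_nsec - t0.tv_nsec) / 1e9);
	return 0;
}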

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 mm/madvise.c                           | 144 ++++++++++++++++++++++++++++++---
 3 files changed, 134 insertions(+), 12 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 7663c45..4c99ef5 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -382,3 +382,4 @@
 373	i386	shutdown		sys_shutdown
 374	i386	userfaultfd		sys_userfaultfd
 375	i386	membarrier		sys_membarrier
+376	i386	madvisev		sys_madvisev
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 278842f..1025406 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -331,6 +331,7 @@
 322	64	execveat		stub_execveat
 323	common	userfaultfd		sys_userfaultfd
 324	common	membarrier		sys_membarrier
+325	common	madvisev		sys_madvisev
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/mm/madvise.c b/mm/madvise.c
index c889fcb..6251103 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -20,6 +20,9 @@
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/uio.h>
+#include <linux/sort.h>
+#include <asm/tlb.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -415,6 +418,29 @@ madvise_behavior_valid(int behavior)
 	}
 }
 
+static bool madvise_range_valid(unsigned long start, size_t len_in, bool *skip)
+{
+	size_t len;
+	unsigned long end;
+
+	if (start & ~PAGE_MASK)
+		return false;
+	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+	/* Check to see whether len was rounded up from small -ve to zero */
+	if (len_in && !len)
+		return false;
+
+	end = start + len;
+	if (end < start)
+		return false;
+	if (end == start)
+		*skip = true;
+	else
+		*skip = false;
+	return true;
+}
+
 /*
  * The madvise(2) system call.
  *
@@ -464,8 +490,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	int unmapped_error = 0;
 	int error = -EINVAL;
 	int write;
-	size_t len;
 	struct blk_plug plug;
+	bool skip;
 
 #ifdef CONFIG_MEMORY_FAILURE
 	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -474,20 +500,12 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	if (!madvise_behavior_valid(behavior))
 		return error;
 
-	if (start & ~PAGE_MASK)
-		return error;
-	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
-
-	/* Check to see whether len was rounded up from small -ve to zero */
-	if (len_in && !len)
-		return error;
-
-	end = start + len;
-	if (end < start)
+	if (!madvise_range_valid(start, len_in, &skip))
 		return error;
+	end = start + ((len_in + ~PAGE_MASK) & PAGE_MASK);
 
 	error = 0;
-	if (end == start)
+	if (skip)
 		return error;
 
 	write = madvise_need_mmap_write(behavior);
@@ -549,3 +567,105 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 
 	return error;
 }
+
+static int iov_cmp_func(const void *a, const void *b)
+{
+	const struct iovec *iova = a;
+	const struct iovec *iovb = b;
+	unsigned long addr_a = (unsigned long)iova->iov_base;
+	unsigned long addr_b = (unsigned long)iovb->iov_base;
+
+	if (addr_a > addr_b)
+		return 1;
+	if (addr_a < addr_b)
+		return -1;
+	return 0;
+}
+
+SYSCALL_DEFINE3(madvisev, const struct iovec __user *, uvector, unsigned long, nr_segs,
+	int, behavior)
+{
+	struct iovec iovstack[UIO_FASTIOV];
+	struct iovec *iov = NULL;
+	struct vm_area_struct **vmas = NULL;
+	unsigned long start, last_start = 0;
+	size_t len;
+	struct mmu_gather tlb;
+	int error;
+	int i;
+	bool skip;
+
+	if (behavior != MADV_DONTNEED)
+		return -EINVAL;
+
+	error = rw_copy_check_uvector(CHECK_IOVEC_ONLY, uvector, nr_segs,
+			UIO_FASTIOV, iovstack, &iov);
+	if (error <= 0)
+		return error;
+	/* Make sure address in ascend order */
+	sort(iov, nr_segs, sizeof(struct iovec), iov_cmp_func, NULL);
+
+	vmas = kmalloc(nr_segs * sizeof(struct vm_area_struct *), GFP_KERNEL);
+	if (!vmas) {
+		error = -EFAULT;
+		goto out;
+	}
+	for (i = 0; i < nr_segs; i++) {
+		start = (unsigned long)iov[i].iov_base;
+		len = ((iov[i].iov_len + ~PAGE_MASK) & PAGE_MASK);
+		iov[i].iov_len = len;
+		if (start < last_start) {
+			error = -EINVAL;
+			goto out;
+		}
+		if (!madvise_range_valid(start, len, &skip)) {
+			error = -EINVAL;
+			goto out;
+		}
+		if (skip) {
+			error = 0;
+			goto out;
+		}
+		last_start = start + len;
+	}
+
+	down_read(&current->mm->mmap_sem);
+	for (i = 0; i < nr_segs; i++) {
+		start = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+		vmas[i] = find_vma(current->mm, start);
+		/*
+		 * don't allow range cross vma, it doesn't make sense for
+		 * DONTNEED
+		 */
+		if (!vmas[i] || start < vmas[i]->vm_start ||
+		    start + len > vmas[i]->vm_end) {
+			error = -ENOMEM;
+			goto up_out;
+		}
+		if (vmas[i]->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) {
+			error = -EINVAL;
+			goto up_out;
+		}
+	}
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, current->mm, (unsigned long)iov[0].iov_base,
+		last_start);
+	update_hiwater_rss(current->mm);
+	for (i = 0; i < nr_segs; i++) {
+		start = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+		unmap_vmas(&tlb, vmas[i], start, start + len);
+	}
+	tlb_finish_mmu(&tlb, (unsigned long)iov[0].iov_base, last_start);
+	error = 0;
+
+up_out:
+	up_read(&current->mm->mmap_sem);
+out:
+	kfree(vmas);
+	if (iov != iovstack)
+		kfree(iov);
+	return error;
+}
-- 
2.4.6


[-- Attachment #2: je.patch --]
[-- Type: text/plain, Size: 2295 bytes --]

diff --git a/src/arena.c b/src/arena.c
index 43733cc..ae2de35 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -1266,6 +1266,7 @@ arena_dirty_count(arena_t *arena)
 	return (ndirty);
 }
 
+#define PURGE_VEC 1
 static size_t
 arena_compute_npurge(arena_t *arena, bool all)
 {
@@ -1280,6 +1281,10 @@ arena_compute_npurge(arena_t *arena, bool all)
 		threshold = threshold < chunk_npages ? chunk_npages : threshold;
 
 		npurge = arena->ndirty - threshold;
+#if PURGE_VEC
+		if (npurge < arena->ndirty / 2)
+			npurge = arena->ndirty / 2;
+#endif
 	} else
 		npurge = arena->ndirty;
 
@@ -1366,6 +1371,16 @@ arena_stash_dirty(arena_t *arena, chunk_hooks_t *chunk_hooks, bool all,
 	return (nstashed);
 }
 
+#if PURGE_VEC
+#define MAX_IOVEC 32
+bool pages_purge_vec(struct iovec *iov, unsigned long nr_segs)
+{
+	int ret = syscall(325, iov, nr_segs, MADV_DONTNEED);
+
+	return !!ret;
+}
+#endif
+
 static size_t
 arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks,
     arena_runs_dirty_link_t *purge_runs_sentinel,
@@ -1374,6 +1389,10 @@ arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks,
 	size_t npurged, nmadvise;
 	arena_runs_dirty_link_t *rdelm;
 	extent_node_t *chunkselm;
+#if PURGE_VEC
+	struct iovec iovec[MAX_IOVEC];
+	int vec_index = 0;
+#endif
 
 	if (config_stats)
 		nmadvise = 0;
@@ -1418,9 +1437,21 @@ arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks,
 				flag_unzeroed = 0;
 				flags = CHUNK_MAP_DECOMMITTED;
 			} else {
+#if !PURGE_VEC
 				flag_unzeroed = chunk_purge_wrapper(arena,
 				    chunk_hooks, chunk, chunksize, pageind <<
 				    LG_PAGE, run_size) ? CHUNK_MAP_UNZEROED : 0;
+#else
+				flag_unzeroed = 0;
+				iovec[vec_index].iov_base = (void *)((uintptr_t)chunk +
+					(pageind << LG_PAGE));
+				iovec[vec_index].iov_len = run_size;
+				vec_index++;
+				if (vec_index >= MAX_IOVEC) {
+					pages_purge_vec(iovec, vec_index);
+					vec_index = 0;
+				}
+#endif
 				flags = flag_unzeroed;
 			}
 			arena_mapbits_large_set(chunk, pageind+npages-1, 0,
@@ -1449,6 +1480,10 @@ arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks,
 		if (config_stats)
 			nmadvise++;
 	}
+#if PURGE_VEC
+	if (vec_index > 0)
+		pages_purge_vec(iovec, vec_index);
+#endif
 	malloc_mutex_lock(&arena->lock);
 
 	if (config_stats) {


* Re: [RFC] mm: add a new vector based madvise syscall
  2015-10-29 21:55 [RFC] mm: add a new vector based madvise syscall Shaohua Li
@ 2015-10-30 12:48 ` Rik van Riel
  2015-10-30 20:17 ` Andi Kleen
  2015-11-05 13:01 ` Vlastimil Babka
  2 siblings, 0 replies; 5+ messages in thread
From: Rik van Riel @ 2015-10-30 12:48 UTC (permalink / raw)
  To: Shaohua Li, linux-mm
  Cc: akpm, mgorman, hughd, hannes, aarcange, je, Kernel-team

On 10/29/2015 05:55 PM, Shaohua Li wrote:
> In jemalloc, free(3) doesn't immediately return memory to the OS even when
> the freed region is page aligned and a multiple of the page size, in the
> hope that it can be reused soon. Over time the virtual address space
> becomes fragmented and more and more free memory accumulates. Once the
> amount of free memory is large, jemalloc uses madvise(MADV_DONTNEED) to
> actually return the memory to the OS.
> 
> These madvise calls have significant overhead, particularly because of the
> TLB flush. jemalloc typically purges several virtual address ranges at a
> time. Instead of calling madvise once per range, we introduce a new
> syscall that purges memory for several ranges in a single call. This way
> the per-range TLB flushes can be merged into one big TLB flush, and
> mmap_sem locking is reduced as well.
> 
> I ran a simple memory allocation benchmark in which 32 threads do random
> malloc/free/realloc. A corresponding jemalloc patch that uses this API is
> attached.
> Without patch:
> real    0m18.923s
> user    1m11.819s
> sys     7m44.626s
> Each CPU gets around 3000K/s TLB flush interrupts. perf shows TLB flushing
> is among the hottest functions. mmap_sem read locking (because of page
> faults) is also heavy.
> 
> With patch:
> real    0m15.026s
> user    0m48.548s
> sys     6m41.153s
> Each CPU gets around 140k/s TLB flush interrupts. TLB flushing isn't hot
> at all. mmap_sem read locking (still because of page faults) becomes the
> sole hot spot.
> 
> Another test mallocs a bunch of memory in 48 threads, then all threads
> free the memory. I measured the time of the free phase.
> Without patch: 34.332s
> With patch:    17.429s

Nice. This approach makes a lot of sense to me.

Is it too early to ack the patch? :)

-- 
All rights reversed


* Re: [RFC] mm: add a new vector based madvise syscall
  2015-10-29 21:55 [RFC] mm: add a new vector based madvise syscall Shaohua Li
  2015-10-30 12:48 ` Rik van Riel
@ 2015-10-30 20:17 ` Andi Kleen
  2015-11-02 18:16   ` Shaohua Li
  2015-11-05 13:01 ` Vlastimil Babka
  2 siblings, 1 reply; 5+ messages in thread
From: Andi Kleen @ 2015-10-30 20:17 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-mm, akpm, riel, mgorman, hughd, hannes, aarcange, je, Kernel-team

Shaohua Li <shli@fb.com> writes:
> +		vmas[i] = find_vma(current->mm, start);
> +		/*
> +		 * don't allow range cross vma, it doesn't make sense for
> +		 * DONTNEED
> +		 */
> +		if (!vmas[i] || start < vmas[i]->vm_start ||
> +		    start + len > vmas[i]->vm_end) {
> +			error = -ENOMEM;
> +			goto up_out;
> +		}
> +		if (vmas[i]->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) {
> +			error = -EINVAL;
> +			goto up_out;
> +		}
> +	}

Needs a cond_resched() somewhere in case the list is very long?

BTW, one trick that may be interesting here is to add a new mode
that skips the TLB flush completely, but instead defers the freeing
until enough context switches to non-kernel tasks have occurred
(which flush the TLB anyway). This could be done as part of RCU.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only


* Re: [RFC] mm: add a new vector based madvise syscall
  2015-10-30 20:17 ` Andi Kleen
@ 2015-11-02 18:16   ` Shaohua Li
  0 siblings, 0 replies; 5+ messages in thread
From: Shaohua Li @ 2015-11-02 18:16 UTC (permalink / raw)
  To: Andi Kleen
  Cc: linux-mm, akpm, riel, mgorman, hughd, hannes, aarcange, je, Kernel-team

On Fri, Oct 30, 2015 at 01:17:54PM -0700, Andi Kleen wrote:
> Shaohua Li <shli@fb.com> writes:
> > +		vmas[i] = find_vma(current->mm, start);
> > +		/*
> > +		 * don't allow range cross vma, it doesn't make sense for
> > +		 * DONTNEED
> > +		 */
> > +		if (!vmas[i] || start < vmas[i]->vm_start ||
> > +		    start + len > vmas[i]->vm_end) {
> > +			error = -ENOMEM;
> > +			goto up_out;
> > +		}
> > +		if (vmas[i]->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) {
> > +			error = -EINVAL;
> > +			goto up_out;
> > +		}
> > +	}
> 
> Needs a cond_resched() somewhere in case the list is very long?

Yep, zap_pmd_range() has a cond_resched().
> BTW, one trick that may be interesting here is to add a new mode
> that skips the TLB flush completely, but instead defers the freeing
> until enough context switches to non-kernel tasks have occurred
> (which flush the TLB anyway). This could be done as part of RCU.

That would not work if the app calls madvise(MADV_DONTNEED) first and then
accesses the virtual address again.

Thanks,
Shaohua


* Re: [RFC] mm: add a new vector based madvise syscall
  2015-10-29 21:55 [RFC] mm: add a new vector based madvise syscall Shaohua Li
  2015-10-30 12:48 ` Rik van Riel
  2015-10-30 20:17 ` Andi Kleen
@ 2015-11-05 13:01 ` Vlastimil Babka
  2 siblings, 0 replies; 5+ messages in thread
From: Vlastimil Babka @ 2015-11-05 13:01 UTC (permalink / raw)
  To: Shaohua Li, linux-mm
  Cc: akpm, riel, mgorman, hughd, hannes, aarcange, je, Kernel-team,
	Linux API, Minchan Kim

On 10/29/2015 10:55 PM, Shaohua Li wrote:
> In jemalloc, free(3) doesn't immediately return memory to the OS even when
> the freed region is page aligned and a multiple of the page size, in the
> hope that it can be reused soon. Over time the virtual address space
> becomes fragmented and more and more free memory accumulates. Once the
> amount of free memory is large, jemalloc uses madvise(MADV_DONTNEED) to
> actually return the memory to the OS.
> 
> These madvise calls have significant overhead, particularly because of the
> TLB flush. jemalloc typically purges several virtual address ranges at a
> time. Instead of calling madvise once per range, we introduce a new
> syscall that purges memory for several ranges in a single call. This way
> the per-range TLB flushes can be merged into one big TLB flush, and
> mmap_sem locking is reduced as well.
> 
> I ran a simple memory allocation benchmark in which 32 threads do random
> malloc/free/realloc. A corresponding jemalloc patch that uses this API is
> attached.
> Without patch:
> real    0m18.923s
> user    1m11.819s
> sys     7m44.626s
> Each CPU gets around 3000K/s TLB flush interrupts. perf shows TLB flushing
> is among the hottest functions. mmap_sem read locking (because of page
> faults) is also heavy.
> 
> With patch:
> real    0m15.026s
> user    0m48.548s
> sys     6m41.153s
> Each CPU gets around 140k/s TLB flush interrupts. TLB flushing isn't hot
> at all. mmap_sem read locking (still because of page faults) becomes the
> sole hot spot.
> 
> Another test mallocs a bunch of memory in 48 threads, then all threads
> free the memory. I measured the time of the free phase.
> Without patch: 34.332s
> With patch:    17.429s
> 
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Rik van Riel <riel@redhat.com>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Signed-off-by: Shaohua Li <shli@fb.com>

First the obligatory (please remember for future submissions):

[CC += linux-api@vger.kernel.org]

    Since this is a kernel-user-space API change, please CC linux-api@. The
kernel source file Documentation/SubmitChecklist notes that all Linux kernel
patches that change userspace interfaces should be CCed to
linux-api@vger.kernel.org, so that the various parties who are interested in API
changes are informed. For further information, see
https://www.kernel.org/doc/man-pages/linux-api-ml.html

Also CCing Minchan. What about MADV_FREE support?

> [... remainder of the patch and the attached jemalloc patch quoted in
> full above; snipped here ...]

