linux-mm.kvack.org archive mirror
From: Nadav Amit <nadav.amit@gmail.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	Peter Xu <peterx@redhat.com>, Nadav Amit <namit@vmware.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Minchan Kim <minchan@kernel.org>, Colin Cross <ccross@google.com>,
	Suren Baghdasaryan <surenb@google.com>,
	Mike Rapoport <rppt@linux.vnet.ibm.com>
Subject: [RFC PATCH 5/8] mm/madvise: perform certain operations once on process_madvise()
Date: Sun, 26 Sep 2021 09:12:56 -0700	[thread overview]
Message-ID: <20210926161259.238054-6-namit@vmware.com> (raw)
In-Reply-To: <20210926161259.238054-1-namit@vmware.com>

From: Nadav Amit <namit@vmware.com>

Certain operations can be performed once per process_madvise() call
instead of once for each IO vector. Acquiring the mmap lock and
initializing the blk_plug are such operations.

Collect these operations into madvise_start() and madvise_finish(), as
sketched below. Subsequent patches will add further operations to these
functions.
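
For illustration, a simplified sketch of the resulting process_madvise()
flow (not the exact hunk below; iterator setup and error handling are
elided): the per-call setup brackets the per-vector loop.

	struct blk_plug plug;
	int ret;

	/* Take the mmap lock and start the blk_plug once per call. */
	ret = madvise_start(mm, behavior_info, &plug);
	if (ret != 0)
		return ret;

	/* Each IO vector is then handled under that single setup. */
	while (iov_iter_count(&iter)) {
		struct iovec iovec = iov_iter_iovec(&iter);

		ret = madvise_one_range(mm, (unsigned long)iovec.iov_base,
					iovec.iov_len, behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iovec.iov_len);
	}

	/* Finish the blk_plug and drop the mmap lock once. */
	madvise_finish(mm, behavior_info, &plug);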

Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Colin Cross <ccross@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Nadav Amit <namit@vmware.com>
---
 mm/madvise.c | 139 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 86 insertions(+), 53 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 127507c71ba9..84b86ae85671 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -43,6 +43,13 @@ struct madvise_walk_private {
 struct madvise_info {
 	u8 behavior_valid: 1;
 	u8 process_behavior_valid: 1;
+	u8 no_mmap_lock: 1;
+
+	/*
+	 * Any behaviour which results in changes to the vma->vm_flags needs to
+	 * take mmap_lock for writing. Others, which simply traverse vmas, need
+	 * to only take it for reading.
+	 */
 	u8 need_mmap_read_only: 1;
 };
 
@@ -120,9 +127,11 @@ static const struct madvise_info madvise_info[MADV_SOFT_OFFLINE+1] = {
 #ifdef CONFIG_MEMORY_FAILURE
 	[MADV_HWPOISON] = {
 		.behavior_valid = 1,
+		.no_mmap_lock = 1,
 	},
 	[MADV_SOFT_OFFLINE] = {
 		.behavior_valid = 1,
+		.no_mmap_lock = 1,
 	},
 #endif
 	[MADV_POPULATE_READ] = {
@@ -135,16 +144,6 @@ static const struct madvise_info madvise_info[MADV_SOFT_OFFLINE+1] = {
 	},
 };
 
-/*
- * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_lock for writing. Others, which simply traverse vmas, need
- * to only take it for reading.
- */
-static int madvise_need_mmap_write(int behavior)
-{
-	return !madvise_info[behavior].need_mmap_read_only;
-}
-
 /*
  * We can potentially split a vm area into separate
  * areas, each area with its own behavior.
@@ -1081,26 +1080,6 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	}
 }
 
-static bool
-madvise_behavior_valid(int *behavior)
-{
-	if (*behavior >= ARRAY_SIZE(madvise_info))
-		return false;
-
-	*behavior = array_index_nospec(*behavior, ARRAY_SIZE(madvise_info));
-	return madvise_info[*behavior].behavior_valid;
-}
-
-static bool
-process_madvise_behavior_valid(int *behavior)
-{
-	if (*behavior >= ARRAY_SIZE(madvise_info))
-		return false;
-
-	*behavior = array_index_nospec(*behavior, ARRAY_SIZE(madvise_info));
-	return madvise_info[*behavior].process_behavior_valid;
-}
-
 /*
  * The madvise(2) system call.
  *
@@ -1171,21 +1150,17 @@ process_madvise_behavior_valid(int *behavior)
  *  -EBADF  - map exists, but area maps something that isn't a file.
  *  -EAGAIN - a kernel resource was temporarily unavailable.
  */
-int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
+int madvise_one_range(struct mm_struct *mm, unsigned long start, size_t len_in,
+		      int behavior)
 {
 	unsigned long end, tmp;
 	struct vm_area_struct *vma, *prev;
 	int unmapped_error = 0;
 	int error = -EINVAL;
-	int write;
 	size_t len;
-	struct blk_plug plug;
 
 	start = untagged_addr(start);
 
-	if (!madvise_behavior_valid(&behavior))
-		return error;
-
 	if (!PAGE_ALIGNED(start))
 		return error;
 	len = PAGE_ALIGN(len_in);
@@ -1207,14 +1182,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 		return madvise_inject_error(behavior, start, start + len_in);
 #endif
 
-	write = madvise_need_mmap_write(behavior);
-	if (write) {
-		if (mmap_write_lock_killable(mm))
-			return -EINTR;
-	} else {
-		mmap_read_lock(mm);
-	}
-
 	/*
 	 * If the interval [start,end) covers some unmapped address
 	 * ranges, just ignore them, but return -ENOMEM at the end.
@@ -1224,7 +1191,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 	if (vma && start > vma->vm_start)
 		prev = vma;
 
-	blk_start_plug(&plug);
 	for (;;) {
 		/* Still start < end. */
 		error = -ENOMEM;
@@ -1260,15 +1226,72 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 			vma = find_vma(mm, start);
 	}
 out:
-	blk_finish_plug(&plug);
-	if (write)
-		mmap_write_unlock(mm);
-	else
-		mmap_read_unlock(mm);
 
 	return error;
 }
 
+static int
+madvise_start(struct mm_struct *mm, struct madvise_info behavior_info,
+	      struct blk_plug *plug)
+{
+	if (!behavior_info.no_mmap_lock) {
+		if (behavior_info.need_mmap_read_only)
+			mmap_read_lock(mm);
+		else if (mmap_write_lock_killable(mm))
+			return -EINTR;
+	}
+
+	blk_start_plug(plug);
+	return 0;
+}
+
+static void
+madvise_finish(struct mm_struct *mm, struct madvise_info behavior_info,
+	       struct blk_plug *plug)
+{
+	blk_finish_plug(plug);
+
+	if (!behavior_info.no_mmap_lock) {
+		if (behavior_info.need_mmap_read_only)
+			mmap_read_unlock(mm);
+		else
+			mmap_write_unlock(mm);
+	}
+}
+
+static struct madvise_info madvise_behavior_info(int behavior)
+{
+	if (behavior >= ARRAY_SIZE(madvise_info) || behavior < 0) {
+		const struct madvise_info invalid = {0};
+		return invalid;
+	}
+
+	behavior = array_index_nospec(behavior, ARRAY_SIZE(madvise_info));
+	return madvise_info[behavior];
+}
+
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in,
+	       int behavior)
+{
+	struct madvise_info behavior_info;
+	struct blk_plug plug;
+	int ret = -EINVAL;
+
+	behavior_info = madvise_behavior_info(behavior);
+
+	if (!behavior_info.behavior_valid)
+		return ret;
+
+	ret = madvise_start(mm, behavior_info, &plug);
+	if (ret != 0)
+		return ret;
+
+	ret = madvise_one_range(mm, start, len_in, behavior);
+
+	madvise_finish(mm, behavior_info, &plug);
+	return ret;
+}
+
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
 	return do_madvise(current->mm, start, len_in, behavior);
@@ -1286,6 +1309,8 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 	struct mm_struct *mm;
 	size_t total_len;
 	unsigned int f_flags;
+	struct madvise_info behavior_info;
+	struct blk_plug plug;
 
 	if (flags != 0) {
 		ret = -EINVAL;
@@ -1308,7 +1333,9 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 		goto put_pid;
 	}
 
-	if (!process_madvise_behavior_valid(&behavior)) {
+	behavior_info = madvise_behavior_info(behavior);
+
+	if (!behavior_info.process_behavior_valid) {
 		ret = -EINVAL;
 		goto release_task;
 	}
@@ -1331,15 +1358,21 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 
 	total_len = iov_iter_count(&iter);
 
+	ret = madvise_start(mm, behavior_info, &plug);
+	if (ret != 0)
+		goto release_mm;
+
 	while (iov_iter_count(&iter)) {
 		iovec = iov_iter_iovec(&iter);
-		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
-					iovec.iov_len, behavior);
+		ret = madvise_one_range(mm, (unsigned long)iovec.iov_base,
+				iovec.iov_len, behavior);
 		if (ret < 0)
 			break;
 		iov_iter_advance(&iter, iovec.iov_len);
 	}
 
+	madvise_finish(mm, behavior_info, &plug);
+
 	if (ret == 0)
 		ret = total_len - iov_iter_count(&iter);
 
-- 
2.25.1



Thread overview: 43+ messages
2021-09-26 16:12 [RFC PATCH 0/8] mm/madvise: support process_madvise(MADV_DONTNEED) Nadav Amit
2021-09-26 16:12 ` [RFC PATCH 1/8] mm/madvise: propagate vma->vm_end changes Nadav Amit
2021-09-27  9:08   ` Kirill A. Shutemov
2021-09-27 10:11     ` Nadav Amit
2021-09-27 11:55       ` Kirill A. Shutemov
2021-09-27 12:33         ` Nadav Amit
2021-09-27 12:45           ` Kirill A. Shutemov
2021-09-27 12:59             ` Nadav Amit
2021-09-26 16:12 ` [RFC PATCH 2/8] mm/madvise: remove unnecessary check on madvise_dontneed_free() Nadav Amit
2021-09-27  9:11   ` Kirill A. Shutemov
2021-09-27 11:05     ` Nadav Amit
2021-09-27 12:19       ` Kirill A. Shutemov
2021-09-27 12:52         ` Nadav Amit
2021-09-26 16:12 ` [RFC PATCH 3/8] mm/madvise: remove unnecessary checks on madvise_free_single_vma() Nadav Amit
2021-09-27  9:17   ` Kirill A. Shutemov
2021-09-27  9:24     ` Kirill A. Shutemov
2021-09-26 16:12 ` [RFC PATCH 4/8] mm/madvise: define madvise behavior in a struct Nadav Amit
2021-09-27  9:31   ` Kirill A. Shutemov
2021-09-27 10:31     ` Nadav Amit
2021-09-27 12:14       ` Kirill A. Shutemov
2021-09-27 20:36         ` Nadav Amit
2021-09-26 16:12 ` Nadav Amit [this message]
2021-09-26 16:12 ` [RFC PATCH 6/8] mm/madvise: more aggressive TLB batching Nadav Amit
2021-09-26 16:12 ` [RFC PATCH 7/8] mm/madvise: deduplicate code in madvise_dontneed_free() Nadav Amit
2021-09-26 16:12 ` [RFC PATCH 8/8] mm/madvise: process_madvise(MADV_DONTNEED) Nadav Amit
2021-09-27  9:24 ` [RFC PATCH 0/8] mm/madvise: support process_madvise(MADV_DONTNEED) David Hildenbrand
2021-09-27 10:41   ` Nadav Amit
2021-09-27 10:58     ` David Hildenbrand
2021-09-27 12:00       ` Nadav Amit
2021-09-27 12:16         ` Michal Hocko
2021-09-27 19:12           ` Nadav Amit
2021-09-29  7:52             ` Michal Hocko
2021-09-29 18:31               ` Nadav Amit
2021-10-12 23:14                 ` Peter Xu
2021-10-13 15:47                   ` Nadav Amit
2021-10-13 23:09                     ` Peter Xu
2021-09-27 17:05         ` David Hildenbrand
2021-09-27 19:59           ` Nadav Amit
2021-09-28  8:53             ` David Hildenbrand
2021-09-28 22:56               ` Nadav Amit
2021-10-04 17:58                 ` David Hildenbrand
2021-10-07 16:19                   ` Nadav Amit
2021-10-07 16:46                     ` David Hildenbrand
