From: Nadav Amit <nadav.amit@gmail.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
Peter Xu <peterx@redhat.com>, Nadav Amit <namit@vmware.com>,
Andrea Arcangeli <aarcange@redhat.com>,
Minchan Kim <minchan@kernel.org>, Colin Cross <ccross@google.com>,
Suren Baghdasaryan <surenb@google.com>,
Mike Rapoport <rppt@linux.vnet.ibm.com>
Subject: [RFC PATCH 5/8] mm/madvise: perform certain operations once on process_madvise()
Date: Sun, 26 Sep 2021 09:12:56 -0700 [thread overview]
Message-ID: <20210926161259.238054-6-namit@vmware.com> (raw)
In-Reply-To: <20210926161259.238054-1-namit@vmware.com>
From: Nadav Amit <namit@vmware.com>
Certain operations need to be performed only once per process_madvise()
call, rather than once for each IO vector. Acquiring the mmap-lock and
initializing the blk_plug are two such operations.
Collect the aforementioned operations into madvise_start() and
madvise_finish(). Subsequent patches will add further operations to
these functions.
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Colin Cross <ccross@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Nadav Amit <namit@vmware.com>
---
mm/madvise.c | 139 +++++++++++++++++++++++++++++++--------------------
1 file changed, 86 insertions(+), 53 deletions(-)
diff --git a/mm/madvise.c b/mm/madvise.c
index 127507c71ba9..84b86ae85671 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -43,6 +43,13 @@ struct madvise_walk_private {
struct madvise_info {
u8 behavior_valid: 1;
u8 process_behavior_valid: 1;
+ u8 no_mmap_lock: 1;
+
+ /*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
+ * to only take it for reading.
+ */
u8 need_mmap_read_only: 1;
};
@@ -120,9 +127,11 @@ static const struct madvise_info madvise_info[MADV_SOFT_OFFLINE+1] = {
#ifdef CONFIG_MEMORY_FAILURE
[MADV_HWPOISON] = {
.behavior_valid = 1,
+ .no_mmap_lock = 1,
},
[MADV_SOFT_OFFLINE] = {
.behavior_valid = 1,
+ .no_mmap_lock = 1,
},
#endif
[MADV_POPULATE_READ] = {
@@ -135,16 +144,6 @@ static const struct madvise_info madvise_info[MADV_SOFT_OFFLINE+1] = {
},
};
-/*
- * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_lock for writing. Others, which simply traverse vmas, need
- * to only take it for reading.
- */
-static int madvise_need_mmap_write(int behavior)
-{
- return !madvise_info[behavior].need_mmap_read_only;
-}
-
/*
* We can potentially split a vm area into separate
* areas, each area with its own behavior.
@@ -1081,26 +1080,6 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
}
}
-static bool
-madvise_behavior_valid(int *behavior)
-{
- if (*behavior >= ARRAY_SIZE(madvise_info))
- return false;
-
- *behavior = array_index_nospec(*behavior, ARRAY_SIZE(madvise_info));
- return madvise_info[*behavior].behavior_valid;
-}
-
-static bool
-process_madvise_behavior_valid(int *behavior)
-{
- if (*behavior >= ARRAY_SIZE(madvise_info))
- return false;
-
- *behavior = array_index_nospec(*behavior, ARRAY_SIZE(madvise_info));
- return madvise_info[*behavior].process_behavior_valid;
-}
-
/*
* The madvise(2) system call.
*
@@ -1171,21 +1150,17 @@ process_madvise_behavior_valid(int *behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
+int madvise_one_range(struct mm_struct *mm, unsigned long start, size_t len_in,
+ int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
- int write;
size_t len;
- struct blk_plug plug;
start = untagged_addr(start);
- if (!madvise_behavior_valid(&behavior))
- return error;
-
if (!PAGE_ALIGNED(start))
return error;
len = PAGE_ALIGN(len_in);
@@ -1207,14 +1182,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
return madvise_inject_error(behavior, start, start + len_in);
#endif
- write = madvise_need_mmap_write(behavior);
- if (write) {
- if (mmap_write_lock_killable(mm))
- return -EINTR;
- } else {
- mmap_read_lock(mm);
- }
-
/*
* If the interval [start,end) covers some unmapped address
* ranges, just ignore them, but return -ENOMEM at the end.
@@ -1224,7 +1191,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
if (vma && start > vma->vm_start)
prev = vma;
- blk_start_plug(&plug);
for (;;) {
/* Still start < end. */
error = -ENOMEM;
@@ -1260,15 +1226,72 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
vma = find_vma(mm, start);
}
out:
- blk_finish_plug(&plug);
- if (write)
- mmap_write_unlock(mm);
- else
- mmap_read_unlock(mm);
return error;
}
+static int
+madvise_start(struct mm_struct *mm, struct madvise_info behavior_info,
+ struct blk_plug *plug)
+{
+ if (!behavior_info.no_mmap_lock) {
+ if (behavior_info.need_mmap_read_only)
+ mmap_read_lock(mm);
+ else if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ }
+
+ blk_start_plug(plug);
+ return 0;
+}
+
+static void
+madvise_finish(struct mm_struct *mm, struct madvise_info behavior_info,
+ struct blk_plug *plug)
+{
+ blk_finish_plug(plug);
+
+ if (!behavior_info.no_mmap_lock) {
+ if (behavior_info.need_mmap_read_only)
+ mmap_read_unlock(mm);
+ else
+ mmap_write_unlock(mm);
+ }
+}
+
+static struct madvise_info madvise_behavior_info(int behavior)
+{
+ if (behavior >= ARRAY_SIZE(madvise_info) || behavior < 0) {
+ const struct madvise_info invalid = {0};
+ return invalid;
+ }
+
+ behavior = array_index_nospec(behavior, ARRAY_SIZE(madvise_info));
+ return madvise_info[behavior];
+}
+
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in,
+ int behavior)
+{
+ struct madvise_info behavior_info;
+ struct blk_plug plug;
+ int ret = -EINVAL;
+
+ behavior_info = madvise_behavior_info(behavior);
+
+ if (!behavior_info.behavior_valid)
+ return ret;
+
+ ret = madvise_start(mm, behavior_info, &plug);
+ if (ret != 0)
+ return ret;
+
+ ret = madvise_one_range(mm, start, len_in, behavior);
+
+ madvise_finish(mm, behavior_info, &plug);
+ return ret;
+}
+
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
return do_madvise(current->mm, start, len_in, behavior);
@@ -1286,6 +1309,8 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
struct mm_struct *mm;
size_t total_len;
unsigned int f_flags;
+ struct madvise_info behavior_info;
+ struct blk_plug plug;
if (flags != 0) {
ret = -EINVAL;
@@ -1308,7 +1333,9 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto put_pid;
}
- if (!process_madvise_behavior_valid(&behavior)) {
+ behavior_info = madvise_behavior_info(behavior);
+
+ if (!behavior_info.process_behavior_valid) {
ret = -EINVAL;
goto release_task;
}
@@ -1331,15 +1358,21 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
total_len = iov_iter_count(&iter);
+ ret = madvise_start(mm, behavior_info, &plug);
+ if (ret != 0)
+ goto release_mm;
+
while (iov_iter_count(&iter)) {
iovec = iov_iter_iovec(&iter);
- ret = do_madvise(mm, (unsigned long)iovec.iov_base,
- iovec.iov_len, behavior);
+ ret = madvise_one_range(mm, (unsigned long)iovec.iov_base,
+ iovec.iov_len, behavior);
if (ret < 0)
break;
iov_iter_advance(&iter, iovec.iov_len);
}
+ madvise_finish(mm, behavior_info, &plug);
+
if (ret == 0)
ret = total_len - iov_iter_count(&iter);
--
2.25.1
next prev parent reply other threads:[~2021-09-26 23:44 UTC|newest]
Thread overview: 43+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-09-26 16:12 [RFC PATCH 0/8] mm/madvise: support process_madvise(MADV_DONTNEED) Nadav Amit
2021-09-26 16:12 ` [RFC PATCH 1/8] mm/madvise: propagate vma->vm_end changes Nadav Amit
2021-09-27 9:08 ` Kirill A. Shutemov
2021-09-27 10:11 ` Nadav Amit
2021-09-27 11:55 ` Kirill A. Shutemov
2021-09-27 12:33 ` Nadav Amit
2021-09-27 12:45 ` Kirill A. Shutemov
2021-09-27 12:59 ` Nadav Amit
2021-09-26 16:12 ` [RFC PATCH 2/8] mm/madvise: remove unnecessary check on madvise_dontneed_free() Nadav Amit
2021-09-27 9:11 ` Kirill A. Shutemov
2021-09-27 11:05 ` Nadav Amit
2021-09-27 12:19 ` Kirill A. Shutemov
2021-09-27 12:52 ` Nadav Amit
2021-09-26 16:12 ` [RFC PATCH 3/8] mm/madvise: remove unnecessary checks on madvise_free_single_vma() Nadav Amit
2021-09-27 9:17 ` Kirill A. Shutemov
2021-09-27 9:24 ` Kirill A. Shutemov
2021-09-26 16:12 ` [RFC PATCH 4/8] mm/madvise: define madvise behavior in a struct Nadav Amit
2021-09-27 9:31 ` Kirill A. Shutemov
2021-09-27 10:31 ` Nadav Amit
2021-09-27 12:14 ` Kirill A. Shutemov
2021-09-27 20:36 ` Nadav Amit
2021-09-26 16:12 ` Nadav Amit [this message]
2021-09-26 16:12 ` [RFC PATCH 6/8] mm/madvise: more aggressive TLB batching Nadav Amit
2021-09-26 16:12 ` [RFC PATCH 7/8] mm/madvise: deduplicate code in madvise_dontneed_free() Nadav Amit
2021-09-26 16:12 ` [RFC PATCH 8/8] mm/madvise: process_madvise(MADV_DONTNEED) Nadav Amit
2021-09-27 9:24 ` [RFC PATCH 0/8] mm/madvise: support process_madvise(MADV_DONTNEED) David Hildenbrand
2021-09-27 10:41 ` Nadav Amit
2021-09-27 10:58 ` David Hildenbrand
2021-09-27 12:00 ` Nadav Amit
2021-09-27 12:16 ` Michal Hocko
2021-09-27 19:12 ` Nadav Amit
2021-09-29 7:52 ` Michal Hocko
2021-09-29 18:31 ` Nadav Amit
2021-10-12 23:14 ` Peter Xu
2021-10-13 15:47 ` Nadav Amit
2021-10-13 23:09 ` Peter Xu
2021-09-27 17:05 ` David Hildenbrand
2021-09-27 19:59 ` Nadav Amit
2021-09-28 8:53 ` David Hildenbrand
2021-09-28 22:56 ` Nadav Amit
2021-10-04 17:58 ` David Hildenbrand
2021-10-07 16:19 ` Nadav Amit
2021-10-07 16:46 ` David Hildenbrand
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210926161259.238054-6-namit@vmware.com \
--to=nadav.amit@gmail.com \
--cc=aarcange@redhat.com \
--cc=akpm@linux-foundation.org \
--cc=ccross@google.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=minchan@kernel.org \
--cc=namit@vmware.com \
--cc=peterx@redhat.com \
--cc=rppt@linux.vnet.ibm.com \
--cc=surenb@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.