linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Linus Torvalds <torvalds@linux-foundation.org>
To: Kiryl Shutsemau <kirill@shutemov.name>
Cc: Matthew Wilcox <willy@infradead.org>,
	Luis Chamberlain <mcgrof@kernel.org>,
	 Linux-MM <linux-mm@kvack.org>,
	linux-fsdevel@vger.kernel.org
Subject: Re: Optimizing small reads
Date: Thu, 9 Oct 2025 10:29:12 -0700	[thread overview]
Message-ID: <CAHk-=wi1rrcijcD0i7V7JD6bLL-yKHUX-hcxtLx=BUd34phdug@mail.gmail.com> (raw)
In-Reply-To: <nhrb37zzltn5hi3h5phwprtmkj2z2wb4gchvp725bwcnsgvjyf@eohezc2gouwr>

[-- Attachment #1: Type: text/plain, Size: 1429 bytes --]

On Thu, 9 Oct 2025 at 09:22, Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> Objtool is not happy about calling random stuff within UACCESS. I
> ignored it for now.

Yeah, that needs to be done inside the other stuff - including, very
much, the folio lookup.

> I am not sure if I use user_access_begin()/_end() correctly. Let me know
> if I misunderstood or misimplemented your idea.

Close. Except I'd have gotten rid of the iov stuff by making the inner
helper just get a 'void __user *' pointer and a length, and then
updating the iov state outside that helper.

> This patch brings 4k reads from 512k files to ~60GiB/s. Making the
> buffer 4k, brings it ~95GiB/s (baseline is 100GiB/s).

Note that right now, 'unsafe_copy_to_user()' is a horrible thing. It's
almost entirely unoptimized, see the hacky unsafe_copy_loop
implementation in <asm/uaccess.h>.

Because before this code, it was only used for readdir() to copy
individual filenames, I think.

Anyway, I'd have organized things a bit differently. Incremental
UNTESTED patch attached.

objtool still complains about SMAP issues, because
memcpy_from_file_folio() ends up resulting in an external call to
memcpy. Not great.

I don't love how complicated this all got, and even with your bigger
buffer it's slower than the baseline.

So honestly I'd be inclined to go back to "just deal with the
trivially small reads", and scratch this extra complexity.

       Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 3464 bytes --]

 mm/filemap.c | 81 +++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 28 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 13c5de94c884..64def0dd3b97 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2697,7 +2697,41 @@ static void filemap_end_dropbehind_read(struct folio *folio)
 	}
 }
 
-static bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
+static size_t inner_read_loop(struct kiocb *iocb, struct folio *folio,
+				void __user *dst, size_t dst_size,
+				char *buffer, size_t buffer_size,
+				struct address_space *mapping, unsigned int seq)
+{
+	size_t read = 0;
+
+	if (can_do_masked_user_access())
+		dst = masked_user_access_begin(dst);
+	else if (!user_access_begin(dst, dst_size))
+		return 0;
+
+	do {
+		size_t to_read = min(dst_size, buffer_size);
+
+		to_read = memcpy_from_file_folio(buffer, folio, iocb->ki_pos, to_read);
+
+		/* Give up and go to slow path if raced with page_cache_delete() */
+		if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
+			break;
+
+		unsafe_copy_to_user(dst, buffer, to_read, Efault);
+
+		dst += read;
+		dst_size -= read;
+
+		iocb->ki_pos += read;
+	} while (dst_size && iocb->ki_pos % folio_size(folio));
+
+Efault:
+	user_access_end();
+	return read;
+}
+
+static bool noinline filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
 			      char *buffer, size_t buffer_size,
 			      ssize_t *already_read)
 {
@@ -2719,14 +2753,12 @@ static bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
 	if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
 		return false;
 
-	if (!user_access_begin(iter->ubuf + iter->iov_offset, iter->count))
-		return false;
-
 	rcu_read_lock();
 	pagefault_disable();
 
 	do {
 		size_t to_read, read;
+		void __user *dst;
 		XA_STATE(xas, &mapping->i_pages, iocb->ki_pos >> PAGE_SHIFT);
 
 		xas_reset(&xas);
@@ -2750,34 +2782,27 @@ static bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
 		/* i_size check must be after folio_test_uptodate() */
 		file_size = i_size_read(mapping->host);
 
-		do {
-			if (unlikely(iocb->ki_pos >= file_size))
-				goto out;
+		if (unlikely(iocb->ki_pos >= file_size))
+			break;
+		file_size -= iocb->ki_pos;
+		to_read = iov_iter_count(iter);
+		if (to_read > file_size)
+			to_read = file_size;
 
-			to_read = min(iov_iter_count(iter), buffer_size);
-			if (to_read > file_size - iocb->ki_pos)
-				to_read = file_size - iocb->ki_pos;
-
-			read = memcpy_from_file_folio(buffer, folio, iocb->ki_pos, to_read);
-
-			/* Give up and go to slow path if raced with page_cache_delete() */
-			if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
-				goto out;
-
-			unsafe_copy_to_user(iter->ubuf + iter->iov_offset, buffer,
-					    read, out);
-
-			iter->iov_offset += read;
-			iter->count -= read;
-			*already_read += read;
-			iocb->ki_pos += read;
-			last_pos = iocb->ki_pos;
-		} while (iov_iter_count(iter) && iocb->ki_pos % folio_size(folio));
+		dst = iter->ubuf + iter->iov_offset;
+		read = inner_read_loop(iocb, folio,
+			dst, to_read, buffer, buffer_size,
+			mapping, seq);
+		if (!read)
+			break;
+		iter->iov_offset += read;
+		iter->count -= read;
+		*already_read += read;
+		last_pos = iocb->ki_pos;
 	} while (iov_iter_count(iter));
-out:
+
 	pagefault_enable();
 	rcu_read_unlock();
-	user_access_end();
 
 	file_accessed(iocb->ki_filp);
 	ra->prev_pos = last_pos;

  reply	other threads:[~2025-10-09 17:29 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-03  2:18 Linus Torvalds
2025-10-03  3:32 ` Luis Chamberlain
2025-10-15 21:31   ` Swarna Prabhu
2025-10-03  9:55 ` Kiryl Shutsemau
2025-10-03 16:18   ` Linus Torvalds
2025-10-03 16:40     ` Linus Torvalds
2025-10-03 17:23       ` Kiryl Shutsemau
2025-10-03 17:49         ` Linus Torvalds
2025-10-06 11:44           ` Kiryl Shutsemau
2025-10-06 15:50             ` Linus Torvalds
2025-10-06 18:04               ` Kiryl Shutsemau
2025-10-06 18:14                 ` Linus Torvalds
2025-10-07 21:47                 ` Linus Torvalds
2025-10-07 22:35                   ` Linus Torvalds
2025-10-07 22:54                     ` Linus Torvalds
2025-10-07 23:30                       ` Linus Torvalds
2025-10-08 14:54                         ` Kiryl Shutsemau
2025-10-08 16:27                           ` Linus Torvalds
2025-10-08 17:03                             ` Linus Torvalds
2025-10-09 16:22                               ` Kiryl Shutsemau
2025-10-09 17:29                                 ` Linus Torvalds [this message]
2025-10-10 10:10                                   ` Kiryl Shutsemau
2025-10-10 17:51                                     ` Linus Torvalds
2025-10-13 15:35                                       ` Kiryl Shutsemau
2025-10-13 15:39                                         ` Kiryl Shutsemau
2025-10-13 16:19                                           ` Linus Torvalds
2025-10-14 12:58                                             ` Kiryl Shutsemau
2025-10-14 16:41                                               ` Linus Torvalds
2025-10-13 16:06                                         ` Linus Torvalds
2025-10-13 17:26                                         ` Theodore Ts'o
2025-10-14  3:20                                           ` Theodore Ts'o
2025-10-08 10:28                       ` Kiryl Shutsemau
2025-10-08 16:24                         ` Linus Torvalds

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAHk-=wi1rrcijcD0i7V7JD6bLL-yKHUX-hcxtLx=BUd34phdug@mail.gmail.com' \
    --to=torvalds@linux-foundation.org \
    --cc=kirill@shutemov.name \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mcgrof@kernel.org \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox