From: Kiryl Shutsemau <kirill@shutemov.name>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>,
	 Luis Chamberlain <mcgrof@kernel.org>,
	Linux-MM <linux-mm@kvack.org>,
	linux-fsdevel@vger.kernel.org
Subject: Re: Optimizing small reads
Date: Mon, 13 Oct 2025 16:39:51 +0100
Message-ID: <dz7pcqi5ytmb35r6kojuetdipjp7xdjlnyzcu5qb6d4cdo6vq5@3b62gfzcxszo>
In-Reply-To: <jzpbwmoygmjsltnqfdgnq4p75tg74bdamq3hne7t32mof4m5xo@lcw3afbr4daf>

On Mon, Oct 13, 2025 at 04:35:20PM +0100, Kiryl Shutsemau wrote:
> On Fri, Oct 10, 2025 at 10:51:40AM -0700, Linus Torvalds wrote:
> > Sounds like a plan?
> 
> The patch is below. Can I use your Signed-off-by for it?

And, for archiving purposes, here is the last version of the patch that
supports large blocks.

Do you think it makes sense to submit the unsafe_copy_to_user()
optimization as a standalone patch?

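For anyone wondering what this is aimed at: the fast path mostly helps
workloads that issue lots of small sequential read()s against a file
that is already in the page cache. A minimal userspace micro-benchmark
sketch, just to make the workload concrete (the path, read size, window
size and iteration count are all made up, and it assumes the file is at
least 1 MiB):

	/* Hypothetical micro-benchmark: many small sequential reads from a
	 * cached file. Everything here (path, sizes, iteration count) is
	 * illustrative only. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[32];			/* sub-page read size */
		const off_t window = 1 << 20;	/* cycle over the first 1 MiB */
		int fd = open("/tmp/testfile", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* The first pass over the window populates the page cache;
		 * later passes should stay on the cached-read path. */
		for (long i = 0; i < 10000000; i++) {
			off_t off = (i * (off_t)sizeof(buf)) % window;

			if (pread(fd, buf, sizeof(buf), off) < 0) {
				perror("pread");
				break;
			}
		}

		close(fd);
		return 0;
	}

Running something like this under perf stat before and after the patch
should show whether the fast path is being hit on an FSRM-capable
machine.
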
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 3a7755c1a441..48bd31bac20e 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -607,15 +607,24 @@ _label:									\
 		len -= sizeof(type);						\
 	}
 
-#define unsafe_copy_to_user(_dst,_src,_len,label)			\
-do {									\
-	char __user *__ucu_dst = (_dst);				\
-	const char *__ucu_src = (_src);					\
-	size_t __ucu_len = (_len);					\
-	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label);	\
-	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label);	\
-	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label);	\
-	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label);	\
+#define unsafe_copy_to_user(_dst,_src,_len,label)				\
+do {										\
+	char __user *__ucu_dst = (_dst);					\
+	const char *__ucu_src = (_src);						\
+	size_t __ucu_len = (_len);						\
+	if (cpu_feature_enabled(X86_FEATURE_FSRM)) {				\
+		asm goto(							\
+			     "1:	rep movsb\n"				\
+			     _ASM_EXTABLE_UA(1b, %l[label])			\
+			     : "+D" (__ucu_dst), "+S" (__ucu_src),		\
+			       "+c" (__ucu_len)					\
+			     : : "memory" : label);				\
+	} else {								\
+		unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label);  \
+		unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label);  \
+		unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label);  \
+		unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label);   \
+	}									\
 } while (0)
 
 #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
diff --git a/fs/inode.c b/fs/inode.c
index ec9339024ac3..52163d28d630 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -482,6 +482,8 @@ EXPORT_SYMBOL(inc_nlink);
 static void __address_space_init_once(struct address_space *mapping)
 {
 	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+	seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt,
+			       &mapping->i_pages.xa_lock);
 	init_rwsem(&mapping->i_mmap_rwsem);
 	INIT_LIST_HEAD(&mapping->i_private_list);
 	spin_lock_init(&mapping->i_private_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9e9d7c757efe..a900214f0f3a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -522,6 +522,7 @@ struct address_space {
 	struct list_head	i_private_list;
 	struct rw_semaphore	i_mmap_rwsem;
 	void *			i_private_data;
+	seqcount_spinlock_t	i_pages_delete_seqcnt;
 } __attribute__((aligned(sizeof(long)))) __randomize_layout;
 	/*
 	 * On most architectures that alignment is already the case; but
diff --git a/mm/filemap.c b/mm/filemap.c
index 751838ef05e5..08ace2cca696 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -138,8 +138,10 @@ static void page_cache_delete(struct address_space *mapping,
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 
+	write_seqcount_begin(&mapping->i_pages_delete_seqcnt);
 	xas_store(&xas, shadow);
 	xas_init_marks(&xas);
+	write_seqcount_end(&mapping->i_pages_delete_seqcnt);
 
 	folio->mapping = NULL;
 	/* Leave folio->index set: truncation lookup relies upon it */
@@ -2659,41 +2661,132 @@ static void filemap_end_dropbehind_read(struct folio *folio)
 	}
 }
 
-/**
- * filemap_read - Read data from the page cache.
- * @iocb: The iocb to read.
- * @iter: Destination for the data.
- * @already_read: Number of bytes already read by the caller.
- *
- * Copies data from the page cache.  If the data is not currently present,
- * uses the readahead and read_folio address_space operations to fetch it.
- *
- * Return: Total number of bytes copied, including those already read by
- * the caller.  If an error happens before any bytes are copied, returns
- * a negative error number.
- */
-ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
-		ssize_t already_read)
+static size_t inner_read_loop(struct kiocb *iocb, struct folio *folio,
+				void __user *dst, size_t dst_size,
+				char *buffer, size_t buffer_size,
+				struct address_space *mapping, unsigned int seq)
+{
+	size_t read = 0;
+
+	if (can_do_masked_user_access())
+		dst = masked_user_access_begin(dst);
+	else if (!user_access_begin(dst, dst_size))
+		return 0;
+
+	do {
+		size_t to_read = min(dst_size, buffer_size);
+
+		to_read = memcpy_from_file_folio(buffer, folio, iocb->ki_pos, to_read);
+
+		/* Give up and go to slow path if raced with page_cache_delete() */
+		if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
+			break;
+
+		unsafe_copy_to_user(dst, buffer, to_read, Efault);
+
+		dst += to_read;
+		dst_size -= to_read;
+
+		iocb->ki_pos += to_read;
+		read += to_read;
+	} while (dst_size && iocb->ki_pos % folio_size(folio));
+
+Efault:
+	user_access_end();
+	return read;
+}
+
+static noinline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter,
+			      char *buffer, size_t buffer_size,
+			      ssize_t *already_read)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct file_ra_state *ra = &iocb->ki_filp->f_ra;
+	loff_t last_pos = ra->prev_pos;
+	struct folio *folio;
+	loff_t file_size;
+	unsigned int seq;
+
+	/* Don't bother with flush_dcache_folio() */
+	if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE)
+		return false;
+
+	if (!iter_is_ubuf(iter))
+		return false;
+
+	/* Give up and go to slow path if raced with page_cache_delete() */
+	if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
+		return false;
+
+	rcu_read_lock();
+	pagefault_disable();
+
+	do {
+		size_t to_read, read;
+		void __user *dst;
+		XA_STATE(xas, &mapping->i_pages, iocb->ki_pos >> PAGE_SHIFT);
+
+		xas_reset(&xas);
+		folio = xas_load(&xas);
+		if (xas_retry(&xas, folio))
+			break;
+
+		if (!folio || xa_is_value(folio))
+			break;
+
+		if (!folio_test_uptodate(folio))
+			break;
+
+		/* No fast-case if readahead is supposed to be started */
+		if (folio_test_readahead(folio))
+			break;
+		/* .. or if we would need to mark the folio accessed */
+		if (!folio_test_referenced(folio))
+			break;
+
+		/* i_size check must be after folio_test_uptodate() */
+		file_size = i_size_read(mapping->host);
+
+		if (unlikely(iocb->ki_pos >= file_size))
+			break;
+		file_size -= iocb->ki_pos;
+		to_read = iov_iter_count(iter);
+		if (to_read > file_size)
+			to_read = file_size;
+
+		dst = iter->ubuf + iter->iov_offset;
+		read = inner_read_loop(iocb, folio,
+			dst, to_read, buffer, buffer_size,
+			mapping, seq);
+		if (!read)
+			break;
+		iter->iov_offset += read;
+		iter->count -= read;
+		*already_read += read;
+		last_pos = iocb->ki_pos;
+	} while (iov_iter_count(iter));
+
+	pagefault_enable();
+	rcu_read_unlock();
+
+	file_accessed(iocb->ki_filp);
+	ra->prev_pos = last_pos;
+	return !iov_iter_count(iter);
+}
+
+static ssize_t filemap_read_slow(struct kiocb *iocb, struct iov_iter *iter,
+			      struct folio_batch *fbatch, ssize_t already_read)
 {
 	struct file *filp = iocb->ki_filp;
 	struct file_ra_state *ra = &filp->f_ra;
 	struct address_space *mapping = filp->f_mapping;
 	struct inode *inode = mapping->host;
-	struct folio_batch fbatch;
 	int i, error = 0;
 	bool writably_mapped;
 	loff_t isize, end_offset;
 	loff_t last_pos = ra->prev_pos;
 
-	if (unlikely(iocb->ki_pos < 0))
-		return -EINVAL;
-	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
-		return 0;
-	if (unlikely(!iov_iter_count(iter)))
-		return 0;
-
-	iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
-	folio_batch_init(&fbatch);
+	folio_batch_init(fbatch);
 
 	do {
 		cond_resched();
@@ -2709,7 +2802,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 		if (unlikely(iocb->ki_pos >= i_size_read(inode)))
 			break;
 
-		error = filemap_get_pages(iocb, iter->count, &fbatch, false);
+		error = filemap_get_pages(iocb, iter->count, fbatch, false);
 		if (error < 0)
 			break;
 
@@ -2737,11 +2830,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 		 * mark it as accessed the first time.
 		 */
 		if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
-				    fbatch.folios[0]))
-			folio_mark_accessed(fbatch.folios[0]);
+				    fbatch->folios[0]))
+			folio_mark_accessed(fbatch->folios[0]);
 
-		for (i = 0; i < folio_batch_count(&fbatch); i++) {
-			struct folio *folio = fbatch.folios[i];
+		for (i = 0; i < folio_batch_count(fbatch); i++) {
+			struct folio *folio = fbatch->folios[i];
 			size_t fsize = folio_size(folio);
 			size_t offset = iocb->ki_pos & (fsize - 1);
 			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
@@ -2772,19 +2865,57 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 			}
 		}
 put_folios:
-		for (i = 0; i < folio_batch_count(&fbatch); i++) {
-			struct folio *folio = fbatch.folios[i];
+		for (i = 0; i < folio_batch_count(fbatch); i++) {
+			struct folio *folio = fbatch->folios[i];
 
 			filemap_end_dropbehind_read(folio);
 			folio_put(folio);
 		}
-		folio_batch_init(&fbatch);
+		folio_batch_init(fbatch);
 	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
 
 	file_accessed(filp);
 	ra->prev_pos = last_pos;
 	return already_read ? already_read : error;
 }
+
+/**
+ * filemap_read - Read data from the page cache.
+ * @iocb: The iocb to read.
+ * @iter: Destination for the data.
+ * @already_read: Number of bytes already read by the caller.
+ *
+ * Copies data from the page cache.  If the data is not currently present,
+ * uses the readahead and read_folio address_space operations to fetch it.
+ *
+ * Return: Total number of bytes copied, including those already read by
+ * the caller.  If an error happens before any bytes are copied, returns
+ * a negative error number.
+ */
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
+		ssize_t already_read)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	union {
+		struct folio_batch fbatch;
+		__DECLARE_FLEX_ARRAY(char, buffer);
+		//char __buffer[4096];
+	} area __uninitialized;
+
+	if (unlikely(iocb->ki_pos < 0))
+		return -EINVAL;
+	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
+		return 0;
+	if (unlikely(!iov_iter_count(iter)))
+		return 0;
+
+	iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
+
+	if (filemap_read_fast(iocb, iter, area.buffer, sizeof(area), &already_read))
+		return already_read;
+
+	return filemap_read_slow(iocb, iter, &area.fbatch, already_read);
+}
 EXPORT_SYMBOL_GPL(filemap_read);
 
 int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
-- 
  Kiryl Shutsemau / Kirill A. Shutemov

