linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Benjamin LaHaise <bcrl@kvack.org>
To: linux-aio@kvack.org, linux-fsdevel@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-api@vger.kernel.org,
	linux-mm@kvack.org
Cc: Alexander Viro <viro@zeniv.linux.org.uk>,
	Andrew Morton <akpm@linux-foundation.org>,
	Linus Torvalds <torvalds@linux-foundation.org>
Subject: [PATCH 12/13] aio: add support for aio readahead
Date: Mon, 11 Jan 2016 17:07:58 -0500	[thread overview]
Message-ID: <130a393a298209223b5ed3c3d3fe9023e56eddcb.1452549431.git.bcrl@kvack.org> (raw)
In-Reply-To: <cover.1452549431.git.bcrl@kvack.org>

Introduce an asynchronous operation to populate the page cache with
pages at a given offset and length.  This operation is conceptually
similar to performing an asynchronous read except that it does not
actually copy the data from the page cache into userspace; rather, it
performs readahead and notifies userspace when all pages have been read.

The motivation for this came about as a result of investigation into a
performance degradation when reading from disk.  In the case of a heavily
loaded system, the copy_to_user() performed for an asynchronous read was
temporally quite distant from when the data was actually used.  By only
reading the data into the kernel's page cache, the cache pollution
caused by copying the data into userspace is avoided, and overall system
performance is improved.

Signed-off-by: Benjamin LaHaise <ben.lahaise@solacesystems.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
---
 fs/aio.c                     | 141 +++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/aio_abi.h |   1 +
 2 files changed, 142 insertions(+)

diff --git a/fs/aio.c b/fs/aio.c
index 3a70492..5cb3d74 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -42,6 +42,7 @@
 #include <linux/mount.h>
 #include <linux/fdtable.h>
 #include <linux/fs_struct.h>
+#include <../mm/internal.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -238,6 +239,8 @@ long aio_do_openat(int fd, const char *filename, int flags, int mode);
 long aio_do_unlinkat(int fd, const char *filename, int flags, int mode);
 long aio_foo_at(struct aio_kiocb *req, do_foo_at_t do_foo_at);
 
+long aio_readahead(struct aio_kiocb *iocb, unsigned long len);
+
 static __always_inline bool aio_may_use_threads(void)
 {
 #if IS_ENABLED(CONFIG_AIO_THREAD)
@@ -1812,6 +1815,137 @@ long aio_foo_at(struct aio_kiocb *req, do_foo_at_t do_foo_at)
 				     AIO_THREAD_NEED_FILES |
 				     AIO_THREAD_NEED_CRED);
 }
+
+static int aio_ra_filler(void *data, struct page *page)
+{
+	struct file *file = data;
+
+	return file->f_mapping->a_ops->readpage(file, page);
+}
+
+static long aio_ra_wait_on_pages(struct file *file, pgoff_t start,
+				 unsigned long nr)
+{
+	struct address_space *mapping = file->f_mapping;
+	unsigned long i;
+
+	/* Wait on pages starting at the end to hopefully avoid too many
+	 * wakeups.
+	 */
+	for (i = nr; i-- > 0; ) {
+		pgoff_t index = start + i;
+		struct page *page;
+
+		/* First do the quick check to see if the page is present and
+		 * uptodate.
+		 */
+		rcu_read_lock();
+		page = radix_tree_lookup(&mapping->page_tree, index);
+		rcu_read_unlock();
+
+		if (page && !radix_tree_exceptional_entry(page) &&
+		    PageUptodate(page)) {
+			continue;
+		}
+
+		page = read_cache_page(mapping, index, aio_ra_filler, file);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+		page_cache_release(page);
+	}
+	return 0;
+}
+
+static long aio_thread_op_readahead(struct aio_kiocb *iocb)
+{
+	pgoff_t start, end, nr, offset;
+	long ret = 0;
+
+	start = iocb->common.ki_pos >> PAGE_CACHE_SHIFT;
+	end = (iocb->common.ki_pos + iocb->ki_data - 1) >> PAGE_CACHE_SHIFT;
+	nr = end - start + 1;
+
+	for (offset = 0; offset < nr; ) {
+		pgoff_t chunk = nr - offset;
+		unsigned long max_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
+
+		if (chunk > max_chunk)
+			chunk = max_chunk;
+
+		ret = __do_page_cache_readahead(iocb->common.ki_filp->f_mapping,
+						iocb->common.ki_filp,
+						start + offset, chunk, 0, 1);
+		if (ret <= 0)
+			break;
+		offset += ret;
+	}
+
+	if (!offset && ret < 0)
+		return ret;
+
+	if (offset > 0) {
+		ret = aio_ra_wait_on_pages(iocb->common.ki_filp, start, offset);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (offset == nr)
+		return iocb->ki_data;
+	if (offset > 0)
+		return ((start + offset) << PAGE_CACHE_SHIFT) -
+			iocb->common.ki_pos;
+	return 0;
+}
+
+long aio_readahead(struct aio_kiocb *iocb, unsigned long len)
+{
+	struct address_space *mapping = iocb->common.ki_filp->f_mapping;
+	pgoff_t index, end;
+	loff_t epos, isize;
+	int do_io = 0;
+
+	if (!mapping || !mapping->a_ops)
+		return -EBADF;
+	if (!mapping->a_ops->readpage && !mapping->a_ops->readpages)
+		return -EBADF;
+	if (!len)
+		return 0;
+
+	epos = iocb->common.ki_pos + len;
+	if (epos < 0)
+		return -EINVAL;
+	isize = i_size_read(mapping->host);
+	if (isize < epos) {
+		epos = isize - iocb->common.ki_pos;
+		if (epos <= 0)
+			return 0;
+		if ((unsigned long)epos != epos)
+			return -EINVAL;
+		len = epos;
+	}
+
+	index = iocb->common.ki_pos >> PAGE_CACHE_SHIFT;
+	end = (iocb->common.ki_pos + len - 1) >> PAGE_CACHE_SHIFT;
+	iocb->ki_data = len;
+	if (end < index)
+		return -EINVAL;
+
+	do {
+		struct page *page;
+
+		rcu_read_lock();
+		page = radix_tree_lookup(&mapping->page_tree, index);
+		rcu_read_unlock();
+
+		if (!page || radix_tree_exceptional_entry(page) ||
+		    !PageUptodate(page))
+			do_io = 1;
+	} while (!do_io && (index++ < end));
+
+	if (do_io)
+		return aio_thread_queue_iocb(iocb, aio_thread_op_readahead, 0);
+	return len;
+}
 #endif /* IS_ENABLED(CONFIG_AIO_THREAD) */
 
 /*
@@ -1922,6 +2056,13 @@ rw_common:
 			ret = aio_foo_at(req, aio_do_unlinkat);
 		break;
 
+	case IOCB_CMD_READAHEAD:
+		if (user_iocb->aio_buf)
+			return -EINVAL;
+		if (aio_may_use_threads())
+			ret = aio_readahead(req, user_iocb->aio_nbytes);
+		break;
+
 	default:
 		pr_debug("EINVAL: no operation provided\n");
 		return -EINVAL;
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index 63a0d41..4def682 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -47,6 +47,7 @@ enum {
 
 	IOCB_CMD_OPENAT = 9,
 	IOCB_CMD_UNLINKAT = 10,
+	IOCB_CMD_READAHEAD = 12,
 };
 
 /*
-- 
2.5.0


-- 
"Thought is the essence of where you are now."

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2016-01-11 22:07 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-01-11 22:06 [PATCH 00/13] aio: thread (work queue) based aio and new aio functionality Benjamin LaHaise
2016-01-11 22:06 ` [PATCH 01/13] signals: distinguish signals sent due to i/o via io_send_sig() Benjamin LaHaise
2016-01-11 22:06 ` [PATCH 02/13] aio: add aio_get_mm() helper Benjamin LaHaise
2016-01-11 22:06 ` [PATCH 03/13] aio: for async operations, make the iter argument persistent Benjamin LaHaise
2016-01-11 22:07 ` [PATCH 04/13] signals: add and use aio_get_task() to direct signals sent via io_send_sig() Benjamin LaHaise
2016-01-11 22:07 ` [PATCH 05/13] fs: make do_loop_readv_writev() non-static Benjamin LaHaise
2016-01-11 22:07 ` [PATCH 06/13] aio: add queue_work() based threaded aio support Benjamin LaHaise
2016-01-11 22:07 ` [PATCH 07/13] aio: enabled thread based async fsync Benjamin LaHaise
2016-01-12  1:11   ` Dave Chinner
2016-01-12  1:20     ` Linus Torvalds
2016-01-12  2:25       ` Dave Chinner
2016-01-12  2:38         ` Linus Torvalds
2016-01-12  3:37           ` Dave Chinner
2016-01-12  4:03             ` Linus Torvalds
2016-01-12  4:48               ` Linus Torvalds
2016-01-12 22:50                 ` Benjamin LaHaise
2016-01-15 20:21                 ` Benjamin LaHaise
2016-01-20  3:59                   ` Linus Torvalds
2016-01-20  5:02                     ` Theodore Ts'o
2016-01-20 19:59                     ` Dave Chinner
2016-01-20 20:29                       ` Linus Torvalds
2016-01-20 20:44                         ` Benjamin LaHaise
2016-01-20 21:45                           ` Dave Chinner
2016-01-20 21:56                             ` Benjamin LaHaise
2016-01-23  4:24                               ` Dave Chinner
2016-01-23  4:50                                 ` Benjamin LaHaise
2016-01-23 22:22                                   ` Dave Chinner
2016-01-20 23:07                             ` Linus Torvalds
2016-01-23  4:39                               ` Dave Chinner
2016-03-14 17:17                                 ` aio openat " Benjamin LaHaise
2016-03-20  1:20                                   ` Linus Torvalds
2016-03-20  1:26                                     ` Al Viro
2016-03-20  1:45                                       ` Linus Torvalds
2016-03-20  1:55                                         ` Al Viro
2016-03-20  2:03                                           ` Linus Torvalds
2016-01-20 21:57                         ` Dave Chinner
2016-01-22 15:41                     ` Andres Freund
2016-01-12 22:59               ` Andy Lutomirski
2016-01-14  9:19       ` Paolo Bonzini
2016-01-12  1:30     ` Benjamin LaHaise
2016-01-22 15:31     ` Andres Freund
2016-01-11 22:07 ` [PATCH 08/13] aio: add support for aio poll via aio thread helper Benjamin LaHaise
2016-01-11 22:07 ` [PATCH 09/13] aio: add support for async openat() Benjamin LaHaise
2016-01-12  0:22   ` Linus Torvalds
2016-01-12  1:17     ` Benjamin LaHaise
2016-01-12  1:45     ` Chris Mason
2016-01-12  9:53     ` Ingo Molnar
2016-01-11 22:07 ` [PATCH 10/13] aio: add async unlinkat functionality Benjamin LaHaise
2016-01-11 22:07 ` [PATCH 11/13] mm: enable __do_page_cache_readahead() to include present pages Benjamin LaHaise
2016-01-11 22:07 ` Benjamin LaHaise [this message]
2016-01-11 22:08 ` [PATCH 13/13] aio: add support for aio renameat operation Benjamin LaHaise

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=130a393a298209223b5ed3c3d3fe9023e56eddcb.1452549431.git.bcrl@kvack.org \
    --to=bcrl@kvack.org \
    --cc=akpm@linux-foundation.org \
    --cc=linux-aio@kvack.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox