From: NeilBrown <neilb@suse.de>
To: Trond Myklebust <trond.myklebust@hammerspace.com>,
Anna Schumaker <anna.schumaker@netapp.com>,
Chuck Lever <chuck.lever@oracle.com>,
Andrew Morton <akpm@linux-foundation.org>,
Mel Gorman <mgorman@suse.de>
Cc: linux-nfs@vger.kernel.org, linux-mm@kvack.org,
linux-kernel@vger.kernel.org
Subject: [PATCH 13/13] MM: use AIO for DIO writes to swap
Date: Tue, 16 Nov 2021 13:44:04 +1100 [thread overview]
Message-ID: <163703064458.25805.5272714590032323298.stgit@noble.brown> (raw)
In-Reply-To: <163702956672.25805.16457749992977493579.stgit@noble.brown>
When swap-out goes through the filesystem (as with NFS), we currently
perform synchronous writes with ->direct_IO. This serializes swap
writes and causes kswapd to block waiting for writes to complete. This
is quite different to swap-out to a block device (always async), and
possibly hurts liveness.
So switch to AIO writes. If the necessary kiocb structure cannot be
allocated, fall back to sync writes using a kiocb on the stack.
Signed-off-by: NeilBrown <neilb@suse.de>
---
mm/page_io.c | 136 ++++++++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 103 insertions(+), 33 deletions(-)
diff --git a/mm/page_io.c b/mm/page_io.c
index 30d613881995..59a2d49e53c3 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -25,6 +25,7 @@
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
+#include "internal.h"
void end_swap_bio_write(struct bio *bio)
{
@@ -288,8 +289,70 @@ struct swap_iocb {
struct bio_vec bvec[SWAP_CLUSTER_MAX];
struct work_struct work;
int pages;
+ bool on_stack;
};
+static void sio_aio_complete(struct kiocb *iocb, long ret)
+{
+ struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
+ int p;
+
+ if (ret != PAGE_SIZE * sio->pages) {
+ /*
+ * In the case of swap-over-nfs, this can be a
+ * temporary failure if the system has limited
+ * memory for allocating transmit buffers.
+ * Mark the page dirty and avoid
+ * rotate_reclaimable_page but rate-limit the
+ * messages but do not flag PageError like
+ * the normal direct-to-bio case as it could
+ * be temporary.
+ */
+ pr_err_ratelimited("Write error on dio swapfile (%llu - %d pages)\n",
+ page_file_offset(sio->bvec[0].bv_page),
+ sio->pages);
+ for (p = 0; p < sio->pages; p++) {
+ set_page_dirty(sio->bvec[p].bv_page);
+ ClearPageReclaim(sio->bvec[p].bv_page);
+ }
+ }
+ for (p = 0; p < sio->pages; p++)
+ end_page_writeback(sio->bvec[p].bv_page);
+ if (!sio->on_stack)
+ kfree(sio);
+}
+
+static void sio_aio_unplug(struct blk_plug_cb *cb, bool from_schedule);
+
+static void sio_write_unplug_worker(struct work_struct *work)
+{
+ struct swap_iocb *sio = container_of(work, struct swap_iocb, work);
+ sio_aio_unplug(&sio->cb, 0);
+}
+
+static void sio_aio_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct swap_iocb *sio = container_of(cb, struct swap_iocb, cb);
+ struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+ struct iov_iter from;
+ int ret;
+ unsigned int noreclaim_flag;
+
+ if (from_schedule) {
+ INIT_WORK(&sio->work, sio_write_unplug_worker);
+ queue_work(mm_percpu_wq, &sio->work);
+ return;
+ }
+
+ noreclaim_flag = memalloc_noreclaim_save();
+ iov_iter_bvec(&from, WRITE, sio->bvec,
+ sio->pages, PAGE_SIZE * sio->pages);
+ ret = mapping->a_ops->direct_IO(&sio->iocb, &from);
+ memalloc_noreclaim_restore(noreclaim_flag);
+ if (ret != -EIOCBQUEUED)
+ sio_aio_complete(&sio->iocb, ret);
+}
+
int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func)
{
@@ -299,44 +362,51 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
if (data_race(sis->flags & SWP_FS_OPS)) {
- struct kiocb kiocb;
+ struct swap_iocb *sio, sio_on_stack;
+ struct blk_plug_cb *cb;
struct file *swap_file = sis->swap_file;
- struct address_space *mapping = swap_file->f_mapping;
- struct bio_vec bv = {
- .bv_page = page,
- .bv_len = PAGE_SIZE,
- .bv_offset = 0
- };
- struct iov_iter from;
-
- iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
- init_sync_kiocb(&kiocb, swap_file);
- kiocb.ki_pos = page_file_offset(page);
+ loff_t pos = page_file_offset(page);
+ int p;
set_page_writeback(page);
unlock_page(page);
- ret = mapping->a_ops->direct_IO(&kiocb, &from);
- if (ret == PAGE_SIZE) {
- count_vm_event(PSWPOUT);
- ret = 0;
- } else {
- /*
- * In the case of swap-over-nfs, this can be a
- * temporary failure if the system has limited
- * memory for allocating transmit buffers.
- * Mark the page dirty and avoid
- * folio_rotate_reclaimable but rate-limit the
- * messages but do not flag PageError like
- * the normal direct-to-bio case as it could
- * be temporary.
- */
- set_page_dirty(page);
- ClearPageReclaim(page);
- pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
- page_file_offset(page));
+ cb = blk_check_plugged(sio_aio_unplug, swap_file, sizeof(*sio));
+ sio = container_of(cb, struct swap_iocb, cb);
+ if (cb && sio->pages &&
+ sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
+ /* Not contiguous - hide this sio from lookup */
+ cb->data = NULL;
+ cb = blk_check_plugged(sio_aio_unplug, swap_file,
+ sizeof(*sio));
+ sio = container_of(cb, struct swap_iocb, cb);
}
- end_page_writeback(page);
- return ret;
+ if (!cb) {
+ sio = &sio_on_stack;
+ sio->pages = 0;
+ sio->on_stack = true;
+ }
+
+ if (sio->pages == 0) {
+ init_sync_kiocb(&sio->iocb, swap_file);
+ sio->iocb.ki_pos = pos;
+ if (sio != &sio_on_stack)
+ sio->iocb.ki_complete = sio_aio_complete;
+ }
+ p = sio->pages;
+ sio->bvec[p].bv_page = page;
+ sio->bvec[p].bv_len = PAGE_SIZE;
+ sio->bvec[p].bv_offset = 0;
+ p += 1;
+ sio->pages = p;
+ if (!cb)
+ sio_aio_unplug(&sio->cb, 0);
+ else if (p >= ARRAY_SIZE(sio->bvec))
+ /* Don't try to add to this */
+ cb->data = NULL;
+
+ count_vm_event(PSWPOUT);
+
+ return 0;
}
ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
next prev parent reply other threads:[~2021-11-16 2:47 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-11-16 2:44 [PATCH 00/13] Repair SWAP-over-NFS NeilBrown
2021-11-16 2:44 ` [PATCH 10/13] NFSv4: keep state manager thread active if swap is enabled NeilBrown
2021-11-16 2:44 ` [PATCH 12/13] MM: use AIO/DIO for reads from SWP_FS_OPS swap-space NeilBrown
2021-11-16 8:31 ` Christoph Hellwig
2021-11-16 21:46 ` NeilBrown
2021-11-16 2:44 ` [PATCH 09/13] SUNRPC: improve 'swap' handling: scheduling and PF_MEMALLOC NeilBrown
2021-11-16 2:44 ` [PATCH 04/13] SUNRPC/call_alloc: async tasks mustn't block waiting for memory NeilBrown
2021-11-16 2:44 ` NeilBrown [this message]
2021-11-16 2:44 ` [PATCH 05/13] SUNRPC/auth: " NeilBrown
2021-11-16 2:44 ` [PATCH 01/13] NFS: move generic_write_checks() call from nfs_file_direct_write() to nfs_file_write() NeilBrown
2021-11-16 2:44 ` [PATCH 08/13] NFS: discard NFS_RPC_SWAPFLAGS and RPC_TASK_ROOTCREDS NeilBrown
2021-11-16 2:44 ` [PATCH 03/13] MM: reclaim mustn't enter FS for swap-over-NFS NeilBrown
2021-11-16 8:32 ` Christoph Hellwig
2021-11-16 21:35 ` NeilBrown
2021-11-17 5:50 ` Christoph Hellwig
2021-11-18 1:43 ` kernel test robot
2021-11-16 2:44 ` [PATCH 06/13] SUNRPC/xprt: async tasks mustn't block waiting for memory NeilBrown
2021-11-16 2:44 ` [PATCH 07/13] SUNRPC: remove scheduling boost for "SWAPPER" tasks NeilBrown
2021-11-16 2:44 ` [PATCH 11/13] NFS: swap-out must always use STABLE writes NeilBrown
2021-11-16 2:44 ` [PATCH 02/13] NFS: do not take i_rwsem for swap IO NeilBrown
2021-11-16 7:52 ` Christoph Hellwig
2021-11-16 21:50 ` NeilBrown
2021-11-17 5:49 ` Christoph Hellwig
2021-11-16 3:29 ` [PATCH 00/13] Repair SWAP-over-NFS Matthew Wilcox
2021-11-16 3:55 ` NeilBrown
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=163703064458.25805.5272714590032323298.stgit@noble.brown \
--to=neilb@suse.de \
--cc=akpm@linux-foundation.org \
--cc=anna.schumaker@netapp.com \
--cc=chuck.lever@oracle.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-nfs@vger.kernel.org \
--cc=mgorman@suse.de \
--cc=trond.myklebust@hammerspace.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox