* [PATCH 1/8] block: add BIO_COMPLETE_IN_TASK for task-context completion
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 2/8] iomap: use BIO_COMPLETE_IN_TASK for dropbehind writeback Christoph Hellwig
` (6 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
From: Tal Zussman <tz2294@columbia.edu>
Some bio completion handlers need to run in task context, but bio_endio()
can be called from IRQ context (e.g. buffer_head writeback). Add a
BIO_COMPLETE_IN_TASK flag that bio submitters can set to request
task-context completion of their bi_end_io callback.
When bio_endio() sees this flag and is running in non-task context, it
queues the bio to a per-cpu lockless list and schedules a delayed work
item to call bi_end_io() from task context. The delayed work uses a
1-jiffie delay to allow batches of completions to accumulate before
processing. A CPU hotplug dead callback drains any remaining bios from
the departing CPU's batch.
This will be used to enable RWF_DONTCACHE for block devices, and could
be used for other subsystems like fscrypt that need task-context bio
completion.
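For illustration, a minimal submitter-side sketch of how the flag is meant
to be used (my_end_io() and my_submit() are hypothetical; bio_alloc(),
bio_add_folio_nofail(), bio_set_flag() and submit_bio() are the existing
block APIs):

	static void my_end_io(struct bio *bio)
	{
		/*
		 * With BIO_COMPLETE_IN_TASK set this runs in task context,
		 * so sleeping locks or page cache invalidation are safe here.
		 */
		folio_end_writeback(bio_first_folio_all(bio));
		bio_put(bio);
	}

	static void my_submit(struct block_device *bdev, struct folio *folio,
			sector_t sector)
	{
		struct bio *bio = bio_alloc(bdev, 1, REQ_OP_WRITE, GFP_NOIO);

		bio->bi_iter.bi_sector = sector;
		bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
		bio->bi_end_io = my_end_io;
		bio_set_flag(bio, BIO_COMPLETE_IN_TASK);
		submit_bio(bio);
	}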
Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Tal Zussman <tz2294@columbia.edu>
---
block/bio.c | 83 ++++++++++++++++++++++++++++++++++++++-
include/linux/blk_types.h | 7 +++-
2 files changed, 88 insertions(+), 2 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 641ef0928d73..550eb770bfa6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -19,6 +19,7 @@
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
#include <linux/kmemleak.h>
+#include <linux/llist.h>
#include <trace/events/block.h>
#include "blk.h"
@@ -1716,6 +1717,51 @@ void bio_check_pages_dirty(struct bio *bio)
}
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
+struct bio_complete_batch {
+ struct llist_head list;
+ struct delayed_work work;
+ int cpu;
+};
+
+static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch);
+static struct workqueue_struct *bio_complete_wq;
+
+static void bio_complete_work_fn(struct work_struct *w)
+{
+ struct delayed_work *dw = to_delayed_work(w);
+ struct bio_complete_batch *batch =
+ container_of(dw, struct bio_complete_batch, work);
+ struct llist_node *node;
+ struct bio *bio, *next;
+
+ do {
+ node = llist_del_all(&batch->list);
+ if (!node)
+ break;
+
+ node = llist_reverse_order(node);
+ llist_for_each_entry_safe(bio, next, node, bi_llist)
+ bio->bi_end_io(bio);
+
+ if (need_resched()) {
+ if (!llist_empty(&batch->list))
+ mod_delayed_work_on(batch->cpu,
+ bio_complete_wq,
+ &batch->work, 0);
+ break;
+ }
+ } while (1);
+}
+
+static void bio_queue_completion(struct bio *bio)
+{
+ struct bio_complete_batch *batch = this_cpu_ptr(&bio_complete_batch);
+
+ if (llist_add(&bio->bi_llist, &batch->list))
+ mod_delayed_work_on(batch->cpu, bio_complete_wq,
+ &batch->work, 1);
+}
+
static inline bool bio_remaining_done(struct bio *bio)
{
/*
@@ -1790,7 +1836,9 @@ void bio_endio(struct bio *bio)
}
#endif
- if (bio->bi_end_io)
+ if (!in_task() && bio_flagged(bio, BIO_COMPLETE_IN_TASK))
+ bio_queue_completion(bio);
+ else if (bio->bi_end_io)
bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);
@@ -1976,6 +2024,24 @@ int bioset_init(struct bio_set *bs,
}
EXPORT_SYMBOL(bioset_init);
+/*
+ * Drain a dead CPU's deferred bio completions.
+ */
+static int bio_complete_batch_cpu_dead(unsigned int cpu)
+{
+ struct bio_complete_batch *batch =
+ per_cpu_ptr(&bio_complete_batch, cpu);
+ struct llist_node *node;
+ struct bio *bio, *next;
+
+ node = llist_del_all(&batch->list);
+ node = llist_reverse_order(node);
+ llist_for_each_entry_safe(bio, next, node, bi_llist)
+ bio->bi_end_io(bio);
+
+ return 0;
+}
+
static int __init init_bio(void)
{
int i;
@@ -1990,6 +2056,21 @@ static int __init init_bio(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}
+ for_each_possible_cpu(i) {
+ struct bio_complete_batch *batch =
+ per_cpu_ptr(&bio_complete_batch, i);
+
+ init_llist_head(&batch->list);
+ INIT_DELAYED_WORK(&batch->work, bio_complete_work_fn);
+ batch->cpu = i;
+ }
+
+ bio_complete_wq = alloc_workqueue("bio_complete", WQ_MEM_RECLAIM, 0);
+ if (!bio_complete_wq)
+ panic("bio: can't allocate bio_complete workqueue\n");
+
+ cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "block/bio:complete:dead",
+ NULL, bio_complete_batch_cpu_dead);
cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
bio_cpu_dead);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8808ee76e73c..0b55159d110d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -11,6 +11,7 @@
#include <linux/device.h>
#include <linux/ktime.h>
#include <linux/rw_hint.h>
+#include <linux/llist.h>
struct bio_set;
struct bio;
@@ -208,7 +209,10 @@ typedef unsigned int blk_qc_t;
* stacking drivers)
*/
struct bio {
- struct bio *bi_next; /* request queue link */
+ union {
+ struct bio *bi_next; /* request queue link */
+ struct llist_node bi_llist; /* deferred completion */
+ };
struct block_device *bi_bdev;
blk_opf_t bi_opf; /* bottom bits REQ_OP, top bits
* req_flags.
@@ -322,6 +326,7 @@ enum {
BIO_REMAPPED,
BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
+ BIO_COMPLETE_IN_TASK, /* complete bi_end_io() in task context */
BIO_FLAG_LAST
};
--
2.47.3
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 2/8] iomap: use BIO_COMPLETE_IN_TASK for dropbehind writeback
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
2026-04-09 16:02 ` [PATCH 1/8] block: add BIO_COMPLETE_IN_TASK for task-context completion Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 3/8] block: enable RWF_DONTCACHE for block devices Christoph Hellwig
` (5 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
From: Tal Zussman <tz2294@columbia.edu>
Set BIO_COMPLETE_IN_TASK on iomap writeback bios when a dropbehind folio
is added. This ensures that bi_end_io runs in task context, where
folio_end_dropbehind() can safely invalidate folios.
With the bio layer now handling task-context deferral generically,
IOMAP_IOEND_DONTCACHE is no longer needed, as XFS no longer needs to
route DONTCACHE ioends through its completion workqueue. Remove the flag
and its NOMERGE entry.
Without the NOMERGE entry, regular I/Os that get merged into the same
bio as a dropbehind folio will also have their completions deferred to
task context.
Signed-off-by: Tal Zussman <tz2294@columbia.edu>
---
fs/iomap/ioend.c | 5 +++--
fs/xfs/xfs_aops.c | 4 ----
include/linux/iomap.h | 6 +-----
3 files changed, 4 insertions(+), 11 deletions(-)
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index acf3cf98b23a..892dbfc77ae9 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -237,8 +237,6 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
if (wpc->iomap.flags & IOMAP_F_SHARED)
ioend_flags |= IOMAP_IOEND_SHARED;
- if (folio_test_dropbehind(folio))
- ioend_flags |= IOMAP_IOEND_DONTCACHE;
if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
ioend_flags |= IOMAP_IOEND_BOUNDARY;
@@ -255,6 +253,9 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
goto new_ioend;
+ if (folio_test_dropbehind(folio))
+ bio_set_flag(&ioend->io_bio, BIO_COMPLETE_IN_TASK);
+
/*
* Clamp io_offset and io_size to the incore EOF so that ondisk
* file size updates in the ioend completion are byte-accurate.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f279055fcea0..0dcf78beae8a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -511,10 +511,6 @@ xfs_ioend_needs_wq_completion(
if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
return true;
- /* Page cache invalidation cannot be done in irq context. */
- if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
- return true;
-
return false;
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 2c5685adf3a9..bf49ba71dd42 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -399,16 +399,12 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
#define IOMAP_IOEND_BOUNDARY (1U << 2)
/* is direct I/O */
#define IOMAP_IOEND_DIRECT (1U << 3)
-/* is DONTCACHE I/O */
-#define IOMAP_IOEND_DONTCACHE (1U << 4)
-
/*
* Flags that if set on either ioend prevent the merge of two ioends.
* (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way)
*/
#define IOMAP_IOEND_NOMERGE_FLAGS \
- (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT | \
- IOMAP_IOEND_DONTCACHE)
+ (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
/*
* Structure for writeback I/O completions.
--
2.47.3
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 3/8] block: enable RWF_DONTCACHE for block devices
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
2026-04-09 16:02 ` [PATCH 1/8] block: add BIO_COMPLETE_IN_TASK for task-context completion Christoph Hellwig
2026-04-09 16:02 ` [PATCH 2/8] iomap: use BIO_COMPLETE_IN_TASK for dropbehind writeback Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 4/8] FOLD: block: change the defer in task context interface to be procedural Christoph Hellwig
` (4 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
From: Tal Zussman <tz2294@columbia.edu>
Block device buffered reads and writes already pass through
filemap_read() and iomap_file_buffered_write() respectively, both of
which handle IOCB_DONTCACHE. Enable RWF_DONTCACHE for block device files
by setting FOP_DONTCACHE in def_blk_fops.
For CONFIG_BUFFER_HEAD=y paths, add block_write_begin_iocb() which
threads the kiocb through so that buffer_head-based I/O can use
DONTCACHE behavior. The existing block_write_begin() is preserved as a
wrapper that passes a NULL iocb. Set BIO_COMPLETE_IN_TASK in
submit_bh_wbc() when the folio has dropbehind so that buffer_head
writeback completions get deferred to task context.
CONFIG_BUFFER_HEAD=n paths are handled by the previously added iomap
BIO_COMPLETE_IN_TASK support.
This support is useful for databases that operate on raw block devices,
among other userspace applications.
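For illustration, a hedged userspace sketch of what this enables (the
device path is a placeholder, and the RWF_DONTCACHE fallback define is an
assumption for older uapi headers):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/uio.h>
	#include <unistd.h>

	#ifndef RWF_DONTCACHE
	#define RWF_DONTCACHE	0x00000080	/* assumed value if headers lack it */
	#endif

	int main(void)
	{
		static char buf[4096];
		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
		int fd = open("/dev/sdX", O_WRONLY);	/* placeholder device */

		if (fd < 0) {
			perror("open");
			return 1;
		}
		memset(buf, 0xab, sizeof(buf));

		/* Buffered write; the folios are dropped once writeback completes. */
		if (pwritev2(fd, &iov, 1, 0, RWF_DONTCACHE) < 0)
			perror("pwritev2");

		close(fd);
		return 0;
	}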
Signed-off-by: Tal Zussman <tz2294@columbia.edu>
---
block/fops.c | 5 +++--
fs/buffer.c | 22 +++++++++++++++++++---
include/linux/buffer_head.h | 3 +++
3 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/block/fops.c b/block/fops.c
index bb6642b45937..31b073181d87 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -504,7 +504,8 @@ static int blkdev_write_begin(const struct kiocb *iocb,
unsigned len, struct folio **foliop,
void **fsdata)
{
- return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
+ return block_write_begin_iocb(iocb, mapping, pos, len, foliop,
+ blkdev_get_block);
}
static int blkdev_write_end(const struct kiocb *iocb,
@@ -966,7 +967,7 @@ const struct file_operations def_blk_fops = {
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
.uring_cmd = blkdev_uring_cmd,
- .fop_flags = FOP_BUFFER_RASYNC,
+ .fop_flags = FOP_BUFFER_RASYNC | FOP_DONTCACHE,
};
static __init int blkdev_init(void)
diff --git a/fs/buffer.c b/fs/buffer.c
index d6e062c42a8d..289ab33fe3fd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2131,14 +2131,19 @@ EXPORT_SYMBOL(block_commit_write);
*
* The filesystem needs to handle block truncation upon failure.
*/
-int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
+int block_write_begin_iocb(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos, unsigned len,
struct folio **foliop, get_block_t *get_block)
{
pgoff_t index = pos >> PAGE_SHIFT;
+ fgf_t fgp_flags = FGP_WRITEBEGIN;
struct folio *folio;
int status;
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ if (iocb && iocb->ki_flags & IOCB_DONTCACHE)
+ fgp_flags |= FGP_DONTCACHE;
+
+ folio = __filemap_get_folio(mapping, index, fgp_flags,
mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
@@ -2153,6 +2158,13 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
*foliop = folio;
return status;
}
+
+int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
+ struct folio **foliop, get_block_t *get_block)
+{
+ return block_write_begin_iocb(NULL, mapping, pos, len, foliop,
+ get_block);
+}
EXPORT_SYMBOL(block_write_begin);
int block_write_end(loff_t pos, unsigned len, unsigned copied,
@@ -2481,7 +2493,8 @@ int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
(*bytes)++;
}
- return block_write_begin(mapping, pos, len, foliop, get_block);
+ return block_write_begin_iocb(iocb, mapping, pos, len, foliop,
+ get_block);
}
EXPORT_SYMBOL(cont_write_begin);
@@ -2711,6 +2724,9 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
if (IS_ENABLED(CONFIG_FS_ENCRYPTION))
buffer_set_crypto_ctx(bio, bh, GFP_NOIO);
+ if (folio_test_dropbehind(bh->b_folio))
+ bio_set_flag(bio, BIO_COMPLETE_IN_TASK);
+
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_write_hint = write_hint;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index e4939e33b4b5..4ce50882d621 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -260,6 +260,9 @@ int block_read_full_folio(struct folio *, get_block_t *);
bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
struct folio **foliop, get_block_t *get_block);
+int block_write_begin_iocb(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos, unsigned len,
+ struct folio **foliop, get_block_t *get_block);
int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
get_block_t *get_block);
int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *);
--
2.47.3
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 4/8] FOLD: block: change the defer in task context interface to be procedural
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
` (2 preceding siblings ...)
2026-04-09 16:02 ` [PATCH 3/8] block: enable RWF_DONTCACHE for block devices Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 20:18 ` Matthew Wilcox
2026-04-09 16:02 ` [PATCH 5/8] FOLD: don't use in_task() to decide for offloading Christoph Hellwig
` (3 subsequent siblings)
7 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
Replace the bio-flag based interface with an explicit
bio_complete_in_task() API. The advantage is that this can also be
called from inside the ->bi_end_io callback and thus dynamically.
This will be important when using it for fserror reporting.
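For illustration, the intended call pattern from inside a completion
handler looks roughly like this (my_needs_task_context() and
my_finish_completion() are made-up placeholders; when the deferred work
re-invokes ->bi_end_io() it runs in task context, so bio_complete_in_task()
then returns false and the handler proceeds):

	static void my_end_io(struct bio *bio)
	{
		/*
		 * Defer ourselves when called from IRQ context and this
		 * completion needs to sleep; the worker will call
		 * ->bi_end_io() again from task context.
		 */
		if (my_needs_task_context(bio) && bio_complete_in_task(bio))
			return;

		/* From here on we are guaranteed to run in task context. */
		my_finish_completion(bio);
		bio_put(bio);
	}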
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/bio.c | 7 +++----
fs/buffer.c | 5 ++++-
fs/iomap/ioend.c | 11 ++++++++---
include/linux/bio.h | 17 +++++++++++++++++
include/linux/blk_types.h | 1 -
include/linux/buffer_head.h | 2 ++
include/linux/iomap.h | 6 +++++-
7 files changed, 39 insertions(+), 10 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 550eb770bfa6..88d191455762 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1753,7 +1753,7 @@ static void bio_complete_work_fn(struct work_struct *w)
} while (1);
}
-static void bio_queue_completion(struct bio *bio)
+void __bio_complete_in_task(struct bio *bio)
{
struct bio_complete_batch *batch = this_cpu_ptr(&bio_complete_batch);
@@ -1761,6 +1761,7 @@ static void bio_queue_completion(struct bio *bio)
mod_delayed_work_on(batch->cpu, bio_complete_wq,
&batch->work, 1);
}
+EXPORT_SYMBOL_GPL(__bio_complete_in_task);
static inline bool bio_remaining_done(struct bio *bio)
{
@@ -1836,9 +1837,7 @@ void bio_endio(struct bio *bio)
}
#endif
- if (!in_task() && bio_flagged(bio, BIO_COMPLETE_IN_TASK))
- bio_queue_completion(bio);
- else if (bio->bi_end_io)
+ if (bio->bi_end_io)
bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);
diff --git a/fs/buffer.c b/fs/buffer.c
index 289ab33fe3fd..b5de776c8491 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2673,6 +2673,9 @@ static void end_bio_bh_io_sync(struct bio *bio)
{
struct buffer_head *bh = bio->bi_private;
+ if (buffer_dropbehind(bh) && bio_complete_in_task(bio))
+ return;
+
if (unlikely(bio_flagged(bio, BIO_QUIET)))
set_bit(BH_Quiet, &bh->b_state);
@@ -2725,7 +2728,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
buffer_set_crypto_ctx(bio, bh, GFP_NOIO);
if (folio_test_dropbehind(bh->b_folio))
- bio_set_flag(bio, BIO_COMPLETE_IN_TASK);
+ set_buffer_dropbehind(bh);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_write_hint = write_hint;
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index 892dbfc77ae9..a32ece8a3ee3 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -117,6 +117,12 @@ static void ioend_writeback_end_bio(struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+ /* Page cache invalidation cannot be done in irq context. */
+ if (ioend->io_flags & IOMAP_IOEND_DONTCACHE) {
+ if (bio_complete_in_task(bio))
+ return;
+ }
+
ioend->io_error = blk_status_to_errno(bio->bi_status);
if (ioend->io_error) {
iomap_fail_ioend_buffered(ioend);
@@ -237,6 +243,8 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
if (wpc->iomap.flags & IOMAP_F_SHARED)
ioend_flags |= IOMAP_IOEND_SHARED;
+ if (folio_test_dropbehind(folio))
+ ioend_flags |= IOMAP_IOEND_DONTCACHE;
if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
ioend_flags |= IOMAP_IOEND_BOUNDARY;
@@ -253,9 +261,6 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
goto new_ioend;
- if (folio_test_dropbehind(folio))
- bio_set_flag(&ioend->io_bio, BIO_COMPLETE_IN_TASK);
-
/*
* Clamp io_offset and io_size to the incore EOF so that ondisk
* file size updates in the ioend completion are byte-accurate.
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0b6744557b42..45c311e5ff71 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -375,6 +375,23 @@ static inline struct bio *bio_alloc(struct block_device *bdev,
void submit_bio(struct bio *bio);
+void __bio_complete_in_task(struct bio *bio);
+
+/**
+ * bio_complete_in_task - ensure a bio is complete in preemptible task context
+ * @bio: bio to complete
+ *
+ * If called from non-task context, offload the bio completion to worker thread
+ * and return %true. Else return %false and do nothing.
+ */
+static inline bool bio_complete_in_task(struct bio *bio)
+{
+ if (in_task())
+ return false;
+ __bio_complete_in_task(bio);
+ return true;
+}
+
extern void bio_endio(struct bio *);
static inline void bio_io_error(struct bio *bio)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0b55159d110d..8419f42de14f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -326,7 +326,6 @@ enum {
BIO_REMAPPED,
BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
- BIO_COMPLETE_IN_TASK, /* complete bi_end_io() in task context */
BIO_FLAG_LAST
};
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 4ce50882d621..bd7df5883cc8 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -35,6 +35,7 @@ enum bh_state_bits {
BH_Prio, /* Buffer should be submitted with REQ_PRIO */
BH_Defer_Completion, /* Defer AIO completion to workqueue */
BH_Migrate, /* Buffer is being migrated (norefs) */
+ BH_Dropbehind, /* drop pages on IO completion */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
@@ -136,6 +137,7 @@ BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
BUFFER_FNS(Defer_Completion, defer_completion)
+BUFFER_FNS(Dropbehind, dropbehind)
static __always_inline void set_buffer_uptodate(struct buffer_head *bh)
{
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index bf49ba71dd42..2c5685adf3a9 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -399,12 +399,16 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
#define IOMAP_IOEND_BOUNDARY (1U << 2)
/* is direct I/O */
#define IOMAP_IOEND_DIRECT (1U << 3)
+/* is DONTCACHE I/O */
+#define IOMAP_IOEND_DONTCACHE (1U << 4)
+
/*
* Flags that if set on either ioend prevent the merge of two ioends.
* (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way)
*/
#define IOMAP_IOEND_NOMERGE_FLAGS \
- (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
+ (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT | \
+ IOMAP_IOEND_DONTCACHE)
/*
* Structure for writeback I/O completions.
--
2.47.3
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 4/8] FOLD: block: change the defer in task context interface to be procedural
2026-04-09 16:02 ` [PATCH 4/8] FOLD: block: change the defer in task context interface to be procedural Christoph Hellwig
@ 2026-04-09 20:18 ` Matthew Wilcox
0 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2026-04-09 20:18 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Tal Zussman, Jens Axboe, Christian Brauner, Darrick J. Wong,
Carlos Maiolino, Al Viro, Jan Kara, Dave Chinner,
Bart Van Assche, Gao Xiang, linux-block, linux-kernel, linux-xfs,
linux-fsdevel, linux-mm
On Thu, Apr 09, 2026 at 06:02:17PM +0200, Christoph Hellwig wrote:
> @@ -1836,9 +1837,7 @@ void bio_endio(struct bio *bio)
> }
> #endif
>
> - if (!in_task() && bio_flagged(bio, BIO_COMPLETE_IN_TASK))
> - bio_queue_completion(bio);
> - else if (bio->bi_end_io)
> + if (bio->bi_end_io)
> bio->bi_end_io(bio);
What I liked about this before is that we had one central place that
needed to be changed. This change means that every bi_end_io now needs
to check whether the BIO can be completed in its context.
> +++ b/fs/buffer.c
> @@ -2673,6 +2673,9 @@ static void end_bio_bh_io_sync(struct bio *bio)
> {
> struct buffer_head *bh = bio->bi_private;
>
> + if (buffer_dropbehind(bh) && bio_complete_in_task(bio))
> + return;
I really don't like this. It assumes there's only one reason to
complete in task context -- whether the buffer belongs to a dropbehind
folio. I want there to be other reasons. Why would you introduce the
new BH_dropbehind flag instead of checking BIO_COMPLETE_IN_TASK?
> struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
>
> + /* Page cache invalidation cannot be done in irq context. */
> + if (ioend->io_flags & IOMAP_IOEND_DONTCACHE) {
> + if (bio_complete_in_task(bio))
> + return;
> + }
I thought we agreed to kill off IOMAP_IOEND_DONTCACHE?
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 5/8] FOLD: don't use in_task() to decide for offloading
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
` (3 preceding siblings ...)
2026-04-09 16:02 ` [PATCH 4/8] FOLD: block: change the defer in task context interface to be procedural Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 6/8] iomap: use bio_complete_in_task for buffered read errors Christoph Hellwig
` (2 subsequent siblings)
7 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
As described in commit c99fab6e80b76, some block drivers might call
into ->bi_end_io from non-preemptible context. Copy and paste the
logic from that commit, although having a core helper for it would
be nicer.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
include/linux/bio.h | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 45c311e5ff71..72664807c757 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -375,6 +375,16 @@ static inline struct bio *bio_alloc(struct block_device *bdev,
void submit_bio(struct bio *bio);
+/* Offload from atomic contexts to minimize scheduling overhead */
+static inline bool bio_in_atomic(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPTION) && rcu_preempt_depth())
+ return true;
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return true;
+ return !preemptible();
+}
+
void __bio_complete_in_task(struct bio *bio);
/**
@@ -386,10 +396,11 @@ void __bio_complete_in_task(struct bio *bio);
*/
static inline bool bio_complete_in_task(struct bio *bio)
{
- if (in_task())
- return false;
- __bio_complete_in_task(bio);
- return true;
+ if (bio_in_atomic()) {
+ __bio_complete_in_task(bio);
+ return true;
+ }
+ return false;
}
extern void bio_endio(struct bio *);
--
2.47.3
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 6/8] iomap: use bio_complete_in_task for buffered read errors
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
` (4 preceding siblings ...)
2026-04-09 16:02 ` [PATCH 5/8] FOLD: don't use in_task() to decide for offloading Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 7/8] iomap: use bio_complete_in_task for buffered write completions Christoph Hellwig
2026-04-09 16:02 ` [PATCH 8/8] RFC: use a TASK_FIFO kthread for read completion support Christoph Hellwig
7 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
Replace our own hand-crafted complete-in-task-context scheme with the
generic block code.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/iomap/bio.c | 44 +-------------------------------------------
1 file changed, 1 insertion(+), 43 deletions(-)
diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c
index 4504f4633f17..5b9b91198ec8 100644
--- a/fs/iomap/bio.c
+++ b/fs/iomap/bio.c
@@ -9,9 +9,6 @@
#include "internal.h"
#include "trace.h"
-static DEFINE_SPINLOCK(failed_read_lock);
-static struct bio_list failed_read_list = BIO_EMPTY_LIST;
-
static u32 __iomap_read_end_io(struct bio *bio, int error)
{
struct folio_iter fi;
@@ -27,49 +24,10 @@ static u32 __iomap_read_end_io(struct bio *bio, int error)
return folio_count;
}
-static void
-iomap_fail_reads(
- struct work_struct *work)
-{
- struct bio *bio;
- struct bio_list tmp = BIO_EMPTY_LIST;
- unsigned long flags;
-
- spin_lock_irqsave(&failed_read_lock, flags);
- bio_list_merge_init(&tmp, &failed_read_list);
- spin_unlock_irqrestore(&failed_read_lock, flags);
-
- while ((bio = bio_list_pop(&tmp)) != NULL) {
- __iomap_read_end_io(bio, blk_status_to_errno(bio->bi_status));
- cond_resched();
- }
-}
-
-static DECLARE_WORK(failed_read_work, iomap_fail_reads);
-
-static void iomap_fail_buffered_read(struct bio *bio)
-{
- unsigned long flags;
-
- /*
- * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions
- * in the fserror code. The caller no longer owns the bio reference
- * after the spinlock drops.
- */
- spin_lock_irqsave(&failed_read_lock, flags);
- if (bio_list_empty(&failed_read_list))
- WARN_ON_ONCE(!schedule_work(&failed_read_work));
- bio_list_add(&failed_read_list, bio);
- spin_unlock_irqrestore(&failed_read_lock, flags);
-}
-
static void iomap_read_end_io(struct bio *bio)
{
- if (bio->bi_status) {
- iomap_fail_buffered_read(bio);
+ if (bio->bi_status && bio_complete_in_task(bio))
return;
- }
-
__iomap_read_end_io(bio, 0);
}
--
2.47.3
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 7/8] iomap: use bio_complete_in_task for buffered write completions
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
` (5 preceding siblings ...)
2026-04-09 16:02 ` [PATCH 6/8] iomap: use bio_complete_in_task for buffered read errors Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 8/8] RFC: use a TASK_FIFO kthread for read completion support Christoph Hellwig
7 siblings, 0 replies; 11+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
Replace our own hand-crafted complete-in-task-context scheme with the
generic block code.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/iomap/ioend.c | 53 +++++-------------------------------------------
1 file changed, 5 insertions(+), 48 deletions(-)
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index a32ece8a3ee3..160224007486 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -72,63 +72,20 @@ static u32 iomap_finish_ioend_buffered_write(struct iomap_ioend *ioend)
return folio_count;
}
-static DEFINE_SPINLOCK(failed_ioend_lock);
-static LIST_HEAD(failed_ioend_list);
-
-static void
-iomap_fail_ioends(
- struct work_struct *work)
-{
- struct iomap_ioend *ioend;
- struct list_head tmp;
- unsigned long flags;
-
- spin_lock_irqsave(&failed_ioend_lock, flags);
- list_replace_init(&failed_ioend_list, &tmp);
- spin_unlock_irqrestore(&failed_ioend_lock, flags);
-
- while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
- io_list))) {
- list_del_init(&ioend->io_list);
- iomap_finish_ioend_buffered_write(ioend);
- cond_resched();
- }
-}
-
-static DECLARE_WORK(failed_ioend_work, iomap_fail_ioends);
-
-static void iomap_fail_ioend_buffered(struct iomap_ioend *ioend)
-{
- unsigned long flags;
-
- /*
- * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions
- * in the fserror code. The caller no longer owns the ioend reference
- * after the spinlock drops.
- */
- spin_lock_irqsave(&failed_ioend_lock, flags);
- if (list_empty(&failed_ioend_list))
- WARN_ON_ONCE(!schedule_work(&failed_ioend_work));
- list_add_tail(&ioend->io_list, &failed_ioend_list);
- spin_unlock_irqrestore(&failed_ioend_lock, flags);
-}
-
static void ioend_writeback_end_bio(struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
- /* Page cache invalidation cannot be done in irq context. */
- if (ioend->io_flags & IOMAP_IOEND_DONTCACHE) {
+ /*
+ * Page cache invalidation and error reporting cannot be done in irq
+ * context.
+ */
+ if ((ioend->io_flags & IOMAP_IOEND_DONTCACHE) || bio->bi_status) {
if (bio_complete_in_task(bio))
return;
}
ioend->io_error = blk_status_to_errno(bio->bi_status);
- if (ioend->io_error) {
- iomap_fail_ioend_buffered(ioend);
- return;
- }
-
iomap_finish_ioend_buffered_write(ioend);
}
--
2.47.3
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 8/8] RFC: use a TASK_FIFO kthread for read completion support
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
` (6 preceding siblings ...)
2026-04-09 16:02 ` [PATCH 7/8] iomap: use bio_complete_in_task for buffered write completions Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 19:06 ` Tal Zussman
7 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
Commit 3fffb589b9a6 ("erofs: add per-cpu threads for decompression as an
option") explains why workqueues aren't great for low-latency completion
handling. Switch to a per-cpu kthread to handle it instead. This code
is based on the erofs code in the above commit, but further simplified
by directly using a kthread instead of a kthread_work.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/bio.c | 117 +++++++++++++++++++++++++++++-----------------------
1 file changed, 65 insertions(+), 52 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 88d191455762..6a993fb129a0 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -19,7 +19,7 @@
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
#include <linux/kmemleak.h>
-#include <linux/llist.h>
+#include <linux/freezer.h>
#include <trace/events/block.h>
#include "blk.h"
@@ -1718,51 +1718,83 @@ void bio_check_pages_dirty(struct bio *bio)
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
struct bio_complete_batch {
- struct llist_head list;
- struct delayed_work work;
- int cpu;
+ spinlock_t lock;
+ struct bio_list bios;
+ struct task_struct *worker;
};
static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch);
-static struct workqueue_struct *bio_complete_wq;
-static void bio_complete_work_fn(struct work_struct *w)
+static bool bio_try_complete_batch(struct bio_complete_batch *batch)
{
- struct delayed_work *dw = to_delayed_work(w);
- struct bio_complete_batch *batch =
- container_of(dw, struct bio_complete_batch, work);
- struct llist_node *node;
- struct bio *bio, *next;
+ struct bio_list bios;
+ unsigned long flags;
+ struct bio *bio;
- do {
- node = llist_del_all(&batch->list);
- if (!node)
- break;
+ spin_lock_irqsave(&batch->lock, flags);
+ bios = batch->bios;
+ bio_list_init(&batch->bios);
+ spin_unlock_irqrestore(&batch->lock, flags);
- node = llist_reverse_order(node);
- llist_for_each_entry_safe(bio, next, node, bi_llist)
- bio->bi_end_io(bio);
+ if (bio_list_empty(&bios))
+ return false;
- if (need_resched()) {
- if (!llist_empty(&batch->list))
- mod_delayed_work_on(batch->cpu,
- bio_complete_wq,
- &batch->work, 0);
- break;
- }
- } while (1);
+ __set_current_state(TASK_RUNNING);
+ while ((bio = bio_list_pop(&bios)))
+ bio->bi_end_io(bio);
+ return true;
+}
+
+static int bio_complete_thread(void *private)
+{
+ struct bio_complete_batch *batch = private;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!bio_try_complete_batch(batch))
+ schedule();
+ }
+
+ return 0;
}
void __bio_complete_in_task(struct bio *bio)
{
- struct bio_complete_batch *batch = this_cpu_ptr(&bio_complete_batch);
+ struct bio_complete_batch *batch;
+ unsigned long flags;
+ bool wake;
+
+ get_cpu();
+ batch = this_cpu_ptr(&bio_complete_batch);
+ spin_lock_irqsave(&batch->lock, flags);
+ wake = bio_list_empty(&batch->bios);
+ bio_list_add(&batch->bios, bio);
+ spin_unlock_irqrestore(&batch->lock, flags);
+ put_cpu();
- if (llist_add(&bio->bi_llist, &batch->list))
- mod_delayed_work_on(batch->cpu, bio_complete_wq,
- &batch->work, 1);
+ if (wake)
+ wake_up_process(batch->worker);
}
EXPORT_SYMBOL_GPL(__bio_complete_in_task);
+static void __init bio_complete_batch_init(int cpu)
+{
+ struct bio_complete_batch *batch =
+ per_cpu_ptr(&bio_complete_batch, cpu);
+ struct task_struct *worker;
+
+ worker = kthread_create_on_cpu(bio_complete_thread,
+ per_cpu_ptr(&bio_complete_batch, cpu),
+ cpu, "bio_worker/%u");
+ if (IS_ERR(worker))
+ panic("bio: can't create kthread_work");
+ sched_set_fifo_low(worker);
+
+ spin_lock_init(&batch->lock);
+ bio_list_init(&batch->bios);
+ batch->worker = worker;
+}
+
static inline bool bio_remaining_done(struct bio *bio)
{
/*
@@ -2028,16 +2060,7 @@ EXPORT_SYMBOL(bioset_init);
*/
static int bio_complete_batch_cpu_dead(unsigned int cpu)
{
- struct bio_complete_batch *batch =
- per_cpu_ptr(&bio_complete_batch, cpu);
- struct llist_node *node;
- struct bio *bio, *next;
-
- node = llist_del_all(&batch->list);
- node = llist_reverse_order(node);
- llist_for_each_entry_safe(bio, next, node, bi_llist)
- bio->bi_end_io(bio);
-
+ bio_try_complete_batch(per_cpu_ptr(&bio_complete_batch, cpu));
return 0;
}
@@ -2055,18 +2078,8 @@ static int __init init_bio(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}
- for_each_possible_cpu(i) {
- struct bio_complete_batch *batch =
- per_cpu_ptr(&bio_complete_batch, i);
-
- init_llist_head(&batch->list);
- INIT_DELAYED_WORK(&batch->work, bio_complete_work_fn);
- batch->cpu = i;
- }
-
- bio_complete_wq = alloc_workqueue("bio_complete", WQ_MEM_RECLAIM, 0);
- if (!bio_complete_wq)
- panic("bio: can't allocate bio_complete workqueue\n");
+ for_each_possible_cpu(i)
+ bio_complete_batch_init(i);
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "block/bio:complete:dead",
NULL, bio_complete_batch_cpu_dead);
--
2.47.3
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 8/8] RFC: use a TASK_FIFO kthread for read completion support
2026-04-09 16:02 ` [PATCH 8/8] RFC: use a TASK_FIFO kthread for read completion support Christoph Hellwig
@ 2026-04-09 19:06 ` Tal Zussman
0 siblings, 0 replies; 11+ messages in thread
From: Tal Zussman @ 2026-04-09 19:06 UTC (permalink / raw)
To: Christoph Hellwig, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
On 4/9/26 12:02 PM, Christoph Hellwig wrote:
> Commit 3fffb589b9a6 ("erofs: add per-cpu threads for decompression as an
> option") explains why workqueues aren't great for low-latency completion
> handling. Switch to a per-cpu kthread to handle it instead. This code
> is based on the erofs code in the above commit, but further simplified
> by directly using a kthread instead of a kthread_work.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> block/bio.c | 117 +++++++++++++++++++++++++++++-----------------------
> 1 file changed, 65 insertions(+), 52 deletions(-)
>
> diff --git a/block/bio.c b/block/bio.c
> index 88d191455762..6a993fb129a0 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -19,7 +19,7 @@
> #include <linux/blk-crypto.h>
> #include <linux/xarray.h>
> #include <linux/kmemleak.h>
> -#include <linux/llist.h>
> +#include <linux/freezer.h>
Why freezer.h and not kthread.h?
> #include <trace/events/block.h>
> #include "blk.h"
> @@ -1718,51 +1718,83 @@ void bio_check_pages_dirty(struct bio *bio)
> EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
>
> struct bio_complete_batch {
> - struct llist_head list;
If we go with this approach, we should remove the newly-added bi_llist from
struct bio too.
> - struct delayed_work work;
> - int cpu;
> + spinlock_t lock;
> + struct bio_list bios;
> + struct task_struct *worker;
> };
>
> static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch);
> -static struct workqueue_struct *bio_complete_wq;
>
> -static void bio_complete_work_fn(struct work_struct *w)
> +static bool bio_try_complete_batch(struct bio_complete_batch *batch)
> {
> - struct delayed_work *dw = to_delayed_work(w);
> - struct bio_complete_batch *batch =
> - container_of(dw, struct bio_complete_batch, work);
> - struct llist_node *node;
> - struct bio *bio, *next;
> + struct bio_list bios;
> + unsigned long flags;
> + struct bio *bio;
>
> - do {
> - node = llist_del_all(&batch->list);
> - if (!node)
> - break;
> + spin_lock_irqsave(&batch->lock, flags);
> + bios = batch->bios;
> + bio_list_init(&batch->bios);
> + spin_unlock_irqrestore(&batch->lock, flags);
>
> - node = llist_reverse_order(node);
> - llist_for_each_entry_safe(bio, next, node, bi_llist)
> - bio->bi_end_io(bio);
> + if (bio_list_empty(&bios))
> + return false;
>
> - if (need_resched()) {
> - if (!llist_empty(&batch->list))
> - mod_delayed_work_on(batch->cpu,
> - bio_complete_wq,
> - &batch->work, 0);
> - break;
> - }
> - } while (1);
> + __set_current_state(TASK_RUNNING);
> + while ((bio = bio_list_pop(&bios)))
> + bio->bi_end_io(bio);
> + return true;
> +}
> +
> +static int bio_complete_thread(void *private)
> +{
> + struct bio_complete_batch *batch = private;
> +
> + for (;;) {
> + set_current_state(TASK_INTERRUPTIBLE);
> + if (!bio_try_complete_batch(batch))
> + schedule();
> + }
> +
> + return 0;
> }
>
> void __bio_complete_in_task(struct bio *bio)
> {
> - struct bio_complete_batch *batch = this_cpu_ptr(&bio_complete_batch);
> + struct bio_complete_batch *batch;
> + unsigned long flags;
> + bool wake;
> +
> + get_cpu();
> + batch = this_cpu_ptr(&bio_complete_batch);
> + spin_lock_irqsave(&batch->lock, flags);
> + wake = bio_list_empty(&batch->bios);
> + bio_list_add(&batch->bios, bio);
> + spin_unlock_irqrestore(&batch->lock, flags);
> + put_cpu();
>
> - if (llist_add(&bio->bi_llist, &batch->list))
> - mod_delayed_work_on(batch->cpu, bio_complete_wq,
> - &batch->work, 1);
> + if (wake)
> + wake_up_process(batch->worker);
> }
> EXPORT_SYMBOL_GPL(__bio_complete_in_task);
>
> +static void __init bio_complete_batch_init(int cpu)
> +{
> + struct bio_complete_batch *batch =
> + per_cpu_ptr(&bio_complete_batch, cpu);
> + struct task_struct *worker;
> +
> + worker = kthread_create_on_cpu(bio_complete_thread,
> + per_cpu_ptr(&bio_complete_batch, cpu),
> + cpu, "bio_worker/%u");
> + if (IS_ERR(worker))
> + panic("bio: can't create kthread_work");
> + sched_set_fifo_low(worker);
> +
> + spin_lock_init(&batch->lock);
> + bio_list_init(&batch->bios);
> + batch->worker = worker;
> +}
> +
> static inline bool bio_remaining_done(struct bio *bio)
> {
> /*
> @@ -2028,16 +2060,7 @@ EXPORT_SYMBOL(bioset_init);
> */
> static int bio_complete_batch_cpu_dead(unsigned int cpu)
> {
> - struct bio_complete_batch *batch =
> - per_cpu_ptr(&bio_complete_batch, cpu);
> - struct llist_node *node;
> - struct bio *bio, *next;
> -
> - node = llist_del_all(&batch->list);
> - node = llist_reverse_order(node);
> - llist_for_each_entry_safe(bio, next, node, bi_llist)
> - bio->bi_end_io(bio);
> -
> + bio_try_complete_batch(per_cpu_ptr(&bio_complete_batch, cpu));
> return 0;
> }
>
> @@ -2055,18 +2078,8 @@ static int __init init_bio(void)
> SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
> }
>
> - for_each_possible_cpu(i) {
> - struct bio_complete_batch *batch =
> - per_cpu_ptr(&bio_complete_batch, i);
> -
> - init_llist_head(&batch->list);
> - INIT_DELAYED_WORK(&batch->work, bio_complete_work_fn);
> - batch->cpu = i;
> - }
> -
> - bio_complete_wq = alloc_workqueue("bio_complete", WQ_MEM_RECLAIM, 0);
> - if (!bio_complete_wq)
> - panic("bio: can't allocate bio_complete workqueue\n");
> + for_each_possible_cpu(i)
> + bio_complete_batch_init(i);
>
> cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "block/bio:complete:dead",
> NULL, bio_complete_batch_cpu_dead);
> --
> 2.47.3
>
^ permalink raw reply [flat|nested] 11+ messages in thread