* [PATCH RFC v3 1/2] filemap: defer dropbehind invalidation from IRQ context
2026-02-27 16:41 [PATCH RFC v3 0/2] block: enable RWF_DONTCACHE for block devices Tal Zussman
@ 2026-02-27 16:41 ` Tal Zussman
2026-02-27 16:41 ` [PATCH RFC v3 2/2] block: enable RWF_DONTCACHE for block devices Tal Zussman
1 sibling, 0 replies; 3+ messages in thread
From: Tal Zussman @ 2026-02-27 16:41 UTC (permalink / raw)
To: Matthew Wilcox (Oracle),
Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Brendan Jackman,
Johannes Weiner, Zi Yan, Jens Axboe, Alexander Viro,
Christian Brauner, Jan Kara
Cc: Christoph Hellwig, linux-fsdevel, linux-mm, linux-kernel,
linux-block, Tal Zussman
folio_end_dropbehind() is called from folio_end_writeback(), which can
run in IRQ context through buffer_head completion.
Previously, when folio_end_dropbehind() detected !in_task(), it skipped
the invalidation entirely. This meant that folios marked for dropbehind
via RWF_DONTCACHE would remain in the page cache after writeback when
completed from IRQ context, defeating the purpose of using it.
Fix this by adding folio_end_dropbehind_irq() which defers the
invalidation to a workqueue. The folio is added to a per-cpu folio_batch
protected by a local_lock, and a work item pinned to that CPU drains the
batch. folio_end_writeback() dispatches between the task and IRQ paths
based on in_task().
A CPU hotplug dead callback drains any remaining folios from the
departing CPU's batch to avoid leaking folio references.
This unblocks enabling RWF_DONTCACHE for block devices and other
buffer_head-based I/O.
Signed-off-by: Tal Zussman <tz2294@columbia.edu>
---
include/linux/pagemap.h | 1 +
mm/filemap.c | 130 ++++++++++++++++++++++++++++++++++++++++++++----
mm/page_alloc.c | 1 +
3 files changed, 123 insertions(+), 9 deletions(-)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ec442af3f886..ae0632cfdedd 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1260,6 +1260,7 @@ void end_page_writeback(struct page *page);
void folio_end_writeback(struct folio *folio);
void folio_end_writeback_no_dropbehind(struct folio *folio);
void folio_end_dropbehind(struct folio *folio);
+void dropbehind_drain_cpu(int cpu);
void folio_wait_stable(struct folio *folio);
void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn);
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb);
diff --git a/mm/filemap.c b/mm/filemap.c
index ebd75684cb0a..b223dca708df 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -49,6 +49,7 @@
#include <linux/sched/mm.h>
#include <linux/sysctl.h>
#include <linux/pgalloc.h>
+#include <linux/local_lock.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -1085,6 +1086,8 @@ static const struct ctl_table filemap_sysctl_table[] = {
}
};
+static void __init dropbehind_init(void);
+
void __init pagecache_init(void)
{
int i;
@@ -1092,6 +1095,7 @@ void __init pagecache_init(void)
for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
init_waitqueue_head(&folio_wait_table[i]);
+ dropbehind_init();
page_writeback_init();
register_sysctl_init("vm", filemap_sysctl_table);
}
@@ -1613,26 +1617,131 @@ static void filemap_end_dropbehind(struct folio *folio)
* If folio was marked as dropbehind, then pages should be dropped when writeback
* completes. Do that now. If we fail, it's likely because of a big folio -
* just reset dropbehind for that case and latter completions should invalidate.
+ *
+ * When called from IRQ context (e.g. buffer_head completion), we cannot lock
+ * the folio and invalidate. Defer to a workqueue so that callers like
+ * end_buffer_async_write() that complete in IRQ context still get their folios
+ * pruned.
+ */
+struct dropbehind_batch {
+ local_lock_t lock_irq;
+ struct folio_batch fbatch;
+ struct work_struct work;
+};
+
+static DEFINE_PER_CPU(struct dropbehind_batch, dropbehind_batch) = {
+ .lock_irq = INIT_LOCAL_LOCK(lock_irq),
+};
+
+static void dropbehind_work_fn(struct work_struct *w)
+{
+ struct dropbehind_batch *db_batch;
+ struct folio_batch fbatch;
+
+again:
+ local_lock_irq(&dropbehind_batch.lock_irq);
+ db_batch = this_cpu_ptr(&dropbehind_batch);
+ fbatch = db_batch->fbatch;
+ folio_batch_reinit(&db_batch->fbatch);
+ local_unlock_irq(&dropbehind_batch.lock_irq);
+
+ for (int i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ if (folio_trylock(folio)) {
+ filemap_end_dropbehind(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
+ }
+
+ /* Drain folios that were added while we were processing. */
+ local_lock_irq(&dropbehind_batch.lock_irq);
+ if (folio_batch_count(&db_batch->fbatch)) {
+ local_unlock_irq(&dropbehind_batch.lock_irq);
+ goto again;
+ }
+ local_unlock_irq(&dropbehind_batch.lock_irq);
+}
+
+/*
+ * Drain a dead CPU's dropbehind batch. The CPU is already dead so no
+ * locking is needed.
+ */
+void dropbehind_drain_cpu(int cpu)
+{
+ struct dropbehind_batch *db_batch = per_cpu_ptr(&dropbehind_batch, cpu);
+ struct folio_batch *fbatch = &db_batch->fbatch;
+
+ for (int i = 0; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
+
+ if (folio_trylock(folio)) {
+ filemap_end_dropbehind(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
+ }
+ folio_batch_reinit(fbatch);
+}
+
+static void __init dropbehind_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct dropbehind_batch *db_batch = per_cpu_ptr(&dropbehind_batch, cpu);
+
+ folio_batch_init(&db_batch->fbatch);
+ INIT_WORK(&db_batch->work, dropbehind_work_fn);
+ }
+}
+
+/*
+ * Must be called from task context. Use folio_end_dropbehind_irq() for
+ * IRQ context (e.g. buffer_head completion).
*/
void folio_end_dropbehind(struct folio *folio)
{
if (!folio_test_dropbehind(folio))
return;
- /*
- * Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
- * but can happen if normal writeback just happens to find dirty folios
- * that were created as part of uncached writeback, and that writeback
- * would otherwise not need non-IRQ handling. Just skip the
- * invalidation in that case.
- */
- if (in_task() && folio_trylock(folio)) {
+ if (folio_trylock(folio)) {
filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
EXPORT_SYMBOL_GPL(folio_end_dropbehind);
+/*
+ * In IRQ context we cannot lock the folio or call into the invalidation
+ * path. Defer to a workqueue. This happens for buffer_head-based writeback
+ * which runs from bio IRQ context.
+ */
+static void folio_end_dropbehind_irq(struct folio *folio)
+{
+ struct dropbehind_batch *db_batch;
+ unsigned long flags;
+
+ if (!folio_test_dropbehind(folio))
+ return;
+
+ local_lock_irqsave(&dropbehind_batch.lock_irq, flags);
+ db_batch = this_cpu_ptr(&dropbehind_batch);
+
+ /* If there is no space in the folio_batch, skip the invalidation. */
+ if (!folio_batch_space(&db_batch->fbatch)) {
+ local_unlock_irqrestore(&dropbehind_batch.lock_irq, flags);
+ return;
+ }
+
+ folio_get(folio);
+ folio_batch_add(&db_batch->fbatch, folio);
+ local_unlock_irqrestore(&dropbehind_batch.lock_irq, flags);
+
+ schedule_work_on(smp_processor_id(), &db_batch->work);
+}
+
/**
* folio_end_writeback_no_dropbehind - End writeback against a folio.
* @folio: The folio.
@@ -1685,7 +1794,10 @@ void folio_end_writeback(struct folio *folio)
*/
folio_get(folio);
folio_end_writeback_no_dropbehind(folio);
- folio_end_dropbehind(folio);
+ if (in_task())
+ folio_end_dropbehind(folio);
+ else
+ folio_end_dropbehind_irq(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cbf758e27aa2..8208223fd764 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6277,6 +6277,7 @@ static int page_alloc_cpu_dead(unsigned int cpu)
struct zone *zone;
lru_add_drain_cpu(cpu);
+ dropbehind_drain_cpu(cpu);
mlock_drain_remote(cpu);
drain_pages(cpu);
--
2.39.5
^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH RFC v3 2/2] block: enable RWF_DONTCACHE for block devices
2026-02-27 16:41 [PATCH RFC v3 0/2] block: enable RWF_DONTCACHE for block devices Tal Zussman
2026-02-27 16:41 ` [PATCH RFC v3 1/2] filemap: defer dropbehind invalidation from IRQ context Tal Zussman
@ 2026-02-27 16:41 ` Tal Zussman
1 sibling, 0 replies; 3+ messages in thread
From: Tal Zussman @ 2026-02-27 16:41 UTC (permalink / raw)
To: Matthew Wilcox (Oracle),
Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
Suren Baghdasaryan, Michal Hocko, Brendan Jackman,
Johannes Weiner, Zi Yan, Jens Axboe, Alexander Viro,
Christian Brauner, Jan Kara
Cc: Christoph Hellwig, linux-fsdevel, linux-mm, linux-kernel,
linux-block, Tal Zussman
Block device buffered reads and writes already pass through
filemap_read() and iomap_file_buffered_write() respectively, both of
which handle IOCB_DONTCACHE. Enable RWF_DONTCACHE for block device files
by setting FOP_DONTCACHE in def_blk_fops.
For CONFIG_BUFFER_HEAD paths, add block_write_begin_iocb() which threads
the kiocb through so that buffer_head-based I/O can use DONTCACHE
behavior. The existing block_write_begin() is preserved as a wrapper
that passes a NULL iocb.
This support is useful for databases that operate on raw block devices,
among other userspace applications.
Signed-off-by: Tal Zussman <tz2294@columbia.edu>
---
block/fops.c | 5 +++--
fs/buffer.c | 19 ++++++++++++++++---
include/linux/buffer_head.h | 3 +++
3 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/block/fops.c b/block/fops.c
index 4d32785b31d9..d8165f6ba71c 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -505,7 +505,8 @@ static int blkdev_write_begin(const struct kiocb *iocb,
unsigned len, struct folio **foliop,
void **fsdata)
{
- return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
+ return block_write_begin_iocb(iocb, mapping, pos, len, foliop,
+ blkdev_get_block);
}
static int blkdev_write_end(const struct kiocb *iocb,
@@ -967,7 +968,7 @@ const struct file_operations def_blk_fops = {
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
.uring_cmd = blkdev_uring_cmd,
- .fop_flags = FOP_BUFFER_RASYNC,
+ .fop_flags = FOP_BUFFER_RASYNC | FOP_DONTCACHE,
};
static __init int blkdev_init(void)
diff --git a/fs/buffer.c b/fs/buffer.c
index 838c0c571022..18f1d128bb19 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2241,14 +2241,19 @@ EXPORT_SYMBOL(block_commit_write);
*
* The filesystem needs to handle block truncation upon failure.
*/
-int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
+int block_write_begin_iocb(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos, unsigned len,
struct folio **foliop, get_block_t *get_block)
{
pgoff_t index = pos >> PAGE_SHIFT;
+ fgf_t fgp_flags = FGP_WRITEBEGIN;
struct folio *folio;
int status;
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ if (iocb && iocb->ki_flags & IOCB_DONTCACHE)
+ fgp_flags |= FGP_DONTCACHE;
+
+ folio = __filemap_get_folio(mapping, index, fgp_flags,
mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
@@ -2263,6 +2268,13 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
*foliop = folio;
return status;
}
+
+int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
+ struct folio **foliop, get_block_t *get_block)
+{
+ return block_write_begin_iocb(NULL, mapping, pos, len, foliop,
+ get_block);
+}
EXPORT_SYMBOL(block_write_begin);
int block_write_end(loff_t pos, unsigned len, unsigned copied,
@@ -2591,7 +2603,8 @@ int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
(*bytes)++;
}
- return block_write_begin(mapping, pos, len, foliop, get_block);
+ return block_write_begin_iocb(iocb, mapping, pos, len, foliop,
+ get_block);
}
EXPORT_SYMBOL(cont_write_begin);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index b16b88bfbc3e..ddf88ce290f2 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -260,6 +260,9 @@ int block_read_full_folio(struct folio *, get_block_t *);
bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
struct folio **foliop, get_block_t *get_block);
+int block_write_begin_iocb(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos, unsigned len,
+ struct folio **foliop, get_block_t *get_block);
int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
get_block_t *get_block);
int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *);
--
2.39.5
^ permalink raw reply [flat|nested] 3+ messages in thread