linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Jan Kara <jack@suse.cz>
To: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Cc: linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	 djwong@kernel.org, john.g.garry@oracle.com, willy@infradead.org,
	hch@lst.de,  ritesh.list@gmail.com, jack@suse.cz,
	Luis Chamberlain <mcgrof@kernel.org>,
	 dgc@kernel.org, tytso@mit.edu, p.raghav@samsung.com,
	andres@anarazel.de,  brauner@kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: Re: [RFC PATCH v2 2/5] iomap: Add initial support for buffered RWF_WRITETHROUGH
Date: Thu, 16 Apr 2026 14:34:15 +0200	[thread overview]
Message-ID: <52wsh6owrtmznt5xuks6ljwy4zbpyid45x5dbxo5xgssxm4zxy@iue2on3llpfb> (raw)
In-Reply-To: <d44171d5a1ec783722019ad7d1a4ede0478cc914.1775658795.git.ojaswin@linux.ibm.com>

Some more thoughts... :)

> @@ -1096,6 +1097,276 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
>  	return __iomap_write_end(iter->inode, pos, len, copied, folio);
>  }
>  
> +static ssize_t iomap_writethrough_complete(struct iomap_writethrough_ctx *wt_ctx)
> +{
> +	struct kiocb *iocb	= wt_ctx->iocb;
> +	struct inode *inode	= wt_ctx->inode;
> +	ssize_t ret		= wt_ctx->error;
> +
> +	if (wt_ctx->dops && wt_ctx->dops->end_io) {
> +		int err = wt_ctx->dops->end_io(iocb, wt_ctx->written,
> +						wt_ctx->error,
> +						wt_ctx->flags);

It's a bit odd to use only ->end_io from dops, also because we don't really
use the direct IO submission path. So perhaps you can have just an end_io
handler pointer in iomap_writethrough_ops, similarly to how you have a
submission one?

> +		if (err)
> +			ret = err;
> +	}
> +
> +	mapping_clear_stable_writes(inode->i_mapping);
> +
> +	if (!ret) {
> +		ret = wt_ctx->written;
> +		iocb->ki_pos = wt_ctx->pos + ret;
> +	}
> +
> +	kfree(wt_ctx);
> +	return ret;
> +}

...

> +static int iomap_writethrough_iter(struct iomap_writethrough_ctx *wt_ctx,
> +				   struct iomap_iter *iter, struct iov_iter *i,
> +				   const struct iomap_writethrough_ops *wt_ops)
> +
> +{
> +	ssize_t total_written = 0;
> +	int status = 0;
> +	struct address_space *mapping = iter->inode->i_mapping;
> +	size_t chunk = mapping_max_folio_size(mapping);
> +	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
> +	unsigned int bs = i_blocksize(iter->inode);
> +
> +	/* copied over based on DIO handles these flags */
> +	if (iter->iomap.type == IOMAP_UNWRITTEN)
> +		wt_ctx->flags |= IOMAP_DIO_UNWRITTEN;
> +	if (iter->iomap.flags & IOMAP_F_SHARED)
> +		wt_ctx->flags |= IOMAP_DIO_COW;
> +
> +	if (!(iter->flags & IOMAP_WRITETHROUGH))
> +		return -EINVAL;
> +
> +	do {
> +		struct folio *folio;
> +		size_t offset;		/* Offset into folio */
> +		u64 bytes;		/* Bytes to write to folio */
> +		size_t copied;		/* Bytes copied from user */
> +		u64 written;		/* Bytes have been written */
> +		loff_t pos;
> +		size_t off_aligned, len_aligned;
> +
> +		bytes = iov_iter_count(i);
> +retry:
> +		offset = iter->pos & (chunk - 1);
> +		bytes = min(chunk - offset, bytes);
> +		status = balance_dirty_pages_ratelimited_flags(mapping,
> +							       bdp_flags);
> +		if (unlikely(status))
> +			break;
> +
> +		/*
> +		 * If completions already occurred and reported errors, give up
> +		 * now and don't bother submitting more bios.
> +		 */
> +		if (unlikely(data_race(wt_ctx->error))) {
> +			wt_ctx->nr_bvecs = 0;
> +			break;
> +		}
> +
> +		if (bytes > iomap_length(iter))
> +			bytes = iomap_length(iter);
> +
> +		/*
> +		 * Bring in the user page that we'll copy from _first_.
> +		 * Otherwise there's a nasty deadlock on copying from the
> +		 * same page as we're writing to, without it being marked
> +		 * up-to-date.
> +		 *
> +		 * For async buffered writes the assumption is that the user
> +		 * page has already been faulted in. This can be optimized by
> +		 * faulting the user page.
> +		 */
> +		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
> +			status = -EFAULT;
> +			break;
> +		}
> +
> +		status = iomap_write_begin(iter, wt_ops->write_ops, &folio,
> +					   &offset, &bytes);
> +		if (unlikely(status)) {
> +			iomap_write_failed(iter->inode, iter->pos, bytes);
> +			break;
> +		}
> +		if (iter->iomap.flags & IOMAP_F_STALE)
> +			break;
> +
> +		pos = iter->pos;
> +
> +		if (mapping_writably_mapped(mapping))
> +			flush_dcache_folio(folio);
> +
> +		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> +		written = iomap_write_end(iter, bytes, copied, folio) ?
> +			  copied : 0;
> +
> +		if (!written)
> +			goto put_folio;
> +
> +		off_aligned = round_down(offset, bs);
> +		len_aligned = round_up(offset + written, bs) - off_aligned;
> +
> +		iomap_folio_prepare_writethrough(folio, off_aligned,
> +						 len_aligned);
> +
> +		if (!wt_ctx->nr_bvecs)
> +			wt_ctx->bio_pos = round_down(pos, bs);
> +
> +		bvec_set_folio(&wt_ctx->bvec[wt_ctx->nr_bvecs], folio,
> +			       len_aligned, off_aligned);

Shouldn't we zero out the tail of the folio if we are submitting partial
folio for write?

> +		wt_ctx->nr_bvecs++;
> +		wt_ctx->written += written;
> +
> +		if (pos + written > wt_ctx->new_i_size)
> +			wt_ctx->new_i_size = pos + written;

I'm probably missing something here but where is i_size update handled? I
don't see new_i_size used anywhere? Also why is it OK to not call
pagecache_isize_extended() but that goes with the i_size update...

> +
> +		if (wt_ctx->nr_bvecs == wt_ctx->max_bvecs)
> +			iomap_writethrough_submit_bio(wt_ctx, &iter->iomap, wt_ops);
> +
> +put_folio:
> +		__iomap_put_folio(iter, wt_ops->write_ops, written, folio);
> +
> +		cond_resched();
> +		if (unlikely(written == 0)) {
> +			iomap_write_failed(iter->inode, pos, bytes);
> +			iov_iter_revert(i, copied);
> +
> +			if (chunk > PAGE_SIZE)
> +				chunk /= 2;
> +			if (copied) {
> +				bytes = copied;
> +				goto retry;
> +			}
> +		} else {
> +			total_written += written;
> +			iomap_iter_advance(iter, written);
> +		}
> +	} while (iov_iter_count(i) && iomap_length(iter));

Overall the differences of this function from iomap_write_iter() seem
relatively small so maybe it would be possible to just extend
iomap_write_iter() to support writethrough IO as well? Basically once we've
copied data into the folio and called iomap_write_end() we can have "if
writethrough, call function to prepare & submit the folio for IO".

> +
> +	if (wt_ctx->nr_bvecs)
> +		iomap_writethrough_submit_bio(wt_ctx, &iter->iomap, wt_ops);
> +
> +	return total_written ? 0 : status;
> +}
> +

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


  parent reply	other threads:[~2026-04-16 12:34 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-08 18:45 [RFC PATCH v2 0/5] Add buffered write-through support to iomap & xfs Ojaswin Mujoo
2026-04-08 18:45 ` [RFC PATCH v2 1/5] mm: Refactor folio_clear_dirty_for_io() Ojaswin Mujoo
2026-04-15  6:14   ` Christoph Hellwig
2026-04-08 18:45 ` [RFC PATCH v2 2/5] iomap: Add initial support for buffered RWF_WRITETHROUGH Ojaswin Mujoo
2026-04-16 12:05   ` Jan Kara
2026-04-16 12:34   ` Jan Kara [this message]
2026-04-17  4:13   ` Pankaj Raghav (Samsung)
2026-04-08 18:45 ` [RFC PATCH v2 3/5] xfs: Add RWF_WRITETHROUGH support to xfs Ojaswin Mujoo
2026-04-08 18:45 ` [RFC PATCH v2 4/5] iomap: Add aio support to RWF_WRITETHROUGH Ojaswin Mujoo
2026-04-08 18:45 ` [RFC PATCH v2 5/5] iomap: Add DSYNC support to writethrough Ojaswin Mujoo
2026-04-17  3:54 ` [RFC PATCH v2 0/5] Add buffered write-through support to iomap & xfs Pankaj Raghav (Samsung)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=52wsh6owrtmznt5xuks6ljwy4zbpyid45x5dbxo5xgssxm4zxy@iue2on3llpfb \
    --to=jack@suse.cz \
    --cc=andres@anarazel.de \
    --cc=brauner@kernel.org \
    --cc=dgc@kernel.org \
    --cc=djwong@kernel.org \
    --cc=hch@lst.de \
    --cc=john.g.garry@oracle.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=mcgrof@kernel.org \
    --cc=ojaswin@linux.ibm.com \
    --cc=p.raghav@samsung.com \
    --cc=ritesh.list@gmail.com \
    --cc=tytso@mit.edu \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox