Re: [PATCH v6 5/5] libfs: Use d_children list to iterate simple_offset directories

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: yangerkun <yangerkun@huaweicloud.com>
To: cel@kernel.org, Hugh Dickins <hughd@google.com>,
	Christian Brauner <brauner@kernel.org>,
	Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	yukuai3@huawei.com, Chuck Lever <chuck.lever@oracle.com>
Subject: Re: [PATCH v6 5/5] libfs: Use d_children list to iterate simple_offset directories
Date: Mon, 23 Dec 2024 22:21:42 +0800	[thread overview]
Message-ID: <3ccf8255-dfbb-d019-d156-01edf5242c49@huaweicloud.com> (raw)
In-Reply-To: <20241220153314.5237-6-cel@kernel.org>



在 2024/12/20 23:33, cel@kernel.org 写道:
> From: Chuck Lever <chuck.lever@oracle.com>
> 
> The mtree mechanism has been effective at creating directory offsets
> that are stable over multiple opendir instances. However, it has not
> been able to handle the subtleties of renames that are concurrent
> with readdir.
> 
> Instead of using the mtree to emit entries in the order of their
> offset values, use it only to map incoming ctx->pos to a starting
> entry. Then use the directory's d_children list, which is already
> maintained properly by the dcache, to find the next child to emit.
> 
> One of the sneaky things about this is that when the mtree-allocated
> offset value wraps (which is very rare), looking up ctx->pos++ is
> not going to find the next entry; it will return NULL. Instead, by
> following the d_children list, the offset values can appear in any
> order but all of the entries in the directory will be visited
> eventually.
> 
> Note also that the readdir() is guaranteed to reach the tail of this
> list. Entries are added only at the head of d_children, and readdir
> walks from its current position in that list towards its tail.
> 
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
>   fs/libfs.c | 84 +++++++++++++++++++++++++++++++++++++-----------------
>   1 file changed, 58 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/libfs.c b/fs/libfs.c
> index 5c56783c03a5..f7ead02062ad 100644
> --- a/fs/libfs.c
> +++ b/fs/libfs.c
> @@ -247,12 +247,13 @@ EXPORT_SYMBOL(simple_dir_inode_operations);
>   
>   /* simple_offset_add() allocation range */
>   enum {
> -	DIR_OFFSET_MIN		= 2,
> +	DIR_OFFSET_MIN		= 3,
>   	DIR_OFFSET_MAX		= LONG_MAX - 1,
>   };
>   
>   /* simple_offset_add() never assigns these to a dentry */
>   enum {
> +	DIR_OFFSET_FIRST	= 2,		/* Find first real entry */
>   	DIR_OFFSET_EOD		= LONG_MAX,	/* Marks EOD */
>   
>   };
> @@ -458,51 +459,82 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
>   	return vfs_setpos(file, offset, LONG_MAX);
>   }
>   
> -static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
> +static struct dentry *find_positive_dentry(struct dentry *parent,
> +					   struct dentry *dentry,
> +					   bool next)
>   {
> -	MA_STATE(mas, &octx->mt, offset, offset);
> +	struct dentry *found = NULL;
> +
> +	spin_lock(&parent->d_lock);
> +	if (next)
> +		dentry = d_next_sibling(dentry);
> +	else if (!dentry)
> +		dentry = d_first_child(parent);
> +	hlist_for_each_entry_from(dentry, d_sib) {
> +		if (!simple_positive(dentry))
> +			continue;
> +		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
> +		if (simple_positive(dentry))
> +			found = dget_dlock(dentry);
> +		spin_unlock(&dentry->d_lock);
> +		if (likely(found))
> +			break;
> +	}
> +	spin_unlock(&parent->d_lock);
> +	return found;
> +}
> +
> +static noinline_for_stack struct dentry *
> +offset_dir_lookup(struct dentry *parent, loff_t offset)
> +{
> +	struct inode *inode = d_inode(parent);
> +	struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
>   	struct dentry *child, *found = NULL;
>   
> -	rcu_read_lock();
> -	child = mas_find(&mas, DIR_OFFSET_MAX);
> -	if (!child)
> -		goto out;
> -	spin_lock(&child->d_lock);
> -	if (simple_positive(child))
> -		found = dget_dlock(child);
> -	spin_unlock(&child->d_lock);
> -out:
> -	rcu_read_unlock();
> +	MA_STATE(mas, &octx->mt, offset, offset);
> +
> +	if (offset == DIR_OFFSET_FIRST)
> +		found = find_positive_dentry(parent, NULL, false);
> +	else {
> +		rcu_read_lock();
> +		child = mas_find(&mas, DIR_OFFSET_MAX);

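Can this child be NULL? For example, if some files are deleted after the
first readdir, mas_find() may find nothing at or after ctx->pos. Maybe we
should stop here in that case; otherwise find_positive_dentry() is called
with a NULL dentry and starts from d_first_child(parent), so we may rescan
all the dentries and return them to userspace again?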

> +		found = find_positive_dentry(parent, child, false);
> +		rcu_read_unlock();
> +	}
>   	return found;
>   }
>   
>   static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
>   {
>   	struct inode *inode = d_inode(dentry);
> -	long offset = dentry2offset(dentry);
>   
> -	return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
> -			  inode->i_ino, fs_umode_to_dtype(inode->i_mode));
> +	return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len,
> +			inode->i_ino, fs_umode_to_dtype(inode->i_mode));
>   }
>   
> -static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
> +static void offset_iterate_dir(struct file *file, struct dir_context *ctx)
>   {
> -	struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
> +	struct dentry *dir = file->f_path.dentry;
>   	struct dentry *dentry;
>   
> +	dentry = offset_dir_lookup(dir, ctx->pos);
> +	if (!dentry)
> +		goto out_eod;
>   	while (true) {
> -		dentry = offset_find_next(octx, ctx->pos);
> -		if (!dentry)
> -			goto out_eod;
> +		struct dentry *next;
>   
> -		if (!offset_dir_emit(ctx, dentry)) {
> -			dput(dentry);
> +		ctx->pos = dentry2offset(dentry);
> +		if (!offset_dir_emit(ctx, dentry))
>   			break;
> -		}
>   
> -		ctx->pos = dentry2offset(dentry) + 1;
> +		next = find_positive_dentry(dir, dentry, true);
>   		dput(dentry);
> +
> +		if (!next)
> +			goto out_eod;
> +		dentry = next;
>   	}
> +	dput(dentry);
>   	return;
>   
>   out_eod:
> @@ -541,7 +573,7 @@ static int offset_readdir(struct file *file, struct dir_context *ctx)
>   	if (!dir_emit_dots(file, ctx))
>   		return 0;
>   	if (ctx->pos != DIR_OFFSET_EOD)
> -		offset_iterate_dir(d_inode(dir), ctx);
> +		offset_iterate_dir(file, ctx);
>   	return 0;
>   }
>

next prev parent reply	other threads:[~2024-12-23 14:21 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-20 15:33 [PATCH v6 0/5] Improve simple directory offset wrap behavior cel
2024-12-20 15:33 ` [PATCH v6 1/5] libfs: Return ENOSPC when the directory offset range is exhausted cel
2024-12-23 16:28   ` Liam R. Howlett
2024-12-23 17:54     ` Chuck Lever
2024-12-20 15:33 ` [PATCH v6 2/5] Revert "libfs: Add simple_offset_empty()" cel
2024-12-23 14:17   ` yangerkun
2024-12-20 15:33 ` [PATCH v6 3/5] Revert "libfs: fix infinite directory reads for offset dir" cel
2024-12-23 14:17   ` yangerkun
2024-12-20 15:33 ` [PATCH v6 4/5] libfs: Replace simple_offset end-of-directory detection cel
2024-12-23 14:17   ` yangerkun
2024-12-23 16:30   ` Liam R. Howlett
2024-12-23 17:57     ` Chuck Lever
2025-01-04 11:29     ` Christian Brauner
2024-12-20 15:33 ` [PATCH v6 5/5] libfs: Use d_children list to iterate simple_offset directories cel
2024-12-23 14:21   ` yangerkun [this message]
2024-12-23 14:44     ` Chuck Lever
2024-12-24  4:40       ` yangerkun
2024-12-24 13:52         ` Chuck Lever
2024-12-24 13:57           ` yangerkun
2024-12-24 14:00             ` yangerkun
2024-12-24 16:10               ` Chuck Lever
2024-12-22 10:44 ` [PATCH v6 0/5] Improve simple directory offset wrap behavior Christian Brauner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3ccf8255-dfbb-d019-d156-01edf5242c49@huaweicloud.com \
    --to=yangerkun@huaweicloud.com \
    --cc=brauner@kernel.org \
    --cc=cel@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=hughd@google.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=yukuai3@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox