Re: [PATCH v4 5/5] libfs: Use d_children list to iterate simple_offset directories

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Chuck Lever <chuck.lever@oracle.com>
To: Hugh Dickins <hughd@google.com>,
	Christian Brauner <brauner@kernel.org>,
	Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	yukuai3@huawei.com, yangerkun@huaweicloud.com
Subject: Re: [PATCH v4 5/5] libfs: Use d_children list to iterate simple_offset directories
Date: Sat, 14 Dec 2024 12:13:30 -0500	[thread overview]
Message-ID: <8c716ca1-84f9-4644-95cf-9965e8a30284@oracle.com> (raw)
In-Reply-To: <5eb7bbdb-0928-4c80-bf03-9de27d6f3f89@oracle.com>

On 12/8/24 12:11 PM, Chuck Lever wrote:
> On 12/4/24 10:52 AM, cel@kernel.org wrote:
>> From: Chuck Lever <chuck.lever@oracle.com>
>>
>> The mtree mechanism has been effective at creating directory offsets
>> that are stable over multiple opendir instances. However, it has not
>> been able to handle the subtleties of renames that are concurrent
>> with readdir.
>>
>> Instead of using the mtree to emit entries in the order of their
>> offset values, use it only to map incoming ctx->pos to a starting
>> entry. Then use the directory's d_children list, which is already
>> maintained properly by the dcache, to find the next child to emit.
>>
>> One of the sneaky things about this is that when the mtree-allocated
>> offset value wraps (which is very rare), looking up ctx->pos++ is
>> not going to find the next entry; it will return NULL. Instead, by
>> following the d_children list, the offset values can appear in any
>> order but all of the entries in the directory will be visited
>> eventually.
>>
>> Note also that readdir() is guaranteed to reach the tail of this
>> list. Entries are added only at the head of d_children, and readdir
>> walks from its current position in that list towards its tail.
>>
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>>   fs/libfs.c | 77 ++++++++++++++++++++++++++++++++++++++++--------------
>>   1 file changed, 57 insertions(+), 20 deletions(-)
>>
>> diff --git a/fs/libfs.c b/fs/libfs.c
>> index fcb2cdf6e3f3..398eac385094 100644
>> --- a/fs/libfs.c
>> +++ b/fs/libfs.c
>> @@ -243,12 +243,13 @@ EXPORT_SYMBOL(simple_dir_inode_operations);
>>   /* simple_offset_add() allocation range */
>>   enum {
>> -    DIR_OFFSET_MIN        = 2,
>> +    DIR_OFFSET_MIN        = 3,
>>       DIR_OFFSET_MAX        = LONG_MAX - 1,
>>   };
>>   /* simple_offset_add() never assigns these to a dentry */
>>   enum {
>> +    DIR_OFFSET_FIRST    = 2,        /* Find first real entry */
>>       DIR_OFFSET_EOD        = LONG_MAX,    /* Marks EOD */
>>   };
>> @@ -456,19 +457,43 @@ static loff_t offset_dir_llseek(struct file 
>> *file, loff_t offset, int whence)
>>       return vfs_setpos(file, offset, LONG_MAX);
>>   }
>> -static struct dentry *offset_find_next(struct offset_ctx *octx, 
>> loff_t offset)
>> +/* Cf. find_next_child() */
>> +static struct dentry *find_next_sibling_locked(struct dentry *parent,
>> +                           struct dentry *dentry)
> 
> There might be a better name for this function.
> 
> It looks a lot like find_next_child(), but it acts more like
> scan_positives(). It looks for positive dentries starting at @dentry
> itself, so it can return the same dentry that was passed in as @dentry.
> 
> find_positive_from_locked()  ??
> 
> 
>>   {
>> -    MA_STATE(mas, &octx->mt, offset, offset);
>> +    struct dentry *found = NULL;
>> +
>> +    hlist_for_each_entry_from(dentry, d_sib) {
>> +        if (!simple_positive(dentry))
>> +            continue;
>> +        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
>> +        if (simple_positive(dentry))
>> +            found = dget_dlock(dentry);
>> +        spin_unlock(&dentry->d_lock);
>> +        if (likely(found))
>> +            break;
>> +    }
>> +    return found;
>> +}
>> +
>> +static noinline_for_stack struct dentry *
>> +offset_dir_lookup(struct file *file, loff_t offset)
>> +{
>> +    struct dentry *parent = file->f_path.dentry;
>>       struct dentry *child, *found = NULL;
>> +    struct inode *inode = d_inode(parent);
>> +    struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
>> +
>> +    MA_STATE(mas, &octx->mt, offset, offset);
>>       rcu_read_lock();
>>       child = mas_find(&mas, DIR_OFFSET_MAX);
>>       if (!child)
>>           goto out;
>> -    spin_lock(&child->d_lock);
>> -    if (simple_positive(child))
>> -        found = dget_dlock(child);
>> -    spin_unlock(&child->d_lock);
>> +
>> +    spin_lock(&parent->d_lock);
>> +    found = find_next_sibling_locked(parent, child);
>> +    spin_unlock(&parent->d_lock);
>>   out:
>>       rcu_read_unlock();
>>       return found;
>> @@ -477,30 +502,42 @@ static struct dentry *offset_find_next(struct 
>> offset_ctx *octx, loff_t offset)
>>   static bool offset_dir_emit(struct dir_context *ctx, struct dentry 
>> *dentry)
>>   {
>>       struct inode *inode = d_inode(dentry);
>> -    long offset = dentry2offset(dentry);
>> -    return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, 
>> offset,
>> -              inode->i_ino, fs_umode_to_dtype(inode->i_mode));
>> +    return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len,
>> +            inode->i_ino, fs_umode_to_dtype(inode->i_mode));
>>   }
>> -static void offset_iterate_dir(struct inode *inode, struct 
>> dir_context *ctx)
>> +static void offset_iterate_dir(struct file *file, struct dir_context 
>> *ctx)
>>   {
>> -    struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
>> +    struct dentry *dir = file->f_path.dentry;
>>       struct dentry *dentry;
>> +    if (ctx->pos == DIR_OFFSET_FIRST) {
>> +        spin_lock(&dir->d_lock);
>> +        dentry = find_next_sibling_locked(dir, d_first_child(dir));
>> +        spin_unlock(&dir->d_lock);
>> +    } else
>> +        dentry = offset_dir_lookup(file, ctx->pos);
>> +    if (!dentry)
>> +        goto out_eod;
>> +
>>       while (true) {
>> -        dentry = offset_find_next(octx, ctx->pos);
>> -        if (!dentry)
>> -            goto out_eod;
>> +        struct dentry *next;
>> -        if (!offset_dir_emit(ctx, dentry)) {
>> -            dput(dentry);
>> +        ctx->pos = dentry2offset(dentry);
>> +        if (!offset_dir_emit(ctx, dentry))
>>               break;
>> -        }
>> -        ctx->pos = dentry2offset(dentry) + 1;
>> +        spin_lock(&dir->d_lock);
>> +        next = find_next_sibling_locked(dir, d_next_sibling(dentry));
>> +        spin_unlock(&dir->d_lock);

Recent Coverity report:

*** CID 1602474:  Concurrent data access violations  (ATOMICITY)
/fs/libfs.c: 536 in offset_iterate_dir()
530
531     		ctx->pos = dentry2offset(dentry);
532     		if (!offset_dir_emit(ctx, dentry))
533     			break;
534
535     		spin_lock(&dir->d_lock);
 >>>     CID 1602474:  Concurrent data access violations  (ATOMICITY)
 >>>     Using an unreliable value of "dentry" inside the second locked section. If the data that "dentry" depends on was changed by another thread, this use might be incorrect.
536     		next = find_next_sibling_locked(dir, d_next_sibling(dentry));
537     		spin_unlock(&dir->d_lock);
538     		dput(dentry);
539
540     		if (!next)
541     			goto out_eod;

As far as I can tell, @dentry's list fields, which are the only fields
accessed in find_next_sibling_locked(), are protected by dir->d_lock. We
don't care about the other fields.

Not sure if this is a false positive. Is there an annotation that will
help clarify this situation?


>>           dput(dentry);
>> +
>> +        if (!next)
>> +            goto out_eod;
>> +        dentry = next;
>>       }
>> +    dput(dentry);
>>       return;
>>   out_eod:
>> @@ -539,7 +576,7 @@ static int offset_readdir(struct file *file, 
>> struct dir_context *ctx)
>>       if (!dir_emit_dots(file, ctx))
>>           return 0;
>>       if (ctx->pos != DIR_OFFSET_EOD)
>> -        offset_iterate_dir(d_inode(dir), ctx);
>> +        offset_iterate_dir(file, ctx);
>>       return 0;
>>   }
> 
> 


-- 
Chuck Lever

next prev parent reply	other threads:[~2024-12-14 17:13 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-12-04 15:52 [PATCH v4 0/5] Improve simple directory offset wrap behavior cel
2024-12-04 15:52 ` [PATCH v4 1/5] libfs: Return ENOSPC when the directory offset range is exhausted cel
2024-12-04 15:52 ` [PATCH v4 2/5] Revert "libfs: Add simple_offset_empty()" cel
2024-12-04 15:52 ` [PATCH v4 3/5] Revert "libfs: fix infinite directory reads for offset dir" cel
2024-12-04 15:52 ` [PATCH v4 4/5] libfs: Replace simple_offset end-of-directory detection cel
2024-12-04 15:52 ` [PATCH v4 5/5] libfs: Use d_children list to iterate simple_offset directories cel
2024-12-08 17:11   ` Chuck Lever
2024-12-14 17:13     ` Chuck Lever [this message]
2024-12-14 17:49       ` Al Viro
2024-12-14 19:22         ` Chuck Lever
2024-12-14 19:59           ` Al Viro
2024-12-05 17:09 ` [PATCH v4 0/5] Improve simple directory offset wrap behavior Christian Brauner
2024-12-05 22:16 ` Chuck Lever

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=8c716ca1-84f9-4644-95cf-9965e8a30284@oracle.com \
    --to=chuck.lever@oracle.com \
    --cc=brauner@kernel.org \
    --cc=hughd@google.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=yangerkun@huaweicloud.com \
    --cc=yukuai3@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.