linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Bernd Schubert <bernd.schubert@fastmail.fm>
To: Chuck Lever III <chuck.lever@oracle.com>
Cc: Chuck Lever <cel@kernel.org>, Al Viro <viro@zeniv.linux.org.uk>,
	Christian Brauner <brauner@kernel.org>,
	Hugh Dickins <hughd@google.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-mm <linux-mm@kvack.org>,
	"linux-fsdevel@vger.kernel.org" <linux-fsdevel@vger.kernel.org>
Subject: Re: [PATCH v3 1/3] libfs: Add directory operations for stable offsets
Date: Mon, 26 Jun 2023 17:01:37 +0200	[thread overview]
Message-ID: <a6b09f29-221d-7b49-edc9-2f4430bbbee2@fastmail.fm> (raw)
In-Reply-To: <65AB1E24-7714-4296-A501-BD824EE0DBC8@oracle.com>



On 6/26/23 16:50, Chuck Lever III wrote:
> 
> 
>> On Jun 26, 2023, at 9:36 AM, Bernd Schubert <bernd.schubert@fastmail.fm> wrote:
>>
>>
>>
>> On 6/24/23 00:21, Bernd Schubert wrote:
>>> On 6/6/23 15:10, Chuck Lever wrote:
>>>> From: Chuck Lever <chuck.lever@oracle.com>
>>>>
>>>> Create a vector of directory operations in fs/libfs.c that handles
>>>> directory seeks and readdir via stable offsets instead of the
>>>> current cursor-based mechanism.
>>>>
>>>> For the moment these are unused.
>>>>
>>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>>> ---
>>>>    fs/dcache.c            |    1
>>>>    fs/libfs.c             |  185 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    include/linux/dcache.h |    1
>>>>    include/linux/fs.h     |    9 ++
>>>>    4 files changed, 196 insertions(+)
>>>>
>>>> diff --git a/fs/dcache.c b/fs/dcache.c
>>>> index 52e6d5fdab6b..9c9a801f3b33 100644
>>>> --- a/fs/dcache.c
>>>> +++ b/fs/dcache.c
>>>> @@ -1813,6 +1813,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
>>>>        dentry->d_sb = sb;
>>>>        dentry->d_op = NULL;
>>>>        dentry->d_fsdata = NULL;
>>>> +    dentry->d_offset = 0;
>>>>        INIT_HLIST_BL_NODE(&dentry->d_hash);
>>>>        INIT_LIST_HEAD(&dentry->d_lru);
>>>>        INIT_LIST_HEAD(&dentry->d_subdirs);
>>>> diff --git a/fs/libfs.c b/fs/libfs.c
>>>> index 89cf614a3271..07317bbe1668 100644
>>>> --- a/fs/libfs.c
>>>> +++ b/fs/libfs.c
>>>> @@ -239,6 +239,191 @@ const struct inode_operations simple_dir_inode_operations = {
>>>>    };
>>>>    EXPORT_SYMBOL(simple_dir_inode_operations);
>>>> +/**
>>>> + * stable_offset_init - initialize a parent directory
>>>> + * @dir: parent directory to be initialized
>>>> + *
>>>> + */
>>>> +void stable_offset_init(struct inode *dir)
>>>> +{
>>>> +    xa_init_flags(&dir->i_doff_map, XA_FLAGS_ALLOC1);
>>>> +    dir->i_next_offset = 0;
>>>> +}
>>>> +EXPORT_SYMBOL(stable_offset_init);
>>>> +
>>>> +/**
>>>> + * stable_offset_add - Add an entry to a directory's stable offset map
>>>> + * @dir: parent directory being modified
>>>> + * @dentry: new dentry being added
>>>> + *
>>>> + * Returns zero on success. Otherwise, a negative errno value is returned.
>>>> + */
>>>> +int stable_offset_add(struct inode *dir, struct dentry *dentry)
>>>> +{
>>>> +    struct xa_limit limit = XA_LIMIT(2, U32_MAX);
>>>> +    u32 offset = 0;
>>>> +    int ret;
>>>> +
>>>> +    if (dentry->d_offset)
>>>> +        return -EBUSY;
>>>> +
>>>> +    ret = xa_alloc_cyclic(&dir->i_doff_map, &offset, dentry, limit,
>>>> +                  &dir->i_next_offset, GFP_KERNEL);
>>> Please see below at struct inode my question about i_next_offset.
>>>> +    if (ret < 0)
>>>> +        return ret;
>>>> +
>>>> +    dentry->d_offset = offset;
>>>> +    return 0;
>>>> +}
>>>> +EXPORT_SYMBOL(stable_offset_add);
>>>> +
>>>> +/**
>>>> + * stable_offset_remove - Remove an entry to a directory's stable offset map
>>>> + * @dir: parent directory being modified
>>>> + * @dentry: dentry being removed
>>>> + *
>>>> + */
>>>> +void stable_offset_remove(struct inode *dir, struct dentry *dentry)
>>>> +{
>>>> +    if (!dentry->d_offset)
>>>> +        return;
>>>> +
>>>> +    xa_erase(&dir->i_doff_map, dentry->d_offset);
>>>> +    dentry->d_offset = 0;
>>>> +}
>>>> +EXPORT_SYMBOL(stable_offset_remove);
>>>> +
>>>> +/**
>>>> + * stable_offset_destroy - Release offset map
>>>> + * @dir: parent directory that is about to be destroyed
>>>> + *
>>>> + * During fs teardown (eg. umount), a directory's offset map might still
>>>> + * contain entries. xa_destroy() cleans out anything that remains.
>>>> + */
>>>> +void stable_offset_destroy(struct inode *dir)
>>>> +{
>>>> +    xa_destroy(&dir->i_doff_map);
>>>> +}
>>>> +EXPORT_SYMBOL(stable_offset_destroy);
>>>> +
>>>> +/**
>>>> + * stable_dir_llseek - Advance the read position of a directory descriptor
>>>> + * @file: an open directory whose position is to be updated
>>>> + * @offset: a byte offset
>>>> + * @whence: enumerator describing the starting position for this update
>>>> + *
>>>> + * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories.
>>>> + *
>>>> + * Returns the updated read position if successful; otherwise a
>>>> + * negative errno is returned and the read position remains unchanged.
>>>> + */
>>>> +static loff_t stable_dir_llseek(struct file *file, loff_t offset, int whence)
>>>> +{
>>>> +    switch (whence) {
>>>> +    case SEEK_CUR:
>>>> +        offset += file->f_pos;
>>>> +        fallthrough;
>>>> +    case SEEK_SET:
>>>> +        if (offset >= 0)
>>>> +            break;
>>>> +        fallthrough;
>>>> +    default:
>>>> +        return -EINVAL;
>>>> +    }
>>>> +
>>>> +    return vfs_setpos(file, offset, U32_MAX);
>>>> +}
>>>> +
>>>> +static struct dentry *stable_find_next(struct xa_state *xas)
>>>> +{
>>>> +    struct dentry *child, *found = NULL;
>>>> +
>>>> +    rcu_read_lock();
>>>> +    child = xas_next_entry(xas, U32_MAX);
>>>> +    if (!child)
>>>> +        goto out;
>>>> +    spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
>>>> +    if (simple_positive(child))
>>>> +        found = dget_dlock(child);
>>>> +    spin_unlock(&child->d_lock);
>>>> +out:
>>>> +    rcu_read_unlock();
>>>> +    return found;
>>>> +}
>>>> +
>>>> +static bool stable_dir_emit(struct dir_context *ctx, struct dentry *dentry)
>>>> +{
>>>> +    struct inode *inode = d_inode(dentry);
>>>> +
>>>> +    return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len,
>>>> +              dentry->d_offset, inode->i_ino,
>>>> +              fs_umode_to_dtype(inode->i_mode));
>>>> +}
>>>> +
>>>> +static void stable_iterate_dir(struct dentry *dir, struct dir_context *ctx)
>>>> +{
>>>> +    XA_STATE(xas, &((d_inode(dir))->i_doff_map), ctx->pos);
>>>> +    struct dentry *dentry;
>>>> +
>>>> +    while (true) {
>>>> +        spin_lock(&dir->d_lock);
>>>> +        dentry = stable_find_next(&xas);
>>>> +        spin_unlock(&dir->d_lock);
>>>> +        if (!dentry)
>>>> +            break;
>>>> +
>>>> +        if (!stable_dir_emit(ctx, dentry)) {
>>>> +            dput(dentry);
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        dput(dentry);
>>>> +        ctx->pos = xas.xa_index + 1;
>>>> +    }
>>>> +}
>>>> +
>>>> +/**
>>>> + * stable_readdir - Emit entries starting at offset @ctx->pos
>>>> + * @file: an open directory to iterate over
>>>> + * @ctx: directory iteration context
>>>> + *
>>>> + * Caller must hold @file's i_rwsem to prevent insertion or removal of
>>>> + * entries during this call.
>>>> + *
>>>> + * On entry, @ctx->pos contains an offset that represents the first entry
>>>> + * to be read from the directory.
>>>> + *
>>>> + * The operation continues until there are no more entries to read, or
>>>> + * until the ctx->actor indicates there is no more space in the caller's
>>>> + * output buffer.
>>>> + *
>>>> + * On return, @ctx->pos contains an offset that will read the next entry
>>>> + * in this directory when shmem_readdir() is called again with @ctx.
>>>> + *
>>>> + * Return values:
>>>> + *   %0 - Complete
>>>> + */
>>>> +static int stable_readdir(struct file *file, struct dir_context *ctx)
>>>> +{
>>>> +    struct dentry *dir = file->f_path.dentry;
>>>> +
>>>> +    lockdep_assert_held(&d_inode(dir)->i_rwsem);
>>>> +
>>>> +    if (!dir_emit_dots(file, ctx))
>>>> +        return 0;
>>>> +
>>>> +    stable_iterate_dir(dir, ctx);
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +const struct file_operations stable_dir_operations = {
>>>> +    .llseek        = stable_dir_llseek,
>>>> +    .iterate_shared    = stable_readdir,
>>>> +    .read        = generic_read_dir,
>>>> +    .fsync        = noop_fsync,
>>>> +};
>>>> +EXPORT_SYMBOL(stable_dir_operations);
>>>> +
>>>>    static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
>>>>    {
>>>>        struct dentry *child = NULL;
>>>> diff --git a/include/linux/dcache.h b/include/linux/dcache.h
>>>> index 6b351e009f59..579ce1800efe 100644
>>>> --- a/include/linux/dcache.h
>>>> +++ b/include/linux/dcache.h
>>>> @@ -96,6 +96,7 @@ struct dentry {
>>>>        struct super_block *d_sb;    /* The root of the dentry tree */
>>>>        unsigned long d_time;        /* used by d_revalidate */
>>>>        void *d_fsdata;            /* fs-specific data */
>>>> +    u32 d_offset;            /* directory offset in parent */
>>>>        union {
>>>>            struct list_head d_lru;        /* LRU list */
>>>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>>>> index 133f0640fb24..3fc2c04ed8ff 100644
>>>> --- a/include/linux/fs.h
>>>> +++ b/include/linux/fs.h
>>>> @@ -719,6 +719,10 @@ struct inode {
>>>>    #endif
>>>>        void            *i_private; /* fs or device private pointer */
>>>> +
>>>> +    /* simplefs stable directory offset tracking */
>>>> +    struct xarray        i_doff_map;
>>>> +    u32            i_next_offset;
>>> Hmm, I was grepping through the patches and only find that "i_next_offset" is initialized to 0 and then passed to xa_alloc_cyclic - does this really need to part of struct inode or could it be a local variable in stable_offset_add()?
> 
> This is a per-directory value so that each directory can use
> the full range of offsets (from zero to UINT_MAX). If there were
> only a single next_offset field, then all tmpfs directories
> would share the offset range, which is not scalable (not to
> mention, it would also be unlike the behavior of other
> filesystems).
> 
> Yes, we could move this to the shmem-private part of the inode,
> but then the API gets a little uglier.

Oh right, sorry, I was reading this wrong as per dentry value.

> 
> 
>>> I only managed to look a bit through the patches right now, personally I like v2 better as it doesn't extend struct inode with changes that can be used by in-memory file system only. What do others think? An alternative would be to have these fields in struct shmem_inode_info and pass it as extra argument to the stable_ functions?


  reply	other threads:[~2023-06-26 15:03 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <168605676256.32244.6158641147817585524.stgit@manet.1015granger.net>
2023-06-21 13:12 ` [PATCH v3 0/3] shmemfs stable directory offsets Chuck Lever III
2023-06-21 14:25   ` Christian Brauner
     [not found] ` <168605705924.32244.13384849924097654559.stgit@manet.1015granger.net>
2023-06-23 22:21   ` [PATCH v3 1/3] libfs: Add directory operations for stable offsets Bernd Schubert
2023-06-26 13:36     ` Bernd Schubert
2023-06-26 14:50       ` Chuck Lever III
2023-06-26 15:01         ` Bernd Schubert [this message]
2023-06-26 15:22       ` Bernd Schubert
     [not found] ` <168605707262.32244.4794425063054676856.stgit@manet.1015granger.net>
2023-06-26 13:18   ` [PATCH v3 3/3] shmem: stable directory offsets Bernd Schubert
2023-06-26 15:16     ` Chuck Lever III

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=a6b09f29-221d-7b49-edc9-2f4430bbbee2@fastmail.fm \
    --to=bernd.schubert@fastmail.fm \
    --cc=akpm@linux-foundation.org \
    --cc=brauner@kernel.org \
    --cc=cel@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=hughd@google.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox