* Re: [RFC PATCH 1/1] fs,ns: allow copying of shm_mnt mount trees
2026-01-29 17:35 ` [RFC PATCH 1/1] fs,ns: allow copying of shm_mnt mount trees Snaipe
2026-01-30 8:38 ` Johannes Thumshirn
@ 2026-02-04 16:51 ` Christian Brauner
1 sibling, 0 replies; 5+ messages in thread
From: Christian Brauner @ 2026-02-04 16:51 UTC (permalink / raw)
To: Snaipe; +Cc: linux-fsdevel, linux-mm
On Thu, Jan 29, 2026 at 06:35:15PM +0100, Snaipe wrote:
> From: "Franklin \"Snaipe\" Mathieu" <me@snai.pe>
>
> The main motivation for this change is to be able to bind-mount memfd file
> descriptors. Prior to this change, it was not easy for a process to
> create a private in-memory handle that could then be bind-mounted.
>
> A process had to have access to a tmpfs, create a file in it, call
> open_tree on the resulting file descriptor, close the original file
> descriptor, unlink the file, and then check that no other process raced
> the process to open the new file. Doable, but not great for mounting
> sensitive content like secrets.
>
> With this change, it is now possible for a process to prepare a memfd,
> and call open_tree on it:
>
> int tmpfd = memfd_create("secret", 0);
> fchmod(tmpfd, 0600);
> write(tmpfd, "SecretKey", 9);
>
> int treefd = open_tree(tmpfd, "", OPEN_TREE_CLONE|AT_EMPTY_PATH|AT_RECURSIVE);
> move_mount(treefd, "", -1, "/secret.txt", MOVE_MOUNT_F_EMPTY_PATH);
>
> Signed-off-by: Franklin "Snaipe" Mathieu <me@snai.pe>
> ---
> fs/namespace.c | 8 ++++++++
> mm/internal.h | 2 ++
> mm/shmem.c | 2 +-
> 3 files changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d82910f33dc4..f51ad2013662 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -38,6 +38,9 @@
> #include "pnode.h"
> #include "internal.h"
>
> +/* For checking memfd bind-mounts via shm_mnt */
> +#include "../mm/internal.h"
> +
> /* Maximum number of mounts in a mount namespace */
> static unsigned int sysctl_mount_max __read_mostly = 100000;
>
> @@ -2901,6 +2904,8 @@ static int do_change_type(const struct path *path, int ms_flags)
> * (3) The caller tries to copy a pidfs mount referring to a pidfd.
> * (4) The caller is trying to copy a mount tree that belongs to an
> * anonymous mount namespace.
> + * (5) The caller is trying to copy a mount tree belonging to shm_mnt
> + * (e.g. bind-mounting a file descriptor obtained from memfd_create)
> *
> * For that to be safe, this helper enforces that the origin mount
> * namespace the anonymous mount namespace was created from is the
> @@ -2943,6 +2948,9 @@ static inline bool may_copy_tree(const struct path *path)
> if (d_op == &pidfs_dentry_operations)
> return true;
>
> + if (path->mnt == shm_mnt)
> + return true;
The problem with this approach is that it allows bind-mounting anything
that uses the internal tmpfs mount and that, while it allows
bind-mounting tmpfs, it excludes memfd_create() calls that are hugetlb
backed. So this would allow:
arch/x86/kernel/cpu/sgx/ioctl.c: backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5),
drivers/gpu/drm/drm_gem.c: filp = shmem_file_setup("drm mm object", size, VM_NORESERVE);
drivers/gpu/drm/i915/gem/i915_gem_shmem.c: filp = shmem_file_setup("i915", size, flags);
drivers/gpu/drm/i915/gem/i915_gem_ttm.c: filp = shmem_file_setup("i915-shmem-tt", size, VM_NORESERVE);
drivers/gpu/drm/i915/gt/shmem_utils.c: file = shmem_file_setup(name, PAGE_ALIGN(len), VM_NORESERVE);
drivers/gpu/drm/ttm/tests/ttm_tt_test.c: shmem = shmem_file_setup("ttm swap", BO_SIZE, 0);
drivers/gpu/drm/ttm/ttm_backup.c: return shmem_file_setup("ttm shmem backup", size, 0);
drivers/gpu/drm/ttm/ttm_tt.c: swap_storage = shmem_file_setup("ttm swap", size, 0);
include/linux/shmem_fs.h:extern struct file *shmem_file_setup(const char *name,
mm/memfd.c: file = shmem_file_setup(name, 0, VM_NORESERVE);
mm/memfd_luo.c: file = shmem_file_setup("", 0, VM_NORESERVE);
mm/shmem.c:static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
mm/shmem.c: return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
mm/shmem.c:struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
mm/shmem.c: return __shmem_file_setup(shm_mnt, name, size, flags, 0);
mm/shmem.c: return __shmem_file_setup(mnt, name, size, flags, 0);
fs/xfs/scrub/xfile.c: xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE);
fs/xfs/xfs_buf_mem.c: file = shmem_kernel_file_setup(descr, 0, 0);
include/linux/shmem_fs.h:extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
ipc/shm.c: file = shmem_kernel_file_setup(name, size, acctflag);
mm/shmem.c: * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
mm/shmem.c:struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
mm/shmem.c:EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
mm/shmem.c: * bypass file security, in the same way as shmem_kernel_file_setup().
mm/shmem.c: return shmem_kernel_file_setup("dev/zero", size, vm_flags);
security/keys/big_key.c: file = shmem_kernel_file_setup("", enclen, 0);
which is a no-no. If we want to support that we might need to come up
with something more granular.
One way to work around this is something like the DRAFT, UNTESTED, BREAKS,
DOESN'T COMPILE thing below [1]. It copies the shm_mnt for memfds. If
you have multiple things that want to bind-mount and that rely on the
internal tmpfs mount this code should instead create an internal
shm_mnt_clonable mount that can be reused by the respective subsystems.
The problem is hugetlbfs which creates a couple of mounts but it's max 5
so it's probably ok to do that as well. But ugly as sin.
The other option is to fsck around with the file operations - also ugly
as sin. The third option is [3] via an inode flag. It's also not
completely clean but it's preferable to the other ones. Then you can
check for an inode flag. pidfs and nsfs could also be switched over to
this unless I'm missing something.
So what will it be: Pest or Cholera?
[3]:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 78699d2ec0bb..f806ee130b37 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2122,6 +2122,7 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
#define S_ANON_INODE (1 << 19) /* Inode is an anonymous inode */
+#define S_KERN_MOUNTABLE (1 << 20) /* Inode is kernel internal but mountable. */
diff --git a/mm/shmem.c b/mm/shmem.c
index 88ef1fd5cd38..93e443e580b2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -5880,6 +5880,11 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
+/*
+ * shmem_file_mountable - create a file on the internal shm_mnt whose inode
+ * is tagged S_KERN_MOUNTABLE.
+ *
+ * Forwards @name, @size and @flags unchanged to __shmem_file_setup() and
+ * returns whatever that helper returns.
+ */
+struct file *shmem_file_mountable(const char *name, loff_t size, unsigned long flags)
+{
+	struct file *file;
+
+	file = __shmem_file_setup(shm_mnt, name, size, flags, S_KERN_MOUNTABLE);
+	return file;
+}
+
/**
* shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
* @mnt: the tmpfs mount where the file will be created
diff --git a/mm/memfd.c b/mm/memfd.c
index ab5312aff14b..75fd0f5b7b27 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -464,12 +464,13 @@ static struct file *alloc_file(const char *name, unsigned int flags)
int err = 0;
if (flags & MFD_HUGETLB) {
+ /* Do the same for hugetlbfs. */
file = hugetlb_file_setup(name, 0, VM_NORESERVE,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
} else {
- file = shmem_file_setup(name, 0, VM_NORESERVE);
+ file = shmem_file_mountable(name, 0, VM_NORESERVE);
}
if (IS_ERR(file))
return file;
diff --git a/fs/namespace.c b/fs/namespace.c
index 080659ea7e62..1c6be54b2f08 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2399,6 +2399,23 @@ struct vfsmount *clone_private_mount(const struct path *path)
}
EXPORT_SYMBOL_GPL(clone_private_mount);
+/*
+ * vfs_clone_kern_mount - create a private clone of a kernel-internal mount.
+ * @mnt: mount to clone; must be marked MNT_NS_INTERNAL
+ *
+ * The clone is rooted at @mnt's root, created with CL_PRIVATE, and is itself
+ * marked MNT_NS_INTERNAL.
+ *
+ * Return: the new vfsmount on success, ERR_PTR(-EINVAL) if @mnt is not an
+ * internal mount, or the error pointer reported by clone_mnt().
+ *
+ * NOTE(review): mnt_ns appears to be a member of struct mount rather than
+ * struct vfsmount, and clone_mnt() appears to take a struct mount - this
+ * probably needs a real_mount() conversion of @mnt; confirm before use.
+ */
+struct vfsmount *vfs_clone_kern_mount(const struct vfsmount *mnt)
+{
+	struct mount *new_mnt;
+
+	/* Held for the rest of the function via the scope-based guard. */
+	guard(namespace_shared)();
+
+	if (WARN_ON_ONCE(mnt->mnt_ns != MNT_NS_INTERNAL))
+		return ERR_PTR(-EINVAL);
+
+	new_mnt = clone_mnt(mnt, mnt->mnt_root, CL_PRIVATE);
+	if (IS_ERR(new_mnt))
+		/* Hand the real failure back instead of masking it as -EINVAL. */
+		return ERR_CAST(new_mnt);
+
+	new_mnt->mnt_ns = MNT_NS_INTERNAL;
+	return &new_mnt->mnt;
+}
+
static void lock_mnt_tree(struct mount *mnt)
{
struct mount *p;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index acfe7ef86a1b..8faa864d8f05 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -80,6 +80,7 @@ extern bool __mnt_is_readonly(const struct vfsmount *mnt);
extern bool mnt_may_suid(struct vfsmount *mnt);
extern struct vfsmount *clone_private_mount(const struct path *path);
+struct vfsmount *vfs_clone_kern_mount(const struct vfsmount *mnt);
int mnt_get_write_access(struct vfsmount *mnt);
void mnt_put_write_access(struct vfsmount *mnt);
diff --git a/mm/memfd.c b/mm/memfd.c
index ab5312aff14b..bffb5281e082 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -22,6 +22,9 @@
#include <uapi/linux/memfd.h>
#include "swap.h"
+static struct vfsmount *memfd_shm_mnt __ro_after_init;
+static struct vfsmount *__memfd_internal_mnt __ro_after_init;
+
/*
* We need a tag: a new tag would expand every xa_node by 8 bytes,
* so reuse a tag which we firmly believe is never set or cleared on tmpfs
@@ -464,12 +467,13 @@ static struct file *alloc_file(const char *name, unsigned int flags)
int err = 0;
if (flags & MFD_HUGETLB) {
+ /* Do the same for hugetlbfs. */
file = hugetlb_file_setup(name, 0, VM_NORESERVE,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
} else {
- file = shmem_file_setup(name, 0, VM_NORESERVE);
+ file = shmem_file_setup_with_mnt(__memfd_internal_mnt, name, 0, VM_NORESERVE);
}
if (IS_ERR(file))
return file;
@@ -522,3 +526,12 @@ SYSCALL_DEFINE2(memfd_create,
fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0;
return FD_ADD(fd_flags, alloc_file(name, flags));
}
+
+/*
+ * memfd_secret_init - pick the mount that shmem-backed memfds are created on.
+ * @mnt: internal mount to clone with vfs_clone_kern_mount()
+ *
+ * If cloning succeeds, __memfd_internal_mnt points at the clone; otherwise
+ * it falls back to @mnt itself.
+ *
+ * NOTE(review): @mnt is const-qualified while __memfd_internal_mnt is not,
+ * so the fallback assignment drops the qualifier - confirm the intended
+ * qualifiers.
+ */
+void __init memfd_secret_init(const struct vfsmount *mnt)
+{
+	memfd_shm_mnt = vfs_clone_kern_mount(mnt);
+	/* IS_ERR() tests for an error pointer; ERR_PTR() would construct one. */
+	if (IS_ERR(memfd_shm_mnt)) {
+		/*
+		 * Leave memfd_shm_mnt as an error pointer so comparison
+		 * against another mount always fails.
+		 */
+		__memfd_internal_mnt = mnt;
+	} else {
+		__memfd_internal_mnt = memfd_shm_mnt;
+	}
+}
^ permalink raw reply [flat|nested] 5+ messages in thread