From: Elliot Berman <quic_eberman@quicinc.com>
To: Andrew Morton <akpm@linux-foundation.org>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Sean Christopherson <seanjc@google.com>,
	Fuad Tabba <tabba@google.com>,
	David Hildenbrand <david@redhat.com>,
	Patrick Roy <roypat@amazon.co.uk>, <qperret@google.com>,
	Ackerley Tng <ackerleytng@google.com>
Cc: <linux-coco@lists.linux.dev>, <linux-arm-msm@vger.kernel.org>,
	<linux-kernel@vger.kernel.org>, <linux-mm@kvack.org>,
	<kvm@vger.kernel.org>, Elliot Berman <quic_eberman@quicinc.com>
Subject: [PATCH RFC 1/4] mm: Introduce guest_memfd
Date: Mon, 5 Aug 2024 11:34:47 -0700
Message-ID: <20240805-guest-memfd-lib-v1-1-e5a29a4ff5d7@quicinc.com>
In-Reply-To: <20240805-guest-memfd-lib-v1-0-e5a29a4ff5d7@quicinc.com>

In preparation for adding more features to KVM's guest_memfd, refactor
it into a library that abstracts the core-mm decisions about managing
folios associated with the file. The refactor serves two purposes:

Provide an easier way to reason about memory in guest_memfd. With KVM
supporting multiple confidentiality models (TDX, SEV-SNP, pKVM, ARM
CCA), and upcoming support for kernel and userspace access to this
memory, a stronger abstraction between core-mm concerns and hypervisor
concerns is needed.

Provide a common implementation for other hypervisors (Gunyah) to use.
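
To illustrate how a hypervisor backend plugs into the library, here is
a minimal, hypothetical consumer. The my_hyp_* names are invented for
this sketch and are not part of the patch; only the invalidate_begin
and release ops are required:

static int my_hyp_invalidate_begin(struct inode *inode, pgoff_t offset,
				   unsigned long nr)
{
	/* Unmap [offset, offset + nr) from the guest; fail if we cannot. */
	return 0;
}

static int my_hyp_release(struct inode *inode)
{
	/* Give all folios back before the inode is destroyed. */
	return 0;
}

static const struct guest_memfd_operations my_hyp_gmem_ops = {
	.invalidate_begin = my_hyp_invalidate_begin,
	.release = my_hyp_release,
};

/* Create a 2M file and grab its first folio, zeroed and prepared. */
static struct file *my_hyp_create(void)
{
	struct file *file;
	struct folio *folio;

	file = guest_memfd_alloc("my_hyp_gmem", &my_hyp_gmem_ops, SZ_2M, 0);
	if (IS_ERR(file))
		return file;

	folio = guest_memfd_grab_folio(file, 0, GUEST_MEMFD_GRAB_UPTODATE |
						GUEST_MEMFD_PREPARE);
	if (IS_ERR(folio)) {
		fput(file);
		return ERR_CAST(folio);
	}

	/* guest_memfd_grab_folio() returns the folio locked with a ref. */
	folio_unlock(folio);
	folio_put(folio);

	return file;
}

The library owns zeroing and the uptodate bookkeeping; the backend only
decides when guest mappings must be torn down.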

Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
---
 include/linux/guest_memfd.h |  44 +++++++
 mm/Kconfig                  |   3 +
 mm/Makefile                 |   1 +
 mm/guest_memfd.c            | 285 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 333 insertions(+)

diff --git a/include/linux/guest_memfd.h b/include/linux/guest_memfd.h
new file mode 100644
index 000000000000..be56d9d53067
--- /dev/null
+++ b/include/linux/guest_memfd.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#ifndef _LINUX_GUEST_MEMFD_H
+#define _LINUX_GUEST_MEMFD_H
+
+#include <linux/fs.h>
+
+/**
+ * struct guest_memfd_operations - ops provided by owner to manage folios
+ * @invalidate_begin: called when folios should be unmapped from the guest.
+ *                    May fail if the folios cannot be unmapped from the
+ *                    guest. Required.
+ * @invalidate_end: called after invalidate_begin returns success. Optional.
+ * @prepare: called before a folio is mapped into the guest address space.
+ *           Optional.
+ * @release: Called when releasing the guest_memfd file. Required.
+ */
+struct guest_memfd_operations {
+	int (*invalidate_begin)(struct inode *inode, pgoff_t offset, unsigned long nr);
+	void (*invalidate_end)(struct inode *inode, pgoff_t offset, unsigned long nr);
+	int (*prepare)(struct inode *inode, pgoff_t offset, struct folio *folio);
+	int (*release)(struct inode *inode);
+};
+
+/**
+ * @GUEST_MEMFD_GRAB_UPTODATE: Ensure pages are zeroed/up to date. May be
+ *                             omitted if a trusted hypervisor zeroes them.
+ * @GUEST_MEMFD_PREPARE: Call the ->prepare() op, if present.
+ */
+enum {
+	GUEST_MEMFD_GRAB_UPTODATE	= BIT(0),
+	GUEST_MEMFD_PREPARE		= BIT(1),
+};
+
+struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags);
+struct file *guest_memfd_alloc(const char *name,
+			       const struct guest_memfd_operations *ops,
+			       loff_t size, unsigned long flags);
+bool is_guest_memfd(struct file *file, const struct guest_memfd_operations *ops);
+
+#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index b72e7d040f78..333f46525695 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1168,6 +1168,9 @@ config SECRETMEM
 	  memory areas visible only in the context of the owning process and
 	  not mapped to other processes and other kernel page tables.
 
+config GUEST_MEMFD
+	tristate
+
 config ANON_VMA_NAME
 	bool "Anonymous VMA name support"
 	depends on PROC_FS && ADVISE_SYSCALLS && MMU
diff --git a/mm/Makefile b/mm/Makefile
index d2915f8c9dc0..e15a95ebeac5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -122,6 +122,7 @@ obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
 obj-$(CONFIG_SECRETMEM) += secretmem.o
+obj-$(CONFIG_GUEST_MEMFD) += guest_memfd.o
 obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
 obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
 obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
diff --git a/mm/guest_memfd.c b/mm/guest_memfd.c
new file mode 100644
index 000000000000..580138b0f9d4
--- /dev/null
+++ b/mm/guest_memfd.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#include <linux/anon_inodes.h>
+#include <linux/falloc.h>
+#include <linux/guest_memfd.h>
+#include <linux/pagemap.h>
+
+struct folio *guest_memfd_grab_folio(struct file *file, pgoff_t index, u32 flags)
+{
+	struct inode *inode = file_inode(file);
+	const struct guest_memfd_operations *ops = inode->i_private;
+	struct folio *folio;
+	int r;
+
+	/* TODO: Support huge pages. */
+	folio = filemap_grab_folio(inode->i_mapping, index);
+	if (IS_ERR(folio))
+		return folio;
+
+	/*
+	 * Use the up-to-date flag to track whether or not the memory has been
+	 * zeroed before being handed off to the guest.  There is no backing
+	 * storage for the memory, so the folio will remain up-to-date until
+	 * it's removed.
+	 */
+	if ((flags & GUEST_MEMFD_GRAB_UPTODATE) &&
+	    !folio_test_uptodate(folio)) {
+		unsigned long nr_pages = folio_nr_pages(folio);
+		unsigned long i;
+
+		for (i = 0; i < nr_pages; i++)
+			clear_highpage(folio_page(folio, i));
+
+		folio_mark_uptodate(folio);
+	}
+
+	if ((flags & GUEST_MEMFD_PREPARE) && ops->prepare) {
+		r = ops->prepare(inode, index, folio);
+		if (r < 0)
+			goto out_err;
+	}
+
+	/*
+	 * Ignore accessed, referenced, and dirty flags.  The memory is
+	 * unevictable and there is no storage to write back to.
+	 */
+	return folio;
+out_err:
+	folio_unlock(folio);
+	folio_put(folio);
+	return ERR_PTR(r);
+}
+EXPORT_SYMBOL_GPL(guest_memfd_grab_folio);
+
+static long gmem_punch_hole(struct file *file, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	const struct guest_memfd_operations *ops = inode->i_private;
+	pgoff_t start = offset >> PAGE_SHIFT;
+	unsigned long nr = len >> PAGE_SHIFT;
+	long ret;
+
+	/*
+	 * Bindings must be stable across invalidation to ensure the start+end
+	 * are balanced.
+	 */
+	filemap_invalidate_lock(inode->i_mapping);
+
+	ret = ops->invalidate_begin(inode, start, nr);
+	if (ret)
+		goto out;
+
+	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
+
+	if (ops->invalidate_end)
+		ops->invalidate_end(inode, start, nr);
+
+out:
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	return ret;
+}
+
+static long gmem_allocate(struct file *file, loff_t offset, loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t start, index, end;
+	int r;
+
+	/* The file size is fixed at creation; don't allow allocating past EOF. */
+	if (offset + len > i_size_read(inode))
+		return -EINVAL;
+
+	filemap_invalidate_lock_shared(mapping);
+
+	start = offset >> PAGE_SHIFT;
+	end = (offset + len) >> PAGE_SHIFT;
+
+	r = 0;
+	for (index = start; index < end;) {
+		struct folio *folio;
+
+		if (signal_pending(current)) {
+			r = -EINTR;
+			break;
+		}
+
+		folio = guest_memfd_grab_folio(file, index,
+					       GUEST_MEMFD_GRAB_UPTODATE |
+						       GUEST_MEMFD_PREPARE);
+		if (IS_ERR(folio)) {
+			r = PTR_ERR(folio);
+			break;
+		}
+
+		index = folio_next_index(folio);
+
+		folio_unlock(folio);
+		folio_put(folio);
+
+		/* 64-bit only, wrapping the index should be impossible. */
+		if (WARN_ON_ONCE(!index))
+			break;
+
+		cond_resched();
+	}
+
+	filemap_invalidate_unlock_shared(mapping);
+
+	return r;
+}
+
+static long gmem_fallocate(struct file *file, int mode, loff_t offset,
+			   loff_t len)
+{
+	int ret;
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		return -EOPNOTSUPP;
+
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+		return -EINVAL;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		ret = gmem_punch_hole(file, offset, len);
+	else
+		ret = gmem_allocate(file, offset, len);
+
+	if (!ret)
+		file_modified(file);
+	return ret;
+}
+
+static int gmem_release(struct inode *inode, struct file *file)
+{
+	const struct guest_memfd_operations *ops = inode->i_private;
+
+	return ops->release(inode);
+}
+
+static const struct file_operations gmem_fops = {
+	.open = generic_file_open,
+	.llseek = generic_file_llseek,
+	.release = gmem_release,
+	.fallocate = gmem_fallocate,
+	.owner = THIS_MODULE,
+};
+
+static int gmem_migrate_folio(struct address_space *mapping, struct folio *dst,
+			      struct folio *src, enum migrate_mode mode)
+{
+	WARN_ON_ONCE(1);
+	return -EINVAL;
+}
+
+static int gmem_error_folio(struct address_space *mapping, struct folio *folio)
+{
+	struct inode *inode = mapping->host;
+	const struct guest_memfd_operations *ops = inode->i_private;
+	pgoff_t offset = folio->index;
+	size_t nr = folio_nr_pages(folio);
+	int ret;
+
+	filemap_invalidate_lock_shared(mapping);
+
+	ret = ops->invalidate_begin(inode, offset, nr);
+	if (!ret && ops->invalidate_end)
+		ops->invalidate_end(inode, offset, nr);
+
+	filemap_invalidate_unlock_shared(mapping);
+
+	return ret;
+}
+
+static bool gmem_release_folio(struct folio *folio, gfp_t gfp)
+{
+	struct inode *inode = folio_inode(folio);
+	const struct guest_memfd_operations *ops = inode->i_private;
+	pgoff_t offset = folio->index;
+	size_t nr = folio_nr_pages(folio);
+	int ret;
+
+	ret = ops->invalidate_begin(inode, offset, nr);
+	if (ret)
+		return false;
+	if (ops->invalidate_end)
+		ops->invalidate_end(inode, offset, nr);
+
+	return true;
+}
+
+static const struct address_space_operations gmem_aops = {
+	.dirty_folio = noop_dirty_folio,
+	.migrate_folio = gmem_migrate_folio,
+	.error_remove_folio = gmem_error_folio,
+	.release_folio = gmem_release_folio,
+};
+
+static inline bool guest_memfd_check_ops(const struct guest_memfd_operations *ops)
+{
+	return ops->invalidate_begin && ops->release;
+}
+
+struct file *guest_memfd_alloc(const char *name,
+			       const struct guest_memfd_operations *ops,
+			       loff_t size, unsigned long flags)
+{
+	struct inode *inode;
+	struct file *file;
+
+	if (size <= 0 || !PAGE_ALIGNED(size))
+		return ERR_PTR(-EINVAL);
+
+	if (!guest_memfd_check_ops(ops))
+		return ERR_PTR(-EINVAL);
+
+	if (flags)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Use the so-called "secure" variant, which creates a unique inode
+	 * instead of reusing a single inode.  Each guest_memfd instance needs
+	 * its own inode to track the size, flags, etc.
+	 */
+	file = anon_inode_create_getfile(name, &gmem_fops, (void *)flags,
+					 O_RDWR, NULL);
+	if (IS_ERR(file))
+		return file;
+
+	file->f_flags |= O_LARGEFILE;
+
+	inode = file_inode(file);
+	WARN_ON(file->f_mapping != inode->i_mapping);
+
+	inode->i_private = (void *)ops; /* discards const qualifier */
+	inode->i_mapping->a_ops = &gmem_aops;
+	inode->i_mode |= S_IFREG;
+	inode->i_size = size;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+	mapping_set_inaccessible(inode->i_mapping);
+	/* Unmovable mappings are supposed to be marked unevictable as well. */
+	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+	return file;
+}
+EXPORT_SYMBOL_GPL(guest_memfd_alloc);
+
+bool is_guest_memfd(struct file *file, const struct guest_memfd_operations *ops)
+{
+	const struct guest_memfd_operations *gops;
+
+	if (file->f_op != &gmem_fops)
+		return false;
+
+	gops = file_inode(file)->i_private;
+	return ops == gops;
+}
+EXPORT_SYMBOL_GPL(is_guest_memfd);

-- 
2.34.1


