From: Patrick Roy <roypat@amazon.co.uk>
To: <seanjc@google.com>, <pbonzini@redhat.com>,
<akpm@linux-foundation.org>, <dwmw@amazon.co.uk>,
<rppt@kernel.org>, <david@redhat.com>
Cc: Patrick Roy <roypat@amazon.co.uk>, <tglx@linutronix.de>,
<mingo@redhat.com>, <bp@alien8.de>, <dave.hansen@linux.intel.com>,
<x86@kernel.org>, <hpa@zytor.com>, <willy@infradead.org>,
<graf@amazon.com>, <derekmn@amazon.com>, <kalyazin@amazon.com>,
<kvm@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
<linux-mm@kvack.org>, <dmatlack@google.com>, <tabba@google.com>,
<chao.p.peng@linux.intel.com>, <xmarcalx@amazon.co.uk>
Subject: [RFC PATCH 4/8] kvm: x86: support walking guest page tables in gmem
Date: Tue, 9 Jul 2024 14:20:32 +0100
Message-ID: <20240709132041.3625501-5-roypat@amazon.co.uk>
In-Reply-To: <20240709132041.3625501-1-roypat@amazon.co.uk>
Update the logic in paging_tmpl.h to work with guest-private memory. If
KVM cannot access gmem and the guest's page tables are in gfns marked as
private, error out.
Let the guest page table walker access gmem by making it use
gfn_to_pfn_caches, which are already gmem-aware and will later also
handle on-demand mapping of gmem once it supports being removed from
the direct map. We reuse the gfn_to_pfn_cache here to avoid
implementing yet another remapping solution to support the cmpxchg
used to set the "accessed" bit on guest PTEs. The only case that still
needs special handling is page tables in read-only memslots, since
gfn_to_pfn_caches cannot be used for read-only memory. In that case,
use kvm_vcpu_read_guest() (which is also gmem-aware); there is no need
to cache the gfn->pfn translation, because the walker never performs a
cmpxchg to set the accessed bit on read-only PTEs.
Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
arch/x86/kvm/mmu/paging_tmpl.h | 94 ++++++++++++++++++++++++++++------
1 file changed, 77 insertions(+), 17 deletions(-)
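
Note: the hunks below open-code the standard gfn_to_pfn_cache access
pattern (validate the cache under its read lock, refresh it when it has
gone stale, then access the mapping through khva). For reference, the
read side is roughly equivalent to the sketch below, written in
paging_tmpl.h terms; the helper name read_guest_pte_cached() and the
-EFAULT return value are illustrative only and not part of this patch:

static int read_guest_pte_cached(struct gfn_to_pfn_cache *gpc,
				 pt_element_t *pte)
{
	unsigned long flags;

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, sizeof(*pte))) {
		read_unlock_irqrestore(&gpc->lock, flags);

		/* Re-establish the cached gfn->pfn translation and mapping. */
		if (kvm_gpc_refresh(gpc, sizeof(*pte)))
			return -EFAULT;

		read_lock_irqsave(&gpc->lock, flags);
	}

	/* khva already includes the page offset of the cached gpa. */
	*pte = *(pt_element_t *)gpc->khva;
	read_unlock_irqrestore(&gpc->lock, flags);

	return 0;
}

The accessed-bit update in FNAME(update_accessed_dirty_bits) follows
the same pattern, but replaces the plain read with a cmpxchg on khva
and marks the slot dirty via kvm_gpc_mark_dirty_in_slot() on success.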
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 69941cebb3a8..ddf3b4bd479e 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -84,7 +84,7 @@ struct guest_walker {
pt_element_t ptes[PT_MAX_FULL_LEVELS];
pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
- pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
+ struct gfn_to_pfn_cache ptep_caches[PT_MAX_FULL_LEVELS];
bool pte_writable[PT_MAX_FULL_LEVELS];
unsigned int pt_access[PT_MAX_FULL_LEVELS];
unsigned int pte_access;
@@ -201,7 +201,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
{
unsigned level, index;
pt_element_t pte, orig_pte;
- pt_element_t __user *ptep_user;
+ struct gfn_to_pfn_cache *pte_cache;
gfn_t table_gfn;
int ret;
@@ -210,10 +210,12 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
return 0;
for (level = walker->max_level; level >= walker->level; --level) {
+ unsigned long flags;
+
pte = orig_pte = walker->ptes[level - 1];
table_gfn = walker->table_gfn[level - 1];
- ptep_user = walker->ptep_user[level - 1];
- index = offset_in_page(ptep_user) / sizeof(pt_element_t);
+ pte_cache = &walker->ptep_caches[level - 1];
+ index = offset_in_page(pte_cache->khva) / sizeof(pt_element_t);
if (!(pte & PT_GUEST_ACCESSED_MASK)) {
trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
pte |= PT_GUEST_ACCESSED_MASK;
@@ -246,11 +248,26 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
if (unlikely(!walker->pte_writable[level - 1]))
continue;
- ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
+ read_lock_irqsave(&pte_cache->lock, flags);
+ while (!kvm_gpc_check(pte_cache, sizeof(pte))) {
+ read_unlock_irqrestore(&pte_cache->lock, flags);
+
+ ret = kvm_gpc_refresh(pte_cache, sizeof(pte));
+ if (ret)
+ return ret;
+
+ read_lock_irqsave(&pte_cache->lock, flags);
+ }
+ ret = __try_cmpxchg((pt_element_t *)pte_cache->khva, &orig_pte, pte, sizeof(pte));
+
+ if (!ret)
+ kvm_gpc_mark_dirty_in_slot(pte_cache);
+
+ read_unlock_irqrestore(&pte_cache->lock, flags);
+
if (ret)
return ret;
- kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
walker->ptes[level - 1] = pte;
}
return 0;
@@ -296,6 +313,12 @@ static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
return gpte & PT_PAGE_SIZE_MASK;
}
+
+static void FNAME(walk_deactivate_gpcs)(struct guest_walker *walker) {
+ for (unsigned int level = 0; level < PT_MAX_FULL_LEVELS; ++level)
+ kvm_gpc_deactivate(&walker->ptep_caches[level]);
+}
+
/*
* Fetch a guest pte for a guest virtual address, or for an L2's GPA.
*/
@@ -305,7 +328,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
{
int ret;
pt_element_t pte;
- pt_element_t __user *ptep_user;
gfn_t table_gfn;
u64 pt_access, pte_access;
unsigned index, accessed_dirty, pte_pkey;
@@ -320,8 +342,17 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
u16 errcode = 0;
gpa_t real_gpa;
gfn_t gfn;
+ struct gfn_to_pfn_cache *pte_cache;
trace_kvm_mmu_pagetable_walk(addr, access);
+
+ for (unsigned int level = 0; level < PT_MAX_FULL_LEVELS; ++level) {
+ pte_cache = &walker->ptep_caches[level];
+
+ memset(pte_cache, 0, sizeof(*pte_cache));
+ kvm_gpc_init(pte_cache, vcpu->kvm);
+ }
+
retry_walk:
walker->level = mmu->cpu_role.base.level;
pte = kvm_mmu_get_guest_pgd(vcpu, mmu);
@@ -362,11 +393,13 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
do {
struct kvm_memory_slot *slot;
- unsigned long host_addr;
+ unsigned long flags;
pt_access = pte_access;
--walker->level;
+ pte_cache = &walker->ptep_caches[walker->level - 1];
+
index = PT_INDEX(addr, walker->level);
table_gfn = gpte_to_gfn(pte);
offset = index * sizeof(pt_element_t);
@@ -396,15 +429,36 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
if (!kvm_is_visible_memslot(slot))
goto error;
- host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa),
- &walker->pte_writable[walker->level - 1]);
- if (unlikely(kvm_is_error_hva(host_addr)))
- goto error;
+ /*
+ * gfn_to_pfn_cache expects the memory to be writable. However,
+ * if the memory is not writable, we do not need caching in the
+ * first place, as we only need it to later potentially write
+ * the access bit (which we cannot do anyway if the memory is
+ * readonly).
+ */
+ if (slot->flags & KVM_MEM_READONLY) {
+ if (kvm_vcpu_read_guest(vcpu, real_gpa + offset, &pte, sizeof(pte)))
+ goto error;
+ } else {
+ if (kvm_gpc_activate(pte_cache, real_gpa + offset,
+ sizeof(pte)))
+ goto error;
- ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
- if (unlikely(__get_user(pte, ptep_user)))
- goto error;
- walker->ptep_user[walker->level - 1] = ptep_user;
+ read_lock_irqsave(&pte_cache->lock, flags);
+ while (!kvm_gpc_check(pte_cache, sizeof(pte))) {
+ read_unlock_irqrestore(&pte_cache->lock, flags);
+
+ if (kvm_gpc_refresh(pte_cache, sizeof(pte)))
+ goto error;
+
+ read_lock_irqsave(&pte_cache->lock, flags);
+ }
+
+ pte = *(pt_element_t *)pte_cache->khva;
+ read_unlock_irqrestore(&pte_cache->lock, flags);
+
+ walker->pte_writable[walker->level - 1] = true;
+ }
trace_kvm_mmu_paging_element(pte, walker->level);
@@ -467,13 +521,19 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
addr, write_fault);
if (unlikely(ret < 0))
goto error;
- else if (ret)
+ else if (ret) {
+ FNAME(walk_deactivate_gpcs)(walker);
goto retry_walk;
+ }
}
+ FNAME(walk_deactivate_gpcs)(walker);
+
return 1;
error:
+ FNAME(walk_deactivate_gpcs)(walker);
+
errcode |= write_fault | user_fault;
if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
errcode |= PFERR_FETCH_MASK;
--
2.45.2