From: Ryan Roberts <ryan.roberts@arm.com>
To: Catalin Marinas <catalin.marinas@arm.com>,
Will Deacon <will@kernel.org>, Ard Biesheuvel <ardb@kernel.org>,
Marc Zyngier <maz@kernel.org>, James Morse <james.morse@arm.com>,
Andrey Ryabinin <ryabinin.a.a@gmail.com>,
Andrew Morton <akpm@linux-foundation.org>,
Matthew Wilcox <willy@infradead.org>,
Mark Rutland <mark.rutland@arm.com>,
David Hildenbrand <david@redhat.com>,
Kefeng Wang <wangkefeng.wang@huawei.com>,
John Hubbard <jhubbard@nvidia.com>, Zi Yan <ziy@nvidia.com>,
Barry Song <21cnbao@gmail.com>,
Alistair Popple <apopple@nvidia.com>,
Yang Shi <shy828301@gmail.com>,
Nicholas Piggin <npiggin@gmail.com>,
Christophe Leroy <christophe.leroy@csgroup.eu>,
"Aneesh Kumar K.V" <aneesh.kumar@kernel.org>,
"Naveen N. Rao" <naveen.n.rao@linux.ibm.com>,
Thomas Gleixner <tglx@linutronix.de>,
Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
Dave Hansen <dave.hansen@linux.intel.com>,
"H. Peter Anvin" <hpa@zytor.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>,
linux-arm-kernel@lists.infradead.org, x86@kernel.org,
linuxppc-dev@lists.ozlabs.org, linux-mm@kvack.org,
linux-kernel@vger.kernel.org
Subject: [PATCH v5 20/25] arm64/mm: Implement new wrprotect_ptes() batch API
Date: Fri, 2 Feb 2024 08:07:51 +0000 [thread overview]
Message-ID: <20240202080756.1453939-21-ryan.roberts@arm.com> (raw)
In-Reply-To: <20240202080756.1453939-1-ryan.roberts@arm.com>
Optimize the contpte implementation to fix some of the fork performance
regression introduced by the initial contpte commit. Subsequent patches
will solve it entirely.
During fork(), any private memory in the parent must be write-protected.
Previously this was done 1 PTE at a time. But the core-mm supports
batched wrprotect via the new wrprotect_ptes() API. So let's implement
that API and for fully covered contpte mappings, we no longer need to
unfold the contpte. This has 2 benefits:
- reduced unfolding, reduces the number of tlbis that must be issued.
- The memory remains contpte-mapped ("folded") in the parent, so it
continues to benefit from the more efficient use of the TLB after
the fork.
The optimization to wrprotect a whole contpte block without unfolding is
possible thanks to the tightening of the Arm ARM in respect to the
definition and behaviour when 'Misprogramming the Contiguous bit'. See
section D21194 at https://developer.arm.com/documentation/102105/latest/
Tested-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
arch/arm64/include/asm/pgtable.h | 61 ++++++++++++++++++++++++++------
arch/arm64/mm/contpte.c | 35 ++++++++++++++++++
2 files changed, 86 insertions(+), 10 deletions(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 34892a95403d..c07f0d563733 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -978,16 +978,12 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-/*
- * __ptep_set_wrprotect - mark read-only while trasferring potential hardware
- * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit.
- */
-static inline void __ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long address, pte_t *ptep)
+static inline void ___ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep,
+ pte_t pte)
{
- pte_t old_pte, pte;
+ pte_t old_pte;
- pte = __ptep_get(ptep);
do {
old_pte = pte;
pte = pte_wrprotect(pte);
@@ -996,6 +992,25 @@ static inline void __ptep_set_wrprotect(struct mm_struct *mm,
} while (pte_val(pte) != pte_val(old_pte));
}
+/*
+ * __ptep_set_wrprotect - mark read-only while trasferring potential hardware
+ * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit.
+ */
+static inline void __ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep)
+{
+ ___ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep));
+}
+
+static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address,
+ pte_t *ptep, unsigned int nr)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++)
+ __ptep_set_wrprotect(mm, address, ptep);
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
@@ -1156,6 +1171,8 @@ extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
+extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr);
extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t entry, int dirty);
@@ -1269,12 +1286,35 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
return contpte_ptep_clear_flush_young(vma, addr, ptep);
}
+#define wrprotect_ptes wrprotect_ptes
+static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ if (likely(nr == 1)) {
+ /*
+ * Optimization: wrprotect_ptes() can only be called for present
+ * ptes so we only need to check contig bit as condition for
+ * unfold, and we can remove the contig bit from the pte we read
+ * to avoid re-reading. This speeds up fork() which is sensitive
+ * for order-0 folios. Equivalent to contpte_try_unfold().
+ */
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (unlikely(pte_cont(orig_pte))) {
+ __contpte_try_unfold(mm, addr, ptep, orig_pte);
+ orig_pte = pte_mknoncont(orig_pte);
+ }
+ ___ptep_set_wrprotect(mm, addr, ptep, orig_pte);
+ } else {
+ contpte_wrprotect_ptes(mm, addr, ptep, nr);
+ }
+}
+
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
- __ptep_set_wrprotect(mm, addr, ptep);
+ wrprotect_ptes(mm, addr, ptep, 1);
}
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
@@ -1306,6 +1346,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
#define ptep_clear_flush_young __ptep_clear_flush_young
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
#define ptep_set_wrprotect __ptep_set_wrprotect
+#define wrprotect_ptes __wrprotect_ptes
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags __ptep_set_access_flags
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index bfb50e6b44c7..c85e64baf03b 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -23,6 +23,23 @@ static inline pte_t *contpte_align_down(pte_t *ptep)
return (pte_t *)(ALIGN_DOWN((unsigned long)ptep >> 3, CONT_PTES) << 3);
}
+static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ /*
+ * Unfold any partially covered contpte block at the beginning and end
+ * of the range.
+ */
+
+ if (ptep != contpte_align_down(ptep) || nr < CONT_PTES)
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+
+ if (ptep + nr != contpte_align_down(ptep + nr))
+ contpte_try_unfold(mm, addr + PAGE_SIZE * (nr - 1),
+ ptep + nr - 1,
+ __ptep_get(ptep + nr - 1));
+}
+
static void contpte_convert(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
@@ -236,6 +253,24 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
}
EXPORT_SYMBOL(contpte_ptep_clear_flush_young);
+void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ /*
+ * If wrprotecting an entire contig range, we can avoid unfolding. Just
+ * set wrprotect and wait for the later mmu_gather flush to invalidate
+ * the tlb. Until the flush, the page may or may not be wrprotected.
+ * After the flush, it is guarranteed wrprotected. If its a partial
+ * range though, we must unfold, because we can't have a case where
+ * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this
+ * would cause it to continue to be unpredictable after the flush.
+ */
+
+ contpte_try_unfold_partial(mm, addr, ptep, nr);
+ __wrprotect_ptes(mm, addr, ptep, nr);
+}
+EXPORT_SYMBOL(contpte_wrprotect_ptes);
+
int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t entry, int dirty)
--
2.25.1
next prev parent reply other threads:[~2024-02-02 8:09 UTC|newest]
Thread overview: 80+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-02-02 8:07 [PATCH v5 00/25] Transparent Contiguous PTEs for User Mappings Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 01/25] mm: Clarify the spec for set_ptes() Ryan Roberts
2024-02-12 12:03 ` David Hildenbrand
2024-02-02 8:07 ` [PATCH v5 02/25] mm: thp: Batch-collapse PMD with set_ptes() Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 03/25] mm: Make pte_next_pfn() a wrapper around pte_advance_pfn() Ryan Roberts
2024-02-12 12:14 ` David Hildenbrand
2024-02-12 14:10 ` Ryan Roberts
2024-02-12 14:29 ` David Hildenbrand
2024-02-12 21:34 ` Ryan Roberts
2024-02-13 9:54 ` David Hildenbrand
2024-02-02 8:07 ` [PATCH v5 04/25] arm/mm: Convert pte_next_pfn() to pte_advance_pfn() Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 05/25] arm64/mm: " Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 06/25] powerpc/mm: " Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 07/25] x86/mm: " Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 08/25] mm: Remove pte_next_pfn() and replace with pte_advance_pfn() Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 09/25] arm64/mm: set_pte(): New layer to manage contig bit Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 10/25] arm64/mm: set_ptes()/set_pte_at(): " Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 11/25] arm64/mm: pte_clear(): " Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 12/25] arm64/mm: ptep_get_and_clear(): " Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 13/25] arm64/mm: ptep_test_and_clear_young(): " Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 14/25] arm64/mm: ptep_clear_flush_young(): " Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 15/25] arm64/mm: ptep_set_wrprotect(): " Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 16/25] arm64/mm: ptep_set_access_flags(): " Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 17/25] arm64/mm: ptep_get(): " Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 18/25] arm64/mm: Split __flush_tlb_range() to elide trailing DSB Ryan Roberts
2024-02-12 12:44 ` David Hildenbrand
2024-02-12 13:05 ` Ryan Roberts
2024-02-12 13:15 ` David Hildenbrand
2024-02-12 13:27 ` Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 19/25] arm64/mm: Wire up PTE_CONT for user mappings Ryan Roberts
2024-02-12 12:00 ` Mark Rutland
2024-02-12 12:59 ` Ryan Roberts
2024-02-12 13:54 ` David Hildenbrand
2024-02-12 14:45 ` Ryan Roberts
2024-02-12 15:26 ` David Hildenbrand
2024-02-12 15:34 ` Ryan Roberts
2024-02-12 16:24 ` David Hildenbrand
2024-02-13 15:29 ` Ryan Roberts
2024-02-12 15:30 ` Ryan Roberts
2024-02-12 20:38 ` Ryan Roberts
2024-02-13 10:01 ` David Hildenbrand
2024-02-13 12:06 ` Ryan Roberts
2024-02-13 12:19 ` David Hildenbrand
2024-02-13 13:06 ` Ryan Roberts
2024-02-13 13:13 ` David Hildenbrand
2024-02-13 13:20 ` Ryan Roberts
2024-02-13 13:22 ` David Hildenbrand
2024-02-13 13:24 ` Ryan Roberts
2024-02-13 13:33 ` Ard Biesheuvel
2024-02-13 13:45 ` David Hildenbrand
2024-02-13 14:02 ` Ryan Roberts
2024-02-13 14:05 ` David Hildenbrand
2024-02-13 14:08 ` Ard Biesheuvel
2024-02-13 14:21 ` Ryan Roberts
2024-02-13 12:02 ` Mark Rutland
2024-02-13 13:03 ` Ryan Roberts
2024-02-02 8:07 ` Ryan Roberts [this message]
2024-02-13 16:31 ` [PATCH v5 20/25] arm64/mm: Implement new wrprotect_ptes() batch API Mark Rutland
2024-02-13 16:36 ` Ryan Roberts
2024-02-02 8:07 ` [PATCH v5 21/25] arm64/mm: Implement new [get_and_]clear_full_ptes() batch APIs Ryan Roberts
2024-02-13 16:43 ` Mark Rutland
2024-02-13 16:48 ` Ryan Roberts
2024-02-13 16:53 ` Mark Rutland
2024-02-02 8:07 ` [PATCH v5 22/25] mm: Add pte_batch_hint() to reduce scanning in folio_pte_batch() Ryan Roberts
2024-02-12 13:43 ` David Hildenbrand
2024-02-12 15:00 ` Ryan Roberts
2024-02-12 15:47 ` Ryan Roberts
2024-02-12 16:27 ` David Hildenbrand
2024-02-02 8:07 ` [PATCH v5 23/25] arm64/mm: Implement pte_batch_hint() Ryan Roberts
2024-02-12 13:46 ` David Hildenbrand
2024-02-13 16:54 ` Mark Rutland
2024-02-02 8:07 ` [PATCH v5 24/25] arm64/mm: __always_inline to improve fork() perf Ryan Roberts
2024-02-13 16:55 ` Mark Rutland
2024-02-02 8:07 ` [PATCH v5 25/25] arm64/mm: Automatically fold contpte mappings Ryan Roberts
2024-02-13 17:44 ` Mark Rutland
2024-02-13 18:05 ` Ryan Roberts
2024-02-08 17:34 ` [PATCH v5 00/25] Transparent Contiguous PTEs for User Mappings Mark Rutland
2024-02-09 8:54 ` Ryan Roberts
2024-02-09 22:16 ` David Hildenbrand
2024-02-09 23:52 ` Ryan Roberts
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240202080756.1453939-21-ryan.roberts@arm.com \
--to=ryan.roberts@arm.com \
--cc=21cnbao@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=aneesh.kumar@kernel.org \
--cc=apopple@nvidia.com \
--cc=ardb@kernel.org \
--cc=bp@alien8.de \
--cc=catalin.marinas@arm.com \
--cc=christophe.leroy@csgroup.eu \
--cc=dave.hansen@linux.intel.com \
--cc=david@redhat.com \
--cc=hpa@zytor.com \
--cc=james.morse@arm.com \
--cc=jhubbard@nvidia.com \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=mark.rutland@arm.com \
--cc=maz@kernel.org \
--cc=mingo@redhat.com \
--cc=naveen.n.rao@linux.ibm.com \
--cc=npiggin@gmail.com \
--cc=ryabinin.a.a@gmail.com \
--cc=shy828301@gmail.com \
--cc=tglx@linutronix.de \
--cc=wangkefeng.wang@huawei.com \
--cc=will@kernel.org \
--cc=willy@infradead.org \
--cc=x86@kernel.org \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox