linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Borislav Petkov <bp@alien8.de>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Hemment <markhemm@googlemail.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	the arch/x86 maintainers <x86@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	patrice.chotard@foss.st.com,
	Mikulas Patocka <mpatocka@redhat.com>,
	Lukas Czerner <lczerner@redhat.com>,
	Christoph Hellwig <hch@lst.de>,
	"Darrick J. Wong" <djwong@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	Hugh Dickins <hughd@google.com>,
	patches@lists.linux.dev, Linux-MM <linux-mm@kvack.org>,
	mm-commits@vger.kernel.org, Mel Gorman <mgorman@suse.de>
Subject: [PATCH] x86/clear_user: Make it faster
Date: Tue, 24 May 2022 14:32:36 +0200	[thread overview]
Message-ID: <YozQZMyQ0NDdD8cH@zn.tnic> (raw)
In-Reply-To: <Ynq1nVpu1xCpjnXm@zn.tnic>

Ok,

finally a somewhat final version, lightly tested.

I still need to run it on production Icelake and that is kinda being
delayed due to server room cooling issues (don't ask ;-\).

---
From: Borislav Petkov <bp@suse.de>

Based on a patch by Mark Hemment <markhemm@googlemail.com> and
incorporating very sane suggestions from Linus.

The point here is to have the default case with FSRM - which is supposed
to be the majority of x86 hw out there - if not now then soon - be
directly inlined into the instruction stream so that no function call
overhead is taking place.

The benchmarks I ran would show very small improvements and a PF
benchmark would even show weird things like slowdowns with higher core
counts.

So for a ~6m running the git test suite, the function gets called under
700K times, all from padzero():

  <...>-2536    [006] .....   261.208801: padzero: to: 0x55b0663ed214, size: 3564, cycles: 21900
  <...>-2536    [006] .....   261.208819: padzero: to: 0x7f061adca078, size: 3976, cycles: 17160
  <...>-2537    [008] .....   261.211027: padzero: to: 0x5572d019e240, size: 3520, cycles: 23850
  <...>-2537    [008] .....   261.211049: padzero: to: 0x7f1288dc9078, size: 3976, cycles: 15900
   ...

which is around 1%-ish of the total time and which is consistent with
the benchmark numbers.

So Mel gave me the idea to simply measure how fast the function becomes.
I.e.:

  start = rdtsc_ordered();
  ret = __clear_user(to, n);
  end = rdtsc_ordered();

Computing the mean average of all the samples collected during the test
suite run then shows some improvement:

  clear_user_original:
  Amean: 9219.71 (Sum: 6340154910, samples: 687674)

  fsrm:
  Amean: 8030.63 (Sum: 5522277720, samples: 687652)

That's on Zen3.

Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/CAHk-=wh=Mu_EYhtOmPn6AxoQZyEh-4fo2Zx3G7rBv1g7vwoKiw@mail.gmail.com
---
 arch/x86/include/asm/uaccess.h    |   5 +-
 arch/x86/include/asm/uaccess_64.h |  45 ++++++++++
 arch/x86/lib/clear_page_64.S      | 142 ++++++++++++++++++++++++++++++
 arch/x86/lib/usercopy_64.c        |  40 ---------
 tools/objtool/check.c             |   3 +
 5 files changed, 192 insertions(+), 43 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index f78e2b3501a1..335d571e8a79 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -405,9 +405,6 @@ strncpy_from_user(char *dst, const char __user *src, long count);
 
 extern __must_check long strnlen_user(const char __user *str, long n);
 
-unsigned long __must_check clear_user(void __user *mem, unsigned long len);
-unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
-
 #ifdef CONFIG_ARCH_HAS_COPY_MC
 unsigned long __must_check
 copy_mc_to_kernel(void *to, const void *from, unsigned len);
@@ -429,6 +426,8 @@ extern struct movsl_mask {
 #define ARCH_HAS_NOCACHE_UACCESS 1
 
 #ifdef CONFIG_X86_32
+unsigned long __must_check clear_user(void __user *mem, unsigned long len);
+unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
 # include <asm/uaccess_32.h>
 #else
 # include <asm/uaccess_64.h>
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 45697e04d771..4ffefb4bb844 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -79,4 +79,49 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
 	kasan_check_write(dst, size);
 	return __copy_user_flushcache(dst, src, size);
 }
+
+/*
+ * Zero Userspace.
+ */
+
+__must_check unsigned long
+clear_user_original(void __user *addr, unsigned long len);
+__must_check unsigned long
+clear_user_rep_good(void __user *addr, unsigned long len);
+__must_check unsigned long
+clear_user_erms(void __user *addr, unsigned long len);
+
+static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
+{
+	might_fault();
+	stac();
+
+	/*
+	 * No memory constraint because it doesn't change any memory gcc
+	 * knows about.
+	 */
+	asm volatile(
+		"1:\n\t"
+		ALTERNATIVE_3("rep stosb",
+			      "call clear_user_erms",	  ALT_NOT(X86_FEATURE_FSRM),
+			      "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS),
+			      "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD))
+		"2:\n"
+	       _ASM_EXTABLE_UA(1b, 2b)
+	       : "+&c" (size), "+&D" (addr), ASM_CALL_CONSTRAINT
+	       : "a" (0)
+		/* rep_good clobbers %rdx */
+	       : "rdx");
+
+	clac();
+
+	return size;
+}
+
+static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
+{
+	if (access_ok(to, n))
+		return __clear_user(to, n);
+	return n;
+}
 #endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index fe59b8ac4fcc..6dd6c7fd1eb8 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 #include <linux/linkage.h>
+#include <asm/asm.h>
 #include <asm/export.h>
 
 /*
@@ -50,3 +51,144 @@ SYM_FUNC_START(clear_page_erms)
 	RET
 SYM_FUNC_END(clear_page_erms)
 EXPORT_SYMBOL_GPL(clear_page_erms)
+
+/*
+ * Default clear user-space.
+ * Input:
+ * rdi destination
+ * rcx count
+ *
+ * Output:
+ * rcx: uncleared bytes or 0 if successful.
+ */
+SYM_FUNC_START(clear_user_original)
+	/*
+	 * Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
+	 * i.e., no need for a 'q' suffix and thus a REX prefix.
+	 */
+	mov %ecx,%eax
+	shr $3,%rcx
+	jz .Lrest_bytes
+
+	# do the qwords first
+	.p2align 4
+.Lqwords:
+	movq $0,(%rdi)
+	lea 8(%rdi),%rdi
+	dec %rcx
+	jnz .Lqwords
+
+.Lrest_bytes:
+	and $7,  %eax
+	jz .Lexit
+
+	# now do the rest bytes
+.Lbytes:
+	movb $0,(%rdi)
+	inc %rdi
+	dec %eax
+	jnz .Lbytes
+
+.Lexit:
+	/*
+	 * %rax still needs to be cleared in the exception case because this function is called
+	 * from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
+	 * in case it might reuse it somewhere.
+	 */
+        xor %eax,%eax
+        RET
+
+.Lqwords_exception:
+        # convert remaining qwords back into bytes to return to caller
+        shl $3, %rcx
+        and $7, %eax
+        add %rax,%rcx
+        jmp .Lexit
+
+.Lbytes_exception:
+        mov %eax,%ecx
+        jmp .Lexit
+
+        _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
+        _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
+SYM_FUNC_END(clear_user_original)
+EXPORT_SYMBOL(clear_user_original)
+
+/*
+ * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
+ * present.
+ * Input:
+ * rdi destination
+ * rcx count
+ *
+ * Output:
+ * rcx: uncleared bytes or 0 if successful.
+ */
+SYM_FUNC_START(clear_user_rep_good)
+	# call the original thing for less than a cacheline
+	cmp $64, %rcx
+	ja  .Lprep
+	call clear_user_original
+	RET
+
+.Lprep:
+	# copy lower 32-bits for rest bytes
+	mov %ecx, %edx
+	shr $3, %rcx
+	jz .Lrep_good_rest_bytes
+
+.Lrep_good_qwords:
+	rep stosq
+
+.Lrep_good_rest_bytes:
+	and $7, %edx
+	jz .Lrep_good_exit
+
+.Lrep_good_bytes:
+	mov %edx, %ecx
+	rep stosb
+
+.Lrep_good_exit:
+	# see .Lexit comment above
+	xor %eax, %eax
+	RET
+
+.Lrep_good_qwords_exception:
+	# convert remaining qwords back into bytes to return to caller
+	shl $3, %rcx
+	and $7, %edx
+	add %rdx, %rcx
+	jmp .Lrep_good_exit
+
+	_ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception)
+	_ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit)
+SYM_FUNC_END(clear_user_rep_good)
+EXPORT_SYMBOL(clear_user_rep_good)
+
+/*
+ * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present.
+ * Input:
+ * rdi destination
+ * rcx count
+ *
+ * Output:
+ * rcx: uncleared bytes or 0 if successful.
+ *
+ */
+SYM_FUNC_START(clear_user_erms)
+	# call the original thing for less than a cacheline
+	cmp $64, %rcx
+	ja  .Lerms_bytes
+	call clear_user_original
+	RET
+
+.Lerms_bytes:
+	rep stosb
+
+.Lerms_exit:
+	xorl %eax,%eax
+	RET
+
+	_ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit)
+SYM_FUNC_END(clear_user_erms)
+EXPORT_SYMBOL(clear_user_erms)
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0ae6cf804197..6c1f8ac5e721 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -14,46 +14,6 @@
  * Zero Userspace
  */
 
-unsigned long __clear_user(void __user *addr, unsigned long size)
-{
-	long __d0;
-	might_fault();
-	/* no memory constraint because it doesn't change any memory gcc knows
-	   about */
-	stac();
-	asm volatile(
-		"	testq  %[size8],%[size8]\n"
-		"	jz     4f\n"
-		"	.align 16\n"
-		"0:	movq $0,(%[dst])\n"
-		"	addq   $8,%[dst]\n"
-		"	decl %%ecx ; jnz   0b\n"
-		"4:	movq  %[size1],%%rcx\n"
-		"	testl %%ecx,%%ecx\n"
-		"	jz     2f\n"
-		"1:	movb   $0,(%[dst])\n"
-		"	incq   %[dst]\n"
-		"	decl %%ecx ; jnz  1b\n"
-		"2:\n"
-
-		_ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1])
-		_ASM_EXTABLE_UA(1b, 2b)
-
-		: [size8] "=&c"(size), [dst] "=&D" (__d0)
-		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
-	clac();
-	return size;
-}
-EXPORT_SYMBOL(__clear_user);
-
-unsigned long clear_user(void __user *to, unsigned long n)
-{
-	if (access_ok(to, n))
-		return __clear_user(to, n);
-	return n;
-}
-EXPORT_SYMBOL(clear_user);
-
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 /**
  * clean_cache_range - write back a cache range with CLWB
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index ca5b74603008..e460aa004b88 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1020,6 +1020,9 @@ static const char *uaccess_safe_builtin[] = {
 	"copy_mc_fragile_handle_tail",
 	"copy_mc_enhanced_fast_string",
 	"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
+	"clear_user_erms",
+	"clear_user_rep_good",
+	"clear_user_original",
 	NULL
 };
 
-- 
2.35.1


-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette


  parent reply	other threads:[~2022-05-24 12:40 UTC|newest]

Thread overview: 48+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-15  2:12 incoming Andrew Morton
2022-04-15  2:13 ` [patch 01/14] MAINTAINERS: Broadcom internal lists aren't maintainers Andrew Morton
2022-04-15  2:13 ` [patch 02/14] tmpfs: fix regressions from wider use of ZERO_PAGE Andrew Morton
2022-04-15 22:10   ` Linus Torvalds
2022-04-15 22:21     ` Matthew Wilcox
2022-04-15 22:41     ` Hugh Dickins
     [not found]     ` <Ylpj9of+CP4ipDtm@zn.tnic>
2022-04-16 14:07       ` Mark Hemment
     [not found]         ` <Ylr8rR+LHQ1uGL47@zn.tnic>
2022-04-16 17:42           ` Linus Torvalds
     [not found]             ` <Ylsx5PwyUPOainHa@zn.tnic>
     [not found]               ` <YlxtTNFP58TcUHZQ@zn.tnic>
2022-04-17 20:56                 ` Linus Torvalds
     [not found]                   ` <Yl06PWVgeZplboXV@zn.tnic>
2022-04-18 17:10                     ` Linus Torvalds
     [not found]                       ` <Yl5+DCfQmG5C3BHf@zn.tnic>
2022-04-19 16:41                         ` Linus Torvalds
     [not found]                           ` <Yl7146AZDgfLviVv@zn.tnic>
     [not found]                             ` <YmFy8DEqvX4FlnuB@zn.tnic>
     [not found]                               ` <CAHk-=wgf2C9nFiC+3UFG4k7XVTQq5aV6fasSYuT_nQeo_Yew6A@mail.gmail.com>
2022-04-21 17:22                                 ` Linus Torvalds
     [not found]                                   ` <YmWm5AXdwgwu57KZ@zn.tnic>
2022-04-24 19:54                                     ` Linus Torvalds
2022-04-24 20:24                                       ` Linus Torvalds
     [not found]                                       ` <YmiK7Bos+zLAvL0t@zn.tnic>
2022-04-27  1:29                                         ` Linus Torvalds
     [not found]                                           ` <YmkdxaKdc2w/3I7o@zn.tnic>
2022-04-27 16:00                                             ` Linus Torvalds
     [not found]                                               ` <YnLMSWbz6BNfsBME@zn.tnic>
2022-05-04 19:22                                                 ` Linus Torvalds
     [not found]                                                   ` <YnLfl6lupN2nq7+t@zn.tnic>
2022-05-04 20:40                                                     ` Linus Torvalds
     [not found]                                                       ` <YnLplKy0Y66SsvQw@zn.tnic>
2022-05-04 21:09                                                         ` Linus Torvalds
     [not found]                                                           ` <Ynow8F3G8Kl6V3gu@zn.tnic>
2022-05-10 17:17                                                             ` clear_user (was: [patch 02/14] tmpfs: fix regressions from wider use of ZERO_PAGE) Linus Torvalds
2022-05-10 17:28                                                             ` Linus Torvalds
     [not found]                                                               ` <YnqqhmYv75p+xl73@zn.tnic>
     [not found]                                                                 ` <Ynq1nVpu1xCpjnXm@zn.tnic>
2022-05-24 12:32                                                                   ` Borislav Petkov [this message]
2022-05-24 16:51                                                                     ` [PATCH] x86/clear_user: Make it faster Linus Torvalds
2022-05-24 17:30                                                                       ` Borislav Petkov
2022-05-25 12:11                                                                     ` Mark Hemment
2022-05-27 11:28                                                                       ` Borislav Petkov
2022-05-27 11:10                                                                     ` Ingo Molnar
2022-06-22 14:21                                                                     ` Borislav Petkov
2022-06-22 15:06                                                                       ` Linus Torvalds
2022-06-22 20:14                                                                         ` Borislav Petkov
2022-06-22 21:07                                                                           ` Linus Torvalds
2022-06-23  9:41                                                                             ` Borislav Petkov
2022-07-05 17:01                                                                               ` [PATCH -final] " Borislav Petkov
2022-07-06  9:24                                                                                 ` Alexey Dobriyan
2022-07-11 10:33                                                                                   ` Borislav Petkov
2022-07-12 12:32                                                                                     ` Alexey Dobriyan
2022-08-06 12:49                                                                                       ` Borislav Petkov
2022-04-15  2:13 ` [patch 03/14] mm/secretmem: fix panic when growing a memfd_secret Andrew Morton
2022-04-15  2:13 ` [patch 04/14] irq_work: use kasan_record_aux_stack_noalloc() record callstack Andrew Morton
2022-04-15  2:13 ` [patch 05/14] kasan: fix hw tags enablement when KUNIT tests are disabled Andrew Morton
2022-04-15  2:13 ` [patch 06/14] mm, kfence: support kmem_dump_obj() for KFENCE objects Andrew Morton
2022-04-15  2:13 ` [patch 08/14] mm: fix unexpected zeroed page mapping with zram swap Andrew Morton
2022-04-15  2:13 ` [patch 09/14] mm: compaction: fix compiler warning when CONFIG_COMPACTION=n Andrew Morton
2022-04-15  2:13 ` [patch 10/14] hugetlb: do not demote poisoned hugetlb pages Andrew Morton
2022-04-15  2:13 ` [patch 11/14] revert "fs/binfmt_elf: fix PT_LOAD p_align values for loaders" Andrew Morton
2022-04-15  2:13 ` [patch 12/14] revert "fs/binfmt_elf: use PT_LOAD p_align values for static PIE" Andrew Morton
2022-04-15  2:14 ` [patch 13/14] mm/vmalloc: fix spinning drain_vmap_work after reading from /proc/vmcore Andrew Morton
2022-04-15  2:14 ` [patch 14/14] mm: kmemleak: take a full lowmem check in kmemleak_*_phys() Andrew Morton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YozQZMyQ0NDdD8cH@zn.tnic \
    --to=bp@alien8.de \
    --cc=akpm@linux-foundation.org \
    --cc=chuck.lever@oracle.com \
    --cc=djwong@kernel.org \
    --cc=hch@lst.de \
    --cc=hughd@google.com \
    --cc=lczerner@redhat.com \
    --cc=linux-mm@kvack.org \
    --cc=markhemm@googlemail.com \
    --cc=mgorman@suse.de \
    --cc=mm-commits@vger.kernel.org \
    --cc=mpatocka@redhat.com \
    --cc=patches@lists.linux.dev \
    --cc=patrice.chotard@foss.st.com \
    --cc=peterz@infradead.org \
    --cc=torvalds@linux-foundation.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox