* [PATCH 1/3] mm: avoid extra mem_alloc_profiling_enabled() checks
@ 2025-01-26 7:02 Suren Baghdasaryan
2025-01-26 7:02 ` [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator Suren Baghdasaryan
2025-01-26 7:02 ` [PATCH 3/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in page allocator Suren Baghdasaryan
0 siblings, 2 replies; 13+ messages in thread
From: Suren Baghdasaryan @ 2025-01-26 7:02 UTC (permalink / raw)
To: akpm
Cc: kent.overstreet, vbabka, yuzhao, minchan, shakeel.butt,
souravpanda, pasha.tatashin, 00107082, quic_zhenhuah, surenb,
linux-mm, linux-kernel
Refactor code to avoid extra mem_alloc_profiling_enabled() checks inside
the pgalloc_tag_get() function, which is often called after that check has
already been done.
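For reference, the resulting caller-side pattern looks like this (condensed
from the hunks below, using the same names; not additional code):

/*
 * The enabled-check is done once in the caller; __pgalloc_tag_get() no
 * longer repeats it internally.
 */
static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
{
	struct alloc_tag *tag;

	if (!mem_alloc_profiling_enabled())	/* single check, in the caller */
		return;

	tag = __pgalloc_tag_get(page);	/* helper assumes the check was done */
	if (tag)
		this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
}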
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/pgalloc_tag.h | 35 +++++++++++++++++++----------------
lib/alloc_tag.c | 6 +++---
mm/page_alloc.c | 3 +--
3 files changed, 23 insertions(+), 21 deletions(-)
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 3469c4b20105..4a82b6b4820e 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -205,28 +205,32 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
}
}
-static inline struct alloc_tag *pgalloc_tag_get(struct page *page)
+/* Should be called only if mem_alloc_profiling_enabled() */
+static inline struct alloc_tag *__pgalloc_tag_get(struct page *page)
{
struct alloc_tag *tag = NULL;
-
- if (mem_alloc_profiling_enabled()) {
- union pgtag_ref_handle handle;
- union codetag_ref ref;
-
- if (get_page_tag_ref(page, &ref, &handle)) {
- alloc_tag_sub_check(&ref);
- if (ref.ct)
- tag = ct_to_alloc_tag(ref.ct);
- put_page_tag_ref(handle);
- }
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_sub_check(&ref);
+ if (ref.ct)
+ tag = ct_to_alloc_tag(ref.ct);
+ put_page_tag_ref(handle);
}
return tag;
}
-static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
+static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
{
- if (mem_alloc_profiling_enabled() && tag)
+ struct alloc_tag *tag;
+
+ if (!mem_alloc_profiling_enabled())
+ return;
+
+ tag = __pgalloc_tag_get(page);
+ if (tag)
this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
}
@@ -241,8 +245,7 @@ static inline void clear_page_tag_ref(struct page *page) {}
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
unsigned int nr) {}
static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
-static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; }
-static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {}
static inline void alloc_tag_sec_init(void) {}
static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {}
static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {}
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 19b45617bdcf..1d893e313614 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -174,7 +174,7 @@ void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
if (!mem_alloc_profiling_enabled())
return;
- tag = pgalloc_tag_get(&folio->page);
+ tag = __pgalloc_tag_get(&folio->page);
if (!tag)
return;
@@ -200,10 +200,10 @@ void pgalloc_tag_swap(struct folio *new, struct folio *old)
if (!mem_alloc_profiling_enabled())
return;
- tag_old = pgalloc_tag_get(&old->page);
+ tag_old = __pgalloc_tag_get(&old->page);
if (!tag_old)
return;
- tag_new = pgalloc_tag_get(&new->page);
+ tag_new = __pgalloc_tag_get(&new->page);
if (!tag_new)
return;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e469c7ef9a4..55ed2f245f80 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4832,12 +4832,11 @@ void __free_pages(struct page *page, unsigned int order)
{
/* get PageHead before we drop reference */
int head = PageHead(page);
- struct alloc_tag *tag = pgalloc_tag_get(page);
if (put_page_testzero(page))
free_frozen_pages(page, order);
else if (!head) {
- pgalloc_tag_sub_pages(tag, (1 << order) - 1);
+ pgalloc_tag_sub_pages(page, (1 << order) - 1);
while (order-- > 0)
free_frozen_pages(page + (1 << order), order);
}
base-commit: a227a25fe3fdd08f743a20a6bf6367a47b20c125
--
2.48.1.262.g85cc9f2d1e-goog
* [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator
2025-01-26 7:02 [PATCH 1/3] mm: avoid extra mem_alloc_profiling_enabled() checks Suren Baghdasaryan
@ 2025-01-26 7:02 ` Suren Baghdasaryan
2025-01-26 16:47 ` Vlastimil Babka
2025-01-26 7:02 ` [PATCH 3/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in page allocator Suren Baghdasaryan
1 sibling, 1 reply; 13+ messages in thread
From: Suren Baghdasaryan @ 2025-01-26 7:02 UTC (permalink / raw)
To: akpm
Cc: kent.overstreet, vbabka, yuzhao, minchan, shakeel.butt,
souravpanda, pasha.tatashin, 00107082, quic_zhenhuah, surenb,
linux-mm, linux-kernel
When a sizable code section is protected by a disabled static key, that
code still occupies the instruction cache even though it is never executed,
increasing cache misses. This can be remedied by moving such code into a
separate uninlined function. The improvement, however, comes at the expense
of the configuration in which this static key is enabled, since there is now
an additional function call.
The default state of the mem_alloc_profiling_key is controlled by
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. Apply this optimization
only if CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, improving the
performance of the default configuration.
When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y the functions
are inlined and performance does not change.
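In outline, the pattern this patch applies is the following (condensed from
the hunks below):

#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
#define inline_if_mem_alloc_prof inline
#else
#define inline_if_mem_alloc_prof noinline
#endif

/* Bulk of the hook is out of line when the key defaults to off... */
static inline_if_mem_alloc_prof void
__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
	/* ...allocation profiling bookkeeping, as in the hunks below... */
}

/* ...so the inlined fast path reduces to the static-key test and a call. */
static inline void
alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
	if (need_slab_obj_ext())
		__alloc_tagging_slab_alloc_hook(s, object, flags);
}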
On a Pixel6 phone, slab allocation profiling overhead measured with
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and profiling disabled:
baseline modified
Big 3.31% 0.17%
Medium 3.79% 0.57%
Little 6.68% 1.28%
When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and memory allocation
profiling gets enabled, the difference in performance before and after
this change stays within noise levels.
On x86 this patch does not make a noticeable difference because the overhead
with mem_alloc_profiling_key disabled is much lower (under 1%) to start
with, so any improvement is less visible and hard to distinguish from the
noise.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/alloc_tag.h | 6 +++++
mm/slub.c | 46 ++++++++++++++++++++++++---------------
2 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index a946e0203e6d..c5de2a0c1780 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -116,6 +116,12 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
mem_alloc_profiling_key);
+#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+#define inline_if_mem_alloc_prof inline
+#else
+#define inline_if_mem_alloc_prof noinline
+#endif
+
static inline bool mem_alloc_profiling_enabled(void)
{
return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
diff --git a/mm/slub.c b/mm/slub.c
index 996691c137eb..3107d43dfddc 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2000,7 +2000,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
return 0;
}
-static inline void free_slab_obj_exts(struct slab *slab)
+static inline_if_mem_alloc_prof void free_slab_obj_exts(struct slab *slab)
{
struct slabobj_ext *obj_exts;
@@ -2077,33 +2077,35 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
return slab_obj_exts(slab) + obj_to_index(s, slab, p);
}
-static inline void
-alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
+static inline_if_mem_alloc_prof void
+__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
- if (need_slab_obj_ext()) {
- struct slabobj_ext *obj_exts;
+ struct slabobj_ext *obj_exts;
- obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
- /*
- * Currently obj_exts is used only for allocation profiling.
- * If other users appear then mem_alloc_profiling_enabled()
- * check should be added before alloc_tag_add().
- */
- if (likely(obj_exts))
- alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
- }
+ obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
+ /*
+ * Currently obj_exts is used only for allocation profiling.
+ * If other users appear then mem_alloc_profiling_enabled()
+ * check should be added before alloc_tag_add().
+ */
+ if (likely(obj_exts))
+ alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
}
static inline void
-alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
+alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
+{
+ if (need_slab_obj_ext())
+ __alloc_tagging_slab_alloc_hook(s, object, flags);
+}
+
+static inline_if_mem_alloc_prof void
+__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
int objects)
{
struct slabobj_ext *obj_exts;
int i;
- if (!mem_alloc_profiling_enabled())
- return;
-
/* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
return;
@@ -2119,6 +2121,14 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
}
}
+static inline void
+alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
+ int objects)
+{
+ if (mem_alloc_profiling_enabled())
+ __alloc_tagging_slab_free_hook(s, slab, p, objects);
+}
+
#else /* CONFIG_MEM_ALLOC_PROFILING */
static inline void
--
2.48.1.262.g85cc9f2d1e-goog
* [PATCH 3/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in page allocator
2025-01-26 7:02 [PATCH 1/3] mm: avoid extra mem_alloc_profiling_enabled() checks Suren Baghdasaryan
2025-01-26 7:02 ` [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator Suren Baghdasaryan
@ 2025-01-26 7:02 ` Suren Baghdasaryan
1 sibling, 0 replies; 13+ messages in thread
From: Suren Baghdasaryan @ 2025-01-26 7:02 UTC (permalink / raw)
To: akpm
Cc: kent.overstreet, vbabka, yuzhao, minchan, shakeel.butt,
souravpanda, pasha.tatashin, 00107082, quic_zhenhuah, surenb,
linux-mm, linux-kernel
When a sizable code section is protected by a disabled static key, that
code still occupies the instruction cache even though it is never executed,
increasing cache misses. This can be remedied by moving such code into a
separate uninlined function. The improvement, however, comes at the expense
of the configuration in which this static key is enabled, since there is now
an additional function call.
The default state of the mem_alloc_profiling_key is controlled by
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. Apply this optimization
only if CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, improving the
performance of the default configuration.
When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y the functions
are inlined and performance does not change.
On a Pixel6 phone, page allocation profiling overhead measured with
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and profiling disabled:
baseline modified
Big 4.93% 1.53%
Medium 4.39% 1.41%
Little 1.02% 0.36%
When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and memory allocation
profiling gets enabled, the difference in performance before and after
this change stays within noise levels.
On x86 this patch does not make a noticeable difference because the overhead
with mem_alloc_profiling_key disabled is much lower (under 1%) to start
with, so any improvement is less visible and hard to distinguish from the
noise.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
include/linux/pgalloc_tag.h | 60 +++-------------------------
mm/page_alloc.c | 78 +++++++++++++++++++++++++++++++++++++
2 files changed, 83 insertions(+), 55 deletions(-)
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 4a82b6b4820e..c74077977830 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -162,47 +162,13 @@ static inline void update_page_tag_ref(union pgtag_ref_handle handle, union code
}
}
-static inline void clear_page_tag_ref(struct page *page)
-{
- if (mem_alloc_profiling_enabled()) {
- union pgtag_ref_handle handle;
- union codetag_ref ref;
-
- if (get_page_tag_ref(page, &ref, &handle)) {
- set_codetag_empty(&ref);
- update_page_tag_ref(handle, &ref);
- put_page_tag_ref(handle);
- }
- }
-}
-
-static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
- unsigned int nr)
-{
- if (mem_alloc_profiling_enabled()) {
- union pgtag_ref_handle handle;
- union codetag_ref ref;
-
- if (get_page_tag_ref(page, &ref, &handle)) {
- alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
- update_page_tag_ref(handle, &ref);
- put_page_tag_ref(handle);
- }
- }
-}
+/* Should be called only if mem_alloc_profiling_enabled() */
+void __clear_page_tag_ref(struct page *page);
-static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+static inline void clear_page_tag_ref(struct page *page)
{
- if (mem_alloc_profiling_enabled()) {
- union pgtag_ref_handle handle;
- union codetag_ref ref;
-
- if (get_page_tag_ref(page, &ref, &handle)) {
- alloc_tag_sub(&ref, PAGE_SIZE * nr);
- update_page_tag_ref(handle, &ref);
- put_page_tag_ref(handle);
- }
- }
+ if (mem_alloc_profiling_enabled())
+ __clear_page_tag_ref(page);
}
/* Should be called only if mem_alloc_profiling_enabled() */
@@ -222,18 +188,6 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page)
return tag;
}
-static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
-{
- struct alloc_tag *tag;
-
- if (!mem_alloc_profiling_enabled())
- return;
-
- tag = __pgalloc_tag_get(page);
- if (tag)
- this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
-}
-
void pgalloc_tag_split(struct folio *folio, int old_order, int new_order);
void pgalloc_tag_swap(struct folio *new, struct folio *old);
@@ -242,10 +196,6 @@ void __init alloc_tag_sec_init(void);
#else /* CONFIG_MEM_ALLOC_PROFILING */
static inline void clear_page_tag_ref(struct page *page) {}
-static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
- unsigned int nr) {}
-static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
-static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {}
static inline void alloc_tag_sec_init(void) {}
static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {}
static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 55ed2f245f80..67e205286dbf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1041,6 +1041,84 @@ static void kernel_init_pages(struct page *page, int numpages)
kasan_enable_current();
}
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+void __clear_page_tag_ref(struct page *page)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ set_codetag_empty(&ref);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static inline_if_mem_alloc_prof
+void __pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled())
+ __pgalloc_tag_add(page, task, nr);
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static inline_if_mem_alloc_prof
+void __pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ union pgtag_ref_handle handle;
+ union codetag_ref ref;
+
+ if (get_page_tag_ref(page, &ref, &handle)) {
+ alloc_tag_sub(&ref, PAGE_SIZE * nr);
+ update_page_tag_ref(handle, &ref);
+ put_page_tag_ref(handle);
+ }
+}
+
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+ if (mem_alloc_profiling_enabled())
+ __pgalloc_tag_sub(page, nr);
+}
+
+static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
+{
+ struct alloc_tag *tag;
+
+ if (!mem_alloc_profiling_enabled())
+ return;
+
+ tag = __pgalloc_tag_get(page);
+ if (tag)
+ this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING */
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+ unsigned int nr) {}
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
__always_inline bool free_pages_prepare(struct page *page,
unsigned int order)
{
--
2.48.1.262.g85cc9f2d1e-goog
* Re: [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator
2025-01-26 7:02 ` [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator Suren Baghdasaryan
@ 2025-01-26 16:47 ` Vlastimil Babka
2025-01-27 19:38 ` Suren Baghdasaryan
2025-01-28 22:49 ` Peter Zijlstra
0 siblings, 2 replies; 13+ messages in thread
From: Vlastimil Babka @ 2025-01-26 16:47 UTC (permalink / raw)
To: Suren Baghdasaryan, akpm, Peter Zijlstra
Cc: kent.overstreet, yuzhao, minchan, shakeel.butt, souravpanda,
pasha.tatashin, 00107082, quic_zhenhuah, linux-mm, linux-kernel
On 1/26/25 08:02, Suren Baghdasaryan wrote:
> When a sizable code section is protected by a disabled static key, that
> code gets into the instruction cache even though it's not executed and
> consumes the cache, increasing cache misses. This can be remedied by
> moving such code into a separate uninlined function. The improvement
Weird, I thought the static_branch_likely/unlikely/maybe was already
handling this by the unlikely case being a jump to a block away from the
fast-path stream of instructions, thus making it less likely to get cached.
AFAIU even plain likely()/unlikely() should do this, along with branch
prediction hints.
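Roughly the mechanism I mean, as a sketch (my_key and do_slow_path() are
placeholders here, not kernel code):

#include <linux/jump_label.h>

extern void do_slow_path(void *obj);	/* placeholder for the guarded work */

DEFINE_STATIC_KEY_FALSE(my_key);	/* disabled by default */

static inline void hook(void *obj)
{
	/*
	 * With the key disabled this compiles to a NOP at the call site,
	 * and the guarded block is expected to be laid out away from the
	 * fast-path instruction stream.
	 */
	if (static_branch_unlikely(&my_key))
		do_slow_path(obj);
}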
> however comes at the expense of the configuration when this static key
> gets enabled since there is now an additional function call.
> The default state of the mem_alloc_profiling_key is controlled by
> CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. Apply this optimization
> only if CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, improving the
> performance of the default configuration.
> When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y the functions
> are inlined and performance does not change.
>
> On a Pixel6 phone, slab allocation profiling overhead measured with
> CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and profiling disabled:
>
> baseline modified
> Big 3.31% 0.17%
> Medium 3.79% 0.57%
> Little 6.68% 1.28%
What does big/medium/little mean here? But indeed not nice overhead for
disabled static key.
> When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and memory allocation
> profiling gets enabled, the difference in performance before and after
> this change stays within noise levels.
>
> On x86 this patch does not make noticeable difference because the overhead
> with mem_alloc_profiling_key disabled is much lower (under 1%) to start
> with, so any improvement is less visible and hard to distinguish from the
> noise.
That would be in line with my understanding above. Does the arm64 compiler
not do this as well as x86 (disassembling could perhaps show it), or does the
Pixel6 CPU somehow cache these out-of-line blocks more aggressively, so that
only a function call stops it?
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Kinda sad that despite the static key we have to control a lot by the
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT in addition.
> ---
> include/linux/alloc_tag.h | 6 +++++
> mm/slub.c | 46 ++++++++++++++++++++++++---------------
> 2 files changed, 34 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> index a946e0203e6d..c5de2a0c1780 100644
> --- a/include/linux/alloc_tag.h
> +++ b/include/linux/alloc_tag.h
> @@ -116,6 +116,12 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
> DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> mem_alloc_profiling_key);
>
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> +#define inline_if_mem_alloc_prof inline
> +#else
> +#define inline_if_mem_alloc_prof noinline
> +#endif
> +
> static inline bool mem_alloc_profiling_enabled(void)
> {
> return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> diff --git a/mm/slub.c b/mm/slub.c
> index 996691c137eb..3107d43dfddc 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -2000,7 +2000,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
> return 0;
> }
>
> -static inline void free_slab_obj_exts(struct slab *slab)
> +static inline_if_mem_alloc_prof void free_slab_obj_exts(struct slab *slab)
> {
> struct slabobj_ext *obj_exts;
>
> @@ -2077,33 +2077,35 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
> return slab_obj_exts(slab) + obj_to_index(s, slab, p);
> }
>
> -static inline void
> -alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
> +static inline_if_mem_alloc_prof void
> +__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
> {
> - if (need_slab_obj_ext()) {
> - struct slabobj_ext *obj_exts;
> + struct slabobj_ext *obj_exts;
>
> - obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
> - /*
> - * Currently obj_exts is used only for allocation profiling.
> - * If other users appear then mem_alloc_profiling_enabled()
> - * check should be added before alloc_tag_add().
> - */
> - if (likely(obj_exts))
> - alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
> - }
> + obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
> + /*
> + * Currently obj_exts is used only for allocation profiling.
> + * If other users appear then mem_alloc_profiling_enabled()
> + * check should be added before alloc_tag_add().
> + */
> + if (likely(obj_exts))
> + alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
> }
>
> static inline void
> -alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> +alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
> +{
> + if (need_slab_obj_ext())
> + __alloc_tagging_slab_alloc_hook(s, object, flags);
> +}
> +
> +static inline_if_mem_alloc_prof void
> +__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> int objects)
> {
> struct slabobj_ext *obj_exts;
> int i;
>
> - if (!mem_alloc_profiling_enabled())
> - return;
> -
> /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
> if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
> return;
> @@ -2119,6 +2121,14 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> }
> }
>
> +static inline void
> +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> + int objects)
> +{
> + if (mem_alloc_profiling_enabled())
> + __alloc_tagging_slab_free_hook(s, slab, p, objects);
> +}
> +
> #else /* CONFIG_MEM_ALLOC_PROFILING */
>
> static inline void
* Re: [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator
2025-01-26 16:47 ` Vlastimil Babka
@ 2025-01-27 19:38 ` Suren Baghdasaryan
2025-01-28 19:35 ` Steven Rostedt
2025-01-28 22:49 ` Peter Zijlstra
1 sibling, 1 reply; 13+ messages in thread
From: Suren Baghdasaryan @ 2025-01-27 19:38 UTC (permalink / raw)
To: Vlastimil Babka, Steven Rostedt
Cc: akpm, Peter Zijlstra, kent.overstreet, yuzhao, minchan,
shakeel.butt, souravpanda, pasha.tatashin, 00107082,
quic_zhenhuah, linux-mm, linux-kernel
On Sun, Jan 26, 2025 at 8:47 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 1/26/25 08:02, Suren Baghdasaryan wrote:
> > When a sizable code section is protected by a disabled static key, that
> > code gets into the instruction cache even though it's not executed and
> > consumes the cache, increasing cache misses. This can be remedied by
> > moving such code into a separate uninlined function. The improvement
Sorry, I missed adding Steven Rostedt into the CC list since his
advice was instrumental in finding the way to optimize the static key
performance in this patch. Added now.
>
> Weird, I thought the static_branch_likely/unlikely/maybe was already
> handling this by the unlikely case being a jump to a block away from the
> fast-path stream of instructions, thus making it less likely to get cached.
> AFAIU even plain likely()/unlikely() should do this, along with branch
> prediction hints.
This was indeed an unexpected overhead when I measured it on Android.
Cache pollution was my understanding of the cause for this high
overhead after Steven told me to try uninlining the protected code. He
has done something similar in the tracing subsystem. But maybe I
misunderstood the real reason. Steven, could you please verify if my
understanding of the high overhead cause is correct here? Maybe there
is something else at play that I missed?
>
> > however comes at the expense of the configuration when this static key
> > gets enabled since there is now an additional function call.
> > The default state of the mem_alloc_profiling_key is controlled by
> > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. Apply this optimization
> > only if CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, improving the
> > performance of the default configuration.
> > When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y the functions
> > are inlined and performance does not change.
> >
> > On a Pixel6 phone, slab allocation profiling overhead measured with
> > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and profiling disabled:
> >
> > baseline modified
> > Big 3.31% 0.17%
> > Medium 3.79% 0.57%
> > Little 6.68% 1.28%
>
> What does big/medium/little mean here? But indeed not nice overhead for
> disabled static key.
Big/Medium/Little refer to the CPU core sizes on my ARM64-based Android phone.
>
> > When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n and memory allocation
> > profiling gets enabled, the difference in performance before and after
> > this change stays within noise levels.
> >
> > On x86 this patch does not make noticeable difference because the overhead
> > with mem_alloc_profiling_key disabled is much lower (under 1%) to start
> > with, so any improvement is less visible and hard to distinguish from the
> > noise.
>
> That would be in line with my understanding above. Does the arm64 compiler
> not do it as well as x86 (could be maybe found out by disassembling) or the
> Pixel6 cpu somhow caches these out of line blocks more aggressively and only
> a function call stops it?
I'll disassemble the code and will see what it looks like.
>
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
>
> Kinda sad that despite the static key we have to control a lot by the
> CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT in addition.
I agree. If there is a better way to fix this regression I'm open to
changes. Let's wait for Steven to confirm my understanding before
proceeding.
Thanks,
Suren.
>
> > ---
> > include/linux/alloc_tag.h | 6 +++++
> > mm/slub.c | 46 ++++++++++++++++++++++++---------------
> > 2 files changed, 34 insertions(+), 18 deletions(-)
> >
> > diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> > index a946e0203e6d..c5de2a0c1780 100644
> > --- a/include/linux/alloc_tag.h
> > +++ b/include/linux/alloc_tag.h
> > @@ -116,6 +116,12 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
> > DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> > mem_alloc_profiling_key);
> >
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> > +#define inline_if_mem_alloc_prof inline
> > +#else
> > +#define inline_if_mem_alloc_prof noinline
> > +#endif
> > +
> > static inline bool mem_alloc_profiling_enabled(void)
> > {
> > return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> > diff --git a/mm/slub.c b/mm/slub.c
> > index 996691c137eb..3107d43dfddc 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -2000,7 +2000,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
> > return 0;
> > }
> >
> > -static inline void free_slab_obj_exts(struct slab *slab)
> > +static inline_if_mem_alloc_prof void free_slab_obj_exts(struct slab *slab)
> > {
> > struct slabobj_ext *obj_exts;
> >
> > @@ -2077,33 +2077,35 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
> > return slab_obj_exts(slab) + obj_to_index(s, slab, p);
> > }
> >
> > -static inline void
> > -alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
> > +static inline_if_mem_alloc_prof void
> > +__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
> > {
> > - if (need_slab_obj_ext()) {
> > - struct slabobj_ext *obj_exts;
> > + struct slabobj_ext *obj_exts;
> >
> > - obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
> > - /*
> > - * Currently obj_exts is used only for allocation profiling.
> > - * If other users appear then mem_alloc_profiling_enabled()
> > - * check should be added before alloc_tag_add().
> > - */
> > - if (likely(obj_exts))
> > - alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
> > - }
> > + obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
> > + /*
> > + * Currently obj_exts is used only for allocation profiling.
> > + * If other users appear then mem_alloc_profiling_enabled()
> > + * check should be added before alloc_tag_add().
> > + */
> > + if (likely(obj_exts))
> > + alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
> > }
> >
> > static inline void
> > -alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> > +alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
> > +{
> > + if (need_slab_obj_ext())
> > + __alloc_tagging_slab_alloc_hook(s, object, flags);
> > +}
> > +
> > +static inline_if_mem_alloc_prof void
> > +__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> > int objects)
> > {
> > struct slabobj_ext *obj_exts;
> > int i;
> >
> > - if (!mem_alloc_profiling_enabled())
> > - return;
> > -
> > /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
> > if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
> > return;
> > @@ -2119,6 +2121,14 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> > }
> > }
> >
> > +static inline void
> > +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
> > + int objects)
> > +{
> > + if (mem_alloc_profiling_enabled())
> > + __alloc_tagging_slab_free_hook(s, slab, p, objects);
> > +}
> > +
> > #else /* CONFIG_MEM_ALLOC_PROFILING */
> >
> > static inline void
>
* Re: [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator
2025-01-27 19:38 ` Suren Baghdasaryan
@ 2025-01-28 19:35 ` Steven Rostedt
2025-01-28 23:43 ` Suren Baghdasaryan
0 siblings, 1 reply; 13+ messages in thread
From: Steven Rostedt @ 2025-01-28 19:35 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: Vlastimil Babka, akpm, Peter Zijlstra, kent.overstreet, yuzhao,
minchan, shakeel.butt, souravpanda, pasha.tatashin, 00107082,
quic_zhenhuah, linux-mm, linux-kernel
On Mon, 27 Jan 2025 11:38:32 -0800
Suren Baghdasaryan <surenb@google.com> wrote:
> On Sun, Jan 26, 2025 at 8:47 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> >
> > On 1/26/25 08:02, Suren Baghdasaryan wrote:
> > > When a sizable code section is protected by a disabled static key, that
> > > code gets into the instruction cache even though it's not executed and
> > > consumes the cache, increasing cache misses. This can be remedied by
> > > moving such code into a separate uninlined function. The improvement
>
> Sorry, I missed adding Steven Rostedt into the CC list since his
> advice was instrumental in finding the way to optimize the static key
> performance in this patch. Added now.
>
> >
> > Weird, I thought the static_branch_likely/unlikely/maybe was already
> > handling this by the unlikely case being a jump to a block away from the
> > fast-path stream of instructions, thus making it less likely to get cached.
> > AFAIU even plain likely()/unlikely() should do this, along with branch
> > prediction hints.
>
> This was indeed an unexpected overhead when I measured it on Android.
> Cache pollution was my understanding of the cause for this high
> overhead after Steven told me to try uninlining the protected code. He
> has done something similar in the tracing subsystem. But maybe I
> misunderstood the real reason. Steven, could you please verify if my
> understanding of the high overhead cause is correct here? Maybe there
> is something else at play that I missed?
From what I understand, the compiler will only move code to the end
of a function with unlikely(). But the code after the function could
also be in the control flow path. If you have several functions that are
called together, adding code to the unlikely() cases may not help the
speed.
I made an effort to make the tracepoint code call functions instead of
having everything inlined. It actually brought down the size of the text of
the kernel, but looking in the change logs I never posted benchmarks. But
I'm sure making the size of the scheduler text section smaller probably did
help.
> > That would be in line with my understanding above. Does the arm64 compiler
> > not do it as well as x86 (could be maybe found out by disassembling) or the
> > Pixel6 cpu somhow caches these out of line blocks more aggressively and only
> > a function call stops it?
>
> I'll disassemble the code and will see what it looks like.
I think I asked you to do that too ;-)
>
> >
> > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> >
> > Kinda sad that despite the static key we have to control a lot by the
> > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT in addition.
>
> I agree. If there is a better way to fix this regression I'm open to
> changes. Let's wait for Steven to confirm my understanding before
> proceeding.
How slow is it to always do the call instead of inlining?
-- Steve
* Re: [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator
2025-01-26 16:47 ` Vlastimil Babka
2025-01-27 19:38 ` Suren Baghdasaryan
@ 2025-01-28 22:49 ` Peter Zijlstra
1 sibling, 0 replies; 13+ messages in thread
From: Peter Zijlstra @ 2025-01-28 22:49 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Suren Baghdasaryan, akpm, kent.overstreet, yuzhao, minchan,
shakeel.butt, souravpanda, pasha.tatashin, 00107082,
quic_zhenhuah, linux-mm, linux-kernel
On Sun, Jan 26, 2025 at 05:47:08PM +0100, Vlastimil Babka wrote:
> On 1/26/25 08:02, Suren Baghdasaryan wrote:
> > When a sizable code section is protected by a disabled static key, that
> > code gets into the instruction cache even though it's not executed and
> > consumes the cache, increasing cache misses. This can be remedied by
> > moving such code into a separate uninlined function. The improvement
>
> Weird, I thought the static_branch_likely/unlikely/maybe was already
> handling this by the unlikely case being a jump to a block away from the
> fast-path stream of instructions, thus making it less likely to get cached.
> AFAIU even plain likely()/unlikely() should do this, along with branch
> prediction hints.
Very much depends on the compiler :-(
sometimes unlikely just moves it to the end of the function, sometimes
it's moved to .text.unlikely.
Some compilers have label attributes:
l_yes: __attribute__((cold));
but the same compilers utterly ignore it when it's combined with
asm-goto or something -- we could never get it to work reliably.
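(For reference, the bare form of that label attribute outside the asm-goto
case; f() and slow_path() here are placeholders:)

void slow_path(void);	/* placeholder for the rarely executed work */

void f(int cond)
{
	if (cond)
		goto l_yes;
	return;

l_yes: __attribute__((cold));	/* hint that this block is cold */
	slow_path();
}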
It's been a while since I looked at this, so I'm not entirely sure what
the current version of compilers do.
* Re: [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator
2025-01-28 19:35 ` Steven Rostedt
@ 2025-01-28 23:43 ` Suren Baghdasaryan
2025-01-29 0:03 ` Steven Rostedt
2025-01-29 2:54 ` Suren Baghdasaryan
0 siblings, 2 replies; 13+ messages in thread
From: Suren Baghdasaryan @ 2025-01-28 23:43 UTC (permalink / raw)
To: Steven Rostedt
Cc: Vlastimil Babka, akpm, Peter Zijlstra, kent.overstreet, yuzhao,
minchan, shakeel.butt, souravpanda, pasha.tatashin, 00107082,
quic_zhenhuah, linux-mm, linux-kernel
On Tue, Jan 28, 2025 at 11:35 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 27 Jan 2025 11:38:32 -0800
> Suren Baghdasaryan <surenb@google.com> wrote:
>
> > On Sun, Jan 26, 2025 at 8:47 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> > >
> > > On 1/26/25 08:02, Suren Baghdasaryan wrote:
> > > > When a sizable code section is protected by a disabled static key, that
> > > > code gets into the instruction cache even though it's not executed and
> > > > consumes the cache, increasing cache misses. This can be remedied by
> > > > moving such code into a separate uninlined function. The improvement
> >
> > Sorry, I missed adding Steven Rostedt into the CC list since his
> > advice was instrumental in finding the way to optimize the static key
> > performance in this patch. Added now.
> >
> > >
> > > Weird, I thought the static_branch_likely/unlikely/maybe was already
> > > handling this by the unlikely case being a jump to a block away from the
> > > fast-path stream of instructions, thus making it less likely to get cached.
> > > AFAIU even plain likely()/unlikely() should do this, along with branch
> > > prediction hints.
> >
> > This was indeed an unexpected overhead when I measured it on Android.
> > Cache pollution was my understanding of the cause for this high
> > overhead after Steven told me to try uninlining the protected code. He
> > has done something similar in the tracing subsystem. But maybe I
> > misunderstood the real reason. Steven, could you please verify if my
> > understanding of the high overhead cause is correct here? Maybe there
> > is something else at play that I missed?
>
> From what I understand, is that the compiler will only move code to the end
> of a function with the unlikely(). But, the code after the function could
> also be in the control flow path. If you have several functions that are
> called together, by adding code to the unlikely() cases may not help the
> speed.
>
> I made an effort to make the tracepoint code call functions instead of
> having everything inlined. It actually brought down the size of the text of
> the kernel, but looking in the change logs I never posted benchmarks. But
> I'm sure making the size of the scheduler text section smaller probably did
> help.
>
> > > That would be in line with my understanding above. Does the arm64 compiler
> > > not do it as well as x86 (could be maybe found out by disassembling) or the
> > > Pixel6 cpu somhow caches these out of line blocks more aggressively and only
> > > a function call stops it?
> >
> > I'll disassemble the code and will see what it looks like.
>
> I think I asked you to do that too ;-)
Yes you did! And I disassembled almost every one of these functions during
my investigation, but in my infinite wisdom I did not save any of them.
So, now I need to do that again to answer Vlastimil's question. I'll
try to do that today.
>
> >
> > >
> > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > >
> > > Kinda sad that despite the static key we have to control a lot by the
> > > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT in addition.
> >
> > I agree. If there is a better way to fix this regression I'm open to
> > changes. Let's wait for Steven to confirm my understanding before
> > proceeding.
>
> How slow is it to always do the call instead of inlining?
Let's see... The additional overhead if we always call is:
Little core: 2.42%
Middle core: 1.23%
Big core: 0.66%
Not a huge deal because the overhead of memory profiling when enabled
is much higher. So, maybe for simplicity I should indeed always call?
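(That is, drop the config-dependent inlining and keep the body unconditionally
out of line, roughly like this untested sketch:)

static noinline void
__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
	/* ...same body as in this patch... */
}

static inline void
alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
	if (need_slab_obj_ext())
		__alloc_tagging_slab_alloc_hook(s, object, flags);
}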
>
> -- Steve
* Re: [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator
2025-01-28 23:43 ` Suren Baghdasaryan
@ 2025-01-29 0:03 ` Steven Rostedt
2025-01-29 9:50 ` Vlastimil Babka
2025-01-29 2:54 ` Suren Baghdasaryan
1 sibling, 1 reply; 13+ messages in thread
From: Steven Rostedt @ 2025-01-29 0:03 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: Vlastimil Babka, akpm, Peter Zijlstra, kent.overstreet, yuzhao,
minchan, shakeel.butt, souravpanda, pasha.tatashin, 00107082,
quic_zhenhuah, linux-mm, linux-kernel
On Tue, 28 Jan 2025 15:43:13 -0800
Suren Baghdasaryan <surenb@google.com> wrote:
> > How slow is it to always do the call instead of inlining?
>
> Let's see... The additional overhead if we always call is:
>
> Little core: 2.42%
> Middle core: 1.23%
> Big core: 0.66%
>
> Not a huge deal because the overhead of memory profiling when enabled
> is much higher. So, maybe for simplicity I should indeed always call?
That's what I was thinking, unless the other maintainers are OK with this
special logic.
-- Steve
* Re: [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator
2025-01-28 23:43 ` Suren Baghdasaryan
2025-01-29 0:03 ` Steven Rostedt
@ 2025-01-29 2:54 ` Suren Baghdasaryan
2025-01-29 9:38 ` Vlastimil Babka
1 sibling, 1 reply; 13+ messages in thread
From: Suren Baghdasaryan @ 2025-01-29 2:54 UTC (permalink / raw)
To: Steven Rostedt
Cc: Vlastimil Babka, akpm, Peter Zijlstra, kent.overstreet, yuzhao,
minchan, shakeel.butt, souravpanda, pasha.tatashin, 00107082,
quic_zhenhuah, linux-mm, linux-kernel
On Tue, Jan 28, 2025 at 3:43 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Jan 28, 2025 at 11:35 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Mon, 27 Jan 2025 11:38:32 -0800
> > Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > > On Sun, Jan 26, 2025 at 8:47 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> > > >
> > > > On 1/26/25 08:02, Suren Baghdasaryan wrote:
> > > > > When a sizable code section is protected by a disabled static key, that
> > > > > code gets into the instruction cache even though it's not executed and
> > > > > consumes the cache, increasing cache misses. This can be remedied by
> > > > > moving such code into a separate uninlined function. The improvement
> > >
> > > Sorry, I missed adding Steven Rostedt into the CC list since his
> > > advice was instrumental in finding the way to optimize the static key
> > > performance in this patch. Added now.
> > >
> > > >
> > > > Weird, I thought the static_branch_likely/unlikely/maybe was already
> > > > handling this by the unlikely case being a jump to a block away from the
> > > > fast-path stream of instructions, thus making it less likely to get cached.
> > > > AFAIU even plain likely()/unlikely() should do this, along with branch
> > > > prediction hints.
> > >
> > > This was indeed an unexpected overhead when I measured it on Android.
> > > Cache pollution was my understanding of the cause for this high
> > > overhead after Steven told me to try uninlining the protected code. He
> > > has done something similar in the tracing subsystem. But maybe I
> > > misunderstood the real reason. Steven, could you please verify if my
> > > understanding of the high overhead cause is correct here? Maybe there
> > > is something else at play that I missed?
> >
> > From what I understand, is that the compiler will only move code to the end
> > of a function with the unlikely(). But, the code after the function could
> > also be in the control flow path. If you have several functions that are
> > called together, by adding code to the unlikely() cases may not help the
> > speed.
> >
> > I made an effort to make the tracepoint code call functions instead of
> > having everything inlined. It actually brought down the size of the text of
> > the kernel, but looking in the change logs I never posted benchmarks. But
> > I'm sure making the size of the scheduler text section smaller probably did
> > help.
> >
> > > > That would be in line with my understanding above. Does the arm64 compiler
> > > > not do it as well as x86 (could be maybe found out by disassembling) or the
> > > > Pixel6 cpu somhow caches these out of line blocks more aggressively and only
> > > > a function call stops it?
> > >
> > > I'll disassemble the code and will see what it looks like.
> >
> > I think I asked you to do that too ;-)
>
> Yes you did! And I disassembled almost each of these functions during
> my investigation but in my infinite wisdom I did not save any of them.
> So, now I need to do that again to answer Vlastimil's question. I'll
> try to do that today.
Yeah, quite a difference. This is alloc_tagging_slab_alloc_hook() with the
outlined version of __alloc_tagging_slab_alloc_hook():
ffffffc0803a2dd8 <alloc_tagging_slab_alloc_hook>:
ffffffc0803a2dd8: d503201f nop
ffffffc0803a2ddc: d65f03c0 ret
ffffffc0803a2de0: d503233f paciasp
ffffffc0803a2de4: a9bf7bfd stp x29, x30, [sp, #-0x10]!
ffffffc0803a2de8: 910003fd mov x29, sp
ffffffc0803a2dec: 94000004 bl 0xffffffc0803a2dfc <__alloc_tagging_slab_alloc_hook>
ffffffc0803a2df0: a8c17bfd ldp x29, x30, [sp], #0x10
ffffffc0803a2df4: d50323bf autiasp
ffffffc0803a2df8: d65f03c0 ret
This is the same function with the inlined version of
__alloc_tagging_slab_alloc_hook():
ffffffc0803a2dd8 <alloc_tagging_slab_alloc_hook>:
ffffffc0803a2dd8: d503233f paciasp
ffffffc0803a2ddc: d10103ff sub sp, sp, #0x40
ffffffc0803a2de0: a9017bfd stp x29, x30, [sp, #0x10]
ffffffc0803a2de4: f90013f5 str x21, [sp, #0x20]
ffffffc0803a2de8: a9034ff4 stp x20, x19, [sp, #0x30]
ffffffc0803a2dec: 910043fd add x29, sp, #0x10
ffffffc0803a2df0: d503201f nop
ffffffc0803a2df4: a9434ff4 ldp x20, x19, [sp, #0x30]
ffffffc0803a2df8: f94013f5 ldr x21, [sp, #0x20]
ffffffc0803a2dfc: a9417bfd ldp x29, x30, [sp, #0x10]
ffffffc0803a2e00: 910103ff add sp, sp, #0x40
ffffffc0803a2e04: d50323bf autiasp
ffffffc0803a2e08: d65f03c0 ret
ffffffc0803a2e0c: b4ffff41 cbz x1, 0xffffffc0803a2df4 <alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2e10: b9400808 ldr w8, [x0, #0x8]
ffffffc0803a2e14: 12060049 and w9, w2, #0x4000000
ffffffc0803a2e18: 12152108 and w8, w8, #0xff800
ffffffc0803a2e1c: 120d6108 and w8, w8, #0xfff80fff
ffffffc0803a2e20: 2a090108 orr w8, w8, w9
ffffffc0803a2e24: 35fffe88 cbnz w8, 0xffffffc0803a2df4 <alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2e28: d378dc28 lsl x8, x1, #8
ffffffc0803a2e2c: d2c01009 mov x9, #0x8000000000 // =549755813888
ffffffc0803a2e30: f9000fa0 str x0, [x29, #0x18]
ffffffc0803a2e34: f90007e1 str x1, [sp, #0x8]
ffffffc0803a2e38: 8b882128 add x8, x9, x8, asr #8
ffffffc0803a2e3c: b25f7be9 mov x9, #-0x200000000 // =-8589934592
ffffffc0803a2e40: f2b80009 movk x9, #0xc000, lsl #16
ffffffc0803a2e44: d34cfd08 lsr x8, x8, #12
ffffffc0803a2e48: 8b081928 add x8, x9, x8, lsl #6
ffffffc0803a2e4c: f9400509 ldr x9, [x8, #0x8]
ffffffc0803a2e50: d100052a sub x10, x9, #0x1
ffffffc0803a2e54: 7200013f tst w9, #0x1
ffffffc0803a2e58: 9a8a0108 csel x8, x8, x10, eq
ffffffc0803a2e5c: 3940cd09 ldrb w9, [x8, #0x33]
ffffffc0803a2e60: 7103d53f cmp w9, #0xf5
ffffffc0803a2e64: 9a9f0113 csel x19, x8, xzr, eq
ffffffc0803a2e68: f9401e68 ldr x8, [x19, #0x38]
ffffffc0803a2e6c: f1001d1f cmp x8, #0x7
ffffffc0803a2e70: 540000a8 b.hi 0xffffffc0803a2e84 <alloc_tagging_slab_alloc_hook+0xac>
ffffffc0803a2e74: aa1303e0 mov x0, x19
ffffffc0803a2e78: 2a1f03e3 mov w3, wzr
ffffffc0803a2e7c: 97ffd6a5 bl 0xffffffc080398910 <alloc_slab_obj_exts>
ffffffc0803a2e80: 350009c0 cbnz w0, 0xffffffc0803a2fb8 <alloc_tagging_slab_alloc_hook+0x1e0>
ffffffc0803a2e84: b000f2c8 adrp x8, 0xffffffc0821fb000 <max_load_balance_interval>
ffffffc0803a2e88: f9401e6a ldr x10, [x19, #0x38]
ffffffc0803a2e8c: f9453909 ldr x9, [x8, #0xa70]
ffffffc0803a2e90: 927df148 and x8, x10, #0xfffffffffffffff8
ffffffc0803a2e94: b40000e9 cbz x9, 0xffffffc0803a2eb0 <alloc_tagging_slab_alloc_hook+0xd8>
ffffffc0803a2e98: f94007ea ldr x10, [sp, #0x8]
ffffffc0803a2e9c: cb090149 sub x9, x10, x9
ffffffc0803a2ea0: f142013f cmp x9, #0x80, lsl #12 // =0x80000
ffffffc0803a2ea4: 54000062 b.hs 0xffffffc0803a2eb0 <alloc_tagging_slab_alloc_hook+0xd8>
ffffffc0803a2ea8: aa1f03e9 mov x9, xzr
ffffffc0803a2eac: 14000015 b 0xffffffc0803a2f00 <alloc_tagging_slab_alloc_hook+0x128>
ffffffc0803a2eb0: d2ffe009 mov x9, #-0x100000000000000 // =-72057594037927936
ffffffc0803a2eb4: 14000002 b 0xffffffc0803a2ebc <alloc_tagging_slab_alloc_hook+0xe4>
ffffffc0803a2eb8: aa1f03e9 mov x9, xzr
ffffffc0803a2ebc: d2dffa0a mov x10, #0xffd000000000 // =281268818280448
ffffffc0803a2ec0: f2e01fea movk x10, #0xff, lsl #48
ffffffc0803a2ec4: 8b13194a add x10, x10, x19, lsl #6
ffffffc0803a2ec8: 9274ad4a and x10, x10, #0xfffffffffff000
ffffffc0803a2ecc: aa0a012a orr x10, x9, x10
ffffffc0803a2ed0: f9400fa9 ldr x9, [x29, #0x18]
ffffffc0803a2ed4: f940112b ldr x11, [x9, #0x20]
ffffffc0803a2ed8: f94007e9 ldr x9, [sp, #0x8]
ffffffc0803a2edc: cb0a0129 sub x9, x9, x10
ffffffc0803a2ee0: d360fd6c lsr x12, x11, #32
ffffffc0803a2ee4: 9bab7d2a umull x10, w9, w11
ffffffc0803a2ee8: d368fd6b lsr x11, x11, #40
ffffffc0803a2eec: d360fd4a lsr x10, x10, #32
ffffffc0803a2ef0: 4b0a0129 sub w9, w9, w10
ffffffc0803a2ef4: 1acc2529 lsr w9, w9, w12
ffffffc0803a2ef8: 0b0a0129 add w9, w9, w10
ffffffc0803a2efc: 1acb2529 lsr w9, w9, w11
ffffffc0803a2f00: ab091109 adds x9, x8, x9, lsl #4
ffffffc0803a2f04: f9400fa8 ldr x8, [x29, #0x18]
ffffffc0803a2f08: 54fff760 b.eq 0xffffffc0803a2df4 <alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2f0c: b1002129 adds x9, x9, #0x8
ffffffc0803a2f10: 54fff720 b.eq 0xffffffc0803a2df4 <alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2f14: d5384113 mrs x19, SP_EL0
ffffffc0803a2f18: f9402a74 ldr x20, [x19, #0x50]
ffffffc0803a2f1c: b4fff6d4 cbz x20, 0xffffffc0803a2df4 <alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2f20: b9401915 ldr w21, [x8, #0x18]
ffffffc0803a2f24: f9000134 str x20, [x9]
ffffffc0803a2f28: b9401268 ldr w8, [x19, #0x10]
ffffffc0803a2f2c: 11000508 add w8, w8, #0x1
ffffffc0803a2f30: b9001268 str w8, [x19, #0x10]
ffffffc0803a2f34: f9401288 ldr x8, [x20, #0x20]
ffffffc0803a2f38: d538d089 mrs x9, TPIDR_EL1
ffffffc0803a2f3c: 8b090108 add x8, x8, x9
ffffffc0803a2f40: 52800029 mov w9, #0x1 // =1
ffffffc0803a2f44: 91002108 add x8, x8, #0x8
ffffffc0803a2f48: c85f7d0b ldxr x11, [x8]
ffffffc0803a2f4c: 8b09016b add x11, x11, x9
ffffffc0803a2f50: c80a7d0b stxr w10, x11, [x8]
ffffffc0803a2f54: 35ffffaa cbnz w10, 0xffffffc0803a2f48 <alloc_tagging_slab_alloc_hook+0x170>
ffffffc0803a2f58: f9400a68 ldr x8, [x19, #0x10]
ffffffc0803a2f5c: f1000508 subs x8, x8, #0x1
ffffffc0803a2f60: b9001268 str w8, [x19, #0x10]
ffffffc0803a2f64: 540003c0 b.eq 0xffffffc0803a2fdc <alloc_tagging_slab_alloc_hook+0x204>
ffffffc0803a2f68: f9400a68 ldr x8, [x19, #0x10]
ffffffc0803a2f6c: b4000388 cbz x8, 0xffffffc0803a2fdc <alloc_tagging_slab_alloc_hook+0x204>
ffffffc0803a2f70: b9401268 ldr w8, [x19, #0x10]
ffffffc0803a2f74: 11000508 add w8, w8, #0x1
ffffffc0803a2f78: b9001268 str w8, [x19, #0x10]
ffffffc0803a2f7c: f9401288 ldr x8, [x20, #0x20]
ffffffc0803a2f80: d538d089 mrs x9, TPIDR_EL1
ffffffc0803a2f84: 8b080128 add x8, x9, x8
ffffffc0803a2f88: c85f7d0a ldxr x10, [x8]
ffffffc0803a2f8c: 8b15014a add x10, x10, x21
ffffffc0803a2f90: c8097d0a stxr w9, x10, [x8]
ffffffc0803a2f94: 35ffffa9 cbnz w9, 0xffffffc0803a2f88 <alloc_tagging_slab_alloc_hook+0x1b0>
ffffffc0803a2f98: f9400a68 ldr x8, [x19, #0x10]
ffffffc0803a2f9c: f1000508 subs x8, x8, #0x1
ffffffc0803a2fa0: b9001268 str w8, [x19, #0x10]
ffffffc0803a2fa4: 54000060 b.eq 0xffffffc0803a2fb0 <alloc_tagging_slab_alloc_hook+0x1d8>
ffffffc0803a2fa8: f9400a68 ldr x8, [x19, #0x10]
ffffffc0803a2fac: b5fff248 cbnz x8, 0xffffffc0803a2df4 <alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2fb0: 94344478 bl 0xffffffc0810b4190 <preempt_schedule_notrace>
ffffffc0803a2fb4: 17ffff90 b 0xffffffc0803a2df4 <alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2fb8: f9400fa8 ldr x8, [x29, #0x18]
ffffffc0803a2fbc: f00092c0 adrp x0, 0xffffffc0815fd000 <f_midi_shortname+0x4cf4>
ffffffc0803a2fc0: 910e5400 add x0, x0, #0x395
ffffffc0803a2fc4: d00099c1 adrp x1, 0xffffffc0816dc000 <longname+0x2727d>
ffffffc0803a2fc8: 911d1421 add x1, x1, #0x745
ffffffc0803a2fcc: f9403102 ldr x2, [x8, #0x60]
ffffffc0803a2fd0: 97f46d47 bl 0xffffffc0800be4ec <__warn_printk>
ffffffc0803a2fd4: d4210000 brk #0x800
ffffffc0803a2fd8: 17ffff87 b 0xffffffc0803a2df4 <alloc_tagging_slab_alloc_hook+0x1c>
ffffffc0803a2fdc: 9434446d bl 0xffffffc0810b4190 <preempt_schedule_notrace>
ffffffc0803a2fe0: 17ffffe4 b 0xffffffc0803a2f70 <alloc_tagging_slab_alloc_hook+0x198>
>
> >
> > >
> > > >
> > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > > >
> > > > Kinda sad that despite the static key we have to control a lot by the
> > > > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT in addition.
> > >
> > > I agree. If there is a better way to fix this regression I'm open to
> > > changes. Let's wait for Steven to confirm my understanding before
> > > proceeding.
> >
> > How slow is it to always do the call instead of inlining?
>
> Let's see... The additional overhead if we always call is:
>
> Little core: 2.42%
> Middle core: 1.23%
> Big core: 0.66%
>
> Not a huge deal because the overhead of memory profiling when enabled
> is much higher. So, maybe for simplicity I should indeed always call?
>
> >
> > -- Steve
* Re: [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator
2025-01-29 2:54 ` Suren Baghdasaryan
@ 2025-01-29 9:38 ` Vlastimil Babka
0 siblings, 0 replies; 13+ messages in thread
From: Vlastimil Babka @ 2025-01-29 9:38 UTC (permalink / raw)
To: Suren Baghdasaryan, Steven Rostedt
Cc: akpm, Peter Zijlstra, kent.overstreet, yuzhao, minchan,
shakeel.butt, souravpanda, pasha.tatashin, 00107082,
quic_zhenhuah, linux-mm, linux-kernel
On 1/29/25 03:54, Suren Baghdasaryan wrote:
> On Tue, Jan 28, 2025 at 3:43 PM Suren Baghdasaryan <surenb@google.com> wrote:
>>
>> On Tue, Jan 28, 2025 at 11:35 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>> >
>> > On Mon, 27 Jan 2025 11:38:32 -0800
>> > Suren Baghdasaryan <surenb@google.com> wrote:
>> >
>> > > On Sun, Jan 26, 2025 at 8:47 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>> > > >
>> > > > On 1/26/25 08:02, Suren Baghdasaryan wrote:
>> > > > > When a sizable code section is protected by a disabled static key, that
>> > > > > code gets into the instruction cache even though it's not executed and
>> > > > > consumes the cache, increasing cache misses. This can be remedied by
>> > > > > moving such code into a separate uninlined function. The improvement
>> > >
>> > > Sorry, I missed adding Steven Rostedt into the CC list since his
>> > > advice was instrumental in finding the way to optimize the static key
>> > > performance in this patch. Added now.
>> > >
>> > > >
>> > > > Weird, I thought the static_branch_likely/unlikely/maybe was already
>> > > > handling this by the unlikely case being a jump to a block away from the
>> > > > fast-path stream of instructions, thus making it less likely to get cached.
>> > > > AFAIU even plain likely()/unlikely() should do this, along with branch
>> > > > prediction hints.
>> > >
>> > > This was indeed an unexpected overhead when I measured it on Android.
>> > > Cache pollution was my understanding of the cause for this high
>> > > overhead after Steven told me to try uninlining the protected code. He
>> > > has done something similar in the tracing subsystem. But maybe I
>> > > misunderstood the real reason. Steven, could you please verify if my
>> > > understanding of the high overhead cause is correct here? Maybe there
>> > > is something else at play that I missed?
>> >
>> > From what I understand, the compiler will only move code to the end
>> > of a function with the unlikely(). But the code after the function could
>> > also be in the control flow path. If you have several functions that are
>> > called together, adding code to the unlikely() cases may not help the
>> > speed.
>> >
>> > I made an effort to make the tracepoint code call functions instead of
>> > having everything inlined. It actually brought down the size of the text of
>> > the kernel, but looking in the change logs I never posted benchmarks. But
>> > I'm sure making the size of the scheduler text section smaller probably did
>> > help.
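A minimal C sketch of the layout difference being described here; the key,
the struct and account() are made-up names used only for illustration:

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(profiling_key);

struct obj;
void account(struct obj *o);	/* stands in for the gated slow-path work */

/* Variant 1: the gated block stays inline.  unlikely() only pushes it to
 * the end of the enclosing function, so it still occupies I-cache lines
 * next to the surrounding hot text. */
static inline void hook_inline(struct obj *o)
{
	if (static_branch_unlikely(&profiling_key)) {
		/* imagine a sizable accounting block emitted here */
		account(o);
	}
}

/* Variant 2: the gated block is moved out of line.  Only the static-key
 * nop/branch remains in the fast path; the cold body lives elsewhere in
 * .text. */
static noinline void hook_body(struct obj *o)
{
	account(o);
}

static inline void hook_outofline(struct obj *o)
{
	if (static_branch_unlikely(&profiling_key))
		hook_body(o);
}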
>> >
>> > > > That would be in line with my understanding above. Does the arm64 compiler
>> > > > not do it as well as x86 (maybe it could be found out by disassembling) or the
>> > > > Pixel6 cpu somehow caches these out-of-line blocks more aggressively and only
>> > > > a function call stops it?
>> > >
>> > > I'll disassemble the code and will see what it looks like.
>> >
>> > I think I asked you to do that too ;-)
>>
>> Yes you did! And I disassembled almost every one of these functions during
>> my investigation, but in my infinite wisdom I did not save any of them.
>> So, now I need to do that again to answer Vlastimil's question. I'll
>> try to do that today.
>
> Yeah, quite a difference. This is alloc_tagging_slab_alloc_hook() with
> outlined version of __alloc_tagging_slab_alloc_hook():
Not fluent in arm64 assembly but let's see...
> ffffffc0803a2dd8 <alloc_tagging_slab_alloc_hook>:
> ffffffc0803a2dd8: d503201f nop
> ffffffc0803a2ddc: d65f03c0 ret
So that's an immediate return unless the static key rewrites the nop.
BTW, I wouldn't expect the alloc_tagging_slab_alloc_hook() to exist as a
separate function in the first place, since it's "static inline". It seems
weird to do a function call to a static key test. We should perhaps force
inline it.
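Roughly, the force-inlined form would look like this (the argument list is
assumed here, not taken from the patch):

static __always_inline void alloc_tagging_slab_alloc_hook(struct kmem_cache *s,
							   void *object, gfp_t flags)
{
	if (mem_alloc_profiling_enabled())
		__alloc_tagging_slab_alloc_hook(s, object, flags);
}

so that only the static-key test (the nop seen above) ends up at the call
site, and the body stays out of line.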
> ffffffc0803a2de0: d503233f paciasp
> ffffffc0803a2de4: a9bf7bfd stp x29, x30, [sp, #-0x10]!
> ffffffc0803a2de8: 910003fd mov x29, sp
> ffffffc0803a2dec: 94000004 bl 0xffffffc0803a2dfc
> <__alloc_tagging_slab_alloc_hook>
> ffffffc0803a2df0: a8c17bfd ldp x29, x30, [sp], #0x10
> ffffffc0803a2df4: d50323bf autiasp
> ffffffc0803a2df8: d65f03c0 ret
>
> This is the same function with inlined version of
> __alloc_tagging_slab_alloc_hook():
>
> ffffffc0803a2dd8 <alloc_tagging_slab_alloc_hook>:
> ffffffc0803a2dd8: d503233f paciasp
> ffffffc0803a2ddc: d10103ff sub sp, sp, #0x40
> ffffffc0803a2de0: a9017bfd stp x29, x30, [sp, #0x10]
> ffffffc0803a2de4: f90013f5 str x21, [sp, #0x20]
> ffffffc0803a2de8: a9034ff4 stp x20, x19, [sp, #0x30]
> ffffffc0803a2dec: 910043fd add x29, sp, #0x10
> ffffffc0803a2df0: d503201f nop
> ffffffc0803a2df4: a9434ff4 ldp x20, x19, [sp, #0x30]
> ffffffc0803a2df8: f94013f5 ldr x21, [sp, #0x20]
> ffffffc0803a2dfc: a9417bfd ldp x29, x30, [sp, #0x10]
> ffffffc0803a2e00: 910103ff add sp, sp, #0x40
> ffffffc0803a2e04: d50323bf autiasp
> ffffffc0803a2e08: d65f03c0 ret
Seems to me this will also return unless the nop is rewritten, but instead
of making a call reachable, there will be a jump to the block below?
Now, is the overhead larger because the code below gets cached, or because
the block above is doing more in the disabled case? It looks quite suboptimal.
> ffffffc0803a2e0c: b4ffff41 cbz x1, 0xffffffc0803a2df4
> <alloc_tagging_slab_alloc_hook+0x1c>
> ffffffc0803a2e10: b9400808 ldr w8, [x0, #0x8]
> ffffffc0803a2e14: 12060049 and w9, w2, #0x4000000
> ffffffc0803a2e18: 12152108 and w8, w8, #0xff800
> ffffffc0803a2e1c: 120d6108 and w8, w8, #0xfff80fff
> ffffffc0803a2e20: 2a090108 orr w8, w8, w9
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator
2025-01-29 0:03 ` Steven Rostedt
@ 2025-01-29 9:50 ` Vlastimil Babka
2025-01-29 17:26 ` Suren Baghdasaryan
0 siblings, 1 reply; 13+ messages in thread
From: Vlastimil Babka @ 2025-01-29 9:50 UTC (permalink / raw)
To: Steven Rostedt, Suren Baghdasaryan
Cc: akpm, Peter Zijlstra, kent.overstreet, yuzhao, minchan,
shakeel.butt, souravpanda, pasha.tatashin, 00107082,
quic_zhenhuah, linux-mm, linux-kernel
On 1/29/25 01:03, Steven Rostedt wrote:
> On Tue, 28 Jan 2025 15:43:13 -0800
> Suren Baghdasaryan <surenb@google.com> wrote:
>
>> > How slow is it to always do the call instead of inlining?
>>
>> Let's see... The additional overhead if we always call is:
>>
>> Little core: 2.42%
>> Middle core: 1.23%
>> Big core: 0.66%
>>
>> Not a huge deal because the overhead of memory profiling when enabled
>> is much higher. So, maybe for simplicity I should indeed always call?
>
> That's what I was thinking, unless the other maintainers are OK with this
> special logic.
If it's acceptable, I would prefer to always call. But at the same time make
sure the static key test is really inlined, i.e. force inline
alloc_tagging_slab_alloc_hook() (see my other reply looking at the disassembly).
Well, or rather just open-code the contents of the
alloc_tagging_slab_alloc_hook and alloc_tagging_slab_free_hook (as they look
after this patch) into the callers. It's just two lines. The extra layer is
just an unnecessary distraction.
Then it's probably inevitable that the actual hook content after the static
key test should not be inlined, even with
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, as the result would be inlined
into too many places. But since we remove one call layer anyway thanks to the
above, even without the full inlining the resulting performance could
hopefully be fine (compared to the state before your series).
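For illustration, a sketch of how the two open-coded lines would sit directly
in the existing fast-path caller (the local variable names are assumed), with
the hook body itself kept out of line in all configurations:

	/* open-coded in the slab fast path, no intermediate wrapper: */
	if (mem_alloc_profiling_enabled())
		__alloc_tagging_slab_alloc_hook(s, object, flags);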
> -- Steve
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator
2025-01-29 9:50 ` Vlastimil Babka
@ 2025-01-29 17:26 ` Suren Baghdasaryan
0 siblings, 0 replies; 13+ messages in thread
From: Suren Baghdasaryan @ 2025-01-29 17:26 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Steven Rostedt, akpm, Peter Zijlstra, kent.overstreet, yuzhao,
minchan, shakeel.butt, souravpanda, pasha.tatashin, 00107082,
quic_zhenhuah, linux-mm, linux-kernel
[-- Attachment #1: Type: text/plain, Size: 2221 bytes --]
On Wed, Jan 29, 2025 at 1:50 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 1/29/25 01:03, Steven Rostedt wrote:
> > On Tue, 28 Jan 2025 15:43:13 -0800
> > Suren Baghdasaryan <surenb@google.com> wrote:
> >
> >> > How slow is it to always do the call instead of inlining?
> >>
> >> Let's see... The additional overhead if we always call is:
> >>
> >> Little core: 2.42%
> >> Middle core: 1.23%
> >> Big core: 0.66%
> >>
> >> Not a huge deal because the overhead of memory profiling when enabled
> >> is much higher. So, maybe for simplicity I should indeed always call?
> >
> > That's what I was thinking, unless the other maintainers are OK with this
> > special logic.
>
> If it's acceptable, I would prefer to always call.
Ok, I'll post that version. If this becomes an issue we can reconsider later.
> But at the same time make
> sure the static key test is really inlined, i.e. force inline
> alloc_tagging_slab_alloc_hook() (see my other reply looking at the disassembly).
Sorry, I should have made it clear that I uninlined
alloc_tagging_slab_alloc_hook() only to localize the relevant code. In
reality it is inlined. Since the inlined outputs are quite big, I'm
attaching the disassembly of kmem_cache_alloc_noprof(), which has
alloc_tagging_slab_alloc_hook() inlined into it.
>
> Well or rather just open-code the contents of the
> alloc_tagging_slab_alloc_hook and alloc_tagging_slab_free_hook (as they look
> after this patch) into the callers. It's just two lines. The extra layer is
> just unnecessary distraction.
alloc_tagging_slab_alloc_hook() is inlined, no need to open-code.
>
> Then it's probably inevitable the actual hook content after the static key
> test should not be inline even with
> CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT as the result would be inlined
> into too many places. But since we remove one call layer anyway thanks to
> above, even without the full inlining the resulting performance could
> hopefully be fine (compared to the state before your series).
Agree. Thanks for the feedback!
I'll prepare v2 with no dependency on
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT for not inlining (always
call).
>
> > -- Steve
>
[-- Attachment #2: noinline.txt --]
[-- Type: text/plain, Size: 14234 bytes --]
ffffffc080398e08 <kmem_cache_alloc_noprof>:
ffffffc080398e08: d503233f paciasp
ffffffc080398e0c: d101c3ff sub sp, sp, #0x70
ffffffc080398e10: a9017bfd stp x29, x30, [sp, #0x10]
ffffffc080398e14: a9026ffc stp x28, x27, [sp, #0x20]
ffffffc080398e18: a90367fa stp x26, x25, [sp, #0x30]
ffffffc080398e1c: a9045ff8 stp x24, x23, [sp, #0x40]
ffffffc080398e20: a90557f6 stp x22, x21, [sp, #0x50]
ffffffc080398e24: a9064ff4 stp x20, x19, [sp, #0x60]
ffffffc080398e28: 910043fd add x29, sp, #0x10
ffffffc080398e2c: d5384108 mrs x8, SP_EL0
ffffffc080398e30: aa0003f3 mov x19, x0
ffffffc080398e34: 2a0103f4 mov w20, w1
ffffffc080398e38: f9430908 ldr x8, [x8, #0x610]
ffffffc080398e3c: f90007e8 str x8, [sp, #0x8]
ffffffc080398e40: d50320ff xpaclri
ffffffc080398e44: aa1e03f5 mov x21, x30
ffffffc080398e48: b4000d60 cbz x0, 0xffffffc080398ff4 <kmem_cache_alloc_noprof+0x1ec>
ffffffc080398e4c: b9401e77 ldr w23, [x19, #0x1c]
ffffffc080398e50: d503201f nop
ffffffc080398e54: f90003ff str xzr, [sp]
ffffffc080398e58: d538411a mrs x26, SP_EL0
ffffffc080398e5c: f9400268 ldr x8, [x19]
ffffffc080398e60: d538d089 mrs x9, TPIDR_EL1
ffffffc080398e64: 8b080128 add x8, x9, x8
ffffffc080398e68: f9400518 ldr x24, [x8, #0x8]
ffffffc080398e6c: f9400116 ldr x22, [x8]
ffffffc080398e70: f9400908 ldr x8, [x8, #0x10]
ffffffc080398e74: f10002df cmp x22, #0x0
ffffffc080398e78: fa401904 ccmp x8, #0x0, #0x4, ne
ffffffc080398e7c: 54000da0 b.eq 0xffffffc080399030 <kmem_cache_alloc_noprof+0x228>
ffffffc080398e80: d378dec8 lsl x8, x22, #8
ffffffc080398e84: b9402a69 ldr w9, [x19, #0x28]
ffffffc080398e88: f9405e6a ldr x10, [x19, #0xb8]
ffffffc080398e8c: 91008303 add x3, x24, #0x20
ffffffc080398e90: 8b882128 add x8, x9, x8, asr #8
ffffffc080398e94: f9400109 ldr x9, [x8]
ffffffc080398e98: b940134b ldr w11, [x26, #0x10]
ffffffc080398e9c: dac00d08 rev x8, x8
ffffffc080398ea0: ca080148 eor x8, x10, x8
ffffffc080398ea4: 1100056b add w11, w11, #0x1
ffffffc080398ea8: ca090119 eor x25, x8, x9
ffffffc080398eac: b900134b str w11, [x26, #0x10]
ffffffc080398eb0: f940026b ldr x11, [x19]
ffffffc080398eb4: d538d08c mrs x12, TPIDR_EL1
ffffffc080398eb8: 8b0b0184 add x4, x12, x11
ffffffc080398ebc: 14000015 b 0xffffffc080398f10 <kmem_cache_alloc_noprof+0x108>
ffffffc080398ec0: aa1603e0 mov x0, x22
ffffffc080398ec4: aa1803e1 mov x1, x24
ffffffc080398ec8: aa1903e2 mov x2, x25
ffffffc080398ecc: 48207c82 casp x0, x1, x2, x3, [x4]
ffffffc080398ed0: f9400b48 ldr x8, [x26, #0x10]
ffffffc080398ed4: f1000508 subs x8, x8, #0x1
ffffffc080398ed8: b9001348 str w8, [x26, #0x10]
ffffffc080398edc: 540000e0 b.eq 0xffffffc080398ef8 <kmem_cache_alloc_noprof+0xf0>
ffffffc080398ee0: f9400b48 ldr x8, [x26, #0x10]
ffffffc080398ee4: b40000a8 cbz x8, 0xffffffc080398ef8 <kmem_cache_alloc_noprof+0xf0>
ffffffc080398ee8: eb18003f cmp x1, x24
ffffffc080398eec: fa560000 ccmp x0, x22, #0x0, eq
ffffffc080398ef0: 54000200 b.eq 0xffffffc080398f30 <kmem_cache_alloc_noprof+0x128>
ffffffc080398ef4: 17ffffda b 0xffffffc080398e5c <kmem_cache_alloc_noprof+0x54>
ffffffc080398ef8: aa0103fb mov x27, x1
ffffffc080398efc: aa0003fc mov x28, x0
ffffffc080398f00: 94346cb4 bl 0xffffffc0810b41d0 <preempt_schedule_notrace>
ffffffc080398f04: aa1c03e0 mov x0, x28
ffffffc080398f08: aa1b03e1 mov x1, x27
ffffffc080398f0c: 17fffff7 b 0xffffffc080398ee8 <kmem_cache_alloc_noprof+0xe0>
ffffffc080398f10: f9800091 prfm pstl1strm, [x4]
ffffffc080398f14: c87f0480 ldxp x0, x1, [x4]
ffffffc080398f18: eb16001f cmp x0, x22
ffffffc080398f1c: fa580020 ccmp x1, x24, #0x0, eq
ffffffc080398f20: 54000061 b.ne 0xffffffc080398f2c <kmem_cache_alloc_noprof+0x124>
ffffffc080398f24: c8280c99 stxp w8, x25, x3, [x4]
ffffffc080398f28: 35ffff68 cbnz w8, 0xffffffc080398f14 <kmem_cache_alloc_noprof+0x10c>
ffffffc080398f2c: 17ffffe9 b 0xffffffc080398ed0 <kmem_cache_alloc_noprof+0xc8>
ffffffc080398f30: b9402a68 ldr w8, [x19, #0x28]
ffffffc080398f34: 8b080328 add x8, x25, x8
ffffffc080398f38: f9800110 prfm pstl1keep, [x8]
ffffffc080398f3c: f90003f6 str x22, [sp]
ffffffc080398f40: d503201f nop
ffffffc080398f44: d503201f nop
ffffffc080398f48: f9402668 ldr x8, [x19, #0x48]
ffffffc080398f4c: b4000068 cbz x8, 0xffffffc080398f58 <kmem_cache_alloc_noprof+0x150>
ffffffc080398f50: 2a1f03f8 mov w24, wzr
ffffffc080398f54: 14000008 b 0xffffffc080398f74 <kmem_cache_alloc_noprof+0x16c>
ffffffc080398f58: 79401268 ldrh w8, [x19, #0x8]
ffffffc080398f5c: 52804089 mov w9, #0x204 // =516
ffffffc080398f60: 6a09011f tst w8, w9
ffffffc080398f64: 54000060 b.eq 0xffffffc080398f70 <kmem_cache_alloc_noprof+0x168>
ffffffc080398f68: 53082298 ubfx w24, w20, #8, #1
ffffffc080398f6c: 14000002 b 0xffffffc080398f74 <kmem_cache_alloc_noprof+0x16c>
ffffffc080398f70: 52800038 mov w24, #0x1 // =1
ffffffc080398f74: f000f308 adrp x8, 0xffffffc0821fb000 <max_load_balance_interval>
ffffffc080398f78: b9401e79 ldr w25, [x19, #0x1c]
ffffffc080398f7c: b9495908 ldr w8, [x8, #0x958]
ffffffc080398f80: d503201f nop
ffffffc080398f84: d503201f nop
ffffffc080398f88: 2a1803f7 mov w23, w24
ffffffc080398f8c: 14000007 b 0xffffffc080398fa8 <kmem_cache_alloc_noprof+0x1a0>
ffffffc080398f90: 0a140102 and w2, w8, w20
ffffffc080398f94: aa1303e0 mov x0, x19
ffffffc080398f98: aa1603e1 mov x1, x22
ffffffc080398f9c: 2a1703e3 mov w3, w23
ffffffc080398fa0: 940077b3 bl 0xffffffc0803b6e6c <__kasan_slab_alloc>
ffffffc080398fa4: aa0003f6 mov x22, x0
ffffffc080398fa8: f10002df cmp x22, #0x0
ffffffc080398fac: 52000308 eor w8, w24, #0x1
ffffffc080398fb0: f90003f6 str x22, [sp]
ffffffc080398fb4: 1a9f1508 csinc w8, w8, wzr, ne
ffffffc080398fb8: 37000128 tbnz w8, #0x0, 0xffffffc080398fdc <kmem_cache_alloc_noprof+0x1d4>
ffffffc080398fbc: 34000077 cbz w23, 0xffffffc080398fc8 <kmem_cache_alloc_noprof+0x1c0>
ffffffc080398fc0: 14000002 b 0xffffffc080398fc8 <kmem_cache_alloc_noprof+0x1c0>
ffffffc080398fc4: 14000006 b 0xffffffc080398fdc <kmem_cache_alloc_noprof+0x1d4>
ffffffc080398fc8: 2a1903e2 mov w2, w25
ffffffc080398fcc: aa1603e0 mov x0, x22
ffffffc080398fd0: 2a1f03e1 mov w1, wzr
ffffffc080398fd4: 94337c7b bl 0xffffffc0810781c0 <memset>
ffffffc080398fd8: f94003f6 ldr x22, [sp]
ffffffc080398fdc: d503201f nop
ffffffc080398fe0: 14000004 b 0xffffffc080398ff0 <kmem_cache_alloc_noprof+0x1e8>
ffffffc080398fe4: 37b00534 tbnz w20, #0x16, 0xffffffc080399088 <kmem_cache_alloc_noprof+0x280>
ffffffc080398fe8: 39402668 ldrb w8, [x19, #0x9]
ffffffc080398fec: 372804e8 tbnz w8, #0x5, 0xffffffc080399088 <kmem_cache_alloc_noprof+0x280>
ffffffc080398ff0: f94003e0 ldr x0, [sp]
ffffffc080398ff4: d503201f nop
ffffffc080398ff8: d5384108 mrs x8, SP_EL0
ffffffc080398ffc: f9430908 ldr x8, [x8, #0x610]
ffffffc080399000: f94007e9 ldr x9, [sp, #0x8]
ffffffc080399004: eb09011f cmp x8, x9
ffffffc080399008: 54000581 b.ne 0xffffffc0803990b8 <kmem_cache_alloc_noprof+0x2b0>
ffffffc08039900c: a9464ff4 ldp x20, x19, [sp, #0x60]
ffffffc080399010: a94557f6 ldp x22, x21, [sp, #0x50]
ffffffc080399014: a9445ff8 ldp x24, x23, [sp, #0x40]
ffffffc080399018: a94367fa ldp x26, x25, [sp, #0x30]
ffffffc08039901c: a9426ffc ldp x28, x27, [sp, #0x20]
ffffffc080399020: a9417bfd ldp x29, x30, [sp, #0x10]
ffffffc080399024: 9101c3ff add sp, sp, #0x70
ffffffc080399028: d50323bf autiasp
ffffffc08039902c: d65f03c0 ret
ffffffc080399030: d5384118 mrs x24, SP_EL0
ffffffc080399034: b9401308 ldr w8, [x24, #0x10]
ffffffc080399038: aa1303e0 mov x0, x19
ffffffc08039903c: 2a1403e1 mov w1, w20
ffffffc080399040: 12800002 mov w2, #-0x1 // =-1
ffffffc080399044: aa1503e3 mov x3, x21
ffffffc080399048: 11000508 add w8, w8, #0x1
ffffffc08039904c: 2a1703e5 mov w5, w23
ffffffc080399050: b9001308 str w8, [x24, #0x10]
ffffffc080399054: f9400268 ldr x8, [x19]
ffffffc080399058: d538d089 mrs x9, TPIDR_EL1
ffffffc08039905c: 8b080124 add x4, x9, x8
ffffffc080399060: 94001600 bl 0xffffffc08039e860 <___slab_alloc>
ffffffc080399064: aa0003f6 mov x22, x0
ffffffc080399068: f9400b08 ldr x8, [x24, #0x10]
ffffffc08039906c: f1000508 subs x8, x8, #0x1
ffffffc080399070: b9001308 str w8, [x24, #0x10]
ffffffc080399074: 54000060 b.eq 0xffffffc080399080 <kmem_cache_alloc_noprof+0x278>
ffffffc080399078: f9400b08 ldr x8, [x24, #0x10]
ffffffc08039907c: b5fff608 cbnz x8, 0xffffffc080398f3c <kmem_cache_alloc_noprof+0x134>
ffffffc080399080: 94346942 bl 0xffffffc0810b3588 <preempt_schedule>
ffffffc080399084: 17ffffae b 0xffffffc080398f3c <kmem_cache_alloc_noprof+0x134>
ffffffc080399088: 910003e4 mov x4, sp
ffffffc08039908c: aa1303e0 mov x0, x19
ffffffc080399090: aa1f03e1 mov x1, xzr
ffffffc080399094: 2a1403e2 mov w2, w20
ffffffc080399098: 52800023 mov w3, #0x1 // =1
ffffffc08039909c: 940105fd bl 0xffffffc0803da890 <__memcg_slab_post_alloc_hook>
ffffffc0803990a0: 3707fa80 tbnz w0, #0x0, 0xffffffc080398ff0 <kmem_cache_alloc_noprof+0x1e8>
ffffffc0803990a4: f94003e1 ldr x1, [sp]
ffffffc0803990a8: aa1303e0 mov x0, x19
ffffffc0803990ac: 940027e0 bl 0xffffffc0803a302c <memcg_alloc_abort_single>
ffffffc0803990b0: f90003ff str xzr, [sp]
ffffffc0803990b4: 17ffffcf b 0xffffffc080398ff0 <kmem_cache_alloc_noprof+0x1e8>
ffffffc0803990b8: 94345d5d bl 0xffffffc0810b062c <__stack_chk_fail>
ffffffc0803990bc: d5384117 mrs x23, SP_EL0
ffffffc0803990c0: b9402ae8 ldr w8, [x23, #0x28]
ffffffc0803990c4: b000f30a adrp x10, 0xffffffc0821fa000 <nf_conntrack_locks+0x500>
ffffffc0803990c8: 913ea14a add x10, x10, #0xfa8
ffffffc0803990cc: d343fd09 lsr x9, x8, #3
ffffffc0803990d0: 927d6529 and x9, x9, #0x1ffffff8
ffffffc0803990d4: f8696949 ldr x9, [x10, x9]
ffffffc0803990d8: 9ac82528 lsr x8, x9, x8
ffffffc0803990dc: 3607f8e8 tbz w8, #0x0, 0xffffffc080398ff8 <kmem_cache_alloc_noprof+0x1f0>
ffffffc0803990e0: b94012e8 ldr w8, [x23, #0x10]
ffffffc0803990e4: aa0003f6 mov x22, x0
ffffffc0803990e8: aa1f03e0 mov x0, xzr
ffffffc0803990ec: aa1503e1 mov x1, x21
ffffffc0803990f0: aa1603e2 mov x2, x22
ffffffc0803990f4: aa1303e3 mov x3, x19
ffffffc0803990f8: 11000508 add w8, w8, #0x1
ffffffc0803990fc: 2a1403e4 mov w4, w20
ffffffc080399100: 12800005 mov w5, #-0x1 // =-1
ffffffc080399104: b90012e8 str w8, [x23, #0x10]
ffffffc080399108: 97fea8c1 bl 0xffffffc08034340c <__traceiter_kmem_cache_alloc>
ffffffc08039910c: f9400ae8 ldr x8, [x23, #0x10]
ffffffc080399110: f1000508 subs x8, x8, #0x1
ffffffc080399114: b90012e8 str w8, [x23, #0x10]
ffffffc080399118: 54000080 b.eq 0xffffffc080399128 <kmem_cache_alloc_noprof+0x320>
ffffffc08039911c: f9400ae8 ldr x8, [x23, #0x10]
ffffffc080399120: aa1603e0 mov x0, x22
ffffffc080399124: b5fff6a8 cbnz x8, 0xffffffc080398ff8 <kmem_cache_alloc_noprof+0x1f0>
ffffffc080399128: 94346c2a bl 0xffffffc0810b41d0 <preempt_schedule_notrace>
ffffffc08039912c: aa1603e0 mov x0, x22
ffffffc080399130: 17ffffb2 b 0xffffffc080398ff8 <kmem_cache_alloc_noprof+0x1f0>
ffffffc080399134: 9000f9a8 adrp x8, 0xffffffc0822cd000 <page_alloc_sysctl_table+0xa8>
ffffffc080399138: b94e8908 ldr w8, [x8, #0xe88]
ffffffc08039913c: 7100051f cmp w8, #0x1
ffffffc080399140: 54ffe8aa b.ge 0xffffffc080398e54 <kmem_cache_alloc_noprof+0x4c>
ffffffc080399144: aa1303e0 mov x0, x19
ffffffc080399148: aa1703e1 mov x1, x23
ffffffc08039914c: 2a1403e2 mov w2, w20
ffffffc080399150: 940080f0 bl 0xffffffc0803b9510 <__kfence_alloc>
ffffffc080399154: f90003e0 str x0, [sp]
ffffffc080399158: b4ffe800 cbz x0, 0xffffffc080398e58 <kmem_cache_alloc_noprof+0x50>
ffffffc08039915c: aa0003f6 mov x22, x0
ffffffc080399160: 2a1f03f8 mov w24, wzr
ffffffc080399164: 17ffff84 b 0xffffffc080398f74 <kmem_cache_alloc_noprof+0x16c>
ffffffc080399168: f9402668 ldr x8, [x19, #0x48]
ffffffc08039916c: b5ffeec8 cbnz x8, 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc080399170: 79401268 ldrh w8, [x19, #0x8]
ffffffc080399174: 52804089 mov w9, #0x204 // =516
ffffffc080399178: 6a09011f tst w8, w9
ffffffc08039917c: 54ffee41 b.ne 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc080399180: b4ffee36 cbz x22, 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc080399184: b9402a68 ldr w8, [x19, #0x28]
ffffffc080399188: b9405269 ldr w9, [x19, #0x50]
ffffffc08039918c: 6b09011f cmp w8, w9
ffffffc080399190: 54ffeda2 b.hs 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc080399194: 9340dec9 sbfx x9, x22, #0, #56
ffffffc080399198: f828693f str xzr, [x9, x8]
ffffffc08039919c: 17ffff6a b 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc0803991a0: b9400a69 ldr w9, [x19, #0x8]
ffffffc0803991a4: 5280104a mov w10, #0x82 // =130
ffffffc0803991a8: 6a0a013f tst w9, w10
ffffffc0803991ac: 54ffeec0 b.eq 0xffffffc080398f84 <kmem_cache_alloc_noprof+0x17c>
ffffffc0803991b0: 721d013f tst w9, #0x8
ffffffc0803991b4: 1a970339 csel w25, w25, w23, eq
ffffffc0803991b8: 17ffff73 b 0xffffffc080398f84 <kmem_cache_alloc_noprof+0x17c>
ffffffc0803991bc: 2a1f03f7 mov w23, wzr
ffffffc0803991c0: 17ffff73 b 0xffffffc080398f8c <kmem_cache_alloc_noprof+0x184>
ffffffc0803991c4: aa1303e0 mov x0, x19
ffffffc0803991c8: aa1603e1 mov x1, x22
ffffffc0803991cc: 2a1403e2 mov w2, w20
ffffffc0803991d0: 94002714 bl 0xffffffc0803a2e20 <__alloc_tagging_slab_alloc_hook>
ffffffc0803991d4: 17ffff83 b 0xffffffc080398fe0 <kmem_cache_alloc_noprof+0x1d8>
ffffffc0803991d8: 77 48 22 d5 .word 0xd5224877
[-- Attachment #3: inline.txt --]
[-- Type: text/plain, Size: 28223 bytes --]
ffffffc080398e08 <kmem_cache_alloc_noprof>:
ffffffc080398e08: d503233f paciasp
ffffffc080398e0c: d101c3ff sub sp, sp, #0x70
ffffffc080398e10: a9017bfd stp x29, x30, [sp, #0x10]
ffffffc080398e14: a9026ffc stp x28, x27, [sp, #0x20]
ffffffc080398e18: a90367fa stp x26, x25, [sp, #0x30]
ffffffc080398e1c: a9045ff8 stp x24, x23, [sp, #0x40]
ffffffc080398e20: a90557f6 stp x22, x21, [sp, #0x50]
ffffffc080398e24: a9064ff4 stp x20, x19, [sp, #0x60]
ffffffc080398e28: 910043fd add x29, sp, #0x10
ffffffc080398e2c: d5384108 mrs x8, SP_EL0
ffffffc080398e30: aa0003f3 mov x19, x0
ffffffc080398e34: 2a0103f4 mov w20, w1
ffffffc080398e38: f9430908 ldr x8, [x8, #0x610]
ffffffc080398e3c: f90007e8 str x8, [sp, #0x8]
ffffffc080398e40: d50320ff xpaclri
ffffffc080398e44: aa1e03f5 mov x21, x30
ffffffc080398e48: b4000dc0 cbz x0, 0xffffffc080399000 <kmem_cache_alloc_noprof+0x1f8>
ffffffc080398e4c: b9401e77 ldr w23, [x19, #0x1c]
ffffffc080398e50: d503201f nop
ffffffc080398e54: f90003ff str xzr, [sp]
ffffffc080398e58: d538411a mrs x26, SP_EL0
ffffffc080398e5c: f9400268 ldr x8, [x19]
ffffffc080398e60: d538d089 mrs x9, TPIDR_EL1
ffffffc080398e64: 8b080128 add x8, x9, x8
ffffffc080398e68: f9400518 ldr x24, [x8, #0x8]
ffffffc080398e6c: f9400116 ldr x22, [x8]
ffffffc080398e70: f9400908 ldr x8, [x8, #0x10]
ffffffc080398e74: f10002df cmp x22, #0x0
ffffffc080398e78: fa401904 ccmp x8, #0x0, #0x4, ne
ffffffc080398e7c: 54000e00 b.eq 0xffffffc08039903c <kmem_cache_alloc_noprof+0x234>
ffffffc080398e80: d378dec8 lsl x8, x22, #8
ffffffc080398e84: b9402a69 ldr w9, [x19, #0x28]
ffffffc080398e88: f9405e6a ldr x10, [x19, #0xb8]
ffffffc080398e8c: 91008303 add x3, x24, #0x20
ffffffc080398e90: 8b882128 add x8, x9, x8, asr #8
ffffffc080398e94: f9400109 ldr x9, [x8]
ffffffc080398e98: b940134b ldr w11, [x26, #0x10]
ffffffc080398e9c: dac00d08 rev x8, x8
ffffffc080398ea0: ca080148 eor x8, x10, x8
ffffffc080398ea4: 1100056b add w11, w11, #0x1
ffffffc080398ea8: ca090119 eor x25, x8, x9
ffffffc080398eac: b900134b str w11, [x26, #0x10]
ffffffc080398eb0: f940026b ldr x11, [x19]
ffffffc080398eb4: d538d08c mrs x12, TPIDR_EL1
ffffffc080398eb8: 8b0b0184 add x4, x12, x11
ffffffc080398ebc: 14000015 b 0xffffffc080398f10 <kmem_cache_alloc_noprof+0x108>
ffffffc080398ec0: aa1603e0 mov x0, x22
ffffffc080398ec4: aa1803e1 mov x1, x24
ffffffc080398ec8: aa1903e2 mov x2, x25
ffffffc080398ecc: 48207c82 casp x0, x1, x2, x3, [x4]
ffffffc080398ed0: f9400b48 ldr x8, [x26, #0x10]
ffffffc080398ed4: f1000508 subs x8, x8, #0x1
ffffffc080398ed8: b9001348 str w8, [x26, #0x10]
ffffffc080398edc: 540000e0 b.eq 0xffffffc080398ef8 <kmem_cache_alloc_noprof+0xf0>
ffffffc080398ee0: f9400b48 ldr x8, [x26, #0x10]
ffffffc080398ee4: b40000a8 cbz x8, 0xffffffc080398ef8 <kmem_cache_alloc_noprof+0xf0>
ffffffc080398ee8: eb18003f cmp x1, x24
ffffffc080398eec: fa560000 ccmp x0, x22, #0x0, eq
ffffffc080398ef0: 54000200 b.eq 0xffffffc080398f30 <kmem_cache_alloc_noprof+0x128>
ffffffc080398ef4: 17ffffda b 0xffffffc080398e5c <kmem_cache_alloc_noprof+0x54>
ffffffc080398ef8: aa0103fb mov x27, x1
ffffffc080398efc: aa0003fc mov x28, x0
ffffffc080398f00: 94346d20 bl 0xffffffc0810b4380 <preempt_schedule_notrace>
ffffffc080398f04: aa1c03e0 mov x0, x28
ffffffc080398f08: aa1b03e1 mov x1, x27
ffffffc080398f0c: 17fffff7 b 0xffffffc080398ee8 <kmem_cache_alloc_noprof+0xe0>
ffffffc080398f10: f9800091 prfm pstl1strm, [x4]
ffffffc080398f14: c87f0480 ldxp x0, x1, [x4]
ffffffc080398f18: eb16001f cmp x0, x22
ffffffc080398f1c: fa580020 ccmp x1, x24, #0x0, eq
ffffffc080398f20: 54000061 b.ne 0xffffffc080398f2c <kmem_cache_alloc_noprof+0x124>
ffffffc080398f24: c8280c99 stxp w8, x25, x3, [x4]
ffffffc080398f28: 35ffff68 cbnz w8, 0xffffffc080398f14 <kmem_cache_alloc_noprof+0x10c>
ffffffc080398f2c: 17ffffe9 b 0xffffffc080398ed0 <kmem_cache_alloc_noprof+0xc8>
ffffffc080398f30: b9402a68 ldr w8, [x19, #0x28]
ffffffc080398f34: 8b080328 add x8, x25, x8
ffffffc080398f38: f9800110 prfm pstl1keep, [x8]
ffffffc080398f3c: f90003f6 str x22, [sp]
ffffffc080398f40: d503201f nop
ffffffc080398f44: d503201f nop
ffffffc080398f48: f9402668 ldr x8, [x19, #0x48]
ffffffc080398f4c: b4000068 cbz x8, 0xffffffc080398f58 <kmem_cache_alloc_noprof+0x150>
ffffffc080398f50: 2a1f03f8 mov w24, wzr
ffffffc080398f54: 14000008 b 0xffffffc080398f74 <kmem_cache_alloc_noprof+0x16c>
ffffffc080398f58: 79401268 ldrh w8, [x19, #0x8]
ffffffc080398f5c: 52804089 mov w9, #0x204 // =516
ffffffc080398f60: 6a09011f tst w8, w9
ffffffc080398f64: 54000060 b.eq 0xffffffc080398f70 <kmem_cache_alloc_noprof+0x168>
ffffffc080398f68: 53082298 ubfx w24, w20, #8, #1
ffffffc080398f6c: 14000002 b 0xffffffc080398f74 <kmem_cache_alloc_noprof+0x16c>
ffffffc080398f70: 52800038 mov w24, #0x1 // =1
ffffffc080398f74: f000f308 adrp x8, 0xffffffc0821fb000 <max_load_balance_interval>
ffffffc080398f78: b9401e79 ldr w25, [x19, #0x1c]
ffffffc080398f7c: b9495908 ldr w8, [x8, #0x958]
ffffffc080398f80: d503201f nop
ffffffc080398f84: d503201f nop
ffffffc080398f88: 2a1803f7 mov w23, w24
ffffffc080398f8c: 14000007 b 0xffffffc080398fa8 <kmem_cache_alloc_noprof+0x1a0>
ffffffc080398f90: 0a140102 and w2, w8, w20
ffffffc080398f94: aa1303e0 mov x0, x19
ffffffc080398f98: aa1603e1 mov x1, x22
ffffffc080398f9c: 2a1703e3 mov w3, w23
ffffffc080398fa0: 9400781d bl 0xffffffc0803b7014 <__kasan_slab_alloc>
ffffffc080398fa4: aa0003f6 mov x22, x0
ffffffc080398fa8: f10002df cmp x22, #0x0
ffffffc080398fac: 52000308 eor w8, w24, #0x1
ffffffc080398fb0: f90003f6 str x22, [sp]
ffffffc080398fb4: 1a9f1508 csinc w8, w8, wzr, ne
ffffffc080398fb8: 37000128 tbnz w8, #0x0, 0xffffffc080398fdc <kmem_cache_alloc_noprof+0x1d4>
ffffffc080398fbc: 34000077 cbz w23, 0xffffffc080398fc8 <kmem_cache_alloc_noprof+0x1c0>
ffffffc080398fc0: 14000002 b 0xffffffc080398fc8 <kmem_cache_alloc_noprof+0x1c0>
ffffffc080398fc4: 14000006 b 0xffffffc080398fdc <kmem_cache_alloc_noprof+0x1d4>
ffffffc080398fc8: 2a1903e2 mov w2, w25
ffffffc080398fcc: aa1603e0 mov x0, x22
ffffffc080398fd0: 2a1f03e1 mov w1, wzr
ffffffc080398fd4: 94337ceb bl 0xffffffc081078380 <memset>
ffffffc080398fd8: f94003f6 ldr x22, [sp]
ffffffc080398fdc: aa1303e0 mov x0, x19
ffffffc080398fe0: aa1603e1 mov x1, x22
ffffffc080398fe4: 2a1403e2 mov w2, w20
ffffffc080398fe8: 940027d6 bl 0xffffffc0803a2f40 <alloc_tagging_slab_alloc_hook>
ffffffc080398fec: 14000004 b 0xffffffc080398ffc <kmem_cache_alloc_noprof+0x1f4>
ffffffc080398ff0: 37b00534 tbnz w20, #0x16, 0xffffffc080399094 <kmem_cache_alloc_noprof+0x28c>
ffffffc080398ff4: 39402668 ldrb w8, [x19, #0x9]
ffffffc080398ff8: 372804e8 tbnz w8, #0x5, 0xffffffc080399094 <kmem_cache_alloc_noprof+0x28c>
ffffffc080398ffc: f94003e0 ldr x0, [sp]
ffffffc080399000: d503201f nop
ffffffc080399004: d5384108 mrs x8, SP_EL0
ffffffc080399008: f9430908 ldr x8, [x8, #0x610]
ffffffc08039900c: f94007e9 ldr x9, [sp, #0x8]
ffffffc080399010: eb09011f cmp x8, x9
ffffffc080399014: 54000581 b.ne 0xffffffc0803990c4 <kmem_cache_alloc_noprof+0x2bc>
ffffffc080399018: a9464ff4 ldp x20, x19, [sp, #0x60]
ffffffc08039901c: a94557f6 ldp x22, x21, [sp, #0x50]
ffffffc080399020: a9445ff8 ldp x24, x23, [sp, #0x40]
ffffffc080399024: a94367fa ldp x26, x25, [sp, #0x30]
ffffffc080399028: a9426ffc ldp x28, x27, [sp, #0x20]
ffffffc08039902c: a9417bfd ldp x29, x30, [sp, #0x10]
ffffffc080399030: 9101c3ff add sp, sp, #0x70
ffffffc080399034: d50323bf autiasp
ffffffc080399038: d65f03c0 ret
ffffffc08039903c: d5384118 mrs x24, SP_EL0
ffffffc080399040: b9401308 ldr w8, [x24, #0x10]
ffffffc080399044: aa1303e0 mov x0, x19
ffffffc080399048: 2a1403e1 mov w1, w20
ffffffc08039904c: 12800002 mov w2, #-0x1 // =-1
ffffffc080399050: aa1503e3 mov x3, x21
ffffffc080399054: 11000508 add w8, w8, #0x1
ffffffc080399058: 2a1703e5 mov w5, w23
ffffffc08039905c: b9001308 str w8, [x24, #0x10]
ffffffc080399060: f9400268 ldr x8, [x19]
ffffffc080399064: d538d089 mrs x9, TPIDR_EL1
ffffffc080399068: 8b080124 add x4, x9, x8
ffffffc08039906c: 94001645 bl 0xffffffc08039e980 <___slab_alloc>
ffffffc080399070: aa0003f6 mov x22, x0
ffffffc080399074: f9400b08 ldr x8, [x24, #0x10]
ffffffc080399078: f1000508 subs x8, x8, #0x1
ffffffc08039907c: b9001308 str w8, [x24, #0x10]
ffffffc080399080: 54000060 b.eq 0xffffffc08039908c <kmem_cache_alloc_noprof+0x284>
ffffffc080399084: f9400b08 ldr x8, [x24, #0x10]
ffffffc080399088: b5fff5a8 cbnz x8, 0xffffffc080398f3c <kmem_cache_alloc_noprof+0x134>
ffffffc08039908c: 943469ab bl 0xffffffc0810b3738 <preempt_schedule>
ffffffc080399090: 17ffffab b 0xffffffc080398f3c <kmem_cache_alloc_noprof+0x134>
ffffffc080399094: 910003e4 mov x4, sp
ffffffc080399098: aa1303e0 mov x0, x19
ffffffc08039909c: aa1f03e1 mov x1, xzr
ffffffc0803990a0: 2a1403e2 mov w2, w20
ffffffc0803990a4: 52800023 mov w3, #0x1 // =1
ffffffc0803990a8: 94010664 bl 0xffffffc0803daa38 <__memcg_slab_post_alloc_hook>
ffffffc0803990ac: 3707fa80 tbnz w0, #0x0, 0xffffffc080398ffc <kmem_cache_alloc_noprof+0x1f4>
ffffffc0803990b0: f94003e1 ldr x1, [sp]
ffffffc0803990b4: aa1303e0 mov x0, x19
ffffffc0803990b8: 94002833 bl 0xffffffc0803a3184 <memcg_alloc_abort_single>
ffffffc0803990bc: f90003ff str xzr, [sp]
ffffffc0803990c0: 17ffffcf b 0xffffffc080398ffc <kmem_cache_alloc_noprof+0x1f4>
ffffffc0803990c4: 94345dc6 bl 0xffffffc0810b07dc <__stack_chk_fail>
ffffffc0803990c8: d5384117 mrs x23, SP_EL0
ffffffc0803990cc: b9402ae8 ldr w8, [x23, #0x28]
ffffffc0803990d0: b000f30a adrp x10, 0xffffffc0821fa000 <nf_conntrack_locks+0x500>
ffffffc0803990d4: 913ea14a add x10, x10, #0xfa8
ffffffc0803990d8: d343fd09 lsr x9, x8, #3
ffffffc0803990dc: 927d6529 and x9, x9, #0x1ffffff8
ffffffc0803990e0: f8696949 ldr x9, [x10, x9]
ffffffc0803990e4: 9ac82528 lsr x8, x9, x8
ffffffc0803990e8: 3607f8e8 tbz w8, #0x0, 0xffffffc080399004 <kmem_cache_alloc_noprof+0x1fc>
ffffffc0803990ec: b94012e8 ldr w8, [x23, #0x10]
ffffffc0803990f0: aa0003f6 mov x22, x0
ffffffc0803990f4: aa1f03e0 mov x0, xzr
ffffffc0803990f8: aa1503e1 mov x1, x21
ffffffc0803990fc: aa1603e2 mov x2, x22
ffffffc080399100: aa1303e3 mov x3, x19
ffffffc080399104: 11000508 add w8, w8, #0x1
ffffffc080399108: 2a1403e4 mov w4, w20
ffffffc08039910c: 12800005 mov w5, #-0x1 // =-1
ffffffc080399110: b90012e8 str w8, [x23, #0x10]
ffffffc080399114: 97fea8be bl 0xffffffc08034340c <__traceiter_kmem_cache_alloc>
ffffffc080399118: f9400ae8 ldr x8, [x23, #0x10]
ffffffc08039911c: f1000508 subs x8, x8, #0x1
ffffffc080399120: b90012e8 str w8, [x23, #0x10]
ffffffc080399124: 54000080 b.eq 0xffffffc080399134 <kmem_cache_alloc_noprof+0x32c>
ffffffc080399128: f9400ae8 ldr x8, [x23, #0x10]
ffffffc08039912c: aa1603e0 mov x0, x22
ffffffc080399130: b5fff6a8 cbnz x8, 0xffffffc080399004 <kmem_cache_alloc_noprof+0x1fc>
ffffffc080399134: 94346c93 bl 0xffffffc0810b4380 <preempt_schedule_notrace>
ffffffc080399138: aa1603e0 mov x0, x22
ffffffc08039913c: 17ffffb2 b 0xffffffc080399004 <kmem_cache_alloc_noprof+0x1fc>
ffffffc080399140: 9000f9a8 adrp x8, 0xffffffc0822cd000 <page_alloc_sysctl_table+0xa8>
ffffffc080399144: b94e8908 ldr w8, [x8, #0xe88]
ffffffc080399148: 7100051f cmp w8, #0x1
ffffffc08039914c: 54ffe84a b.ge 0xffffffc080398e54 <kmem_cache_alloc_noprof+0x4c>
ffffffc080399150: aa1303e0 mov x0, x19
ffffffc080399154: aa1703e1 mov x1, x23
ffffffc080399158: 2a1403e2 mov w2, w20
ffffffc08039915c: 94008157 bl 0xffffffc0803b96b8 <__kfence_alloc>
ffffffc080399160: f90003e0 str x0, [sp]
ffffffc080399164: b4ffe7a0 cbz x0, 0xffffffc080398e58 <kmem_cache_alloc_noprof+0x50>
ffffffc080399168: aa0003f6 mov x22, x0
ffffffc08039916c: 2a1f03f8 mov w24, wzr
ffffffc080399170: 17ffff81 b 0xffffffc080398f74 <kmem_cache_alloc_noprof+0x16c>
ffffffc080399174: f9402668 ldr x8, [x19, #0x48]
ffffffc080399178: b5ffee68 cbnz x8, 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc08039917c: 79401268 ldrh w8, [x19, #0x8]
ffffffc080399180: 52804089 mov w9, #0x204 // =516
ffffffc080399184: 6a09011f tst w8, w9
ffffffc080399188: 54ffede1 b.ne 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc08039918c: b4ffedd6 cbz x22, 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc080399190: b9402a68 ldr w8, [x19, #0x28]
ffffffc080399194: b9405269 ldr w9, [x19, #0x50]
ffffffc080399198: 6b09011f cmp w8, w9
ffffffc08039919c: 54ffed42 b.hs 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc0803991a0: 9340dec9 sbfx x9, x22, #0, #56
ffffffc0803991a4: f828693f str xzr, [x9, x8]
ffffffc0803991a8: 17ffff67 b 0xffffffc080398f44 <kmem_cache_alloc_noprof+0x13c>
ffffffc0803991ac: b9400a69 ldr w9, [x19, #0x8]
ffffffc0803991b0: 5280104a mov w10, #0x82 // =130
ffffffc0803991b4: 6a0a013f tst w9, w10
ffffffc0803991b8: 54ffee60 b.eq 0xffffffc080398f84 <kmem_cache_alloc_noprof+0x17c>
ffffffc0803991bc: 721d013f tst w9, #0x8
ffffffc0803991c0: 1a970339 csel w25, w25, w23, eq
ffffffc0803991c4: 17ffff70 b 0xffffffc080398f84 <kmem_cache_alloc_noprof+0x17c>
ffffffc0803991c8: 2a1f03f7 mov w23, wzr
ffffffc0803991cc: 17ffff70 b 0xffffffc080398f8c <kmem_cache_alloc_noprof+0x184>
ffffffc0803991d0: 77 48 22 d5 .word 0xd5224877
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2025-01-29 17:26 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-01-26 7:02 [PATCH 1/3] mm: avoid extra mem_alloc_profiling_enabled() checks Suren Baghdasaryan
2025-01-26 7:02 ` [PATCH 2/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator Suren Baghdasaryan
2025-01-26 16:47 ` Vlastimil Babka
2025-01-27 19:38 ` Suren Baghdasaryan
2025-01-28 19:35 ` Steven Rostedt
2025-01-28 23:43 ` Suren Baghdasaryan
2025-01-29 0:03 ` Steven Rostedt
2025-01-29 9:50 ` Vlastimil Babka
2025-01-29 17:26 ` Suren Baghdasaryan
2025-01-29 2:54 ` Suren Baghdasaryan
2025-01-29 9:38 ` Vlastimil Babka
2025-01-28 22:49 ` Peter Zijlstra
2025-01-26 7:02 ` [PATCH 3/3] alloc_tag: uninline code gated by mem_alloc_profiling_key in page allocator Suren Baghdasaryan
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox