* [PATCH v4] vmstat: Kernel stack usage histogram
@ 2024-07-18 20:26 Pasha Tatashin
2024-07-18 21:44 ` Kent Overstreet
` (2 more replies)
0 siblings, 3 replies; 10+ messages in thread
From: Pasha Tatashin @ 2024-07-18 20:26 UTC (permalink / raw)
To: akpm, jpoimboe, pasha.tatashin, kent.overstreet, peterz, nphamcs,
cerasuolodomenico, surenb, lizhijian, willy, shakeel.butt,
vbabka, ziy, linux-kernel, linux-mm
As part of the dynamic kernel stack project, we need to know the amount
of data that can be saved by reducing the default kernel stack size [1].
Provide a kernel stack usage histogram to aid in optimizing kernel stack
sizes and minimizing memory waste in large-scale environments. The
histogram divides stack usage into power-of-two buckets and reports the
results in /proc/vmstat. This information is especially valuable in
environments with millions of machines, where even small optimizations
can have a significant impact.
The histogram data is presented in /proc/vmstat with entries like
"kstack_1k", "kstack_2k", and so on, indicating the number of threads
that exited with stack usage falling within each respective bucket.
Example outputs:
Intel:
$ grep kstack /proc/vmstat
kstack_1k 3
kstack_2k 188
kstack_4k 11391
kstack_8k 243
kstack_16k 0
ARM with 64K page_size:
$ grep kstack /proc/vmstat
kstack_1k 1
kstack_2k 340
kstack_4k 25212
kstack_8k 1659
kstack_16k 0
kstack_32k 0
kstack_64k 0
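As a rough illustration (not part of this patch), the buckets can be
post-processed from userspace to estimate how many threads would have
overflowed a smaller default stack. The snippet below is a hypothetical
helper, and the 8 KiB threshold is only an example:

/*
 * Hypothetical userspace helper: sum the kstack_* buckets above a
 * hypothetical 8 KiB stack (kstack_16k and larger), i.e. count threads
 * that would not have fit had the default stack been 8 KiB.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *over[] = { "kstack_16k", "kstack_32k",
			       "kstack_64k", "kstack_rest" };
	unsigned long long count, overflow = 0;
	char name[64];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	while (fscanf(f, "%63s %llu", name, &count) == 2) {
		for (size_t i = 0; i < sizeof(over) / sizeof(over[0]); i++)
			if (!strcmp(name, over[i]))
				overflow += count;
	}
	fclose(f);
	printf("threads exceeding 8K of stack: %llu\n", overflow);
	return 0;
}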
Note: once dynamic kernel stacks are implemented, how useful this feature
remains will depend on the implementation: on hardware that supports
faults on kernel stacks, we will have other metrics that show the total
number of pages allocated for stacks. On hardware where faults are not
supported, we will most likely have some optimization where only some
threads are extended, and for those threads these metrics will still be
very useful.
[1] https://lwn.net/Articles/974367
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
---
Changelog:
v4:
- Expanded the commit message as requested by Andrew Morton.
include/linux/sched/task_stack.h | 49 ++++++++++++++++++++++++++++++--
include/linux/vm_event_item.h | 42 +++++++++++++++++++++++++++
include/linux/vmstat.h | 16 -----------
mm/vmstat.c | 24 ++++++++++++++++
4 files changed, 113 insertions(+), 18 deletions(-)
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index ccd72b978e1f..65e8c9fb7f9b 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -95,9 +95,51 @@ static inline int object_is_on_stack(const void *obj)
extern void thread_stack_cache_init(void);
#ifdef CONFIG_DEBUG_STACK_USAGE
+#ifdef CONFIG_VM_EVENT_COUNTERS
+#include <linux/vm_event_item.h>
+
+/* Count the maximum pages reached in kernel stacks */
+static inline void kstack_histogram(unsigned long used_stack)
+{
+ if (used_stack <= 1024)
+ this_cpu_inc(vm_event_states.event[KSTACK_1K]);
+#if THREAD_SIZE > 1024
+ else if (used_stack <= 2048)
+ this_cpu_inc(vm_event_states.event[KSTACK_2K]);
+#endif
+#if THREAD_SIZE > 2048
+ else if (used_stack <= 4096)
+ this_cpu_inc(vm_event_states.event[KSTACK_4K]);
+#endif
+#if THREAD_SIZE > 4096
+ else if (used_stack <= 8192)
+ this_cpu_inc(vm_event_states.event[KSTACK_8K]);
+#endif
+#if THREAD_SIZE > 8192
+ else if (used_stack <= 16384)
+ this_cpu_inc(vm_event_states.event[KSTACK_16K]);
+#endif
+#if THREAD_SIZE > 16384
+ else if (used_stack <= 32768)
+ this_cpu_inc(vm_event_states.event[KSTACK_32K]);
+#endif
+#if THREAD_SIZE > 32768
+ else if (used_stack <= 65536)
+ this_cpu_inc(vm_event_states.event[KSTACK_64K]);
+#endif
+#if THREAD_SIZE > 65536
+ else
+ this_cpu_inc(vm_event_states.event[KSTACK_REST]);
+#endif
+}
+#else /* !CONFIG_VM_EVENT_COUNTERS */
+static inline void kstack_histogram(unsigned long used_stack) {}
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
static inline unsigned long stack_not_used(struct task_struct *p)
{
unsigned long *n = end_of_stack(p);
+ unsigned long unused_stack;
do { /* Skip over canary */
# ifdef CONFIG_STACK_GROWSUP
@@ -108,10 +150,13 @@ static inline unsigned long stack_not_used(struct task_struct *p)
} while (!*n);
# ifdef CONFIG_STACK_GROWSUP
- return (unsigned long)end_of_stack(p) - (unsigned long)n;
+ unused_stack = (unsigned long)end_of_stack(p) - (unsigned long)n;
# else
- return (unsigned long)n - (unsigned long)end_of_stack(p);
+ unused_stack = (unsigned long)n - (unsigned long)end_of_stack(p);
# endif
+ kstack_histogram(THREAD_SIZE - unused_stack);
+
+ return unused_stack;
}
#endif
extern void set_task_stack_end_magic(struct task_struct *tsk);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 747943bc8cc2..73fa5fbf33a3 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -154,9 +154,51 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
VMA_LOCK_RETRY,
VMA_LOCK_MISS,
#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ KSTACK_1K,
+#if THREAD_SIZE > 1024
+ KSTACK_2K,
+#endif
+#if THREAD_SIZE > 2048
+ KSTACK_4K,
+#endif
+#if THREAD_SIZE > 4096
+ KSTACK_8K,
+#endif
+#if THREAD_SIZE > 8192
+ KSTACK_16K,
+#endif
+#if THREAD_SIZE > 16384
+ KSTACK_32K,
+#endif
+#if THREAD_SIZE > 32768
+ KSTACK_64K,
+#endif
+#if THREAD_SIZE > 65536
+ KSTACK_REST,
+#endif
+#endif /* CONFIG_DEBUG_STACK_USAGE */
NR_VM_EVENT_ITEMS
};
+#ifdef CONFIG_VM_EVENT_COUNTERS
+/*
+ * Light weight per cpu counter implementation.
+ *
+ * Counters should only be incremented and no critical kernel component
+ * should rely on the counter values.
+ *
+ * Counters are handled completely inline. On many platforms the code
+ * generated will simply be the increment of a global address.
+ */
+
+struct vm_event_state {
+ unsigned long event[NR_VM_EVENT_ITEMS];
+};
+
+DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
+#endif
+
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
#define THP_FILE_ALLOC ({ BUILD_BUG(); 0; })
#define THP_FILE_FALLBACK ({ BUILD_BUG(); 0; })
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 735eae6e272c..131966a4af78 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -41,22 +41,6 @@ enum writeback_stat_item {
};
#ifdef CONFIG_VM_EVENT_COUNTERS
-/*
- * Light weight per cpu counter implementation.
- *
- * Counters should only be incremented and no critical kernel component
- * should rely on the counter values.
- *
- * Counters are handled completely inline. On many platforms the code
- * generated will simply be the increment of a global address.
- */
-
-struct vm_event_state {
- unsigned long event[NR_VM_EVENT_ITEMS];
-};
-
-DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
-
/*
* vm counters are allowed to be racy. Use raw_cpu_ops to avoid the
* local_irq_disable overhead.
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8507c497218b..642d761b557b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1416,6 +1416,30 @@ const char * const vmstat_text[] = {
"vma_lock_retry",
"vma_lock_miss",
#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ "kstack_1k",
+#if THREAD_SIZE > 1024
+ "kstack_2k",
+#endif
+#if THREAD_SIZE > 2048
+ "kstack_4k",
+#endif
+#if THREAD_SIZE > 4096
+ "kstack_8k",
+#endif
+#if THREAD_SIZE > 8192
+ "kstack_16k",
+#endif
+#if THREAD_SIZE > 16384
+ "kstack_32k",
+#endif
+#if THREAD_SIZE > 32768
+ "kstack_64k",
+#endif
+#if THREAD_SIZE > 65536
+ "kstack_rest",
+#endif
+#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
--
2.45.2.1089.g2a221341d9-goog
* Re: [PATCH v4] vmstat: Kernel stack usage histogram
2024-07-18 20:26 [PATCH v4] vmstat: Kernel stack usage histogram Pasha Tatashin
@ 2024-07-18 21:44 ` Kent Overstreet
2024-07-18 23:36 ` Shakeel Butt
2024-07-24 6:46 ` Andrew Morton
2 siblings, 0 replies; 10+ messages in thread
From: Kent Overstreet @ 2024-07-18 21:44 UTC (permalink / raw)
To: Pasha Tatashin
Cc: akpm, jpoimboe, peterz, nphamcs, cerasuolodomenico, surenb,
lizhijian, willy, shakeel.butt, vbabka, ziy, linux-kernel,
linux-mm
On Thu, Jul 18, 2024 at 08:26:11PM GMT, Pasha Tatashin wrote:
> As part of the dynamic kernel stack project, we need to know the amount
> of data that can be saved by reducing the default kernel stack size [1].
>
> Provide a kernel stack usage histogram to aid in optimizing kernel stack
> sizes and minimizing memory waste in large-scale environments. The
> histogram divides stack usage into power-of-two buckets and reports the
> results in /proc/vmstat. This information is especially valuable in
> environments with millions of machines, where even small optimizations
> can have a significant impact.
>
> The histogram data is presented in /proc/vmstat with entries like
> "kstack_1k", "kstack_2k", and so on, indicating the number of threads
> that exited with stack usage falling within each respective bucket.
>
> Example outputs:
> Intel:
> $ grep kstack /proc/vmstat
> kstack_1k 3
> kstack_2k 188
> kstack_4k 11391
> kstack_8k 243
> kstack_16k 0
>
> ARM with 64K page_size:
> $ grep kstack /proc/vmstat
> kstack_1k 1
> kstack_2k 340
> kstack_4k 25212
> kstack_8k 1659
> kstack_16k 0
> kstack_32k 0
> kstack_64k 0
>
> Note: once dynamic kernel stacks are implemented, how useful this feature
> remains will depend on the implementation: on hardware that supports
> faults on kernel stacks, we will have other metrics that show the total
> number of pages allocated for stacks. On hardware where faults are not
> supported, we will most likely have some optimization where only some
> threads are extended, and for those threads these metrics will still be
> very useful.
>
> [1] https://lwn.net/Articles/974367
Nice and simple, and this gets us exactly the data we want for dynamic
kernel stacks...
Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>
>
> Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
> ---
>
> Changelog:
> v4:
> - Expanded the commit message as requested by Andrew Morton.
>
> include/linux/sched/task_stack.h | 49 ++++++++++++++++++++++++++++++--
> include/linux/vm_event_item.h | 42 +++++++++++++++++++++++++++
> include/linux/vmstat.h | 16 -----------
> mm/vmstat.c | 24 ++++++++++++++++
> 4 files changed, 113 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
> index ccd72b978e1f..65e8c9fb7f9b 100644
> --- a/include/linux/sched/task_stack.h
> +++ b/include/linux/sched/task_stack.h
> @@ -95,9 +95,51 @@ static inline int object_is_on_stack(const void *obj)
> extern void thread_stack_cache_init(void);
>
> #ifdef CONFIG_DEBUG_STACK_USAGE
> +#ifdef CONFIG_VM_EVENT_COUNTERS
> +#include <linux/vm_event_item.h>
> +
> +/* Count the maximum pages reached in kernel stacks */
> +static inline void kstack_histogram(unsigned long used_stack)
> +{
> + if (used_stack <= 1024)
> + this_cpu_inc(vm_event_states.event[KSTACK_1K]);
> +#if THREAD_SIZE > 1024
> + else if (used_stack <= 2048)
> + this_cpu_inc(vm_event_states.event[KSTACK_2K]);
> +#endif
> +#if THREAD_SIZE > 2048
> + else if (used_stack <= 4096)
> + this_cpu_inc(vm_event_states.event[KSTACK_4K]);
> +#endif
> +#if THREAD_SIZE > 4096
> + else if (used_stack <= 8192)
> + this_cpu_inc(vm_event_states.event[KSTACK_8K]);
> +#endif
> +#if THREAD_SIZE > 8192
> + else if (used_stack <= 16384)
> + this_cpu_inc(vm_event_states.event[KSTACK_16K]);
> +#endif
> +#if THREAD_SIZE > 16384
> + else if (used_stack <= 32768)
> + this_cpu_inc(vm_event_states.event[KSTACK_32K]);
> +#endif
> +#if THREAD_SIZE > 32768
> + else if (used_stack <= 65536)
> + this_cpu_inc(vm_event_states.event[KSTACK_64K]);
> +#endif
> +#if THREAD_SIZE > 65536
> + else
> + this_cpu_inc(vm_event_states.event[KSTACK_REST]);
> +#endif
> +}
> +#else /* !CONFIG_VM_EVENT_COUNTERS */
> +static inline void kstack_histogram(unsigned long used_stack) {}
> +#endif /* CONFIG_VM_EVENT_COUNTERS */
> +
> static inline unsigned long stack_not_used(struct task_struct *p)
> {
> unsigned long *n = end_of_stack(p);
> + unsigned long unused_stack;
>
> do { /* Skip over canary */
> # ifdef CONFIG_STACK_GROWSUP
> @@ -108,10 +150,13 @@ static inline unsigned long stack_not_used(struct task_struct *p)
> } while (!*n);
>
> # ifdef CONFIG_STACK_GROWSUP
> - return (unsigned long)end_of_stack(p) - (unsigned long)n;
> + unused_stack = (unsigned long)end_of_stack(p) - (unsigned long)n;
> # else
> - return (unsigned long)n - (unsigned long)end_of_stack(p);
> + unused_stack = (unsigned long)n - (unsigned long)end_of_stack(p);
> # endif
> + kstack_histogram(THREAD_SIZE - unused_stack);
> +
> + return unused_stack;
> }
> #endif
> extern void set_task_stack_end_magic(struct task_struct *tsk);
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index 747943bc8cc2..73fa5fbf33a3 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -154,9 +154,51 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
> VMA_LOCK_RETRY,
> VMA_LOCK_MISS,
> #endif
> +#ifdef CONFIG_DEBUG_STACK_USAGE
> + KSTACK_1K,
> +#if THREAD_SIZE > 1024
> + KSTACK_2K,
> +#endif
> +#if THREAD_SIZE > 2048
> + KSTACK_4K,
> +#endif
> +#if THREAD_SIZE > 4096
> + KSTACK_8K,
> +#endif
> +#if THREAD_SIZE > 8192
> + KSTACK_16K,
> +#endif
> +#if THREAD_SIZE > 16384
> + KSTACK_32K,
> +#endif
> +#if THREAD_SIZE > 32768
> + KSTACK_64K,
> +#endif
> +#if THREAD_SIZE > 65536
> + KSTACK_REST,
> +#endif
> +#endif /* CONFIG_DEBUG_STACK_USAGE */
> NR_VM_EVENT_ITEMS
> };
>
> +#ifdef CONFIG_VM_EVENT_COUNTERS
> +/*
> + * Light weight per cpu counter implementation.
> + *
> + * Counters should only be incremented and no critical kernel component
> + * should rely on the counter values.
> + *
> + * Counters are handled completely inline. On many platforms the code
> + * generated will simply be the increment of a global address.
> + */
> +
> +struct vm_event_state {
> + unsigned long event[NR_VM_EVENT_ITEMS];
> +};
> +
> +DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
> +#endif
> +
> #ifndef CONFIG_TRANSPARENT_HUGEPAGE
> #define THP_FILE_ALLOC ({ BUILD_BUG(); 0; })
> #define THP_FILE_FALLBACK ({ BUILD_BUG(); 0; })
> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
> index 735eae6e272c..131966a4af78 100644
> --- a/include/linux/vmstat.h
> +++ b/include/linux/vmstat.h
> @@ -41,22 +41,6 @@ enum writeback_stat_item {
> };
>
> #ifdef CONFIG_VM_EVENT_COUNTERS
> -/*
> - * Light weight per cpu counter implementation.
> - *
> - * Counters should only be incremented and no critical kernel component
> - * should rely on the counter values.
> - *
> - * Counters are handled completely inline. On many platforms the code
> - * generated will simply be the increment of a global address.
> - */
> -
> -struct vm_event_state {
> - unsigned long event[NR_VM_EVENT_ITEMS];
> -};
> -
> -DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
> -
> /*
> * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the
> * local_irq_disable overhead.
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 8507c497218b..642d761b557b 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1416,6 +1416,30 @@ const char * const vmstat_text[] = {
> "vma_lock_retry",
> "vma_lock_miss",
> #endif
> +#ifdef CONFIG_DEBUG_STACK_USAGE
> + "kstack_1k",
> +#if THREAD_SIZE > 1024
> + "kstack_2k",
> +#endif
> +#if THREAD_SIZE > 2048
> + "kstack_4k",
> +#endif
> +#if THREAD_SIZE > 4096
> + "kstack_8k",
> +#endif
> +#if THREAD_SIZE > 8192
> + "kstack_16k",
> +#endif
> +#if THREAD_SIZE > 16384
> + "kstack_32k",
> +#endif
> +#if THREAD_SIZE > 32768
> + "kstack_64k",
> +#endif
> +#if THREAD_SIZE > 65536
> + "kstack_rest",
> +#endif
> +#endif
> #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
> };
> #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
> --
> 2.45.2.1089.g2a221341d9-goog
>
* Re: [PATCH v4] vmstat: Kernel stack usage histogram
2024-07-18 20:26 [PATCH v4] vmstat: Kernel stack usage histogram Pasha Tatashin
2024-07-18 21:44 ` Kent Overstreet
@ 2024-07-18 23:36 ` Shakeel Butt
2024-07-19 2:55 ` Pasha Tatashin
2024-07-24 6:46 ` Andrew Morton
2 siblings, 1 reply; 10+ messages in thread
From: Shakeel Butt @ 2024-07-18 23:36 UTC (permalink / raw)
To: Pasha Tatashin
Cc: akpm, jpoimboe, kent.overstreet, peterz, nphamcs,
cerasuolodomenico, surenb, lizhijian, willy, vbabka, ziy,
linux-kernel, linux-mm
On Thu, Jul 18, 2024 at 08:26:11PM GMT, Pasha Tatashin wrote:
[...]
> diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
> index ccd72b978e1f..65e8c9fb7f9b 100644
> --- a/include/linux/sched/task_stack.h
> +++ b/include/linux/sched/task_stack.h
> @@ -95,9 +95,51 @@ static inline int object_is_on_stack(const void *obj)
> extern void thread_stack_cache_init(void);
>
> #ifdef CONFIG_DEBUG_STACK_USAGE
> +#ifdef CONFIG_VM_EVENT_COUNTERS
> +#include <linux/vm_event_item.h>
> +
> +/* Count the maximum pages reached in kernel stacks */
> +static inline void kstack_histogram(unsigned long used_stack)
Any specific reason to add this function in header?
> +{
> + if (used_stack <= 1024)
> + this_cpu_inc(vm_event_states.event[KSTACK_1K]);
Why not count_vm_event(KSTACK_1K)? Avoiding header include recursion?
> +#if THREAD_SIZE > 1024
> + else if (used_stack <= 2048)
> + this_cpu_inc(vm_event_states.event[KSTACK_2K]);
> +#endif
> +#if THREAD_SIZE > 2048
> + else if (used_stack <= 4096)
> + this_cpu_inc(vm_event_states.event[KSTACK_4K]);
> +#endif
> +#if THREAD_SIZE > 4096
> + else if (used_stack <= 8192)
> + this_cpu_inc(vm_event_states.event[KSTACK_8K]);
> +#endif
> +#if THREAD_SIZE > 8192
> + else if (used_stack <= 16384)
> + this_cpu_inc(vm_event_states.event[KSTACK_16K]);
> +#endif
> +#if THREAD_SIZE > 16384
> + else if (used_stack <= 32768)
> + this_cpu_inc(vm_event_states.event[KSTACK_32K]);
> +#endif
> +#if THREAD_SIZE > 32768
> + else if (used_stack <= 65536)
> + this_cpu_inc(vm_event_states.event[KSTACK_64K]);
> +#endif
> +#if THREAD_SIZE > 65536
> + else
> + this_cpu_inc(vm_event_states.event[KSTACK_REST]);
> +#endif
> +}
> +#else /* !CONFIG_VM_EVENT_COUNTERS */
> +static inline void kstack_histogram(unsigned long used_stack) {}
> +#endif /* CONFIG_VM_EVENT_COUNTERS */
> +
> static inline unsigned long stack_not_used(struct task_struct *p)
> {
> unsigned long *n = end_of_stack(p);
> + unsigned long unused_stack;
>
> do { /* Skip over canary */
> # ifdef CONFIG_STACK_GROWSUP
> @@ -108,10 +150,13 @@ static inline unsigned long stack_not_used(struct task_struct *p)
> } while (!*n);
>
> # ifdef CONFIG_STACK_GROWSUP
> - return (unsigned long)end_of_stack(p) - (unsigned long)n;
> + unused_stack = (unsigned long)end_of_stack(p) - (unsigned long)n;
> # else
> - return (unsigned long)n - (unsigned long)end_of_stack(p);
> + unused_stack = (unsigned long)n - (unsigned long)end_of_stack(p);
> # endif
> + kstack_histogram(THREAD_SIZE - unused_stack);
> +
> + return unused_stack;
> }
> #endif
> extern void set_task_stack_end_magic(struct task_struct *tsk);
* Re: [PATCH v4] vmstat: Kernel stack usage histogram
2024-07-18 23:36 ` Shakeel Butt
@ 2024-07-19 2:55 ` Pasha Tatashin
2024-07-19 22:04 ` Shakeel Butt
2024-07-24 0:10 ` Andrew Morton
0 siblings, 2 replies; 10+ messages in thread
From: Pasha Tatashin @ 2024-07-19 2:55 UTC (permalink / raw)
To: Shakeel Butt
Cc: akpm, jpoimboe, kent.overstreet, peterz, nphamcs,
cerasuolodomenico, surenb, lizhijian, willy, vbabka, ziy,
linux-kernel, linux-mm
On Thu, Jul 18, 2024 at 7:36 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Thu, Jul 18, 2024 at 08:26:11PM GMT, Pasha Tatashin wrote:
> [...]
> > diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
> > index ccd72b978e1f..65e8c9fb7f9b 100644
> > --- a/include/linux/sched/task_stack.h
> > +++ b/include/linux/sched/task_stack.h
> > @@ -95,9 +95,51 @@ static inline int object_is_on_stack(const void *obj)
> > extern void thread_stack_cache_init(void);
> >
> > #ifdef CONFIG_DEBUG_STACK_USAGE
> > +#ifdef CONFIG_VM_EVENT_COUNTERS
> > +#include <linux/vm_event_item.h>
> > +
> > +/* Count the maximum pages reached in kernel stacks */
> > +static inline void kstack_histogram(unsigned long used_stack)
>
> Any specific reason to add this function in header?
For performance reasons: to keep it inlined into stack_not_used(), which
is also defined as an inline function in this header.
>
> > +{
> > + if (used_stack <= 1024)
> > + this_cpu_inc(vm_event_states.event[KSTACK_1K]);
>
> Why not count_vm_event(KSTACK_1K)? Avoiding header include recursion?
I could not include "linux/vmstat.h" into "linux/sched/task_stack.h"
because it introduces further dependencies such as linux/mm.h,
linux/fs.h, and uapi/linux/stat.h, and even when all of those are added
it still fails to compile on some architectures, so it was simpler to
stop resolving the conflicts and use this_cpu_inc() directly.
>
> > +#if THREAD_SIZE > 1024
> > + else if (used_stack <= 2048)
> > + this_cpu_inc(vm_event_states.event[KSTACK_2K]);
> > +#endif
> > +#if THREAD_SIZE > 2048
> > + else if (used_stack <= 4096)
> > + this_cpu_inc(vm_event_states.event[KSTACK_4K]);
> > +#endif
> > +#if THREAD_SIZE > 4096
> > + else if (used_stack <= 8192)
> > + this_cpu_inc(vm_event_states.event[KSTACK_8K]);
> > +#endif
> > +#if THREAD_SIZE > 8192
> > + else if (used_stack <= 16384)
> > + this_cpu_inc(vm_event_states.event[KSTACK_16K]);
> > +#endif
> > +#if THREAD_SIZE > 16384
> > + else if (used_stack <= 32768)
> > + this_cpu_inc(vm_event_states.event[KSTACK_32K]);
> > +#endif
> > +#if THREAD_SIZE > 32768
> > + else if (used_stack <= 65536)
> > + this_cpu_inc(vm_event_states.event[KSTACK_64K]);
> > +#endif
> > +#if THREAD_SIZE > 65536
> > + else
> > + this_cpu_inc(vm_event_states.event[KSTACK_REST]);
> > +#endif
> > +}
> > +#else /* !CONFIG_VM_EVENT_COUNTERS */
> > +static inline void kstack_histogram(unsigned long used_stack) {}
> > +#endif /* CONFIG_VM_EVENT_COUNTERS */
> > +
> > static inline unsigned long stack_not_used(struct task_struct *p)
> > {
> > unsigned long *n = end_of_stack(p);
> > + unsigned long unused_stack;
> >
> > do { /* Skip over canary */
> > # ifdef CONFIG_STACK_GROWSUP
> > @@ -108,10 +150,13 @@ static inline unsigned long stack_not_used(struct task_struct *p)
> > } while (!*n);
> >
> > # ifdef CONFIG_STACK_GROWSUP
> > - return (unsigned long)end_of_stack(p) - (unsigned long)n;
> > + unused_stack = (unsigned long)end_of_stack(p) - (unsigned long)n;
> > # else
> > - return (unsigned long)n - (unsigned long)end_of_stack(p);
> > + unused_stack = (unsigned long)n - (unsigned long)end_of_stack(p);
> > # endif
> > + kstack_histogram(THREAD_SIZE - unused_stack);
> > +
> > + return unused_stack;
> > }
> > #endif
> > extern void set_task_stack_end_magic(struct task_struct *tsk);
>
* Re: [PATCH v4] vmstat: Kernel stack usage histogram
2024-07-19 2:55 ` Pasha Tatashin
@ 2024-07-19 22:04 ` Shakeel Butt
2024-07-24 0:09 ` Andrew Morton
2024-07-24 0:10 ` Andrew Morton
1 sibling, 1 reply; 10+ messages in thread
From: Shakeel Butt @ 2024-07-19 22:04 UTC (permalink / raw)
To: Pasha Tatashin
Cc: akpm, jpoimboe, kent.overstreet, peterz, nphamcs,
cerasuolodomenico, surenb, lizhijian, willy, vbabka, ziy,
linux-kernel, linux-mm
On Thu, Jul 18, 2024 at 10:55:17PM GMT, Pasha Tatashin wrote:
> On Thu, Jul 18, 2024 at 7:36 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Thu, Jul 18, 2024 at 08:26:11PM GMT, Pasha Tatashin wrote:
> > [...]
> > > diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
> > > index ccd72b978e1f..65e8c9fb7f9b 100644
> > > --- a/include/linux/sched/task_stack.h
> > > +++ b/include/linux/sched/task_stack.h
> > > @@ -95,9 +95,51 @@ static inline int object_is_on_stack(const void *obj)
> > > extern void thread_stack_cache_init(void);
> > >
> > > #ifdef CONFIG_DEBUG_STACK_USAGE
> > > +#ifdef CONFIG_VM_EVENT_COUNTERS
> > > +#include <linux/vm_event_item.h>
> > > +
> > > +/* Count the maximum pages reached in kernel stacks */
> > > +static inline void kstack_histogram(unsigned long used_stack)
> >
> > Any specific reason to add this function in header?
>
> For performance reasons to keep it inlined into stack_not_used() which
> is also defined as inline function in this header.
>
Is this really that performance critical?
> >
> > > +{
> > > + if (used_stack <= 1024)
> > > + this_cpu_inc(vm_event_states.event[KSTACK_1K]);
> >
> > Why not count_vm_event(KSTACK_1K)? Avoiding header include recursion?
>
> I could not include "linux/vmstat.h" into "linux/sched/task_stack.h"
> because it introduces some dependencies such linux/mm.h and
> linux/fs.h, uapi/linux/stat.h, and when all of those are added it
> still fails to compile on some architectures, so it was just simpler
> to stop resolving the conflicts and use this_cpu_inc() directly.
>
The above makes me think it is better to move stack_not_used() and the
new function to a C file unless we can show a negative performance
impact.
I have another question. At the moment, the metrics are exposed
conditionally through procfs based on stack size. So, based on the
kernel config someone may not see kstack_16k. Why not just output all of
these metrics irrespective of the config?
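For what it's worth, a sketch of that alternative (not from the patch; just
to make the suggestion concrete) would drop the THREAD_SIZE guards so the
set of counters is fixed and oversized buckets simply stay at zero:

/* include/linux/vm_event_item.h, hypothetical variant */
#ifdef CONFIG_DEBUG_STACK_USAGE
	KSTACK_1K,
	KSTACK_2K,
	KSTACK_4K,
	KSTACK_8K,
	KSTACK_16K,
	KSTACK_32K,
	KSTACK_64K,
	KSTACK_REST,
#endif /* CONFIG_DEBUG_STACK_USAGE */

with matching unconditional "kstack_*" strings in mm/vmstat.c.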
Shakeel
* Re: [PATCH v4] vmstat: Kernel stack usage histogram
2024-07-19 22:04 ` Shakeel Butt
@ 2024-07-24 0:09 ` Andrew Morton
0 siblings, 0 replies; 10+ messages in thread
From: Andrew Morton @ 2024-07-24 0:09 UTC (permalink / raw)
To: Shakeel Butt
Cc: Pasha Tatashin, jpoimboe, kent.overstreet, peterz, nphamcs,
cerasuolodomenico, surenb, lizhijian, willy, vbabka, ziy,
linux-kernel, linux-mm
On Fri, 19 Jul 2024 15:04:40 -0700 Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > Any specific reason to add this function in header?
> >
> > For performance reasons to keep it inlined into stack_not_used() which
> > is also defined as inline function in this header.
> >
>
> Is this really that performance critical?
Nope. stack_not_used() is for CONFIG_DEBUG_STACK_USAGE and
stack_not_used() is slow.
I suggest that stack_not_used() (and hence kstack_histogram()) be
uninlined. This can be a followup patch.
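A minimal sketch of what that uninlining could look like (illustrative
only; the placement of the out-of-line definition is an assumption and
the actual followup may differ):

/* include/linux/sched/task_stack.h: keep only a declaration */
#ifdef CONFIG_DEBUG_STACK_USAGE
unsigned long stack_not_used(struct task_struct *p);
#endif

/*
 * Out-of-line definition in a .c file. kstack_histogram() can move there
 * too, at which point count_vm_event() becomes usable without the
 * header-dependency problems discussed earlier in the thread.
 */
#ifdef CONFIG_DEBUG_STACK_USAGE
unsigned long stack_not_used(struct task_struct *p)
{
	unsigned long *n = end_of_stack(p);
	unsigned long unused_stack;

	do {	/* Skip over canary */
#ifdef CONFIG_STACK_GROWSUP
		n--;
#else
		n++;
#endif
	} while (!*n);

#ifdef CONFIG_STACK_GROWSUP
	unused_stack = (unsigned long)end_of_stack(p) - (unsigned long)n;
#else
	unused_stack = (unsigned long)n - (unsigned long)end_of_stack(p);
#endif
	kstack_histogram(THREAD_SIZE - unused_stack);

	return unused_stack;
}
#endif /* CONFIG_DEBUG_STACK_USAGE */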
* Re: [PATCH v4] vmstat: Kernel stack usage histogram
2024-07-19 2:55 ` Pasha Tatashin
2024-07-19 22:04 ` Shakeel Butt
@ 2024-07-24 0:10 ` Andrew Morton
1 sibling, 0 replies; 10+ messages in thread
From: Andrew Morton @ 2024-07-24 0:10 UTC (permalink / raw)
To: Pasha Tatashin
Cc: Shakeel Butt, jpoimboe, kent.overstreet, peterz, nphamcs,
cerasuolodomenico, surenb, lizhijian, willy, vbabka, ziy,
linux-kernel, linux-mm
On Thu, 18 Jul 2024 22:55:17 -0400 Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
> >
> > > +{
> > > + if (used_stack <= 1024)
> > > + this_cpu_inc(vm_event_states.event[KSTACK_1K]);
> >
> > Why not count_vm_event(KSTACK_1K)? Avoiding header include recursion?
>
> I could not include "linux/vmstat.h" into "linux/sched/task_stack.h"
> because it introduces some dependencies such linux/mm.h and
> linux/fs.h, uapi/linux/stat.h, and when all of those are added it
> still fails to compile on some architectures, so it was just simpler
> to stop resolving the conflicts and use this_cpu_inc() directly.
Presumably uninlining stack_not_used() will permit this
to be cleaned up.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v4] vmstat: Kernel stack usage histogram
2024-07-18 20:26 [PATCH v4] vmstat: Kernel stack usage histogram Pasha Tatashin
2024-07-18 21:44 ` Kent Overstreet
2024-07-18 23:36 ` Shakeel Butt
@ 2024-07-24 6:46 ` Andrew Morton
2024-07-24 14:43 ` Pasha Tatashin
2 siblings, 1 reply; 10+ messages in thread
From: Andrew Morton @ 2024-07-24 6:46 UTC (permalink / raw)
To: Pasha Tatashin
Cc: jpoimboe, kent.overstreet, peterz, nphamcs, cerasuolodomenico,
surenb, lizhijian, willy, shakeel.butt, vbabka, ziy,
linux-kernel, linux-mm
On Thu, 18 Jul 2024 20:26:11 +0000 Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
> As part of the dynamic kernel stack project, we need to know the amount
> of data that can be saved by reducing the default kernel stack size [1].
>
> Provide a kernel stack usage histogram to aid in optimizing kernel stack
> sizes and minimizing memory waste in large-scale environments. The
> histogram divides stack usage into power-of-two buckets and reports the
> results in /proc/vmstat. This information is especially valuable in
> environments with millions of machines, where even small optimizations
> can have a significant impact.
x86_64 allmodconfig:
In file included from <command-line>:
In function 'init_memcg_events',
inlined from 'mem_cgroup_css_alloc' at mm/memcontrol.c:3616:3:
././include/linux/compiler_types.h:510:45: error: call to '__compiletime_assert_2305' declared with attribute error: BUILD_BUG_ON failed: NR_VM_EVENT_ITEMS >= S8_MAX
510 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
| ^
././include/linux/compiler_types.h:491:25: note: in definition of macro '__compiletime_assert'
491 | prefix ## suffix(); \
| ^~~~~~
././include/linux/compiler_types.h:510:9: note: in expansion of macro '_compiletime_assert'
510 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
| ^~~~~~~~~~~~~~~~~~~
./include/linux/build_bug.h:39:37: note: in expansion of macro 'compiletime_assert'
39 | #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
| ^~~~~~~~~~~~~~~~~~
./include/linux/build_bug.h:50:9: note: in expansion of macro 'BUILD_BUG_ON_MSG'
50 | BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
| ^~~~~~~~~~~~~~~~
mm/memcontrol.c:444:9: note: in expansion of macro 'BUILD_BUG_ON'
444 | BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= S8_MAX);
| ^~~~~~~~~~~~
This looks legitimate - is it time to switch to int16_t?
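If the assertion is guarding the int8_t-typed mem_cgroup_events_index
lookup table (an assumption read off the error output; the eventual fix
may well take a different shape), the switch would look roughly like:

/* mm/memcontrol.c -- hedged sketch, not a tested patch */
static int16_t mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;

static void init_memcg_events(void)
{
	int16_t i;

	BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= S16_MAX);
	/* index initialization unchanged */
}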
* Re: [PATCH v4] vmstat: Kernel stack usage histogram
2024-07-24 6:46 ` Andrew Morton
@ 2024-07-24 14:43 ` Pasha Tatashin
2024-07-24 16:59 ` Shakeel Butt
0 siblings, 1 reply; 10+ messages in thread
From: Pasha Tatashin @ 2024-07-24 14:43 UTC (permalink / raw)
To: Andrew Morton
Cc: jpoimboe, kent.overstreet, peterz, nphamcs, cerasuolodomenico,
surenb, lizhijian, willy, shakeel.butt, vbabka, ziy,
linux-kernel, linux-mm
On Wed, Jul 24, 2024 at 2:46 AM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Thu, 18 Jul 2024 20:26:11 +0000 Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
>
> > As part of the dynamic kernel stack project, we need to know the amount
> > of data that can be saved by reducing the default kernel stack size [1].
> >
> > Provide a kernel stack usage histogram to aid in optimizing kernel stack
> > sizes and minimizing memory waste in large-scale environments. The
> > histogram divides stack usage into power-of-two buckets and reports the
> > results in /proc/vmstat. This information is especially valuable in
> > environments with millions of machines, where even small optimizations
> > can have a significant impact.
>
> x86_64 allmodconfig:
>
> In file included from <command-line>:
> In function 'init_memcg_events',
> inlined from 'mem_cgroup_css_alloc' at mm/memcontrol.c:3616:3:
> ././include/linux/compiler_types.h:510:45: error: call to '__compiletime_assert_2305' declared with attribute error: BUILD_BUG_ON failed: NR_VM_EVENT_ITEMS >= S8_MAX
> 510 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
> | ^
> ././include/linux/compiler_types.h:491:25: note: in definition of macro '__compiletime_assert'
> 491 | prefix ## suffix(); \
> | ^~~~~~
> ././include/linux/compiler_types.h:510:9: note: in expansion of macro '_compiletime_assert'
> 510 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
> | ^~~~~~~~~~~~~~~~~~~
> ./include/linux/build_bug.h:39:37: note: in expansion of macro 'compiletime_assert'
> 39 | #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
> | ^~~~~~~~~~~~~~~~~~
> ./include/linux/build_bug.h:50:9: note: in expansion of macro 'BUILD_BUG_ON_MSG'
> 50 | BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
> | ^~~~~~~~~~~~~~~~
> mm/memcontrol.c:444:9: note: in expansion of macro 'BUILD_BUG_ON'
> 444 | BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= S8_MAX);
> | ^~~~~~~~~~~~
>
> This looks legitimate - is it time to switch to int16_t?
I am looking into this, and will also uninline stack_not_used() and
kstack_histogram() as discussed earlier in the thread.
Pasha
* Re: [PATCH v4] vmstat: Kernel stack usage histogram
2024-07-24 14:43 ` Pasha Tatashin
@ 2024-07-24 16:59 ` Shakeel Butt
0 siblings, 0 replies; 10+ messages in thread
From: Shakeel Butt @ 2024-07-24 16:59 UTC (permalink / raw)
To: Pasha Tatashin
Cc: Andrew Morton, jpoimboe, kent.overstreet, peterz, nphamcs,
cerasuolodomenico, surenb, lizhijian, willy, vbabka, ziy,
linux-kernel, linux-mm
On Wed, Jul 24, 2024 at 10:43:59AM GMT, Pasha Tatashin wrote:
> On Wed, Jul 24, 2024 at 2:46 AM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Thu, 18 Jul 2024 20:26:11 +0000 Pasha Tatashin <pasha.tatashin@soleen.com> wrote:
> >
> > > As part of the dynamic kernel stack project, we need to know the amount
> > > of data that can be saved by reducing the default kernel stack size [1].
> > >
> > > Provide a kernel stack usage histogram to aid in optimizing kernel stack
> > > sizes and minimizing memory waste in large-scale environments. The
> > > histogram divides stack usage into power-of-two buckets and reports the
> > > results in /proc/vmstat. This information is especially valuable in
> > > environments with millions of machines, where even small optimizations
> > > can have a significant impact.
> >
> > x86_64 allmodconfig:
> >
> > In file included from <command-line>:
> > In function 'init_memcg_events',
> > inlined from 'mem_cgroup_css_alloc' at mm/memcontrol.c:3616:3:
> > ././include/linux/compiler_types.h:510:45: error: call to '__compiletime_assert_2305' declared with attribute error: BUILD_BUG_ON failed: NR_VM_EVENT_ITEMS >= S8_MAX
> > 510 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
> > | ^
> > ././include/linux/compiler_types.h:491:25: note: in definition of macro '__compiletime_assert'
> > 491 | prefix ## suffix(); \
> > | ^~~~~~
> > ././include/linux/compiler_types.h:510:9: note: in expansion of macro '_compiletime_assert'
> > 510 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
> > | ^~~~~~~~~~~~~~~~~~~
> > ./include/linux/build_bug.h:39:37: note: in expansion of macro 'compiletime_assert'
> > 39 | #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
> > | ^~~~~~~~~~~~~~~~~~
> > ./include/linux/build_bug.h:50:9: note: in expansion of macro 'BUILD_BUG_ON_MSG'
> > 50 | BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
> > | ^~~~~~~~~~~~~~~~
> > mm/memcontrol.c:444:9: note: in expansion of macro 'BUILD_BUG_ON'
> > 444 | BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= S8_MAX);
> > | ^~~~~~~~~~~~
> >
> > This looks legitimate - is it time to switch to int16_t?
>
> I am looking into this, and will also uninline stack_not_used() and
> kstack_histogram() as discussed earlier in the thread.
>
Let me take care of this specific build error.
Thread overview: 10+ messages
2024-07-18 20:26 [PATCH v4] vmstat: Kernel stack usage histogram Pasha Tatashin
2024-07-18 21:44 ` Kent Overstreet
2024-07-18 23:36 ` Shakeel Butt
2024-07-19 2:55 ` Pasha Tatashin
2024-07-19 22:04 ` Shakeel Butt
2024-07-24 0:09 ` Andrew Morton
2024-07-24 0:10 ` Andrew Morton
2024-07-24 6:46 ` Andrew Morton
2024-07-24 14:43 ` Pasha Tatashin
2024-07-24 16:59 ` Shakeel Butt