* [PATCH 02/10] mm: shrinker: Add a .to_text() method for shrinkers
[not found] <20240824180454.3160385-1-kent.overstreet@linux.dev>
@ 2024-08-24 18:04 ` Kent Overstreet
2024-08-24 18:04 ` [PATCH 03/10] mm: shrinker: Add new stats for .to_text() Kent Overstreet
2024-08-24 18:04 ` [PATCH 04/10] mm: Centralize & improve oom reporting in show_mem.c Kent Overstreet
2 siblings, 0 replies; 4+ messages in thread
From: Kent Overstreet @ 2024-08-24 18:04 UTC (permalink / raw)
To: david, linux-fsdevel, linux-mm
Cc: Kent Overstreet, Andrew Morton, Qi Zheng, Roman Gushchin, linux-mm
This adds a new callback method to shrinkers which they can use to
describe anything relevant to memory reclaim about their internal state,
for example object dirtyness.
This patch also adds shrinkers_to_text(), which reports on the top 10
shrinkers - by object count - in sorted order, to be used in OOM
reporting.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: linux-mm@kvack.org
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
include/linux/shrinker.h | 7 +++-
mm/shrinker.c | 73 +++++++++++++++++++++++++++++++++++++++-
2 files changed, 78 insertions(+), 2 deletions(-)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 1a00be90d93a..6193612617a1 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -24,6 +24,8 @@ struct shrinker_info {
struct shrinker_info_unit *unit[];
};
+struct seq_buf;
+
/*
* This struct is used to pass information from page reclaim to the shrinkers.
* We consolidate the values for easier extension later.
@@ -80,10 +82,12 @@ struct shrink_control {
* @flags determine the shrinker abilities, like numa awareness
*/
struct shrinker {
+ const char *name;
unsigned long (*count_objects)(struct shrinker *,
struct shrink_control *sc);
unsigned long (*scan_objects)(struct shrinker *,
struct shrink_control *sc);
+ void (*to_text)(struct seq_buf *, struct shrinker *);
long batch; /* reclaim batch size, 0 = default */
int seeks; /* seeks to recreate an obj */
@@ -110,7 +114,6 @@ struct shrinker {
#endif
#ifdef CONFIG_SHRINKER_DEBUG
int debugfs_id;
- const char *name;
struct dentry *debugfs_entry;
#endif
/* objs pending delete, per node */
@@ -135,6 +138,8 @@ __printf(2, 3)
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
void shrinker_register(struct shrinker *shrinker);
void shrinker_free(struct shrinker *shrinker);
+void shrinker_to_text(struct seq_buf *, struct shrinker *);
+void shrinkers_to_text(struct seq_buf *);
static inline bool shrinker_try_get(struct shrinker *shrinker)
{
diff --git a/mm/shrinker.c b/mm/shrinker.c
index dc5d2a6fcfc4..ad52c269bb48 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -1,8 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
+#include <linux/rculist.h>
#include <linux/rwsem.h>
+#include <linux/seq_buf.h>
#include <linux/shrinker.h>
-#include <linux/rculist.h>
#include <trace/events/vmscan.h>
#include "internal.h"
@@ -807,3 +808,73 @@ void shrinker_free(struct shrinker *shrinker)
call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);
+
+void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker)
+{
+ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
+
+ seq_buf_puts(out, shrinker->name);
+ seq_buf_printf(out, " objects: %lu\n", shrinker->count_objects(shrinker, &sc));
+
+ if (shrinker->to_text) {
+ shrinker->to_text(out, shrinker);
+ seq_buf_puts(out, "\n");
+ }
+}
+
+/**
+ * shrinkers_to_text - Report on shrinkers with highest usage
+ *
+ * This reports on the top 10 shrinkers, by object counts, in sorted order:
+ * intended to be used for OOM reporting.
+ */
+void shrinkers_to_text(struct seq_buf *out)
+{
+ struct shrinker *shrinker;
+ struct shrinker_by_mem {
+ struct shrinker *shrinker;
+ unsigned long mem;
+ } shrinkers_by_mem[10];
+ int i, nr = 0;
+
+ if (!mutex_trylock(&shrinker_mutex)) {
+ seq_buf_puts(out, "(couldn't take shrinker lock)");
+ return;
+ }
+
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
+ unsigned long mem = shrinker->count_objects(shrinker, &sc);
+
+ if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY)
+ continue;
+
+ for (i = 0; i < nr; i++)
+ if (mem < shrinkers_by_mem[i].mem)
+ break;
+
+ if (nr < ARRAY_SIZE(shrinkers_by_mem)) {
+ memmove(&shrinkers_by_mem[i + 1],
+ &shrinkers_by_mem[i],
+ sizeof(shrinkers_by_mem[0]) * (nr - i));
+ nr++;
+ } else if (i) {
+ i--;
+ memmove(&shrinkers_by_mem[0],
+ &shrinkers_by_mem[1],
+ sizeof(shrinkers_by_mem[0]) * i);
+ } else {
+ continue;
+ }
+
+ shrinkers_by_mem[i] = (struct shrinker_by_mem) {
+ .shrinker = shrinker,
+ .mem = mem,
+ };
+ }
+
+ for (i = nr - 1; i >= 0; --i)
+ shrinker_to_text(out, shrinkers_by_mem[i].shrinker);
+
+ mutex_unlock(&shrinker_mutex);
+}
--
2.45.2
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH 03/10] mm: shrinker: Add new stats for .to_text()
[not found] <20240824180454.3160385-1-kent.overstreet@linux.dev>
2024-08-24 18:04 ` [PATCH 02/10] mm: shrinker: Add a .to_text() method for shrinkers Kent Overstreet
@ 2024-08-24 18:04 ` Kent Overstreet
2024-08-24 18:04 ` [PATCH 04/10] mm: Centralize & improve oom reporting in show_mem.c Kent Overstreet
2 siblings, 0 replies; 4+ messages in thread
From: Kent Overstreet @ 2024-08-24 18:04 UTC (permalink / raw)
To: david, linux-fsdevel, linux-mm
Cc: Kent Overstreet, Andrew Morton, Qi Zheng, Roman Gushchin, linux-mm
Add a few new shrinker stats.
number of objects requested to free, number of objects freed:
Shrinkers won't necessarily free all objects requested for a variety of
reasons, but if the two counts are wildly different something is likely
amiss.
.scan_objects runtime:
If one shrinker is taking an excessive amount of time to free
objects that will block kswapd from running other shrinkers.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: linux-mm@kvack.org
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
include/linux/shrinker.h | 6 ++++++
mm/shrinker.c | 23 ++++++++++++++++++++++-
2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 6193612617a1..106622ddac77 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -118,6 +118,12 @@ struct shrinker {
#endif
/* objs pending delete, per node */
atomic_long_t *nr_deferred;
+
+ atomic_long_t objects_requested_to_free;
+ atomic_long_t objects_freed;
+ unsigned long last_freed; /* timestamp, in jiffies */
+ unsigned long last_scanned; /* timestamp, in jiffies */
+ atomic64_t ns_run;
};
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
diff --git a/mm/shrinker.c b/mm/shrinker.c
index ad52c269bb48..feaa8122afc9 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -430,13 +430,24 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
total_scan >= freeable) {
unsigned long ret;
unsigned long nr_to_scan = min(batch_size, total_scan);
+ u64 start_time = ktime_get_ns();
+
+ atomic_long_add(nr_to_scan, &shrinker->objects_requested_to_free);
shrinkctl->nr_to_scan = nr_to_scan;
shrinkctl->nr_scanned = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
+
+ atomic64_add(ktime_get_ns() - start_time, &shrinker->ns_run);
if (ret == SHRINK_STOP)
break;
freed += ret;
+ unsigned long now = jiffies;
+ if (ret) {
+ atomic_long_add(ret, &shrinker->objects_freed);
+ shrinker->last_freed = now;
+ }
+ shrinker->last_scanned = now;
count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
total_scan -= shrinkctl->nr_scanned;
@@ -812,9 +823,19 @@ EXPORT_SYMBOL_GPL(shrinker_free);
void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker)
{
struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
+ unsigned long nr_freed = atomic_long_read(&shrinker->objects_freed);
seq_buf_puts(out, shrinker->name);
- seq_buf_printf(out, " objects: %lu\n", shrinker->count_objects(shrinker, &sc));
+ seq_buf_putc(out, '\n');
+
+ seq_buf_printf(out, "objects: %lu\n", shrinker->count_objects(shrinker, &sc));
+ seq_buf_printf(out, "requested to free: %lu\n", atomic_long_read(&shrinker->objects_requested_to_free));
+ seq_buf_printf(out, "objects freed: %lu\n", nr_freed);
+ seq_buf_printf(out, "last scanned: %li sec ago\n", (jiffies - shrinker->last_scanned) / HZ);
+ seq_buf_printf(out, "last freed: %li sec ago\n", (jiffies - shrinker->last_freed) / HZ);
+ seq_buf_printf(out, "ns per object freed: %llu\n", nr_freed
+ ? div64_ul(atomic64_read(&shrinker->ns_run), nr_freed)
+ : 0);
if (shrinker->to_text) {
shrinker->to_text(out, shrinker);
--
2.45.2
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH 04/10] mm: Centralize & improve oom reporting in show_mem.c
[not found] <20240824180454.3160385-1-kent.overstreet@linux.dev>
2024-08-24 18:04 ` [PATCH 02/10] mm: shrinker: Add a .to_text() method for shrinkers Kent Overstreet
2024-08-24 18:04 ` [PATCH 03/10] mm: shrinker: Add new stats for .to_text() Kent Overstreet
@ 2024-08-24 18:04 ` Kent Overstreet
2 siblings, 0 replies; 4+ messages in thread
From: Kent Overstreet @ 2024-08-24 18:04 UTC (permalink / raw)
To: david, linux-fsdevel, linux-mm
Cc: Kent Overstreet, Andrew Morton, Qi Zheng, Roman Gushchin, linux-mm
This patch:
- Changes show_mem() to always report on slab usage
- Instead of reporting on all slabs, we only report on top 10 slabs,
and in sorted order
- Also reports on shrinkers, with the new shrinkers_to_text().
Shrinkers need to be included in OOM/allocation failure reporting
because they're responsible for memory reclaim - if a shrinker isn't
giving up its memory, we need to know which one and why.
More OOM reporting can be moved to show_mem.c and improved, this patch
is only a start.
New example output on OOM/memory allocation failure:
00177 Mem-Info:
00177 active_anon:13706 inactive_anon:32266 isolated_anon:16
00177 active_file:1653 inactive_file:1822 isolated_file:0
00177 unevictable:0 dirty:0 writeback:0
00177 slab_reclaimable:6242 slab_unreclaimable:11168
00177 mapped:3824 shmem:3 pagetables:1266 bounce:0
00177 kernel_misc_reclaimable:0
00177 free:4362 free_pcp:35 free_cma:0
00177 Node 0 active_anon:54824kB inactive_anon:129064kB active_file:6612kB inactive_file:7288kB unevictable:0kB isolated(anon):64kB isolated(file):0kB mapped:15296kB dirty:0kB writeback:0kB shmem:12kB writeback_tmp:0kB kernel_stack:3392kB pagetables:5064kB all_unreclaimable? no
00177 DMA free:2232kB boost:0kB min:88kB low:108kB high:128kB reserved_highatomic:0KB active_anon:2924kB inactive_anon:6596kB active_file:428kB inactive_file:384kB unevictable:0kB writepending:0kB present:15992kB managed:15360kB mlocked:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
00177 lowmem_reserve[]: 0 426 426 426
00177 DMA32 free:15092kB boost:5836kB min:8432kB low:9080kB high:9728kB reserved_highatomic:0KB active_anon:52196kB inactive_anon:122392kB active_file:6176kB inactive_file:7068kB unevictable:0kB writepending:0kB present:507760kB managed:441816kB mlocked:0kB bounce:0kB free_pcp:72kB local_pcp:0kB free_cma:0kB
00177 lowmem_reserve[]: 0 0 0 0
00177 DMA: 284*4kB (UM) 53*8kB (UM) 21*16kB (U) 11*32kB (U) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 2248kB
00177 DMA32: 2765*4kB (UME) 375*8kB (UME) 57*16kB (UM) 5*32kB (U) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 15132kB
00177 4656 total pagecache pages
00177 1031 pages in swap cache
00177 Swap cache stats: add 6572399, delete 6572173, find 488603/3286476
00177 Free swap = 509112kB
00177 Total swap = 2097148kB
00177 130938 pages RAM
00177 0 pages HighMem/MovableOnly
00177 16644 pages reserved
00177 Unreclaimable slab info:
00177 9p-fcall-cache total: 8.25 MiB active: 8.25 MiB
00177 kernfs_node_cache total: 2.15 MiB active: 2.15 MiB
00177 kmalloc-64 total: 2.08 MiB active: 2.07 MiB
00177 task_struct total: 1.95 MiB active: 1.95 MiB
00177 kmalloc-4k total: 1.50 MiB active: 1.50 MiB
00177 signal_cache total: 1.34 MiB active: 1.34 MiB
00177 kmalloc-2k total: 1.16 MiB active: 1.16 MiB
00177 bch_inode_info total: 1.02 MiB active: 922 KiB
00177 perf_event total: 1.02 MiB active: 1.02 MiB
00177 biovec-max total: 992 KiB active: 960 KiB
00177 Shrinkers:
00177 super_cache_scan: objects: 127
00177 super_cache_scan: objects: 106
00177 jbd2_journal_shrink_scan: objects: 32
00177 ext4_es_scan: objects: 32
00177 bch2_btree_cache_scan: objects: 8
00177 nr nodes: 24
00177 nr dirty: 0
00177 cannibalize lock: 0000000000000000
00177
00177 super_cache_scan: objects: 8
00177 super_cache_scan: objects: 1
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: linux-mm@kvack.org
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
mm/oom_kill.c | 23 ---------------------
mm/show_mem.c | 43 +++++++++++++++++++++++++++++++++++++++
mm/slab.h | 6 ++++--
mm/slab_common.c | 52 +++++++++++++++++++++++++++++++++++++++---------
4 files changed, 90 insertions(+), 34 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4d7a0004df2c..dc56239ff057 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -169,27 +169,6 @@ static bool oom_unkillable_task(struct task_struct *p)
return false;
}
-/*
- * Check whether unreclaimable slab amount is greater than
- * all user memory(LRU pages).
- * dump_unreclaimable_slab() could help in the case that
- * oom due to too much unreclaimable slab used by kernel.
-*/
-static bool should_dump_unreclaim_slab(void)
-{
- unsigned long nr_lru;
-
- nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
- global_node_page_state(NR_INACTIVE_ANON) +
- global_node_page_state(NR_ACTIVE_FILE) +
- global_node_page_state(NR_INACTIVE_FILE) +
- global_node_page_state(NR_ISOLATED_ANON) +
- global_node_page_state(NR_ISOLATED_FILE) +
- global_node_page_state(NR_UNEVICTABLE);
-
- return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
-}
-
/**
* oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
@@ -464,8 +443,6 @@ static void dump_header(struct oom_control *oc)
mem_cgroup_print_oom_meminfo(oc->memcg);
else {
__show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask));
- if (should_dump_unreclaim_slab())
- dump_unreclaimable_slab();
}
if (sysctl_oom_dump_tasks)
dump_tasks(oc);
diff --git a/mm/show_mem.c b/mm/show_mem.c
index bdb439551eef..a8ea4c41ced5 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -7,15 +7,18 @@
#include <linux/blkdev.h>
#include <linux/cma.h>
+#include <linux/console.h>
#include <linux/cpuset.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
+#include <linux/seq_buf.h>
#include <linux/swap.h>
#include <linux/vmstat.h>
#include "internal.h"
+#include "slab.h"
#include "swap.h"
atomic_long_t _totalram_pages __read_mostly;
@@ -397,10 +400,31 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
show_swap_cache_info();
}
+static void print_string_as_lines(const char *prefix, const char *lines)
+{
+ if (!lines) {
+ printk("%s (null)\n", prefix);
+ return;
+ }
+
+ bool locked = console_trylock();
+
+ while (1) {
+ const char *p = strchrnul(lines, '\n');
+ printk("%s%.*s\n", prefix, (int) (p - lines), lines);
+ if (!*p)
+ break;
+ lines = p + 1;
+ }
+ if (locked)
+ console_unlock();
+}
+
void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
{
unsigned long total = 0, reserved = 0, highmem = 0;
struct zone *zone;
+ char *buf;
printk("Mem-Info:\n");
show_free_areas(filter, nodemask, max_zone_idx);
@@ -449,4 +473,23 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
}
}
#endif
+
+ const unsigned buf_size = 8192;
+ buf = kmalloc(buf_size, GFP_ATOMIC);
+ if (buf) {
+ struct seq_buf s;
+
+ printk("Unreclaimable slab info:\n");
+ seq_buf_init(&s, buf, buf_size);
+ dump_unreclaimable_slab(&s);
+ print_string_as_lines(KERN_NOTICE, seq_buf_str(&s));
+
+ printk("Shrinkers:\n");
+ seq_buf_init(&s, buf, buf_size);
+ shrinkers_to_text(&s);
+ print_string_as_lines(KERN_NOTICE, seq_buf_str(&s));
+ /* previous output doesn't get flushed without this - why? */
+
+ kfree(buf);
+ }
}
diff --git a/mm/slab.h b/mm/slab.h
index dcdb56b8e7f5..b523b3e3d9d3 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -611,10 +611,12 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
return s->size;
}
+struct seq_buf;
+
#ifdef CONFIG_SLUB_DEBUG
-void dump_unreclaimable_slab(void);
+void dump_unreclaimable_slab(struct seq_buf *);
#else
-static inline void dump_unreclaimable_slab(void)
+static inline void dump_unreclaimable_slab(struct seq_buf *out)
{
}
#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 40b582a014b8..bd50a57161cf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -27,6 +27,7 @@
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>
+#include <linux/seq_buf.h>
#include <linux/stackdepot.h>
#include "internal.h"
@@ -1181,10 +1182,15 @@ static int slab_show(struct seq_file *m, void *p)
return 0;
}
-void dump_unreclaimable_slab(void)
+void dump_unreclaimable_slab(struct seq_buf *out)
{
struct kmem_cache *s;
struct slabinfo sinfo;
+ struct slab_by_mem {
+ struct kmem_cache *s;
+ size_t total, active;
+ } slabs_by_mem[10], n;
+ int i, nr = 0;
/*
* Here acquiring slab_mutex is risky since we don't prefer to get
@@ -1194,24 +1200,52 @@ void dump_unreclaimable_slab(void)
* without acquiring the mutex.
*/
if (!mutex_trylock(&slab_mutex)) {
- pr_warn("excessive unreclaimable slab but cannot dump stats\n");
+ seq_buf_puts(out, "excessive unreclaimable slab but cannot dump stats\n");
return;
}
- pr_info("Unreclaimable slab info:\n");
- pr_info("Name Used Total\n");
-
list_for_each_entry(s, &slab_caches, list) {
if (s->flags & SLAB_RECLAIM_ACCOUNT)
continue;
get_slabinfo(s, &sinfo);
- if (sinfo.num_objs > 0)
- pr_info("%-17s %10luKB %10luKB\n", s->name,
- (sinfo.active_objs * s->size) / 1024,
- (sinfo.num_objs * s->size) / 1024);
+ if (!sinfo.num_objs)
+ continue;
+
+ n.s = s;
+ n.total = sinfo.num_objs * s->size;
+ n.active = sinfo.active_objs * s->size;
+
+ for (i = 0; i < nr; i++)
+ if (n.total < slabs_by_mem[i].total)
+ break;
+
+ if (nr < ARRAY_SIZE(slabs_by_mem)) {
+ memmove(&slabs_by_mem[i + 1],
+ &slabs_by_mem[i],
+ sizeof(slabs_by_mem[0]) * (nr - i));
+ nr++;
+ } else if (i) {
+ i--;
+ memmove(&slabs_by_mem[0],
+ &slabs_by_mem[1],
+ sizeof(slabs_by_mem[0]) * i);
+ } else {
+ continue;
+ }
+
+ slabs_by_mem[i] = n;
}
+
+ for (i = nr - 1; i >= 0; --i) {
+ seq_buf_printf(out, "%-17s total: ", slabs_by_mem[i].s->name);
+ seq_buf_human_readable_u64(out, slabs_by_mem[i].total, STRING_UNITS_2);
+ seq_buf_printf(out, " active: ");
+ seq_buf_human_readable_u64(out, slabs_by_mem[i].active, STRING_UNITS_2);
+ seq_buf_putc(out, '\n');
+ }
+
mutex_unlock(&slab_mutex);
}
--
2.45.2
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH 04/10] mm: Centralize & improve oom reporting in show_mem.c
2024-08-24 19:10 [PATCH 00/10] shrinker debugging, .to_text() report (resend) Kent Overstreet
@ 2024-08-24 19:10 ` Kent Overstreet
0 siblings, 0 replies; 4+ messages in thread
From: Kent Overstreet @ 2024-08-24 19:10 UTC (permalink / raw)
To: david, linux-fsdevel, linux-mm
Cc: Kent Overstreet, Andrew Morton, Qi Zheng, Roman Gushchin
This patch:
- Changes show_mem() to always report on slab usage
- Instead of reporting on all slabs, we only report on top 10 slabs,
and in sorted order
- Also reports on shrinkers, with the new shrinkers_to_text().
Shrinkers need to be included in OOM/allocation failure reporting
because they're responsible for memory reclaim - if a shrinker isn't
giving up its memory, we need to know which one and why.
More OOM reporting can be moved to show_mem.c and improved, this patch
is only a start.
New example output on OOM/memory allocation failure:
00177 Mem-Info:
00177 active_anon:13706 inactive_anon:32266 isolated_anon:16
00177 active_file:1653 inactive_file:1822 isolated_file:0
00177 unevictable:0 dirty:0 writeback:0
00177 slab_reclaimable:6242 slab_unreclaimable:11168
00177 mapped:3824 shmem:3 pagetables:1266 bounce:0
00177 kernel_misc_reclaimable:0
00177 free:4362 free_pcp:35 free_cma:0
00177 Node 0 active_anon:54824kB inactive_anon:129064kB active_file:6612kB inactive_file:7288kB unevictable:0kB isolated(anon):64kB isolated(file):0kB mapped:15296kB dirty:0kB writeback:0kB shmem:12kB writeback_tmp:0kB kernel_stack:3392kB pagetables:5064kB all_unreclaimable? no
00177 DMA free:2232kB boost:0kB min:88kB low:108kB high:128kB reserved_highatomic:0KB active_anon:2924kB inactive_anon:6596kB active_file:428kB inactive_file:384kB unevictable:0kB writepending:0kB present:15992kB managed:15360kB mlocked:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
00177 lowmem_reserve[]: 0 426 426 426
00177 DMA32 free:15092kB boost:5836kB min:8432kB low:9080kB high:9728kB reserved_highatomic:0KB active_anon:52196kB inactive_anon:122392kB active_file:6176kB inactive_file:7068kB unevictable:0kB writepending:0kB present:507760kB managed:441816kB mlocked:0kB bounce:0kB free_pcp:72kB local_pcp:0kB free_cma:0kB
00177 lowmem_reserve[]: 0 0 0 0
00177 DMA: 284*4kB (UM) 53*8kB (UM) 21*16kB (U) 11*32kB (U) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 2248kB
00177 DMA32: 2765*4kB (UME) 375*8kB (UME) 57*16kB (UM) 5*32kB (U) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 15132kB
00177 4656 total pagecache pages
00177 1031 pages in swap cache
00177 Swap cache stats: add 6572399, delete 6572173, find 488603/3286476
00177 Free swap = 509112kB
00177 Total swap = 2097148kB
00177 130938 pages RAM
00177 0 pages HighMem/MovableOnly
00177 16644 pages reserved
00177 Unreclaimable slab info:
00177 9p-fcall-cache total: 8.25 MiB active: 8.25 MiB
00177 kernfs_node_cache total: 2.15 MiB active: 2.15 MiB
00177 kmalloc-64 total: 2.08 MiB active: 2.07 MiB
00177 task_struct total: 1.95 MiB active: 1.95 MiB
00177 kmalloc-4k total: 1.50 MiB active: 1.50 MiB
00177 signal_cache total: 1.34 MiB active: 1.34 MiB
00177 kmalloc-2k total: 1.16 MiB active: 1.16 MiB
00177 bch_inode_info total: 1.02 MiB active: 922 KiB
00177 perf_event total: 1.02 MiB active: 1.02 MiB
00177 biovec-max total: 992 KiB active: 960 KiB
00177 Shrinkers:
00177 super_cache_scan: objects: 127
00177 super_cache_scan: objects: 106
00177 jbd2_journal_shrink_scan: objects: 32
00177 ext4_es_scan: objects: 32
00177 bch2_btree_cache_scan: objects: 8
00177 nr nodes: 24
00177 nr dirty: 0
00177 cannibalize lock: 0000000000000000
00177
00177 super_cache_scan: objects: 8
00177 super_cache_scan: objects: 1
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: linux-mm@kvack.org
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
mm/oom_kill.c | 23 ---------------------
mm/show_mem.c | 43 +++++++++++++++++++++++++++++++++++++++
mm/slab.h | 6 ++++--
mm/slab_common.c | 52 +++++++++++++++++++++++++++++++++++++++---------
4 files changed, 90 insertions(+), 34 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4d7a0004df2c..dc56239ff057 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -169,27 +169,6 @@ static bool oom_unkillable_task(struct task_struct *p)
return false;
}
-/*
- * Check whether unreclaimable slab amount is greater than
- * all user memory(LRU pages).
- * dump_unreclaimable_slab() could help in the case that
- * oom due to too much unreclaimable slab used by kernel.
-*/
-static bool should_dump_unreclaim_slab(void)
-{
- unsigned long nr_lru;
-
- nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
- global_node_page_state(NR_INACTIVE_ANON) +
- global_node_page_state(NR_ACTIVE_FILE) +
- global_node_page_state(NR_INACTIVE_FILE) +
- global_node_page_state(NR_ISOLATED_ANON) +
- global_node_page_state(NR_ISOLATED_FILE) +
- global_node_page_state(NR_UNEVICTABLE);
-
- return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
-}
-
/**
* oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
@@ -464,8 +443,6 @@ static void dump_header(struct oom_control *oc)
mem_cgroup_print_oom_meminfo(oc->memcg);
else {
__show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask));
- if (should_dump_unreclaim_slab())
- dump_unreclaimable_slab();
}
if (sysctl_oom_dump_tasks)
dump_tasks(oc);
diff --git a/mm/show_mem.c b/mm/show_mem.c
index bdb439551eef..a8ea4c41ced5 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -7,15 +7,18 @@
#include <linux/blkdev.h>
#include <linux/cma.h>
+#include <linux/console.h>
#include <linux/cpuset.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
+#include <linux/seq_buf.h>
#include <linux/swap.h>
#include <linux/vmstat.h>
#include "internal.h"
+#include "slab.h"
#include "swap.h"
atomic_long_t _totalram_pages __read_mostly;
@@ -397,10 +400,31 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
show_swap_cache_info();
}
+static void print_string_as_lines(const char *prefix, const char *lines)
+{
+ if (!lines) {
+ printk("%s (null)\n", prefix);
+ return;
+ }
+
+ bool locked = console_trylock();
+
+ while (1) {
+ const char *p = strchrnul(lines, '\n');
+ printk("%s%.*s\n", prefix, (int) (p - lines), lines);
+ if (!*p)
+ break;
+ lines = p + 1;
+ }
+ if (locked)
+ console_unlock();
+}
+
void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
{
unsigned long total = 0, reserved = 0, highmem = 0;
struct zone *zone;
+ char *buf;
printk("Mem-Info:\n");
show_free_areas(filter, nodemask, max_zone_idx);
@@ -449,4 +473,23 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
}
}
#endif
+
+ const unsigned buf_size = 8192;
+ buf = kmalloc(buf_size, GFP_ATOMIC);
+ if (buf) {
+ struct seq_buf s;
+
+ printk("Unreclaimable slab info:\n");
+ seq_buf_init(&s, buf, buf_size);
+ dump_unreclaimable_slab(&s);
+ print_string_as_lines(KERN_NOTICE, seq_buf_str(&s));
+
+ printk("Shrinkers:\n");
+ seq_buf_init(&s, buf, buf_size);
+ shrinkers_to_text(&s);
+ print_string_as_lines(KERN_NOTICE, seq_buf_str(&s));
+ /* previous output doesn't get flushed without this - why? */
+
+ kfree(buf);
+ }
}
diff --git a/mm/slab.h b/mm/slab.h
index dcdb56b8e7f5..b523b3e3d9d3 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -611,10 +611,12 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
return s->size;
}
+struct seq_buf;
+
#ifdef CONFIG_SLUB_DEBUG
-void dump_unreclaimable_slab(void);
+void dump_unreclaimable_slab(struct seq_buf *);
#else
-static inline void dump_unreclaimable_slab(void)
+static inline void dump_unreclaimable_slab(struct seq_buf *out)
{
}
#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 40b582a014b8..bd50a57161cf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -27,6 +27,7 @@
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>
+#include <linux/seq_buf.h>
#include <linux/stackdepot.h>
#include "internal.h"
@@ -1181,10 +1182,15 @@ static int slab_show(struct seq_file *m, void *p)
return 0;
}
-void dump_unreclaimable_slab(void)
+void dump_unreclaimable_slab(struct seq_buf *out)
{
struct kmem_cache *s;
struct slabinfo sinfo;
+ struct slab_by_mem {
+ struct kmem_cache *s;
+ size_t total, active;
+ } slabs_by_mem[10], n;
+ int i, nr = 0;
/*
* Here acquiring slab_mutex is risky since we don't prefer to get
@@ -1194,24 +1200,52 @@ void dump_unreclaimable_slab(void)
* without acquiring the mutex.
*/
if (!mutex_trylock(&slab_mutex)) {
- pr_warn("excessive unreclaimable slab but cannot dump stats\n");
+ seq_buf_puts(out, "excessive unreclaimable slab but cannot dump stats\n");
return;
}
- pr_info("Unreclaimable slab info:\n");
- pr_info("Name Used Total\n");
-
list_for_each_entry(s, &slab_caches, list) {
if (s->flags & SLAB_RECLAIM_ACCOUNT)
continue;
get_slabinfo(s, &sinfo);
- if (sinfo.num_objs > 0)
- pr_info("%-17s %10luKB %10luKB\n", s->name,
- (sinfo.active_objs * s->size) / 1024,
- (sinfo.num_objs * s->size) / 1024);
+ if (!sinfo.num_objs)
+ continue;
+
+ n.s = s;
+ n.total = sinfo.num_objs * s->size;
+ n.active = sinfo.active_objs * s->size;
+
+ for (i = 0; i < nr; i++)
+ if (n.total < slabs_by_mem[i].total)
+ break;
+
+ if (nr < ARRAY_SIZE(slabs_by_mem)) {
+ memmove(&slabs_by_mem[i + 1],
+ &slabs_by_mem[i],
+ sizeof(slabs_by_mem[0]) * (nr - i));
+ nr++;
+ } else if (i) {
+ i--;
+ memmove(&slabs_by_mem[0],
+ &slabs_by_mem[1],
+ sizeof(slabs_by_mem[0]) * i);
+ } else {
+ continue;
+ }
+
+ slabs_by_mem[i] = n;
}
+
+ for (i = nr - 1; i >= 0; --i) {
+ seq_buf_printf(out, "%-17s total: ", slabs_by_mem[i].s->name);
+ seq_buf_human_readable_u64(out, slabs_by_mem[i].total, STRING_UNITS_2);
+ seq_buf_printf(out, " active: ");
+ seq_buf_human_readable_u64(out, slabs_by_mem[i].active, STRING_UNITS_2);
+ seq_buf_putc(out, '\n');
+ }
+
mutex_unlock(&slab_mutex);
}
--
2.45.2
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2024-08-24 19:10 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <20240824180454.3160385-1-kent.overstreet@linux.dev>
2024-08-24 18:04 ` [PATCH 02/10] mm: shrinker: Add a .to_text() method for shrinkers Kent Overstreet
2024-08-24 18:04 ` [PATCH 03/10] mm: shrinker: Add new stats for .to_text() Kent Overstreet
2024-08-24 18:04 ` [PATCH 04/10] mm: Centralize & improve oom reporting in show_mem.c Kent Overstreet
2024-08-24 19:10 [PATCH 00/10] shrinker debugging, .to_text() report (resend) Kent Overstreet
2024-08-24 19:10 ` [PATCH 04/10] mm: Centralize & improve oom reporting in show_mem.c Kent Overstreet
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox