linux-mm.kvack.org archive mirror
* [RFC PATCH 0/3] Helpers for debugging dying cgroups
@ 2023-09-11  7:55 Yakunin, Dmitry (Nebius)
  2023-09-11  7:55 ` [RFC PATCH 1/3] cgroup: list all subsystem states in debugfs files Yakunin, Dmitry (Nebius)
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Yakunin, Dmitry (Nebius) @ 2023-09-11  7:55 UTC (permalink / raw)
  To: cgroups, linux-kernel, linux-mm
  Cc: NB-Core Team, tj, hannes, mhocko, Yakunin, Dmitry (Nebius)

This patch series is mostly based on Konstantin's patches, which he sent
years ago [1].

This functionality still seems very useful for debugging the difference
between the entities visible in cgroupfs and the counters in /proc/cgroups,
e.g. finding files whose page cache pins a memcg and prevents its destruction.

I saw the comments in the original thread but didn't understand Tejun's
comment about using a file handle instead of an inode number. I also kept the
original debugfs output format, with extra counters added. We can rework this
format in the future, but for now it is straightforward to filter with
cmdline utilities.
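
For illustration, a debugging session with this series applied could look
roughly like this (hypothetical commands; see the individual patches for the
exact flags and formats):

    # dump css state for the memory controller, including dying memcgs
    cat /sys/kernel/debug/cgroup/memory

    # list pages with the inode of their real (possibly offline) memcg
    page-types -R -C -L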

[1] https://lore.kernel.org/lkml/153414348591.737150.14229960913953276515.stgit@buzz/

Dmitry Yakunin (3):
  cgroup: list all subsystem states in debugfs files
  proc/kpagecgroup: report also inode numbers of offline cgroups
  tools/mm/page-types: add flag for showing inodes of offline cgroups

 fs/proc/page.c              |  24 ++++++++-
 include/linux/cgroup-defs.h |   1 +
 include/linux/memcontrol.h  |   2 +-
 kernel/cgroup/cgroup.c      | 101 ++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c             |  19 ++++++-
 mm/memory-failure.c         |   2 +-
 tools/mm/page-types.c       |  18 ++++++-
 7 files changed, 159 insertions(+), 8 deletions(-)

-- 
2.25.1

* [RFC PATCH 1/3] cgroup: list all subsystem states in debugfs files
  2023-09-11  7:55 [RFC PATCH 0/3] Helpers for debugging dying cgroups Yakunin, Dmitry (Nebius)
@ 2023-09-11  7:55 ` Yakunin, Dmitry (Nebius)
  2023-09-11 18:55   ` tj
  2023-09-11 22:16   ` Yosry Ahmed
  2023-09-11  7:55 ` [RFC PATCH 2/3] proc/kpagecgroup: report also inode numbers of offline cgroups Yakunin, Dmitry (Nebius)
  2023-09-11  7:55 ` [RFC PATCH 3/3] tools/mm/page-types: add flag for showing inodes " Yakunin, Dmitry (Nebius)
  2 siblings, 2 replies; 8+ messages in thread
From: Yakunin, Dmitry (Nebius) @ 2023-09-11  7:55 UTC (permalink / raw)
  To: cgroups, linux-kernel, linux-mm
  Cc: NB-Core Team, tj, hannes, mhocko, Yakunin, Dmitry (Nebius),
	Konstantin Khlebnikov, Andrey Ryabinin

After removal, a cgroup subsystem state can leak or live on in the background
forever because it is pinned by some reference. For example, a memory cgroup
can be pinned by pages in the page cache or in tmpfs.

This patch adds a common debugfs interface for listing the basic state of
each controller. A controller can define a callback for dumping its own
attributes.

In the file /sys/kernel/debug/cgroup/<controller>, each line shows one css in
the format: <common_attr>=<value>... [-- <controller_attr>=<value>... ]

Common attributes:

css - css pointer
cgroup - cgroup pointer
id - css id
ino - cgroup inode
flags - css flags
refcnt - css atomic refcount; for online csses this includes a huge bias
path - cgroup path

This patch adds memcg attributes:

mem_id - 16-bit memory cgroup id
memory - charged pages
memsw - charged memory+swap for v1 and swap for v2
kmem - charged kernel pages
tcpmem - charged tcp pages
shmem - shmem/tmpfs pages
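
A line for a dying memcg could then look like this (all values illustrative):

    css=00000000c8f1d2a3 cgroup=000000005b7e9f04 id=37 ino=21514 flags=0 refcnt=5 path=/workload/dead.slice -- mem_id=37 memory=4096 memsw=4096 kmem=12 tcpmem=0 shmem=4084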

Link: https://lore.kernel.org/lkml/153414348591.737150.14229960913953276515.stgit@buzz
Suggested-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Reviewed-by: Andrey Ryabinin <arbn@yandex-team.com>
Signed-off-by: Dmitry Yakunin <zeil@nebius.com>
---
 include/linux/cgroup-defs.h |   1 +
 kernel/cgroup/cgroup.c      | 101 ++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c             |  14 +++++
 3 files changed, 116 insertions(+)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8a0d5466c7be..810bd300cbee 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -673,6 +673,7 @@ struct cgroup_subsys {
 	void (*exit)(struct task_struct *task);
 	void (*release)(struct task_struct *task);
 	void (*bind)(struct cgroup_subsys_state *root_css);
+	void (*css_dump)(struct cgroup_subsys_state *css, struct seq_file *m);
 
 	bool early_init:1;
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 625d7483951c..fb9931ff7570 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -40,6 +40,7 @@
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/proc_fs.h>
+#include <linux/debugfs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/sched/task.h>
@@ -7068,3 +7069,103 @@ static int __init cgroup_sysfs_init(void)
 subsys_initcall(cgroup_sysfs_init);
 
 #endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_DEBUG_FS
+void *css_debugfs_seqfile_start(struct seq_file *m, loff_t *pos)
+{
+	struct cgroup_subsys *ss = m->private;
+	struct cgroup_subsys_state *css;
+	int id = *pos;
+
+	rcu_read_lock();
+	css = idr_get_next(&ss->css_idr, &id);
+	*pos = id;
+	return css;
+}
+
+void *css_debugfs_seqfile_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct cgroup_subsys *ss = m->private;
+	struct cgroup_subsys_state *css;
+	int id = *pos + 1;
+
+	css = idr_get_next(&ss->css_idr, &id);
+	*pos = id;
+	return css;
+}
+
+void css_debugfs_seqfile_stop(struct seq_file *m, void *v)
+{
+	rcu_read_unlock();
+}
+
+int css_debugfs_seqfile_show(struct seq_file *m, void *v)
+{
+	struct cgroup_subsys *ss = m->private;
+	struct cgroup_subsys_state *css = v;
+	/* data is NULL for root cgroup_subsys_state */
+	struct percpu_ref_data *data = css->refcnt.data;
+	size_t buflen;
+	char *buf;
+	int len;
+
+	seq_printf(m, "css=%pK cgroup=%pK id=%d ino=%lu flags=%#x refcnt=%lu path=",
+		   css, css->cgroup, css->id, cgroup_ino(css->cgroup),
+		   css->flags, data ? atomic_long_read(&data->count) : 0);
+
+	buflen = seq_get_buf(m, &buf);
+	if (buf) {
+		len = cgroup_path(css->cgroup, buf, buflen);
+		seq_commit(m, len < buflen ? len : -1);
+	}
+
+	if (ss->css_dump) {
+		seq_puts(m, " -- ");
+		ss->css_dump(css, m);
+	}
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static const struct seq_operations css_debug_seq_ops = {
+	.start = css_debugfs_seqfile_start,
+	.next = css_debugfs_seqfile_next,
+	.stop = css_debugfs_seqfile_stop,
+	.show = css_debugfs_seqfile_show,
+};
+
+static int css_debugfs_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &css_debug_seq_ops);
+	struct seq_file *m = file->private_data;
+
+	if (!ret)
+		m->private = inode->i_private;
+	return ret;
+}
+
+static const struct file_operations css_debugfs_fops = {
+	.open = css_debugfs_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static int __init css_debugfs_init(void)
+{
+	struct cgroup_subsys *ss;
+	struct dentry *dir;
+	int ssid;
+
+	dir = debugfs_create_dir("cgroup", NULL);
+	if (dir) {
+		for_each_subsys(ss, ssid)
+			debugfs_create_file(ss->name, 0644, dir, ss,
+					    &css_debugfs_fops);
+	}
+
+	return 0;
+}
+late_initcall(css_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4b27e245a055..7b3d4a10ac63 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5654,6 +5654,20 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 	}
 }
 
+static void mem_cgroup_css_dump(struct cgroup_subsys_state *css,
+				struct seq_file *m)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	seq_printf(m, "mem_id=%u memory=%lu memsw=%lu kmem=%lu tcpmem=%lu shmem=%lu",
+		   mem_cgroup_id(memcg),
+		   page_counter_read(&memcg->memory),
+		   page_counter_read(&memcg->memsw),
+		   page_counter_read(&memcg->kmem),
+		   page_counter_read(&memcg->tcpmem),
+		   memcg_page_state(memcg, NR_SHMEM));
+}
+
 #ifdef CONFIG_MMU
 /* Handlers for move charge at task migration. */
 static int mem_cgroup_do_precharge(unsigned long count)
-- 
2.25.1

* [RFC PATCH 2/3] proc/kpagecgroup: report also inode numbers of offline cgroups
  2023-09-11  7:55 [RFC PATCH 0/3] Helpers for debugging dying cgroups Yakunin, Dmitry (Nebius)
  2023-09-11  7:55 ` [RFC PATCH 1/3] cgroup: list all subsystem states in debugfs files Yakunin, Dmitry (Nebius)
@ 2023-09-11  7:55 ` Yakunin, Dmitry (Nebius)
  2023-09-11  7:55 ` [RFC PATCH 3/3] tools/mm/page-types: add flag for showing inodes " Yakunin, Dmitry (Nebius)
  2 siblings, 0 replies; 8+ messages in thread
From: Yakunin, Dmitry (Nebius) @ 2023-09-11  7:55 UTC (permalink / raw)
  To: cgroups, linux-kernel, linux-mm
  Cc: NB-Core Team, tj, hannes, mhocko, Yakunin, Dmitry (Nebius),
	Konstantin Khlebnikov, Andrey Ryabinin

By default this interface reports the inode number of the closest online
ancestor if the cgroup is offline (removed). Information about the real owner
is required to detect which pages keep a removed cgroup alive.

This patch adds a per-file mode that is switched by writing a 64-bit flags
word into an open /proc/kpagecgroup. For now only the first bit is used.
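
From userspace, the new mode could be consumed roughly like this (a minimal
sketch under the bit-0 semantics above; error handling omitted):

    import os
    import struct

    def page_cgroup_ino_real(pfn):
        """Return the memcg inode for pfn, reporting offline owners too."""
        fd = os.open("/proc/kpagecgroup", os.O_RDWR)
        try:
            # Bit 0 (the only bit defined so far) switches this open file
            # to "real owner" mode; the write must be exactly 8 bytes.
            os.write(fd, struct.pack("Q", 1))
            # The file is an array of one 8-byte inode number per pfn.
            (ino,) = struct.unpack("Q", os.pread(fd, 8, pfn * 8))
            return ino
        finally:
            os.close(fd)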

Link: https://lore.kernel.org/lkml/153414348994.737150.10057219558779418929.stgit@buzz
Suggested-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Reviewed-by: Andrey Ryabinin <arbn@yandex-team.com>
Signed-off-by: Dmitry Yakunin <zeil@nebius.com>
---
 fs/proc/page.c             | 24 ++++++++++++++++++++++--
 include/linux/memcontrol.h |  2 +-
 mm/memcontrol.c            |  5 +++--
 mm/memory-failure.c        |  2 +-
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 195b077c0fac..ae6feca2bbc7 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -278,6 +278,7 @@ static const struct proc_ops kpageflags_proc_ops = {
 static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
 				size_t count, loff_t *ppos)
 {
+	unsigned long flags = (unsigned long)file->private_data;
 	const unsigned long max_dump_pfn = get_max_dump_pfn();
 	u64 __user *out = (u64 __user *)buf;
 	struct page *ppage;
@@ -301,7 +302,7 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
 		ppage = pfn_to_online_page(pfn);
 
 		if (ppage)
-			ino = page_cgroup_ino(ppage);
+			ino = page_cgroup_ino(ppage, !(flags & 1));
 		else
 			ino = 0;
 
@@ -323,10 +324,29 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
 	return ret;
 }
 
+static ssize_t kpagecgroup_write(struct file *file, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	u64 flags;
+
+	if (count != 8)
+		return -EINVAL;
+
+	if (get_user(flags, buf))
+		return -EFAULT;
+
+	if (flags > 1)
+		return -EINVAL;
+
+	file->private_data = (void *)(unsigned long)flags;
+	return count;
+}
+
 static const struct proc_ops kpagecgroup_proc_ops = {
 	.proc_flags	= PROC_ENTRY_PERMANENT,
 	.proc_lseek	= mem_lseek,
 	.proc_read	= kpagecgroup_read,
+	.proc_write	= kpagecgroup_write,
 };
 #endif /* CONFIG_MEMCG */
 
@@ -335,7 +355,7 @@ static int __init proc_page_init(void)
 	proc_create("kpagecount", S_IRUSR, NULL, &kpagecount_proc_ops);
 	proc_create("kpageflags", S_IRUSR, NULL, &kpageflags_proc_ops);
 #ifdef CONFIG_MEMCG
-	proc_create("kpagecgroup", S_IRUSR, NULL, &kpagecgroup_proc_ops);
+	proc_create("kpagecgroup", 0600, NULL, &kpagecgroup_proc_ops);
 #endif
 	return 0;
 }
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 222d7370134c..bbbddaa260d3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -892,7 +892,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 }
 
 struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio);
-ino_t page_cgroup_ino(struct page *page);
+ino_t page_cgroup_ino(struct page *page, bool online);
 
 static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7b3d4a10ac63..48cfe3695e06 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -380,6 +380,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
 /**
  * page_cgroup_ino - return inode number of the memcg a page is charged to
  * @page: the page
+ * @online: return closest online ancestor
  *
  * Look up the closest online ancestor of the memory cgroup @page is charged to
  * and return its inode number or 0 if @page is not charged to any cgroup. It
@@ -390,7 +391,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
  * after page_cgroup_ino() returns, so it only should be used by callers that
  * do not care (such as procfs interfaces).
  */
-ino_t page_cgroup_ino(struct page *page)
+ino_t page_cgroup_ino(struct page *page, bool online)
 {
 	struct mem_cgroup *memcg;
 	unsigned long ino = 0;
@@ -399,7 +400,7 @@ ino_t page_cgroup_ino(struct page *page)
 	/* page_folio() is racy here, but the entire function is racy anyway */
 	memcg = folio_memcg_check(page_folio(page));
 
-	while (memcg && !(memcg->css.flags & CSS_ONLINE))
+	while (memcg && online && !(memcg->css.flags & CSS_ONLINE))
 		memcg = parent_mem_cgroup(memcg);
 	if (memcg)
 		ino = cgroup_ino(memcg->css.cgroup);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5b663eca1f29..6734489b2435 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -267,7 +267,7 @@ static int hwpoison_filter_task(struct page *p)
 	if (!hwpoison_filter_memcg)
 		return 0;
 
-	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
+	if (page_cgroup_ino(p, true) != hwpoison_filter_memcg)
 		return -EINVAL;
 
 	return 0;
-- 
2.25.1

* [RFC PATCH 3/3] tools/mm/page-types: add flag for showing inodes of offline cgroups
  2023-09-11  7:55 [RFC PATCH 0/3] Helpers for debugging dying cgroups Yakunin, Dmitry (Nebius)
  2023-09-11  7:55 ` [RFC PATCH 1/3] cgroup: list all subsystem states in debugfs files Yakunin, Dmitry (Nebius)
  2023-09-11  7:55 ` [RFC PATCH 2/3] proc/kpagecgroup: report also inode numbers of offline cgroups Yakunin, Dmitry (Nebius)
@ 2023-09-11  7:55 ` Yakunin, Dmitry (Nebius)
  2 siblings, 0 replies; 8+ messages in thread
From: Yakunin, Dmitry (Nebius) @ 2023-09-11  7:55 UTC (permalink / raw)
  To: cgroups, linux-kernel, linux-mm
  Cc: NB-Core Team, tj, hannes, mhocko, Yakunin, Dmitry (Nebius),
	Konstantin Khlebnikov, Andrey Ryabinin

With the new flag -R|--real-cgroup, page-types reports the real owning
cgroup even when it is offline.
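
For example (hypothetical invocation; -C selects the cgroup-inode column and
-L lists pages one by one):

    page-types -R -C -L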

Link: https://lore.kernel.org/lkml/153414349419.737150.8224164787883146532.stgit@buzz
Suggested-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Reviewed-by: Andrey Ryabinin <arbn@yandex-team.com>
Signed-off-by: Dmitry Yakunin <zeil@nebius.com>
---
 tools/mm/page-types.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c
index 8d5595b6c59f..f26035b362d2 100644
--- a/tools/mm/page-types.c
+++ b/tools/mm/page-types.c
@@ -161,6 +161,7 @@ static pid_t		opt_pid;	/* process to walk */
 const char		*opt_file;	/* file or directory path */
 static uint64_t		opt_cgroup;	/* cgroup inode */
 static int		opt_list_cgroup;/* list page cgroup */
+static int		opt_real_cgroup;/* report real cgroup */
 static int		opt_list_mapcnt;/* list page map count */
 static const char	*opt_kpageflags;/* kpageflags file to parse */
 
@@ -837,6 +838,7 @@ static void usage(void)
 "            -l|--list                  Show page details in ranges\n"
 "            -L|--list-each             Show page details one by one\n"
 "            -C|--list-cgroup           Show cgroup inode for pages\n"
+"            -R|--real-cgroup           Show real offline cgroups\n"
 "            -M|--list-mapcnt           Show page map count\n"
 "            -N|--no-summary            Don't show summary info\n"
 "            -X|--hwpoison              hwpoison pages\n"
@@ -1257,6 +1259,7 @@ static const struct option opts[] = {
 	{ "list"      , 0, NULL, 'l' },
 	{ "list-each" , 0, NULL, 'L' },
 	{ "list-cgroup", 0, NULL, 'C' },
+	{ "real-cgroup", 0, NULL, 'R' },
 	{ "list-mapcnt", 0, NULL, 'M' },
 	{ "no-summary", 0, NULL, 'N' },
 	{ "hwpoison"  , 0, NULL, 'X' },
@@ -1273,7 +1276,7 @@ int main(int argc, char *argv[])
 	page_size = getpagesize();
 
 	while ((c = getopt_long(argc, argv,
-				"rp:f:a:b:d:c:CilLMNXxF:h",
+				"rp:f:a:b:d:c:CRilLMNXxF:h",
 				opts, NULL)) != -1) {
 		switch (c) {
 		case 'r':
@@ -1297,6 +1300,9 @@ int main(int argc, char *argv[])
 		case 'C':
 			opt_list_cgroup = 1;
 			break;
+		case 'R':
+			opt_real_cgroup = 1;
+			break;
 		case 'd':
 			describe_flags(optarg);
 			exit(0);
@@ -1338,7 +1344,15 @@ int main(int argc, char *argv[])
 	if (!opt_kpageflags)
 		opt_kpageflags = PROC_KPAGEFLAGS;
 
-	if (opt_cgroup || opt_list_cgroup)
+	if (opt_real_cgroup) {
+		uint64_t flags = 1;
+
+		kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDWR);
+		if (write(kpagecgroup_fd, &flags, sizeof(flags)) < 0) {
+			perror(PROC_KPAGECGROUP);
+			exit(EXIT_FAILURE);
+		}
+	} else if (opt_cgroup || opt_list_cgroup)
 		kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
 
 	if (opt_list && opt_list_mapcnt)
-- 
2.25.1

* Re: [RFC PATCH 1/3] cgroup: list all subsystem states in debugfs files
  2023-09-11  7:55 ` [RFC PATCH 1/3] cgroup: list all subsystem states in debugfs files Yakunin, Dmitry (Nebius)
@ 2023-09-11 18:55   ` tj
  2023-09-13 10:33     ` Dmitry Yakunin
  2023-09-11 22:16   ` Yosry Ahmed
  1 sibling, 1 reply; 8+ messages in thread
From: tj @ 2023-09-11 18:55 UTC (permalink / raw)
  To: Yakunin, Dmitry (Nebius)
  Cc: cgroups, linux-kernel, linux-mm, NB-Core Team, hannes, mhocko,
	Konstantin Khlebnikov, Andrey Ryabinin

On Mon, Sep 11, 2023 at 07:55:15AM +0000, Yakunin, Dmitry (Nebius) wrote:
> +static void mem_cgroup_css_dump(struct cgroup_subsys_state *css,
> +				struct seq_file *m)
> +{
> +	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> +
> +	seq_printf(m, "mem_id=%u memory=%lu memsw=%lu kmem=%lu tcpmem=%lu shmem=%lu",
> +		   mem_cgroup_id(memcg),
> +		   page_counter_read(&memcg->memory),
> +		   page_counter_read(&memcg->memsw),
> +		   page_counter_read(&memcg->kmem),
> +		   page_counter_read(&memcg->tcpmem),
> +		   memcg_page_state(memcg, NR_SHMEM));
> +}

Can you please take a look at drgn (https://github.com/osandov/drgn) and see
whether that satisfies your needs? We can easily add drgn scripts under the
tools directory too (e.g. iocost already does that).
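
For example, listing dying memcgs could look roughly like the following
untested sketch (helper and symbol names from drgn's Linux helpers, as I
remember them; run with "drgn script.py"):

    from drgn import container_of
    from drgn.helpers.linux.cgroup import (cgroup_path,
                                           css_for_each_descendant_pre)

    root_css = prog["root_mem_cgroup"].css.address_of_()
    for css in css_for_each_descendant_pre(root_css):
        if css.flags & prog.constant("CSS_ONLINE"):
            continue  # still online, not a zombie
        memcg = container_of(css, "struct mem_cgroup", "css")
        print(cgroup_path(css.cgroup).decode(),
              memcg.memory.usage.counter.value_())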

Thanks.

-- 
tejun

* Re: [RFC PATCH 1/3] cgroup: list all subsystem states in debugfs files
  2023-09-11  7:55 ` [RFC PATCH 1/3] cgroup: list all subsystem states in debugfs files Yakunin, Dmitry (Nebius)
  2023-09-11 18:55   ` tj
@ 2023-09-11 22:16   ` Yosry Ahmed
  2023-09-13 10:35     ` Dmitry Yakunin
  1 sibling, 1 reply; 8+ messages in thread
From: Yosry Ahmed @ 2023-09-11 22:16 UTC (permalink / raw)
  To: Yakunin, Dmitry (Nebius)
  Cc: cgroups, linux-kernel, linux-mm, NB-Core Team, tj, hannes,
	mhocko, Konstantin Khlebnikov, Andrey Ryabinin

On Mon, Sep 11, 2023 at 12:55 AM Yakunin, Dmitry (Nebius)
<zeil@nebius.com> wrote:
>
> After removal, a cgroup subsystem state can leak or live on in the background
> forever because it is pinned by some reference. For example, a memory cgroup
> can be pinned by pages in the page cache or in tmpfs.
>
> This patch adds a common debugfs interface for listing the basic state of
> each controller. A controller can define a callback for dumping its own
> attributes.
>
> In the file /sys/kernel/debug/cgroup/<controller>, each line shows one css in
> the format: <common_attr>=<value>... [-- <controller_attr>=<value>... ]
>
> Common attributes:
>
> css - css pointer
> cgroup - cgroup pointer
> id - css id
> ino - cgroup inode
> flags - css flags
> refcnt - css atomic refcount; for online csses this includes a huge bias
> path - cgroup path
>
> This patch adds memcg attributes:
>
> mem_id - 16-bit memory cgroup id
> memory - charged pages
> memsw - charged memory+swap for v1 and swap for v2
> kmem - charged kernel pages
> tcpmem - charged tcp pages
> shmem - shmem/tmpfs pages
>
> Link: https://lore.kernel.org/lkml/153414348591.737150.14229960913953276515.stgit@buzz
> Suggested-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> Reviewed-by: Andrey Ryabinin <arbn@yandex-team.com>
> Signed-off-by: Dmitry Yakunin <zeil@nebius.com>

FWIW, I was just recently working on a debugfs directory that exposes a
list of all zombie memcgs as well as the "memory.stat" output for all
of them.

This entails a file at /sys/kernel/debug/zombie_memcgs/all that
contains a list of zombie memcgs (with indentation to reflect the
hierarchy) and an id for each of them.

This id can be used to index per-memcg directories at
/sys/kernel/debug/zombie_memcgs/<id>/, which include debug files. The
only one we have so far is
/sys/kernel/debug/zombie_memcgs/<id>/memory.stat.

If there is interest in this, I can share more information.

> ---
>  include/linux/cgroup-defs.h |   1 +
>  kernel/cgroup/cgroup.c      | 101 ++++++++++++++++++++++++++++++++++++
>  mm/memcontrol.c             |  14 +++++
>  3 files changed, 116 insertions(+)
>
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 8a0d5466c7be..810bd300cbee 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -673,6 +673,7 @@ struct cgroup_subsys {
>         void (*exit)(struct task_struct *task);
>         void (*release)(struct task_struct *task);
>         void (*bind)(struct cgroup_subsys_state *root_css);
> +       void (*css_dump)(struct cgroup_subsys_state *css, struct seq_file *m);
>
>         bool early_init:1;
>
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 625d7483951c..fb9931ff7570 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -40,6 +40,7 @@
>  #include <linux/mount.h>
>  #include <linux/pagemap.h>
>  #include <linux/proc_fs.h>
> +#include <linux/debugfs.h>
>  #include <linux/rcupdate.h>
>  #include <linux/sched.h>
>  #include <linux/sched/task.h>
> @@ -7068,3 +7069,103 @@ static int __init cgroup_sysfs_init(void)
>  subsys_initcall(cgroup_sysfs_init);
>
>  #endif /* CONFIG_SYSFS */
> +
> +#ifdef CONFIG_DEBUG_FS
> +void *css_debugfs_seqfile_start(struct seq_file *m, loff_t *pos)
> +{
> +       struct cgroup_subsys *ss = m->private;
> +       struct cgroup_subsys_state *css;
> +       int id = *pos;
> +
> +       rcu_read_lock();
> +       css = idr_get_next(&ss->css_idr, &id);
> +       *pos = id;
> +       return css;
> +}
> +
> +void *css_debugfs_seqfile_next(struct seq_file *m, void *v, loff_t *pos)
> +{
> +       struct cgroup_subsys *ss = m->private;
> +       struct cgroup_subsys_state *css;
> +       int id = *pos + 1;
> +
> +       css = idr_get_next(&ss->css_idr, &id);
> +       *pos = id;
> +       return css;
> +}
> +
> +void css_debugfs_seqfile_stop(struct seq_file *m, void *v)
> +{
> +       rcu_read_unlock();
> +}
> +
> +int css_debugfs_seqfile_show(struct seq_file *m, void *v)
> +{
> +       struct cgroup_subsys *ss = m->private;
> +       struct cgroup_subsys_state *css = v;
> +       /* data is NULL for root cgroup_subsys_state */
> +       struct percpu_ref_data *data = css->refcnt.data;
> +       size_t buflen;
> +       char *buf;
> +       int len;
> +
> +       seq_printf(m, "css=%pK cgroup=%pK id=%d ino=%lu flags=%#x refcnt=%lu path=",
> +                  css, css->cgroup, css->id, cgroup_ino(css->cgroup),
> +                  css->flags, data ? atomic_long_read(&data->count) : 0);
> +
> +       buflen = seq_get_buf(m, &buf);
> +       if (buf) {
> +               len = cgroup_path(css->cgroup, buf, buflen);
> +               seq_commit(m, len < buflen ? len : -1);
> +       }
> +
> +       if (ss->css_dump) {
> +               seq_puts(m, " -- ");
> +               ss->css_dump(css, m);
> +       }
> +
> +       seq_putc(m, '\n');
> +       return 0;
> +}
> +
> +static const struct seq_operations css_debug_seq_ops = {
> +       .start = css_debugfs_seqfile_start,
> +       .next = css_debugfs_seqfile_next,
> +       .stop = css_debugfs_seqfile_stop,
> +       .show = css_debugfs_seqfile_show,
> +};
> +
> +static int css_debugfs_open(struct inode *inode, struct file *file)
> +{
> +       int ret = seq_open(file, &css_debug_seq_ops);
> +       struct seq_file *m = file->private_data;
> +
> +       if (!ret)
> +               m->private = inode->i_private;
> +       return ret;
> +}
> +
> +static const struct file_operations css_debugfs_fops = {
> +       .open = css_debugfs_open,
> +       .read = seq_read,
> +       .llseek = seq_lseek,
> +       .release = seq_release,
> +};
> +
> +static int __init css_debugfs_init(void)
> +{
> +       struct cgroup_subsys *ss;
> +       struct dentry *dir;
> +       int ssid;
> +
> +       dir = debugfs_create_dir("cgroup", NULL);
> +       if (dir) {
> +               for_each_subsys(ss, ssid)
> +                       debugfs_create_file(ss->name, 0644, dir, ss,
> +                                           &css_debugfs_fops);
> +       }
> +
> +       return 0;
> +}
> +late_initcall(css_debugfs_init);
> +#endif /* CONFIG_DEBUG_FS */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 4b27e245a055..7b3d4a10ac63 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5654,6 +5654,20 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>         }
>  }
>
> +static void mem_cgroup_css_dump(struct cgroup_subsys_state *css,
> +                               struct seq_file *m)
> +{
> +       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> +
> +       seq_printf(m, "mem_id=%u memory=%lu memsw=%lu kmem=%lu tcpmem=%lu shmem=%lu",
> +                  mem_cgroup_id(memcg),
> +                  page_counter_read(&memcg->memory),
> +                  page_counter_read(&memcg->memsw),
> +                  page_counter_read(&memcg->kmem),
> +                  page_counter_read(&memcg->tcpmem),
> +                  memcg_page_state(memcg, NR_SHMEM));
> +}
> +
>  #ifdef CONFIG_MMU
>  /* Handlers for move charge at task migration. */
>  static int mem_cgroup_do_precharge(unsigned long count)
> --
> 2.25.1
>
>



* Re: [RFC PATCH 1/3] cgroup: list all subsystem states in debugfs files
  2023-09-11 18:55   ` tj
@ 2023-09-13 10:33     ` Dmitry Yakunin
  0 siblings, 0 replies; 8+ messages in thread
From: Dmitry Yakunin @ 2023-09-13 10:33 UTC (permalink / raw)
  To: tj
  Cc: cgroups, linux-kernel, linux-mm, NB-Core Team, hannes, mhocko,
	Konstantin Khlebnikov, Andrey Ryabinin

Thank you for pointing out the drgn scripts in the kernel source tree; I
didn't know about them and will take a look.

On 11.09.2023 20:55, tj@kernel.org wrote:
> On Mon, Sep 11, 2023 at 07:55:15AM +0000, Yakunin, Dmitry (Nebius) wrote:
>> +static void mem_cgroup_css_dump(struct cgroup_subsys_state *css,
>> +                             struct seq_file *m)
>> +{
>> +     struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>> +
>> +     seq_printf(m, "mem_id=%u memory=%lu memsw=%lu kmem=%lu tcpmem=%lu shmem=%lu",
>> +                mem_cgroup_id(memcg),
>> +                page_counter_read(&memcg->memory),
>> +                page_counter_read(&memcg->memsw),
>> +                page_counter_read(&memcg->kmem),
>> +                page_counter_read(&memcg->tcpmem),
>> +                memcg_page_state(memcg, NR_SHMEM));
>> +}
> Can you please take a look at drgn (https://github.com/osandov/drgn) and see
> whether that satisfies your needs? We can easily add drgn scripts under the
> tools directory too (e.g. iocost already does that).
>
> Thanks.
>
> --
> tejun



* Re: [RFC PATCH 1/3] cgroup: list all subsystem states in debugfs files
  2023-09-11 22:16   ` Yosry Ahmed
@ 2023-09-13 10:35     ` Dmitry Yakunin
  0 siblings, 0 replies; 8+ messages in thread
From: Dmitry Yakunin @ 2023-09-13 10:35 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: cgroups, linux-kernel, linux-mm, NB-Core Team, tj, hannes,
	mhocko, Konstantin Khlebnikov, Andrey Ryabinin

Sure, if you can share your work that would be nice. Thank you.

On 12.09.2023 00:16, Yosry Ahmed wrote:
> On Mon, Sep 11, 2023 at 12:55 AM Yakunin, Dmitry (Nebius)
> <zeil@nebius.com> wrote:
>> After removal, a cgroup subsystem state can leak or live on in the background
>> forever because it is pinned by some reference. For example, a memory cgroup
>> can be pinned by pages in the page cache or in tmpfs.
>>
>> This patch adds a common debugfs interface for listing the basic state of
>> each controller. A controller can define a callback for dumping its own
>> attributes.
>>
>> In the file /sys/kernel/debug/cgroup/<controller>, each line shows one css in
>> the format: <common_attr>=<value>... [-- <controller_attr>=<value>... ]
>>
>> Common attributes:
>>
>> css - css pointer
>> cgroup - cgroup pointer
>> id - css id
>> ino - cgroup inode
>> flags - css flags
>> refcnt - css atomic refcount; for online csses this includes a huge bias
>> path - cgroup path
>>
>> This patch adds memcg attributes:
>>
>> mem_id - 16-bit memory cgroup id
>> memory - charged pages
>> memsw - charged memory+swap for v1 and swap for v2
>> kmem - charged kernel pages
>> tcpmem - charged tcp pages
>> shmem - shmem/tmpfs pages
>>
>> Link: https://lore.kernel.org/lkml/153414348591.737150.14229960913953276515.stgit@buzz
>> Suggested-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
>> Reviewed-by: Andrey Ryabinin <arbn@yandex-team.com>
>> Signed-off-by: Dmitry Yakunin <zeil@nebius.com>
> FWIW, I was just recently working on a debugfs directory that exposes a
> list of all zombie memcgs as well as the "memory.stat" output for all
> of them.
>
> This entails a file at /sys/kernel/debug/zombie_memcgs/all that
> contains a list of zombie memcgs (with indentation to reflect the
> hierarchy) and an id for each of them.
>
> This id can be used to index per-memcg directories at
> /sys/kernel/debug/zombie_memcgs/<id>/, which include debug files. The
> only one we have so far is
> /sys/kernel/debug/zombie_memcgs/<id>/memory.stat.
>
> If there is interest in this, I can share more information.
>
>> ---
>>   include/linux/cgroup-defs.h |   1 +
>>   kernel/cgroup/cgroup.c      | 101 ++++++++++++++++++++++++++++++++++++
>>   mm/memcontrol.c             |  14 +++++
>>   3 files changed, 116 insertions(+)
>>
>> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
>> index 8a0d5466c7be..810bd300cbee 100644
>> --- a/include/linux/cgroup-defs.h
>> +++ b/include/linux/cgroup-defs.h
>> @@ -673,6 +673,7 @@ struct cgroup_subsys {
>>          void (*exit)(struct task_struct *task);
>>          void (*release)(struct task_struct *task);
>>          void (*bind)(struct cgroup_subsys_state *root_css);
>> +       void (*css_dump)(struct cgroup_subsys_state *css, struct seq_file *m);
>>
>>          bool early_init:1;
>>
>> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
>> index 625d7483951c..fb9931ff7570 100644
>> --- a/kernel/cgroup/cgroup.c
>> +++ b/kernel/cgroup/cgroup.c
>> @@ -40,6 +40,7 @@
>>   #include <linux/mount.h>
>>   #include <linux/pagemap.h>
>>   #include <linux/proc_fs.h>
>> +#include <linux/debugfs.h>
>>   #include <linux/rcupdate.h>
>>   #include <linux/sched.h>
>>   #include <linux/sched/task.h>
>> @@ -7068,3 +7069,103 @@ static int __init cgroup_sysfs_init(void)
>>   subsys_initcall(cgroup_sysfs_init);
>>
>>   #endif /* CONFIG_SYSFS */
>> +
>> +#ifdef CONFIG_DEBUG_FS
>> +void *css_debugfs_seqfile_start(struct seq_file *m, loff_t *pos)
>> +{
>> +       struct cgroup_subsys *ss = m->private;
>> +       struct cgroup_subsys_state *css;
>> +       int id = *pos;
>> +
>> +       rcu_read_lock();
>> +       css = idr_get_next(&ss->css_idr, &id);
>> +       *pos = id;
>> +       return css;
>> +}
>> +
>> +void *css_debugfs_seqfile_next(struct seq_file *m, void *v, loff_t *pos)
>> +{
>> +       struct cgroup_subsys *ss = m->private;
>> +       struct cgroup_subsys_state *css;
>> +       int id = *pos + 1;
>> +
>> +       css = idr_get_next(&ss->css_idr, &id);
>> +       *pos = id;
>> +       return css;
>> +}
>> +
>> +void css_debugfs_seqfile_stop(struct seq_file *m, void *v)
>> +{
>> +       rcu_read_unlock();
>> +}
>> +
>> +int css_debugfs_seqfile_show(struct seq_file *m, void *v)
>> +{
>> +       struct cgroup_subsys *ss = m->private;
>> +       struct cgroup_subsys_state *css = v;
>> +       /* data is NULL for root cgroup_subsys_state */
>> +       struct percpu_ref_data *data = css->refcnt.data;
>> +       size_t buflen;
>> +       char *buf;
>> +       int len;
>> +
>> +       seq_printf(m, "css=%pK cgroup=%pK id=%d ino=%lu flags=%#x refcnt=%lu path=",
>> +                  css, css->cgroup, css->id, cgroup_ino(css->cgroup),
>> +                  css->flags, data ? atomic_long_read(&data->count) : 0);
>> +
>> +       buflen = seq_get_buf(m, &buf);
>> +       if (buf) {
>> +               len = cgroup_path(css->cgroup, buf, buflen);
>> +               seq_commit(m, len < buflen ? len : -1);
>> +       }
>> +
>> +       if (ss->css_dump) {
>> +               seq_puts(m, " -- ");
>> +               ss->css_dump(css, m);
>> +       }
>> +
>> +       seq_putc(m, '\n');
>> +       return 0;
>> +}
>> +
>> +static const struct seq_operations css_debug_seq_ops = {
>> +       .start = css_debugfs_seqfile_start,
>> +       .next = css_debugfs_seqfile_next,
>> +       .stop = css_debugfs_seqfile_stop,
>> +       .show = css_debugfs_seqfile_show,
>> +};
>> +
>> +static int css_debugfs_open(struct inode *inode, struct file *file)
>> +{
>> +       int ret = seq_open(file, &css_debug_seq_ops);
>> +       struct seq_file *m = file->private_data;
>> +
>> +       if (!ret)
>> +               m->private = inode->i_private;
>> +       return ret;
>> +}
>> +
>> +static const struct file_operations css_debugfs_fops = {
>> +       .open = css_debugfs_open,
>> +       .read = seq_read,
>> +       .llseek = seq_lseek,
>> +       .release = seq_release,
>> +};
>> +
>> +static int __init css_debugfs_init(void)
>> +{
>> +       struct cgroup_subsys *ss;
>> +       struct dentry *dir;
>> +       int ssid;
>> +
>> +       dir = debugfs_create_dir("cgroup", NULL);
>> +       if (dir) {
>> +               for_each_subsys(ss, ssid)
>> +                       debugfs_create_file(ss->name, 0644, dir, ss,
>> +                                           &css_debugfs_fops);
>> +       }
>> +
>> +       return 0;
>> +}
>> +late_initcall(css_debugfs_init);
>> +#endif /* CONFIG_DEBUG_FS */
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 4b27e245a055..7b3d4a10ac63 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -5654,6 +5654,20 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>>          }
>>   }
>>
>> +static void mem_cgroup_css_dump(struct cgroup_subsys_state *css,
>> +                               struct seq_file *m)
>> +{
>> +       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>> +
>> +       seq_printf(m, "mem_id=%u memory=%lu memsw=%lu kmem=%lu tcpmem=%lu shmem=%lu",
>> +                  mem_cgroup_id(memcg),
>> +                  page_counter_read(&memcg->memory),
>> +                  page_counter_read(&memcg->memsw),
>> +                  page_counter_read(&memcg->kmem),
>> +                  page_counter_read(&memcg->tcpmem),
>> +                  memcg_page_state(memcg, NR_SHMEM));
>> +}
>> +
>>   #ifdef CONFIG_MMU
>>   /* Handlers for move charge at task migration. */
>>   static int mem_cgroup_do_precharge(unsigned long count)
>> --
>> 2.25.1
>>
>>

