linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Andrey Ryabinin <arbn@yandex-team.com>
To: linux-kernel@vger.kernel.org
Cc: Alexander Graf <graf@amazon.com>,
	James Gowans <jgowans@amazon.com>,
	Mike Rapoport <rppt@kernel.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-mm@kvack.org, Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	x86@kernel.org, "H. Peter Anvin" <hpa@zytor.com>,
	Eric Biederman <ebiederm@xmission.com>,
	kexec@lists.infradead.org, Steven Rostedt <rostedt@goodmis.org>,
	Masami Hiramatsu <mhiramat@kernel.org>,
	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
	linux-trace-kernel@vger.kernel.org, valesini@yandex-team.com,
	Andrey Ryabinin <arbn@yandex-team.com>
Subject: [RFC PATCH 7/7] trace: migrate trace buffers across kexec
Date: Wed,  2 Oct 2024 18:07:22 +0200	[thread overview]
Message-ID: <20241002160722.20025-8-arbn@yandex-team.com> (raw)
In-Reply-To: <20241002160722.20025-1-arbn@yandex-team.com>

This is a demonstration of kstate's ability to migrate something more
complex across kexec than the simple structure used in the
lib/test_kstate.c module.

Here we migrate tracing_on/current_trace and the contents of the
trace buffers to the new kernel.

The 'global_trace_state' describes 'tracing_on' and 'current_trace'
states. The 'trace_buffer' kstate field in 'global_trace_state'
points to 'kstate_trace_buffer' describing the state of ring buffers.

The code in kstate_rb_[save/restore]() saves and restores the list of buffer
pages. It turned out to be somewhat hacky and ugly, partly because
kstate currently can't migrate slab data. Because of that we have
to save/restore the positions of commit_page/reader_page/etc. in the list
of pages. We could probably teach kstate to migrate slab pages, preserving
their contents at the same address, which would make it easier to migrate
lists like the ring buffer list in the trace code, as we would only need to
save/restore a pointer.

Signed-off-by: Andrey Ryabinin <arbn@yandex-team.com>
---
 include/linux/kstate.h     |   4 +
 kernel/trace/ring_buffer.c | 189 +++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.c       |  81 ++++++++++++++++
 3 files changed, 274 insertions(+)

diff --git a/include/linux/kstate.h b/include/linux/kstate.h
index 2ddbe41a1f171..ae807a75a02f8 100644
--- a/include/linux/kstate.h
+++ b/include/linux/kstate.h
@@ -32,6 +32,10 @@ enum kstate_ids {
 	KSTATE_PAGE_ID,
 	KSTATE_RSVD_MEM_ID,
 	KSTATE_TEST_ID,
+	KSTATE_TRACE_ID,
+	KSTATE_TRACE_BUFFER_ID,
+	KSTATE_TRACE_RING_BUFFER_ID,
+	KSTATE_TRACE_BUFFER_PAGE_ID,
 	KSTATE_LAST_ID = -1,
 };
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 77dc0b25140e6..9a8692d7d960c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -16,6 +16,7 @@
 #include <linux/security.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
+#include <linux/kstate.h>
 #include <linux/kthread.h>	/* for self test */
 #include <linux/module.h>
 #include <linux/percpu.h>
@@ -1467,6 +1468,194 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
 	}
 }
 
+#ifdef CONFIG_KSTATE
+/* KS_CUSTOM save hook: pass the data page of buffer_page @obj to kstate_register_page(). */
+static int kstate_bpage_save(void *mig_stream, void *obj, const struct kstate_field *field)
+{
+	struct buffer_page *bp = obj;
+
+	kstate_register_page(virt_to_page(bp->page), bp->order);
+	return 0;	/* mig_stream is not touched */
+}
+struct kstate_description kstate_buffer_page = {	/* kstate layout of one struct buffer_page */
+	.name = "buffer_page",
+	.id = KSTATE_TRACE_BUFFER_PAGE_ID,
+	.fields = (const struct kstate_field[]) {
+		KSTATE_SIMPLE(write, struct buffer_page),
+		KSTATE_SIMPLE(read, struct buffer_page),
+		KSTATE_SIMPLE(entries, struct buffer_page),
+		KSTATE_SIMPLE(real_end, struct buffer_page),
+		KSTATE_SIMPLE(order, struct buffer_page),
+		KSTATE_SIMPLE(page, struct buffer_page),	/* member 'page'; its target is registered by the hook below */
+		{
+			.name = "buffer_page",
+			.flags = KS_CUSTOM,
+			.save = kstate_bpage_save,	/* NOTE(review): no .restore hook is set for this field */
+			.size = (sizeof(struct buffer_page)),
+		},
+		KSTATE_END_OF_LIST(),
+	},
+};
+
+/* Read four list indices from the stream and point cpu_buffer at those pages. */
+static void restore_pages_positions(void **mig_stream,
+				struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct list_head *first = rb_list_head(cpu_buffer->pages);
+	struct list_head *pos = first;
+	unsigned long commit_nr, reader_nr, head_nr, tail_nr;
+	int idx = 0;
+
+	/* Read order matches the write order in kstate_rb_save() */
+	commit_nr = kstate_get_ulong(mig_stream);
+	reader_nr = kstate_get_ulong(mig_stream);
+	head_nr = kstate_get_ulong(mig_stream);
+	tail_nr = kstate_get_ulong(mig_stream);
+
+	/* Index 0 is the first page; walk one full lap of the circular list */
+	do {
+		struct buffer_page *bpage = (struct buffer_page *)pos;
+
+		if (commit_nr == idx)
+			cpu_buffer->commit_page = bpage;
+		if (reader_nr == idx)
+			cpu_buffer->reader_page = bpage;
+		if (head_nr == idx)
+			cpu_buffer->head_page = bpage;
+		if (tail_nr == idx)
+			cpu_buffer->tail_page = bpage;
+		idx++;
+		pos = rb_list_head(pos->next);
+	} while (pos != first);
+}
+
+/*
+ * Restore pages saved by kstate_rb_save(). Returns the bytes consumed or
+ * -ENOMEM. NOTE(review): confirm the kstate core handles negative returns.
+ */
+static int kstate_rb_restore(void *mig_stream, void *obj,
+			const struct kstate_field *field)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = obj;
+	LIST_HEAD(pages);
+	void *stream_start = mig_stream;
+	struct list_head *tmp;
+	struct list_head *head = rb_list_head(cpu_buffer->pages);
+	int i = 0;
+
+	while (kstate_get_byte(&mig_stream)) {
+		struct buffer_page *page = NULL;
+		int j = 0;
+
+		/* Index 0 is 'head' itself, matching the walk in kstate_rb_save() */
+		tmp = head;
+		do {
+			if (j++ == i)
+				page = (struct buffer_page *)tmp;
+			tmp = rb_list_head(tmp->next);
+		} while (!page && tmp != head);
+		if (!page) {
+			/* NOTE(review): 'pages' is on-stack and never spliced into the ring */
+			page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+					GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
+			if (!page)
+				return -ENOMEM;
+			list_add(&page->list, &pages);
+		}
+		mig_stream = restore_kstate((struct kstate_entry *)mig_stream,
+					i++, field->ksd, page);
+	}
+
+	restore_pages_positions(&mig_stream, cpu_buffer);
+
+	return mig_stream - stream_start;
+}
+
+/* Save each list page, a 0 terminator and four page positions; returns bytes written. */
+static int kstate_rb_save(void *mig_stream, void *obj, const struct kstate_field *field)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = obj;
+	struct list_head *tmp;
+	struct list_head *head = rb_list_head(cpu_buffer->pages);
+	void *stream_start = mig_stream;
+	/* ULONG_MAX marks a page not met in the walk below (matches no index on restore) */
+	unsigned long commit_page_nr = ULONG_MAX, reader_page_nr = ULONG_MAX,
+		head_page_nr = ULONG_MAX, tail_page_nr = ULONG_MAX;
+	int i = 0;
+
+	for (tmp = head;;) {
+		struct buffer_page *page = (struct buffer_page *)tmp;
+
+		mig_stream = kstate_save_byte(mig_stream, 1);
+		mig_stream = save_kstate(mig_stream, i, field->ksd, page);
+
+		if (cpu_buffer->commit_page == page)
+			commit_page_nr = i;
+		if (cpu_buffer->reader_page == page)
+			reader_page_nr = i;
+		if (cpu_buffer->head_page == page)
+			head_page_nr = i;
+		if (cpu_buffer->tail_page == page)
+			tail_page_nr = i;
+		i++;
+		tmp = rb_list_head(tmp->next);
+		if (tmp == head)
+			break;
+	}
+
+	mig_stream = kstate_save_byte(mig_stream, 0);
+
+	/* save pages positions */
+	mig_stream = kstate_save_ulong(mig_stream, commit_page_nr);
+	mig_stream = kstate_save_ulong(mig_stream, reader_page_nr);
+	mig_stream = kstate_save_ulong(mig_stream, head_page_nr);
+	mig_stream = kstate_save_ulong(mig_stream, tail_page_nr);
+
+	return mig_stream - stream_start;
+}
+
+struct kstate_description kstate_ring_buffer_per_cpu = {	/* kstate layout of one struct ring_buffer_per_cpu */
+	.name = "ring_buffer_per_cpu",
+	.id = KSTATE_TRACE_RING_BUFFER_ID,
+	.state_list = LIST_HEAD_INIT(kstate_ring_buffer_per_cpu.state_list),
+	.fields = (const struct kstate_field[]) {
+		KSTATE_SIMPLE(entries, struct ring_buffer_per_cpu),
+		KSTATE_SIMPLE(entries_bytes, struct ring_buffer_per_cpu),
+		{
+			.name = "buffer_pages",
+			.flags = KS_CUSTOM,
+			.size = (sizeof(struct ring_buffer_per_cpu)),
+			.ksd = &kstate_buffer_page,	/* the hooks below pass this on as field->ksd for each page */
+			.save = kstate_rb_save,
+			.restore = kstate_rb_restore,
+		},
+		KSTATE_END_OF_LIST(),
+	},
+};
+
+static int nr_ring_buffers(void)	/* .count callback of the "ring_buffers" field */
+{
+	return nr_cpu_ids;	/* presumably the length of trace_buffer->buffers[] - TODO confirm */
+}
+
+struct kstate_description kstate_trace_buffer = {	/* kstate layout of struct trace_buffer */
+	.name = "trace_buffer",
+	.id = KSTATE_TRACE_BUFFER_ID,
+	.state_list = LIST_HEAD_INIT(kstate_trace_buffer.state_list),
+	.fields = (const struct kstate_field[]) {
+		{
+			.name = "ring_buffers",
+			.flags = KS_STRUCT|KS_POINTER|KS_ARRAY_OF_POINTER,
+			.size = (sizeof(struct ring_buffer_per_cpu *)),	/* size of one array slot */
+			.offset = offsetof(struct trace_buffer, buffers),
+			.count = nr_ring_buffers,	/* presumably called by kstate for the element count */
+			.ksd = &kstate_ring_buffer_per_cpu,	/* description used for each element */
+		},
+		KSTATE_END_OF_LIST(),
+	}
+};
+#endif
+
 static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
 			  struct buffer_page *bpage)
 {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c01375adc4714..bb07d716beab4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -18,6 +18,7 @@
 #include <linux/writeback.h>
 #include <linux/kallsyms.h>
 #include <linux/security.h>
+#include <linux/kstate.h>
 #include <linux/seq_file.h>
 #include <linux/irqflags.h>
 #include <linux/debugfs.h>
@@ -10621,6 +10622,84 @@ __init static void enable_instances(void)
 	}
 }
 
+#ifdef CONFIG_KSTATE
+/* ->save hook: store the tracer name; returns bytes written (100 if strscpy() truncated). */
+static int cur_trace_save(void *mig_stream, void *obj, const struct kstate_field *field)
+{
+	const char *name = ((struct trace_array *)obj)->current_trace->name;
+
+	return strscpy(mig_stream, name, 100) < 0 ? 100 : strlen(mig_stream) + 1;
+}
+
+static int cur_trace_restore(void *mig_stream, void *obj,
+			const struct kstate_field *field)
+{
+	struct trace_array *tr = obj;	/* ->restore hook for the "current_trace" field */
+
+	tracing_set_tracer(tr, mig_stream);	/* result is ignored; the name is consumed either way */
+	return strlen(mig_stream) + 1;	/* bytes consumed: the name plus its NUL terminator */
+}
+
+/* ->save hook: store tracer_tracing_is_on() for @obj as a single byte. */
+static int tracing_on_save(void *mig_stream, void *obj, const struct kstate_field *field)
+{
+	struct trace_array *tr = obj;
+	u8 *out = mig_stream;
+
+	*out = (u8)tracer_tracing_is_on(tr);
+	return sizeof(*out);
+}
+
+/* ->restore hook: read one byte and switch tracing for @obj on or off. */
+static int tracing_on_restore(void *mig_stream, void *obj, const struct kstate_field *field)
+{
+	struct trace_array *tr = obj;
+	const u8 *in = mig_stream;
+
+	if (!*in) {
+		tracer_tracing_off(tr);
+		return sizeof(*in);
+	}
+	tracer_tracing_on(tr);
+	return sizeof(*in);
+}
+
+extern struct kstate_description kstate_trace_buffer;
+
+struct kstate_description global_trace_state = {	/* state migrated for a struct trace_array */
+	.name = "trace_state",
+	.id = KSTATE_TRACE_ID,
+	.version_id = 1,
+	.state_list = LIST_HEAD_INIT(global_trace_state.state_list),
+	.fields = (const struct kstate_field[]) {
+		{
+			.name = "tracing_on",	/* one byte, see tracing_on_save() */
+			.flags = KS_CUSTOM,
+			.version_id = 0,
+			.size = sizeof(struct trace_array),
+			.save = tracing_on_save,
+			.restore = tracing_on_restore,
+		},
+		{
+			.name = "current_trace",	/* NUL-terminated tracer name, see cur_trace_save() */
+			.flags = KS_CUSTOM,
+			.version_id = 0,
+			.size = sizeof(struct trace_array),
+			.save = cur_trace_save,
+			.restore = cur_trace_restore,
+
+		},
+		{
+			.name = "trace_buffer",
+			.flags = KS_STRUCT|KS_POINTER,
+			.offset = offsetof(struct trace_array, array_buffer.buffer),
+			.ksd = &kstate_trace_buffer,	/* defined in kernel/trace/ring_buffer.c */
+		},
+		KSTATE_END_OF_LIST()
+	},
+};
+#endif
+
 __init static int tracer_alloc_buffers(void)
 {
 	int ring_buf_size;
@@ -10848,6 +10927,8 @@ __init static int late_trace_init(void)
 
 	tracing_set_default_clock();
 	clear_boot_tracer();
+	kstate_register(&global_trace_state, &global_trace);
+
 	return 0;
 }
 
-- 
2.45.2



      parent reply	other threads:[~2024-10-02 16:09 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-10-02 16:07 [RFC PATCH 0/7] KSTATE: a mechanism to migrate some part of the kernel state " Andrey Ryabinin
2024-10-02 16:07 ` [RFC PATCH 1/7] kstate: Add kstate - a mechanism to migrate some " Andrey Ryabinin
2024-10-02 16:07 ` [RFC PATCH 2/7] kexec: Hack and abuse crashkernel for the kstate's migration stream Andrey Ryabinin
2024-10-02 16:07 ` [RFC PATCH 3/7] [hack] purgatory: disable purgatory verification Andrey Ryabinin
2024-10-02 16:07 ` [RFC PATCH 4/7] mm/memblock: Add MEMBLOCK_PRSRV flag Andrey Ryabinin
2024-10-02 16:07 ` [RFC PATCH 5/7] kstate: Add mechanism to preserved specified memory pages across kexec Andrey Ryabinin
2024-10-02 16:07 ` [RFC PATCH 6/7] kstate, test: add test module for testing kstate subsystem Andrey Ryabinin
2024-10-02 16:07 ` Andrey Ryabinin [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20241002160722.20025-8-arbn@yandex-team.com \
    --to=arbn@yandex-team.com \
    --cc=akpm@linux-foundation.org \
    --cc=bp@alien8.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=ebiederm@xmission.com \
    --cc=graf@amazon.com \
    --cc=hpa@zytor.com \
    --cc=jgowans@amazon.com \
    --cc=kexec@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-trace-kernel@vger.kernel.org \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mhiramat@kernel.org \
    --cc=mingo@redhat.com \
    --cc=rostedt@goodmis.org \
    --cc=rppt@kernel.org \
    --cc=tglx@linutronix.de \
    --cc=valesini@yandex-team.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox