linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
@ 2009-11-20  7:53 Li Zefan
  2009-11-20  7:53 ` [PATCH 2/2] tracing: Remove kmemtrace tracer Li Zefan
                   ` (3 more replies)
  0 siblings, 4 replies; 32+ messages in thread
From: Li Zefan @ 2009-11-20  7:53 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Frederic Weisbecker, Steven Rostedt, Peter Zijlstra,
	Pekka Enberg, Eduard - Gabriel Munteanu, LKML, linux-mm

This tool is mostly a perf version of kmemtrace-user.

The following information is provided by this tool:

- the total amount of memory allocated and fragmentation per call-site
- the total amount of memory allocated and fragmentation per allocation
- total memory allocated and fragmentation in the collected dataset
- ...

 # ./perf kmem record
 ^C
 # ./perf kmem --stat caller --stat alloc -l 10

 ------------------------------------------------------------------------------
 Callsite          | Total_alloc/Per |  Total_req/Per  |  Hit   | Fragmentation
 ------------------------------------------------------------------------------
 0xc052f37a        |   790528/4096   |   790528/4096   |    193 |    0.000%
 0xc0541d70        |   524288/4096   |   524288/4096   |    128 |    0.000%
 0xc051cc68        |   481600/200    |   481600/200    |   2408 |    0.000%
 0xc0572623        |   297444/676    |   297440/676    |    440 |    0.001%
 0xc05399f1        |    73476/164    |    73472/164    |    448 |    0.005%
 0xc05243bf        |    51456/256    |    51456/256    |    201 |    0.000%
 0xc0730d0e        |    31844/497    |    31808/497    |     64 |    0.113%
 0xc0734c4e        |    17152/256    |    17152/256    |     67 |    0.000%
 0xc0541a6d        |    16384/128    |    16384/128    |    128 |    0.000%
 0xc059c217        |    13120/40     |    13120/40     |    328 |    0.000%
 0xc0501ee6        |    11264/88     |    11264/88     |    128 |    0.000%
 0xc04daef0        |     7504/682    |     7128/648    |     11 |    5.011%
 0xc04e14a3        |     4216/191    |     4216/191    |     22 |    0.000%
 0xc05041ca        |     3524/44     |     3520/44     |     80 |    0.114%
 0xc0734fa3        |     2104/701    |     1620/540    |      3 |   23.004%
 0xc05ec9f1        |     2024/289    |     2016/288    |      7 |    0.395%
 0xc06a1999        |     1792/256    |     1792/256    |      7 |    0.000%
 0xc0463b9a        |     1584/144    |     1584/144    |     11 |    0.000%
 0xc0541eb0        |     1024/16     |     1024/16     |     64 |    0.000%
 0xc06a19ac        |      896/128    |      896/128    |      7 |    0.000%
 0xc05721c0        |      772/12     |      768/12     |     64 |    0.518%
 0xc054d1e6        |      288/57     |      280/56     |      5 |    2.778%
 0xc04b562e        |      157/31     |      154/30     |      5 |    1.911%
 0xc04b536f        |       80/16     |       80/16     |      5 |    0.000%
 0xc05855a0        |       64/64     |       36/36     |      1 |   43.750%
 ------------------------------------------------------------------------------

 ------------------------------------------------------------------------------
 Alloc Ptr         | Total_alloc/Per |  Total_req/Per  |  Hit   | Fragmentation
 ------------------------------------------------------------------------------
 0xda884000        |  1052672/4096   |  1052672/4096   |    257 |    0.000%
 0xda886000        |   262144/4096   |   262144/4096   |     64 |    0.000%
 0xf60c7c00        |    16512/128    |    16512/128    |    129 |    0.000%
 0xf59a4118        |    13120/40     |    13120/40     |    328 |    0.000%
 0xdfd4b2c0        |    11264/88     |    11264/88     |    128 |    0.000%
 0xf5274600        |     7680/256    |     7680/256    |     30 |    0.000%
 0xe8395000        |     5948/594    |     5464/546    |     10 |    8.137%
 0xe59c3c00        |     5748/479    |     5712/476    |     12 |    0.626%
 0xf4cd1a80        |     3524/44     |     3520/44     |     80 |    0.114%
 0xe5bd1600        |     2892/482    |     2856/476    |      6 |    1.245%
 ...               | ...             | ...             | ...    | ...
 ------------------------------------------------------------------------------

SUMMARY
=======
Total bytes requested: 2333626
Total bytes allocated: 2353712
Total bytes wasted on internal fragmentation: 20086
Internal fragmentation: 0.853375%


TODO:
- show sym+offset in 'callsite' column
- show cross node allocation stats
- collect more useful stats?
- ...

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 tools/perf/Makefile       |    1 +
 tools/perf/builtin-kmem.c |  578 +++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/builtin.h      |    1 +
 tools/perf/perf.c         |   27 +-
 4 files changed, 594 insertions(+), 13 deletions(-)
 create mode 100644 tools/perf/builtin-kmem.c

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 53e663a..4ec86da 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -445,6 +445,7 @@ BUILTIN_OBJS += builtin-timechart.o
 BUILTIN_OBJS += builtin-top.o
 BUILTIN_OBJS += builtin-trace.o
 BUILTIN_OBJS += builtin-probe.o
+BUILTIN_OBJS += builtin-kmem.o
 
 PERFLIBS = $(LIB_FILE)
 
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
new file mode 100644
index 0000000..285fda3
--- /dev/null
+++ b/tools/perf/builtin-kmem.c
@@ -0,0 +1,578 @@
+#include "builtin.h"
+#include "perf.h"
+
+#include "util/util.h"
+#include "util/cache.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/header.h"
+
+#include "util/parse-options.h"
+#include "util/trace-event.h"
+
+#include "util/debug.h"
+#include "util/data_map.h"
+
+#include <linux/rbtree.h>
+
+struct alloc_stat;
+typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
+
+static char const		*input_name = "perf.data";
+
+static struct perf_header	*header;
+static u64			sample_type;
+
+static int			alloc_flag;
+static int			caller_flag;
+
+sort_fn_t			alloc_sort_fn;
+sort_fn_t			caller_sort_fn;
+
+static int			alloc_lines = -1;
+static int			caller_lines = -1;
+
+static char			*cwd;
+static int			cwdlen;
+
+struct alloc_stat {
+	union {
+		struct {
+			char	*name;
+			u64	call_site;
+		};
+		u64	ptr;
+	};
+	u64	bytes_req;
+	u64	bytes_alloc;
+	u32	hit;
+
+	struct rb_node node;
+};
+
+static struct rb_root root_alloc_stat;
+static struct rb_root root_alloc_sorted;
+static struct rb_root root_caller_stat;
+static struct rb_root root_caller_sorted;
+
+static unsigned long total_requested, total_allocated;
+
+struct raw_event_sample {
+	u32 size;
+	char data[0];
+};
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static void insert_alloc_stat(unsigned long ptr,
+			      int bytes_req, int bytes_alloc)
+{
+	struct rb_node **node = &root_alloc_stat.rb_node;
+	struct rb_node *parent = NULL;
+	struct alloc_stat *data = NULL;
+
+	if (!alloc_flag)
+		return;
+
+	while (*node) {
+		parent = *node;
+		data = rb_entry(*node, struct alloc_stat, node);
+
+		if (ptr > data->ptr)
+			node = &(*node)->rb_right;
+		else if (ptr < data->ptr)
+			node = &(*node)->rb_left;
+		else
+			break;
+	}
+
+	if (data && data->ptr == ptr) {
+		data->hit++;
+		data->bytes_req += bytes_req;
+		data->bytes_alloc += bytes_req;
+	} else {
+		data = malloc(sizeof(*data));
+		data->ptr = ptr;
+		data->hit = 1;
+		data->bytes_req = bytes_req;
+		data->bytes_alloc = bytes_alloc;
+
+		rb_link_node(&data->node, parent, node);
+		rb_insert_color(&data->node, &root_alloc_stat);
+	}
+}
+
+static void insert_caller_stat(unsigned long call_site,
+			      int bytes_req, int bytes_alloc)
+{
+	struct rb_node **node = &root_caller_stat.rb_node;
+	struct rb_node *parent = NULL;
+	struct alloc_stat *data = NULL;
+
+	if (!caller_flag)
+		return;
+
+	while (*node) {
+		parent = *node;
+		data = rb_entry(*node, struct alloc_stat, node);
+
+		if (call_site > data->call_site)
+			node = &(*node)->rb_right;
+		else if (call_site < data->call_site)
+			node = &(*node)->rb_left;
+		else
+			break;
+	}
+
+	if (data && data->call_site == call_site) {
+		data->hit++;
+		data->bytes_req += bytes_req;
+		data->bytes_alloc += bytes_req;
+	} else {
+		data = malloc(sizeof(*data));
+		data->call_site = call_site;
+		data->hit = 1;
+		data->bytes_req = bytes_req;
+		data->bytes_alloc = bytes_alloc;
+
+		rb_link_node(&data->node, parent, node);
+		rb_insert_color(&data->node, &root_caller_stat);
+	}
+}
+
+static void process_alloc_event(struct raw_event_sample *raw,
+				struct event *event,
+				int cpu __used,
+				u64 timestamp __used,
+				struct thread *thread __used,
+				int node __used)
+{
+	unsigned long call_site;
+	unsigned long ptr;
+	int bytes_req;
+	int bytes_alloc;
+
+	ptr = raw_field_value(event, "ptr", raw->data);
+	call_site = raw_field_value(event, "call_site", raw->data);
+	bytes_req = raw_field_value(event, "bytes_req", raw->data);
+	bytes_alloc = raw_field_value(event, "bytes_alloc", raw->data);
+
+	insert_alloc_stat(ptr, bytes_req, bytes_alloc);
+	insert_caller_stat(call_site, bytes_req, bytes_alloc);
+
+	total_requested += bytes_req;
+	total_allocated += bytes_alloc;
+}
+
+static void process_free_event(struct raw_event_sample *raw __used,
+			       struct event *event __used,
+			       int cpu __used,
+			       u64 timestamp __used,
+			       struct thread *thread __used)
+{
+}
+
+static void
+process_raw_event(event_t *raw_event __used, void *more_data,
+		  int cpu, u64 timestamp, struct thread *thread)
+{
+	struct raw_event_sample *raw = more_data;
+	struct event *event;
+	int type;
+
+	type = trace_parse_common_type(raw->data);
+	event = trace_find_event(type);
+
+	if (!strcmp(event->name, "kmalloc") ||
+	    !strcmp(event->name, "kmem_cache_alloc")) {
+		process_alloc_event(raw, event, cpu, timestamp, thread, 0);
+		return;
+	} 
+
+	if (!strcmp(event->name, "kmalloc_node") ||
+	    !strcmp(event->name, "kmem_cache_alloc_node")) {
+		process_alloc_event(raw, event, cpu, timestamp, thread, 1);
+		return;
+	}
+
+	if (!strcmp(event->name, "kfree") ||
+	    !strcmp(event->name, "kmem_cache_free")) {
+		process_free_event(raw, event, cpu, timestamp, thread);
+		return;
+	}
+}
+
+static int
+process_sample_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	u64 ip = event->ip.ip;
+	u64 timestamp = -1;
+	u32 cpu = -1;
+	u64 period = 1;
+	void *more_data = event->ip.__more_data;
+	struct thread *thread = threads__findnew(event->ip.pid);
+
+	if (sample_type & PERF_SAMPLE_TIME) {
+		timestamp = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	if (sample_type & PERF_SAMPLE_CPU) {
+		cpu = *(u32 *)more_data;
+		more_data += sizeof(u32);
+		more_data += sizeof(u32); /* reserved */
+	}
+
+	if (sample_type & PERF_SAMPLE_PERIOD) {
+		period = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid, event->ip.tid,
+		(void *)(long)ip,
+		(long long)period);
+
+	if (thread == NULL) {
+		pr_debug("problem processing %d event, skipping it.\n",
+			 event->header.type);
+		return -1;
+	}
+
+	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	process_raw_event(event, more_data, cpu, timestamp, thread);
+
+	return 0;
+}
+
+static int sample_type_check(u64 type)
+{
+	sample_type = type;
+
+	if (!(sample_type & PERF_SAMPLE_RAW)) {
+		fprintf(stderr,
+			"No trace sample to read. Did you call perf record "
+			"without -R?");
+		return -1;
+	}
+
+	return 0;
+}
+
+static struct perf_file_handler file_handler = {
+	.process_sample_event	= process_sample_event,
+	.process_comm_event	= process_comm_event,
+	.sample_type_check	= sample_type_check,
+};
+
+static int read_events(void)
+{
+	register_idle_thread();
+	register_perf_file_handler(&file_handler);
+
+	return mmap_dispatch_perf_file(&header, input_name, 0, 0,
+				       &cwdlen, &cwd);
+}
+
+static double fragmentation(unsigned long n_req, unsigned long n_alloc)
+{
+	if (n_alloc == 0)
+		return 0.0;
+	else
+		return 100.0 - (100.0 * n_req / n_alloc);
+}
+
+static void __print_result(struct rb_root *root, int n_lines, int is_caller)
+{
+	struct rb_node *next;
+
+	printf("\n ------------------------------------------------------------------------------\n");
+	if (is_caller)
+		printf(" Callsite          |");
+	else
+		printf(" Alloc Ptr         |");
+	printf(" Total_alloc/Per |  Total_req/Per  |  Hit   | Fragmentation\n");
+	printf(" ------------------------------------------------------------------------------\n");
+
+	next = rb_first(root);
+
+	while (next && n_lines--) {
+		struct alloc_stat *data;
+
+		data = rb_entry(next, struct alloc_stat, node);
+
+		printf(" %-16p  | %8llu/%-6lu | %8llu/%-6lu | %6lu | %8.3f%%\n",
+		       is_caller ? (void *)(unsigned long)data->call_site :
+				   (void *)(unsigned long)data->ptr,
+		       (unsigned long long)data->bytes_alloc,
+		       (unsigned long)data->bytes_alloc / data->hit,
+		       (unsigned long long)data->bytes_req,
+		       (unsigned long)data->bytes_req / data->hit,
+		       (unsigned long)data->hit,
+		       fragmentation(data->bytes_req, data->bytes_alloc));
+
+		next = rb_next(next);
+	}
+
+	if (n_lines == -1)
+		printf(" ...               | ...             | ...             | ...    | ...   \n");
+
+	printf(" ------------------------------------------------------------------------------\n");
+}
+
+static void print_summary(void)
+{
+	printf("\nSUMMARY\n=======\n");
+	printf("Total bytes requested: %lu\n", total_requested);
+	printf("Total bytes allocated: %lu\n", total_allocated);
+	printf("Total bytes wasted on internal fragmentation: %lu\n",
+	       total_allocated - total_requested);
+	printf("Internal fragmentation: %f%%\n",
+	       fragmentation(total_requested, total_allocated));
+}
+
+static void print_result(void)
+{
+	if (caller_flag)
+		__print_result(&root_caller_sorted, caller_lines, 1);
+	if (alloc_flag)
+		__print_result(&root_alloc_sorted, alloc_lines, 0);
+	print_summary();
+}
+
+static void sort_insert(struct rb_root *root, struct alloc_stat *data,
+			sort_fn_t sort_fn)
+{
+	struct rb_node **new = &(root->rb_node);
+	struct rb_node *parent = NULL;
+
+	while (*new) {
+		struct alloc_stat *this;
+		int cmp;
+
+		this = rb_entry(*new, struct alloc_stat, node);
+		parent = *new;
+
+		cmp = sort_fn(data, this);
+
+		if (cmp > 0)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	rb_link_node(&data->node, parent, new);
+	rb_insert_color(&data->node, root);
+}
+
+static void __sort_result(struct rb_root *root, struct rb_root *root_sorted,
+			  sort_fn_t sort_fn)
+{
+	struct rb_node *node;
+	struct alloc_stat *data;
+
+	for (;;) {
+		node = rb_first(root);
+		if (!node)
+			break;
+
+		rb_erase(node, root);
+		data = rb_entry(node, struct alloc_stat, node);
+		sort_insert(root_sorted, data, sort_fn);
+	}
+}
+
+static void sort_result(void)
+{
+	__sort_result(&root_alloc_stat, &root_alloc_sorted, alloc_sort_fn);
+	__sort_result(&root_caller_stat, &root_caller_sorted, caller_sort_fn);
+}
+
+static int __cmd_kmem(void)
+{
+	setup_pager();
+	read_events();
+	sort_result();
+	print_result();
+
+	return 0;
+}
+
+static const char * const kmem_usage[] = {
+	"perf kmem [<options>] {record}",
+	NULL
+};
+
+
+static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	if (l->ptr < r->ptr)
+		return -1;
+	else if (l->ptr > r->ptr)
+		return 1;
+	return 0;
+}
+
+static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	if (l->call_site < r->call_site)
+		return -1;
+	else if (l->call_site > r->call_site)
+		return 1;
+	return 0;
+}
+
+static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	if (l->bytes_alloc < r->bytes_alloc)
+		return -1;
+	else if (l->bytes_alloc > r->bytes_alloc)
+		return 1;
+	return 0;
+}
+
+static int parse_sort_opt(const struct option *opt __used,
+			  const char *arg, int unset __used)
+{
+	sort_fn_t sort_fn;
+
+	if (!arg)
+		return -1;
+
+	if (strcmp(arg, "ptr") == 0)
+		sort_fn = ptr_cmp;
+	else if (strcmp(arg, "call_site") == 0)
+		sort_fn = callsite_cmp;
+	else if (strcmp(arg, "bytes") == 0)
+		sort_fn = bytes_cmp;
+	else
+		return -1;
+
+	if (caller_flag > alloc_flag)
+		caller_sort_fn = sort_fn;
+	else
+		alloc_sort_fn = sort_fn;
+
+	return 0;
+}
+
+static int parse_stat_opt(const struct option *opt __used,
+			  const char *arg, int unset __used)
+{
+	if (!arg)
+		return -1;
+
+	if (strcmp(arg, "alloc") == 0)
+		alloc_flag = (caller_flag + 1);
+	else if (strcmp(arg, "caller") == 0)
+		caller_flag = (alloc_flag + 1);
+	else
+		return -1;
+	return 0;
+}
+
+static int parse_line_opt(const struct option *opt __used,
+			  const char *arg, int unset __used)
+{
+	int lines;
+
+	if (!arg)
+		return -1;
+
+	lines = strtoul(arg, NULL, 10);
+
+	if (caller_flag > alloc_flag)
+		caller_lines = lines;
+	else
+		alloc_lines = lines;
+
+	return 0;
+}
+
+static const struct option kmem_options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		   "input file name"),
+	OPT_CALLBACK(0, "stat", NULL, "<alloc>|<caller>",
+		     "stat selector, Pass 'alloc' or 'caller'.",
+		     parse_stat_opt),
+	OPT_CALLBACK('s', "sort", NULL, "key",
+		     "sort by key: ptr, call_site, hit, bytes",
+		     parse_sort_opt),
+	OPT_CALLBACK('l', "line", NULL, "num",
+		     "show n lins",
+		     parse_line_opt),
+	OPT_END()
+};
+
+static const char *record_args[] = {
+	"record",
+	"-a",
+	"-R",
+	"-M",
+	"-f",
+	"-c", "1",
+	"-e", "kmem:kmalloc",
+	"-e", "kmem:kmalloc_node",
+	"-e", "kmem:kfree",
+	"-e", "kmem:kmem_cache_alloc",
+	"-e", "kmem:kmem_cache_alloc_node",
+	"-e", "kmem:kmem_cache_free",
+};
+
+static int __cmd_record(int argc, const char **argv)
+{
+	unsigned int rec_argc, i, j;
+	const char **rec_argv;
+
+	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+	rec_argv = calloc(rec_argc + 1, sizeof(char *));
+
+	for (i = 0; i < ARRAY_SIZE(record_args); i++)
+		rec_argv[i] = strdup(record_args[i]);
+
+	for (j = 1; j < (unsigned int)argc; j++, i++)
+		rec_argv[i] = argv[j];
+
+	return cmd_record(i, rec_argv, NULL);
+}
+
+int cmd_kmem(int argc, const char **argv, const char *prefix __used)
+{
+	symbol__init(0);
+
+	argc = parse_options(argc, argv, kmem_options, kmem_usage, 0);
+
+	if (argc && !strncmp(argv[0], "rec", 3))
+		return __cmd_record(argc, argv);
+	else if (argc)
+		usage_with_options(kmem_usage, kmem_options);
+
+	if (!alloc_sort_fn)
+		alloc_sort_fn = bytes_cmp;
+	if (!caller_sort_fn)
+		caller_sort_fn = bytes_cmp;
+
+	return __cmd_kmem();
+}
+
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 9b02d85..a3d8bf6 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -28,5 +28,6 @@ extern int cmd_top(int argc, const char **argv, const char *prefix);
 extern int cmd_trace(int argc, const char **argv, const char *prefix);
 extern int cmd_version(int argc, const char **argv, const char *prefix);
 extern int cmd_probe(int argc, const char **argv, const char *prefix);
+extern int cmd_kmem(int argc, const char **argv, const char *prefix);
 
 #endif
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 89b82ac..cf64049 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -285,20 +285,21 @@ static void handle_internal_command(int argc, const char **argv)
 {
 	const char *cmd = argv[0];
 	static struct cmd_struct commands[] = {
-		{ "help", cmd_help, 0 },
-		{ "list", cmd_list, 0 },
 		{ "buildid-list", cmd_buildid_list, 0 },
-		{ "record", cmd_record, 0 },
-		{ "report", cmd_report, 0 },
-		{ "bench", cmd_bench, 0 },
-		{ "stat", cmd_stat, 0 },
-		{ "timechart", cmd_timechart, 0 },
-		{ "top", cmd_top, 0 },
-		{ "annotate", cmd_annotate, 0 },
-		{ "version", cmd_version, 0 },
-		{ "trace", cmd_trace, 0 },
-		{ "sched", cmd_sched, 0 },
-		{ "probe", cmd_probe, 0 },
+		{ "help",	cmd_help,	0 },
+		{ "list",	cmd_list,	0 },
+		{ "record",	cmd_record,	0 },
+		{ "report",	cmd_report,	0 },
+		{ "bench",	cmd_bench,	0 },
+		{ "stat",	cmd_stat,	0 },
+		{ "timechart",	cmd_timechart,	0 },
+		{ "top",	cmd_top,	0 },
+		{ "annotate",	cmd_annotate,	0 },
+		{ "version",	cmd_version,	0 },
+		{ "trace",	cmd_trace,	0 },
+		{ "sched",	cmd_sched,	0 },
+		{ "probe",	cmd_probe,	0 },
+		{ "kmem",	cmd_kmem,	0 },
 	};
 	unsigned int i;
 	static const char ext[] = STRIP_EXTENSION;
-- 
1.6.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 2/2] tracing: Remove kmemtrace tracer
  2009-11-20  7:53 [RFC][PATCH 1/2] perf: Add 'perf kmem' tool Li Zefan
@ 2009-11-20  7:53 ` Li Zefan
  2009-11-20  8:20   ` Pekka Enberg
  2009-11-20  8:14 ` [RFC][PATCH 1/2] perf: Add 'perf kmem' tool Ingo Molnar
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 32+ messages in thread
From: Li Zefan @ 2009-11-20  7:53 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Frederic Weisbecker, Steven Rostedt, Peter Zijlstra,
	Pekka Enberg, Eduard - Gabriel Munteanu, LKML, linux-mm

The kmem trace events can replace the functions of kmemtrace
tracer.

And kmemtrace-user can be modified to use trace events.
(But after cloning the git repo, I found it's still based on
the original relay version..), not to mention now we have
'perf kmem' tool.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 Documentation/ABI/testing/debugfs-kmemtrace |   71 ----
 Documentation/trace/kmemtrace.txt           |  126 -------
 MAINTAINERS                                 |    7 -
 include/linux/kmemtrace.h                   |   25 --
 include/linux/slab_def.h                    |   16 +-
 include/linux/slub_def.h                    |   14 +-
 init/main.c                                 |    2 -
 kernel/trace/Kconfig                        |   20 -
 kernel/trace/Makefile                       |    1 -
 kernel/trace/kmemtrace.c                    |  511 ---------------------------
 kernel/trace/trace.h                        |   11 -
 kernel/trace/trace_entries.h                |   35 --
 mm/slab.c                                   |   11 -
 mm/slub.c                                   |    3 -
 14 files changed, 7 insertions(+), 846 deletions(-)
 delete mode 100644 Documentation/ABI/testing/debugfs-kmemtrace
 delete mode 100644 Documentation/trace/kmemtrace.txt
 delete mode 100644 include/linux/kmemtrace.h
 delete mode 100644 kernel/trace/kmemtrace.c

diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace
deleted file mode 100644
index 5e6a92a..0000000
--- a/Documentation/ABI/testing/debugfs-kmemtrace
+++ /dev/null
@@ -1,71 +0,0 @@
-What:		/sys/kernel/debug/kmemtrace/
-Date:		July 2008
-Contact:	Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
-Description:
-
-In kmemtrace-enabled kernels, the following files are created:
-
-/sys/kernel/debug/kmemtrace/
-	cpu<n>		(0400)	Per-CPU tracing data, see below. (binary)
-	total_overruns	(0400)	Total number of bytes which were dropped from
-				cpu<n> files because of full buffer condition,
-				non-binary. (text)
-	abi_version	(0400)	Kernel's kmemtrace ABI version. (text)
-
-Each per-CPU file should be read according to the relay interface. That is,
-the reader should set affinity to that specific CPU and, as currently done by
-the userspace application (though there are other methods), use poll() with
-an infinite timeout before every read(). Otherwise, erroneous data may be
-read. The binary data has the following _core_ format:
-
-	Event ID	(1 byte)	Unsigned integer, one of:
-		0 - represents an allocation (KMEMTRACE_EVENT_ALLOC)
-		1 - represents a freeing of previously allocated memory
-		    (KMEMTRACE_EVENT_FREE)
-	Type ID		(1 byte)	Unsigned integer, one of:
-		0 - this is a kmalloc() / kfree()
-		1 - this is a kmem_cache_alloc() / kmem_cache_free()
-		2 - this is a __get_free_pages() et al.
-	Event size	(2 bytes)	Unsigned integer representing the
-					size of this event. Used to extend
-					kmemtrace. Discard the bytes you
-					don't know about.
-	Sequence number	(4 bytes)	Signed integer used to reorder data
-					logged on SMP machines. Wraparound
-					must be taken into account, although
-					it is unlikely.
-	Caller address	(8 bytes)	Return address to the caller.
-	Pointer to mem	(8 bytes)	Pointer to target memory area. Can be
-					NULL, but not all such calls might be
-					recorded.
-
-In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow:
-
-	Requested bytes	(8 bytes)	Total number of requested bytes,
-					unsigned, must not be zero.
-	Allocated bytes (8 bytes)	Total number of actually allocated
-					bytes, unsigned, must not be lower
-					than requested bytes.
-	Requested flags	(4 bytes)	GFP flags supplied by the caller.
-	Target CPU	(4 bytes)	Signed integer, valid for event id 1.
-					If equal to -1, target CPU is the same
-					as origin CPU, but the reverse might
-					not be true.
-
-The data is made available in the same endianness the machine has.
-
-Other event ids and type ids may be defined and added. Other fields may be
-added by increasing event size, but see below for details.
-Every modification to the ABI, including new id definitions, are followed
-by bumping the ABI version by one.
-
-Adding new data to the packet (features) is done at the end of the mandatory
-data:
-	Feature size	(2 byte)
-	Feature ID	(1 byte)
-	Feature data	(Feature size - 3 bytes)
-
-
-Users:
-	kmemtrace-user - git://repo.or.cz/kmemtrace-user.git
-
diff --git a/Documentation/trace/kmemtrace.txt b/Documentation/trace/kmemtrace.txt
deleted file mode 100644
index 6308735..0000000
--- a/Documentation/trace/kmemtrace.txt
+++ /dev/null
@@ -1,126 +0,0 @@
-			kmemtrace - Kernel Memory Tracer
-
-			  by Eduard - Gabriel Munteanu
-			     <eduard.munteanu@linux360.ro>
-
-I. Introduction
-===============
-
-kmemtrace helps kernel developers figure out two things:
-1) how different allocators (SLAB, SLUB etc.) perform
-2) how kernel code allocates memory and how much
-
-To do this, we trace every allocation and export information to the userspace
-through the relay interface. We export things such as the number of requested
-bytes, the number of bytes actually allocated (i.e. including internal
-fragmentation), whether this is a slab allocation or a plain kmalloc() and so
-on.
-
-The actual analysis is performed by a userspace tool (see section III for
-details on where to get it from). It logs the data exported by the kernel,
-processes it and (as of writing this) can provide the following information:
-- the total amount of memory allocated and fragmentation per call-site
-- the amount of memory allocated and fragmentation per allocation
-- total memory allocated and fragmentation in the collected dataset
-- number of cross-CPU allocation and frees (makes sense in NUMA environments)
-
-Moreover, it can potentially find inconsistent and erroneous behavior in
-kernel code, such as using slab free functions on kmalloc'ed memory or
-allocating less memory than requested (but not truly failed allocations).
-
-kmemtrace also makes provisions for tracing on some arch and analysing the
-data on another.
-
-II. Design and goals
-====================
-
-kmemtrace was designed to handle rather large amounts of data. Thus, it uses
-the relay interface to export whatever is logged to userspace, which then
-stores it. Analysis and reporting is done asynchronously, that is, after the
-data is collected and stored. By design, it allows one to log and analyse
-on different machines and different arches.
-
-As of writing this, the ABI is not considered stable, though it might not
-change much. However, no guarantees are made about compatibility yet. When
-deemed stable, the ABI should still allow easy extension while maintaining
-backward compatibility. This is described further in Documentation/ABI.
-
-Summary of design goals:
-	- allow logging and analysis to be done across different machines
-	- be fast and anticipate usage in high-load environments (*)
-	- be reasonably extensible
-	- make it possible for GNU/Linux distributions to have kmemtrace
-	included in their repositories
-
-(*) - one of the reasons Pekka Enberg's original userspace data analysis
-    tool's code was rewritten from Perl to C (although this is more than a
-    simple conversion)
-
-
-III. Quick usage guide
-======================
-
-1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable
-CONFIG_KMEMTRACE).
-
-2) Get the userspace tool and build it:
-$ git clone git://repo.or.cz/kmemtrace-user.git		# current repository
-$ cd kmemtrace-user/
-$ ./autogen.sh
-$ ./configure
-$ make
-
-3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the
-'single' runlevel (so that relay buffers don't fill up easily), and run
-kmemtrace:
-# '$' does not mean user, but root here.
-$ mount -t debugfs none /sys/kernel/debug
-$ mount -t proc none /proc
-$ cd path/to/kmemtrace-user/
-$ ./kmemtraced
-Wait a bit, then stop it with CTRL+C.
-$ cat /sys/kernel/debug/kmemtrace/total_overruns	# Check if we didn't
-							# overrun, should
-							# be zero.
-$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to
-		check its correctness]
-$ ./kmemtrace-report
-
-Now you should have a nice and short summary of how the allocator performs.
-
-IV. FAQ and known issues
-========================
-
-Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix
-this? Should I worry?
-A: If it's non-zero, this affects kmemtrace's accuracy, depending on how
-large the number is. You can fix it by supplying a higher
-'kmemtrace.subbufs=N' kernel parameter.
----
-
-Q: kmemtrace_check reports errors, how do I fix this? Should I worry?
-A: This is a bug and should be reported. It can occur for a variety of
-reasons:
-	- possible bugs in relay code
-	- possible misuse of relay by kmemtrace
-	- timestamps being collected unorderly
-Or you may fix it yourself and send us a patch.
----
-
-Q: kmemtrace_report shows many errors, how do I fix this? Should I worry?
-A: This is a known issue and I'm working on it. These might be true errors
-in kernel code, which may have inconsistent behavior (e.g. allocating memory
-with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed
-out this behavior may work with SLAB, but may fail with other allocators.
-
-It may also be due to lack of tracing in some unusual allocator functions.
-
-We don't want bug reports regarding this issue yet.
----
-
-V. See also
-===========
-
-Documentation/kernel-parameters.txt
-Documentation/ABI/testing/debugfs-kmemtrace
-
diff --git a/MAINTAINERS b/MAINTAINERS
index c824b4d..dfb2e7d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3100,13 +3100,6 @@ F:	include/linux/kmemleak.h
 F:	mm/kmemleak.c
 F:	mm/kmemleak-test.c
 
-KMEMTRACE
-M:	Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
-S:	Maintained
-F:	Documentation/trace/kmemtrace.txt
-F:	include/linux/kmemtrace.h
-F:	kernel/trace/kmemtrace.c
-
 KPROBES
 M:	Ananth N Mavinakayanahalli <ananth@in.ibm.com>
 M:	Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h
deleted file mode 100644
index b616d39..0000000
--- a/include/linux/kmemtrace.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- *
- * This file is released under GPL version 2.
- */
-
-#ifndef _LINUX_KMEMTRACE_H
-#define _LINUX_KMEMTRACE_H
-
-#ifdef __KERNEL__
-
-#include <trace/events/kmem.h>
-
-#ifdef CONFIG_KMEMTRACE
-extern void kmemtrace_init(void);
-#else
-static inline void kmemtrace_init(void)
-{
-}
-#endif
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_KMEMTRACE_H */
-
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 850d057..56171d6 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -14,7 +14,8 @@
 #include <asm/page.h>		/* kmalloc_sizes.h needs PAGE_SIZE */
 #include <asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
 #include <linux/compiler.h>
-#include <linux/kmemtrace.h>
+
+#include <trace/events/kmem.h>
 
 /*
  * struct kmem_cache
@@ -108,22 +109,13 @@ struct cache_sizes {
 extern struct cache_sizes malloc_sizes[];
 
 void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
+void *kmem_cache_alloc_notrace(struct kmem_cache *, gfp_t);
 void *__kmalloc(size_t size, gfp_t flags);
 
-#ifdef CONFIG_KMEMTRACE
-extern void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags);
-extern size_t slab_buffer_size(struct kmem_cache *cachep);
-#else
-static __always_inline void *
-kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
-{
-	return kmem_cache_alloc(cachep, flags);
-}
 static inline size_t slab_buffer_size(struct kmem_cache *cachep)
 {
-	return 0;
+	return cachep->buffer_size;
 }
-#endif
 
 static __always_inline void *kmalloc(size_t size, gfp_t flags)
 {
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 5ad70a6..b41dd8c 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -10,9 +10,10 @@
 #include <linux/gfp.h>
 #include <linux/workqueue.h>
 #include <linux/kobject.h>
-#include <linux/kmemtrace.h>
 #include <linux/kmemleak.h>
 
+#include <trace/events/kmem.h>
+
 enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
 	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
@@ -215,18 +216,9 @@ static __always_inline struct kmem_cache *kmalloc_slab(size_t size)
 #endif
 
 void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
+void *kmem_cache_alloc_notrace(struct kmem_cache *, gfp_t);
 void *__kmalloc(size_t size, gfp_t flags);
 
-#ifdef CONFIG_KMEMTRACE
-extern void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags);
-#else
-static __always_inline void *
-kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
-{
-	return kmem_cache_alloc(s, gfpflags);
-}
-#endif
-
 static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
 {
 	unsigned int order = get_order(size);
diff --git a/init/main.c b/init/main.c
index 5988deb..daf13a4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -66,7 +66,6 @@
 #include <linux/ftrace.h>
 #include <linux/async.h>
 #include <linux/kmemcheck.h>
-#include <linux/kmemtrace.h>
 #include <linux/sfi.h>
 #include <linux/shmem_fs.h>
 #include <trace/boot.h>
@@ -645,7 +644,6 @@ asmlinkage void __init start_kernel(void)
 #endif
 	page_cgroup_init();
 	enable_debug_pagealloc();
-	kmemtrace_init();
 	kmemleak_init();
 	debug_objects_mem_init();
 	idr_init_cache();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 20e3695..530bbb1 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -406,26 +406,6 @@ config HW_BRANCH_TRACER
 	  This tracer records all branches on the system in a circular
 	  buffer giving access to the last N branches for each cpu.
 
-config KMEMTRACE
-	bool "Trace SLAB allocations"
-	select GENERIC_TRACER
-	help
-	  kmemtrace provides tracing for slab allocator functions, such as
-	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
-	  data is then fed to the userspace application in order to analyse
-	  allocation hotspots, internal fragmentation and so on, making it
-	  possible to see how well an allocator performs, as well as debug
-	  and profile kernel code.
-
-	  This requires an userspace application to use. See
-	  Documentation/trace/kmemtrace.txt for more information.
-
-	  Saying Y will make the kernel somewhat larger and slower. However,
-	  if you disable kmemtrace at run-time or boot-time, the performance
-	  impact is minimal (depending on the arch the kernel is built for).
-
-	  If unsure, say N.
-
 config WORKQUEUE_TRACER
 	bool "Trace workqueues"
 	select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd8..cd5ed77 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
-obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
 ifeq ($(CONFIG_BLOCK),y)
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
deleted file mode 100644
index a91da69..0000000
--- a/kernel/trace/kmemtrace.c
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Memory allocator tracing
- *
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- */
-
-#include <linux/tracepoint.h>
-#include <linux/seq_file.h>
-#include <linux/debugfs.h>
-#include <linux/dcache.h>
-#include <linux/fs.h>
-
-#include <linux/kmemtrace.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-/* Select an alternative, minimalistic output than the original one */
-#define TRACE_KMEM_OPT_MINIMAL	0x1
-
-static struct tracer_opt kmem_opts[] = {
-	/* Default disable the minimalistic output */
-	{ TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
-	{ }
-};
-
-static struct tracer_flags kmem_tracer_flags = {
-	.val			= 0,
-	.opts			= kmem_opts
-};
-
-static struct trace_array *kmemtrace_array;
-
-/* Trace allocations */
-static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
-				   unsigned long call_site,
-				   const void *ptr,
-				   size_t bytes_req,
-				   size_t bytes_alloc,
-				   gfp_t gfp_flags,
-				   int node)
-{
-	struct ftrace_event_call *call = &event_kmem_alloc;
-	struct trace_array *tr = kmemtrace_array;
-	struct kmemtrace_alloc_entry *entry;
-	struct ring_buffer_event *event;
-
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
-	if (!event)
-		return;
-
-	entry = ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-
-	entry->ent.type		= TRACE_KMEM_ALLOC;
-	entry->type_id		= type_id;
-	entry->call_site	= call_site;
-	entry->ptr		= ptr;
-	entry->bytes_req	= bytes_req;
-	entry->bytes_alloc	= bytes_alloc;
-	entry->gfp_flags	= gfp_flags;
-	entry->node		= node;
-
-	if (!filter_check_discard(call, entry, tr->buffer, event))
-		ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-}
-
-static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
-				  unsigned long call_site,
-				  const void *ptr)
-{
-	struct ftrace_event_call *call = &event_kmem_free;
-	struct trace_array *tr = kmemtrace_array;
-	struct kmemtrace_free_entry *entry;
-	struct ring_buffer_event *event;
-
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
-	if (!event)
-		return;
-	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-
-	entry->ent.type		= TRACE_KMEM_FREE;
-	entry->type_id		= type_id;
-	entry->call_site	= call_site;
-	entry->ptr		= ptr;
-
-	if (!filter_check_discard(call, entry, tr->buffer, event))
-		ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-}
-
-static void kmemtrace_kmalloc(unsigned long call_site,
-			      const void *ptr,
-			      size_t bytes_req,
-			      size_t bytes_alloc,
-			      gfp_t gfp_flags)
-{
-	kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
-			bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
-				       const void *ptr,
-				       size_t bytes_req,
-				       size_t bytes_alloc,
-				       gfp_t gfp_flags)
-{
-	kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
-			bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-static void kmemtrace_kmalloc_node(unsigned long call_site,
-				   const void *ptr,
-				   size_t bytes_req,
-				   size_t bytes_alloc,
-				   gfp_t gfp_flags,
-				   int node)
-{
-	kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
-			bytes_req, bytes_alloc, gfp_flags, node);
-}
-
-static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
-					    const void *ptr,
-					    size_t bytes_req,
-					    size_t bytes_alloc,
-					    gfp_t gfp_flags,
-					    int node)
-{
-	kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
-			bytes_req, bytes_alloc, gfp_flags, node);
-}
-
-static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
-{
-	kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
-}
-
-static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
-{
-	kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
-}
-
-static int kmemtrace_start_probes(void)
-{
-	int err;
-
-	err = register_trace_kmalloc(kmemtrace_kmalloc);
-	if (err)
-		return err;
-	err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
-	if (err)
-		return err;
-	err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
-	if (err)
-		return err;
-	err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
-	if (err)
-		return err;
-	err = register_trace_kfree(kmemtrace_kfree);
-	if (err)
-		return err;
-	err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
-
-	return err;
-}
-
-static void kmemtrace_stop_probes(void)
-{
-	unregister_trace_kmalloc(kmemtrace_kmalloc);
-	unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
-	unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
-	unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
-	unregister_trace_kfree(kmemtrace_kfree);
-	unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
-}
-
-static int kmem_trace_init(struct trace_array *tr)
-{
-	kmemtrace_array = tr;
-
-	tracing_reset_online_cpus(tr);
-
-	kmemtrace_start_probes();
-
-	return 0;
-}
-
-static void kmem_trace_reset(struct trace_array *tr)
-{
-	kmemtrace_stop_probes();
-}
-
-static void kmemtrace_headers(struct seq_file *s)
-{
-	/* Don't need headers for the original kmemtrace output */
-	if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
-		return;
-
-	seq_printf(s, "#\n");
-	seq_printf(s, "# ALLOC  TYPE  REQ   GIVEN  FLAGS     "
-			"      POINTER         NODE    CALLER\n");
-	seq_printf(s, "# FREE   |      |     |       |       "
-			"       |   |            |        |\n");
-	seq_printf(s, "# |\n\n");
-}
-
-/*
- * The following functions give the original output from kmemtrace,
- * plus the origin CPU, since reordering occurs in-kernel now.
- */
-
-#define KMEMTRACE_USER_ALLOC	0
-#define KMEMTRACE_USER_FREE	1
-
-struct kmemtrace_user_event {
-	u8			event_id;
-	u8			type_id;
-	u16			event_size;
-	u32			cpu;
-	u64			timestamp;
-	unsigned long		call_site;
-	unsigned long		ptr;
-};
-
-struct kmemtrace_user_event_alloc {
-	size_t			bytes_req;
-	size_t			bytes_alloc;
-	unsigned		gfp_flags;
-	int			node;
-};
-
-static enum print_line_t
-kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
-{
-	struct trace_seq *s = &iter->seq;
-	struct kmemtrace_alloc_entry *entry;
-	int ret;
-
-	trace_assign_type(entry, iter->ent);
-
-	ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
-	    "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
-	    entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
-	    (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
-	    (unsigned long)entry->gfp_flags, entry->node);
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free(struct trace_iterator *iter, int flags)
-{
-	struct trace_seq *s = &iter->seq;
-	struct kmemtrace_free_entry *entry;
-	int ret;
-
-	trace_assign_type(entry, iter->ent);
-
-	ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
-			       entry->type_id, (void *)entry->call_site,
-			       (unsigned long)entry->ptr);
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
-{
-	struct trace_seq *s = &iter->seq;
-	struct kmemtrace_alloc_entry *entry;
-	struct kmemtrace_user_event *ev;
-	struct kmemtrace_user_event_alloc *ev_alloc;
-
-	trace_assign_type(entry, iter->ent);
-
-	ev = trace_seq_reserve(s, sizeof(*ev));
-	if (!ev)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ev->event_id		= KMEMTRACE_USER_ALLOC;
-	ev->type_id		= entry->type_id;
-	ev->event_size		= sizeof(*ev) + sizeof(*ev_alloc);
-	ev->cpu			= iter->cpu;
-	ev->timestamp		= iter->ts;
-	ev->call_site		= entry->call_site;
-	ev->ptr			= (unsigned long)entry->ptr;
-
-	ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
-	if (!ev_alloc)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ev_alloc->bytes_req	= entry->bytes_req;
-	ev_alloc->bytes_alloc	= entry->bytes_alloc;
-	ev_alloc->gfp_flags	= entry->gfp_flags;
-	ev_alloc->node		= entry->node;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
-{
-	struct trace_seq *s = &iter->seq;
-	struct kmemtrace_free_entry *entry;
-	struct kmemtrace_user_event *ev;
-
-	trace_assign_type(entry, iter->ent);
-
-	ev = trace_seq_reserve(s, sizeof(*ev));
-	if (!ev)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ev->event_id		= KMEMTRACE_USER_FREE;
-	ev->type_id		= entry->type_id;
-	ev->event_size		= sizeof(*ev);
-	ev->cpu			= iter->cpu;
-	ev->timestamp		= iter->ts;
-	ev->call_site		= entry->call_site;
-	ev->ptr			= (unsigned long)entry->ptr;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-/* The two other following provide a more minimalistic output */
-static enum print_line_t
-kmemtrace_print_alloc_compress(struct trace_iterator *iter)
-{
-	struct kmemtrace_alloc_entry *entry;
-	struct trace_seq *s = &iter->seq;
-	int ret;
-
-	trace_assign_type(entry, iter->ent);
-
-	/* Alloc entry */
-	ret = trace_seq_printf(s, "  +      ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Type */
-	switch (entry->type_id) {
-	case KMEMTRACE_TYPE_KMALLOC:
-		ret = trace_seq_printf(s, "K   ");
-		break;
-	case KMEMTRACE_TYPE_CACHE:
-		ret = trace_seq_printf(s, "C   ");
-		break;
-	case KMEMTRACE_TYPE_PAGES:
-		ret = trace_seq_printf(s, "P   ");
-		break;
-	default:
-		ret = trace_seq_printf(s, "?   ");
-	}
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Requested */
-	ret = trace_seq_printf(s, "%4zu   ", entry->bytes_req);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Allocated */
-	ret = trace_seq_printf(s, "%4zu   ", entry->bytes_alloc);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Flags
-	 * TODO: would be better to see the name of the GFP flag names
-	 */
-	ret = trace_seq_printf(s, "%08x   ", entry->gfp_flags);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Pointer to allocated */
-	ret = trace_seq_printf(s, "0x%tx   ", (ptrdiff_t)entry->ptr);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Node and call site*/
-	ret = trace_seq_printf(s, "%4d   %pf\n", entry->node,
-						 (void *)entry->call_site);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t
-kmemtrace_print_free_compress(struct trace_iterator *iter)
-{
-	struct kmemtrace_free_entry *entry;
-	struct trace_seq *s = &iter->seq;
-	int ret;
-
-	trace_assign_type(entry, iter->ent);
-
-	/* Free entry */
-	ret = trace_seq_printf(s, "  -      ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Type */
-	switch (entry->type_id) {
-	case KMEMTRACE_TYPE_KMALLOC:
-		ret = trace_seq_printf(s, "K     ");
-		break;
-	case KMEMTRACE_TYPE_CACHE:
-		ret = trace_seq_printf(s, "C     ");
-		break;
-	case KMEMTRACE_TYPE_PAGES:
-		ret = trace_seq_printf(s, "P     ");
-		break;
-	default:
-		ret = trace_seq_printf(s, "?     ");
-	}
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Skip requested/allocated/flags */
-	ret = trace_seq_printf(s, "                       ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Pointer to allocated */
-	ret = trace_seq_printf(s, "0x%tx   ", (ptrdiff_t)entry->ptr);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	/* Skip node and print call site*/
-	ret = trace_seq_printf(s, "       %pf\n", (void *)entry->call_site);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
-{
-	struct trace_entry *entry = iter->ent;
-
-	if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
-		return TRACE_TYPE_UNHANDLED;
-
-	switch (entry->type) {
-	case TRACE_KMEM_ALLOC:
-		return kmemtrace_print_alloc_compress(iter);
-	case TRACE_KMEM_FREE:
-		return kmemtrace_print_free_compress(iter);
-	default:
-		return TRACE_TYPE_UNHANDLED;
-	}
-}
-
-static struct trace_event kmem_trace_alloc = {
-	.type			= TRACE_KMEM_ALLOC,
-	.trace			= kmemtrace_print_alloc,
-	.binary			= kmemtrace_print_alloc_user,
-};
-
-static struct trace_event kmem_trace_free = {
-	.type			= TRACE_KMEM_FREE,
-	.trace			= kmemtrace_print_free,
-	.binary			= kmemtrace_print_free_user,
-};
-
-static struct tracer kmem_tracer __read_mostly = {
-	.name			= "kmemtrace",
-	.init			= kmem_trace_init,
-	.reset			= kmem_trace_reset,
-	.print_line		= kmemtrace_print_line,
-	.print_header		= kmemtrace_headers,
-	.flags			= &kmem_tracer_flags
-};
-
-void kmemtrace_init(void)
-{
-	/* earliest opportunity to start kmem tracing */
-}
-
-static int __init init_kmem_tracer(void)
-{
-	if (!register_ftrace_event(&kmem_trace_alloc)) {
-		pr_warning("Warning: could not register kmem events\n");
-		return 1;
-	}
-
-	if (!register_ftrace_event(&kmem_trace_free)) {
-		pr_warning("Warning: could not register kmem events\n");
-		return 1;
-	}
-
-	if (register_tracer(&kmem_tracer) != 0) {
-		pr_warning("Warning: could not register the kmem tracer\n");
-		return 1;
-	}
-
-	return 0;
-}
-device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5d6398b..d850dc2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -10,7 +10,6 @@
 #include <linux/tracepoint.h>
 #include <linux/ftrace.h>
 #include <trace/boot.h>
-#include <linux/kmemtrace.h>
 #include <linux/hw_breakpoint.h>
 
 #include <linux/trace_seq.h>
@@ -43,12 +42,6 @@ enum trace_type {
 	__TRACE_LAST_TYPE,
 };
 
-enum kmemtrace_type_id {
-	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
-	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
-	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */
-};
-
 extern struct tracer boot_tracer;
 
 #undef __field
@@ -230,10 +223,6 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
 		IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
-		IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
-			  TRACE_KMEM_ALLOC);	\
-		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
-			  TRACE_KMEM_FREE);	\
 		IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
 		__ftrace_bad_type();					\
 	} while (0)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f..7564b56 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -330,41 +330,6 @@ FTRACE_ENTRY(hw_branch, hw_branch_entry,
 	F_printk("from: %llx to: %llx", __entry->from, __entry->to)
 );
 
-FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
-
-	TRACE_KMEM_ALLOC,
-
-	F_STRUCT(
-		__field(	enum kmemtrace_type_id,	type_id		)
-		__field(	unsigned long,		call_site	)
-		__field(	const void *,		ptr		)
-		__field(	size_t,			bytes_req	)
-		__field(	size_t,			bytes_alloc	)
-		__field(	gfp_t,			gfp_flags	)
-		__field(	int,			node		)
-	),
-
-	F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
-		 " flags:%x node:%d",
-		 __entry->type_id, __entry->call_site, __entry->ptr,
-		 __entry->bytes_req, __entry->bytes_alloc,
-		 __entry->gfp_flags, __entry->node)
-);
-
-FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
-
-	TRACE_KMEM_FREE,
-
-	F_STRUCT(
-		__field(	enum kmemtrace_type_id,	type_id		)
-		__field(	unsigned long,		call_site	)
-		__field(	const void *,		ptr		)
-	),
-
-	F_printk("type:%u call_site:%lx ptr:%p",
-		 __entry->type_id, __entry->call_site, __entry->ptr)
-);
-
 FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
 
 	TRACE_KSYM,
diff --git a/mm/slab.c b/mm/slab.c
index 7dfa481..07e4072 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,7 +102,6 @@
 #include	<linux/cpu.h>
 #include	<linux/sysctl.h>
 #include	<linux/module.h>
-#include	<linux/kmemtrace.h>
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
 #include	<linux/uaccess.h>
@@ -490,14 +489,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 
 #endif
 
-#ifdef CONFIG_KMEMTRACE
-size_t slab_buffer_size(struct kmem_cache *cachep)
-{
-	return cachep->buffer_size;
-}
-EXPORT_SYMBOL(slab_buffer_size);
-#endif
-
 /*
  * Do not go above this order unless 0 objects fit into the slab.
  */
@@ -3558,13 +3549,11 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
 
-#ifdef CONFIG_KMEMTRACE
 void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
 {
 	return __cache_alloc(cachep, flags, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(kmem_cache_alloc_notrace);
-#endif
 
 /**
  * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
diff --git a/mm/slub.c b/mm/slub.c
index 4996fc7..97f2da6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -17,7 +17,6 @@
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/kmemtrace.h>
 #include <linux/kmemcheck.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
@@ -1754,13 +1753,11 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
 
-#ifdef CONFIG_KMEMTRACE
 void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
 {
 	return slab_alloc(s, gfpflags, -1, _RET_IP_);
 }
 EXPORT_SYMBOL(kmem_cache_alloc_notrace);
-#endif
 
 #ifdef CONFIG_NUMA
 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
-- 
1.6.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  7:53 [RFC][PATCH 1/2] perf: Add 'perf kmem' tool Li Zefan
  2009-11-20  7:53 ` [PATCH 2/2] tracing: Remove kmemtrace tracer Li Zefan
@ 2009-11-20  8:14 ` Ingo Molnar
  2009-11-20  8:19   ` Pekka Enberg
  2009-11-20  8:20   ` Li Zefan
  2009-11-20  8:54 ` [tip:perf/core] " tip-bot for Li Zefan
  2009-11-20  8:55 ` [RFC][PATCH 1/2] " Ingo Molnar
  3 siblings, 2 replies; 32+ messages in thread
From: Ingo Molnar @ 2009-11-20  8:14 UTC (permalink / raw)
  To: Li Zefan, Arnaldo Carvalho de Melo
  Cc: Frederic Weisbecker, Steven Rostedt, Peter Zijlstra,
	Pekka Enberg, Eduard - Gabriel Munteanu, LKML, linux-mm


* Li Zefan <lizf@cn.fujitsu.com> wrote:

> This tool is mostly a perf version of kmemtrace-user.
> 
> The following information is provided by this tool:
> 
> - the total amount of memory allocated and fragmentation per call-site
> - the total amount of memory allocated and fragmentation per allocation
> - total memory allocated and fragmentation in the collected dataset
> - ...
> 
>  # ./perf kmem record
>  ^C
>  # ./perf kmem --stat caller --stat alloc -l 10
> 
>  ------------------------------------------------------------------------------
>  Callsite          | Total_alloc/Per |  Total_req/Per  |  Hit   | Fragmentation
>  ------------------------------------------------------------------------------
>  0xc052f37a        |   790528/4096   |   790528/4096   |    193 |    0.000%
>  0xc0541d70        |   524288/4096   |   524288/4096   |    128 |    0.000%
>  0xc051cc68        |   481600/200    |   481600/200    |   2408 |    0.000%
>  0xc0572623        |   297444/676    |   297440/676    |    440 |    0.001%
>  0xc05399f1        |    73476/164    |    73472/164    |    448 |    0.005%
>  0xc05243bf        |    51456/256    |    51456/256    |    201 |    0.000%
>  0xc0730d0e        |    31844/497    |    31808/497    |     64 |    0.113%
>  0xc0734c4e        |    17152/256    |    17152/256    |     67 |    0.000%
>  0xc0541a6d        |    16384/128    |    16384/128    |    128 |    0.000%
>  0xc059c217        |    13120/40     |    13120/40     |    328 |    0.000%
>  0xc0501ee6        |    11264/88     |    11264/88     |    128 |    0.000%
>  0xc04daef0        |     7504/682    |     7128/648    |     11 |    5.011%
>  0xc04e14a3        |     4216/191    |     4216/191    |     22 |    0.000%
>  0xc05041ca        |     3524/44     |     3520/44     |     80 |    0.114%
>  0xc0734fa3        |     2104/701    |     1620/540    |      3 |   23.004%
>  0xc05ec9f1        |     2024/289    |     2016/288    |      7 |    0.395%
>  0xc06a1999        |     1792/256    |     1792/256    |      7 |    0.000%
>  0xc0463b9a        |     1584/144    |     1584/144    |     11 |    0.000%
>  0xc0541eb0        |     1024/16     |     1024/16     |     64 |    0.000%
>  0xc06a19ac        |      896/128    |      896/128    |      7 |    0.000%
>  0xc05721c0        |      772/12     |      768/12     |     64 |    0.518%
>  0xc054d1e6        |      288/57     |      280/56     |      5 |    2.778%
>  0xc04b562e        |      157/31     |      154/30     |      5 |    1.911%
>  0xc04b536f        |       80/16     |       80/16     |      5 |    0.000%
>  0xc05855a0        |       64/64     |       36/36     |      1 |   43.750%
>  ------------------------------------------------------------------------------
> 
>  ------------------------------------------------------------------------------
>  Alloc Ptr         | Total_alloc/Per |  Total_req/Per  |  Hit   | Fragmentation
>  ------------------------------------------------------------------------------
>  0xda884000        |  1052672/4096   |  1052672/4096   |    257 |    0.000%
>  0xda886000        |   262144/4096   |   262144/4096   |     64 |    0.000%
>  0xf60c7c00        |    16512/128    |    16512/128    |    129 |    0.000%
>  0xf59a4118        |    13120/40     |    13120/40     |    328 |    0.000%
>  0xdfd4b2c0        |    11264/88     |    11264/88     |    128 |    0.000%
>  0xf5274600        |     7680/256    |     7680/256    |     30 |    0.000%
>  0xe8395000        |     5948/594    |     5464/546    |     10 |    8.137%
>  0xe59c3c00        |     5748/479    |     5712/476    |     12 |    0.626%
>  0xf4cd1a80        |     3524/44     |     3520/44     |     80 |    0.114%
>  0xe5bd1600        |     2892/482    |     2856/476    |      6 |    1.245%
>  ...               | ...             | ...             | ...    | ...
>  ------------------------------------------------------------------------------
> 
> SUMMARY
> =======
> Total bytes requested: 2333626
> Total bytes allocated: 2353712
> Total bytes wasted on internal fragmentation: 20086
> Internal fragmentation: 0.853375%

Very impressive!

> TODO:
> - show sym+offset in 'callsite' column

The way to print symbolic information for the 'callsite' column is to 
fill in and walk the thread->DSO->symbol trees that all perf tools 
maintain:

	/* simplified, without error handling */

	ip = event->ip.ip;

	thread = threads__findnew(event->ip.pid);

	map = thread__find_map(thread, ip);

	ip = map->map_ip(map, ip); /* map absolute RIP into DSO-relative one */

	sym = map__find_symbol(map, ip, symbol_filter);

then sym->name is the string that can be printed out. This works in a 
symmetric way for both kernel-space and user-space symbols. (Call-chain 
information can be captured and displayed too.)

( 'Alloc Ptr' symbolization is harder, but it would be useful too i 
  think, to map it back to the slab cache name. )

> - show cross node allocation stats

I checked and we appear to have all the right events for that - the node 
ID is being traced consistently AFAICS.

> - collect more useful stats?
> - ...

Pekka, Eduard and the other slab hackers might have ideas about what 
other stats they generally like to see to judge the health of a workload 
(or system).

If this iteration looks good to the slab folks then i can apply it as-is 
and we can do the other changes relative to that. It looks good to me as 
a first step, and it's functional already.

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  8:14 ` [RFC][PATCH 1/2] perf: Add 'perf kmem' tool Ingo Molnar
@ 2009-11-20  8:19   ` Pekka Enberg
  2009-11-20  8:30     ` Ingo Molnar
  2009-11-20  8:20   ` Li Zefan
  1 sibling, 1 reply; 32+ messages in thread
From: Pekka Enberg @ 2009-11-20  8:19 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Li Zefan, Arnaldo Carvalho de Melo, Frederic Weisbecker,
	Steven Rostedt, Peter Zijlstra, Eduard - Gabriel Munteanu, LKML,
	linux-mm

On Fri, Nov 20, 2009 at 10:14 AM, Ingo Molnar <mingo@elte.hu> wrote:
> Pekka, Eduard and the other slab hackers might have ideas about what
> other stats they generally like to see to judge the health of a workload
> (or system).

kmalloc()/kfree() CPU ping-pong call-sites (i.e. alloc and free
happening on different CPUs) is one interesting metric we haven't
implemented yet. Valgrind massif tool type of output graph would be
helpful as well:

http://valgrind.org/docs/manual/ms-manual.html

On Fri, Nov 20, 2009 at 10:14 AM, Ingo Molnar <mingo@elte.hu> wrote:
> If this iteration looks good to the slab folks then i can apply it as-is
> and we can do the other changes relative to that. It looks good to me as
> a first step, and it's functional already.

Yeah, looks OK to me as the first step. Patch 2 looks premature,
though, looking at the output of "perf kmem" from patch 1.

Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>

                        Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] tracing: Remove kmemtrace tracer
  2009-11-20  7:53 ` [PATCH 2/2] tracing: Remove kmemtrace tracer Li Zefan
@ 2009-11-20  8:20   ` Pekka Enberg
  2009-11-20  8:24     ` Li Zefan
  0 siblings, 1 reply; 32+ messages in thread
From: Pekka Enberg @ 2009-11-20  8:20 UTC (permalink / raw)
  To: Li Zefan
  Cc: Ingo Molnar, Frederic Weisbecker, Steven Rostedt, Peter Zijlstra,
	Eduard - Gabriel Munteanu, LKML, linux-mm

Li Zefan kirjoitti:
> The kmem trace events can replace the functions of kmemtrace
> tracer.
> 
> And kmemtrace-user can be modified to use trace events.
> (But after cloning the git repo, I found it's still based on
> the original relay version..), not to mention now we have
> 'perf kmem' tool.
> 
> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>

NAK for the time being. "perf kmem" output is not yet as good as that of 
kmemtrace-user.

			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  8:14 ` [RFC][PATCH 1/2] perf: Add 'perf kmem' tool Ingo Molnar
  2009-11-20  8:19   ` Pekka Enberg
@ 2009-11-20  8:20   ` Li Zefan
  1 sibling, 0 replies; 32+ messages in thread
From: Li Zefan @ 2009-11-20  8:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Arnaldo Carvalho de Melo, Frederic Weisbecker, Steven Rostedt,
	Peter Zijlstra, Pekka Enberg, Eduard - Gabriel Munteanu, LKML,
	linux-mm

>> TODO:
>> - show sym+offset in 'callsite' column
> 
> The way to print symbolic information for the 'callsite' column is to 
> fill in and walk the thread->DSO->symbol trees that all perf tools 
> maintain:
> 
> 	/* simplified, without error handling */
> 
> 	ip = event->ip.ip;
> 
> 	thread = threads__findnew(event->ip.pid);
> 
> 	map = thread__find_map(thread, ip);
> 
> 	ip = map->map_ip(map, ip); /* map absolute RIP into DSO-relative one */
> 
> 	sym = map__find_symbol(map, ip, symbol_filter);
> 
> then sym->name is the string that can be printed out. This works in a 
> symmetric way for both kernel-space and user-space symbols. (Call-chain 
> information can be captured and displayed too.)
> 
> ( 'Alloc Ptr' symbolization is harder, but it would be useful too i 
>   think, to map it back to the slab cache name. )
> 

Thanks.

I was lazy to figure it out by myself. ;)

>> - show cross node allocation stats
> 
> I checked and we appear to have all the right events for that - the node 
> ID is being traced consistently AFAICS.
> 

Actually kmemtrace-user shows this stats, but in a wrong way.
It doesn't map cpu_nr to node.

>> - collect more useful stats?
>> - ...
> 
> Pekka, Eduard and the other slab hackers might have ideas about what 
> other stats they generally like to see to judge the health of a workload 
> (or system).
> 
> If this iteration looks good to the slab folks then i can apply it as-is 
> and we can do the other changes relative to that. It looks good to me as 
> a first step, and it's functional already.
> 

Thanks!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] tracing: Remove kmemtrace tracer
  2009-11-20  8:20   ` Pekka Enberg
@ 2009-11-20  8:24     ` Li Zefan
  2009-11-20  8:27       ` Pekka Enberg
  0 siblings, 1 reply; 32+ messages in thread
From: Li Zefan @ 2009-11-20  8:24 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Ingo Molnar, Frederic Weisbecker, Steven Rostedt, Peter Zijlstra,
	Eduard - Gabriel Munteanu, LKML, linux-mm

于 2009年11月20日 16:20, Pekka Enberg 写道:
> Li Zefan kirjoitti:
>> The kmem trace events can replace the functions of kmemtrace
>> tracer.
>>
>> And kmemtrace-user can be modified to use trace events.
>> (But after cloning the git repo, I found it's still based on
>> the original relay version..), not to mention now we have
>> 'perf kmem' tool.
>>
>> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
> 
> NAK for the time being. "perf kmem" output is not yet as good as that of
> kmemtrace-user.
> 

But is the current kmemtrace-user based on kmemtrace?

>From the git repo:
	http://repo.or.cz/w/kmemtrace-user.git

I found it's still based on relay.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] tracing: Remove kmemtrace tracer
  2009-11-20  8:24     ` Li Zefan
@ 2009-11-20  8:27       ` Pekka Enberg
  2009-11-20  8:31         ` Li Zefan
  0 siblings, 1 reply; 32+ messages in thread
From: Pekka Enberg @ 2009-11-20  8:27 UTC (permalink / raw)
  To: Li Zefan
  Cc: Ingo Molnar, Frederic Weisbecker, Steven Rostedt, Peter Zijlstra,
	Eduard - Gabriel Munteanu, LKML, linux-mm

On Fri, Nov 20, 2009 at 10:24 AM, Li Zefan <lizf@cn.fujitsu.com> wrote:
> 于 2009年11月20日 16:20, Pekka Enberg 写道:
>> Li Zefan kirjoitti:
>>> The kmem trace events can replace the functions of kmemtrace
>>> tracer.
>>>
>>> And kmemtrace-user can be modified to use trace events.
>>> (But after cloning the git repo, I found it's still based on
>>> the original relay version..), not to mention now we have
>>> 'perf kmem' tool.
>>>
>>> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
>>
>> NAK for the time being. "perf kmem" output is not yet as good as that of
>> kmemtrace-user.
>>
>
> But is the current kmemtrace-user based on kmemtrace?
>
> From the git repo:
>        http://repo.or.cz/w/kmemtrace-user.git
>
> I found it's still based on relay.

The "ftrace-temp" branch seems to have the ftrace based version in it. Eduard?

                      Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  8:19   ` Pekka Enberg
@ 2009-11-20  8:30     ` Ingo Molnar
  2009-11-20  8:47       ` Pekka Enberg
  0 siblings, 1 reply; 32+ messages in thread
From: Ingo Molnar @ 2009-11-20  8:30 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Li Zefan, Arnaldo Carvalho de Melo, Frederic Weisbecker,
	Steven Rostedt, Peter Zijlstra, Eduard - Gabriel Munteanu, LKML,
	linux-mm


* Pekka Enberg <penberg@cs.helsinki.fi> wrote:

> On Fri, Nov 20, 2009 at 10:14 AM, Ingo Molnar <mingo@elte.hu> wrote:
> > Pekka, Eduard and the other slab hackers might have ideas about what
> > other stats they generally like to see to judge the health of a workload
> > (or system).
> 
> kmalloc()/kfree() CPU ping-pong call-sites (i.e. alloc and free
> happening on different CPUs) is one interesting metric we haven't
> implemented yet. Valgrind massif tool type of output graph would be
> helpful as well:
> 
> http://valgrind.org/docs/manual/ms-manual.html
> 
> On Fri, Nov 20, 2009 at 10:14 AM, Ingo Molnar <mingo@elte.hu> wrote:
> > If this iteration looks good to the slab folks then i can apply it as-is
> > and we can do the other changes relative to that. It looks good to me as
> > a first step, and it's functional already.
> 
> Yeah, looks OK to me as the first step. Patch 2 looks premature,
> though, looking at the output of "perf kmem" from patch 1.
> 
> Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>

Great - thanks for the quick ack!

Regarding patch 2 - can we set some definitive benchmark threshold for 
that? I.e. a list of must-have features in 'perf kmem' before we can do 
it? 100% information and analysis equivalency with kmemtrace-user tool? 
Eduard, what do you think?

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2] tracing: Remove kmemtrace tracer
  2009-11-20  8:27       ` Pekka Enberg
@ 2009-11-20  8:31         ` Li Zefan
  0 siblings, 0 replies; 32+ messages in thread
From: Li Zefan @ 2009-11-20  8:31 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Ingo Molnar, Frederic Weisbecker, Steven Rostedt, Peter Zijlstra,
	Eduard - Gabriel Munteanu, LKML, linux-mm

Pekka Enberg wrote:
> On Fri, Nov 20, 2009 at 10:24 AM, Li Zefan <lizf@cn.fujitsu.com> wrote:
>> 于 2009年11月20日 16:20, Pekka Enberg 写道:
>>> Li Zefan kirjoitti:
>>>> The kmem trace events can replace the functions of kmemtrace
>>>> tracer.
>>>>
>>>> And kmemtrace-user can be modified to use trace events.
>>>> (But after cloning the git repo, I found it's still based on
>>>> the original relay version..), not to mention now we have
>>>> 'perf kmem' tool.
>>>>
>>>> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
>>> NAK for the time being. "perf kmem" output is not yet as good as that of
>>> kmemtrace-user.
>>>
>> But is the current kmemtrace-user based on kmemtrace?
>>
>> From the git repo:
>>        http://repo.or.cz/w/kmemtrace-user.git
>>
>> I found it's still based on relay.
> 
> The "ftrace-temp" branch seems to have the ftrace based version in it. Eduard?
> 

Thanks. I just overlooked the branch..

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  8:30     ` Ingo Molnar
@ 2009-11-20  8:47       ` Pekka Enberg
  2009-11-20  8:53         ` Li Zefan
  2009-11-20  9:01         ` Ingo Molnar
  0 siblings, 2 replies; 32+ messages in thread
From: Pekka Enberg @ 2009-11-20  8:47 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Li Zefan, Arnaldo Carvalho de Melo, Frederic Weisbecker,
	Steven Rostedt, Peter Zijlstra, Eduard - Gabriel Munteanu, LKML,
	linux-mm

Ingo Molnar kirjoitti:
> Regarding patch 2 - can we set some definitive benchmark threshold for 
> that? I.e. a list of must-have features in 'perf kmem' before we can do 
> it? 100% information and analysis equivalency with kmemtrace-user tool? 

I'd be interested to hear Eduard's comment on that.

That said, I'll try to find some time to test "perf kmem" and provide 
feedback on that. I can ACK the patch when I'm happy with the output. :-)

I'm mostly interested in two scenarios: (1) getting a nice report on 
worst fragmented call-sites (perf kmem needs symbol lookup) and (2) 
doing "perf kmem record" on machine A (think embedded here) and then 
"perf kmem report" on machine B. I haven't tried kmemtrace-user for a 
while but it did support both of them quite nicely at some point.

			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  8:47       ` Pekka Enberg
@ 2009-11-20  8:53         ` Li Zefan
  2009-11-20  9:03           ` Ingo Molnar
  2009-11-20  9:01         ` Ingo Molnar
  1 sibling, 1 reply; 32+ messages in thread
From: Li Zefan @ 2009-11-20  8:53 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Frederic Weisbecker,
	Steven Rostedt, Peter Zijlstra, Eduard - Gabriel Munteanu, LKML,
	linux-mm

Pekka Enberg wrote:
> Ingo Molnar kirjoitti:
>> Regarding patch 2 - can we set some definitive benchmark threshold for
>> that? I.e. a list of must-have features in 'perf kmem' before we can
>> do it? 100% information and analysis equivalency with kmemtrace-user
>> tool? 
> 
> I'd be interested to hear Eduard's comment on that.
> 
> That said, I'll try to find some time to test "perf kmem" and provide
> feedback on that. I can ACK the patch when I'm happy with the output. :-)
> 
> I'm mostly interested in two scenarios: (1) getting a nice report on
> worst fragmented call-sites (perf kmem needs symbol lookup) and

This will be done in next version.

> (2) doing "perf kmem record" on machine A (think embedded here) and then
> "perf kmem report" on machine B. I haven't tried kmemtrace-user for a
> while but it did support both of them quite nicely at some point.
> 

Everything needed and machine-specific will be recorded in perf.data,
so this should already been supported. I'll try it.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [tip:perf/core] perf: Add 'perf kmem' tool
  2009-11-20  7:53 [RFC][PATCH 1/2] perf: Add 'perf kmem' tool Li Zefan
  2009-11-20  7:53 ` [PATCH 2/2] tracing: Remove kmemtrace tracer Li Zefan
  2009-11-20  8:14 ` [RFC][PATCH 1/2] perf: Add 'perf kmem' tool Ingo Molnar
@ 2009-11-20  8:54 ` tip-bot for Li Zefan
  2009-11-20  8:55 ` [RFC][PATCH 1/2] " Ingo Molnar
  3 siblings, 0 replies; 32+ messages in thread
From: tip-bot for Li Zefan @ 2009-11-20  8:54 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, penberg, lizf, peterz, eduard.munteanu,
	fweisbec, rostedt, tglx, linux-mm, mingo

Commit-ID:  ba77c9e11111a172c9e8687fe16a6a173a61916f
Gitweb:     http://git.kernel.org/tip/ba77c9e11111a172c9e8687fe16a6a173a61916f
Author:     Li Zefan <lizf@cn.fujitsu.com>
AuthorDate: Fri, 20 Nov 2009 15:53:25 +0800
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Fri, 20 Nov 2009 09:51:41 +0100

perf: Add 'perf kmem' tool

This tool is mostly a perf version of kmemtrace-user.

The following information is provided by this tool:

 - the total amount of memory allocated and fragmentation per
   call-site

 - the total amount of memory allocated and fragmentation per
   allocation

 - total memory allocated and fragmentation in the collected
   dataset - ...

Sample output:

 # ./perf kmem record
 ^C
 # ./perf kmem --stat caller --stat alloc -l 10

 ------------------------------------------------------------------------------
 Callsite          | Total_alloc/Per |  Total_req/Per  |  Hit   | Fragmentation
 ------------------------------------------------------------------------------
 0xc052f37a        |   790528/4096   |   790528/4096   |    193 |    0.000%
 0xc0541d70        |   524288/4096   |   524288/4096   |    128 |    0.000%
 0xc051cc68        |   481600/200    |   481600/200    |   2408 |    0.000%
 0xc0572623        |   297444/676    |   297440/676    |    440 |    0.001%
 0xc05399f1        |    73476/164    |    73472/164    |    448 |    0.005%
 0xc05243bf        |    51456/256    |    51456/256    |    201 |    0.000%
 0xc0730d0e        |    31844/497    |    31808/497    |     64 |    0.113%
 0xc0734c4e        |    17152/256    |    17152/256    |     67 |    0.000%
 0xc0541a6d        |    16384/128    |    16384/128    |    128 |    0.000%
 0xc059c217        |    13120/40     |    13120/40     |    328 |    0.000%
 0xc0501ee6        |    11264/88     |    11264/88     |    128 |    0.000%
 0xc04daef0        |     7504/682    |     7128/648    |     11 |    5.011%
 0xc04e14a3        |     4216/191    |     4216/191    |     22 |    0.000%
 0xc05041ca        |     3524/44     |     3520/44     |     80 |    0.114%
 0xc0734fa3        |     2104/701    |     1620/540    |      3 |   23.004%
 0xc05ec9f1        |     2024/289    |     2016/288    |      7 |    0.395%
 0xc06a1999        |     1792/256    |     1792/256    |      7 |    0.000%
 0xc0463b9a        |     1584/144    |     1584/144    |     11 |    0.000%
 0xc0541eb0        |     1024/16     |     1024/16     |     64 |    0.000%
 0xc06a19ac        |      896/128    |      896/128    |      7 |    0.000%
 0xc05721c0        |      772/12     |      768/12     |     64 |    0.518%
 0xc054d1e6        |      288/57     |      280/56     |      5 |    2.778%
 0xc04b562e        |      157/31     |      154/30     |      5 |    1.911%
 0xc04b536f        |       80/16     |       80/16     |      5 |    0.000%
 0xc05855a0        |       64/64     |       36/36     |      1 |   43.750%
 ------------------------------------------------------------------------------

 ------------------------------------------------------------------------------
 Alloc Ptr         | Total_alloc/Per |  Total_req/Per  |  Hit   | Fragmentation
 ------------------------------------------------------------------------------
 0xda884000        |  1052672/4096   |  1052672/4096   |    257 |    0.000%
 0xda886000        |   262144/4096   |   262144/4096   |     64 |    0.000%
 0xf60c7c00        |    16512/128    |    16512/128    |    129 |    0.000%
 0xf59a4118        |    13120/40     |    13120/40     |    328 |    0.000%
 0xdfd4b2c0        |    11264/88     |    11264/88     |    128 |    0.000%
 0xf5274600        |     7680/256    |     7680/256    |     30 |    0.000%
 0xe8395000        |     5948/594    |     5464/546    |     10 |    8.137%
 0xe59c3c00        |     5748/479    |     5712/476    |     12 |    0.626%
 0xf4cd1a80        |     3524/44     |     3520/44     |     80 |    0.114%
 0xe5bd1600        |     2892/482    |     2856/476    |      6 |    1.245%
 ...               | ...             | ...             | ...    | ...
 ------------------------------------------------------------------------------

SUMMARY
=======
Total bytes requested: 2333626
Total bytes allocated: 2353712
Total bytes wasted on internal fragmentation: 20086
Internal fragmentation: 0.853375%

TODO:
- show sym+offset in 'callsite' column
- show cross node allocation stats
- collect more useful stats?
- ...

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: linux-mm@kvack.org <linux-mm@kvack.org>
LKML-Reference: <4B064AF5.9060208@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Makefile       |    1 +
 tools/perf/builtin-kmem.c |  578 +++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/builtin.h      |    1 +
 tools/perf/perf.c         |   27 +-
 4 files changed, 594 insertions(+), 13 deletions(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 3f0666a..d7198c5 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -444,6 +444,7 @@ BUILTIN_OBJS += builtin-timechart.o
 BUILTIN_OBJS += builtin-top.o
 BUILTIN_OBJS += builtin-trace.o
 BUILTIN_OBJS += builtin-probe.o
+BUILTIN_OBJS += builtin-kmem.o
 
 PERFLIBS = $(LIB_FILE)
 
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
new file mode 100644
index 0000000..f315b05
--- /dev/null
+++ b/tools/perf/builtin-kmem.c
@@ -0,0 +1,578 @@
+#include "builtin.h"
+#include "perf.h"
+
+#include "util/util.h"
+#include "util/cache.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/header.h"
+
+#include "util/parse-options.h"
+#include "util/trace-event.h"
+
+#include "util/debug.h"
+#include "util/data_map.h"
+
+#include <linux/rbtree.h>
+
+struct alloc_stat;
+typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
+
+static char const		*input_name = "perf.data";
+
+static struct perf_header	*header;
+static u64			sample_type;
+
+static int			alloc_flag;
+static int			caller_flag;
+
+sort_fn_t			alloc_sort_fn;
+sort_fn_t			caller_sort_fn;
+
+static int			alloc_lines = -1;
+static int			caller_lines = -1;
+
+static char			*cwd;
+static int			cwdlen;
+
+struct alloc_stat {
+	union {
+		struct {
+			char	*name;
+			u64	call_site;
+		};
+		u64	ptr;
+	};
+	u64	bytes_req;
+	u64	bytes_alloc;
+	u32	hit;
+
+	struct rb_node node;
+};
+
+static struct rb_root root_alloc_stat;
+static struct rb_root root_alloc_sorted;
+static struct rb_root root_caller_stat;
+static struct rb_root root_caller_sorted;
+
+static unsigned long total_requested, total_allocated;
+
+struct raw_event_sample {
+	u32 size;
+	char data[0];
+};
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static void insert_alloc_stat(unsigned long ptr,
+			      int bytes_req, int bytes_alloc)
+{
+	struct rb_node **node = &root_alloc_stat.rb_node;
+	struct rb_node *parent = NULL;
+	struct alloc_stat *data = NULL;
+
+	if (!alloc_flag)
+		return;
+
+	while (*node) {
+		parent = *node;
+		data = rb_entry(*node, struct alloc_stat, node);
+
+		if (ptr > data->ptr)
+			node = &(*node)->rb_right;
+		else if (ptr < data->ptr)
+			node = &(*node)->rb_left;
+		else
+			break;
+	}
+
+	if (data && data->ptr == ptr) {
+		data->hit++;
+		data->bytes_req += bytes_req;
+		data->bytes_alloc += bytes_req;
+	} else {
+		data = malloc(sizeof(*data));
+		data->ptr = ptr;
+		data->hit = 1;
+		data->bytes_req = bytes_req;
+		data->bytes_alloc = bytes_alloc;
+
+		rb_link_node(&data->node, parent, node);
+		rb_insert_color(&data->node, &root_alloc_stat);
+	}
+}
+
+static void insert_caller_stat(unsigned long call_site,
+			      int bytes_req, int bytes_alloc)
+{
+	struct rb_node **node = &root_caller_stat.rb_node;
+	struct rb_node *parent = NULL;
+	struct alloc_stat *data = NULL;
+
+	if (!caller_flag)
+		return;
+
+	while (*node) {
+		parent = *node;
+		data = rb_entry(*node, struct alloc_stat, node);
+
+		if (call_site > data->call_site)
+			node = &(*node)->rb_right;
+		else if (call_site < data->call_site)
+			node = &(*node)->rb_left;
+		else
+			break;
+	}
+
+	if (data && data->call_site == call_site) {
+		data->hit++;
+		data->bytes_req += bytes_req;
+		data->bytes_alloc += bytes_req;
+	} else {
+		data = malloc(sizeof(*data));
+		data->call_site = call_site;
+		data->hit = 1;
+		data->bytes_req = bytes_req;
+		data->bytes_alloc = bytes_alloc;
+
+		rb_link_node(&data->node, parent, node);
+		rb_insert_color(&data->node, &root_caller_stat);
+	}
+}
+
+static void process_alloc_event(struct raw_event_sample *raw,
+				struct event *event,
+				int cpu __used,
+				u64 timestamp __used,
+				struct thread *thread __used,
+				int node __used)
+{
+	unsigned long call_site;
+	unsigned long ptr;
+	int bytes_req;
+	int bytes_alloc;
+
+	ptr = raw_field_value(event, "ptr", raw->data);
+	call_site = raw_field_value(event, "call_site", raw->data);
+	bytes_req = raw_field_value(event, "bytes_req", raw->data);
+	bytes_alloc = raw_field_value(event, "bytes_alloc", raw->data);
+
+	insert_alloc_stat(ptr, bytes_req, bytes_alloc);
+	insert_caller_stat(call_site, bytes_req, bytes_alloc);
+
+	total_requested += bytes_req;
+	total_allocated += bytes_alloc;
+}
+
+static void process_free_event(struct raw_event_sample *raw __used,
+			       struct event *event __used,
+			       int cpu __used,
+			       u64 timestamp __used,
+			       struct thread *thread __used)
+{
+}
+
+static void
+process_raw_event(event_t *raw_event __used, void *more_data,
+		  int cpu, u64 timestamp, struct thread *thread)
+{
+	struct raw_event_sample *raw = more_data;
+	struct event *event;
+	int type;
+
+	type = trace_parse_common_type(raw->data);
+	event = trace_find_event(type);
+
+	if (!strcmp(event->name, "kmalloc") ||
+	    !strcmp(event->name, "kmem_cache_alloc")) {
+		process_alloc_event(raw, event, cpu, timestamp, thread, 0);
+		return;
+	}
+
+	if (!strcmp(event->name, "kmalloc_node") ||
+	    !strcmp(event->name, "kmem_cache_alloc_node")) {
+		process_alloc_event(raw, event, cpu, timestamp, thread, 1);
+		return;
+	}
+
+	if (!strcmp(event->name, "kfree") ||
+	    !strcmp(event->name, "kmem_cache_free")) {
+		process_free_event(raw, event, cpu, timestamp, thread);
+		return;
+	}
+}
+
+static int
+process_sample_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	u64 ip = event->ip.ip;
+	u64 timestamp = -1;
+	u32 cpu = -1;
+	u64 period = 1;
+	void *more_data = event->ip.__more_data;
+	struct thread *thread = threads__findnew(event->ip.pid);
+
+	if (sample_type & PERF_SAMPLE_TIME) {
+		timestamp = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	if (sample_type & PERF_SAMPLE_CPU) {
+		cpu = *(u32 *)more_data;
+		more_data += sizeof(u32);
+		more_data += sizeof(u32); /* reserved */
+	}
+
+	if (sample_type & PERF_SAMPLE_PERIOD) {
+		period = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid, event->ip.tid,
+		(void *)(long)ip,
+		(long long)period);
+
+	if (thread == NULL) {
+		pr_debug("problem processing %d event, skipping it.\n",
+			 event->header.type);
+		return -1;
+	}
+
+	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	process_raw_event(event, more_data, cpu, timestamp, thread);
+
+	return 0;
+}
+
+static int sample_type_check(u64 type)
+{
+	sample_type = type;
+
+	if (!(sample_type & PERF_SAMPLE_RAW)) {
+		fprintf(stderr,
+			"No trace sample to read. Did you call perf record "
+			"without -R?");
+		return -1;
+	}
+
+	return 0;
+}
+
+static struct perf_file_handler file_handler = {
+	.process_sample_event	= process_sample_event,
+	.process_comm_event	= process_comm_event,
+	.sample_type_check	= sample_type_check,
+};
+
+static int read_events(void)
+{
+	register_idle_thread();
+	register_perf_file_handler(&file_handler);
+
+	return mmap_dispatch_perf_file(&header, input_name, 0, 0,
+				       &cwdlen, &cwd);
+}
+
+static double fragmentation(unsigned long n_req, unsigned long n_alloc)
+{
+	if (n_alloc == 0)
+		return 0.0;
+	else
+		return 100.0 - (100.0 * n_req / n_alloc);
+}
+
+static void __print_result(struct rb_root *root, int n_lines, int is_caller)
+{
+	struct rb_node *next;
+
+	printf("\n ------------------------------------------------------------------------------\n");
+	if (is_caller)
+		printf(" Callsite          |");
+	else
+		printf(" Alloc Ptr         |");
+	printf(" Total_alloc/Per |  Total_req/Per  |  Hit   | Fragmentation\n");
+	printf(" ------------------------------------------------------------------------------\n");
+
+	next = rb_first(root);
+
+	while (next && n_lines--) {
+		struct alloc_stat *data;
+
+		data = rb_entry(next, struct alloc_stat, node);
+
+		printf(" %-16p  | %8llu/%-6lu | %8llu/%-6lu | %6lu | %8.3f%%\n",
+		       is_caller ? (void *)(unsigned long)data->call_site :
+				   (void *)(unsigned long)data->ptr,
+		       (unsigned long long)data->bytes_alloc,
+		       (unsigned long)data->bytes_alloc / data->hit,
+		       (unsigned long long)data->bytes_req,
+		       (unsigned long)data->bytes_req / data->hit,
+		       (unsigned long)data->hit,
+		       fragmentation(data->bytes_req, data->bytes_alloc));
+
+		next = rb_next(next);
+	}
+
+	if (n_lines == -1)
+		printf(" ...               | ...             | ...             | ...    | ...   \n");
+
+	printf(" ------------------------------------------------------------------------------\n");
+}
+
+static void print_summary(void)
+{
+	printf("\nSUMMARY\n=======\n");
+	printf("Total bytes requested: %lu\n", total_requested);
+	printf("Total bytes allocated: %lu\n", total_allocated);
+	printf("Total bytes wasted on internal fragmentation: %lu\n",
+	       total_allocated - total_requested);
+	printf("Internal fragmentation: %f%%\n",
+	       fragmentation(total_requested, total_allocated));
+}
+
+static void print_result(void)
+{
+	if (caller_flag)
+		__print_result(&root_caller_sorted, caller_lines, 1);
+	if (alloc_flag)
+		__print_result(&root_alloc_sorted, alloc_lines, 0);
+	print_summary();
+}
+
+static void sort_insert(struct rb_root *root, struct alloc_stat *data,
+			sort_fn_t sort_fn)
+{
+	struct rb_node **new = &(root->rb_node);
+	struct rb_node *parent = NULL;
+
+	while (*new) {
+		struct alloc_stat *this;
+		int cmp;
+
+		this = rb_entry(*new, struct alloc_stat, node);
+		parent = *new;
+
+		cmp = sort_fn(data, this);
+
+		if (cmp > 0)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	rb_link_node(&data->node, parent, new);
+	rb_insert_color(&data->node, root);
+}
+
+static void __sort_result(struct rb_root *root, struct rb_root *root_sorted,
+			  sort_fn_t sort_fn)
+{
+	struct rb_node *node;
+	struct alloc_stat *data;
+
+	for (;;) {
+		node = rb_first(root);
+		if (!node)
+			break;
+
+		rb_erase(node, root);
+		data = rb_entry(node, struct alloc_stat, node);
+		sort_insert(root_sorted, data, sort_fn);
+	}
+}
+
+static void sort_result(void)
+{
+	__sort_result(&root_alloc_stat, &root_alloc_sorted, alloc_sort_fn);
+	__sort_result(&root_caller_stat, &root_caller_sorted, caller_sort_fn);
+}
+
+static int __cmd_kmem(void)
+{
+	setup_pager();
+	read_events();
+	sort_result();
+	print_result();
+
+	return 0;
+}
+
+static const char * const kmem_usage[] = {
+	"perf kmem [<options>] {record}",
+	NULL
+};
+
+
+static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	if (l->ptr < r->ptr)
+		return -1;
+	else if (l->ptr > r->ptr)
+		return 1;
+	return 0;
+}
+
+static int callsite_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	if (l->call_site < r->call_site)
+		return -1;
+	else if (l->call_site > r->call_site)
+		return 1;
+	return 0;
+}
+
+static int bytes_cmp(struct alloc_stat *l, struct alloc_stat *r)
+{
+	if (l->bytes_alloc < r->bytes_alloc)
+		return -1;
+	else if (l->bytes_alloc > r->bytes_alloc)
+		return 1;
+	return 0;
+}
+
+static int parse_sort_opt(const struct option *opt __used,
+			  const char *arg, int unset __used)
+{
+	sort_fn_t sort_fn;
+
+	if (!arg)
+		return -1;
+
+	if (strcmp(arg, "ptr") == 0)
+		sort_fn = ptr_cmp;
+	else if (strcmp(arg, "call_site") == 0)
+		sort_fn = callsite_cmp;
+	else if (strcmp(arg, "bytes") == 0)
+		sort_fn = bytes_cmp;
+	else
+		return -1;
+
+	if (caller_flag > alloc_flag)
+		caller_sort_fn = sort_fn;
+	else
+		alloc_sort_fn = sort_fn;
+
+	return 0;
+}
+
+static int parse_stat_opt(const struct option *opt __used,
+			  const char *arg, int unset __used)
+{
+	if (!arg)
+		return -1;
+
+	if (strcmp(arg, "alloc") == 0)
+		alloc_flag = (caller_flag + 1);
+	else if (strcmp(arg, "caller") == 0)
+		caller_flag = (alloc_flag + 1);
+	else
+		return -1;
+	return 0;
+}
+
+static int parse_line_opt(const struct option *opt __used,
+			  const char *arg, int unset __used)
+{
+	int lines;
+
+	if (!arg)
+		return -1;
+
+	lines = strtoul(arg, NULL, 10);
+
+	if (caller_flag > alloc_flag)
+		caller_lines = lines;
+	else
+		alloc_lines = lines;
+
+	return 0;
+}
+
+static const struct option kmem_options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		   "input file name"),
+	OPT_CALLBACK(0, "stat", NULL, "<alloc>|<caller>",
+		     "stat selector, Pass 'alloc' or 'caller'.",
+		     parse_stat_opt),
+	OPT_CALLBACK('s', "sort", NULL, "key",
+		     "sort by key: ptr, call_site, hit, bytes",
+		     parse_sort_opt),
+	OPT_CALLBACK('l', "line", NULL, "num",
+		     "show n lins",
+		     parse_line_opt),
+	OPT_END()
+};
+
+static const char *record_args[] = {
+	"record",
+	"-a",
+	"-R",
+	"-M",
+	"-f",
+	"-c", "1",
+	"-e", "kmem:kmalloc",
+	"-e", "kmem:kmalloc_node",
+	"-e", "kmem:kfree",
+	"-e", "kmem:kmem_cache_alloc",
+	"-e", "kmem:kmem_cache_alloc_node",
+	"-e", "kmem:kmem_cache_free",
+};
+
+static int __cmd_record(int argc, const char **argv)
+{
+	unsigned int rec_argc, i, j;
+	const char **rec_argv;
+
+	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+	rec_argv = calloc(rec_argc + 1, sizeof(char *));
+
+	for (i = 0; i < ARRAY_SIZE(record_args); i++)
+		rec_argv[i] = strdup(record_args[i]);
+
+	for (j = 1; j < (unsigned int)argc; j++, i++)
+		rec_argv[i] = argv[j];
+
+	return cmd_record(i, rec_argv, NULL);
+}
+
+int cmd_kmem(int argc, const char **argv, const char *prefix __used)
+{
+	symbol__init(0);
+
+	argc = parse_options(argc, argv, kmem_options, kmem_usage, 0);
+
+	if (argc && !strncmp(argv[0], "rec", 3))
+		return __cmd_record(argc, argv);
+	else if (argc)
+		usage_with_options(kmem_usage, kmem_options);
+
+	if (!alloc_sort_fn)
+		alloc_sort_fn = bytes_cmp;
+	if (!caller_sort_fn)
+		caller_sort_fn = bytes_cmp;
+
+	return __cmd_kmem();
+}
+
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 9b02d85..a3d8bf6 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -28,5 +28,6 @@ extern int cmd_top(int argc, const char **argv, const char *prefix);
 extern int cmd_trace(int argc, const char **argv, const char *prefix);
 extern int cmd_version(int argc, const char **argv, const char *prefix);
 extern int cmd_probe(int argc, const char **argv, const char *prefix);
+extern int cmd_kmem(int argc, const char **argv, const char *prefix);
 
 #endif
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 89b82ac..cf64049 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -285,20 +285,21 @@ static void handle_internal_command(int argc, const char **argv)
 {
 	const char *cmd = argv[0];
 	static struct cmd_struct commands[] = {
-		{ "help", cmd_help, 0 },
-		{ "list", cmd_list, 0 },
 		{ "buildid-list", cmd_buildid_list, 0 },
-		{ "record", cmd_record, 0 },
-		{ "report", cmd_report, 0 },
-		{ "bench", cmd_bench, 0 },
-		{ "stat", cmd_stat, 0 },
-		{ "timechart", cmd_timechart, 0 },
-		{ "top", cmd_top, 0 },
-		{ "annotate", cmd_annotate, 0 },
-		{ "version", cmd_version, 0 },
-		{ "trace", cmd_trace, 0 },
-		{ "sched", cmd_sched, 0 },
-		{ "probe", cmd_probe, 0 },
+		{ "help",	cmd_help,	0 },
+		{ "list",	cmd_list,	0 },
+		{ "record",	cmd_record,	0 },
+		{ "report",	cmd_report,	0 },
+		{ "bench",	cmd_bench,	0 },
+		{ "stat",	cmd_stat,	0 },
+		{ "timechart",	cmd_timechart,	0 },
+		{ "top",	cmd_top,	0 },
+		{ "annotate",	cmd_annotate,	0 },
+		{ "version",	cmd_version,	0 },
+		{ "trace",	cmd_trace,	0 },
+		{ "sched",	cmd_sched,	0 },
+		{ "probe",	cmd_probe,	0 },
+		{ "kmem",	cmd_kmem,	0 },
 	};
 	unsigned int i;
 	static const char ext[] = STRIP_EXTENSION;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  7:53 [RFC][PATCH 1/2] perf: Add 'perf kmem' tool Li Zefan
                   ` (2 preceding siblings ...)
  2009-11-20  8:54 ` [tip:perf/core] " tip-bot for Li Zefan
@ 2009-11-20  8:55 ` Ingo Molnar
  2009-11-20  9:11   ` Li Zefan
  3 siblings, 1 reply; 32+ messages in thread
From: Ingo Molnar @ 2009-11-20  8:55 UTC (permalink / raw)
  To: Li Zefan
  Cc: Frederic Weisbecker, Steven Rostedt, Peter Zijlstra,
	Pekka Enberg, Eduard - Gabriel Munteanu, LKML, linux-mm


* Li Zefan <lizf@cn.fujitsu.com> wrote:

> ---
>  tools/perf/Makefile       |    1 +
>  tools/perf/builtin-kmem.c |  578 +++++++++++++++++++++++++++++++++++++++++++++
>  tools/perf/builtin.h      |    1 +
>  tools/perf/perf.c         |   27 +-
>  4 files changed, 594 insertions(+), 13 deletions(-)
>  create mode 100644 tools/perf/builtin-kmem.c

btw., you might want to add it to command-list.txt as well (in a future 
patch), so that 'kmem' shows up in the default 'perf' output.

Plus a Documentation/perf-kmem.txt file will make sure that 'perf help 
kmem' and 'perf kmem --help' displays a help page, etc.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  8:47       ` Pekka Enberg
  2009-11-20  8:53         ` Li Zefan
@ 2009-11-20  9:01         ` Ingo Molnar
  2009-11-20  9:15           ` Pekka Enberg
  1 sibling, 1 reply; 32+ messages in thread
From: Ingo Molnar @ 2009-11-20  9:01 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Li Zefan, Arnaldo Carvalho de Melo, Frederic Weisbecker,
	Steven Rostedt, Peter Zijlstra, Eduard - Gabriel Munteanu, LKML,
	linux-mm


* Pekka Enberg <penberg@cs.helsinki.fi> wrote:

> Ingo Molnar kirjoitti:
> >Regarding patch 2 - can we set some definitive benchmark threshold
> >for that? I.e. a list of must-have features in 'perf kmem' before
> >we can do it? 100% information and analysis equivalency with
> >kmemtrace-user tool?
> 
> I'd be interested to hear Eduard's comment on that.
> 
> That said, I'll try to find some time to test "perf kmem" and
> provide feedback on that. I can ACK the patch when I'm happy with
> the output. :-)
> 
> I'm mostly interested in two scenarios: (1) getting a nice report on
> worst fragmented call-sites (perf kmem needs symbol lookup) and (2)
> doing "perf kmem record" on machine A (think embedded here) and then
> "perf kmem report" on machine B. I haven't tried kmemtrace-user for
> a while but it did support both of them quite nicely at some point.

The perf.data can be copied over and to get off-side kernel symbol 
resolution you can specify the kernel vmlinux via -k/--vmlinux to perf 
report, then perf will look up the symbols from that vmlinux.

Cross word-size data files should work fine - cross-endian probably 
needs a few fixes.

Plus off-site user-space symbols need more work, right now we dont 
embedd them in the perf.data. It would need a symbol lookup + embedd-it 
pass in perf record (perhaps available as a separate 'perf archive' 
command as well), and some smarts on the reporting side to make use of 
them. (Probably a copy of all relevant DSOs is what works best - that 
enables off-site annotate as well.)

But ... even without that, perf is really fast and is supposed to build 
fine even in minimal (embedded) environments, so you can run it on the 
embedded board too. That's useful to get live inspection features like 
'perf top', 'perf stat' and 'perf probe' anyway.

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  8:53         ` Li Zefan
@ 2009-11-20  9:03           ` Ingo Molnar
  2009-11-20  9:14             ` Li Zefan
  2009-11-20 14:42             ` Arnaldo Carvalho de Melo
  0 siblings, 2 replies; 32+ messages in thread
From: Ingo Molnar @ 2009-11-20  9:03 UTC (permalink / raw)
  To: Li Zefan
  Cc: Pekka Enberg, Arnaldo Carvalho de Melo, Frederic Weisbecker,
	Steven Rostedt, Peter Zijlstra, Eduard - Gabriel Munteanu, LKML,
	linux-mm


* Li Zefan <lizf@cn.fujitsu.com> wrote:

> > (2) doing "perf kmem record" on machine A (think embedded here) and 
> > then "perf kmem report" on machine B. I haven't tried kmemtrace-user 
> > for a while but it did support both of them quite nicely at some 
> > point.
> 
> Everything needed and machine-specific will be recorded in perf.data, 
> so this should already been supported. I'll try it.

Right now the DSOs are not recorded in the perf.data - but it would be 
useful to add it and to turn perf.data into a self-sufficient capture of 
all relevant data, which can be analyzed on any box.

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  8:55 ` [RFC][PATCH 1/2] " Ingo Molnar
@ 2009-11-20  9:11   ` Li Zefan
  0 siblings, 0 replies; 32+ messages in thread
From: Li Zefan @ 2009-11-20  9:11 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Frederic Weisbecker, Steven Rostedt, Peter Zijlstra,
	Pekka Enberg, Eduard - Gabriel Munteanu, LKML, linux-mm

Ingo Molnar wrote:
> * Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
>> ---
>>  tools/perf/Makefile       |    1 +
>>  tools/perf/builtin-kmem.c |  578 +++++++++++++++++++++++++++++++++++++++++++++
>>  tools/perf/builtin.h      |    1 +
>>  tools/perf/perf.c         |   27 +-
>>  4 files changed, 594 insertions(+), 13 deletions(-)
>>  create mode 100644 tools/perf/builtin-kmem.c
> 
> btw., you might want to add it to command-list.txt as well (in a future 
> patch), so that 'kmem' shows up in the default 'perf' output.
> 
> Plus a Documentation/perf-kmem.txt file will make sure that 'perf help 
> kmem' and 'perf kmem --help' displays a help page, etc.
> 

I planed to do these after collecting comments and getting positive
responses. So sure, I'll post further patches.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  9:03           ` Ingo Molnar
@ 2009-11-20  9:14             ` Li Zefan
  2009-11-20 14:42             ` Arnaldo Carvalho de Melo
  1 sibling, 0 replies; 32+ messages in thread
From: Li Zefan @ 2009-11-20  9:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Arnaldo Carvalho de Melo, Frederic Weisbecker,
	Steven Rostedt, Peter Zijlstra, Eduard - Gabriel Munteanu, LKML,
	linux-mm

Ingo Molnar wrote:
> * Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
>>> (2) doing "perf kmem record" on machine A (think embedded here) and 
>>> then "perf kmem report" on machine B. I haven't tried kmemtrace-user 
>>> for a while but it did support both of them quite nicely at some 
>>> point.
>> Everything needed and machine-specific will be recorded in perf.data, 
>> so this should already been supported. I'll try it.
> 
> Right now the DSOs are not recorded in the perf.data - but it would be 
> useful to add it and to turn perf.data into a self-sufficient capture of 
> all relevant data, which can be analyzed on any box.
> 

But still 'perf kmem' should function better than kmemtrace-user,
since the latter records no more than raw trace data.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  9:01         ` Ingo Molnar
@ 2009-11-20  9:15           ` Pekka Enberg
  2009-11-20 10:13             ` Ingo Molnar
  0 siblings, 1 reply; 32+ messages in thread
From: Pekka Enberg @ 2009-11-20  9:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Li Zefan, Arnaldo Carvalho de Melo, Frederic Weisbecker,
	Steven Rostedt, Peter Zijlstra, Eduard - Gabriel Munteanu, LKML,
	linux-mm

Hi Ingo,

On Fri, Nov 20, 2009 at 11:01 AM, Ingo Molnar <mingo@elte.hu> wrote:
> But ... even without that, perf is really fast and is supposed to build
> fine even in minimal (embedded) environments, so you can run it on the
> embedded board too. That's useful to get live inspection features like
> 'perf top', 'perf stat' and 'perf probe' anyway.

Maybe I'm just too damn lazy but if I don't go through the trouble of
building my kernel on the box, I sure don't want to do that for perf
either. Anyway, I'm sure we can fix "perf kmem" to support what
kmemtrace-user does so it's not an issue.

                        Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  9:15           ` Pekka Enberg
@ 2009-11-20 10:13             ` Ingo Molnar
  2009-11-20 10:31               ` Pekka Enberg
  0 siblings, 1 reply; 32+ messages in thread
From: Ingo Molnar @ 2009-11-20 10:13 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Li Zefan, Arnaldo Carvalho de Melo, Frederic Weisbecker,
	Steven Rostedt, Peter Zijlstra, Eduard - Gabriel Munteanu, LKML,
	linux-mm


* Pekka Enberg <penberg@cs.helsinki.fi> wrote:

> Hi Ingo,
> 
> On Fri, Nov 20, 2009 at 11:01 AM, Ingo Molnar <mingo@elte.hu> wrote:
> > But ... even without that, perf is really fast and is supposed to build
> > fine even in minimal (embedded) environments, so you can run it on the
> > embedded board too. That's useful to get live inspection features like
> > 'perf top', 'perf stat' and 'perf probe' anyway.
> 
> Maybe I'm just too damn lazy but if I don't go through the trouble of
> building my kernel on the box, I sure don't want to do that for perf
> either. [...]

Well you'll need 'perf' on that box anyway, to be able to do 'perf kmem 
record'.

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20 10:13             ` Ingo Molnar
@ 2009-11-20 10:31               ` Pekka Enberg
  2009-11-20 10:49                 ` Ingo Molnar
  0 siblings, 1 reply; 32+ messages in thread
From: Pekka Enberg @ 2009-11-20 10:31 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Li Zefan, Arnaldo Carvalho de Melo, Frederic Weisbecker,
	Steven Rostedt, Peter Zijlstra, Eduard - Gabriel Munteanu, LKML,
	linux-mm

Ingo Molnar kirjoitti:
> * Pekka Enberg <penberg@cs.helsinki.fi> wrote:
> 
>> Hi Ingo,
>>
>> On Fri, Nov 20, 2009 at 11:01 AM, Ingo Molnar <mingo@elte.hu> wrote:
>>> But ... even without that, perf is really fast and is supposed to build
>>> fine even in minimal (embedded) environments, so you can run it on the
>>> embedded board too. That's useful to get live inspection features like
>>> 'perf top', 'perf stat' and 'perf probe' anyway.
>> Maybe I'm just too damn lazy but if I don't go through the trouble of
>> building my kernel on the box, I sure don't want to do that for perf
>> either. [...]
> 
> Well you'll need 'perf' on that box anyway, to be able to do 'perf kmem 
> record'.

/me turns brains on

You're right, of course. With kmemtrace-user, I just copied the raw 
trace file from /sys/kernel. I wonder if that's a good enough reason to 
keep kmemtrace bits around?

			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20 10:31               ` Pekka Enberg
@ 2009-11-20 10:49                 ` Ingo Molnar
  2009-11-23 14:46                   ` Steven Rostedt
  0 siblings, 1 reply; 32+ messages in thread
From: Ingo Molnar @ 2009-11-20 10:49 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Li Zefan, Arnaldo Carvalho de Melo, Frederic Weisbecker,
	Steven Rostedt, Peter Zijlstra, Eduard - Gabriel Munteanu, LKML,
	linux-mm


* Pekka Enberg <penberg@cs.helsinki.fi> wrote:

> Ingo Molnar kirjoitti:
> >* Pekka Enberg <penberg@cs.helsinki.fi> wrote:
> >
> >>Hi Ingo,
> >>
> >>On Fri, Nov 20, 2009 at 11:01 AM, Ingo Molnar <mingo@elte.hu> wrote:
> >>>But ... even without that, perf is really fast and is supposed to build
> >>>fine even in minimal (embedded) environments, so you can run it on the
> >>>embedded board too. That's useful to get live inspection features like
> >>>'perf top', 'perf stat' and 'perf probe' anyway.
> >>Maybe I'm just too damn lazy but if I don't go through the trouble of
> >>building my kernel on the box, I sure don't want to do that for perf
> >>either. [...]
> >
> >Well you'll need 'perf' on that box anyway, to be able to do 'perf
> >kmem record'.
> 
> /me turns brains on
> 
> You're right, of course. With kmemtrace-user, I just copied the raw 
> trace file from /sys/kernel. I wonder if that's a good enough reason 
> to keep kmemtrace bits around?

Not really. If then a light-weight recording app could be made but i'd 
rather wait for actual usecases to pop up.

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20  9:03           ` Ingo Molnar
  2009-11-20  9:14             ` Li Zefan
@ 2009-11-20 14:42             ` Arnaldo Carvalho de Melo
  2009-11-20 16:41               ` Ingo Molnar
  1 sibling, 1 reply; 32+ messages in thread
From: Arnaldo Carvalho de Melo @ 2009-11-20 14:42 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Li Zefan, Pekka Enberg, Frederic Weisbecker, Steven Rostedt,
	Peter Zijlstra, Eduard - Gabriel Munteanu, LKML, linux-mm

Em Fri, Nov 20, 2009 at 10:03:53AM +0100, Ingo Molnar escreveu:
> 
> * Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
> > > (2) doing "perf kmem record" on machine A (think embedded here) and 
> > > then "perf kmem report" on machine B. I haven't tried kmemtrace-user 
> > > for a while but it did support both of them quite nicely at some 
> > > point.
> > 
> > Everything needed and machine-specific will be recorded in perf.data, 
> > so this should already been supported. I'll try it.
> 
> Right now the DSOs are not recorded in the perf.data - but it would be 
> useful to add it and to turn perf.data into a self-sufficient capture of 
> all relevant data, which can be analyzed on any box.

Well, the DSOs are recorded in perf.data, just not its symtabs, but now
we have buildids, so we can ask for them to be installed on the other
machine and it'll all work. Or should. :)

For instance:

[root@doppio linux-2.6-tip]# perf buildid-list -i perf.data | egrep 'vmlinux|nfs|libc-'
ec8dd400904ddfcac8b1c343263a790f977159dc /lib64/libc-2.10.1.so
0da49504693a200583fda6f1b949e6d2f799e692 /usr/lib64/libnfsidmap_nsswitch.so.0.0.0
c90269c87eaf08559012a9fa29f60780743360cd /usr/lib64/libnfsidmap.so.0.3.0
18e7cc53db62a7d35e9d6f6c9ddc23017d38ee9a vmlinux
3982866276471cde6ac5821fdced42a7b1bfd848 [nfs]
1489007276a50005753e730198fd93dd05b2082f [nfsd]
5a128f082fe7fdcab6fb5d1b71935accb1f34383 [nfs_acl]
[root@doppio linux-2.6-tip]#

Now if I ask that the buildid for /usr/lib64/libnfsidmap.so.0.3.0 above
to be installed, like this:

[root@doppio linux-2.6-tip]# yum install /usr/lib/debug/.build-id/c9/0269c87eaf08559012a9fa29f60780743360cd
Loaded plugins: auto-update-debuginfo, refresh-packagekit
Found 44 installed debuginfo package(s)
Enabling fedora-debuginfo: Fedora 11 - x86_64 - Debug
Reading repository metadata in from local files
Enabling updates-debuginfo: Fedora 11 - x86_64 - Updates - Debug
Reading repository metadata in from local files
Setting up Install Process
Importing additional filelist information
Resolving Dependencies
--> Running transaction check
---> Package nfs-utils-lib-debuginfo.x86_64 0:1.1.4-6.fc11 set to be updated
--> Finished Dependency Resolution

Dependencies Resolved

========================================================================
 Package                   Arch   Version       Repository	 Size
========================================================================
Installing:
 nfs-utils-lib-debuginfo   x86_64 1.1.4-6.fc11  fedora-debuginfo 174 k

Transaction Summary
========================================================================
Install       1 Package(s)
Upgrade       0 Package(s)

Total download size: 174 k
Is this ok [y/N]:

So now we have:

1) 'perf record' records the build-ids into perf.data
2) 'perf buildid-list' list them, distro specific porcelain needed
   to do the equivalent to the yum install above.
3) 'perf report' will only use the symtab in a file that has the matching
   build-id, if a build-id is found in the perf.data header for a
   particular DSO.

So we have a mechanism that is already present in several distros
(build-id), that is in the kernel build process since ~2.6.23, and that
avoids using mismatching DSOs when resolving symbols.

I'm working on some of these aspects, but most of the infrastructure is
alreadu in tip.

- Arnaldo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20 14:42             ` Arnaldo Carvalho de Melo
@ 2009-11-20 16:41               ` Ingo Molnar
  2009-11-20 17:52                 ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 32+ messages in thread
From: Ingo Molnar @ 2009-11-20 16:41 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Li Zefan, Pekka Enberg, Frederic Weisbecker, Steven Rostedt,
	Peter Zijlstra, Eduard - Gabriel Munteanu, LKML, linux-mm


* Arnaldo Carvalho de Melo <acme@infradead.org> wrote:

> Em Fri, Nov 20, 2009 at 10:03:53AM +0100, Ingo Molnar escreveu:
> > 
> > * Li Zefan <lizf@cn.fujitsu.com> wrote:
> > 
> > > > (2) doing "perf kmem record" on machine A (think embedded here) and 
> > > > then "perf kmem report" on machine B. I haven't tried kmemtrace-user 
> > > > for a while but it did support both of them quite nicely at some 
> > > > point.
> > > 
> > > Everything needed and machine-specific will be recorded in perf.data, 
> > > so this should already been supported. I'll try it.
> > 
> > Right now the DSOs are not recorded in the perf.data - but it would be 
> > useful to add it and to turn perf.data into a self-sufficient capture of 
> > all relevant data, which can be analyzed on any box.
> 
> Well, the DSOs are recorded in perf.data, just not its symtabs, but now
> we have buildids, so we can ask for them to be installed on the other
> machine and it'll all work. Or should. :)
> 
> For instance:
> 
> [root@doppio linux-2.6-tip]# perf buildid-list -i perf.data | egrep 'vmlinux|nfs|libc-'
> ec8dd400904ddfcac8b1c343263a790f977159dc /lib64/libc-2.10.1.so
> 0da49504693a200583fda6f1b949e6d2f799e692 /usr/lib64/libnfsidmap_nsswitch.so.0.0.0
> c90269c87eaf08559012a9fa29f60780743360cd /usr/lib64/libnfsidmap.so.0.3.0
> 18e7cc53db62a7d35e9d6f6c9ddc23017d38ee9a vmlinux
> 3982866276471cde6ac5821fdced42a7b1bfd848 [nfs]
> 1489007276a50005753e730198fd93dd05b2082f [nfsd]
> 5a128f082fe7fdcab6fb5d1b71935accb1f34383 [nfs_acl]
> [root@doppio linux-2.6-tip]#
> 
> Now if I ask that the buildid for /usr/lib64/libnfsidmap.so.0.3.0 above
> to be installed, like this:
> 
> [root@doppio linux-2.6-tip]# yum install /usr/lib/debug/.build-id/c9/0269c87eaf08559012a9fa29f60780743360cd
> Loaded plugins: auto-update-debuginfo, refresh-packagekit
> Found 44 installed debuginfo package(s)
> Enabling fedora-debuginfo: Fedora 11 - x86_64 - Debug
> Reading repository metadata in from local files
> Enabling updates-debuginfo: Fedora 11 - x86_64 - Updates - Debug
> Reading repository metadata in from local files
> Setting up Install Process
> Importing additional filelist information
> Resolving Dependencies
> --> Running transaction check
> ---> Package nfs-utils-lib-debuginfo.x86_64 0:1.1.4-6.fc11 set to be updated
> --> Finished Dependency Resolution
> 
> Dependencies Resolved
> 
> ========================================================================
>  Package                   Arch   Version       Repository	 Size
> ========================================================================
> Installing:
>  nfs-utils-lib-debuginfo   x86_64 1.1.4-6.fc11  fedora-debuginfo 174 k
> 
> Transaction Summary
> ========================================================================
> Install       1 Package(s)
> Upgrade       0 Package(s)
> 
> Total download size: 174 k
> Is this ok [y/N]:
> 
> So now we have:
> 
> 1) 'perf record' records the build-ids into perf.data
> 2) 'perf buildid-list' list them, distro specific porcelain needed
>    to do the equivalent to the yum install above.
> 3) 'perf report' will only use the symtab in a file that has the matching
>    build-id, if a build-id is found in the perf.data header for a
>    particular DSO.
> 
> So we have a mechanism that is already present in several distros
> (build-id), that is in the kernel build process since ~2.6.23, and that
> avoids using mismatching DSOs when resolving symbols.

But what do we do if we have another box that runs say on a MIPS CPU, 
uses some minimal distro - and copy that perf.data over to an x86 box.

The idea is there to be some new mode of perf.data where all the 
relevant DSO contents (symtabs but also sections with instructions for 
perf annotate to work) are copied into perf.data, during or after data 
capture - on the box that does the recording.

Once we have everything embedded in the perf.data, analysis passes only 
have to work based on that particular perf.data - no external data.

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20 16:41               ` Ingo Molnar
@ 2009-11-20 17:52                 ` Arnaldo Carvalho de Melo
  2009-11-23  6:51                   ` Ingo Molnar
  0 siblings, 1 reply; 32+ messages in thread
From: Arnaldo Carvalho de Melo @ 2009-11-20 17:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Li Zefan, Pekka Enberg, Frederic Weisbecker, Steven Rostedt,
	Peter Zijlstra, Eduard - Gabriel Munteanu, LKML, linux-mm

Em Fri, Nov 20, 2009 at 05:41:10PM +0100, Ingo Molnar escreveu:
> > So we have a mechanism that is already present in several distros
> > (build-id), that is in the kernel build process since ~2.6.23, and that
> > avoids using mismatching DSOs when resolving symbols.
> 
> But what do we do if we have another box that runs say on a MIPS CPU, 
> uses some minimal distro - and copy that perf.data over to an x86 box.

There would be no problem, it would be just a matter of installing the
right -debuginfo packages, for MIPS.

Or the original, unstripped FS image sent to the machine with the MIPS
cpu, if there aren't -debuginfo packages.

Either one, the right DSOs would be found by the buildids.

There are other scenarios, like a binary that gets updated while a long
running perf record session runs, the way to differentiate between the
two DSOs wouldn't be the name, but the buildid.

> The idea is there to be some new mode of perf.data where all the 
> relevant DSO contents (symtabs but also sections with instructions for 
> perf annotate to work) are copied into perf.data, during or after data 
> capture - on the box that does the recording.
> 
> Once we have everything embedded in the perf.data, analysis passes only 
> have to work based on that particular perf.data - no external data.

Well, we can that, additionally, but think about stripped binaries, we
would lose potentially a lot because the symtabs on that small machine
would have poorer symtabs than the ones in an unstriped binary (or in a
-debuginfo package).

- Arnaldo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20 17:52                 ` Arnaldo Carvalho de Melo
@ 2009-11-23  6:51                   ` Ingo Molnar
  2009-11-23  7:22                     ` Peter Zijlstra
  2009-11-23 14:32                     ` Arnaldo Carvalho de Melo
  0 siblings, 2 replies; 32+ messages in thread
From: Ingo Molnar @ 2009-11-23  6:51 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: Li Zefan, Pekka Enberg, Frederic Weisbecker, Steven Rostedt,
	Peter Zijlstra, Eduard - Gabriel Munteanu, LKML, linux-mm


* Arnaldo Carvalho de Melo <acme@infradead.org> wrote:

> Em Fri, Nov 20, 2009 at 05:41:10PM +0100, Ingo Molnar escreveu:
> > > So we have a mechanism that is already present in several distros
> > > (build-id), that is in the kernel build process since ~2.6.23, and that
> > > avoids using mismatching DSOs when resolving symbols.
> > 
> > But what do we do if we have another box that runs say on a MIPS CPU, 
> > uses some minimal distro - and copy that perf.data over to an x86 box.
> 
> There would be no problem, it would be just a matter of installing the
> right -debuginfo packages, for MIPS.

I havent tried this - is this really possible to do on an x86 box, with 
a typical distro? Can i install say Fedora PowerPC debuginfo packages on 
an x86 box, while also having the x86 debuginfo packages there?

> Or the original, unstripped FS image sent to the machine with the MIPS 
> cpu, if there aren't -debuginfo packages.
> 
> Either one, the right DSOs would be found by the buildids.
> 
> There are other scenarios, like a binary that gets updated while a long
> running perf record session runs, the way to differentiate between the
> two DSOs wouldn't be the name, but the buildid.
> 
> > The idea is there to be some new mode of perf.data where all the 
> > relevant DSO contents (symtabs but also sections with instructions for 
> > perf annotate to work) are copied into perf.data, during or after data 
> > capture - on the box that does the recording.
> > 
> > Once we have everything embedded in the perf.data, analysis passes only 
> > have to work based on that particular perf.data - no external data.
> 
> Well, we can that, additionally, but think about stripped binaries, we 
> would lose potentially a lot because the symtabs on that small machine 
> would have poorer symtabs than the ones in an unstriped binary (or in 
> a -debuginfo package).

We should definitely use the widest and best quality information we can 
- if it's available.

So even if we 'inline' any information from the box, if there's better 
info available at the time of analysis, we should use that too.

Basically what matters is the principle of 'what is possible'.

If a user records on a box and analyses on a different box, and we end 
up not doing something (and printing an error or displaying an empty 
profile) that could reasonably have been done, then the user will be 
unhappy and we might lose that user.

The user wont be unhappy about us using a big set of data sources that 
we can recover information from transparently. The user will be unhappy 
if we insist on (and force) a certain form of information source - such 
as debuginfo.

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-23  6:51                   ` Ingo Molnar
@ 2009-11-23  7:22                     ` Peter Zijlstra
  2009-11-23  7:33                       ` Ingo Molnar
  2009-11-23 14:37                       ` Arnaldo Carvalho de Melo
  2009-11-23 14:32                     ` Arnaldo Carvalho de Melo
  1 sibling, 2 replies; 32+ messages in thread
From: Peter Zijlstra @ 2009-11-23  7:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Arnaldo Carvalho de Melo, Li Zefan, Pekka Enberg,
	Frederic Weisbecker, Steven Rostedt, Eduard - Gabriel Munteanu,
	LKML, linux-mm

On Mon, 2009-11-23 at 07:51 +0100, Ingo Molnar wrote:
> 
> * Arnaldo Carvalho de Melo <acme@infradead.org> wrote:
> 
> > Em Fri, Nov 20, 2009 at 05:41:10PM +0100, Ingo Molnar escreveu:
> > > > So we have a mechanism that is already present in several distros
> > > > (build-id), that is in the kernel build process since ~2.6.23, and that
> > > > avoids using mismatching DSOs when resolving symbols.
> > > 
> > > But what do we do if we have another box that runs say on a MIPS CPU, 
> > > uses some minimal distro - and copy that perf.data over to an x86 box.
> > 
> > There would be no problem, it would be just a matter of installing the
> > right -debuginfo packages, for MIPS.
> 
> I havent tried this - is this really possible to do on an x86 box, with 
> a typical distro? Can i install say Fedora PowerPC debuginfo packages on 
> an x86 box, while also having the x86 debuginfo packages there? 

The best option would be to allow to specify a chroot parameter, where
we can specify the embedded root filesystem on out machine.

I'm not even sure embedded distros even have this separate debug package
crazyness, you simply build the distro with or without debuginfo.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-23  7:22                     ` Peter Zijlstra
@ 2009-11-23  7:33                       ` Ingo Molnar
  2009-11-23 14:37                       ` Arnaldo Carvalho de Melo
  1 sibling, 0 replies; 32+ messages in thread
From: Ingo Molnar @ 2009-11-23  7:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Arnaldo Carvalho de Melo, Li Zefan, Pekka Enberg,
	Frederic Weisbecker, Steven Rostedt, Eduard - Gabriel Munteanu,
	LKML, linux-mm


* Peter Zijlstra <peterz@infradead.org> wrote:

> > I havent tried this - is this really possible to do on an x86 box, 
> > with a typical distro? Can i install say Fedora PowerPC debuginfo 
> > packages on an x86 box, while also having the x86 debuginfo packages 
> > there?
> 
> The best option would be to allow to specify a chroot parameter, where 
> we can specify the embedded root filesystem on out machine.
> 
> I'm not even sure embedded distros even have this separate debug 
> package crazyness, you simply build the distro with or without 
> debuginfo.

yes - we could use -R/--root (which opreport has as well), as a 
mandatory path prefix to all DSO/debuginfo searches.

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-23  6:51                   ` Ingo Molnar
  2009-11-23  7:22                     ` Peter Zijlstra
@ 2009-11-23 14:32                     ` Arnaldo Carvalho de Melo
  1 sibling, 0 replies; 32+ messages in thread
From: Arnaldo Carvalho de Melo @ 2009-11-23 14:32 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Li Zefan, Pekka Enberg, Frederic Weisbecker, Steven Rostedt,
	Peter Zijlstra, Eduard - Gabriel Munteanu, LKML, linux-mm

Em Mon, Nov 23, 2009 at 07:51:10AM +0100, Ingo Molnar escreveu:
> 
> * Arnaldo Carvalho de Melo <acme@infradead.org> wrote:
> 
> > Em Fri, Nov 20, 2009 at 05:41:10PM +0100, Ingo Molnar escreveu:
> > > > So we have a mechanism that is already present in several distros
> > > > (build-id), that is in the kernel build process since ~2.6.23, and that
> > > > avoids using mismatching DSOs when resolving symbols.
> > > 
> > > But what do we do if we have another box that runs say on a MIPS CPU, 
> > > uses some minimal distro - and copy that perf.data over to an x86 box.
> > 
> > There would be no problem, it would be just a matter of installing the
> > right -debuginfo packages, for MIPS.
> 
> I havent tried this - is this really possible to do on an x86 box, with 
> a typical distro? Can i install say Fedora PowerPC debuginfo packages on 
> an x86 box, while also having the x86 debuginfo packages there?

I should have added "in theory", as I haven't tested this as well using
the current tools, but it should :)
 
> > Or the original, unstripped FS image sent to the machine with the MIPS 
> > cpu, if there aren't -debuginfo packages.
> > 
> > Either one, the right DSOs would be found by the buildids.
> > 
> > There are other scenarios, like a binary that gets updated while a long
> > running perf record session runs, the way to differentiate between the
> > two DSOs wouldn't be the name, but the buildid.
> > 
> > > The idea is there to be some new mode of perf.data where all the 
> > > relevant DSO contents (symtabs but also sections with instructions for 
> > > perf annotate to work) are copied into perf.data, during or after data 
> > > capture - on the box that does the recording.
> > > 
> > > Once we have everything embedded in the perf.data, analysis passes only 
> > > have to work based on that particular perf.data - no external data.
> > 
> > Well, we can that, additionally, but think about stripped binaries, we 
> > would lose potentially a lot because the symtabs on that small machine 
> > would have poorer symtabs than the ones in an unstriped binary (or in 
> > a -debuginfo package).
> 
> We should definitely use the widest and best quality information we can 
> - if it's available.
> 
> So even if we 'inline' any information from the box, if there's better 
> info available at the time of analysis, we should use that too.
> 
> Basically what matters is the principle of 'what is possible'.
> 
> If a user records on a box and analyses on a different box, and we end 
> up not doing something (and printing an error or displaying an empty 
> profile) that could reasonably have been done, then the user will be 
> unhappy and we might lose that user.
> 
> The user wont be unhappy about us using a big set of data sources that 
> we can recover information from transparently. The user will be unhappy 
> if we insist on (and force) a certain form of information source - such 
> as debuginfo.

Sure thing, I'm thinking about how to encode the perf.data file inside
an ELF section while merging all symtabs to reduce size by sharing the
strings table, etc.

The dso__load routine already does that fallback from what is best
(debuginfo packages) to what is available (the symtab, dynsym tables in
the DSO itself), its just a matter of efficiently encoding the symtabs
into the perf.data file and that will be another source of symbols if
the preferred one (debuginfo) is not available.

- Arnaldo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-23  7:22                     ` Peter Zijlstra
  2009-11-23  7:33                       ` Ingo Molnar
@ 2009-11-23 14:37                       ` Arnaldo Carvalho de Melo
  1 sibling, 0 replies; 32+ messages in thread
From: Arnaldo Carvalho de Melo @ 2009-11-23 14:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Li Zefan, Pekka Enberg, Frederic Weisbecker,
	Steven Rostedt, Eduard - Gabriel Munteanu, LKML, linux-mm

Em Mon, Nov 23, 2009 at 08:22:21AM +0100, Peter Zijlstra escreveu:
> On Mon, 2009-11-23 at 07:51 +0100, Ingo Molnar wrote:
> > I havent tried this - is this really possible to do on an x86 box, with 
> > a typical distro? Can i install say Fedora PowerPC debuginfo packages on 
> > an x86 box, while also having the x86 debuginfo packages there? 
> 
> The best option would be to allow to specify a chroot parameter, where
> we can specify the embedded root filesystem on out machine.

yeah, I'm working now on a vmlinux_path, so that the symbol machinery in
perf looks at /lib/module/`uname -r`/build/vmlinux,
/usr/lib/debug/lib/modules/`uname -r`/vmlinux, ./vmlinux as a default or
in getenv("VMLINUX_PATH") if set. Being able to specify a
SYMTAB_PREFIX_PATH also should be possible.
 
> I'm not even sure embedded distros even have this separate debug package
> crazyness, you simply build the distro with or without debuginfo.

Whatever crazyness people usually do to find the files with matching,
richer symtabs we should support :)

- Arnaldo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-20 10:49                 ` Ingo Molnar
@ 2009-11-23 14:46                   ` Steven Rostedt
  2009-11-23 17:53                     ` Ingo Molnar
  0 siblings, 1 reply; 32+ messages in thread
From: Steven Rostedt @ 2009-11-23 14:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Li Zefan, Arnaldo Carvalho de Melo,
	Frederic Weisbecker, Peter Zijlstra, Eduard - Gabriel Munteanu,
	LKML, linux-mm

On Fri, 2009-11-20 at 11:49 +0100, Ingo Molnar wrote:
> > 
> > You're right, of course. With kmemtrace-user, I just copied the raw 
> > trace file from /sys/kernel. I wonder if that's a good enough reason 
> > to keep kmemtrace bits around?
> 
> Not really. If then a light-weight recording app could be made but i'd 
> rather wait for actual usecases to pop up.

Hmm, but isn't this an actual use case?

-- Steve


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [RFC][PATCH 1/2] perf: Add 'perf kmem' tool
  2009-11-23 14:46                   ` Steven Rostedt
@ 2009-11-23 17:53                     ` Ingo Molnar
  0 siblings, 0 replies; 32+ messages in thread
From: Ingo Molnar @ 2009-11-23 17:53 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Pekka Enberg, Li Zefan, Arnaldo Carvalho de Melo,
	Frederic Weisbecker, Peter Zijlstra, Eduard - Gabriel Munteanu,
	LKML, linux-mm


* Steven Rostedt <rostedt@goodmis.org> wrote:

> On Fri, 2009-11-20 at 11:49 +0100, Ingo Molnar wrote:
> > > 
> > > You're right, of course. With kmemtrace-user, I just copied the raw 
> > > trace file from /sys/kernel. I wonder if that's a good enough reason 
> > > to keep kmemtrace bits around?
> > 
> > Not really. If then a light-weight recording app could be made but 
> > i'd rather wait for actual usecases to pop up.
> 
> Hmm, but isn't this an actual use case?

Not really - perf record is pretty lightweight and you'd want perf for 
hands-on stats anyway.

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2009-11-23 17:53 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-11-20  7:53 [RFC][PATCH 1/2] perf: Add 'perf kmem' tool Li Zefan
2009-11-20  7:53 ` [PATCH 2/2] tracing: Remove kmemtrace tracer Li Zefan
2009-11-20  8:20   ` Pekka Enberg
2009-11-20  8:24     ` Li Zefan
2009-11-20  8:27       ` Pekka Enberg
2009-11-20  8:31         ` Li Zefan
2009-11-20  8:14 ` [RFC][PATCH 1/2] perf: Add 'perf kmem' tool Ingo Molnar
2009-11-20  8:19   ` Pekka Enberg
2009-11-20  8:30     ` Ingo Molnar
2009-11-20  8:47       ` Pekka Enberg
2009-11-20  8:53         ` Li Zefan
2009-11-20  9:03           ` Ingo Molnar
2009-11-20  9:14             ` Li Zefan
2009-11-20 14:42             ` Arnaldo Carvalho de Melo
2009-11-20 16:41               ` Ingo Molnar
2009-11-20 17:52                 ` Arnaldo Carvalho de Melo
2009-11-23  6:51                   ` Ingo Molnar
2009-11-23  7:22                     ` Peter Zijlstra
2009-11-23  7:33                       ` Ingo Molnar
2009-11-23 14:37                       ` Arnaldo Carvalho de Melo
2009-11-23 14:32                     ` Arnaldo Carvalho de Melo
2009-11-20  9:01         ` Ingo Molnar
2009-11-20  9:15           ` Pekka Enberg
2009-11-20 10:13             ` Ingo Molnar
2009-11-20 10:31               ` Pekka Enberg
2009-11-20 10:49                 ` Ingo Molnar
2009-11-23 14:46                   ` Steven Rostedt
2009-11-23 17:53                     ` Ingo Molnar
2009-11-20  8:20   ` Li Zefan
2009-11-20  8:54 ` [tip:perf/core] " tip-bot for Li Zefan
2009-11-20  8:55 ` [RFC][PATCH 1/2] " Ingo Molnar
2009-11-20  9:11   ` Li Zefan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox