linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] mm: mmap_lock: optimize mmap_lock tracepoints
@ 2024-11-25 17:16 Shakeel Butt
  2024-11-25 17:34 ` Shakeel Butt
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Shakeel Butt @ 2024-11-25 17:16 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
	Vlastimil Babka, Axel Rasmussen, Steven Rostedt,
	Suren Baghdasaryan, Matthew Wilcox, linux-mm, cgroups,
	linux-kernel, Meta kernel team

We are starting to deploy mmap_lock tracepoint monitoring across our
fleet and the early results showed that these tracepoints are consuming
significant amount of CPUs in kernfs_path_from_node when enabled.

It seems like the kernel is trying to resolve the cgroup path in the
fast path of the locking code path when the tracepoints are enabled. In
addition for some application their metrics are regressing when
monitoring is enabled.

The cgroup path resolution can be slow and should not be done in the
fast path. Most userspace tools, like bpftrace, provides functionality
to get the cgroup path from cgroup id, so let's just trace the cgroup
id and the users can use better tools to get the path in the slow path.

Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---

Changes since v1:
- Fixed commit message (Yosry).
- Renamed memcg_id_from_mm to cgroup_id_from_mm (Yosry).
- Fixed handling of root memcg (Yosry).
- Fixed mem_cgroup_disabled() handling.
- Kept mm pointer printing based on Steven's feedback.

 include/linux/memcontrol.h       | 22 ++++++++++++++
 include/trace/events/mmap_lock.h | 32 ++++++++++----------
 mm/mmap_lock.c                   | 50 ++------------------------------
 3 files changed, 40 insertions(+), 64 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5502aa8e138e..b28180269e75 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1046,6 +1046,23 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 
 void split_page_memcg(struct page *head, int old_order, int new_order);
 
+static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
+{
+	struct mem_cgroup *memcg;
+	u64 id;
+
+	if (mem_cgroup_disabled())
+		return 0;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	if (!memcg)
+		memcg = root_mem_cgroup;
+	id = cgroup_id(memcg->css.cgroup);
+	rcu_read_unlock();
+	return id;
+}
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1466,6 +1483,11 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 static inline void split_page_memcg(struct page *head, int old_order, int new_order)
 {
 }
+
+static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
+{
+	return 0;
+}
 #endif /* CONFIG_MEMCG */
 
 /*
diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h
index bc2e3ad787b3..cf9f9faf8914 100644
--- a/include/trace/events/mmap_lock.h
+++ b/include/trace/events/mmap_lock.h
@@ -5,6 +5,7 @@
 #if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_MMAP_LOCK_H
 
+#include <linux/memcontrol.h>
 #include <linux/tracepoint.h>
 #include <linux/types.h>
 
@@ -12,64 +13,61 @@ struct mm_struct;
 
 DECLARE_EVENT_CLASS(mmap_lock,
 
-	TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write),
+	TP_PROTO(struct mm_struct *mm, bool write),
 
-	TP_ARGS(mm, memcg_path, write),
+	TP_ARGS(mm, write),
 
 	TP_STRUCT__entry(
 		__field(struct mm_struct *, mm)
-		__string(memcg_path, memcg_path)
+		__field(u64, memcg_id)
 		__field(bool, write)
 	),
 
 	TP_fast_assign(
 		__entry->mm = mm;
-		__assign_str(memcg_path);
+		__entry->memcg_id = cgroup_id_from_mm(mm);
 		__entry->write = write;
 	),
 
 	TP_printk(
-		"mm=%p memcg_path=%s write=%s",
-		__entry->mm,
-		__get_str(memcg_path),
+		"mm=%p memcg_id=%llu write=%s",
+		__entry->mm, __entry->memcg_id,
 		__entry->write ? "true" : "false"
 	)
 );
 
 #define DEFINE_MMAP_LOCK_EVENT(name)                                    \
 	DEFINE_EVENT(mmap_lock, name,                                   \
-		TP_PROTO(struct mm_struct *mm, const char *memcg_path,  \
-			bool write),                                    \
-		TP_ARGS(mm, memcg_path, write))
+		TP_PROTO(struct mm_struct *mm, bool write),		\
+		TP_ARGS(mm, write))
 
 DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking);
 DEFINE_MMAP_LOCK_EVENT(mmap_lock_released);
 
 TRACE_EVENT(mmap_lock_acquire_returned,
 
-	TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write,
-		bool success),
+	TP_PROTO(struct mm_struct *mm, bool write, bool success),
 
-	TP_ARGS(mm, memcg_path, write, success),
+	TP_ARGS(mm, write, success),
 
 	TP_STRUCT__entry(
 		__field(struct mm_struct *, mm)
-		__string(memcg_path, memcg_path)
+		__field(u64, memcg_id)
 		__field(bool, write)
 		__field(bool, success)
 	),
 
 	TP_fast_assign(
 		__entry->mm = mm;
-		__assign_str(memcg_path);
+		__entry->memcg_id = cgroup_id_from_mm(mm);
 		__entry->write = write;
 		__entry->success = success;
 	),
 
 	TP_printk(
-		"mm=%p memcg_path=%s write=%s success=%s",
+		"mm=%p memcg_id=%llu write=%s success=%s",
 		__entry->mm,
-		__get_str(memcg_path),
+		__entry->memcg_id,
 		__entry->write ? "true" : "false",
 		__entry->success ? "true" : "false"
 	)
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index f186d57df2c6..e7dbaf96aa17 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -17,51 +17,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
 EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
 EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
 
-#ifdef CONFIG_MEMCG
-
-/*
- * Size of the buffer for memcg path names. Ignoring stack trace support,
- * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
- */
-#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)				\
-	do {								\
-		if (trace_mmap_lock_##type##_enabled()) {		\
-			char buf[MEMCG_PATH_BUF_SIZE];                  \
-			get_mm_memcg_path(mm, buf, sizeof(buf));        \
-			trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
-		}							\
-	} while (0)
-
-#else /* !CONFIG_MEMCG */
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
-	trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
-
-#endif /* CONFIG_MEMCG */
-
 #ifdef CONFIG_TRACING
-#ifdef CONFIG_MEMCG
-/*
- * Write the given mm_struct's memcg path to a buffer. If the path cannot be
- * determined, empty string is written.
- */
-static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
-{
-	struct mem_cgroup *memcg;
-
-	buf[0] = '\0';
-	memcg = get_mem_cgroup_from_mm(mm);
-	if (memcg == NULL)
-		return;
-	if (memcg->css.cgroup)
-		cgroup_path(memcg->css.cgroup, buf, buflen);
-	css_put(&memcg->css);
-}
-
-#endif /* CONFIG_MEMCG */
-
 /*
  * Trace calls must be in a separate file, as otherwise there's a circular
  * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
@@ -69,20 +25,20 @@ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
 
 void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
 {
-	TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
+	trace_mmap_lock_start_locking(mm, write);
 }
 EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
 
 void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
 					   bool success)
 {
-	TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
+	trace_mmap_lock_acquire_returned(mm, write, success);
 }
 EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
 
 void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
 {
-	TRACE_MMAP_LOCK_EVENT(released, mm, write);
+	trace_mmap_lock_released(mm, write);
 }
 EXPORT_SYMBOL(__mmap_lock_do_trace_released);
 #endif /* CONFIG_TRACING */
-- 
2.43.5



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] mm: mmap_lock: optimize mmap_lock tracepoints
  2024-11-25 17:16 [PATCH v2] mm: mmap_lock: optimize mmap_lock tracepoints Shakeel Butt
@ 2024-11-25 17:34 ` Shakeel Butt
  2024-11-25 19:51 ` Yosry Ahmed
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 6+ messages in thread
From: Shakeel Butt @ 2024-11-25 17:34 UTC (permalink / raw)
  To: Andrew Morton, Yosry Ahmed
  Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
	Vlastimil Babka, Axel Rasmussen, Steven Rostedt,
	Suren Baghdasaryan, Matthew Wilcox, linux-mm, cgroups,
	linux-kernel, Meta kernel team

Cc Yosry.

On Mon, Nov 25, 2024 at 09:16:17AM -0800, Shakeel Butt wrote:
> We are starting to deploy mmap_lock tracepoint monitoring across our
> fleet and the early results showed that these tracepoints are consuming
> significant amount of CPUs in kernfs_path_from_node when enabled.
> 
> It seems like the kernel is trying to resolve the cgroup path in the
> fast path of the locking code path when the tracepoints are enabled. In
> addition for some application their metrics are regressing when
> monitoring is enabled.
> 
> The cgroup path resolution can be slow and should not be done in the
> fast path. Most userspace tools, like bpftrace, provides functionality
> to get the cgroup path from cgroup id, so let's just trace the cgroup
> id and the users can use better tools to get the path in the slow path.
> 
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---
> 
> Changes since v1:
> - Fixed commit message (Yosry).
> - Renamed memcg_id_from_mm to cgroup_id_from_mm (Yosry).
> - Fixed handling of root memcg (Yosry).
> - Fixed mem_cgroup_disabled() handling.
> - Kept mm pointer printing based on Steven's feedback.
> 
>  include/linux/memcontrol.h       | 22 ++++++++++++++
>  include/trace/events/mmap_lock.h | 32 ++++++++++----------
>  mm/mmap_lock.c                   | 50 ++------------------------------
>  3 files changed, 40 insertions(+), 64 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 5502aa8e138e..b28180269e75 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1046,6 +1046,23 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
>  
>  void split_page_memcg(struct page *head, int old_order, int new_order);
>  
> +static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
> +{
> +	struct mem_cgroup *memcg;
> +	u64 id;
> +
> +	if (mem_cgroup_disabled())
> +		return 0;
> +
> +	rcu_read_lock();
> +	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
> +	if (!memcg)
> +		memcg = root_mem_cgroup;
> +	id = cgroup_id(memcg->css.cgroup);
> +	rcu_read_unlock();
> +	return id;
> +}
> +
>  #else /* CONFIG_MEMCG */
>  
>  #define MEM_CGROUP_ID_SHIFT	0
> @@ -1466,6 +1483,11 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
>  static inline void split_page_memcg(struct page *head, int old_order, int new_order)
>  {
>  }
> +
> +static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
> +{
> +	return 0;
> +}
>  #endif /* CONFIG_MEMCG */
>  
>  /*
> diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h
> index bc2e3ad787b3..cf9f9faf8914 100644
> --- a/include/trace/events/mmap_lock.h
> +++ b/include/trace/events/mmap_lock.h
> @@ -5,6 +5,7 @@
>  #if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
>  #define _TRACE_MMAP_LOCK_H
>  
> +#include <linux/memcontrol.h>
>  #include <linux/tracepoint.h>
>  #include <linux/types.h>
>  
> @@ -12,64 +13,61 @@ struct mm_struct;
>  
>  DECLARE_EVENT_CLASS(mmap_lock,
>  
> -	TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write),
> +	TP_PROTO(struct mm_struct *mm, bool write),
>  
> -	TP_ARGS(mm, memcg_path, write),
> +	TP_ARGS(mm, write),
>  
>  	TP_STRUCT__entry(
>  		__field(struct mm_struct *, mm)
> -		__string(memcg_path, memcg_path)
> +		__field(u64, memcg_id)
>  		__field(bool, write)
>  	),
>  
>  	TP_fast_assign(
>  		__entry->mm = mm;
> -		__assign_str(memcg_path);
> +		__entry->memcg_id = cgroup_id_from_mm(mm);
>  		__entry->write = write;
>  	),
>  
>  	TP_printk(
> -		"mm=%p memcg_path=%s write=%s",
> -		__entry->mm,
> -		__get_str(memcg_path),
> +		"mm=%p memcg_id=%llu write=%s",
> +		__entry->mm, __entry->memcg_id,
>  		__entry->write ? "true" : "false"
>  	)
>  );
>  
>  #define DEFINE_MMAP_LOCK_EVENT(name)                                    \
>  	DEFINE_EVENT(mmap_lock, name,                                   \
> -		TP_PROTO(struct mm_struct *mm, const char *memcg_path,  \
> -			bool write),                                    \
> -		TP_ARGS(mm, memcg_path, write))
> +		TP_PROTO(struct mm_struct *mm, bool write),		\
> +		TP_ARGS(mm, write))
>  
>  DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking);
>  DEFINE_MMAP_LOCK_EVENT(mmap_lock_released);
>  
>  TRACE_EVENT(mmap_lock_acquire_returned,
>  
> -	TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write,
> -		bool success),
> +	TP_PROTO(struct mm_struct *mm, bool write, bool success),
>  
> -	TP_ARGS(mm, memcg_path, write, success),
> +	TP_ARGS(mm, write, success),
>  
>  	TP_STRUCT__entry(
>  		__field(struct mm_struct *, mm)
> -		__string(memcg_path, memcg_path)
> +		__field(u64, memcg_id)
>  		__field(bool, write)
>  		__field(bool, success)
>  	),
>  
>  	TP_fast_assign(
>  		__entry->mm = mm;
> -		__assign_str(memcg_path);
> +		__entry->memcg_id = cgroup_id_from_mm(mm);
>  		__entry->write = write;
>  		__entry->success = success;
>  	),
>  
>  	TP_printk(
> -		"mm=%p memcg_path=%s write=%s success=%s",
> +		"mm=%p memcg_id=%llu write=%s success=%s",
>  		__entry->mm,
> -		__get_str(memcg_path),
> +		__entry->memcg_id,
>  		__entry->write ? "true" : "false",
>  		__entry->success ? "true" : "false"
>  	)
> diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
> index f186d57df2c6..e7dbaf96aa17 100644
> --- a/mm/mmap_lock.c
> +++ b/mm/mmap_lock.c
> @@ -17,51 +17,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
>  EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
>  EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
>  
> -#ifdef CONFIG_MEMCG
> -
> -/*
> - * Size of the buffer for memcg path names. Ignoring stack trace support,
> - * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
> - */
> -#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
> -
> -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)				\
> -	do {								\
> -		if (trace_mmap_lock_##type##_enabled()) {		\
> -			char buf[MEMCG_PATH_BUF_SIZE];                  \
> -			get_mm_memcg_path(mm, buf, sizeof(buf));        \
> -			trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
> -		}							\
> -	} while (0)
> -
> -#else /* !CONFIG_MEMCG */
> -
> -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
> -	trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
> -
> -#endif /* CONFIG_MEMCG */
> -
>  #ifdef CONFIG_TRACING
> -#ifdef CONFIG_MEMCG
> -/*
> - * Write the given mm_struct's memcg path to a buffer. If the path cannot be
> - * determined, empty string is written.
> - */
> -static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
> -{
> -	struct mem_cgroup *memcg;
> -
> -	buf[0] = '\0';
> -	memcg = get_mem_cgroup_from_mm(mm);
> -	if (memcg == NULL)
> -		return;
> -	if (memcg->css.cgroup)
> -		cgroup_path(memcg->css.cgroup, buf, buflen);
> -	css_put(&memcg->css);
> -}
> -
> -#endif /* CONFIG_MEMCG */
> -
>  /*
>   * Trace calls must be in a separate file, as otherwise there's a circular
>   * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
> @@ -69,20 +25,20 @@ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
>  
>  void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
>  {
> -	TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
> +	trace_mmap_lock_start_locking(mm, write);
>  }
>  EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
>  
>  void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
>  					   bool success)
>  {
> -	TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
> +	trace_mmap_lock_acquire_returned(mm, write, success);
>  }
>  EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
>  
>  void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
>  {
> -	TRACE_MMAP_LOCK_EVENT(released, mm, write);
> +	trace_mmap_lock_released(mm, write);
>  }
>  EXPORT_SYMBOL(__mmap_lock_do_trace_released);
>  #endif /* CONFIG_TRACING */
> -- 
> 2.43.5
> 


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] mm: mmap_lock: optimize mmap_lock tracepoints
  2024-11-25 17:16 [PATCH v2] mm: mmap_lock: optimize mmap_lock tracepoints Shakeel Butt
  2024-11-25 17:34 ` Shakeel Butt
@ 2024-11-25 19:51 ` Yosry Ahmed
  2024-11-26  7:32 ` Vlastimil Babka
  2024-11-26 19:29 ` Roman Gushchin
  3 siblings, 0 replies; 6+ messages in thread
From: Yosry Ahmed @ 2024-11-25 19:51 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Roman Gushchin,
	Muchun Song, Vlastimil Babka, Axel Rasmussen, Steven Rostedt,
	Suren Baghdasaryan, Matthew Wilcox, linux-mm, cgroups,
	linux-kernel, Meta kernel team

On Mon, Nov 25, 2024 at 9:16 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> We are starting to deploy mmap_lock tracepoint monitoring across our
> fleet and the early results showed that these tracepoints are consuming
> significant amount of CPUs in kernfs_path_from_node when enabled.
>
> It seems like the kernel is trying to resolve the cgroup path in the
> fast path of the locking code path when the tracepoints are enabled. In
> addition for some application their metrics are regressing when
> monitoring is enabled.
>
> The cgroup path resolution can be slow and should not be done in the
> fast path. Most userspace tools, like bpftrace, provides functionality
> to get the cgroup path from cgroup id, so let's just trace the cgroup
> id and the users can use better tools to get the path in the slow path.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>

Reviewed-by: Yosry Ahmed <yosryahmed@google.com>

> ---
>
> Changes since v1:
> - Fixed commit message (Yosry).
> - Renamed memcg_id_from_mm to cgroup_id_from_mm (Yosry).
> - Fixed handling of root memcg (Yosry).
> - Fixed mem_cgroup_disabled() handling.
> - Kept mm pointer printing based on Steven's feedback.
>
>  include/linux/memcontrol.h       | 22 ++++++++++++++
>  include/trace/events/mmap_lock.h | 32 ++++++++++----------
>  mm/mmap_lock.c                   | 50 ++------------------------------
>  3 files changed, 40 insertions(+), 64 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 5502aa8e138e..b28180269e75 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1046,6 +1046,23 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
>
>  void split_page_memcg(struct page *head, int old_order, int new_order);
>
> +static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
> +{
> +       struct mem_cgroup *memcg;
> +       u64 id;
> +
> +       if (mem_cgroup_disabled())
> +               return 0;
> +
> +       rcu_read_lock();
> +       memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
> +       if (!memcg)
> +               memcg = root_mem_cgroup;
> +       id = cgroup_id(memcg->css.cgroup);
> +       rcu_read_unlock();
> +       return id;
> +}
> +
>  #else /* CONFIG_MEMCG */
>
>  #define MEM_CGROUP_ID_SHIFT    0
> @@ -1466,6 +1483,11 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
>  static inline void split_page_memcg(struct page *head, int old_order, int new_order)
>  {
>  }
> +
> +static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
> +{
> +       return 0;
> +}
>  #endif /* CONFIG_MEMCG */
>
>  /*
> diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h
> index bc2e3ad787b3..cf9f9faf8914 100644
> --- a/include/trace/events/mmap_lock.h
> +++ b/include/trace/events/mmap_lock.h
> @@ -5,6 +5,7 @@
>  #if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
>  #define _TRACE_MMAP_LOCK_H
>
> +#include <linux/memcontrol.h>
>  #include <linux/tracepoint.h>
>  #include <linux/types.h>
>
> @@ -12,64 +13,61 @@ struct mm_struct;
>
>  DECLARE_EVENT_CLASS(mmap_lock,
>
> -       TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write),
> +       TP_PROTO(struct mm_struct *mm, bool write),
>
> -       TP_ARGS(mm, memcg_path, write),
> +       TP_ARGS(mm, write),
>
>         TP_STRUCT__entry(
>                 __field(struct mm_struct *, mm)
> -               __string(memcg_path, memcg_path)
> +               __field(u64, memcg_id)
>                 __field(bool, write)
>         ),
>
>         TP_fast_assign(
>                 __entry->mm = mm;
> -               __assign_str(memcg_path);
> +               __entry->memcg_id = cgroup_id_from_mm(mm);
>                 __entry->write = write;
>         ),
>
>         TP_printk(
> -               "mm=%p memcg_path=%s write=%s",
> -               __entry->mm,
> -               __get_str(memcg_path),
> +               "mm=%p memcg_id=%llu write=%s",
> +               __entry->mm, __entry->memcg_id,
>                 __entry->write ? "true" : "false"
>         )
>  );
>
>  #define DEFINE_MMAP_LOCK_EVENT(name)                                    \
>         DEFINE_EVENT(mmap_lock, name,                                   \
> -               TP_PROTO(struct mm_struct *mm, const char *memcg_path,  \
> -                       bool write),                                    \
> -               TP_ARGS(mm, memcg_path, write))
> +               TP_PROTO(struct mm_struct *mm, bool write),             \
> +               TP_ARGS(mm, write))
>
>  DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking);
>  DEFINE_MMAP_LOCK_EVENT(mmap_lock_released);
>
>  TRACE_EVENT(mmap_lock_acquire_returned,
>
> -       TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write,
> -               bool success),
> +       TP_PROTO(struct mm_struct *mm, bool write, bool success),
>
> -       TP_ARGS(mm, memcg_path, write, success),
> +       TP_ARGS(mm, write, success),
>
>         TP_STRUCT__entry(
>                 __field(struct mm_struct *, mm)
> -               __string(memcg_path, memcg_path)
> +               __field(u64, memcg_id)
>                 __field(bool, write)
>                 __field(bool, success)
>         ),
>
>         TP_fast_assign(
>                 __entry->mm = mm;
> -               __assign_str(memcg_path);
> +               __entry->memcg_id = cgroup_id_from_mm(mm);
>                 __entry->write = write;
>                 __entry->success = success;
>         ),
>
>         TP_printk(
> -               "mm=%p memcg_path=%s write=%s success=%s",
> +               "mm=%p memcg_id=%llu write=%s success=%s",
>                 __entry->mm,
> -               __get_str(memcg_path),
> +               __entry->memcg_id,
>                 __entry->write ? "true" : "false",
>                 __entry->success ? "true" : "false"
>         )
> diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
> index f186d57df2c6..e7dbaf96aa17 100644
> --- a/mm/mmap_lock.c
> +++ b/mm/mmap_lock.c
> @@ -17,51 +17,7 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
>  EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
>  EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
>
> -#ifdef CONFIG_MEMCG
> -
> -/*
> - * Size of the buffer for memcg path names. Ignoring stack trace support,
> - * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
> - */
> -#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
> -
> -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                           \
> -       do {                                                            \
> -               if (trace_mmap_lock_##type##_enabled()) {               \
> -                       char buf[MEMCG_PATH_BUF_SIZE];                  \
> -                       get_mm_memcg_path(mm, buf, sizeof(buf));        \
> -                       trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \
> -               }                                                       \
> -       } while (0)
> -
> -#else /* !CONFIG_MEMCG */
> -
> -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
> -       trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
> -
> -#endif /* CONFIG_MEMCG */
> -
>  #ifdef CONFIG_TRACING
> -#ifdef CONFIG_MEMCG
> -/*
> - * Write the given mm_struct's memcg path to a buffer. If the path cannot be
> - * determined, empty string is written.
> - */
> -static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
> -{
> -       struct mem_cgroup *memcg;
> -
> -       buf[0] = '\0';
> -       memcg = get_mem_cgroup_from_mm(mm);
> -       if (memcg == NULL)
> -               return;
> -       if (memcg->css.cgroup)
> -               cgroup_path(memcg->css.cgroup, buf, buflen);
> -       css_put(&memcg->css);
> -}
> -
> -#endif /* CONFIG_MEMCG */
> -
>  /*
>   * Trace calls must be in a separate file, as otherwise there's a circular
>   * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
> @@ -69,20 +25,20 @@ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen)
>
>  void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
>  {
> -       TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
> +       trace_mmap_lock_start_locking(mm, write);
>  }
>  EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
>
>  void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
>                                            bool success)
>  {
> -       TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
> +       trace_mmap_lock_acquire_returned(mm, write, success);
>  }
>  EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
>
>  void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
>  {
> -       TRACE_MMAP_LOCK_EVENT(released, mm, write);
> +       trace_mmap_lock_released(mm, write);
>  }
>  EXPORT_SYMBOL(__mmap_lock_do_trace_released);
>  #endif /* CONFIG_TRACING */
> --
> 2.43.5
>
>


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] mm: mmap_lock: optimize mmap_lock tracepoints
  2024-11-25 17:16 [PATCH v2] mm: mmap_lock: optimize mmap_lock tracepoints Shakeel Butt
  2024-11-25 17:34 ` Shakeel Butt
  2024-11-25 19:51 ` Yosry Ahmed
@ 2024-11-26  7:32 ` Vlastimil Babka
  2024-11-26 19:29 ` Roman Gushchin
  3 siblings, 0 replies; 6+ messages in thread
From: Vlastimil Babka @ 2024-11-26  7:32 UTC (permalink / raw)
  To: Shakeel Butt, Andrew Morton
  Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
	Axel Rasmussen, Steven Rostedt, Suren Baghdasaryan,
	Matthew Wilcox, linux-mm, cgroups, linux-kernel,
	Meta kernel team

On 11/25/24 18:16, Shakeel Butt wrote:
> We are starting to deploy mmap_lock tracepoint monitoring across our
> fleet and the early results showed that these tracepoints are consuming
> significant amount of CPUs in kernfs_path_from_node when enabled.
> 
> It seems like the kernel is trying to resolve the cgroup path in the
> fast path of the locking code path when the tracepoints are enabled. In
> addition for some application their metrics are regressing when
> monitoring is enabled.
> 
> The cgroup path resolution can be slow and should not be done in the
> fast path. Most userspace tools, like bpftrace, provides functionality
> to get the cgroup path from cgroup id, so let's just trace the cgroup
> id and the users can use better tools to get the path in the slow path.
> 
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>

Acked-by: Vlastimil Babka <vbabka@suse.cz>




^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] mm: mmap_lock: optimize mmap_lock tracepoints
  2024-11-25 17:16 [PATCH v2] mm: mmap_lock: optimize mmap_lock tracepoints Shakeel Butt
                   ` (2 preceding siblings ...)
  2024-11-26  7:32 ` Vlastimil Babka
@ 2024-11-26 19:29 ` Roman Gushchin
  2024-11-27 20:09   ` Axel Rasmussen
  3 siblings, 1 reply; 6+ messages in thread
From: Roman Gushchin @ 2024-11-26 19:29 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Muchun Song,
	Vlastimil Babka, Axel Rasmussen, Steven Rostedt,
	Suren Baghdasaryan, Matthew Wilcox, linux-mm, cgroups,
	linux-kernel, Meta kernel team

On Mon, Nov 25, 2024 at 09:16:17AM -0800, Shakeel Butt wrote:
> We are starting to deploy mmap_lock tracepoint monitoring across our
> fleet and the early results showed that these tracepoints are consuming
> significant amount of CPUs in kernfs_path_from_node when enabled.
> 
> It seems like the kernel is trying to resolve the cgroup path in the
> fast path of the locking code path when the tracepoints are enabled. In
> addition for some application their metrics are regressing when
> monitoring is enabled.
> 
> The cgroup path resolution can be slow and should not be done in the
> fast path. Most userspace tools, like bpftrace, provides functionality
> to get the cgroup path from cgroup id, so let's just trace the cgroup
> id and the users can use better tools to get the path in the slow path.
> 
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>

Acked-by: Roman Gushchin <roman.gushchin@linux.dev>

Thanks!


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] mm: mmap_lock: optimize mmap_lock tracepoints
  2024-11-26 19:29 ` Roman Gushchin
@ 2024-11-27 20:09   ` Axel Rasmussen
  0 siblings, 0 replies; 6+ messages in thread
From: Axel Rasmussen @ 2024-11-27 20:09 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: Shakeel Butt, Andrew Morton, Johannes Weiner, Michal Hocko,
	Muchun Song, Vlastimil Babka, Steven Rostedt, Suren Baghdasaryan,
	Matthew Wilcox, linux-mm, cgroups, linux-kernel,
	Meta kernel team

No objections. :)

Reviewed-by: Axel Rasmussen <axelrasmussen@google.com>


On Tue, Nov 26, 2024 at 11:29 AM Roman Gushchin
<roman.gushchin@linux.dev> wrote:
>
> On Mon, Nov 25, 2024 at 09:16:17AM -0800, Shakeel Butt wrote:
> > We are starting to deploy mmap_lock tracepoint monitoring across our
> > fleet and the early results showed that these tracepoints are consuming
> > significant amount of CPUs in kernfs_path_from_node when enabled.
> >
> > It seems like the kernel is trying to resolve the cgroup path in the
> > fast path of the locking code path when the tracepoints are enabled. In
> > addition for some application their metrics are regressing when
> > monitoring is enabled.
> >
> > The cgroup path resolution can be slow and should not be done in the
> > fast path. Most userspace tools, like bpftrace, provides functionality
> > to get the cgroup path from cgroup id, so let's just trace the cgroup
> > id and the users can use better tools to get the path in the slow path.
> >
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
>
> Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
>
> Thanks!


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-11-27 20:10 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-11-25 17:16 [PATCH v2] mm: mmap_lock: optimize mmap_lock tracepoints Shakeel Butt
2024-11-25 17:34 ` Shakeel Butt
2024-11-25 19:51 ` Yosry Ahmed
2024-11-26  7:32 ` Vlastimil Babka
2024-11-26 19:29 ` Roman Gushchin
2024-11-27 20:09   ` Axel Rasmussen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox