* [PATCH 0/2] memcg: tracepoint for flushing stats
From: JP Kobryn
Date: 2024-10-25 0:25 UTC
To: shakeel.butt, hannes, yosryahmed, akpm, rostedt
Cc: linux-mm, cgroups

This tracepoint gives visibility into how often memcg stats are
flushed, along with the reason for each flush. It can help with
understanding how readers are affected by having to perform the flush,
since the event records whether the flush was skipped. The information
collected could help in determining the effectiveness of the background
work where the flush is scheduled periodically. Paired with the
recently added tracepoints for tracing rstat updates, it can also help
show correlation where stats exceed their thresholds frequently.

Note there is one reason called "zswap", included to distinguish the
special case where a zswap function makes a direct call to
do_flush_stats().

JP Kobryn (2):
  add memcg flush tracepoint event
  use memcg flush tracepoint

 include/trace/events/memcg.h | 56 ++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c              |  7 ++++-
 2 files changed, 62 insertions(+), 1 deletion(-)

-- 
2.47.0
* [PATCH 1/2] memcg: add memcg flush tracepoint event
From: JP Kobryn
Date: 2024-10-25 0:25 UTC
To: shakeel.butt, hannes, yosryahmed, akpm, rostedt
Cc: linux-mm, cgroups

This is a tracepoint event that contains the memcg pointer and an enum
value representing the reason associated with the flush.

Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
---
 include/trace/events/memcg.h | 56 ++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
index 8667e57816d2..9cccf0f13d4f 100644
--- a/include/trace/events/memcg.h
+++ b/include/trace/events/memcg.h
@@ -8,6 +8,43 @@
 #include <linux/memcontrol.h>
 #include <linux/tracepoint.h>
 
+#define MEMCG_FLUSH_REASONS                                     \
+        EM(TRACE_MEMCG_FLUSH_READER,      "reader")             \
+        EM(TRACE_MEMCG_FLUSH_READER_SKIP, "reader skip")        \
+        EM(TRACE_MEMCG_FLUSH_PERIODIC,    "periodic")           \
+        EMe(TRACE_MEMCG_FLUSH_ZSWAP,      "zswap")
+
+#ifndef __MEMCG_DECLARE_TRACE_ENUMS_ONLY_ONCE
+#define __MEMCG_DECLARE_TRACE_ENUMS_ONLY_ONCE
+
+/* Redefine macros to help declare enum */
+#undef EM
+#undef EMe
+#define EM(a, b)        a,
+#define EMe(a, b)       a
+
+enum memcg_flush_reason {
+        MEMCG_FLUSH_REASONS
+};
+
+#endif /* __MEMCG_DECLARE_TRACE_ENUMS_ONLY_ONCE */
+
+/* Redefine macros to export the enums to userspace */
+#undef EM
+#undef EMe
+#define EM(a, b)        TRACE_DEFINE_ENUM(a);
+#define EMe(a, b)       TRACE_DEFINE_ENUM(a)
+
+MEMCG_FLUSH_REASONS;
+
+/*
+ * Redefine macros to map the enums to the strings that will
+ * be printed in the output
+ */
+#undef EM
+#undef EMe
+#define EM(a, b)        { a, b },
+#define EMe(a, b)       { a, b }
+
 DECLARE_EVENT_CLASS(memcg_rstat_stats,
@@ -74,6 +111,25 @@ DEFINE_EVENT(memcg_rstat_events, count_memcg_events,
 
         TP_ARGS(memcg, item, val)
 );
 
+TRACE_EVENT(memcg_flush_stats,
+
+        TP_PROTO(struct mem_cgroup *memcg, enum memcg_flush_reason reason),
+
+        TP_ARGS(memcg, reason),
+
+        TP_STRUCT__entry(
+                __field(u64, id)
+                __field(enum memcg_flush_reason, reason)
+        ),
+
+        TP_fast_assign(
+                __entry->id = cgroup_id(memcg->css.cgroup);
+                __entry->reason = reason;
+        ),
+
+        TP_printk("memcg_id=%llu reason=%s",
+                __entry->id,
+                __print_symbolic(__entry->reason, MEMCG_FLUSH_REASONS))
+);
 
 #endif /* _TRACE_MEMCG_H */
-- 
2.47.0
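[For context: an event defined with TRACE_EVENT() like the one above is
normally consumed through tracefs once the patch is applied. A minimal
sketch — the mount point and the `memcg` subsystem directory are
assumptions inferred from the header, not taken from the patch itself:]

```sh
# Enable the new event (requires root; path assumes tracefs is
# mounted at the usual location)
echo 1 > /sys/kernel/tracing/events/memcg/memcg_flush_stats/enable

# Stream events as they fire; each record carries the TP_printk
# fields, e.g. "memcg_id=42 reason=reader"
cat /sys/kernel/tracing/trace_pipe
```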
* [PATCH 2/2] memcg: use memcg flush tracepoint
From: JP Kobryn
Date: 2024-10-25 0:25 UTC
To: shakeel.butt, hannes, yosryahmed, akpm, rostedt
Cc: linux-mm, cgroups

Make use of the flush tracepoint within memcontrol.

Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
---
 mm/memcontrol.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 18c3f513d766..f816737228fa 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -613,8 +613,11 @@ void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
         if (!memcg)
                 memcg = root_mem_cgroup;
 
-        if (memcg_vmstats_needs_flush(memcg->vmstats))
+        if (memcg_vmstats_needs_flush(memcg->vmstats)) {
+                trace_memcg_flush_stats(memcg, TRACE_MEMCG_FLUSH_READER);
                 do_flush_stats(memcg);
+        } else
+                trace_memcg_flush_stats(memcg, TRACE_MEMCG_FLUSH_READER_SKIP);
 }
 
 void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
@@ -630,6 +633,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
          * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
          * in latency-sensitive paths is as cheap as possible.
          */
+        trace_memcg_flush_stats(root_mem_cgroup, TRACE_MEMCG_FLUSH_PERIODIC);
         do_flush_stats(root_mem_cgroup);
         queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
 }
@@ -5285,6 +5289,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
          * mem_cgroup_flush_stats() ignores small changes. Use
          * do_flush_stats() directly to get accurate stats for charging.
          */
+        trace_memcg_flush_stats(memcg, TRACE_MEMCG_FLUSH_ZSWAP);
         do_flush_stats(memcg);
         pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
         if (pages < max)
-- 
2.47.0
* Re: [PATCH 2/2] memcg: use memcg flush tracepoint
From: Yosry Ahmed
Date: 2024-10-25 0:57 UTC
To: JP Kobryn
Cc: shakeel.butt, hannes, akpm, rostedt, linux-mm, cgroups

On Thu, Oct 24, 2024 at 5:26 PM JP Kobryn <inwardvessel@gmail.com> wrote:
>
> Make use of the flush tracepoint within memcontrol.
>
> Signed-off-by: JP Kobryn <inwardvessel@gmail.com>

Is the intention to use tools like bpftrace to analyze where we flush
the most? In this case, why can't we just attach to the fentry of
do_flush_stats() and use the stack trace to find the path?

We can also attach to mem_cgroup_flush_stats(), and the difference in
counts between the two will be the number of skipped flushes.

Are there other use cases for these tracepoints?

> ---
>  mm/memcontrol.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 18c3f513d766..f816737228fa 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -613,8 +613,11 @@ void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
>          if (!memcg)
>                  memcg = root_mem_cgroup;
>
> -        if (memcg_vmstats_needs_flush(memcg->vmstats))
> +        if (memcg_vmstats_needs_flush(memcg->vmstats)) {
> +                trace_memcg_flush_stats(memcg, TRACE_MEMCG_FLUSH_READER);
>                  do_flush_stats(memcg);
> +        } else
> +                trace_memcg_flush_stats(memcg, TRACE_MEMCG_FLUSH_READER_SKIP);
>  }
>
>  void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
> @@ -630,6 +633,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
>           * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
>           * in latency-sensitive paths is as cheap as possible.
>           */
> +        trace_memcg_flush_stats(root_mem_cgroup, TRACE_MEMCG_FLUSH_PERIODIC);
>          do_flush_stats(root_mem_cgroup);
>          queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
>  }
> @@ -5285,6 +5289,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
>           * mem_cgroup_flush_stats() ignores small changes. Use
>           * do_flush_stats() directly to get accurate stats for charging.
>           */
> +        trace_memcg_flush_stats(memcg, TRACE_MEMCG_FLUSH_ZSWAP);
>          do_flush_stats(memcg);
>          pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
>          if (pages < max)
> --
> 2.47.0
* Re: [PATCH 2/2] memcg: use memcg flush tracepoint
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: 2024-10-25 1:15 UTC
To: Yosry Ahmed
Cc: JP Kobryn, hannes, akpm, rostedt, linux-mm, cgroups

On Thu, Oct 24, 2024 at 05:57:25PM GMT, Yosry Ahmed wrote:
> On Thu, Oct 24, 2024 at 5:26 PM JP Kobryn <inwardvessel@gmail.com> wrote:
> >
> > Make use of the flush tracepoint within memcontrol.
> >
> > Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
>
> Is the intention to use tools like bpftrace to analyze where we flush
> the most? In this case, why can't we just attach to the fentry of
> do_flush_stats() and use the stack trace to find the path?
>
> We can also attach to mem_cgroup_flush_stats(), and the difference in
> counts between the two will be the number of skipped flushes.
>

All these functions can get inlined, and then we cannot really attach
to them easily. We could find the offset of the inlined code and try to
use a kprobe, but that becomes prohibitive when it has to be done for
multiple kernels built with FDO/BOLT.

Please note that tracepoints are not really API, so we can remove them
in the future if we see no usage for them.

Thanks for the review,
Shakeel
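[To make the trade-off discussed above concrete: with the tracepoint
from patch 1 in place, the per-reason counting could be done with a
short bpftrace invocation that is immune to inlining, since it attaches
to the tracepoint rather than to a function symbol. This is a
hypothetical sketch — the `memcg` subsystem name and the `reason` field
are assumptions based on the patch, and it needs a kernel carrying the
patch:]

```sh
# Count flushes by reason; Ctrl-C prints the aggregated map
bpftrace -e 'tracepoint:memcg:memcg_flush_stats { @flushes[args->reason] = count(); }'
```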
* Re: [PATCH 2/2] memcg: use memcg flush tracepoint
From: Yosry Ahmed
Date: 2024-10-25 7:40 UTC
To: Shakeel Butt
Cc: JP Kobryn, hannes, akpm, rostedt, linux-mm, cgroups

On Thu, Oct 24, 2024 at 6:16 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Thu, Oct 24, 2024 at 05:57:25PM GMT, Yosry Ahmed wrote:
> > On Thu, Oct 24, 2024 at 5:26 PM JP Kobryn <inwardvessel@gmail.com> wrote:
> > > Make use of the flush tracepoint within memcontrol.
> > >
> > > Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
> >
> > Is the intention to use tools like bpftrace to analyze where we flush
> > the most? In this case, why can't we just attach to the fentry of
> > do_flush_stats() and use the stack trace to find the path?
> >
> > We can also attach to mem_cgroup_flush_stats(), and the difference in
> > counts between the two will be the number of skipped flushes.
>
> All these functions can get inlined, and then we cannot really attach
> to them easily. We could find the offset of the inlined code and try to
> use a kprobe, but that becomes prohibitive when it has to be done for
> multiple kernels built with FDO/BOLT.
>
> Please note that tracepoints are not really API, so we can remove them
> in the future if we see no usage for them.

That's fair, but can we just add two tracepoints? That seems enough to
collect the necessary data while preventing a proliferation of
tracepoints and the addition of the enum.

I am thinking one in mem_cgroup_flush_stats() and one in
do_flush_stats(), e.g. trace_mem_cgroup_flush_stats() and
trace_do_flush_stats(). The name of the latter is too generic, though;
maybe we should rename the function first to add a mem_cgroup_* or
memcg_* prefix.

WDYT?
* Re: [PATCH 2/2] memcg: use memcg flush tracepoint
From: JP Kobryn
Date: 2024-10-25 17:04 UTC
To: Yosry Ahmed, Shakeel Butt
Cc: hannes, akpm, rostedt, linux-mm, cgroups

On 10/25/24 12:40 AM, Yosry Ahmed wrote:
> On Thu, Oct 24, 2024 at 6:16 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>> On Thu, Oct 24, 2024 at 05:57:25PM GMT, Yosry Ahmed wrote:
>>> On Thu, Oct 24, 2024 at 5:26 PM JP Kobryn <inwardvessel@gmail.com> wrote:
>>>> Make use of the flush tracepoint within memcontrol.
>>>>
>>>> Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
>>> Is the intention to use tools like bpftrace to analyze where we flush
>>> the most? In this case, why can't we just attach to the fentry of
>>> do_flush_stats() and use the stack trace to find the path?
>>>
>>> We can also attach to mem_cgroup_flush_stats(), and the difference in
>>> counts between the two will be the number of skipped flushes.
>>>
>> All these functions can get inlined, and then we cannot really attach
>> to them easily. We could find the offset of the inlined code and try to
>> use a kprobe, but that becomes prohibitive when it has to be done for
>> multiple kernels built with FDO/BOLT.
>>
>> Please note that tracepoints are not really API, so we can remove them
>> in the future if we see no usage for them.
> That's fair, but can we just add two tracepoints? That seems enough to
> collect the necessary data while preventing a proliferation of
> tracepoints and the addition of the enum.
>
> I am thinking one in mem_cgroup_flush_stats() and one in
> do_flush_stats(), e.g. trace_mem_cgroup_flush_stats() and
> trace_do_flush_stats(). The name of the latter is too generic, though;
> maybe we should rename the function first to add a mem_cgroup_* or
> memcg_* prefix.
>
> WDYT?

Hmmm, I think if we did that we wouldn't get accurate info on when the
flush was skipped.
Comparing the number of hits between mem_cgroup_flush_stats() and
do_flush_stats() to determine the number of skips doesn't seem
reliable, because do_flush_stats() is also called in places outside of
mem_cgroup_flush_stats(). There would be situations where a skip
occurs, but meanwhile each call to do_flush_stats() outside of
mem_cgroup_flush_stats() would effectively subtract that skip, making
it appear that a skip did not occur.

Maybe as a middle ground we could remove the trace calls for the zswap
and periodic cases, since no skips can occur there. We could then leave
just one trace call in mem_cgroup_flush_stats() and, instead of an
enum, pass a bool saying whether the flush was skipped. Something like
this:

mem_cgroup_flush_stats()
{
        bool needs_flush = memcg_vmstats_needs_flush(...);

        trace_memcg_flush_stats(memcg, needs_flush);

        if (needs_flush)
                do_flush_stats(...);
}

Yosry/Shakeel, do you have any thoughts on whether we should keep the
trace calls for the obj_cgroup_may_zswap() and periodic workqueue
cases?
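[The aliasing problem JP describes can be shown with a small counting
simulation — illustrative Python only, not kernel code, with the two
counters standing in for probes attached at the entry of each function:]

```python
# Counters standing in for probes on the two attach points.
calls_outer = 0  # hits on mem_cgroup_flush_stats()
calls_inner = 0  # hits on do_flush_stats()

def do_flush_stats():
    global calls_inner
    calls_inner += 1

def mem_cgroup_flush_stats(needs_flush):
    global calls_outer
    calls_outer += 1
    if needs_flush:
        do_flush_stats()

# One reader flush and one reader skip: the difference correctly
# reports one skip.
mem_cgroup_flush_stats(True)
mem_cgroup_flush_stats(False)
assert calls_outer - calls_inner == 1

# But a direct caller (like the zswap or periodic paths) also bumps
# the inner counter, cancelling out the recorded skip.
do_flush_stats()
assert calls_outer - calls_inner == 0  # the skip is now invisible
```

[This is the scenario that motivates recording the skipped state in the
event itself rather than inferring it from aggregate call counts.]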
* Re: [PATCH 2/2] memcg: use memcg flush tracepoint
From: Yosry Ahmed
Date: 2024-10-25 17:53 UTC
To: JP Kobryn
Cc: Shakeel Butt, hannes, akpm, rostedt, linux-mm, cgroups

On Fri, Oct 25, 2024 at 10:05 AM JP Kobryn <inwardvessel@gmail.com> wrote:
>
> On 10/25/24 12:40 AM, Yosry Ahmed wrote:
> > On Thu, Oct 24, 2024 at 6:16 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >> On Thu, Oct 24, 2024 at 05:57:25PM GMT, Yosry Ahmed wrote:
> >>> On Thu, Oct 24, 2024 at 5:26 PM JP Kobryn <inwardvessel@gmail.com> wrote:
> >>>> Make use of the flush tracepoint within memcontrol.
> >>>>
> >>>> Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
> >>> Is the intention to use tools like bpftrace to analyze where we flush
> >>> the most? In this case, why can't we just attach to the fentry of
> >>> do_flush_stats() and use the stack trace to find the path?
> >>>
> >>> We can also attach to mem_cgroup_flush_stats(), and the difference in
> >>> counts between the two will be the number of skipped flushes.
> >>>
> >> All these functions can get inlined, and then we cannot really attach
> >> to them easily. We could find the offset of the inlined code and try to
> >> use a kprobe, but that becomes prohibitive when it has to be done for
> >> multiple kernels built with FDO/BOLT.
> >>
> >> Please note that tracepoints are not really API, so we can remove them
> >> in the future if we see no usage for them.
> > That's fair, but can we just add two tracepoints? That seems enough to
> > collect the necessary data while preventing a proliferation of
> > tracepoints and the addition of the enum.
> >
> > I am thinking one in mem_cgroup_flush_stats() and one in
> > do_flush_stats(), e.g. trace_mem_cgroup_flush_stats() and
> > trace_do_flush_stats(). The name of the latter is too generic, though;
> > maybe we should rename the function first to add a mem_cgroup_* or
> > memcg_* prefix.
> >
> > WDYT?
>
> Hmmm, I think if we did that we wouldn't get accurate info on when the
> flush was skipped. Comparing the number of hits between
> mem_cgroup_flush_stats() and do_flush_stats() to determine the number
> of skips doesn't seem reliable because of the places where
> do_flush_stats() is called outside of mem_cgroup_flush_stats(). There
> would be situations where a skip occurs, but meanwhile each call to
> do_flush_stats() outside of mem_cgroup_flush_stats() would effectively
> subtract that skip, making it appear that a skip did not occur.

You're underestimating the power of BPF, my friend :) We can count the
number of flushes in task local storage, in which case we get a very
accurate representation because the counters are per-task, so we know
exactly when we skipped, but..

> Maybe as a middle ground we could remove the trace calls for the zswap
> and periodic cases, since no skips can occur there. We could then leave
> just one trace call in mem_cgroup_flush_stats() and, instead of an
> enum, pass a bool saying whether the flush was skipped. Something like
> this:
>
> mem_cgroup_flush_stats()
> {
>         bool needs_flush = memcg_vmstats_needs_flush(...);
>
>         trace_memcg_flush_stats(memcg, needs_flush);
>
>         if (needs_flush)
>                 do_flush_stats(...);
> }
>
> Yosry/Shakeel, do you have any thoughts on whether we should keep the
> trace calls for the obj_cgroup_may_zswap() and periodic workqueue
> cases?

..with that being said, I do like having a single tracepoint. I think
that with some refactoring we can end up with a single tracepoint and
more data. We can capture the cases where we force a flush that we
don't really need, and we can even add vmstats->stats_updates to the
tracepoint to know exactly how many updates we have when we flush.
What about the following:

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7845c64a2c570..be0e7f52ad11a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -584,8 +584,14 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
         }
 }
 
-static void do_flush_stats(struct mem_cgroup *memcg)
+static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
 {
+        bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
+
+        trace_memcg_flush_stats(memcg, needs_flush, force, ...);
+        if (!force && !needs_flush)
+                return;
+
         if (mem_cgroup_is_root(memcg))
                 WRITE_ONCE(flush_last_time, jiffies_64);
 
@@ -609,8 +615,7 @@ void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
         if (!memcg)
                 memcg = root_mem_cgroup;
 
-        if (memcg_vmstats_needs_flush(memcg->vmstats))
-                do_flush_stats(memcg);
+        __mem_cgroup_flush_stats(memcg, false);
 }
 
 void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
@@ -626,7 +631,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
          * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
          * in latency-sensitive paths is as cheap as possible.
          */
-        do_flush_stats(root_mem_cgroup);
+        __mem_cgroup_flush_stats(root_mem_cgroup, true);
         queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
 }
 
@@ -5272,11 +5277,8 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
                         break;
                 }
 
-                /*
-                 * mem_cgroup_flush_stats() ignores small changes. Use
-                 * do_flush_stats() directly to get accurate stats for charging.
-                 */
-                do_flush_stats(memcg);
+                /* Force a flush to get accurate stats for charging */
+                __mem_cgroup_flush_stats(memcg, true);
                 pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
                 if (pages < max)
                         continue;
* Re: [PATCH 2/2] memcg: use memcg flush tracepoint
From: JP Kobryn
Date: 2024-10-25 18:26 UTC
To: Yosry Ahmed
Cc: Shakeel Butt, hannes, akpm, rostedt, linux-mm, cgroups

On 10/25/24 10:53 AM, Yosry Ahmed wrote:
> On Fri, Oct 25, 2024 at 10:05 AM JP Kobryn <inwardvessel@gmail.com> wrote:
>>
>> On 10/25/24 12:40 AM, Yosry Ahmed wrote:
>>> On Thu, Oct 24, 2024 at 6:16 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>>>> On Thu, Oct 24, 2024 at 05:57:25PM GMT, Yosry Ahmed wrote:
>>>>> On Thu, Oct 24, 2024 at 5:26 PM JP Kobryn <inwardvessel@gmail.com> wrote:
>>>>>> Make use of the flush tracepoint within memcontrol.
>>>>>>
>>>>>> Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
>>>>> Is the intention to use tools like bpftrace to analyze where we flush
>>>>> the most? In this case, why can't we just attach to the fentry of
>>>>> do_flush_stats() and use the stack trace to find the path?
>>>>>
>>>>> We can also attach to mem_cgroup_flush_stats(), and the difference in
>>>>> counts between the two will be the number of skipped flushes.
>>>>>
>>>> All these functions can get inlined, and then we cannot really attach
>>>> to them easily. We could find the offset of the inlined code and try to
>>>> use a kprobe, but that becomes prohibitive when it has to be done for
>>>> multiple kernels built with FDO/BOLT.
>>>>
>>>> Please note that tracepoints are not really API, so we can remove them
>>>> in the future if we see no usage for them.
>>> That's fair, but can we just add two tracepoints? That seems enough to
>>> collect the necessary data while preventing a proliferation of
>>> tracepoints and the addition of the enum.
>>>
>>> I am thinking one in mem_cgroup_flush_stats() and one in
>>> do_flush_stats(), e.g. trace_mem_cgroup_flush_stats() and
>>> trace_do_flush_stats().
>>> The name of the latter is too generic, though;
>>> maybe we should rename the function first to add a mem_cgroup_* or
>>> memcg_* prefix.
>>>
>>> WDYT?
>> Hmmm, I think if we did that we wouldn't get accurate info on when the
>> flush was skipped. Comparing the number of hits between
>> mem_cgroup_flush_stats() and do_flush_stats() to determine the number
>> of skips doesn't seem reliable because of the places where
>> do_flush_stats() is called outside of mem_cgroup_flush_stats(). There
>> would be situations where a skip occurs, but meanwhile each call to
>> do_flush_stats() outside of mem_cgroup_flush_stats() would effectively
>> subtract that skip, making it appear that a skip did not occur.
> You're underestimating the power of BPF, my friend :) We can count the
> number of flushes in task local storage, in which case we get a very
> accurate representation because the counters are per-task, so we know
> exactly when we skipped, but..

Interesting. Thanks for explaining.

>> Maybe as a middle ground we could remove the trace calls for the zswap
>> and periodic cases, since no skips can occur there. We could then leave
>> just one trace call in mem_cgroup_flush_stats() and, instead of an
>> enum, pass a bool saying whether the flush was skipped. Something like
>> this:
>>
>> mem_cgroup_flush_stats()
>> {
>>         bool needs_flush = memcg_vmstats_needs_flush(...);
>>
>>         trace_memcg_flush_stats(memcg, needs_flush);
>>
>>         if (needs_flush)
>>                 do_flush_stats(...);
>> }
>>
>> Yosry/Shakeel, do you have any thoughts on whether we should keep the
>> trace calls for the obj_cgroup_may_zswap() and periodic workqueue
>> cases?
> ..with that being said, I do like having a single tracepoint. I think
> that with some refactoring we can end up with a single tracepoint and
> more data. We can capture the cases where we force a flush that we
> don't really need, and we can even add vmstats->stats_updates to the
> tracepoint to know exactly how many updates we have when we flush.
>
> What about the following:
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7845c64a2c570..be0e7f52ad11a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -584,8 +584,14 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
>          }
>  }
>
> -static void do_flush_stats(struct mem_cgroup *memcg)
> +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
>  {
> +        bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats);
> +
> +        trace_memcg_flush_stats(memcg, needs_flush, force, ...);
> +        if (!force && !needs_flush)
> +                return;
> +
>          if (mem_cgroup_is_root(memcg))
>                  WRITE_ONCE(flush_last_time, jiffies_64);
>
> @@ -609,8 +615,7 @@ void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
>          if (!memcg)
>                  memcg = root_mem_cgroup;
>
> -        if (memcg_vmstats_needs_flush(memcg->vmstats))
> -                do_flush_stats(memcg);
> +        __mem_cgroup_flush_stats(memcg, false);
>  }
>
>  void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
> @@ -626,7 +631,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
>           * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
>           * in latency-sensitive paths is as cheap as possible.
>           */
> -        do_flush_stats(root_mem_cgroup);
> +        __mem_cgroup_flush_stats(root_mem_cgroup, true);
>          queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
>  }
>
> @@ -5272,11 +5277,8 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
>                          break;
>                  }
>
> -                /*
> -                 * mem_cgroup_flush_stats() ignores small changes. Use
> -                 * do_flush_stats() directly to get accurate stats for charging.
> -                 */
> -                do_flush_stats(memcg);
> +                /* Force a flush to get accurate stats for charging */
> +                __mem_cgroup_flush_stats(memcg, true);
>                  pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
>                  if (pages < max)
>                          continue;

I like the additional info. I'll incorporate this into v2.