linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 0/3] Fix calculations in trace_balance_dirty_pages() for cgwb
@ 2025-03-04 11:03 Tang Yizhou
  2025-03-04 11:03 ` [PATCH v2 1/3] writeback: Let trace_balance_dirty_pages() take struct dtc as parameter Tang Yizhou
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Tang Yizhou @ 2025-03-04 11:03 UTC (permalink / raw)
  To: tj, jack, brauner, willy, akpm
  Cc: rostedt, mhiramat, ast, linux-mm, linux-fsdevel, linux-kernel,
	Tang Yizhou

From: Tang Yizhou <yizhou.tang@shopee.com>

In my experiment, I found that the output of trace_balance_dirty_pages()
in the cgroup writeback scenario was strange because
trace_balance_dirty_pages() always uses global_wb_domain.dirty_limit for
related calculations instead of the dirty_limit of the corresponding
memcg's wb_domain.

The basic idea of the fix is to store the hard dirty limit value computed
in wb_position_ratio() into struct dirty_throttle_control and use it for
calculations in trace_balance_dirty_pages().

v2:
Adopt Tejun's suggestion and split the renaming code into Patch #2.
Pick up Tejun's Acked-by tag in Patch #3.

Tang Yizhou (3):
  writeback: Let trace_balance_dirty_pages() take struct dtc as
    parameter
  writeback: Rename variables in trace_balance_dirty_pages()
  writeback: Fix calculations in trace_balance_dirty_pages() for cgwb

 include/linux/writeback.h        | 24 +++++++++++++++++++++
 include/trace/events/writeback.h | 33 ++++++++++++----------------
 mm/page-writeback.c              | 37 +++-----------------------------
 3 files changed, 41 insertions(+), 53 deletions(-)

-- 
2.25.1



^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v2 1/3] writeback: Let trace_balance_dirty_pages() take struct dtc as parameter
  2025-03-04 11:03 [PATCH v2 0/3] Fix calculations in trace_balance_dirty_pages() for cgwb Tang Yizhou
@ 2025-03-04 11:03 ` Tang Yizhou
  2025-03-04 15:39   ` Steven Rostedt
  2025-03-04 11:03 ` [PATCH v2 2/3] writeback: Rename variables in trace_balance_dirty_pages() Tang Yizhou
  2025-03-04 11:03 ` [PATCH v2 3/3] writeback: Fix calculations in trace_balance_dirty_pages() for cgwb Tang Yizhou
  2 siblings, 1 reply; 5+ messages in thread
From: Tang Yizhou @ 2025-03-04 11:03 UTC (permalink / raw)
  To: tj, jack, brauner, willy, akpm
  Cc: rostedt, mhiramat, ast, linux-mm, linux-fsdevel, linux-kernel,
	Tang Yizhou

From: Tang Yizhou <yizhou.tang@shopee.com>

Currently, trace_balance_dirty_pages() already has 12 parameters. In the
patch #3, I initially attempted to introduce an additional parameter.
However, in include/linux/trace_events.h, bpf_trace_run12() only supports
up to 12 parameters and bpf_trace_run13() does not exist.

To reduce the number of parameters in trace_balance_dirty_pages(), we can
make it accept a pointer to struct dirty_throttle_control as a parameter.
To achieve this, we need to move the definition of struct
dirty_throttle_control from mm/page-writeback.c to
include/linux/writeback.h.

Signed-off-by: Tang Yizhou <yizhou.tang@shopee.com>
---
 include/linux/writeback.h        | 23 +++++++++++++++++++++
 include/trace/events/writeback.h | 16 ++++++---------
 mm/page-writeback.c              | 35 ++------------------------------
 3 files changed, 31 insertions(+), 43 deletions(-)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d11b903c2edb..32095928365c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -313,6 +313,29 @@ static inline void cgroup_writeback_umount(struct super_block *sb)
 /*
  * mm/page-writeback.c
  */
+/* consolidated parameters for balance_dirty_pages() and its subroutines */
+struct dirty_throttle_control {
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct wb_domain	*dom;
+	struct dirty_throttle_control *gdtc;	/* only set in memcg dtc's */
+#endif
+	struct bdi_writeback	*wb;
+	struct fprop_local_percpu *wb_completions;
+
+	unsigned long		avail;		/* dirtyable */
+	unsigned long		dirty;		/* file_dirty + write + nfs */
+	unsigned long		thresh;		/* dirty threshold */
+	unsigned long		bg_thresh;	/* dirty background threshold */
+
+	unsigned long		wb_dirty;	/* per-wb counterparts */
+	unsigned long		wb_thresh;
+	unsigned long		wb_bg_thresh;
+
+	unsigned long		pos_ratio;
+	bool			freerun;
+	bool			dirty_exceeded;
+};
+
 void laptop_io_completion(struct backing_dev_info *info);
 void laptop_sync_completion(void);
 void laptop_mode_timer_fn(struct timer_list *t);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index a261e86e61fa..3213b9023794 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -629,11 +629,7 @@ TRACE_EVENT(bdi_dirty_ratelimit,
 TRACE_EVENT(balance_dirty_pages,
 
 	TP_PROTO(struct bdi_writeback *wb,
-		 unsigned long thresh,
-		 unsigned long bg_thresh,
-		 unsigned long dirty,
-		 unsigned long bdi_thresh,
-		 unsigned long bdi_dirty,
+		 struct dirty_throttle_control *dtc,
 		 unsigned long dirty_ratelimit,
 		 unsigned long task_ratelimit,
 		 unsigned long dirtied,
@@ -641,7 +637,7 @@ TRACE_EVENT(balance_dirty_pages,
 		 long pause,
 		 unsigned long start_time),
 
-	TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
+	TP_ARGS(wb, dtc,
 		dirty_ratelimit, task_ratelimit,
 		dirtied, period, pause, start_time),
 
@@ -664,16 +660,16 @@ TRACE_EVENT(balance_dirty_pages,
 	),
 
 	TP_fast_assign(
-		unsigned long freerun = (thresh + bg_thresh) / 2;
+		unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2;
 		strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);
 
 		__entry->limit		= global_wb_domain.dirty_limit;
 		__entry->setpoint	= (global_wb_domain.dirty_limit +
 						freerun) / 2;
-		__entry->dirty		= dirty;
+		__entry->dirty		= dtc->dirty;
 		__entry->bdi_setpoint	= __entry->setpoint *
-						bdi_thresh / (thresh + 1);
-		__entry->bdi_dirty	= bdi_dirty;
+						dtc->wb_thresh / (dtc->thresh + 1);
+		__entry->bdi_dirty	= dtc->wb_dirty;
 		__entry->dirty_ratelimit = KBps(dirty_ratelimit);
 		__entry->task_ratelimit	= KBps(task_ratelimit);
 		__entry->dirtied	= dirtied;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index eb55ece39c56..e980b2aec352 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -120,29 +120,6 @@ EXPORT_SYMBOL(laptop_mode);
 
 struct wb_domain global_wb_domain;
 
-/* consolidated parameters for balance_dirty_pages() and its subroutines */
-struct dirty_throttle_control {
-#ifdef CONFIG_CGROUP_WRITEBACK
-	struct wb_domain	*dom;
-	struct dirty_throttle_control *gdtc;	/* only set in memcg dtc's */
-#endif
-	struct bdi_writeback	*wb;
-	struct fprop_local_percpu *wb_completions;
-
-	unsigned long		avail;		/* dirtyable */
-	unsigned long		dirty;		/* file_dirty + write + nfs */
-	unsigned long		thresh;		/* dirty threshold */
-	unsigned long		bg_thresh;	/* dirty background threshold */
-
-	unsigned long		wb_dirty;	/* per-wb counterparts */
-	unsigned long		wb_thresh;
-	unsigned long		wb_bg_thresh;
-
-	unsigned long		pos_ratio;
-	bool			freerun;
-	bool			dirty_exceeded;
-};
-
 /*
  * Length of period for aging writeout fractions of bdis. This is an
  * arbitrarily chosen number. The longer the period, the slower fractions will
@@ -1962,11 +1939,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
 		 */
 		if (pause < min_pause) {
 			trace_balance_dirty_pages(wb,
-						  sdtc->thresh,
-						  sdtc->bg_thresh,
-						  sdtc->dirty,
-						  sdtc->wb_thresh,
-						  sdtc->wb_dirty,
+						  sdtc,
 						  dirty_ratelimit,
 						  task_ratelimit,
 						  pages_dirtied,
@@ -1991,11 +1964,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
 
 pause:
 		trace_balance_dirty_pages(wb,
-					  sdtc->thresh,
-					  sdtc->bg_thresh,
-					  sdtc->dirty,
-					  sdtc->wb_thresh,
-					  sdtc->wb_dirty,
+					  sdtc,
 					  dirty_ratelimit,
 					  task_ratelimit,
 					  pages_dirtied,
-- 
2.25.1



^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v2 2/3] writeback: Rename variables in trace_balance_dirty_pages()
  2025-03-04 11:03 [PATCH v2 0/3] Fix calculations in trace_balance_dirty_pages() for cgwb Tang Yizhou
  2025-03-04 11:03 ` [PATCH v2 1/3] writeback: Let trace_balance_dirty_pages() take struct dtc as parameter Tang Yizhou
@ 2025-03-04 11:03 ` Tang Yizhou
  2025-03-04 11:03 ` [PATCH v2 3/3] writeback: Fix calculations in trace_balance_dirty_pages() for cgwb Tang Yizhou
  2 siblings, 0 replies; 5+ messages in thread
From: Tang Yizhou @ 2025-03-04 11:03 UTC (permalink / raw)
  To: tj, jack, brauner, willy, akpm
  Cc: rostedt, mhiramat, ast, linux-mm, linux-fsdevel, linux-kernel,
	Tang Yizhou

From: Tang Yizhou <yizhou.tang@shopee.com>

Rename bdi_setpoint and bdi_dirty in the tracepoint to wb_setpoint and
wb_dirty, respectively. These changes were omitted by Tejun in the cgroup
writeback patchset.

Signed-off-by: Tang Yizhou <yizhou.tang@shopee.com>
---
 include/trace/events/writeback.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 3213b9023794..3046ca6b08ea 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -646,8 +646,8 @@ TRACE_EVENT(balance_dirty_pages,
 		__field(unsigned long,	limit)
 		__field(unsigned long,	setpoint)
 		__field(unsigned long,	dirty)
-		__field(unsigned long,	bdi_setpoint)
-		__field(unsigned long,	bdi_dirty)
+		__field(unsigned long,	wb_setpoint)
+		__field(unsigned long,	wb_dirty)
 		__field(unsigned long,	dirty_ratelimit)
 		__field(unsigned long,	task_ratelimit)
 		__field(unsigned int,	dirtied)
@@ -667,9 +667,9 @@ TRACE_EVENT(balance_dirty_pages,
 		__entry->setpoint	= (global_wb_domain.dirty_limit +
 						freerun) / 2;
 		__entry->dirty		= dtc->dirty;
-		__entry->bdi_setpoint	= __entry->setpoint *
+		__entry->wb_setpoint	= __entry->setpoint *
 						dtc->wb_thresh / (dtc->thresh + 1);
-		__entry->bdi_dirty	= dtc->wb_dirty;
+		__entry->wb_dirty	= dtc->wb_dirty;
 		__entry->dirty_ratelimit = KBps(dirty_ratelimit);
 		__entry->task_ratelimit	= KBps(task_ratelimit);
 		__entry->dirtied	= dirtied;
@@ -685,7 +685,7 @@ TRACE_EVENT(balance_dirty_pages,
 
 	TP_printk("bdi %s: "
 		  "limit=%lu setpoint=%lu dirty=%lu "
-		  "bdi_setpoint=%lu bdi_dirty=%lu "
+		  "wb_setpoint=%lu wb_dirty=%lu "
 		  "dirty_ratelimit=%lu task_ratelimit=%lu "
 		  "dirtied=%u dirtied_pause=%u "
 		  "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu",
@@ -693,8 +693,8 @@ TRACE_EVENT(balance_dirty_pages,
 		  __entry->limit,
 		  __entry->setpoint,
 		  __entry->dirty,
-		  __entry->bdi_setpoint,
-		  __entry->bdi_dirty,
+		  __entry->wb_setpoint,
+		  __entry->wb_dirty,
 		  __entry->dirty_ratelimit,
 		  __entry->task_ratelimit,
 		  __entry->dirtied,
-- 
2.25.1



^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v2 3/3] writeback: Fix calculations in trace_balance_dirty_pages() for cgwb
  2025-03-04 11:03 [PATCH v2 0/3] Fix calculations in trace_balance_dirty_pages() for cgwb Tang Yizhou
  2025-03-04 11:03 ` [PATCH v2 1/3] writeback: Let trace_balance_dirty_pages() take struct dtc as parameter Tang Yizhou
  2025-03-04 11:03 ` [PATCH v2 2/3] writeback: Rename variables in trace_balance_dirty_pages() Tang Yizhou
@ 2025-03-04 11:03 ` Tang Yizhou
  2 siblings, 0 replies; 5+ messages in thread
From: Tang Yizhou @ 2025-03-04 11:03 UTC (permalink / raw)
  To: tj, jack, brauner, willy, akpm
  Cc: rostedt, mhiramat, ast, linux-mm, linux-fsdevel, linux-kernel,
	Tang Yizhou

From: Tang Yizhou <yizhou.tang@shopee.com>

In the commit dcc25ae76eb7 ("writeback: move global_dirty_limit into
wb_domain") of the cgroup writeback backpressure propagation patchset,
Tejun made some adaptations to trace_balance_dirty_pages() for cgroup
writeback. However, this adaptation was incomplete and Tejun missed
further adaptation in the subsequent patches.

In the cgroup writeback scenario, if sdtc in balance_dirty_pages() is
assigned to mdtc, then upon entering trace_balance_dirty_pages(),
__entry->limit should be assigned based on the dirty_limit of the
corresponding memcg's wb_domain, rather than global_wb_domain.

To address this issue and simplify the implementation, introduce a 'limit'
field in struct dirty_throttle_control to store the hard_limit value
computed in wb_position_ratio() by calling hard_dirty_limit(). This field
will then be used in trace_balance_dirty_pages() to assign the value to
__entry->limit.

Fixes: dcc25ae76eb7 ("writeback: move global_dirty_limit into wb_domain")
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tang Yizhou <yizhou.tang@shopee.com>
---
 include/linux/writeback.h        | 1 +
 include/trace/events/writeback.h | 5 ++---
 mm/page-writeback.c              | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 32095928365c..58bda3347914 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -326,6 +326,7 @@ struct dirty_throttle_control {
 	unsigned long		dirty;		/* file_dirty + write + nfs */
 	unsigned long		thresh;		/* dirty threshold */
 	unsigned long		bg_thresh;	/* dirty background threshold */
+	unsigned long		limit;		/* hard dirty limit */
 
 	unsigned long		wb_dirty;	/* per-wb counterparts */
 	unsigned long		wb_thresh;
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 3046ca6b08ea..0ff388131fc9 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -663,9 +663,8 @@ TRACE_EVENT(balance_dirty_pages,
 		unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2;
 		strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);
 
-		__entry->limit		= global_wb_domain.dirty_limit;
-		__entry->setpoint	= (global_wb_domain.dirty_limit +
-						freerun) / 2;
+		__entry->limit		= dtc->limit;
+		__entry->setpoint	= (dtc->limit + freerun) / 2;
 		__entry->dirty		= dtc->dirty;
 		__entry->wb_setpoint	= __entry->setpoint *
 						dtc->wb_thresh / (dtc->thresh + 1);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e980b2aec352..3147119a9a04 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1072,7 +1072,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
 	struct bdi_writeback *wb = dtc->wb;
 	unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
 	unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
-	unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+	unsigned long limit = dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
 	unsigned long wb_thresh = dtc->wb_thresh;
 	unsigned long x_intercept;
 	unsigned long setpoint;		/* dirty pages' target balance point */
-- 
2.25.1



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2 1/3] writeback: Let trace_balance_dirty_pages() take struct dtc as parameter
  2025-03-04 11:03 ` [PATCH v2 1/3] writeback: Let trace_balance_dirty_pages() take struct dtc as parameter Tang Yizhou
@ 2025-03-04 15:39   ` Steven Rostedt
  0 siblings, 0 replies; 5+ messages in thread
From: Steven Rostedt @ 2025-03-04 15:39 UTC (permalink / raw)
  To: Tang Yizhou
  Cc: tj, jack, brauner, willy, akpm, mhiramat, ast, linux-mm,
	linux-fsdevel, linux-kernel

On Tue,  4 Mar 2025 19:03:16 +0800
Tang Yizhou <yizhou.tang@shopee.com> wrote:

> @@ -664,16 +660,16 @@ TRACE_EVENT(balance_dirty_pages,
>  	),
>  
>  	TP_fast_assign(
> -		unsigned long freerun = (thresh + bg_thresh) / 2;
> +		unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2;
>  		strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);
>  
>  		__entry->limit		= global_wb_domain.dirty_limit;
>  		__entry->setpoint	= (global_wb_domain.dirty_limit +
>  						freerun) / 2;
> -		__entry->dirty		= dirty;
> +		__entry->dirty		= dtc->dirty;
>  		__entry->bdi_setpoint	= __entry->setpoint *
> -						bdi_thresh / (thresh + 1);
> -		__entry->bdi_dirty	= bdi_dirty;
> +						dtc->wb_thresh / (dtc->thresh + 1);
> +		__entry->bdi_dirty	= dtc->wb_dirty;
>  		__entry->dirty_ratelimit = KBps(dirty_ratelimit);
>  		__entry->task_ratelimit	= KBps(task_ratelimit);
>  		__entry->dirtied	= dirtied;

I don't know how much of a fast path these tracepoints are in, but instead
of doing the divisions above, why not just save the values in the ring
buffer, and do the divisions in the TP_printk() section, which is done when
the user reads it and not when the code is executing?

-- Steve


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-03-04 15:39 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-03-04 11:03 [PATCH v2 0/3] Fix calculations in trace_balance_dirty_pages() for cgwb Tang Yizhou
2025-03-04 11:03 ` [PATCH v2 1/3] writeback: Let trace_balance_dirty_pages() take struct dtc as parameter Tang Yizhou
2025-03-04 15:39   ` Steven Rostedt
2025-03-04 11:03 ` [PATCH v2 2/3] writeback: Rename variables in trace_balance_dirty_pages() Tang Yizhou
2025-03-04 11:03 ` [PATCH v2 3/3] writeback: Fix calculations in trace_balance_dirty_pages() for cgwb Tang Yizhou

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox