[PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
@ 2023-12-04 19:41 David Finkel
  2023-12-04 23:33 ` Shakeel Butt
                   ` (2 more replies)
  0 siblings, 3 replies; 47+ messages in thread
From: David Finkel @ 2023-12-04 19:41 UTC (permalink / raw)
  To: Muchun Song
  Cc: core-services, Jonathan Corbet, Michal Hocko, Roman Gushchin,
	Shakeel Butt, Shuah Khan, cgroups, linux-doc, linux-mm,
	linux-kselftest, David Finkel

Other mechanisms for querying the peak memory usage of either a process
or v1 memory cgroup allow for resetting the high watermark. Restore
parity with those mechanisms.

For example:
 - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
   the high watermark.
 - writing "5" to the clear_refs pseudo-file in a processes's proc
   directory resets the peak RSS.

This change copies the cgroup v1 behavior so any write to the
memory.peak and memory.swap.peak pseudo-files reset the high watermark
to the current usage.

This behavior is particularly useful for work scheduling systems that
need to track memory usage of worker processes/cgroups per-work-item.
Since memory can't be squeezed like CPU can (the OOM-killer has
opinions), these systems need to track the peak memory usage to compute
system/container fullness when binpacking workitems.

Signed-off-by: David Finkel <davidf@vimeo.com>
---
 Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
 mm/memcontrol.c                               | 23 ++++++
 .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
 3 files changed, 99 insertions(+), 16 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 3f85254f3cef..95af0628dc44 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1305,11 +1305,13 @@ PAGE_SIZE multiple when read back.
 	reclaim induced by memory.reclaim.
 
   memory.peak
-	A read-only single value file which exists on non-root
-	cgroups.
+	A read-write single value file which exists on non-root cgroups.
+
+	The max memory usage recorded for the cgroup and its descendants since
+	either the creation of the cgroup or the most recent reset.
 
-	The max memory usage recorded for the cgroup and its
-	descendants since the creation of the cgroup.
+	Any non-empty write to this file resets it to the current memory usage.
+	All content written is completely ignored.
 
   memory.oom.group
 	A read-write single value file which exists on non-root
@@ -1626,11 +1628,13 @@ PAGE_SIZE multiple when read back.
 	Healthy workloads are not expected to reach this limit.
 
   memory.swap.peak
-	A read-only single value file which exists on non-root
-	cgroups.
+	A read-write single value file which exists on non-root cgroups.
+
+	The max swap usage recorded for the cgroup and its descendants since
+	the creation of the cgroup or the most recent reset.
 
-	The max swap usage recorded for the cgroup and its
-	descendants since the creation of the cgroup.
+	Any non-empty write to this file resets it to the current swap usage.
+	All content written is completely ignored.
 
   memory.swap.max
 	A read-write single value file which exists on non-root
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c1061df9cd1..b04af158922d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,6 +25,7 @@
  * Copyright (C) 2020 Alibaba, Inc, Alex Shi
  */
 
+#include <linux/cgroup-defs.h>
 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
@@ -6635,6 +6636,16 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css,
 	return (u64)memcg->memory.watermark * PAGE_SIZE;
 }
 
+static ssize_t memory_peak_write(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+	page_counter_reset_watermark(&memcg->memory);
+
+	return nbytes;
+}
+
 static int memory_min_show(struct seq_file *m, void *v)
 {
 	return seq_puts_memcg_tunable(m,
@@ -6947,6 +6958,7 @@ static struct cftype memory_files[] = {
 		.name = "peak",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = memory_peak_read,
+		.write = memory_peak_write,
 	},
 	{
 		.name = "min",
@@ -7917,6 +7929,16 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css,
 	return (u64)memcg->swap.watermark * PAGE_SIZE;
 }
 
+static ssize_t swap_peak_write(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+	page_counter_reset_watermark(&memcg->swap);
+
+	return nbytes;
+}
+
 static int swap_high_show(struct seq_file *m, void *v)
 {
 	return seq_puts_memcg_tunable(m,
@@ -7999,6 +8021,7 @@ static struct cftype swap_files[] = {
 		.name = "swap.peak",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = swap_peak_read,
+		.write = swap_peak_write,
 	},
 	{
 		.name = "swap.events",
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index c7c9572003a8..0326c317f1f2 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
 /*
  * This test create a memory cgroup, allocates
  * some anonymous memory and some pagecache
- * and check memory.current and some memory.stat values.
+ * and checks memory.current, memory.peak, and some memory.stat values.
  */
-static int test_memcg_current(const char *root)
+static int test_memcg_current_peak(const char *root)
 {
 	int ret = KSFT_FAIL;
-	long current;
+	long current, peak, peak_reset;
 	char *memcg;
 
 	memcg = cg_name(root, "memcg_test");
@@ -180,12 +180,32 @@ static int test_memcg_current(const char *root)
 	if (current != 0)
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak != 0)
+		goto cleanup;
+
 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(50))
+		goto cleanup;
+
+	peak_reset = cg_write(memcg, "memory.peak", "\n");
+	if (peak_reset != 0)
+		goto cleanup;
+
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak > MB(30))
+		goto cleanup;
+
 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(50))
+		goto cleanup;
+
 	ret = KSFT_PASS;
 
 cleanup:
@@ -815,13 +835,14 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
 
 /*
  * This test checks that memory.swap.max limits the amount of
- * anonymous memory which can be swapped out.
+ * anonymous memory which can be swapped out. Additionally, it verifies that
+ * memory.swap.peak reflects the high watermark and can be reset.
  */
-static int test_memcg_swap_max(const char *root)
+static int test_memcg_swap_max_peak(const char *root)
 {
 	int ret = KSFT_FAIL;
 	char *memcg;
-	long max;
+	long max, peak;
 
 	if (!is_swap_enabled())
 		return KSFT_SKIP;
@@ -838,6 +859,12 @@ static int test_memcg_swap_max(const char *root)
 		goto cleanup;
 	}
 
+	if (cg_read_long(memcg, "memory.swap.peak"))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.peak"))
+		goto cleanup;
+
 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
 		goto cleanup;
 
@@ -860,6 +887,27 @@ static int test_memcg_swap_max(const char *root)
 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	peak = cg_read_long(memcg, "memory.swap.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.swap.peak", "\n"))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.swap.peak") > MB(10))
+		goto cleanup;
+
+
+	if (cg_write(memcg, "memory.peak", "\n"))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.peak"))
+		goto cleanup;
+
 	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
 		goto cleanup;
 
@@ -867,6 +915,14 @@ static int test_memcg_swap_max(const char *root)
 	if (max <= 0)
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	peak = cg_read_long(memcg, "memory.swap.peak");
+	if (peak < MB(19))
+		goto cleanup;
+
 	ret = KSFT_PASS;
 
 cleanup:
@@ -1293,7 +1349,7 @@ struct memcg_test {
 	const char *name;
 } tests[] = {
 	T(test_memcg_subtree_control),
-	T(test_memcg_current),
+	T(test_memcg_current_peak),
 	T(test_memcg_min),
 	T(test_memcg_low),
 	T(test_memcg_high),
@@ -1301,7 +1357,7 @@ struct memcg_test {
 	T(test_memcg_max),
 	T(test_memcg_reclaim),
 	T(test_memcg_oom_events),
-	T(test_memcg_swap_max),
+	T(test_memcg_swap_max_peak),
 	T(test_memcg_sock),
 	T(test_memcg_oom_group_leaf_events),
 	T(test_memcg_oom_group_parent_events),
-- 
2.39.2



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2023-12-04 19:41 [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers David Finkel
@ 2023-12-04 23:33 ` Shakeel Butt
  2023-12-05  9:07 ` Michal Hocko
  2024-02-07 21:06 ` David Finkel
  2 siblings, 0 replies; 47+ messages in thread
From: Shakeel Butt @ 2023-12-04 23:33 UTC (permalink / raw)
  To: David Finkel
  Cc: Muchun Song, core-services, Jonathan Corbet, Michal Hocko,
	Roman Gushchin, Shuah Khan, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Mon, Dec 4, 2023 at 11:42 AM David Finkel <davidf@vimeo.com> wrote:
>
> Other mechanisms for querying the peak memory usage of either a process
> or v1 memory cgroup allow for resetting the high watermark. Restore
> parity with those mechanisms.
>
> For example:
>  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
>    the high watermark.
>  - writing "5" to the clear_refs pseudo-file in a processes's proc
>    directory resets the peak RSS.
>
> This change copies the cgroup v1 behavior so any write to the
> memory.peak and memory.swap.peak pseudo-files reset the high watermark
> to the current usage.
>
> This behavior is particularly useful for work scheduling systems that
> need to track memory usage of worker processes/cgroups per-work-item.
> Since memory can't be squeezed like CPU can (the OOM-killer has
> opinions), these systems need to track the peak memory usage to compute
> system/container fullness when binpacking workitems.
>
> Signed-off-by: David Finkel <davidf@vimeo.com>

Acked-by: Shakeel Butt <shakeelb@google.com>


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2023-12-04 19:41 [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers David Finkel
  2023-12-04 23:33 ` Shakeel Butt
@ 2023-12-05  9:07 ` Michal Hocko
  2023-12-05 16:00   ` David Finkel
  2024-02-07 21:06 ` David Finkel
  2 siblings, 1 reply; 47+ messages in thread
From: Michal Hocko @ 2023-12-05  9:07 UTC (permalink / raw)
  To: David Finkel
  Cc: Muchun Song, core-services, Jonathan Corbet, Roman Gushchin,
	Shakeel Butt, Shuah Khan, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Mon 04-12-23 14:41:56, David Finkel wrote:
> Other mechanisms for querying the peak memory usage of either a process
> or v1 memory cgroup allow for resetting the high watermark. Restore
> parity with those mechanisms.
> 
> For example:
>  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
>    the high watermark.
>  - writing "5" to the clear_refs pseudo-file in a processes's proc
>    directory resets the peak RSS.
> 
> This change copies the cgroup v1 behavior so any write to the
> memory.peak and memory.swap.peak pseudo-files reset the high watermark
> to the current usage.
> 
> This behavior is particularly useful for work scheduling systems that
> need to track memory usage of worker processes/cgroups per-work-item.
> Since memory can't be squeezed like CPU can (the OOM-killer has
> opinions), these systems need to track the peak memory usage to compute
> system/container fullness when binpacking workitems.

I do not understand the OOM-killer reference here but I do understand
that your worker reuses a cgroup and you want a peak memory consumption
of a single run to better profile/configure the memcg configuration for
the specific worker type. Correct?

> Signed-off-by: David Finkel <davidf@vimeo.com>

Makes sense to me
Acked-by: Michal Hocko <mhocko@suse.com>

Thanks!

> ---
>  Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
>  mm/memcontrol.c                               | 23 ++++++
>  .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
>  3 files changed, 99 insertions(+), 16 deletions(-)
> 
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 3f85254f3cef..95af0628dc44 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1305,11 +1305,13 @@ PAGE_SIZE multiple when read back.
>  	reclaim induced by memory.reclaim.
>  
>    memory.peak
> -	A read-only single value file which exists on non-root
> -	cgroups.
> +	A read-write single value file which exists on non-root cgroups.
> +
> +	The max memory usage recorded for the cgroup and its descendants since
> +	either the creation of the cgroup or the most recent reset.
>  
> -	The max memory usage recorded for the cgroup and its
> -	descendants since the creation of the cgroup.
> +	Any non-empty write to this file resets it to the current memory usage.
> +	All content written is completely ignored.
>  
>    memory.oom.group
>  	A read-write single value file which exists on non-root
> @@ -1626,11 +1628,13 @@ PAGE_SIZE multiple when read back.
>  	Healthy workloads are not expected to reach this limit.
>  
>    memory.swap.peak
> -	A read-only single value file which exists on non-root
> -	cgroups.
> +	A read-write single value file which exists on non-root cgroups.
> +
> +	The max swap usage recorded for the cgroup and its descendants since
> +	the creation of the cgroup or the most recent reset.
>  
> -	The max swap usage recorded for the cgroup and its
> -	descendants since the creation of the cgroup.
> +	Any non-empty write to this file resets it to the current swap usage.
> +	All content written is completely ignored.
>  
>    memory.swap.max
>  	A read-write single value file which exists on non-root
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1c1061df9cd1..b04af158922d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -25,6 +25,7 @@
>   * Copyright (C) 2020 Alibaba, Inc, Alex Shi
>   */
>  
> +#include <linux/cgroup-defs.h>
>  #include <linux/page_counter.h>
>  #include <linux/memcontrol.h>
>  #include <linux/cgroup.h>
> @@ -6635,6 +6636,16 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css,
>  	return (u64)memcg->memory.watermark * PAGE_SIZE;
>  }
>  
> +static ssize_t memory_peak_write(struct kernfs_open_file *of,
> +				 char *buf, size_t nbytes, loff_t off)
> +{
> +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> +
> +	page_counter_reset_watermark(&memcg->memory);
> +
> +	return nbytes;
> +}
> +
>  static int memory_min_show(struct seq_file *m, void *v)
>  {
>  	return seq_puts_memcg_tunable(m,
> @@ -6947,6 +6958,7 @@ static struct cftype memory_files[] = {
>  		.name = "peak",
>  		.flags = CFTYPE_NOT_ON_ROOT,
>  		.read_u64 = memory_peak_read,
> +		.write = memory_peak_write,
>  	},
>  	{
>  		.name = "min",
> @@ -7917,6 +7929,16 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css,
>  	return (u64)memcg->swap.watermark * PAGE_SIZE;
>  }
>  
> +static ssize_t swap_peak_write(struct kernfs_open_file *of,
> +				 char *buf, size_t nbytes, loff_t off)
> +{
> +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> +
> +	page_counter_reset_watermark(&memcg->swap);
> +
> +	return nbytes;
> +}
> +
>  static int swap_high_show(struct seq_file *m, void *v)
>  {
>  	return seq_puts_memcg_tunable(m,
> @@ -7999,6 +8021,7 @@ static struct cftype swap_files[] = {
>  		.name = "swap.peak",
>  		.flags = CFTYPE_NOT_ON_ROOT,
>  		.read_u64 = swap_peak_read,
> +		.write = swap_peak_write,
>  	},
>  	{
>  		.name = "swap.events",
> diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
> index c7c9572003a8..0326c317f1f2 100644
> --- a/tools/testing/selftests/cgroup/test_memcontrol.c
> +++ b/tools/testing/selftests/cgroup/test_memcontrol.c
> @@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
>  /*
>   * This test create a memory cgroup, allocates
>   * some anonymous memory and some pagecache
> - * and check memory.current and some memory.stat values.
> + * and checks memory.current, memory.peak, and some memory.stat values.
>   */
> -static int test_memcg_current(const char *root)
> +static int test_memcg_current_peak(const char *root)
>  {
>  	int ret = KSFT_FAIL;
> -	long current;
> +	long current, peak, peak_reset;
>  	char *memcg;
>  
>  	memcg = cg_name(root, "memcg_test");
> @@ -180,12 +180,32 @@ static int test_memcg_current(const char *root)
>  	if (current != 0)
>  		goto cleanup;
>  
> +	peak = cg_read_long(memcg, "memory.peak");
> +	if (peak != 0)
> +		goto cleanup;
> +
>  	if (cg_run(memcg, alloc_anon_50M_check, NULL))
>  		goto cleanup;
>  
> +	peak = cg_read_long(memcg, "memory.peak");
> +	if (peak < MB(50))
> +		goto cleanup;
> +
> +	peak_reset = cg_write(memcg, "memory.peak", "\n");
> +	if (peak_reset != 0)
> +		goto cleanup;
> +
> +	peak = cg_read_long(memcg, "memory.peak");
> +	if (peak > MB(30))
> +		goto cleanup;
> +
>  	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
>  		goto cleanup;
>  
> +	peak = cg_read_long(memcg, "memory.peak");
> +	if (peak < MB(50))
> +		goto cleanup;
> +
>  	ret = KSFT_PASS;
>  
>  cleanup:
> @@ -815,13 +835,14 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
>  
>  /*
>   * This test checks that memory.swap.max limits the amount of
> - * anonymous memory which can be swapped out.
> + * anonymous memory which can be swapped out. Additionally, it verifies that
> + * memory.swap.peak reflects the high watermark and can be reset.
>   */
> -static int test_memcg_swap_max(const char *root)
> +static int test_memcg_swap_max_peak(const char *root)
>  {
>  	int ret = KSFT_FAIL;
>  	char *memcg;
> -	long max;
> +	long max, peak;
>  
>  	if (!is_swap_enabled())
>  		return KSFT_SKIP;
> @@ -838,6 +859,12 @@ static int test_memcg_swap_max(const char *root)
>  		goto cleanup;
>  	}
>  
> +	if (cg_read_long(memcg, "memory.swap.peak"))
> +		goto cleanup;
> +
> +	if (cg_read_long(memcg, "memory.peak"))
> +		goto cleanup;
> +
>  	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
>  		goto cleanup;
>  
> @@ -860,6 +887,27 @@ static int test_memcg_swap_max(const char *root)
>  	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
>  		goto cleanup;
>  
> +	peak = cg_read_long(memcg, "memory.peak");
> +	if (peak < MB(29))
> +		goto cleanup;
> +
> +	peak = cg_read_long(memcg, "memory.swap.peak");
> +	if (peak < MB(29))
> +		goto cleanup;
> +
> +	if (cg_write(memcg, "memory.swap.peak", "\n"))
> +		goto cleanup;
> +
> +	if (cg_read_long(memcg, "memory.swap.peak") > MB(10))
> +		goto cleanup;
> +
> +
> +	if (cg_write(memcg, "memory.peak", "\n"))
> +		goto cleanup;
> +
> +	if (cg_read_long(memcg, "memory.peak"))
> +		goto cleanup;
> +
>  	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
>  		goto cleanup;
>  
> @@ -867,6 +915,14 @@ static int test_memcg_swap_max(const char *root)
>  	if (max <= 0)
>  		goto cleanup;
>  
> +	peak = cg_read_long(memcg, "memory.peak");
> +	if (peak < MB(29))
> +		goto cleanup;
> +
> +	peak = cg_read_long(memcg, "memory.swap.peak");
> +	if (peak < MB(19))
> +		goto cleanup;
> +
>  	ret = KSFT_PASS;
>  
>  cleanup:
> @@ -1293,7 +1349,7 @@ struct memcg_test {
>  	const char *name;
>  } tests[] = {
>  	T(test_memcg_subtree_control),
> -	T(test_memcg_current),
> +	T(test_memcg_current_peak),
>  	T(test_memcg_min),
>  	T(test_memcg_low),
>  	T(test_memcg_high),
> @@ -1301,7 +1357,7 @@ struct memcg_test {
>  	T(test_memcg_max),
>  	T(test_memcg_reclaim),
>  	T(test_memcg_oom_events),
> -	T(test_memcg_swap_max),
> +	T(test_memcg_swap_max_peak),
>  	T(test_memcg_sock),
>  	T(test_memcg_oom_group_leaf_events),
>  	T(test_memcg_oom_group_parent_events),
> -- 
> 2.39.2

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2023-12-05  9:07 ` Michal Hocko
@ 2023-12-05 16:00   ` David Finkel
  2023-12-06  8:45     ` Michal Hocko
  0 siblings, 1 reply; 47+ messages in thread
From: David Finkel @ 2023-12-05 16:00 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Muchun Song, core-services, Jonathan Corbet, Roman Gushchin,
	Shakeel Butt, Shuah Khan, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Tue, Dec 5, 2023 at 4:07 AM Michal Hocko <mhocko@suse.com> wrote:

> > This behavior is particularly useful for work scheduling systems that
> > need to track memory usage of worker processes/cgroups per-work-item.
> > Since memory can't be squeezed like CPU can (the OOM-killer has
> > opinions), these systems need to track the peak memory usage to compute
> > system/container fullness when binpacking workitems.
>
> I do not understand the OOM-killer reference here but I do understand
> that your worker reuses a cgroup and you want a peak memory consumption
> of a single run to better profile/configure the memcg configuration for
> the specific worker type. Correct?

To a certain extent, yes.
At the moment, we're only using the inner memcg cgroups for
accounting/profiling, and using a
larger (k8s container) cgroup for enforcement.

The OOM-killer is involved because we're not configuring any memory limits on
these individual "worker" cgroups, so we need to provision for
multiple workloads using
their peak memory at the same time to minimize OOM-killing.

In case you're curious, this is the job/queue-work scheduling system
we wrote in-house
called Quickset that's mentioned in this blog post about our new
transcoder system:
https://medium.com/vimeo-engineering-blog/riding-the-dragon-e328a3dfd39d

>
> > Signed-off-by: David Finkel <davidf@vimeo.com>
>
> Makes sense to me
> Acked-by: Michal Hocko <mhocko@suse.com>
>
> Thanks!

Thank you!

-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2023-12-05 16:00   ` David Finkel
@ 2023-12-06  8:45     ` Michal Hocko
  0 siblings, 0 replies; 47+ messages in thread
From: Michal Hocko @ 2023-12-06  8:45 UTC (permalink / raw)
  To: David Finkel
  Cc: Muchun Song, core-services, Jonathan Corbet, Roman Gushchin,
	Shakeel Butt, Shuah Khan, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Tue 05-12-23 11:00:47, David Finkel wrote:
> On Tue, Dec 5, 2023 at 4:07 AM Michal Hocko <mhocko@suse.com> wrote:
> 
> > > This behavior is particularly useful for work scheduling systems that
> > > need to track memory usage of worker processes/cgroups per-work-item.
> > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > opinions), these systems need to track the peak memory usage to compute
> > > system/container fullness when binpacking workitems.
> >
> > I do not understand the OOM-killer reference here but I do understand
> > that your worker reuses a cgroup and you want a peak memory consumption
> > of a single run to better profile/configure the memcg configuration for
> > the specific worker type. Correct?
> 
> To a certain extent, yes.
> At the moment, we're only using the inner memcg cgroups for
> accounting/profiling, and using a
> larger (k8s container) cgroup for enforcement.
> 
> The OOM-killer is involved because we're not configuring any memory limits on
> these individual "worker" cgroups, so we need to provision for
> multiple workloads using
> their peak memory at the same time to minimize OOM-killing.

OK, that makes more sense now. Just be aware that you might under
utilize your limit that way because peak memory can still be reclaimed.
If you set the hard limit (memory.max) to the peak memory consumption
you would get a very conservative configuration wihtout any memory
reclaim.
 
> In case you're curious, this is the job/queue-work scheduling system
> we wrote in-house
> called Quickset that's mentioned in this blog post about our new
> transcoder system:
> https://medium.com/vimeo-engineering-blog/riding-the-dragon-e328a3dfd39d

Thanks!

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2023-12-04 19:41 [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers David Finkel
  2023-12-04 23:33 ` Shakeel Butt
  2023-12-05  9:07 ` Michal Hocko
@ 2024-02-07 21:06 ` David Finkel
  2 siblings, 0 replies; 47+ messages in thread
From: David Finkel @ 2024-02-07 21:06 UTC (permalink / raw)
  To: Muchun Song
  Cc: core-services, Jonathan Corbet, Michal Hocko, Roman Gushchin,
	Shakeel Butt, Shuah Khan, cgroups, linux-doc, linux-mm,
	linux-kselftest

Did I miss a reviewer on this change?

I've clearly missed the window for 6.8, but it would be nice to get
this into a staging branch for 6.9.

(I can definitely rebase and re-mail if necessary)

Thanks,
David Finkel


On Mon, Dec 4, 2023 at 2:42 PM David Finkel <davidf@vimeo.com> wrote:
>
> Other mechanisms for querying the peak memory usage of either a process
> or v1 memory cgroup allow for resetting the high watermark. Restore
> parity with those mechanisms.
>
> For example:
>  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
>    the high watermark.
>  - writing "5" to the clear_refs pseudo-file in a processes's proc
>    directory resets the peak RSS.
>
> This change copies the cgroup v1 behavior so any write to the
> memory.peak and memory.swap.peak pseudo-files reset the high watermark
> to the current usage.
>
> This behavior is particularly useful for work scheduling systems that
> need to track memory usage of worker processes/cgroups per-work-item.
> Since memory can't be squeezed like CPU can (the OOM-killer has
> opinions), these systems need to track the peak memory usage to compute
> system/container fullness when binpacking workitems.
>
> Signed-off-by: David Finkel <davidf@vimeo.com>
> ---
>  Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
>  mm/memcontrol.c                               | 23 ++++++
>  .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
>  3 files changed, 99 insertions(+), 16 deletions(-)


-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
@ 2024-07-15 20:36 David Finkel
  2024-07-15 20:36 ` David Finkel
  0 siblings, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-15 20:36 UTC (permalink / raw)
  To: Muchun Song, Andrew Morton
  Cc: core-services, Jonathan Corbet, Michal Hocko, Roman Gushchin,
	Shakeel Butt, Shuah Khan, Johannes Weiner, Tejun Heo, Zefan Li,
	cgroups, linux-doc, linux-mm, linux-kselftest

This is a rebase of a patch I sent a few months ago, on which I received
two acks, but the thread petered out:
https://www.spinics.net/lists/cgroups/msg40602.html.

I'm hoping that it can make it into the next LTS (and 6.11 if possible)

Thank you,

David Finkel

Sr. Principal Software Engineer, Vimeo



^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-15 20:36 David Finkel
@ 2024-07-15 20:36 ` David Finkel
  2024-07-15 20:42   ` David Finkel
  2024-07-16 13:48   ` Michal Hocko
  0 siblings, 2 replies; 47+ messages in thread
From: David Finkel @ 2024-07-15 20:36 UTC (permalink / raw)
  To: Muchun Song, Andrew Morton
  Cc: core-services, Jonathan Corbet, Michal Hocko, Roman Gushchin,
	Shakeel Butt, Shuah Khan, Johannes Weiner, Tejun Heo, Zefan Li,
	cgroups, linux-doc, linux-mm, linux-kselftest, David Finkel

Other mechanisms for querying the peak memory usage of either a process
or v1 memory cgroup allow for resetting the high watermark. Restore
parity with those mechanisms.

For example:
 - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
   the high watermark.
 - writing "5" to the clear_refs pseudo-file in a processes's proc
   directory resets the peak RSS.

This change copies the cgroup v1 behavior so any write to the
memory.peak and memory.swap.peak pseudo-files reset the high watermark
to the current usage.

This behavior is particularly useful for work scheduling systems that
need to track memory usage of worker processes/cgroups per-work-item.
Since memory can't be squeezed like CPU can (the OOM-killer has
opinions), these systems need to track the peak memory usage to compute
system/container fullness when binpacking workitems.

Signed-off-by: David Finkel <davidf@vimeo.com>
---
 Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
 mm/memcontrol.c                               | 23 ++++++
 .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
 3 files changed, 99 insertions(+), 16 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 8fbb0519d556..201d8e5d9f82 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1322,11 +1322,13 @@ PAGE_SIZE multiple when read back.
 	reclaim induced by memory.reclaim.
 
   memory.peak
-	A read-only single value file which exists on non-root
-	cgroups.
+	A read-write single value file which exists on non-root cgroups.
+
+	The max memory usage recorded for the cgroup and its descendants since
+	either the creation of the cgroup or the most recent reset.
 
-	The max memory usage recorded for the cgroup and its
-	descendants since the creation of the cgroup.
+	Any non-empty write to this file resets it to the current memory usage.
+	All content written is completely ignored.
 
   memory.oom.group
 	A read-write single value file which exists on non-root
@@ -1652,11 +1654,13 @@ PAGE_SIZE multiple when read back.
 	Healthy workloads are not expected to reach this limit.
 
   memory.swap.peak
-	A read-only single value file which exists on non-root
-	cgroups.
+	A read-write single value file which exists on non-root cgroups.
+
+	The max swap usage recorded for the cgroup and its descendants since
+	the creation of the cgroup or the most recent reset.
 
-	The max swap usage recorded for the cgroup and its
-	descendants since the creation of the cgroup.
+	Any non-empty write to this file resets it to the current swap usage.
+	All content written is completely ignored.
 
   memory.swap.max
 	A read-write single value file which exists on non-root
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8f2f1bb18c9c..abfa547615d6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,6 +25,7 @@
  * Copyright (C) 2020 Alibaba, Inc, Alex Shi
  */
 
+#include <linux/cgroup-defs.h>
 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
@@ -6915,6 +6916,16 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css,
 	return (u64)memcg->memory.watermark * PAGE_SIZE;
 }
 
+static ssize_t memory_peak_write(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+	page_counter_reset_watermark(&memcg->memory);
+
+	return nbytes;
+}
+
 static int memory_min_show(struct seq_file *m, void *v)
 {
 	return seq_puts_memcg_tunable(m,
@@ -7232,6 +7243,7 @@ static struct cftype memory_files[] = {
 		.name = "peak",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = memory_peak_read,
+		.write = memory_peak_write,
 	},
 	{
 		.name = "min",
@@ -8201,6 +8213,16 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css,
 	return (u64)memcg->swap.watermark * PAGE_SIZE;
 }
 
+static ssize_t swap_peak_write(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+	page_counter_reset_watermark(&memcg->swap);
+
+	return nbytes;
+}
+
 static int swap_high_show(struct seq_file *m, void *v)
 {
 	return seq_puts_memcg_tunable(m,
@@ -8283,6 +8305,7 @@ static struct cftype swap_files[] = {
 		.name = "swap.peak",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = swap_peak_read,
+		.write = swap_peak_write,
 	},
 	{
 		.name = "swap.events",
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index 41ae8047b889..681972de673b 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
 /*
  * This test create a memory cgroup, allocates
  * some anonymous memory and some pagecache
- * and check memory.current and some memory.stat values.
+ * and checks memory.current, memory.peak, and some memory.stat values.
  */
-static int test_memcg_current(const char *root)
+static int test_memcg_current_peak(const char *root)
 {
 	int ret = KSFT_FAIL;
-	long current;
+	long current, peak, peak_reset;
 	char *memcg;
 
 	memcg = cg_name(root, "memcg_test");
@@ -180,12 +180,32 @@ static int test_memcg_current(const char *root)
 	if (current != 0)
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak != 0)
+		goto cleanup;
+
 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(50))
+		goto cleanup;
+
+	peak_reset = cg_write(memcg, "memory.peak", "\n");
+	if (peak_reset != 0)
+		goto cleanup;
+
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak > MB(30))
+		goto cleanup;
+
 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(50))
+		goto cleanup;
+
 	ret = KSFT_PASS;
 
 cleanup:
@@ -817,13 +837,14 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
 
 /*
  * This test checks that memory.swap.max limits the amount of
- * anonymous memory which can be swapped out.
+ * anonymous memory which can be swapped out. Additionally, it verifies that
+ * memory.swap.peak reflects the high watermark and can be reset.
  */
-static int test_memcg_swap_max(const char *root)
+static int test_memcg_swap_max_peak(const char *root)
 {
 	int ret = KSFT_FAIL;
 	char *memcg;
-	long max;
+	long max, peak;
 
 	if (!is_swap_enabled())
 		return KSFT_SKIP;
@@ -840,6 +861,12 @@ static int test_memcg_swap_max(const char *root)
 		goto cleanup;
 	}
 
+	if (cg_read_long(memcg, "memory.swap.peak"))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.peak"))
+		goto cleanup;
+
 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
 		goto cleanup;
 
@@ -862,6 +889,27 @@ static int test_memcg_swap_max(const char *root)
 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	peak = cg_read_long(memcg, "memory.swap.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	if (cg_write(memcg, "memory.swap.peak", "\n"))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.swap.peak") > MB(10))
+		goto cleanup;
+
+
+	if (cg_write(memcg, "memory.peak", "\n"))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.peak"))
+		goto cleanup;
+
 	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
 		goto cleanup;
 
@@ -869,6 +917,14 @@ static int test_memcg_swap_max(const char *root)
 	if (max <= 0)
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	peak = cg_read_long(memcg, "memory.swap.peak");
+	if (peak < MB(19))
+		goto cleanup;
+
 	ret = KSFT_PASS;
 
 cleanup:
@@ -1295,7 +1351,7 @@ struct memcg_test {
 	const char *name;
 } tests[] = {
 	T(test_memcg_subtree_control),
-	T(test_memcg_current),
+	T(test_memcg_current_peak),
 	T(test_memcg_min),
 	T(test_memcg_low),
 	T(test_memcg_high),
@@ -1303,7 +1359,7 @@ struct memcg_test {
 	T(test_memcg_max),
 	T(test_memcg_reclaim),
 	T(test_memcg_oom_events),
-	T(test_memcg_swap_max),
+	T(test_memcg_swap_max_peak),
 	T(test_memcg_sock),
 	T(test_memcg_oom_group_leaf_events),
 	T(test_memcg_oom_group_parent_events),
-- 
2.40.1



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-15 20:36 ` David Finkel
@ 2024-07-15 20:42   ` David Finkel
  2024-07-15 20:46     ` David Finkel
  2024-07-16 13:48   ` Michal Hocko
  1 sibling, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-15 20:42 UTC (permalink / raw)
  To: Muchun Song, Andrew Morton
  Cc: core-services, Jonathan Corbet, Michal Hocko, Roman Gushchin,
	Shakeel Butt, Shuah Khan, Johannes Weiner, Tejun Heo, Zefan Li,
	cgroups, linux-doc, linux-mm, linux-kselftest

Note: this is a simple rebase of a patch I sent a few months ago,
which received two acks before the thread petered out:
https://www.spinics.net/lists/cgroups/msg40602.html

Thanks,

On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@vimeo.com> wrote:
>
> Other mechanisms for querying the peak memory usage of either a process
> or v1 memory cgroup allow for resetting the high watermark. Restore
> parity with those mechanisms.
>
> For example:
>  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
>    the high watermark.
>  - writing "5" to the clear_refs pseudo-file in a processes's proc
>    directory resets the peak RSS.
>
> This change copies the cgroup v1 behavior so any write to the
> memory.peak and memory.swap.peak pseudo-files reset the high watermark
> to the current usage.
>
> This behavior is particularly useful for work scheduling systems that
> need to track memory usage of worker processes/cgroups per-work-item.
> Since memory can't be squeezed like CPU can (the OOM-killer has
> opinions), these systems need to track the peak memory usage to compute
> system/container fullness when binpacking workitems.
>
> Signed-off-by: David Finkel <davidf@vimeo.com>
> ---
>  Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
>  mm/memcontrol.c                               | 23 ++++++
>  .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
>  3 files changed, 99 insertions(+), 16 deletions(-)
>
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 8fbb0519d556..201d8e5d9f82 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1322,11 +1322,13 @@ PAGE_SIZE multiple when read back.
>         reclaim induced by memory.reclaim.
>
>    memory.peak
> -       A read-only single value file which exists on non-root
> -       cgroups.
> +       A read-write single value file which exists on non-root cgroups.
> +
> +       The max memory usage recorded for the cgroup and its descendants since
> +       either the creation of the cgroup or the most recent reset.
>
> -       The max memory usage recorded for the cgroup and its
> -       descendants since the creation of the cgroup.
> +       Any non-empty write to this file resets it to the current memory usage.
> +       All content written is completely ignored.
>
>    memory.oom.group
>         A read-write single value file which exists on non-root
> @@ -1652,11 +1654,13 @@ PAGE_SIZE multiple when read back.
>         Healthy workloads are not expected to reach this limit.
>
>    memory.swap.peak
> -       A read-only single value file which exists on non-root
> -       cgroups.
> +       A read-write single value file which exists on non-root cgroups.
> +
> +       The max swap usage recorded for the cgroup and its descendants since
> +       the creation of the cgroup or the most recent reset.
>
> -       The max swap usage recorded for the cgroup and its
> -       descendants since the creation of the cgroup.
> +       Any non-empty write to this file resets it to the current swap usage.
> +       All content written is completely ignored.
>
>    memory.swap.max
>         A read-write single value file which exists on non-root
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 8f2f1bb18c9c..abfa547615d6 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -25,6 +25,7 @@
>   * Copyright (C) 2020 Alibaba, Inc, Alex Shi
>   */
>
> +#include <linux/cgroup-defs.h>
>  #include <linux/page_counter.h>
>  #include <linux/memcontrol.h>
>  #include <linux/cgroup.h>
> @@ -6915,6 +6916,16 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css,
>         return (u64)memcg->memory.watermark * PAGE_SIZE;
>  }
>
> +static ssize_t memory_peak_write(struct kernfs_open_file *of,
> +                                char *buf, size_t nbytes, loff_t off)
> +{
> +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> +
> +       page_counter_reset_watermark(&memcg->memory);
> +
> +       return nbytes;
> +}
> +
>  static int memory_min_show(struct seq_file *m, void *v)
>  {
>         return seq_puts_memcg_tunable(m,
> @@ -7232,6 +7243,7 @@ static struct cftype memory_files[] = {
>                 .name = "peak",
>                 .flags = CFTYPE_NOT_ON_ROOT,
>                 .read_u64 = memory_peak_read,
> +               .write = memory_peak_write,
>         },
>         {
>                 .name = "min",
> @@ -8201,6 +8213,16 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css,
>         return (u64)memcg->swap.watermark * PAGE_SIZE;
>  }
>
> +static ssize_t swap_peak_write(struct kernfs_open_file *of,
> +                                char *buf, size_t nbytes, loff_t off)
> +{
> +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> +
> +       page_counter_reset_watermark(&memcg->swap);
> +
> +       return nbytes;
> +}
> +
>  static int swap_high_show(struct seq_file *m, void *v)
>  {
>         return seq_puts_memcg_tunable(m,
> @@ -8283,6 +8305,7 @@ static struct cftype swap_files[] = {
>                 .name = "swap.peak",
>                 .flags = CFTYPE_NOT_ON_ROOT,
>                 .read_u64 = swap_peak_read,
> +               .write = swap_peak_write,
>         },
>         {
>                 .name = "swap.events",
> diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
> index 41ae8047b889..681972de673b 100644
> --- a/tools/testing/selftests/cgroup/test_memcontrol.c
> +++ b/tools/testing/selftests/cgroup/test_memcontrol.c
> @@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
>  /*
>   * This test create a memory cgroup, allocates
>   * some anonymous memory and some pagecache
> - * and check memory.current and some memory.stat values.
> + * and checks memory.current, memory.peak, and some memory.stat values.
>   */
> -static int test_memcg_current(const char *root)
> +static int test_memcg_current_peak(const char *root)
>  {
>         int ret = KSFT_FAIL;
> -       long current;
> +       long current, peak, peak_reset;
>         char *memcg;
>
>         memcg = cg_name(root, "memcg_test");
> @@ -180,12 +180,32 @@ static int test_memcg_current(const char *root)
>         if (current != 0)
>                 goto cleanup;
>
> +       peak = cg_read_long(memcg, "memory.peak");
> +       if (peak != 0)
> +               goto cleanup;
> +
>         if (cg_run(memcg, alloc_anon_50M_check, NULL))
>                 goto cleanup;
>
> +       peak = cg_read_long(memcg, "memory.peak");
> +       if (peak < MB(50))
> +               goto cleanup;
> +
> +       peak_reset = cg_write(memcg, "memory.peak", "\n");
> +       if (peak_reset != 0)
> +               goto cleanup;
> +
> +       peak = cg_read_long(memcg, "memory.peak");
> +       if (peak > MB(30))
> +               goto cleanup;
> +
>         if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
>                 goto cleanup;
>
> +       peak = cg_read_long(memcg, "memory.peak");
> +       if (peak < MB(50))
> +               goto cleanup;
> +
>         ret = KSFT_PASS;
>
>  cleanup:
> @@ -817,13 +837,14 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
>
>  /*
>   * This test checks that memory.swap.max limits the amount of
> - * anonymous memory which can be swapped out.
> + * anonymous memory which can be swapped out. Additionally, it verifies that
> + * memory.swap.peak reflects the high watermark and can be reset.
>   */
> -static int test_memcg_swap_max(const char *root)
> +static int test_memcg_swap_max_peak(const char *root)
>  {
>         int ret = KSFT_FAIL;
>         char *memcg;
> -       long max;
> +       long max, peak;
>
>         if (!is_swap_enabled())
>                 return KSFT_SKIP;
> @@ -840,6 +861,12 @@ static int test_memcg_swap_max(const char *root)
>                 goto cleanup;
>         }
>
> +       if (cg_read_long(memcg, "memory.swap.peak"))
> +               goto cleanup;
> +
> +       if (cg_read_long(memcg, "memory.peak"))
> +               goto cleanup;
> +
>         if (cg_read_strcmp(memcg, "memory.max", "max\n"))
>                 goto cleanup;
>
> @@ -862,6 +889,27 @@ static int test_memcg_swap_max(const char *root)
>         if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
>                 goto cleanup;
>
> +       peak = cg_read_long(memcg, "memory.peak");
> +       if (peak < MB(29))
> +               goto cleanup;
> +
> +       peak = cg_read_long(memcg, "memory.swap.peak");
> +       if (peak < MB(29))
> +               goto cleanup;
> +
> +       if (cg_write(memcg, "memory.swap.peak", "\n"))
> +               goto cleanup;
> +
> +       if (cg_read_long(memcg, "memory.swap.peak") > MB(10))
> +               goto cleanup;
> +
> +
> +       if (cg_write(memcg, "memory.peak", "\n"))
> +               goto cleanup;
> +
> +       if (cg_read_long(memcg, "memory.peak"))
> +               goto cleanup;
> +
>         if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
>                 goto cleanup;
>
> @@ -869,6 +917,14 @@ static int test_memcg_swap_max(const char *root)
>         if (max <= 0)
>                 goto cleanup;
>
> +       peak = cg_read_long(memcg, "memory.peak");
> +       if (peak < MB(29))
> +               goto cleanup;
> +
> +       peak = cg_read_long(memcg, "memory.swap.peak");
> +       if (peak < MB(19))
> +               goto cleanup;
> +
>         ret = KSFT_PASS;
>
>  cleanup:
> @@ -1295,7 +1351,7 @@ struct memcg_test {
>         const char *name;
>  } tests[] = {
>         T(test_memcg_subtree_control),
> -       T(test_memcg_current),
> +       T(test_memcg_current_peak),
>         T(test_memcg_min),
>         T(test_memcg_low),
>         T(test_memcg_high),
> @@ -1303,7 +1359,7 @@ struct memcg_test {
>         T(test_memcg_max),
>         T(test_memcg_reclaim),
>         T(test_memcg_oom_events),
> -       T(test_memcg_swap_max),
> +       T(test_memcg_swap_max_peak),
>         T(test_memcg_sock),
>         T(test_memcg_oom_group_leaf_events),
>         T(test_memcg_oom_group_parent_events),
> --
> 2.40.1
>


-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-15 20:42   ` David Finkel
@ 2024-07-15 20:46     ` David Finkel
  2024-07-16  7:20       ` Michal Hocko
  0 siblings, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-15 20:46 UTC (permalink / raw)
  To: Muchun Song, Andrew Morton
  Cc: core-services, Jonathan Corbet, Michal Hocko, Roman Gushchin,
	Shuah Khan, Johannes Weiner, Tejun Heo, Zefan Li, cgroups,
	linux-doc, linux-mm, linux-kselftest, Shakeel Butt

[Fixing Shakeel's email address (I didn't notice it changed when
comparing my previous "git send-email" commandline and
get_mainainer.pl output)]

On Mon, Jul 15, 2024 at 4:42 PM David Finkel <davidf@vimeo.com> wrote:
>
> Note: this is a simple rebase of a patch I sent a few months ago,
> which received two acks before the thread petered out:
> https://www.spinics.net/lists/cgroups/msg40602.html
>
> Thanks,
>
> On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@vimeo.com> wrote:
> >
> > Other mechanisms for querying the peak memory usage of either a process
> > or v1 memory cgroup allow for resetting the high watermark. Restore
> > parity with those mechanisms.
> >
> > For example:
> >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> >    the high watermark.
> >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> >    directory resets the peak RSS.
> >
> > This change copies the cgroup v1 behavior so any write to the
> > memory.peak and memory.swap.peak pseudo-files reset the high watermark
> > to the current usage.
> >
> > This behavior is particularly useful for work scheduling systems that
> > need to track memory usage of worker processes/cgroups per-work-item.
> > Since memory can't be squeezed like CPU can (the OOM-killer has
> > opinions), these systems need to track the peak memory usage to compute
> > system/container fullness when binpacking workitems.
> >
> > Signed-off-by: David Finkel <davidf@vimeo.com>
> > ---
> >  Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
> >  mm/memcontrol.c                               | 23 ++++++
> >  .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
> >  3 files changed, 99 insertions(+), 16 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> > index 8fbb0519d556..201d8e5d9f82 100644
> > --- a/Documentation/admin-guide/cgroup-v2.rst
> > +++ b/Documentation/admin-guide/cgroup-v2.rst
> > @@ -1322,11 +1322,13 @@ PAGE_SIZE multiple when read back.
> >         reclaim induced by memory.reclaim.
> >
> >    memory.peak
> > -       A read-only single value file which exists on non-root
> > -       cgroups.
> > +       A read-write single value file which exists on non-root cgroups.
> > +
> > +       The max memory usage recorded for the cgroup and its descendants since
> > +       either the creation of the cgroup or the most recent reset.
> >
> > -       The max memory usage recorded for the cgroup and its
> > -       descendants since the creation of the cgroup.
> > +       Any non-empty write to this file resets it to the current memory usage.
> > +       All content written is completely ignored.
> >
> >    memory.oom.group
> >         A read-write single value file which exists on non-root
> > @@ -1652,11 +1654,13 @@ PAGE_SIZE multiple when read back.
> >         Healthy workloads are not expected to reach this limit.
> >
> >    memory.swap.peak
> > -       A read-only single value file which exists on non-root
> > -       cgroups.
> > +       A read-write single value file which exists on non-root cgroups.
> > +
> > +       The max swap usage recorded for the cgroup and its descendants since
> > +       the creation of the cgroup or the most recent reset.
> >
> > -       The max swap usage recorded for the cgroup and its
> > -       descendants since the creation of the cgroup.
> > +       Any non-empty write to this file resets it to the current swap usage.
> > +       All content written is completely ignored.
> >
> >    memory.swap.max
> >         A read-write single value file which exists on non-root
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 8f2f1bb18c9c..abfa547615d6 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -25,6 +25,7 @@
> >   * Copyright (C) 2020 Alibaba, Inc, Alex Shi
> >   */
> >
> > +#include <linux/cgroup-defs.h>
> >  #include <linux/page_counter.h>
> >  #include <linux/memcontrol.h>
> >  #include <linux/cgroup.h>
> > @@ -6915,6 +6916,16 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css,
> >         return (u64)memcg->memory.watermark * PAGE_SIZE;
> >  }
> >
> > +static ssize_t memory_peak_write(struct kernfs_open_file *of,
> > +                                char *buf, size_t nbytes, loff_t off)
> > +{
> > +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > +
> > +       page_counter_reset_watermark(&memcg->memory);
> > +
> > +       return nbytes;
> > +}
> > +
> >  static int memory_min_show(struct seq_file *m, void *v)
> >  {
> >         return seq_puts_memcg_tunable(m,
> > @@ -7232,6 +7243,7 @@ static struct cftype memory_files[] = {
> >                 .name = "peak",
> >                 .flags = CFTYPE_NOT_ON_ROOT,
> >                 .read_u64 = memory_peak_read,
> > +               .write = memory_peak_write,
> >         },
> >         {
> >                 .name = "min",
> > @@ -8201,6 +8213,16 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css,
> >         return (u64)memcg->swap.watermark * PAGE_SIZE;
> >  }
> >
> > +static ssize_t swap_peak_write(struct kernfs_open_file *of,
> > +                                char *buf, size_t nbytes, loff_t off)
> > +{
> > +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > +
> > +       page_counter_reset_watermark(&memcg->swap);
> > +
> > +       return nbytes;
> > +}
> > +
> >  static int swap_high_show(struct seq_file *m, void *v)
> >  {
> >         return seq_puts_memcg_tunable(m,
> > @@ -8283,6 +8305,7 @@ static struct cftype swap_files[] = {
> >                 .name = "swap.peak",
> >                 .flags = CFTYPE_NOT_ON_ROOT,
> >                 .read_u64 = swap_peak_read,
> > +               .write = swap_peak_write,
> >         },
> >         {
> >                 .name = "swap.events",
> > diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
> > index 41ae8047b889..681972de673b 100644
> > --- a/tools/testing/selftests/cgroup/test_memcontrol.c
> > +++ b/tools/testing/selftests/cgroup/test_memcontrol.c
> > @@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
> >  /*
> >   * This test create a memory cgroup, allocates
> >   * some anonymous memory and some pagecache
> > - * and check memory.current and some memory.stat values.
> > + * and checks memory.current, memory.peak, and some memory.stat values.
> >   */
> > -static int test_memcg_current(const char *root)
> > +static int test_memcg_current_peak(const char *root)
> >  {
> >         int ret = KSFT_FAIL;
> > -       long current;
> > +       long current, peak, peak_reset;
> >         char *memcg;
> >
> >         memcg = cg_name(root, "memcg_test");
> > @@ -180,12 +180,32 @@ static int test_memcg_current(const char *root)
> >         if (current != 0)
> >                 goto cleanup;
> >
> > +       peak = cg_read_long(memcg, "memory.peak");
> > +       if (peak != 0)
> > +               goto cleanup;
> > +
> >         if (cg_run(memcg, alloc_anon_50M_check, NULL))
> >                 goto cleanup;
> >
> > +       peak = cg_read_long(memcg, "memory.peak");
> > +       if (peak < MB(50))
> > +               goto cleanup;
> > +
> > +       peak_reset = cg_write(memcg, "memory.peak", "\n");
> > +       if (peak_reset != 0)
> > +               goto cleanup;
> > +
> > +       peak = cg_read_long(memcg, "memory.peak");
> > +       if (peak > MB(30))
> > +               goto cleanup;
> > +
> >         if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
> >                 goto cleanup;
> >
> > +       peak = cg_read_long(memcg, "memory.peak");
> > +       if (peak < MB(50))
> > +               goto cleanup;
> > +
> >         ret = KSFT_PASS;
> >
> >  cleanup:
> > @@ -817,13 +837,14 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
> >
> >  /*
> >   * This test checks that memory.swap.max limits the amount of
> > - * anonymous memory which can be swapped out.
> > + * anonymous memory which can be swapped out. Additionally, it verifies that
> > + * memory.swap.peak reflects the high watermark and can be reset.
> >   */
> > -static int test_memcg_swap_max(const char *root)
> > +static int test_memcg_swap_max_peak(const char *root)
> >  {
> >         int ret = KSFT_FAIL;
> >         char *memcg;
> > -       long max;
> > +       long max, peak;
> >
> >         if (!is_swap_enabled())
> >                 return KSFT_SKIP;
> > @@ -840,6 +861,12 @@ static int test_memcg_swap_max(const char *root)
> >                 goto cleanup;
> >         }
> >
> > +       if (cg_read_long(memcg, "memory.swap.peak"))
> > +               goto cleanup;
> > +
> > +       if (cg_read_long(memcg, "memory.peak"))
> > +               goto cleanup;
> > +
> >         if (cg_read_strcmp(memcg, "memory.max", "max\n"))
> >                 goto cleanup;
> >
> > @@ -862,6 +889,27 @@ static int test_memcg_swap_max(const char *root)
> >         if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
> >                 goto cleanup;
> >
> > +       peak = cg_read_long(memcg, "memory.peak");
> > +       if (peak < MB(29))
> > +               goto cleanup;
> > +
> > +       peak = cg_read_long(memcg, "memory.swap.peak");
> > +       if (peak < MB(29))
> > +               goto cleanup;
> > +
> > +       if (cg_write(memcg, "memory.swap.peak", "\n"))
> > +               goto cleanup;
> > +
> > +       if (cg_read_long(memcg, "memory.swap.peak") > MB(10))
> > +               goto cleanup;
> > +
> > +
> > +       if (cg_write(memcg, "memory.peak", "\n"))
> > +               goto cleanup;
> > +
> > +       if (cg_read_long(memcg, "memory.peak"))
> > +               goto cleanup;
> > +
> >         if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
> >                 goto cleanup;
> >
> > @@ -869,6 +917,14 @@ static int test_memcg_swap_max(const char *root)
> >         if (max <= 0)
> >                 goto cleanup;
> >
> > +       peak = cg_read_long(memcg, "memory.peak");
> > +       if (peak < MB(29))
> > +               goto cleanup;
> > +
> > +       peak = cg_read_long(memcg, "memory.swap.peak");
> > +       if (peak < MB(19))
> > +               goto cleanup;
> > +
> >         ret = KSFT_PASS;
> >
> >  cleanup:
> > @@ -1295,7 +1351,7 @@ struct memcg_test {
> >         const char *name;
> >  } tests[] = {
> >         T(test_memcg_subtree_control),
> > -       T(test_memcg_current),
> > +       T(test_memcg_current_peak),
> >         T(test_memcg_min),
> >         T(test_memcg_low),
> >         T(test_memcg_high),
> > @@ -1303,7 +1359,7 @@ struct memcg_test {
> >         T(test_memcg_max),
> >         T(test_memcg_reclaim),
> >         T(test_memcg_oom_events),
> > -       T(test_memcg_swap_max),
> > +       T(test_memcg_swap_max_peak),
> >         T(test_memcg_sock),
> >         T(test_memcg_oom_group_leaf_events),
> >         T(test_memcg_oom_group_parent_events),
> > --
> > 2.40.1
> >
>
>
> --
> David Finkel
> Senior Principal Software Engineer, Core Services



-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-15 20:46     ` David Finkel
@ 2024-07-16  7:20       ` Michal Hocko
  2024-07-16 12:47         ` David Finkel
  0 siblings, 1 reply; 47+ messages in thread
From: Michal Hocko @ 2024-07-16  7:20 UTC (permalink / raw)
  To: David Finkel
  Cc: Muchun Song, Andrew Morton, core-services, Jonathan Corbet,
	Roman Gushchin, Shuah Khan, Johannes Weiner, Tejun Heo, Zefan Li,
	cgroups, linux-doc, linux-mm, linux-kselftest, Shakeel Butt

On Mon 15-07-24 16:46:36, David Finkel wrote:
> > On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@vimeo.com> wrote:
> > >
> > > Other mechanisms for querying the peak memory usage of either a process
> > > or v1 memory cgroup allow for resetting the high watermark. Restore
> > > parity with those mechanisms.
> > >
> > > For example:
> > >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> > >    the high watermark.
> > >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> > >    directory resets the peak RSS.
> > >
> > > This change copies the cgroup v1 behavior so any write to the
> > > memory.peak and memory.swap.peak pseudo-files reset the high watermark
> > > to the current usage.
> > >
> > > This behavior is particularly useful for work scheduling systems that
> > > need to track memory usage of worker processes/cgroups per-work-item.
> > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > opinions),

I do not understand the OOM-killer reference here. Why does it matter?
Could you explain please?

> > > these systems need to track the peak memory usage to compute
> > > system/container fullness when binpacking workitems.

Could you elaborate some more on how you are using this please? I expect
you recycle memcgs for different runs of workers and reset peak
consumptions before a new run and record it after it is done. The thing
which is not really clear to me is how the peak value really helps if it
can vary a lot among different runs. But maybe I misunderstand.

> > >
> > > Signed-off-by: David Finkel <davidf@vimeo.com>
> > > ---
> > >  Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
> > >  mm/memcontrol.c                               | 23 ++++++
> > >  .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
> > >  3 files changed, 99 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> > > index 8fbb0519d556..201d8e5d9f82 100644
> > > --- a/Documentation/admin-guide/cgroup-v2.rst
> > > +++ b/Documentation/admin-guide/cgroup-v2.rst
> > > @@ -1322,11 +1322,13 @@ PAGE_SIZE multiple when read back.
> > >         reclaim induced by memory.reclaim.
> > >
> > >    memory.peak
> > > -       A read-only single value file which exists on non-root
> > > -       cgroups.
> > > +       A read-write single value file which exists on non-root cgroups.
> > > +
> > > +       The max memory usage recorded for the cgroup and its descendants since
> > > +       either the creation of the cgroup or the most recent reset.
> > >
> > > -       The max memory usage recorded for the cgroup and its
> > > -       descendants since the creation of the cgroup.
> > > +       Any non-empty write to this file resets it to the current memory usage.
> > > +       All content written is completely ignored.
> > >
> > >    memory.oom.group
> > >         A read-write single value file which exists on non-root
> > > @@ -1652,11 +1654,13 @@ PAGE_SIZE multiple when read back.
> > >         Healthy workloads are not expected to reach this limit.
> > >
> > >    memory.swap.peak
> > > -       A read-only single value file which exists on non-root
> > > -       cgroups.
> > > +       A read-write single value file which exists on non-root cgroups.
> > > +
> > > +       The max swap usage recorded for the cgroup and its descendants since
> > > +       the creation of the cgroup or the most recent reset.
> > >
> > > -       The max swap usage recorded for the cgroup and its
> > > -       descendants since the creation of the cgroup.
> > > +       Any non-empty write to this file resets it to the current swap usage.
> > > +       All content written is completely ignored.
> > >
> > >    memory.swap.max
> > >         A read-write single value file which exists on non-root
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 8f2f1bb18c9c..abfa547615d6 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -25,6 +25,7 @@
> > >   * Copyright (C) 2020 Alibaba, Inc, Alex Shi
> > >   */
> > >
> > > +#include <linux/cgroup-defs.h>
> > >  #include <linux/page_counter.h>
> > >  #include <linux/memcontrol.h>
> > >  #include <linux/cgroup.h>
> > > @@ -6915,6 +6916,16 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css,
> > >         return (u64)memcg->memory.watermark * PAGE_SIZE;
> > >  }
> > >
> > > +static ssize_t memory_peak_write(struct kernfs_open_file *of,
> > > +                                char *buf, size_t nbytes, loff_t off)
> > > +{
> > > +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > > +
> > > +       page_counter_reset_watermark(&memcg->memory);
> > > +
> > > +       return nbytes;
> > > +}
> > > +
> > >  static int memory_min_show(struct seq_file *m, void *v)
> > >  {
> > >         return seq_puts_memcg_tunable(m,
> > > @@ -7232,6 +7243,7 @@ static struct cftype memory_files[] = {
> > >                 .name = "peak",
> > >                 .flags = CFTYPE_NOT_ON_ROOT,
> > >                 .read_u64 = memory_peak_read,
> > > +               .write = memory_peak_write,
> > >         },
> > >         {
> > >                 .name = "min",
> > > @@ -8201,6 +8213,16 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css,
> > >         return (u64)memcg->swap.watermark * PAGE_SIZE;
> > >  }
> > >
> > > +static ssize_t swap_peak_write(struct kernfs_open_file *of,
> > > +                                char *buf, size_t nbytes, loff_t off)
> > > +{
> > > +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > > +
> > > +       page_counter_reset_watermark(&memcg->swap);
> > > +
> > > +       return nbytes;
> > > +}
> > > +
> > >  static int swap_high_show(struct seq_file *m, void *v)
> > >  {
> > >         return seq_puts_memcg_tunable(m,
> > > @@ -8283,6 +8305,7 @@ static struct cftype swap_files[] = {
> > >                 .name = "swap.peak",
> > >                 .flags = CFTYPE_NOT_ON_ROOT,
> > >                 .read_u64 = swap_peak_read,
> > > +               .write = swap_peak_write,
> > >         },
> > >         {
> > >                 .name = "swap.events",
> > > diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
> > > index 41ae8047b889..681972de673b 100644
> > > --- a/tools/testing/selftests/cgroup/test_memcontrol.c
> > > +++ b/tools/testing/selftests/cgroup/test_memcontrol.c
> > > @@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
> > >  /*
> > >   * This test create a memory cgroup, allocates
> > >   * some anonymous memory and some pagecache
> > > - * and check memory.current and some memory.stat values.
> > > + * and checks memory.current, memory.peak, and some memory.stat values.
> > >   */
> > > -static int test_memcg_current(const char *root)
> > > +static int test_memcg_current_peak(const char *root)
> > >  {
> > >         int ret = KSFT_FAIL;
> > > -       long current;
> > > +       long current, peak, peak_reset;
> > >         char *memcg;
> > >
> > >         memcg = cg_name(root, "memcg_test");
> > > @@ -180,12 +180,32 @@ static int test_memcg_current(const char *root)
> > >         if (current != 0)
> > >                 goto cleanup;
> > >
> > > +       peak = cg_read_long(memcg, "memory.peak");
> > > +       if (peak != 0)
> > > +               goto cleanup;
> > > +
> > >         if (cg_run(memcg, alloc_anon_50M_check, NULL))
> > >                 goto cleanup;
> > >
> > > +       peak = cg_read_long(memcg, "memory.peak");
> > > +       if (peak < MB(50))
> > > +               goto cleanup;
> > > +
> > > +       peak_reset = cg_write(memcg, "memory.peak", "\n");
> > > +       if (peak_reset != 0)
> > > +               goto cleanup;
> > > +
> > > +       peak = cg_read_long(memcg, "memory.peak");
> > > +       if (peak > MB(30))
> > > +               goto cleanup;
> > > +
> > >         if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
> > >                 goto cleanup;
> > >
> > > +       peak = cg_read_long(memcg, "memory.peak");
> > > +       if (peak < MB(50))
> > > +               goto cleanup;
> > > +
> > >         ret = KSFT_PASS;
> > >
> > >  cleanup:
> > > @@ -817,13 +837,14 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
> > >
> > >  /*
> > >   * This test checks that memory.swap.max limits the amount of
> > > - * anonymous memory which can be swapped out.
> > > + * anonymous memory which can be swapped out. Additionally, it verifies that
> > > + * memory.swap.peak reflects the high watermark and can be reset.
> > >   */
> > > -static int test_memcg_swap_max(const char *root)
> > > +static int test_memcg_swap_max_peak(const char *root)
> > >  {
> > >         int ret = KSFT_FAIL;
> > >         char *memcg;
> > > -       long max;
> > > +       long max, peak;
> > >
> > >         if (!is_swap_enabled())
> > >                 return KSFT_SKIP;
> > > @@ -840,6 +861,12 @@ static int test_memcg_swap_max(const char *root)
> > >                 goto cleanup;
> > >         }
> > >
> > > +       if (cg_read_long(memcg, "memory.swap.peak"))
> > > +               goto cleanup;
> > > +
> > > +       if (cg_read_long(memcg, "memory.peak"))
> > > +               goto cleanup;
> > > +
> > >         if (cg_read_strcmp(memcg, "memory.max", "max\n"))
> > >                 goto cleanup;
> > >
> > > @@ -862,6 +889,27 @@ static int test_memcg_swap_max(const char *root)
> > >         if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
> > >                 goto cleanup;
> > >
> > > +       peak = cg_read_long(memcg, "memory.peak");
> > > +       if (peak < MB(29))
> > > +               goto cleanup;
> > > +
> > > +       peak = cg_read_long(memcg, "memory.swap.peak");
> > > +       if (peak < MB(29))
> > > +               goto cleanup;
> > > +
> > > +       if (cg_write(memcg, "memory.swap.peak", "\n"))
> > > +               goto cleanup;
> > > +
> > > +       if (cg_read_long(memcg, "memory.swap.peak") > MB(10))
> > > +               goto cleanup;
> > > +
> > > +
> > > +       if (cg_write(memcg, "memory.peak", "\n"))
> > > +               goto cleanup;
> > > +
> > > +       if (cg_read_long(memcg, "memory.peak"))
> > > +               goto cleanup;
> > > +
> > >         if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
> > >                 goto cleanup;
> > >
> > > @@ -869,6 +917,14 @@ static int test_memcg_swap_max(const char *root)
> > >         if (max <= 0)
> > >                 goto cleanup;
> > >
> > > +       peak = cg_read_long(memcg, "memory.peak");
> > > +       if (peak < MB(29))
> > > +               goto cleanup;
> > > +
> > > +       peak = cg_read_long(memcg, "memory.swap.peak");
> > > +       if (peak < MB(19))
> > > +               goto cleanup;
> > > +
> > >         ret = KSFT_PASS;
> > >
> > >  cleanup:
> > > @@ -1295,7 +1351,7 @@ struct memcg_test {
> > >         const char *name;
> > >  } tests[] = {
> > >         T(test_memcg_subtree_control),
> > > -       T(test_memcg_current),
> > > +       T(test_memcg_current_peak),
> > >         T(test_memcg_min),
> > >         T(test_memcg_low),
> > >         T(test_memcg_high),
> > > @@ -1303,7 +1359,7 @@ struct memcg_test {
> > >         T(test_memcg_max),
> > >         T(test_memcg_reclaim),
> > >         T(test_memcg_oom_events),
> > > -       T(test_memcg_swap_max),
> > > +       T(test_memcg_swap_max_peak),
> > >         T(test_memcg_sock),
> > >         T(test_memcg_oom_group_leaf_events),
> > >         T(test_memcg_oom_group_parent_events),
> > > --
> > > 2.40.1
> > >
> >
> >
> > --
> > David Finkel
> > Senior Principal Software Engineer, Core Services
> 
> 
> 
> -- 
> David Finkel
> Senior Principal Software Engineer, Core Services

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16  7:20       ` Michal Hocko
@ 2024-07-16 12:47         ` David Finkel
  2024-07-16 13:19           ` Michal Hocko
  0 siblings, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-16 12:47 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Muchun Song, Andrew Morton, core-services, Jonathan Corbet,
	Roman Gushchin, Shuah Khan, Johannes Weiner, Tejun Heo, Zefan Li,
	cgroups, linux-doc, linux-mm, linux-kselftest, Shakeel Butt

On Tue, Jul 16, 2024 at 3:20 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 15-07-24 16:46:36, David Finkel wrote:
> > > On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@vimeo.com> wrote:
> > > >
> > > > Other mechanisms for querying the peak memory usage of either a process
> > > > or v1 memory cgroup allow for resetting the high watermark. Restore
> > > > parity with those mechanisms.
> > > >
> > > > For example:
> > > >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> > > >    the high watermark.
> > > >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> > > >    directory resets the peak RSS.
> > > >
> > > > This change copies the cgroup v1 behavior so any write to the
> > > > memory.peak and memory.swap.peak pseudo-files reset the high watermark
> > > > to the current usage.
> > > >
> > > > This behavior is particularly useful for work scheduling systems that
> > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > opinions),
>
> I do not understand the OOM-killer reference here. Why does it matter?
> Could you explain please?

Sure, we're attempting to bin-packing work based on past items of the same type.
With CPU, we can provision for the mean CPU-time per-wall-time to get
a lose "cores"
concept that we use for binpacking. With CPU, if we end up with a bit
of contention,
everything just gets a bit slower while the schedule arbitrates among cgroups.

However, with memory, you only have so much physical memory for the outer memcg.
If we pack things too tightly on memory, the OOM-killer is going to kill
something to free up memory. In some cases that's fine, but provisioning for the
peak memory for that "type" of work-item mostly avoids this issue.

My apologies. I should have reworded this sentence before resending.
(there was a question about it last time, too)


>
> > > > these systems need to track the peak memory usage to compute
> > > > system/container fullness when binpacking workitems.
>
> Could you elaborate some more on how you are using this please? I expect
> you recycle memcgs for different runs of workers and reset peak
> consumptions before a new run and record it after it is done. The thing
> which is not really clear to me is how the peak value really helps if it
> can vary a lot among different runs. But maybe I misunderstand.
>

That's mostly correct. The workers are long-lived and will handle many
work-items over their lifetimes (to amortize startup overheads).
The particular system that uses this classifies work in "queues", which
can be loosely assumed to use the same resources between runs, since
they're doing similar work.

To mitigate the effect of outliers, we take a high quantile of the peak memory
consumed by work items within a queue when estimating the memory dimension
to binpack future work-items.

> > > >
> > > > Signed-off-by: David Finkel <davidf@vimeo.com>
> > > > ---
> > > >  Documentation/admin-guide/cgroup-v2.rst       | 20 +++---
> > > >  mm/memcontrol.c                               | 23 ++++++
> > > >  .../selftests/cgroup/test_memcontrol.c        | 72 ++++++++++++++++---
> > > >  3 files changed, 99 insertions(+), 16 deletions(-)
> > > >
> > > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> > > > index 8fbb0519d556..201d8e5d9f82 100644
> > > > --- a/Documentation/admin-guide/cgroup-v2.rst
> > > > +++ b/Documentation/admin-guide/cgroup-v2.rst
> > > > @@ -1322,11 +1322,13 @@ PAGE_SIZE multiple when read back.
> > > >         reclaim induced by memory.reclaim.
> > > >
> > > >    memory.peak
> > > > -       A read-only single value file which exists on non-root
> > > > -       cgroups.
> > > > +       A read-write single value file which exists on non-root cgroups.
> > > > +
> > > > +       The max memory usage recorded for the cgroup and its descendants since
> > > > +       either the creation of the cgroup or the most recent reset.
> > > >
> > > > -       The max memory usage recorded for the cgroup and its
> > > > -       descendants since the creation of the cgroup.
> > > > +       Any non-empty write to this file resets it to the current memory usage.
> > > > +       All content written is completely ignored.
> > > >
> > > >    memory.oom.group
> > > >         A read-write single value file which exists on non-root
> > > > @@ -1652,11 +1654,13 @@ PAGE_SIZE multiple when read back.
> > > >         Healthy workloads are not expected to reach this limit.
> > > >
> > > >    memory.swap.peak
> > > > -       A read-only single value file which exists on non-root
> > > > -       cgroups.
> > > > +       A read-write single value file which exists on non-root cgroups.
> > > > +
> > > > +       The max swap usage recorded for the cgroup and its descendants since
> > > > +       the creation of the cgroup or the most recent reset.
> > > >
> > > > -       The max swap usage recorded for the cgroup and its
> > > > -       descendants since the creation of the cgroup.
> > > > +       Any non-empty write to this file resets it to the current swap usage.
> > > > +       All content written is completely ignored.
> > > >
> > > >    memory.swap.max
> > > >         A read-write single value file which exists on non-root
> > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > index 8f2f1bb18c9c..abfa547615d6 100644
> > > > --- a/mm/memcontrol.c
> > > > +++ b/mm/memcontrol.c
> > > > @@ -25,6 +25,7 @@
> > > >   * Copyright (C) 2020 Alibaba, Inc, Alex Shi
> > > >   */
> > > >
> > > > +#include <linux/cgroup-defs.h>
> > > >  #include <linux/page_counter.h>
> > > >  #include <linux/memcontrol.h>
> > > >  #include <linux/cgroup.h>
> > > > @@ -6915,6 +6916,16 @@ static u64 memory_peak_read(struct cgroup_subsys_state *css,
> > > >         return (u64)memcg->memory.watermark * PAGE_SIZE;
> > > >  }
> > > >
> > > > +static ssize_t memory_peak_write(struct kernfs_open_file *of,
> > > > +                                char *buf, size_t nbytes, loff_t off)
> > > > +{
> > > > +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > > > +
> > > > +       page_counter_reset_watermark(&memcg->memory);
> > > > +
> > > > +       return nbytes;
> > > > +}
> > > > +
> > > >  static int memory_min_show(struct seq_file *m, void *v)
> > > >  {
> > > >         return seq_puts_memcg_tunable(m,
> > > > @@ -7232,6 +7243,7 @@ static struct cftype memory_files[] = {
> > > >                 .name = "peak",
> > > >                 .flags = CFTYPE_NOT_ON_ROOT,
> > > >                 .read_u64 = memory_peak_read,
> > > > +               .write = memory_peak_write,
> > > >         },
> > > >         {
> > > >                 .name = "min",
> > > > @@ -8201,6 +8213,16 @@ static u64 swap_peak_read(struct cgroup_subsys_state *css,
> > > >         return (u64)memcg->swap.watermark * PAGE_SIZE;
> > > >  }
> > > >
> > > > +static ssize_t swap_peak_write(struct kernfs_open_file *of,
> > > > +                                char *buf, size_t nbytes, loff_t off)
> > > > +{
> > > > +       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > > > +
> > > > +       page_counter_reset_watermark(&memcg->swap);
> > > > +
> > > > +       return nbytes;
> > > > +}
> > > > +
> > > >  static int swap_high_show(struct seq_file *m, void *v)
> > > >  {
> > > >         return seq_puts_memcg_tunable(m,
> > > > @@ -8283,6 +8305,7 @@ static struct cftype swap_files[] = {
> > > >                 .name = "swap.peak",
> > > >                 .flags = CFTYPE_NOT_ON_ROOT,
> > > >                 .read_u64 = swap_peak_read,
> > > > +               .write = swap_peak_write,
> > > >         },
> > > >         {
> > > >                 .name = "swap.events",
> > > > diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
> > > > index 41ae8047b889..681972de673b 100644
> > > > --- a/tools/testing/selftests/cgroup/test_memcontrol.c
> > > > +++ b/tools/testing/selftests/cgroup/test_memcontrol.c
> > > > @@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
> > > >  /*
> > > >   * This test create a memory cgroup, allocates
> > > >   * some anonymous memory and some pagecache
> > > > - * and check memory.current and some memory.stat values.
> > > > + * and checks memory.current, memory.peak, and some memory.stat values.
> > > >   */
> > > > -static int test_memcg_current(const char *root)
> > > > +static int test_memcg_current_peak(const char *root)
> > > >  {
> > > >         int ret = KSFT_FAIL;
> > > > -       long current;
> > > > +       long current, peak, peak_reset;
> > > >         char *memcg;
> > > >
> > > >         memcg = cg_name(root, "memcg_test");
> > > > @@ -180,12 +180,32 @@ static int test_memcg_current(const char *root)
> > > >         if (current != 0)
> > > >                 goto cleanup;
> > > >
> > > > +       peak = cg_read_long(memcg, "memory.peak");
> > > > +       if (peak != 0)
> > > > +               goto cleanup;
> > > > +
> > > >         if (cg_run(memcg, alloc_anon_50M_check, NULL))
> > > >                 goto cleanup;
> > > >
> > > > +       peak = cg_read_long(memcg, "memory.peak");
> > > > +       if (peak < MB(50))
> > > > +               goto cleanup;
> > > > +
> > > > +       peak_reset = cg_write(memcg, "memory.peak", "\n");
> > > > +       if (peak_reset != 0)
> > > > +               goto cleanup;
> > > > +
> > > > +       peak = cg_read_long(memcg, "memory.peak");
> > > > +       if (peak > MB(30))
> > > > +               goto cleanup;
> > > > +
> > > >         if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
> > > >                 goto cleanup;
> > > >
> > > > +       peak = cg_read_long(memcg, "memory.peak");
> > > > +       if (peak < MB(50))
> > > > +               goto cleanup;
> > > > +
> > > >         ret = KSFT_PASS;
> > > >
> > > >  cleanup:
> > > > @@ -817,13 +837,14 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
> > > >
> > > >  /*
> > > >   * This test checks that memory.swap.max limits the amount of
> > > > - * anonymous memory which can be swapped out.
> > > > + * anonymous memory which can be swapped out. Additionally, it verifies that
> > > > + * memory.swap.peak reflects the high watermark and can be reset.
> > > >   */
> > > > -static int test_memcg_swap_max(const char *root)
> > > > +static int test_memcg_swap_max_peak(const char *root)
> > > >  {
> > > >         int ret = KSFT_FAIL;
> > > >         char *memcg;
> > > > -       long max;
> > > > +       long max, peak;
> > > >
> > > >         if (!is_swap_enabled())
> > > >                 return KSFT_SKIP;
> > > > @@ -840,6 +861,12 @@ static int test_memcg_swap_max(const char *root)
> > > >                 goto cleanup;
> > > >         }
> > > >
> > > > +       if (cg_read_long(memcg, "memory.swap.peak"))
> > > > +               goto cleanup;
> > > > +
> > > > +       if (cg_read_long(memcg, "memory.peak"))
> > > > +               goto cleanup;
> > > > +
> > > >         if (cg_read_strcmp(memcg, "memory.max", "max\n"))
> > > >                 goto cleanup;
> > > >
> > > > @@ -862,6 +889,27 @@ static int test_memcg_swap_max(const char *root)
> > > >         if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
> > > >                 goto cleanup;
> > > >
> > > > +       peak = cg_read_long(memcg, "memory.peak");
> > > > +       if (peak < MB(29))
> > > > +               goto cleanup;
> > > > +
> > > > +       peak = cg_read_long(memcg, "memory.swap.peak");
> > > > +       if (peak < MB(29))
> > > > +               goto cleanup;
> > > > +
> > > > +       if (cg_write(memcg, "memory.swap.peak", "\n"))
> > > > +               goto cleanup;
> > > > +
> > > > +       if (cg_read_long(memcg, "memory.swap.peak") > MB(10))
> > > > +               goto cleanup;
> > > > +
> > > > +
> > > > +       if (cg_write(memcg, "memory.peak", "\n"))
> > > > +               goto cleanup;
> > > > +
> > > > +       if (cg_read_long(memcg, "memory.peak"))
> > > > +               goto cleanup;
> > > > +
> > > >         if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
> > > >                 goto cleanup;
> > > >
> > > > @@ -869,6 +917,14 @@ static int test_memcg_swap_max(const char *root)
> > > >         if (max <= 0)
> > > >                 goto cleanup;
> > > >
> > > > +       peak = cg_read_long(memcg, "memory.peak");
> > > > +       if (peak < MB(29))
> > > > +               goto cleanup;
> > > > +
> > > > +       peak = cg_read_long(memcg, "memory.swap.peak");
> > > > +       if (peak < MB(19))
> > > > +               goto cleanup;
> > > > +
> > > >         ret = KSFT_PASS;
> > > >
> > > >  cleanup:
> > > > @@ -1295,7 +1351,7 @@ struct memcg_test {
> > > >         const char *name;
> > > >  } tests[] = {
> > > >         T(test_memcg_subtree_control),
> > > > -       T(test_memcg_current),
> > > > +       T(test_memcg_current_peak),
> > > >         T(test_memcg_min),
> > > >         T(test_memcg_low),
> > > >         T(test_memcg_high),
> > > > @@ -1303,7 +1359,7 @@ struct memcg_test {
> > > >         T(test_memcg_max),
> > > >         T(test_memcg_reclaim),
> > > >         T(test_memcg_oom_events),
> > > > -       T(test_memcg_swap_max),
> > > > +       T(test_memcg_swap_max_peak),
> > > >         T(test_memcg_sock),
> > > >         T(test_memcg_oom_group_leaf_events),
> > > >         T(test_memcg_oom_group_parent_events),
> > > > --
> > > > 2.40.1
> > > >
> > >
> > >
> > > --
> > > David Finkel
> > > Senior Principal Software Engineer, Core Services
> >
> >
> >
> > --
> > David Finkel
> > Senior Principal Software Engineer, Core Services
>
> --
> Michal Hocko
> SUSE Labs

Thanks!

-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 12:47         ` David Finkel
@ 2024-07-16 13:19           ` Michal Hocko
  2024-07-16 13:39             ` David Finkel
  0 siblings, 1 reply; 47+ messages in thread
From: Michal Hocko @ 2024-07-16 13:19 UTC (permalink / raw)
  To: David Finkel
  Cc: Muchun Song, Andrew Morton, core-services, Jonathan Corbet,
	Roman Gushchin, Shuah Khan, Johannes Weiner, Tejun Heo, Zefan Li,
	cgroups, linux-doc, linux-mm, linux-kselftest, Shakeel Butt

On Tue 16-07-24 08:47:59, David Finkel wrote:
> On Tue, Jul 16, 2024 at 3:20 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Mon 15-07-24 16:46:36, David Finkel wrote:
> > > > On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@vimeo.com> wrote:
> > > > >
> > > > > Other mechanisms for querying the peak memory usage of either a process
> > > > > or v1 memory cgroup allow for resetting the high watermark. Restore
> > > > > parity with those mechanisms.
> > > > >
> > > > > For example:
> > > > >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> > > > >    the high watermark.
> > > > >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> > > > >    directory resets the peak RSS.
> > > > >
> > > > > This change copies the cgroup v1 behavior so any write to the
> > > > > memory.peak and memory.swap.peak pseudo-files reset the high watermark
> > > > > to the current usage.
> > > > >
> > > > > This behavior is particularly useful for work scheduling systems that
> > > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > > opinions),
> >
> > I do not understand the OOM-killer reference here. Why does it matter?
> > Could you explain please?
> 
> Sure, we're attempting to bin-packing work based on past items of the same type.
> With CPU, we can provision for the mean CPU-time per-wall-time to get
> a lose "cores"
> concept that we use for binpacking. With CPU, if we end up with a bit
> of contention,
> everything just gets a bit slower while the schedule arbitrates among cgroups.
> 
> However, with memory, you only have so much physical memory for the outer memcg.
> If we pack things too tightly on memory, the OOM-killer is going to kill
> something to free up memory. In some cases that's fine, but provisioning for the
> peak memory for that "type" of work-item mostly avoids this issue.

It is still not clear to me how the memory reclaim falls into that. Are
your workloads mostly unreclaimable (e.g. anon mostly consumers without
any swap)? Why I am asking? Well, if the workload's memory is
reclaimable then the peak memory consumption is largely misleading
because an unknown portion of that memory consumption is hidden by the
reclaimed portion of it. This is not really specific to the write
handlers to reset the value though so I do not want to digress this
patch too much. I do not have objections to the patch itself. Clarifying
the usecase with your followup here would be nice.

Thanks for the clarification!
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 13:19           ` Michal Hocko
@ 2024-07-16 13:39             ` David Finkel
  0 siblings, 0 replies; 47+ messages in thread
From: David Finkel @ 2024-07-16 13:39 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Muchun Song, Andrew Morton, core-services, Jonathan Corbet,
	Roman Gushchin, Shuah Khan, Johannes Weiner, Tejun Heo, Zefan Li,
	cgroups, linux-doc, linux-mm, linux-kselftest, Shakeel Butt

On Tue, Jul 16, 2024 at 9:19 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 16-07-24 08:47:59, David Finkel wrote:
> > On Tue, Jul 16, 2024 at 3:20 AM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Mon 15-07-24 16:46:36, David Finkel wrote:
> > > > > On Mon, Jul 15, 2024 at 4:38 PM David Finkel <davidf@vimeo.com> wrote:
> > > > > >
> > > > > > Other mechanisms for querying the peak memory usage of either a process
> > > > > > or v1 memory cgroup allow for resetting the high watermark. Restore
> > > > > > parity with those mechanisms.
> > > > > >
> > > > > > For example:
> > > > > >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> > > > > >    the high watermark.
> > > > > >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> > > > > >    directory resets the peak RSS.
> > > > > >
> > > > > > This change copies the cgroup v1 behavior so any write to the
> > > > > > memory.peak and memory.swap.peak pseudo-files reset the high watermark
> > > > > > to the current usage.
> > > > > >
> > > > > > This behavior is particularly useful for work scheduling systems that
> > > > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > > > opinions),
> > >
> > > I do not understand the OOM-killer reference here. Why does it matter?
> > > Could you explain please?
> >
> > Sure, we're attempting to bin-packing work based on past items of the same type.
> > With CPU, we can provision for the mean CPU-time per-wall-time to get
> > a lose "cores"
> > concept that we use for binpacking. With CPU, if we end up with a bit
> > of contention,
> > everything just gets a bit slower while the schedule arbitrates among cgroups.
> >
> > However, with memory, you only have so much physical memory for the outer memcg.
> > If we pack things too tightly on memory, the OOM-killer is going to kill
> > something to free up memory. In some cases that's fine, but provisioning for the
> > peak memory for that "type" of work-item mostly avoids this issue.
>
> It is still not clear to me how the memory reclaim falls into that. Are
> your workloads mostly unreclaimable (e.g. anon mostly consumers without
> any swap)? Why I am asking? Well, if the workload's memory is
> reclaimable then the peak memory consumption is largely misleading
> because an unknown portion of that memory consumption is hidden by the
> reclaimed portion of it. This is not really specific to the write
> handlers to reset the value though so I do not want to digress this
> patch too much. I do not have objections to the patch itself. Clarifying
> the usecase with your followup here would be nice.

Thanks, I'm happy to clarify things!

That's a good point about peak-RSS being unreliable if the memory's reclaimable.

The memory is mostly unreclaimable. It's almost all anonymous mmap,
with a few local files that would be resident in buffercache. (but
generally aren't mmaped)
We don't run with swap enabled on the systems for a few reasons.
In particular, kubernetes disallows swap, which ties our hands, but
even if it didn't,
demand paging from disk tends to stall any useful work, so we'd rather
see the OOM-killer invoked, anyway.

(we actually have some plans for disabling OOM-kills in these cgroups
and letting the userspace process
managing these memcgs handle work-throttling and worker-killing when
there are OOM-conditions, but that's another story :) )

>
> Thanks for the clarification!
> --
> Michal Hocko
> SUSE Labs



-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-15 20:36 ` David Finkel
  2024-07-15 20:42   ` David Finkel
@ 2024-07-16 13:48   ` Michal Hocko
  2024-07-16 13:54     ` David Finkel
  2024-07-16 16:44     ` Tejun Heo
  1 sibling, 2 replies; 47+ messages in thread
From: Michal Hocko @ 2024-07-16 13:48 UTC (permalink / raw)
  To: David Finkel
  Cc: Muchun Song, Andrew Morton, core-services, Jonathan Corbet,
	Roman Gushchin, Shakeel Butt, Shuah Khan, Johannes Weiner,
	Tejun Heo, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Mon 15-07-24 16:36:26, David Finkel wrote:
> Other mechanisms for querying the peak memory usage of either a process
> or v1 memory cgroup allow for resetting the high watermark. Restore
> parity with those mechanisms.
> 
> For example:
>  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
>    the high watermark.
>  - writing "5" to the clear_refs pseudo-file in a processes's proc
>    directory resets the peak RSS.
> 
> This change copies the cgroup v1 behavior so any write to the
> memory.peak and memory.swap.peak pseudo-files reset the high watermark
> to the current usage.
> 
> This behavior is particularly useful for work scheduling systems that
> need to track memory usage of worker processes/cgroups per-work-item.
> Since memory can't be squeezed like CPU can (the OOM-killer has
> opinions), these systems need to track the peak memory usage to compute
> system/container fullness when binpacking workitems.
> 
> Signed-off-by: David Finkel <davidf@vimeo.com>

As mentioned down the email thread, I consider usefulness of peak value
rather limited. It is misleading when memory is reclaimed. But
fundamentally I do not oppose to unifying the write behavior to reset
values.

The chnagelog could use some of the clarifications down the thread.

Acked-by: Michal Hocko <mhocko@suse.com>

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 13:48   ` Michal Hocko
@ 2024-07-16 13:54     ` David Finkel
  2024-07-16 16:44     ` Tejun Heo
  1 sibling, 0 replies; 47+ messages in thread
From: David Finkel @ 2024-07-16 13:54 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Muchun Song, Andrew Morton, core-services, Jonathan Corbet,
	Roman Gushchin, Shakeel Butt, Shuah Khan, Johannes Weiner,
	Tejun Heo, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Tue, Jul 16, 2024 at 9:48 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 15-07-24 16:36:26, David Finkel wrote:
> > Other mechanisms for querying the peak memory usage of either a process
> > or v1 memory cgroup allow for resetting the high watermark. Restore
> > parity with those mechanisms.
> >
> > For example:
> >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> >    the high watermark.
> >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> >    directory resets the peak RSS.
> >
> > This change copies the cgroup v1 behavior so any write to the
> > memory.peak and memory.swap.peak pseudo-files reset the high watermark
> > to the current usage.
> >
> > This behavior is particularly useful for work scheduling systems that
> > need to track memory usage of worker processes/cgroups per-work-item.
> > Since memory can't be squeezed like CPU can (the OOM-killer has
> > opinions), these systems need to track the peak memory usage to compute
> > system/container fullness when binpacking workitems.
> >
> > Signed-off-by: David Finkel <davidf@vimeo.com>
>
> As mentioned down the email thread, I consider usefulness of peak value
> rather limited. It is misleading when memory is reclaimed. But
> fundamentally I do not oppose to unifying the write behavior to reset
> values.
>
> The chnagelog could use some of the clarifications down the thread.

Sure, I can spend some time rewording the changelog this afternoon, and
remail it. in a few hours.

>
> Acked-by: Michal Hocko <mhocko@suse.com>
>

Thank you!
> --
> Michal Hocko
> SUSE Labs



-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 13:48   ` Michal Hocko
  2024-07-16 13:54     ` David Finkel
@ 2024-07-16 16:44     ` Tejun Heo
  2024-07-16 17:01       ` Roman Gushchin
                         ` (3 more replies)
  1 sibling, 4 replies; 47+ messages in thread
From: Tejun Heo @ 2024-07-16 16:44 UTC (permalink / raw)
  To: Michal Hocko
  Cc: David Finkel, Muchun Song, Andrew Morton, core-services,
	Jonathan Corbet, Roman Gushchin, Shakeel Butt, Shuah Khan,
	Johannes Weiner, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

Hello,

On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
...
> > This behavior is particularly useful for work scheduling systems that
> > need to track memory usage of worker processes/cgroups per-work-item.
> > Since memory can't be squeezed like CPU can (the OOM-killer has
> > opinions), these systems need to track the peak memory usage to compute
> > system/container fullness when binpacking workitems.

Swap still has bad reps but there's nothing drastically worse about it than
page cache. ie. If you're under memory pressure, you get thrashing one way
or another. If there's no swap, the system is just memlocking anon memory
even when they are a lot colder than page cache, so I'm skeptical that no
swap + mostly anon + kernel OOM kills is a good strategy in general
especially given that the system behavior is not very predictable under OOM
conditions.

> As mentioned down the email thread, I consider usefulness of peak value
> rather limited. It is misleading when memory is reclaimed. But
> fundamentally I do not oppose to unifying the write behavior to reset
> values.

The removal of resets was intentional. The problem was that it wasn't clear
who owned those counters and there's no way of telling who reset what when.
It was easy to accidentally end up with multiple entities that think they
can get timed measurement by resetting.

So, in general, I don't think this is a great idea. There are shortcomings
to how memory.peak behaves in that its meaningfulness quickly declines over
time. This is expected and the rationale behind adding memory.peak, IIRC,
was that it was difficult to tell the memory usage of a short-lived cgroup.

If we want to allow peak measurement of time periods, I wonder whether we
could do something similar to pressure triggers - ie. let users register
watchers so that each user can define their own watch periods. This is more
involved but more useful and less error-inducing than adding reset to a
single counter.

Johannes, what do you think?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 16:44     ` Tejun Heo
@ 2024-07-16 17:01       ` Roman Gushchin
  2024-07-16 17:20         ` David Finkel
  2024-07-16 19:53         ` Tejun Heo
  2024-07-16 17:10       ` David Finkel
                         ` (2 subsequent siblings)
  3 siblings, 2 replies; 47+ messages in thread
From: Roman Gushchin @ 2024-07-16 17:01 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michal Hocko, David Finkel, Muchun Song, Andrew Morton,
	core-services, Jonathan Corbet, Shakeel Butt, Shuah Khan,
	Johannes Weiner, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> Hello,
> 
> On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> ...
> 
> The removal of resets was intentional. The problem was that it wasn't clear
> who owned those counters and there's no way of telling who reset what when.
> It was easy to accidentally end up with multiple entities that think they
> can get timed measurement by resetting.
> 
> So, in general, I don't think this is a great idea. There are shortcomings
> to how memory.peak behaves in that its meaningfulness quickly declines over
> time. This is expected and the rationale behind adding memory.peak, IIRC,
> was that it was difficult to tell the memory usage of a short-lived cgroup.
> 
> If we want to allow peak measurement of time periods, I wonder whether we
> could do something similar to pressure triggers - ie. let users register
> watchers so that each user can define their own watch periods. This is more
> involved but more useful and less error-inducing than adding reset to a
> single counter.

It's definitely a better user interface and I totally agree with you regarding
the shortcomings of the proposed interface with a global reset. But if you let
users to register a (potentially large) number of watchers, it might be quite
bad for the overall performance, isn't it? To mitigate it, we'll need to reduce
the accuracy of peak values. And then the question is why not just poll it
periodically from userspace?


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 17:01       ` Roman Gushchin
@ 2024-07-16 17:20         ` David Finkel
  2024-07-16 19:53         ` Tejun Heo
  1 sibling, 0 replies; 47+ messages in thread
From: David Finkel @ 2024-07-16 17:20 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: Tejun Heo, Michal Hocko, Muchun Song, Andrew Morton,
	core-services, Jonathan Corbet, Shuah Khan, Johannes Weiner,
	Zefan Li, cgroups, linux-doc, linux-mm, linux-kselftest,
	Shakeel Butt

On Tue, Jul 16, 2024 at 1:01 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> > Hello,
> >
> > On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> > ...
> >
> > The removal of resets was intentional. The problem was that it wasn't clear
> > who owned those counters and there's no way of telling who reset what when.
> > It was easy to accidentally end up with multiple entities that think they
> > can get timed measurement by resetting.
> >
> > So, in general, I don't think this is a great idea. There are shortcomings
> > to how memory.peak behaves in that its meaningfulness quickly declines over
> > time. This is expected and the rationale behind adding memory.peak, IIRC,
> > was that it was difficult to tell the memory usage of a short-lived cgroup.
> >
> > If we want to allow peak measurement of time periods, I wonder whether we
> > could do something similar to pressure triggers - ie. let users register
> > watchers so that each user can define their own watch periods. This is more
> > involved but more useful and less error-inducing than adding reset to a
> > single counter.
>
> It's definitely a better user interface and I totally agree with you regarding
> the shortcomings of the proposed interface with a global reset. But if you let
> users to register a (potentially large) number of watchers, it might be quite
> bad for the overall performance, isn't it? To mitigate it, we'll need to reduce
> the accuracy of peak values. And then the question is why not just poll it
> periodically from userspace?

FWIW, as a stop-gap, we did implement periodic polling from userspace for
the system that motivated this change, but that is unlikely to catch
memory-usage
spikes that have shorter timescales than the polling interval. For
now, we're keeping
it on cgroups v1, but that's looking like a long-term untenable position.


Thanks,


-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 17:01       ` Roman Gushchin
  2024-07-16 17:20         ` David Finkel
@ 2024-07-16 19:53         ` Tejun Heo
  1 sibling, 0 replies; 47+ messages in thread
From: Tejun Heo @ 2024-07-16 19:53 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: Michal Hocko, David Finkel, Muchun Song, Andrew Morton,
	core-services, Jonathan Corbet, Shakeel Butt, Shuah Khan,
	Johannes Weiner, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

Hello,

On Tue, Jul 16, 2024 at 05:01:15PM +0000, Roman Gushchin wrote:
> > If we want to allow peak measurement of time periods, I wonder whether we
> > could do something similar to pressure triggers - ie. let users register
> > watchers so that each user can define their own watch periods. This is more
> > involved but more useful and less error-inducing than adding reset to a
> > single counter.
> 
> It's definitely a better user interface and I totally agree with you regarding
> the shortcomings of the proposed interface with a global reset. But if you let
> users to register a (potentially large) number of watchers, it might be quite
> bad for the overall performance, isn't it? To mitigate it, we'll need to reduce
> the accuracy of peak values. And then the question is why not just poll it
> periodically from userspace?

I haven't thought in detail but it's the same problem that PSI triggers
have, right? PSI seems to have found a reasonable trade-off across accuracy,
overhead and usage restrictions? Maybe peak usage isn't as widely useful and
it doesn't justify to engineer something as sophisticated. I don't know.

Thanks.

-- 
tejun


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 16:44     ` Tejun Heo
  2024-07-16 17:01       ` Roman Gushchin
@ 2024-07-16 17:10       ` David Finkel
  2024-07-16 19:48         ` Tejun Heo
  2024-07-16 18:00       ` Michal Hocko
  2024-07-17 17:04       ` Johannes Weiner
  3 siblings, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-16 17:10 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michal Hocko, Muchun Song, Andrew Morton, core-services,
	Jonathan Corbet, Roman Gushchin, Shakeel Butt, Shuah Khan,
	Johannes Weiner, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Tue, Jul 16, 2024 at 12:44 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> ...
> > > This behavior is particularly useful for work scheduling systems that
> > > need to track memory usage of worker processes/cgroups per-work-item.
> > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > opinions), these systems need to track the peak memory usage to compute
> > > system/container fullness when binpacking workitems.
>
> Swap still has bad reps but there's nothing drastically worse about it than
> page cache. ie. If you're under memory pressure, you get thrashing one way
> or another. If there's no swap, the system is just memlocking anon memory
> even when they are a lot colder than page cache, so I'm skeptical that no
> swap + mostly anon + kernel OOM kills is a good strategy in general
> especially given that the system behavior is not very predictable under OOM
> conditions.

The reason we need peak memory information is to let us schedule work in a way
that we generally avoid OOM conditions.
For the workloads I work on, we generally have very little in the
page-cache, since
the data isn't stored locally most of the time, but streamed from
other storage/database
systems. For those cases, demand-paging will cause large variations in
servicing time,
and we'd rather restart the process than have unpredictable latency.
The same is true for the batch/queue-work system I wrote this patch to support.
We keep very little data on the local disk, so the page cache is
relatively small.

>
> > As mentioned down the email thread, I consider usefulness of peak value
> > rather limited. It is misleading when memory is reclaimed. But
> > fundamentally I do not oppose to unifying the write behavior to reset
> > values.
>
> The removal of resets was intentional. The problem was that it wasn't clear
> who owned those counters and there's no way of telling who reset what when.
> It was easy to accidentally end up with multiple entities that think they
> can get timed measurement by resetting.
>
> So, in general, I don't think this is a great idea. There are shortcomings
> to how memory.peak behaves in that its meaningfulness quickly declines over
> time. This is expected and the rationale behind adding memory.peak, IIRC,
> was that it was difficult to tell the memory usage of a short-lived cgroup.
>
> If we want to allow peak measurement of time periods, I wonder whether we
> could do something similar to pressure triggers - ie. let users register
> watchers so that each user can define their own watch periods. This is more
> involved but more useful and less error-inducing than adding reset to a
> single counter.

I appreciate the ownership issues with the current resetting interface
in the other locations.
However, this peak RSS data is not used by all that many applications
(as evidenced by
the fact that the memory.peak file was only added a bit over a year ago).
I think there are enough cases where ownership is enforced externally
that mirroring the
existing interface to cgroup2 is sufficient.

I do think a more stateful interface would be nice, but I don't know
whether I have enough
knowledge of memcg to implement that in a reasonable amount of time.

Ownership aside, I think being able to reset the high watermark of a
process makes it
significantly more useful. Creating new cgroups and moving processes
around is significantly
heavier-weight.

Thanks,

>
> Johannes, what do you think?
>
> Thanks.
>
> --
> tejun

-- 
David Finkel
Senior Principal Software Engineer, Core Services

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 17:10       ` David Finkel
@ 2024-07-16 19:48         ` Tejun Heo
  2024-07-16 20:18           ` David Finkel
  0 siblings, 1 reply; 47+ messages in thread
From: Tejun Heo @ 2024-07-16 19:48 UTC (permalink / raw)
  To: David Finkel
  Cc: Michal Hocko, Muchun Song, Andrew Morton, core-services,
	Jonathan Corbet, Roman Gushchin, Shakeel Butt, Shuah Khan,
	Johannes Weiner, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

Hello,

On Tue, Jul 16, 2024 at 01:10:14PM -0400, David Finkel wrote:
> > Swap still has bad reps but there's nothing drastically worse about it than
> > page cache. ie. If you're under memory pressure, you get thrashing one way
> > or another. If there's no swap, the system is just memlocking anon memory
> > even when they are a lot colder than page cache, so I'm skeptical that no
> > swap + mostly anon + kernel OOM kills is a good strategy in general
> > especially given that the system behavior is not very predictable under OOM
> > conditions.
> 
> The reason we need peak memory information is to let us schedule work in a
> way that we generally avoid OOM conditions. For the workloads I work on,
> we generally have very little in the page-cache, since the data isn't
> stored locally most of the time, but streamed from other storage/database
> systems. For those cases, demand-paging will cause large variations in
> servicing time, and we'd rather restart the process than have
> unpredictable latency. The same is true for the batch/queue-work system I
> wrote this patch to support. We keep very little data on the local disk,
> so the page cache is relatively small.

You can detect these conditions more reliably and *earlier* using PSI
triggers with swap enabled than hard allocations and OOM kills. Then, you
can take whatever decision you want to take including killing the job
without worrying about the whole system severely suffering. You can even do
things like freezing the cgroup and taking backtraces and collecting other
debug info to better understand why the memory usage is blowing up.

There are of course multiple ways to go about things but I think it's useful
to note that hard alloc based on peak usage + OOM kills likely isn't the
best way here.

...
> I appreciate the ownership issues with the current resetting interface in
> the other locations. However, this peak RSS data is not used by all that
> many applications (as evidenced by the fact that the memory.peak file was
> only added a bit over a year ago). I think there are enough cases where
> ownership is enforced externally that mirroring the existing interface to
> cgroup2 is sufficient.

It's fairly new addition and its utility is limited, so it's not that widely
used. Adding reset makes it more useful but in a way which can be
deterimental in the long term.

> I do think a more stateful interface would be nice, but I don't know
> whether I have enough knowledge of memcg to implement that in a reasonable
> amount of time.

Right, this probably isn't trivial.

> Ownership aside, I think being able to reset the high watermark of a
> process makes it significantly more useful. Creating new cgroups and
> moving processes around is significantly heavier-weight.

Yeah, the setup / teardown cost can be non-trivial for short lived cgroups.
I agree that having some way of measuring peak in different time intervals
can be useful.

Thanks.

-- 
tejun


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 19:48         ` Tejun Heo
@ 2024-07-16 20:18           ` David Finkel
  0 siblings, 0 replies; 47+ messages in thread
From: David Finkel @ 2024-07-16 20:18 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michal Hocko, Muchun Song, Andrew Morton, core-services,
	Jonathan Corbet, Roman Gushchin, Shakeel Butt, Shuah Khan,
	Johannes Weiner, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Tue, Jul 16, 2024 at 3:48 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Tue, Jul 16, 2024 at 01:10:14PM -0400, David Finkel wrote:
> > > Swap still has bad reps but there's nothing drastically worse about it than
> > > page cache. ie. If you're under memory pressure, you get thrashing one way
> > > or another. If there's no swap, the system is just memlocking anon memory
> > > even when they are a lot colder than page cache, so I'm skeptical that no
> > > swap + mostly anon + kernel OOM kills is a good strategy in general
> > > especially given that the system behavior is not very predictable under OOM
> > > conditions.
> >
> > The reason we need peak memory information is to let us schedule work in a
> > way that we generally avoid OOM conditions. For the workloads I work on,
> > we generally have very little in the page-cache, since the data isn't
> > stored locally most of the time, but streamed from other storage/database
> > systems. For those cases, demand-paging will cause large variations in
> > servicing time, and we'd rather restart the process than have
> > unpredictable latency. The same is true for the batch/queue-work system I
> > wrote this patch to support. We keep very little data on the local disk,
> > so the page cache is relatively small.
>
> You can detect these conditions more reliably and *earlier* using PSI
> triggers with swap enabled than hard allocations and OOM kills. Then, you
> can take whatever decision you want to take including killing the job
> without worrying about the whole system severely suffering. You can even do
> things like freezing the cgroup and taking backtraces and collecting other
> debug info to better understand why the memory usage is blowing up.
>
> There are of course multiple ways to go about things but I think it's useful
> to note that hard alloc based on peak usage + OOM kills likely isn't the
> best way here.

To be clear, my goal with peak memory tracking is to bin-pack in a way
that I don't encounter OOMs. I'd prefer to have a bit of headroom and
avoid OOMs if I can.

PSI does seem like a wonderful tool, and I do intend to use it, but
since it's a reactive
signal and doesn't provide absolute values for the total memory usage
that we'd need to
figure out in our central scheduler which work can cohabitate (and how
many instances),
it complements memory.peak rather than replacing my need for it.

FWIW, at the moment, we have some (partially broken) OOM-detection,
which does make
sense to swap out for PSI tracking/trigger-watching that takes care of
scaling down workers
when there's resource-pressure.
(Thanks for pointing out that PSI is generally a better signal than
OOMs for memory pressure)


Thanks again,

>
> ...
> > I appreciate the ownership issues with the current resetting interface in
> > the other locations. However, this peak RSS data is not used by all that
> > many applications (as evidenced by the fact that the memory.peak file was
> > only added a bit over a year ago). I think there are enough cases where
> > ownership is enforced externally that mirroring the existing interface to
> > cgroup2 is sufficient.
>
> It's fairly new addition and its utility is limited, so it's not that widely
> used. Adding reset makes it more useful but in a way which can be
> deterimental in the long term.
>
> > I do think a more stateful interface would be nice, but I don't know
> > whether I have enough knowledge of memcg to implement that in a reasonable
> > amount of time.
>
> Right, this probably isn't trivial.
>
> > Ownership aside, I think being able to reset the high watermark of a
> > process makes it significantly more useful. Creating new cgroups and
> > moving processes around is significantly heavier-weight.
>
> Yeah, the setup / teardown cost can be non-trivial for short lived cgroups.
> I agree that having some way of measuring peak in different time intervals
> can be useful.
>
> Thanks.
>
> --
> tejun



-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 16:44     ` Tejun Heo
  2024-07-16 17:01       ` Roman Gushchin
  2024-07-16 17:10       ` David Finkel
@ 2024-07-16 18:00       ` Michal Hocko
  2024-07-16 20:00         ` Tejun Heo
  2024-07-17 17:04       ` Johannes Weiner
  3 siblings, 1 reply; 47+ messages in thread
From: Michal Hocko @ 2024-07-16 18:00 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Finkel, Muchun Song, Andrew Morton, core-services,
	Jonathan Corbet, Roman Gushchin, Shakeel Butt, Shuah Khan,
	Johannes Weiner, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Tue 16-07-24 06:44:11, Tejun Heo wrote:
> Hello,
> 
> On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> ...
> > > This behavior is particularly useful for work scheduling systems that
> > > need to track memory usage of worker processes/cgroups per-work-item.
> > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > opinions), these systems need to track the peak memory usage to compute
> > > system/container fullness when binpacking workitems.
> 
> Swap still has bad reps but there's nothing drastically worse about it than
> page cache. ie. If you're under memory pressure, you get thrashing one way
> or another. If there's no swap, the system is just memlocking anon memory
> even when they are a lot colder than page cache, so I'm skeptical that no
> swap + mostly anon + kernel OOM kills is a good strategy in general
> especially given that the system behavior is not very predictable under OOM
> conditions.

Completely agree on this!

> > As mentioned down the email thread, I consider usefulness of peak value
> > rather limited. It is misleading when memory is reclaimed. But
> > fundamentally I do not oppose to unifying the write behavior to reset
> > values.
> 
> The removal of resets was intentional. The problem was that it wasn't clear
> who owned those counters and there's no way of telling who reset what when.
> It was easy to accidentally end up with multiple entities that think they
> can get timed measurement by resetting.

yes, I understand and agree with you. Generally speaking peak value is
of a very limited value. On the other hand we already have it in v2 and
if it allows a reliable way to scale the workload (which seems to be the
case here) than reseting the value sounds like a cheaper value than
tearing down the memcg and creating it again (with all the dead cgroups
headache that might follow). The reset interface doesn't cause much from
the maintenance POV and if somebody wants to use it they surely need
find a way to coordinate.

> So, in general, I don't think this is a great idea. There are shortcomings
> to how memory.peak behaves in that its meaningfulness quickly declines over
> time. This is expected and the rationale behind adding memory.peak, IIRC,
> was that it was difficult to tell the memory usage of a short-lived cgroup.
> 
> If we want to allow peak measurement of time periods, I wonder whether we
> could do something similar to pressure triggers - ie. let users register
> watchers so that each user can define their own watch periods. This is more
> involved but more useful and less error-inducing than adding reset to a
> single counter.

I would rather not get back to that unless we have many more users that
really need that. Absolute value of the memory consumption is a long
living concept that doesn't make much sense most of the time. People
just tend to still use it because it is much simpler to compare two different
values rather than something as dynamic as PSI similar metrics.

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 18:00       ` Michal Hocko
@ 2024-07-16 20:00         ` Tejun Heo
  2024-07-16 22:06           ` David Finkel
  2024-07-17  6:23           ` Michal Hocko
  0 siblings, 2 replies; 47+ messages in thread
From: Tejun Heo @ 2024-07-16 20:00 UTC (permalink / raw)
  To: Michal Hocko
  Cc: David Finkel, Muchun Song, Andrew Morton, core-services,
	Jonathan Corbet, Roman Gushchin, Shakeel Butt, Shuah Khan,
	Johannes Weiner, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

Hello,

On Tue, Jul 16, 2024 at 08:00:52PM +0200, Michal Hocko wrote:
...
> > If we want to allow peak measurement of time periods, I wonder whether we
> > could do something similar to pressure triggers - ie. let users register
> > watchers so that each user can define their own watch periods. This is more
> > involved but more useful and less error-inducing than adding reset to a
> > single counter.
> 
> I would rather not get back to that unless we have many more users that
> really need that. Absolute value of the memory consumption is a long
> living concept that doesn't make much sense most of the time. People
> just tend to still use it because it is much simpler to compare two different
> values rather than something as dynamic as PSI similar metrics.

The initial justification for adding memory.peak was that it's mostly to
monitor short lived cgroups. Adding reset would make it used more widely,
which isn't necessarily a bad thing and people most likely will find ways to
use it creatively. I'm mostly worried that that's going to create a mess
down the road. Yeah, so, it's not widely useful now but adding reset makes
it more useful and in a way which can potentially paint us into a corner.

But then again, maybe this is really niche, future usages will still remain
very restricted, and adding something more akin to PSI triggers is way
over-engineering it.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 20:00         ` Tejun Heo
@ 2024-07-16 22:06           ` David Finkel
  2024-07-17  6:26             ` Michal Hocko
  2024-07-17  6:23           ` Michal Hocko
  1 sibling, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-16 22:06 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michal Hocko, Muchun Song, Andrew Morton, core-services,
	Jonathan Corbet, Roman Gushchin, Shuah Khan, Johannes Weiner,
	Zefan Li, cgroups, linux-doc, linux-mm, linux-kselftest,
	Shakeel Butt

On Tue, Jul 16, 2024 at 4:00 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Tue, Jul 16, 2024 at 08:00:52PM +0200, Michal Hocko wrote:
> ...
> > > If we want to allow peak measurement of time periods, I wonder whether we
> > > could do something similar to pressure triggers - ie. let users register
> > > watchers so that each user can define their own watch periods. This is more
> > > involved but more useful and less error-inducing than adding reset to a
> > > single counter.
> >
> > I would rather not get back to that unless we have many more users that
> > really need that. Absolute value of the memory consumption is a long
> > living concept that doesn't make much sense most of the time. People
> > just tend to still use it because it is much simpler to compare two different
> > values rather than something as dynamic as PSI similar metrics.
>
> The initial justification for adding memory.peak was that it's mostly to
> monitor short lived cgroups. Adding reset would make it used more widely,
> which isn't necessarily a bad thing and people most likely will find ways to
> use it creatively. I'm mostly worried that that's going to create a mess
> down the road. Yeah, so, it's not widely useful now but adding reset makes
> it more useful and in a way which can potentially paint us into a corner.

This is a pretty low-overhead feature as-is, but we can reduce it further by
changing it so instead of resetting the watermark on any non-empty string,
we reset only on one particular string.

I'm thinking of something like "global_reset\n", so if we do something like the
PSI interface later, users can write "fd_local_reset\n", and get that
nicer behavior.

This also has the benefit of allowing "echo global_reset >
/sys/fs/cgroup/.../memory.peak" to do the right thing.
(better names welcome)

>
> But then again, maybe this is really niche, future usages will still remain
> very restricted, and adding something more akin to PSI triggers is way
> over-engineering it.

Yeah, I think this is niche enough that it isn't worth over-engineering.
There is some value to keeping broad compatibility for things moving
from cgroups v1, too.

>
> Thanks.
>
> --
> tejun


Thanks again,
-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 22:06           ` David Finkel
@ 2024-07-17  6:26             ` Michal Hocko
  2024-07-17 14:24               ` David Finkel
  0 siblings, 1 reply; 47+ messages in thread
From: Michal Hocko @ 2024-07-17  6:26 UTC (permalink / raw)
  To: David Finkel
  Cc: Tejun Heo, Muchun Song, Andrew Morton, core-services,
	Jonathan Corbet, Roman Gushchin, Shuah Khan, Johannes Weiner,
	Zefan Li, cgroups, linux-doc, linux-mm, linux-kselftest,
	Shakeel Butt

On Tue 16-07-24 18:06:17, David Finkel wrote:
> On Tue, Jul 16, 2024 at 4:00 PM Tejun Heo <tj@kernel.org> wrote:
> >
> > Hello,
> >
> > On Tue, Jul 16, 2024 at 08:00:52PM +0200, Michal Hocko wrote:
> > ...
> > > > If we want to allow peak measurement of time periods, I wonder whether we
> > > > could do something similar to pressure triggers - ie. let users register
> > > > watchers so that each user can define their own watch periods. This is more
> > > > involved but more useful and less error-inducing than adding reset to a
> > > > single counter.
> > >
> > > I would rather not get back to that unless we have many more users that
> > > really need that. Absolute value of the memory consumption is a long
> > > living concept that doesn't make much sense most of the time. People
> > > just tend to still use it because it is much simpler to compare two different
> > > values rather than something as dynamic as PSI similar metrics.
> >
> > The initial justification for adding memory.peak was that it's mostly to
> > monitor short lived cgroups. Adding reset would make it used more widely,
> > which isn't necessarily a bad thing and people most likely will find ways to
> > use it creatively. I'm mostly worried that that's going to create a mess
> > down the road. Yeah, so, it's not widely useful now but adding reset makes
> > it more useful and in a way which can potentially paint us into a corner.
> 
> This is a pretty low-overhead feature as-is, but we can reduce it further by
> changing it so instead of resetting the watermark on any non-empty string,
> we reset only on one particular string.
> 
> I'm thinking of something like "global_reset\n", so if we do something like the
> PSI interface later, users can write "fd_local_reset\n", and get that
> nicer behavior.
> 
> This also has the benefit of allowing "echo global_reset >
> /sys/fs/cgroup/.../memory.peak" to do the right thing.
> (better names welcome)

This would be a different behavior than in v1 and therefore confusing
for those who rely on this in v1 already. So I wouldn't overengineer it
and keep the semantic as simple as possible. If we decide to add PSI
triggers they are completely independent on peak value because that is
reclaim based interface which by definition makes peak value very
dubious.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-17  6:26             ` Michal Hocko
@ 2024-07-17 14:24               ` David Finkel
  2024-07-17 15:46                 ` Michal Hocko
  0 siblings, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-17 14:24 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Tejun Heo, Muchun Song, Andrew Morton, core-services,
	Jonathan Corbet, Roman Gushchin, Shuah Khan, Johannes Weiner,
	Zefan Li, cgroups, linux-doc, linux-mm, linux-kselftest,
	Shakeel Butt

On Wed, Jul 17, 2024 at 2:26 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 16-07-24 18:06:17, David Finkel wrote:
> > On Tue, Jul 16, 2024 at 4:00 PM Tejun Heo <tj@kernel.org> wrote:
> > >
> > > Hello,
> > >
> > > On Tue, Jul 16, 2024 at 08:00:52PM +0200, Michal Hocko wrote:
> > > ...
> > > > > If we want to allow peak measurement of time periods, I wonder whether we
> > > > > could do something similar to pressure triggers - ie. let users register
> > > > > watchers so that each user can define their own watch periods. This is more
> > > > > involved but more useful and less error-inducing than adding reset to a
> > > > > single counter.
> > > >
> > > > I would rather not get back to that unless we have many more users that
> > > > really need that. Absolute value of the memory consumption is a long
> > > > living concept that doesn't make much sense most of the time. People
> > > > just tend to still use it because it is much simpler to compare two different
> > > > values rather than something as dynamic as PSI similar metrics.
> > >
> > > The initial justification for adding memory.peak was that it's mostly to
> > > monitor short lived cgroups. Adding reset would make it used more widely,
> > > which isn't necessarily a bad thing and people most likely will find ways to
> > > use it creatively. I'm mostly worried that that's going to create a mess
> > > down the road. Yeah, so, it's not widely useful now but adding reset makes
> > > it more useful and in a way which can potentially paint us into a corner.
> >
> > This is a pretty low-overhead feature as-is, but we can reduce it further by
> > changing it so instead of resetting the watermark on any non-empty string,
> > we reset only on one particular string.
> >
> > I'm thinking of something like "global_reset\n", so if we do something like the
> > PSI interface later, users can write "fd_local_reset\n", and get that
> > nicer behavior.
> >
> > This also has the benefit of allowing "echo global_reset >
> > /sys/fs/cgroup/.../memory.peak" to do the right thing.
> > (better names welcome)
>
> This would be a different behavior than in v1 and therefore confusing
> for those who rely on this in v1 already. So I wouldn't overengineer it
> and keep the semantic as simple as possible. If we decide to add PSI
> triggers they are completely independent on peak value because that is
> reclaim based interface which by definition makes peak value very
> dubious.

That's fair.

My only thought is that "write any non-empty string", is a very wide interface
to support, and limits other possible behaviors later.

FWIW, I have patches with and without this narrowed interface ready to re-mail
pending the outcome of this discussion. (both include additional
use-case info in the changelog)

Keeping the "all non-empty writes" behavior:
https://github.com/dfinkel/linux/commit/5edb21882a88693024a95bbd76b4a8d4561348da

With the narrowed interface:
https://github.com/dfinkel/linux/commit/f341c8c0cf5fbdcf7af30e20b334e532df74906d

> --
> Michal Hocko
> SUSE Labs


Thanks,
-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-17 14:24               ` David Finkel
@ 2024-07-17 15:46                 ` Michal Hocko
  0 siblings, 0 replies; 47+ messages in thread
From: Michal Hocko @ 2024-07-17 15:46 UTC (permalink / raw)
  To: David Finkel
  Cc: Tejun Heo, Muchun Song, Andrew Morton, core-services,
	Jonathan Corbet, Roman Gushchin, Shuah Khan, Johannes Weiner,
	Zefan Li, cgroups, linux-doc, linux-mm, linux-kselftest,
	Shakeel Butt

On Wed 17-07-24 10:24:07, David Finkel wrote:
> On Wed, Jul 17, 2024 at 2:26 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Tue 16-07-24 18:06:17, David Finkel wrote:
[...]
> > > I'm thinking of something like "global_reset\n", so if we do something like the
> > > PSI interface later, users can write "fd_local_reset\n", and get that
> > > nicer behavior.
> > >
> > > This also has the benefit of allowing "echo global_reset >
> > > /sys/fs/cgroup/.../memory.peak" to do the right thing.
> > > (better names welcome)
> >
> > This would be a different behavior than in v1 and therefore confusing
> > for those who rely on this in v1 already. So I wouldn't overengineer it
> > and keep the semantic as simple as possible. If we decide to add PSI
> > triggers they are completely independent on peak value because that is
> > reclaim based interface which by definition makes peak value very
> > dubious.
> 
> That's fair.
> 
> My only thought is that "write any non-empty string", is a very wide interface
> to support, and limits other possible behaviors later.

yes, that ship has sailed long time ago.

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 20:00         ` Tejun Heo
  2024-07-16 22:06           ` David Finkel
@ 2024-07-17  6:23           ` Michal Hocko
  1 sibling, 0 replies; 47+ messages in thread
From: Michal Hocko @ 2024-07-17  6:23 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Finkel, Muchun Song, Andrew Morton, core-services,
	Jonathan Corbet, Roman Gushchin, Shakeel Butt, Shuah Khan,
	Johannes Weiner, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Tue 16-07-24 10:00:10, Tejun Heo wrote:
> Hello,
> 
> On Tue, Jul 16, 2024 at 08:00:52PM +0200, Michal Hocko wrote:
> ...
> > > If we want to allow peak measurement of time periods, I wonder whether we
> > > could do something similar to pressure triggers - ie. let users register
> > > watchers so that each user can define their own watch periods. This is more
> > > involved but more useful and less error-inducing than adding reset to a
> > > single counter.
> > 
> > I would rather not get back to that unless we have many more users that
> > really need that. Absolute value of the memory consumption is a long
> > living concept that doesn't make much sense most of the time. People
> > just tend to still use it because it is much simpler to compare two different
> > values rather than something as dynamic as PSI similar metrics.
> 
> The initial justification for adding memory.peak was that it's mostly to
> monitor short lived cgroups. Adding reset would make it used more widely,
> which isn't necessarily a bad thing and people most likely will find ways to
> use it creatively. I'm mostly worried that that's going to create a mess
> down the road. Yeah, so, it's not widely useful now but adding reset makes
> it more useful and in a way which can potentially paint us into a corner.

I really fail to see how this would cause problems with future
maintainability. It is not like we are trying to deprecate this
memory.peak.

I am also not sure this makes it so much more attractive that people
would start using it just because they can reset the value. It makes
sense to extend our documentation and actually describe pitfalls.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-16 16:44     ` Tejun Heo
                         ` (2 preceding siblings ...)
  2024-07-16 18:00       ` Michal Hocko
@ 2024-07-17 17:04       ` Johannes Weiner
  2024-07-17 20:14         ` David Finkel
  2024-07-18 21:49         ` David Finkel
  3 siblings, 2 replies; 47+ messages in thread
From: Johannes Weiner @ 2024-07-17 17:04 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michal Hocko, David Finkel, Muchun Song, Andrew Morton,
	core-services, Jonathan Corbet, Roman Gushchin, Shakeel Butt,
	Shuah Khan, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> Hello,
> 
> On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> ...
> > > This behavior is particularly useful for work scheduling systems that
> > > need to track memory usage of worker processes/cgroups per-work-item.
> > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > opinions), these systems need to track the peak memory usage to compute
> > > system/container fullness when binpacking workitems.
> 
> Swap still has bad reps but there's nothing drastically worse about it than
> page cache. ie. If you're under memory pressure, you get thrashing one way
> or another. If there's no swap, the system is just memlocking anon memory
> even when they are a lot colder than page cache, so I'm skeptical that no
> swap + mostly anon + kernel OOM kills is a good strategy in general
> especially given that the system behavior is not very predictable under OOM
> conditions.
> 
> > As mentioned down the email thread, I consider usefulness of peak value
> > rather limited. It is misleading when memory is reclaimed. But
> > fundamentally I do not oppose to unifying the write behavior to reset
> > values.
> 
> The removal of resets was intentional. The problem was that it wasn't clear
> who owned those counters and there's no way of telling who reset what when.
> It was easy to accidentally end up with multiple entities that think they
> can get timed measurement by resetting.
> 
> So, in general, I don't think this is a great idea. There are shortcomings
> to how memory.peak behaves in that its meaningfulness quickly declines over
> time. This is expected and the rationale behind adding memory.peak, IIRC,
> was that it was difficult to tell the memory usage of a short-lived cgroup.
> 
> If we want to allow peak measurement of time periods, I wonder whether we
> could do something similar to pressure triggers - ie. let users register
> watchers so that each user can define their own watch periods. This is more
> involved but more useful and less error-inducing than adding reset to a
> single counter.
> 
> Johannes, what do you think?

I'm also not a fan of the ability to reset globally.

I seem to remember a scheme we discussed some time ago to do local
state tracking without having the overhead in the page counter
fastpath. The new data that needs to be tracked is a pc->local_peak
(in the page_counter) and an fd->peak (in the watcher's file state).

1. Usage peak is tracked in pc->watermark, and now also in pc->local_peak.

2. Somebody opens the memory.peak. Initialize fd->peak = -1.

3. If they write, set fd->peak = pc->local_peak = usage.

4. Usage grows.

5. They read(). A conventional reader has fd->peak == -1, so we return
   pc->watermark. If the fd has been written to, return max(fd->peak, pc->local_peak).

6. Usage drops.

7. New watcher opens and writes. Bring up all existing watchers'
   fd->peak (that aren't -1) to pc->local_peak *iff* latter is bigger.
   Then set the new fd->peak = pc->local_peak = current usage as in 3.

8. See 5. again for read() from each watcher.

This way all fd's can arbitrarily start tracking new local peaks with
write(). The operation in the charging fast path is cheap. The write()
is O(existing_watchers), which seems reasonable. It's fully backward
compatible with conventional open() + read() users.


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-17 17:04       ` Johannes Weiner
@ 2024-07-17 20:14         ` David Finkel
  2024-07-17 20:44           ` Johannes Weiner
  2024-07-18 21:49         ` David Finkel
  1 sibling, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-17 20:14 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Tejun Heo, Michal Hocko, Muchun Song, Andrew Morton,
	core-services, Jonathan Corbet, Roman Gushchin, Shuah Khan,
	Zefan Li, cgroups, linux-doc, linux-mm, linux-kselftest,
	Shakeel Butt

On Wed, Jul 17, 2024 at 1:04 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> > Hello,
> >
> > On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> > ...
> > > > This behavior is particularly useful for work scheduling systems that
> > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > opinions), these systems need to track the peak memory usage to compute
> > > > system/container fullness when binpacking workitems.
> >
> > Swap still has bad reps but there's nothing drastically worse about it than
> > page cache. ie. If you're under memory pressure, you get thrashing one way
> > or another. If there's no swap, the system is just memlocking anon memory
> > even when they are a lot colder than page cache, so I'm skeptical that no
> > swap + mostly anon + kernel OOM kills is a good strategy in general
> > especially given that the system behavior is not very predictable under OOM
> > conditions.
> >
> > > As mentioned down the email thread, I consider usefulness of peak value
> > > rather limited. It is misleading when memory is reclaimed. But
> > > fundamentally I do not oppose to unifying the write behavior to reset
> > > values.
> >
> > The removal of resets was intentional. The problem was that it wasn't clear
> > who owned those counters and there's no way of telling who reset what when.
> > It was easy to accidentally end up with multiple entities that think they
> > can get timed measurement by resetting.
> >
> > So, in general, I don't think this is a great idea. There are shortcomings
> > to how memory.peak behaves in that its meaningfulness quickly declines over
> > time. This is expected and the rationale behind adding memory.peak, IIRC,
> > was that it was difficult to tell the memory usage of a short-lived cgroup.
> >
> > If we want to allow peak measurement of time periods, I wonder whether we
> > could do something similar to pressure triggers - ie. let users register
> > watchers so that each user can define their own watch periods. This is more
> > involved but more useful and less error-inducing than adding reset to a
> > single counter.
> >
> > Johannes, what do you think?
>
> I'm also not a fan of the ability to reset globally.
>
> I seem to remember a scheme we discussed some time ago to do local
> state tracking without having the overhead in the page counter
> fastpath. The new data that needs to be tracked is a pc->local_peak
> (in the page_counter) and an fd->peak (in the watcher's file state).
>
> 1. Usage peak is tracked in pc->watermark, and now also in pc->local_peak.
>
> 2. Somebody opens the memory.peak. Initialize fd->peak = -1.
>
> 3. If they write, set fd->peak = pc->local_peak = usage.
>
> 4. Usage grows.
>
> 5. They read(). A conventional reader has fd->peak == -1, so we return
>    pc->watermark. If the fd has been written to, return max(fd->peak, pc->local_peak).
>
> 6. Usage drops.
>
> 7. New watcher opens and writes. Bring up all existing watchers'
>    fd->peak (that aren't -1) to pc->local_peak *iff* latter is bigger.
>    Then set the new fd->peak = pc->local_peak = current usage as in 3.
>
> 8. See 5. again for read() from each watcher.
>
> This way all fd's can arbitrarily start tracking new local peaks with
> write(). The operation in the charging fast path is cheap. The write()
> is O(existing_watchers), which seems reasonable. It's fully backward
> compatible with conventional open() + read() users.

That scheme seems viable, but it's a lot more work to implement and maintain
than a simple global reset.

Since that scheme maintains a separate pc->local_peak, it's not mutually
exclusive with implementing a global reset now. (as long as we reserve a
way to distinguish the different kinds of writes).

As discussed on other sub-threads, this might be too niche to be worth
the significant complexity of avoiding a global reset. (especially when
users would likely be moving from cgroups v1 which does have a global reset)

Thanks,
-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-17 20:14         ` David Finkel
@ 2024-07-17 20:44           ` Johannes Weiner
  2024-07-17 21:13             ` David Finkel
  2024-07-18  7:21             ` Michal Hocko
  0 siblings, 2 replies; 47+ messages in thread
From: Johannes Weiner @ 2024-07-17 20:44 UTC (permalink / raw)
  To: David Finkel
  Cc: Tejun Heo, Michal Hocko, Muchun Song, Andrew Morton,
	core-services, Jonathan Corbet, Roman Gushchin, Shuah Khan,
	Zefan Li, cgroups, linux-doc, linux-mm, linux-kselftest,
	Shakeel Butt

On Wed, Jul 17, 2024 at 04:14:07PM -0400, David Finkel wrote:
> On Wed, Jul 17, 2024 at 1:04 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> > > Hello,
> > >
> > > On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> > > ...
> > > > > This behavior is particularly useful for work scheduling systems that
> > > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > > opinions), these systems need to track the peak memory usage to compute
> > > > > system/container fullness when binpacking workitems.
> > >
> > > Swap still has bad reps but there's nothing drastically worse about it than
> > > page cache. ie. If you're under memory pressure, you get thrashing one way
> > > or another. If there's no swap, the system is just memlocking anon memory
> > > even when they are a lot colder than page cache, so I'm skeptical that no
> > > swap + mostly anon + kernel OOM kills is a good strategy in general
> > > especially given that the system behavior is not very predictable under OOM
> > > conditions.
> > >
> > > > As mentioned down the email thread, I consider usefulness of peak value
> > > > rather limited. It is misleading when memory is reclaimed. But
> > > > fundamentally I do not oppose to unifying the write behavior to reset
> > > > values.
> > >
> > > The removal of resets was intentional. The problem was that it wasn't clear
> > > who owned those counters and there's no way of telling who reset what when.
> > > It was easy to accidentally end up with multiple entities that think they
> > > can get timed measurement by resetting.
> > >
> > > So, in general, I don't think this is a great idea. There are shortcomings
> > > to how memory.peak behaves in that its meaningfulness quickly declines over
> > > time. This is expected and the rationale behind adding memory.peak, IIRC,
> > > was that it was difficult to tell the memory usage of a short-lived cgroup.
> > >
> > > If we want to allow peak measurement of time periods, I wonder whether we
> > > could do something similar to pressure triggers - ie. let users register
> > > watchers so that each user can define their own watch periods. This is more
> > > involved but more useful and less error-inducing than adding reset to a
> > > single counter.
> > >
> > > Johannes, what do you think?
> >
> > I'm also not a fan of the ability to reset globally.
> >
> > I seem to remember a scheme we discussed some time ago to do local
> > state tracking without having the overhead in the page counter
> > fastpath. The new data that needs to be tracked is a pc->local_peak
> > (in the page_counter) and an fd->peak (in the watcher's file state).
> >
> > 1. Usage peak is tracked in pc->watermark, and now also in pc->local_peak.
> >
> > 2. Somebody opens the memory.peak. Initialize fd->peak = -1.
> >
> > 3. If they write, set fd->peak = pc->local_peak = usage.
> >
> > 4. Usage grows.
> >
> > 5. They read(). A conventional reader has fd->peak == -1, so we return
> >    pc->watermark. If the fd has been written to, return max(fd->peak, pc->local_peak).
> >
> > 6. Usage drops.
> >
> > 7. New watcher opens and writes. Bring up all existing watchers'
> >    fd->peak (that aren't -1) to pc->local_peak *iff* latter is bigger.
> >    Then set the new fd->peak = pc->local_peak = current usage as in 3.
> >
> > 8. See 5. again for read() from each watcher.
> >
> > This way all fd's can arbitrarily start tracking new local peaks with
> > write(). The operation in the charging fast path is cheap. The write()
> > is O(existing_watchers), which seems reasonable. It's fully backward
> > compatible with conventional open() + read() users.
> 
> That scheme seems viable, but it's a lot more work to implement and maintain
> than a simple global reset.
> 
> Since that scheme maintains a separate pc->local_peak, it's not mutually
> exclusive with implementing a global reset now. (as long as we reserve a
> way to distinguish the different kinds of writes).
> 
> As discussed on other sub-threads, this might be too niche to be worth
> the significant complexity of avoiding a global reset. (especially when
> users would likely be moving from cgroups v1 which does have a global reset)

The problem is that once global resetting is allowed, it makes the
number reported in memory.peak unreliable for everyone. You just don't
know, and can't tell, if somebody wrote to it recently. It's not too
much of a leap to say this breaks the existing interface contract.

You have to decide whether the above is worth implementing. But my
take is that the downsides of the simpler solution outweigh its
benefits.


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-17 20:44           ` Johannes Weiner
@ 2024-07-17 21:13             ` David Finkel
  2024-07-17 23:48               ` Waiman Long
  2024-07-18  7:21             ` Michal Hocko
  1 sibling, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-17 21:13 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Tejun Heo, Michal Hocko, Muchun Song, Andrew Morton,
	core-services, Jonathan Corbet, Roman Gushchin, Shuah Khan,
	Zefan Li, cgroups, linux-doc, linux-mm, linux-kselftest,
	Shakeel Butt

On Wed, Jul 17, 2024 at 4:44 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Wed, Jul 17, 2024 at 04:14:07PM -0400, David Finkel wrote:
> > On Wed, Jul 17, 2024 at 1:04 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > >
> > > On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> > > > Hello,
> > > >
> > > > On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> > > > ...
> > > > > > This behavior is particularly useful for work scheduling systems that
> > > > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > > > opinions), these systems need to track the peak memory usage to compute
> > > > > > system/container fullness when binpacking workitems.
> > > >
> > > > Swap still has bad reps but there's nothing drastically worse about it than
> > > > page cache. ie. If you're under memory pressure, you get thrashing one way
> > > > or another. If there's no swap, the system is just memlocking anon memory
> > > > even when they are a lot colder than page cache, so I'm skeptical that no
> > > > swap + mostly anon + kernel OOM kills is a good strategy in general
> > > > especially given that the system behavior is not very predictable under OOM
> > > > conditions.
> > > >
> > > > > As mentioned down the email thread, I consider usefulness of peak value
> > > > > rather limited. It is misleading when memory is reclaimed. But
> > > > > fundamentally I do not oppose to unifying the write behavior to reset
> > > > > values.
> > > >
> > > > The removal of resets was intentional. The problem was that it wasn't clear
> > > > who owned those counters and there's no way of telling who reset what when.
> > > > It was easy to accidentally end up with multiple entities that think they
> > > > can get timed measurement by resetting.
> > > >
> > > > So, in general, I don't think this is a great idea. There are shortcomings
> > > > to how memory.peak behaves in that its meaningfulness quickly declines over
> > > > time. This is expected and the rationale behind adding memory.peak, IIRC,
> > > > was that it was difficult to tell the memory usage of a short-lived cgroup.
> > > >
> > > > If we want to allow peak measurement of time periods, I wonder whether we
> > > > could do something similar to pressure triggers - ie. let users register
> > > > watchers so that each user can define their own watch periods. This is more
> > > > involved but more useful and less error-inducing than adding reset to a
> > > > single counter.
> > > >
> > > > Johannes, what do you think?
> > >
> > > I'm also not a fan of the ability to reset globally.
> > >
> > > I seem to remember a scheme we discussed some time ago to do local
> > > state tracking without having the overhead in the page counter
> > > fastpath. The new data that needs to be tracked is a pc->local_peak
> > > (in the page_counter) and an fd->peak (in the watcher's file state).
> > >
> > > 1. Usage peak is tracked in pc->watermark, and now also in pc->local_peak.
> > >
> > > 2. Somebody opens the memory.peak. Initialize fd->peak = -1.
> > >
> > > 3. If they write, set fd->peak = pc->local_peak = usage.
> > >
> > > 4. Usage grows.
> > >
> > > 5. They read(). A conventional reader has fd->peak == -1, so we return
> > >    pc->watermark. If the fd has been written to, return max(fd->peak, pc->local_peak).
> > >
> > > 6. Usage drops.
> > >
> > > 7. New watcher opens and writes. Bring up all existing watchers'
> > >    fd->peak (that aren't -1) to pc->local_peak *iff* latter is bigger.
> > >    Then set the new fd->peak = pc->local_peak = current usage as in 3.
> > >
> > > 8. See 5. again for read() from each watcher.
> > >
> > > This way all fd's can arbitrarily start tracking new local peaks with
> > > write(). The operation in the charging fast path is cheap. The write()
> > > is O(existing_watchers), which seems reasonable. It's fully backward
> > > compatible with conventional open() + read() users.
> >
> > That scheme seems viable, but it's a lot more work to implement and maintain
> > than a simple global reset.
> >
> > Since that scheme maintains a separate pc->local_peak, it's not mutually
> > exclusive with implementing a global reset now. (as long as we reserve a
> > way to distinguish the different kinds of writes).
> >
> > As discussed on other sub-threads, this might be too niche to be worth
> > the significant complexity of avoiding a global reset. (especially when
> > users would likely be moving from cgroups v1 which does have a global reset)
>
> The problem is that once global resetting is allowed, it makes the
> number reported in memory.peak unreliable for everyone. You just don't
> know, and can't tell, if somebody wrote to it recently. It's not too
> much of a leap to say this breaks the existing interface contract.

It does make it hard to tell when it was reset, however, it also allows some
very powerful commandline interactions that aren't possible if you need to
keep a persistent fd open.

I have run things in cgroups to measure peak memory and CPU-time for
things that have subprocesses. If I needed to keep a persistent fd open
in order to reset the high watermark, it would have been far less useful.

Honestly, I don't see a ton of value in tracking the peak memory if I
can't reset it.
It's not my use-case, but, there are a lot of cases where process-startup uses
a lot more memory than the steady-state, so the sysadmin might want to
measure that startup peak and any later peaks separately.

In my use-case, I do have a long-lived process managing the cgroups
for its workers, so I could keep an fd around and reset it as necessary.
However, I do sometimes shell into the relevant k8s container and poke
at the cgroups with a shell, and having to dup that managing processes'
FD somehow to check the high watermark while debugging would be
rather annoying. (although definitely not a dealbreaker)

>
> You have to decide whether the above is worth implementing. But my
> take is that the downsides of the simpler solution outweigh its
> benefits.

There are a few parts to my reticence to implement something
more complicated.
 1) Correctly cleaning up when one of those FDs gets closed can
     be subtle
 2) It's a lot of code, in some very sensitive portions of the kernel,
     so I'd need to test that code a lot more than I do for slapping
     a new entrypoint on the existing watermark reset of the
     page_counter.
 3) For various reasons, the relevant workload runs on
     Google Kubernetes Engine with their Container Optimised OS.
     If the patch is simple enough, I can request that Google
     cherry-pick the relevant commit, so we don't have to wait
     over a year for the next LTS kernel to roll out before we
     can switch to cgroups v2.

It would be a nice personal challenge to implement the solution
you suggest, but it's definitely not something I'd knock out in the
next couple days.

Thanks,
-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-17 21:13             ` David Finkel
@ 2024-07-17 23:48               ` Waiman Long
  2024-07-18  1:24                 ` Tejun Heo
  0 siblings, 1 reply; 47+ messages in thread
From: Waiman Long @ 2024-07-17 23:48 UTC (permalink / raw)
  To: David Finkel, Johannes Weiner
  Cc: Tejun Heo, Michal Hocko, Muchun Song, Andrew Morton,
	core-services, Jonathan Corbet, Roman Gushchin, Shuah Khan,
	Zefan Li, cgroups, linux-doc, linux-mm, linux-kselftest,
	Shakeel Butt

On 7/17/24 17:13, David Finkel wrote:
> On Wed, Jul 17, 2024 at 4:44 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>> On Wed, Jul 17, 2024 at 04:14:07PM -0400, David Finkel wrote:
>>> On Wed, Jul 17, 2024 at 1:04 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>>>> On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
>>>>> Hello,
>>>>>
>>>>> On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
>>>>> ...
>>>>>>> This behavior is particularly useful for work scheduling systems that
>>>>>>> need to track memory usage of worker processes/cgroups per-work-item.
>>>>>>> Since memory can't be squeezed like CPU can (the OOM-killer has
>>>>>>> opinions), these systems need to track the peak memory usage to compute
>>>>>>> system/container fullness when binpacking workitems.
>>>>> Swap still has bad reps but there's nothing drastically worse about it than
>>>>> page cache. ie. If you're under memory pressure, you get thrashing one way
>>>>> or another. If there's no swap, the system is just memlocking anon memory
>>>>> even when they are a lot colder than page cache, so I'm skeptical that no
>>>>> swap + mostly anon + kernel OOM kills is a good strategy in general
>>>>> especially given that the system behavior is not very predictable under OOM
>>>>> conditions.
>>>>>
>>>>>> As mentioned down the email thread, I consider usefulness of peak value
>>>>>> rather limited. It is misleading when memory is reclaimed. But
>>>>>> fundamentally I do not oppose to unifying the write behavior to reset
>>>>>> values.
>>>>> The removal of resets was intentional. The problem was that it wasn't clear
>>>>> who owned those counters and there's no way of telling who reset what when.
>>>>> It was easy to accidentally end up with multiple entities that think they
>>>>> can get timed measurement by resetting.
>>>>>
>>>>> So, in general, I don't think this is a great idea. There are shortcomings
>>>>> to how memory.peak behaves in that its meaningfulness quickly declines over
>>>>> time. This is expected and the rationale behind adding memory.peak, IIRC,
>>>>> was that it was difficult to tell the memory usage of a short-lived cgroup.
>>>>>
>>>>> If we want to allow peak measurement of time periods, I wonder whether we
>>>>> could do something similar to pressure triggers - ie. let users register
>>>>> watchers so that each user can define their own watch periods. This is more
>>>>> involved but more useful and less error-inducing than adding reset to a
>>>>> single counter.
>>>>>
>>>>> Johannes, what do you think?
>>>> I'm also not a fan of the ability to reset globally.
>>>>
>>>> I seem to remember a scheme we discussed some time ago to do local
>>>> state tracking without having the overhead in the page counter
>>>> fastpath. The new data that needs to be tracked is a pc->local_peak
>>>> (in the page_counter) and an fd->peak (in the watcher's file state).
>>>>
>>>> 1. Usage peak is tracked in pc->watermark, and now also in pc->local_peak.
>>>>
>>>> 2. Somebody opens the memory.peak. Initialize fd->peak = -1.
>>>>
>>>> 3. If they write, set fd->peak = pc->local_peak = usage.
>>>>
>>>> 4. Usage grows.
>>>>
>>>> 5. They read(). A conventional reader has fd->peak == -1, so we return
>>>>     pc->watermark. If the fd has been written to, return max(fd->peak, pc->local_peak).
>>>>
>>>> 6. Usage drops.
>>>>
>>>> 7. New watcher opens and writes. Bring up all existing watchers'
>>>>     fd->peak (that aren't -1) to pc->local_peak *iff* latter is bigger.
>>>>     Then set the new fd->peak = pc->local_peak = current usage as in 3.
>>>>
>>>> 8. See 5. again for read() from each watcher.
>>>>
>>>> This way all fd's can arbitrarily start tracking new local peaks with
>>>> write(). The operation in the charging fast path is cheap. The write()
>>>> is O(existing_watchers), which seems reasonable. It's fully backward
>>>> compatible with conventional open() + read() users.
>>> That scheme seems viable, but it's a lot more work to implement and maintain
>>> than a simple global reset.
>>>
>>> Since that scheme maintains a separate pc->local_peak, it's not mutually
>>> exclusive with implementing a global reset now. (as long as we reserve a
>>> way to distinguish the different kinds of writes).
>>>
>>> As discussed on other sub-threads, this might be too niche to be worth
>>> the significant complexity of avoiding a global reset. (especially when
>>> users would likely be moving from cgroups v1 which does have a global reset)
>> The problem is that once global resetting is allowed, it makes the
>> number reported in memory.peak unreliable for everyone. You just don't
>> know, and can't tell, if somebody wrote to it recently. It's not too
>> much of a leap to say this breaks the existing interface contract.
> It does make it hard to tell when it was reset, however, it also allows some
> very powerful commandline interactions that aren't possible if you need to
> keep a persistent fd open.
>
> I have run things in cgroups to measure peak memory and CPU-time for
> things that have subprocesses. If I needed to keep a persistent fd open
> in order to reset the high watermark, it would have been far less useful.
>
> Honestly, I don't see a ton of value in tracking the peak memory if I
> can't reset it.
> It's not my use-case, but, there are a lot of cases where process-startup uses
> a lot more memory than the steady-state, so the sysadmin might want to
> measure that startup peak and any later peaks separately.
>
> In my use-case, I do have a long-lived process managing the cgroups
> for its workers, so I could keep an fd around and reset it as necessary.
> However, I do sometimes shell into the relevant k8s container and poke
> at the cgroups with a shell, and having to dup that managing processes'
> FD somehow to check the high watermark while debugging would be
> rather annoying. (although definitely not a dealbreaker)
>
>> You have to decide whether the above is worth implementing. But my
>> take is that the downsides of the simpler solution outweigh its
>> benefits.
> There are a few parts to my reticence to implement something
> more complicated.
>   1) Correctly cleaning up when one of those FDs gets closed can
>       be subtle
>   2) It's a lot of code, in some very sensitive portions of the kernel,
>       so I'd need to test that code a lot more than I do for slapping
>       a new entrypoint on the existing watermark reset of the
>       page_counter.
>   3) For various reasons, the relevant workload runs on
>       Google Kubernetes Engine with their Container Optimised OS.
>       If the patch is simple enough, I can request that Google
>       cherry-pick the relevant commit, so we don't have to wait
>       over a year for the next LTS kernel to roll out before we
>       can switch to cgroups v2.
>
> It would be a nice personal challenge to implement the solution
> you suggest, but it's definitely not something I'd knock out in the
> next couple days.

How about letting .peak shows two numbers? The first one is the peak 
since the creation of the cgroup and cannot be reset. The second one is 
a local maximum that can be reset to 0. We just to keep track of one 
more counter that should be simple enough to implement.

Cheers,
Longman



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-17 23:48               ` Waiman Long
@ 2024-07-18  1:24                 ` Tejun Heo
  2024-07-18  2:17                   ` Roman Gushchin
  2024-07-18  2:22                   ` Waiman Long
  0 siblings, 2 replies; 47+ messages in thread
From: Tejun Heo @ 2024-07-18  1:24 UTC (permalink / raw)
  To: Waiman Long
  Cc: David Finkel, Johannes Weiner, Michal Hocko, Muchun Song,
	Andrew Morton, core-services, Jonathan Corbet, Roman Gushchin,
	Shuah Khan, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest, Shakeel Butt

On Wed, Jul 17, 2024 at 07:48:40PM -0400, Waiman Long wrote:
...
> How about letting .peak shows two numbers? The first one is the peak since
> the creation of the cgroup and cannot be reset. The second one is a local
> maximum that can be reset to 0. We just to keep track of one more counter
> that should be simple enough to implement.

What Johannes suggested seems to hit all the marks - it's efficient and
relatively simple, the overhead is only on the users of the facility, and
flexible in a straightforward manner. I have a hard time buying the argument
that it's more difficult to use - the benefit to cost ratio seems pretty
clear. Given that, I'm not sure why we'd want to add something fishy that
can lead to longterm problems.

Thanks.

-- 
tejun


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-18  1:24                 ` Tejun Heo
@ 2024-07-18  2:17                   ` Roman Gushchin
  2024-07-18  2:22                   ` Waiman Long
  1 sibling, 0 replies; 47+ messages in thread
From: Roman Gushchin @ 2024-07-18  2:17 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Waiman Long, David Finkel, Johannes Weiner, Michal Hocko,
	Muchun Song, Andrew Morton, core-services, Jonathan Corbet,
	Shuah Khan, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest, Shakeel Butt

On Wed, Jul 17, 2024 at 03:24:49PM -1000, Tejun Heo wrote:
> On Wed, Jul 17, 2024 at 07:48:40PM -0400, Waiman Long wrote:
> ...
> > How about letting .peak shows two numbers? The first one is the peak since
> > the creation of the cgroup and cannot be reset. The second one is a local
> > maximum that can be reset to 0. We just to keep track of one more counter
> > that should be simple enough to implement.
> 
> What Johannes suggested seems to hit all the marks - it's efficient and
> relatively simple, the overhead is only on the users of the facility, and
> flexible in a straightforward manner. I have a hard time buying the argument
> that it's more difficult to use - the benefit to cost ratio seems pretty
> clear. Given that, I'm not sure why we'd want to add something fishy that
> can lead to longterm problems.

+1 to that. I really like Johannes's suggestion.

Thanks


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-18  1:24                 ` Tejun Heo
  2024-07-18  2:17                   ` Roman Gushchin
@ 2024-07-18  2:22                   ` Waiman Long
  1 sibling, 0 replies; 47+ messages in thread
From: Waiman Long @ 2024-07-18  2:22 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Finkel, Johannes Weiner, Michal Hocko, Muchun Song,
	Andrew Morton, core-services, Jonathan Corbet, Roman Gushchin,
	Shuah Khan, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest, Shakeel Butt


On 7/17/24 21:24, Tejun Heo wrote:
> On Wed, Jul 17, 2024 at 07:48:40PM -0400, Waiman Long wrote:
> ...
>> How about letting .peak shows two numbers? The first one is the peak since
>> the creation of the cgroup and cannot be reset. The second one is a local
>> maximum that can be reset to 0. We just to keep track of one more counter
>> that should be simple enough to implement.
> What Johannes suggested seems to hit all the marks - it's efficient and
> relatively simple, the overhead is only on the users of the facility, and
> flexible in a straightforward manner. I have a hard time buying the argument
> that it's more difficult to use - the benefit to cost ratio seems pretty
> clear. Given that, I'm not sure why we'd want to add something fishy that
> can lead to longterm problems.

On second thought, it is a change in the user interface that may break 
existing apps that use the peak file. So it is probably better to go 
with Johannes' suggestion.

Thanks,
Longman



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-17 20:44           ` Johannes Weiner
  2024-07-17 21:13             ` David Finkel
@ 2024-07-18  7:21             ` Michal Hocko
  1 sibling, 0 replies; 47+ messages in thread
From: Michal Hocko @ 2024-07-18  7:21 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: David Finkel, Tejun Heo, Muchun Song, Andrew Morton,
	core-services, Jonathan Corbet, Roman Gushchin, Shuah Khan,
	Zefan Li, cgroups, linux-doc, linux-mm, linux-kselftest,
	Shakeel Butt

On Wed 17-07-24 16:44:53, Johannes Weiner wrote:
[...]
> The problem is that once global resetting is allowed, it makes the
> number reported in memory.peak unreliable for everyone. You just don't
> know, and can't tell, if somebody wrote to it recently. It's not too
> much of a leap to say this breaks the existing interface contract.

I do not remember any bug reports from v1 where there was a max usage
misreported because of uncoordinated value reseting. So while you are
right that this is theoretically possible I am not convinced this is a
real problem in practice.

On the other hand it seems there is a wider agreement this shouldn't be
added to v2 and I do respect that.

> You have to decide whether the above is worth implementing. But my
> take is that the downsides of the simpler solution outweigh its
> benefits.

While this seems quite elegant I am not convinced this is really worth
the additional code for a metric like peak memory consumption which is a
very limited metric in a presence of memory reclaim.

Thanks!
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-17 17:04       ` Johannes Weiner
  2024-07-17 20:14         ` David Finkel
@ 2024-07-18 21:49         ` David Finkel
  2024-07-19  3:23           ` Waiman Long
  1 sibling, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-18 21:49 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Tejun Heo, Michal Hocko, Muchun Song, Andrew Morton,
	core-services, Jonathan Corbet, Roman Gushchin, Shuah Khan,
	Zefan Li, cgroups, linux-doc, linux-mm, linux-kselftest,
	Shakeel Butt

On Wed, Jul 17, 2024 at 1:04 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Tue, Jul 16, 2024 at 06:44:11AM -1000, Tejun Heo wrote:
> > Hello,
> >
> > On Tue, Jul 16, 2024 at 03:48:17PM +0200, Michal Hocko wrote:
> > ...
> > > > This behavior is particularly useful for work scheduling systems that
> > > > need to track memory usage of worker processes/cgroups per-work-item.
> > > > Since memory can't be squeezed like CPU can (the OOM-killer has
> > > > opinions), these systems need to track the peak memory usage to compute
> > > > system/container fullness when binpacking workitems.
> >
> > Swap still has bad reps but there's nothing drastically worse about it than
> > page cache. ie. If you're under memory pressure, you get thrashing one way
> > or another. If there's no swap, the system is just memlocking anon memory
> > even when they are a lot colder than page cache, so I'm skeptical that no
> > swap + mostly anon + kernel OOM kills is a good strategy in general
> > especially given that the system behavior is not very predictable under OOM
> > conditions.
> >
> > > As mentioned down the email thread, I consider usefulness of peak value
> > > rather limited. It is misleading when memory is reclaimed. But
> > > fundamentally I do not oppose to unifying the write behavior to reset
> > > values.
> >
> > The removal of resets was intentional. The problem was that it wasn't clear
> > who owned those counters and there's no way of telling who reset what when.
> > It was easy to accidentally end up with multiple entities that think they
> > can get timed measurement by resetting.
> >
> > So, in general, I don't think this is a great idea. There are shortcomings
> > to how memory.peak behaves in that its meaningfulness quickly declines over
> > time. This is expected and the rationale behind adding memory.peak, IIRC,
> > was that it was difficult to tell the memory usage of a short-lived cgroup.
> >
> > If we want to allow peak measurement of time periods, I wonder whether we
> > could do something similar to pressure triggers - ie. let users register
> > watchers so that each user can define their own watch periods. This is more
> > involved but more useful and less error-inducing than adding reset to a
> > single counter.
> >
> > Johannes, what do you think?
>
> I'm also not a fan of the ability to reset globally.
>
> I seem to remember a scheme we discussed some time ago to do local
> state tracking without having the overhead in the page counter
> fastpath. The new data that needs to be tracked is a pc->local_peak
> (in the page_counter) and an fd->peak (in the watcher's file state).
>
> 1. Usage peak is tracked in pc->watermark, and now also in pc->local_peak.
>
> 2. Somebody opens the memory.peak. Initialize fd->peak = -1.
>
> 3. If they write, set fd->peak = pc->local_peak = usage.
>
> 4. Usage grows.
>
> 5. They read(). A conventional reader has fd->peak == -1, so we return
>    pc->watermark. If the fd has been written to, return max(fd->peak, pc->local_peak).
>
> 6. Usage drops.
>
> 7. New watcher opens and writes. Bring up all existing watchers'
>    fd->peak (that aren't -1) to pc->local_peak *iff* latter is bigger.
>    Then set the new fd->peak = pc->local_peak = current usage as in 3.
>
> 8. See 5. again for read() from each watcher.
>
> This way all fd's can arbitrarily start tracking new local peaks with
> write(). The operation in the charging fast path is cheap. The write()
> is O(existing_watchers), which seems reasonable. It's fully backward
> compatible with conventional open() + read() users.

I spent some time today attempting to implement this.
Here's a branch on github that compiles, and I think is close to what you
described, but is definitely still a WIP:

https://github.com/torvalds/linux/compare/master...dfinkel:linux:memcg2_memory_peak_fd_session

Since there seems to be significant agreement that this approach is better
long-term as a kernel interface, if that continues, I can factor out some of
the changes so it supports both memory.peak and memory.swap.peak,
fix the tests, and clean up any style issues tomorrow.

Also, If there are opinions on whether the cgroup_lock is a reasonable way
of handling this synchronization, or if I should add a more appropriate spinlock
or mutex onto either the pagecounter or the memcg, I'm all ears.

(I can mail the WIP change as-is if that's prefered to github)

-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-18 21:49         ` David Finkel
@ 2024-07-19  3:23           ` Waiman Long
  2024-07-22 15:18             ` David Finkel
  0 siblings, 1 reply; 47+ messages in thread
From: Waiman Long @ 2024-07-19  3:23 UTC (permalink / raw)
  To: David Finkel, Johannes Weiner
  Cc: Tejun Heo, Michal Hocko, Muchun Song, Andrew Morton,
	core-services, Jonathan Corbet, Roman Gushchin, Shuah Khan,
	Zefan Li, cgroups, linux-doc, linux-mm, linux-kselftest,
	Shakeel Butt


On 7/18/24 17:49, David Finkel wrote:
> I spent some time today attempting to implement this.
> Here's a branch on github that compiles, and I think is close to what you
> described, but is definitely still a WIP:
>
> https://github.com/torvalds/linux/compare/master...dfinkel:linux:memcg2_memory_peak_fd_session
>
> Since there seems to be significant agreement that this approach is better
> long-term as a kernel interface, if that continues, I can factor out some of
> the changes so it supports both memory.peak and memory.swap.peak,
> fix the tests, and clean up any style issues tomorrow.
>
> Also, If there are opinions on whether the cgroup_lock is a reasonable way
> of handling this synchronization, or if I should add a more appropriate spinlock
> or mutex onto either the pagecounter or the memcg, I'm all ears.

cgroup_lock() should only be used by the cgroup core code, though there 
are exceptions.

You may or may not need lock protection depending on what data you want 
to protect and if there is any chance that concurrent race may screw 
thing up. If lock protection is really needed, add your own lock to 
protect the data. Since your critical sections seem to be pretty short, 
a regular spinlock will be enough.

Cheers,
Longman



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-19  3:23           ` Waiman Long
@ 2024-07-22 15:18             ` David Finkel
  0 siblings, 0 replies; 47+ messages in thread
From: David Finkel @ 2024-07-22 15:18 UTC (permalink / raw)
  To: Waiman Long
  Cc: Johannes Weiner, Tejun Heo, Michal Hocko, Muchun Song,
	Andrew Morton, core-services, Jonathan Corbet, Roman Gushchin,
	Shuah Khan, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest, Shakeel Butt

On Thu, Jul 18, 2024 at 11:23 PM Waiman Long <longman@redhat.com> wrote:
>
>
> On 7/18/24 17:49, David Finkel wrote:
> > I spent some time today attempting to implement this.
> > Here's a branch on github that compiles, and I think is close to what you
> > described, but is definitely still a WIP:
> >
> > https://github.com/torvalds/linux/compare/master...dfinkel:linux:memcg2_memory_peak_fd_session
> >
> > Since there seems to be significant agreement that this approach is better
> > long-term as a kernel interface, if that continues, I can factor out some of
> > the changes so it supports both memory.peak and memory.swap.peak,
> > fix the tests, and clean up any style issues tomorrow.
> >
> > Also, If there are opinions on whether the cgroup_lock is a reasonable way
> > of handling this synchronization, or if I should add a more appropriate spinlock
> > or mutex onto either the pagecounter or the memcg, I'm all ears.
>
> cgroup_lock() should only be used by the cgroup core code, though there
> are exceptions.
>
> You may or may not need lock protection depending on what data you want
> to protect and if there is any chance that concurrent race may screw
> thing up. If lock protection is really needed, add your own lock to
> protect the data. Since your critical sections seem to be pretty short,
> a regular spinlock will be enough.

Thanks, using cgroup_lock() created a deadlock, and using a spinlock
resolved that.

>
> Cheers,
> Longman
>


-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers (fd-local edition)
@ 2024-07-22 15:17 David Finkel
  2024-07-22 15:17 ` [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers David Finkel
  0 siblings, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-22 15:17 UTC (permalink / raw)
  To: Muchun Song, Tejun Heo, Andrew Morton
  Cc: core-services, Jonathan Corbet, Michal Hocko, Roman Gushchin,
	Shakeel Butt, Shuah Khan, Johannes Weiner, Zefan Li, cgroups,
	linux-doc, linux-mm, linux-kselftest

My last patch[1] was met with a general desire for a safer scheme that
avoided global resets, which expose unclear ownership.

Fortunately, Johannes[2] suggested a reasonably simple scheme to provide
an FD-local reset, which eliminates most of those issues.

The one open question I have is whether the cgroup/memcg itself is kept
alive by an open FD, or if we need to update the memcg freeing code to
traverse the new list of "watchers" so they don't try to access freed
memory.

Thank you,

David Finkel
Senior Principal Software Engineer, Core Services
Vimeo Inc.

[1]: https://lore.kernel.org/cgroups/20240715203625.1462309-1-davidf@vimeo.com/
[2]: https://lore.kernel.org/cgroups/20240717170408.GC1321673@cmpxchg.org/

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-22 15:17 [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers (fd-local edition) David Finkel
@ 2024-07-22 15:17 ` David Finkel
  2024-07-22 18:22   ` Roman Gushchin
  0 siblings, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-22 15:17 UTC (permalink / raw)
  To: Muchun Song, Tejun Heo, Andrew Morton
  Cc: core-services, Jonathan Corbet, Michal Hocko, Roman Gushchin,
	Shakeel Butt, Shuah Khan, Johannes Weiner, Zefan Li, cgroups,
	linux-doc, linux-mm, linux-kselftest, David Finkel

Other mechanisms for querying the peak memory usage of either a process
or v1 memory cgroup allow for resetting the high watermark. Restore
parity with those mechanisms, but with a less racy API.

For example:
 - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
   the high watermark.
 - writing "5" to the clear_refs pseudo-file in a processes's proc
   directory resets the peak RSS.

This change is an evolution of a previous patch, which mostly copied the
cgroup v1 behavior, however, there were concerns about races/ownership
issues with a global reset, so instead this change makes the reset
filedescriptor-local.

Writing a specific string to the memory.peak and memory.swap.peak
pseudo-files reset the high watermark to the current usage for
subsequent reads through that same fd.

Notably, following Johannes's suggestion, this implementation moves the
O(fds that have written) behavior onto the fd write(2) path. Instead, on
the page-allocation path, we simply add one additional watermark to
conditionally bump per-hierarchy level in the page-counter.

This behavior is particularly useful for work scheduling systems that
need to track memory usage of worker processes/cgroups per-work-item.
Since memory can't be squeezed like CPU can (the OOM-killer has
opinions), these systems need to track the peak memory usage to compute
system/container fullness when binpacking workitems.

Most notably, Vimeo's use-case involves a system that's doing global
binpacking across many Kubernetes pods/containers, and while we can use
PSI for some local decisions about overload, we strive to avoid packing
workloads too tightly in the first place. To facilitate this, we track
the peak memory usage. However, since we run with long-lived workers (to
amortize startup costs) we need a way to track the high watermark while
a work-item is executing. Polling runs the risk of missing short spikes
that last for timescales below the polling interval, and peak memory
tracking at the cgroup level is otherwise perfect for this use-case.

As this data is used to ensure that binpacked work ends up with
sufficient headroom, this use-case mostly avoids the inaccuracies
surrounding reclaimable memory.

Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: David Finkel <davidf@vimeo.com>
---
 Documentation/admin-guide/cgroup-v2.rst       |  26 +-
 include/linux/cgroup.h                        |   7 +
 include/linux/memcontrol.h                    |   5 +
 include/linux/page_counter.h                  |   6 +
 kernel/cgroup/cgroup-internal.h               |   2 +
 kernel/cgroup/cgroup.c                        |   7 +
 mm/memcontrol.c                               | 165 ++++++++++++-
 mm/page_counter.c                             |   4 +
 tools/testing/selftests/cgroup/cgroup_util.c  |  22 ++
 tools/testing/selftests/cgroup/cgroup_util.h  |   2 +
 .../selftests/cgroup/test_memcontrol.c        | 227 +++++++++++++++++-
 11 files changed, 448 insertions(+), 25 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 8fbb0519d556..10a2f919128f 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1322,11 +1322,16 @@ PAGE_SIZE multiple when read back.
 	reclaim induced by memory.reclaim.
 
   memory.peak
-	A read-only single value file which exists on non-root
-	cgroups.
+	A read-write single value file which exists on non-root cgroups.
+
+	The max memory usage recorded for the cgroup and its descendants since
+	either the creation of the cgroup or the most recent reset for that fd.
 
-	The max memory usage recorded for the cgroup and its
-	descendants since the creation of the cgroup.
+	A write of the string "fd_local_reset" to this file resets it to the
+	current memory usage for subsequent reads through the same
+	file descriptor.
+	Attempts to write any other string will return EINVAL
+	(modulo leading and trailing whitespace).
 
   memory.oom.group
 	A read-write single value file which exists on non-root
@@ -1652,11 +1657,16 @@ PAGE_SIZE multiple when read back.
 	Healthy workloads are not expected to reach this limit.
 
   memory.swap.peak
-	A read-only single value file which exists on non-root
-	cgroups.
+	A read-write single value file which exists on non-root cgroups.
+
+	The max swap usage recorded for the cgroup and its descendants since
+	the creation of the cgroup or the most recent reset for that fd.
 
-	The max swap usage recorded for the cgroup and its
-	descendants since the creation of the cgroup.
+	A write of the string "fd_local_reset" to this file resets it to the
+	current memory usage for subsequent reads through the same
+	file descriptor.
+	Attempts to write any other string will return EINVAL
+	(modulo leading and trailing whitespace).
 
   memory.swap.max
 	A read-write single value file which exists on non-root
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 2150ca60394b..9bda441227ea 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -855,4 +855,11 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 
 struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
 
+struct memcg_peak_mem_ctx {
+	long				local_watermark;
+	struct memcg_peak_mem_ctx	*next, *prev;
+};
+
+struct memcg_peak_mem_ctx *memcg_extract_peak_mem_ctx(struct kernfs_open_file *of);
+
 #endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 030d34e9d117..6be7507c6fd3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -198,6 +198,11 @@ struct mem_cgroup {
 	struct page_counter kmem;		/* v1 only */
 	struct page_counter tcpmem;		/* v1 only */
 
+	/* lists of memcg peak watching contexts on swap and memory */
+	struct memcg_peak_mem_ctx *peak_memory_local_watermark_watchers;
+	struct memcg_peak_mem_ctx *peak_swap_local_watermark_watchers;
+	spinlock_t pagecounter_peak_watchers_lock;
+
 	/* Range enforcement for interrupt charges */
 	struct work_struct high_work;
 
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 8cd858d912c4..047ceaece258 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -26,6 +26,7 @@ struct page_counter {
 	atomic_long_t children_low_usage;
 
 	unsigned long watermark;
+	unsigned long local_watermark; /* track min of fd-local resets */
 	unsigned long failcnt;
 
 	/* Keep all the read most fields in a separete cacheline. */
@@ -81,4 +82,9 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
 	counter->watermark = page_counter_read(counter);
 }
 
+static inline void page_counter_reset_local_watermark(struct page_counter *counter)
+{
+	counter->local_watermark = page_counter_read(counter);
+}
+
 #endif /* _LINUX_PAGE_COUNTER_H */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 520b90dd97ec..5a97ba08e976 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -81,6 +81,8 @@ struct cgroup_file_ctx {
 	struct {
 		struct cgroup_pidlist	*pidlist;
 	} procs1;
+
+	struct memcg_peak_mem_ctx peak;
 };
 
 /*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e32b6972c478..38b935ffa6cf 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1964,6 +1964,13 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
 	return -EINVAL;
 }
 
+struct memcg_peak_mem_ctx *memcg_extract_peak_mem_ctx(struct kernfs_open_file *of)
+{
+	struct cgroup_file_ctx *ctx = of->priv;
+
+	return &ctx->peak;
+}
+
 static void apply_cgroup_root_flags(unsigned int root_flags)
 {
 	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8f2f1bb18c9c..eb6614236371 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,6 +25,7 @@
  * Copyright (C) 2020 Alibaba, Inc, Alex Shi
  */
 
+#include <linux/cgroup-defs.h>
 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
@@ -5745,6 +5746,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
 	vmpressure_init(&memcg->vmpressure);
 	INIT_LIST_HEAD(&memcg->event_list);
 	spin_lock_init(&memcg->event_list_lock);
+	spin_lock_init(&memcg->pagecounter_peak_watchers_lock);
 	memcg->socket_pressure = jiffies;
 #ifdef CONFIG_MEMCG_KMEM
 	memcg->kmemcg_id = -1;
@@ -6907,12 +6909,130 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
 	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
 }
 
-static u64 memory_peak_read(struct cgroup_subsys_state *css,
-			    struct cftype *cft)
+static struct page_counter *memcg_memory_extract_page_counter(struct mem_cgroup *memcg)
 {
+	return &memcg->memory;
+}
+
+static struct memcg_peak_mem_ctx **memcg_memory_extract_peak_watchers(struct mem_cgroup *memcg)
+{
+	return &memcg->peak_memory_local_watermark_watchers;
+}
+
+inline int swap_memory_peak_show(
+	struct seq_file *sf, void *v,
+	struct page_counter *(*extract_pc)(struct mem_cgroup *memcg))
+{
+	struct cgroup_subsys_state *css = seq_css(sf);
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	struct page_counter *pc = extract_pc(memcg);
+
+	struct kernfs_open_file *of = sf->private;
+	struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
+	s64 fd_peak = ctx->local_watermark;
+
+	if (fd_peak == -1) {
+		seq_printf(sf, "%llu\n", (u64)pc->watermark * PAGE_SIZE);
+		return 0;
+	}
+
+	s64 pc_peak = pc->local_watermark;
+	s64 wm = fd_peak > pc_peak ? fd_peak : pc_peak;
+
+	seq_printf(sf, "%lld\n", wm * PAGE_SIZE);
+	return 0;
+}
+
+static int memory_peak_show(struct seq_file *sf, void *v)
+{
+	return swap_memory_peak_show(sf, v, memcg_memory_extract_page_counter);
+}
+
+static int swap_memory_peak_open(struct kernfs_open_file *of)
+{
+	struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
+
+	ctx->local_watermark = -1;
+	return 0;
+}
+
+inline void swap_memory_peak_release(
+	struct kernfs_open_file *of,
+	struct memcg_peak_mem_ctx **(*extract_watchers)(struct mem_cgroup *memcg))
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
+
+	if (ctx->local_watermark == -1) {
+		/* fast path (no writes on this fd)*/
+		return;
+	}
+	spin_lock(&memcg->pagecounter_peak_watchers_lock);
+	if (ctx->next) {
+		ctx->next->prev = ctx->prev;
+	}
+	if (ctx->prev) {
+		ctx->prev->next = ctx->next;
+	} else {
+		struct memcg_peak_mem_ctx **watchers = extract_watchers(memcg);
+
+		*watchers = ctx->next;
+	}
+	spin_unlock(&memcg->pagecounter_peak_watchers_lock);
+}
 
-	return (u64)memcg->memory.watermark * PAGE_SIZE;
+static void memory_peak_release(struct kernfs_open_file *of)
+{
+	swap_memory_peak_release(of, memcg_memory_extract_peak_watchers);
+}
+
+inline ssize_t swap_memory_peak_write(
+	struct kernfs_open_file *of,
+	char *buf, size_t nbytes, loff_t off,
+	struct page_counter* (*extract_pc)(struct mem_cgroup *memcg),
+	struct memcg_peak_mem_ctx **(*extract_watchers)(struct mem_cgroup *memcg))
+{
+	buf = strstrip(buf);
+	/* Only allow "fd_local_reset" to keep the API clear */
+	if (strcmp(buf, "fd_local_reset"))
+		return -EINVAL;
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
+
+	spin_lock(&memcg->pagecounter_peak_watchers_lock);
+
+	struct page_counter *pc = extract_pc(memcg);
+
+	page_counter_reset_local_watermark(pc);
+	const unsigned long cur = pc->local_watermark;
+	struct memcg_peak_mem_ctx **watchers = extract_watchers(memcg);
+	struct memcg_peak_mem_ctx *peer_ctx;
+
+	for (peer_ctx = *watchers; peer_ctx; peer_ctx = peer_ctx->next) {
+		if (cur > peer_ctx->local_watermark)
+			peer_ctx->local_watermark = cur;
+	}
+	if (ctx->local_watermark == -1) {
+		/* only append to the list if we're not already there */
+		if (peer_ctx) {
+			ctx->prev = peer_ctx;
+			peer_ctx->next = ctx;
+		} else {
+			*watchers = ctx;
+		}
+	}
+	ctx->local_watermark = cur;
+	spin_unlock(&memcg->pagecounter_peak_watchers_lock);
+
+	return nbytes;
+}
+
+static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf,
+				 size_t nbytes, loff_t off)
+{
+	return swap_memory_peak_write(of, buf, nbytes, off,
+				      memcg_memory_extract_page_counter,
+				      memcg_memory_extract_peak_watchers);
 }
 
 static int memory_min_show(struct seq_file *m, void *v)
@@ -7231,7 +7351,10 @@ static struct cftype memory_files[] = {
 	{
 		.name = "peak",
 		.flags = CFTYPE_NOT_ON_ROOT,
-		.read_u64 = memory_peak_read,
+		.open = swap_memory_peak_open,
+		.release = memory_peak_release,
+		.seq_show = memory_peak_show,
+		.write = memory_peak_write,
 	},
 	{
 		.name = "min",
@@ -8193,14 +8316,35 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
 	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
 }
 
-static u64 swap_peak_read(struct cgroup_subsys_state *css,
-			  struct cftype *cft)
+
+static struct page_counter *memcg_swap_extract_page_counter(struct mem_cgroup *memcg)
+{
+	return &memcg->swap;
+}
+
+static struct memcg_peak_mem_ctx **memcg_swap_extract_peak_watchers(struct mem_cgroup *memcg)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	return &memcg->peak_swap_local_watermark_watchers;
+}
+
+static int swap_peak_show(struct seq_file *sf, void *v)
+{
+	return swap_memory_peak_show(sf, v, memcg_swap_extract_page_counter);
+}
 
-	return (u64)memcg->swap.watermark * PAGE_SIZE;
+static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf,
+			       size_t nbytes, loff_t off)
+{
+	return swap_memory_peak_write(of, buf, nbytes, off,
+				      memcg_swap_extract_page_counter,
+				      memcg_swap_extract_peak_watchers);
+}
+static void swap_peak_release(struct kernfs_open_file *of)
+{
+	swap_memory_peak_release(of, memcg_swap_extract_peak_watchers);
 }
 
+
 static int swap_high_show(struct seq_file *m, void *v)
 {
 	return seq_puts_memcg_tunable(m,
@@ -8282,7 +8426,10 @@ static struct cftype swap_files[] = {
 	{
 		.name = "swap.peak",
 		.flags = CFTYPE_NOT_ON_ROOT,
-		.read_u64 = swap_peak_read,
+		.open = swap_memory_peak_open,
+		.release = swap_peak_release,
+		.seq_show = swap_peak_show,
+		.write = swap_peak_write,
 	},
 	{
 		.name = "swap.events",
diff --git a/mm/page_counter.c b/mm/page_counter.c
index db20d6452b71..40d5f4990218 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -82,6 +82,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
 		 */
 		if (new > READ_ONCE(c->watermark))
 			WRITE_ONCE(c->watermark, new);
+		if (new > READ_ONCE(c->local_watermark))
+			WRITE_ONCE(c->local_watermark, new);
 	}
 }
 
@@ -135,6 +137,8 @@ bool page_counter_try_charge(struct page_counter *counter,
 		 */
 		if (new > READ_ONCE(c->watermark))
 			WRITE_ONCE(c->watermark, new);
+		if (new > READ_ONCE(c->local_watermark))
+			WRITE_ONCE(c->local_watermark, new);
 	}
 	return true;
 
diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c
index 432db923bced..1e2d46636a0c 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/cgroup_util.c
@@ -141,6 +141,16 @@ long cg_read_long(const char *cgroup, const char *control)
 	return atol(buf);
 }
 
+long cg_read_long_fd(int fd)
+{
+	char buf[128];
+
+	if (pread(fd, buf, sizeof(buf), 0) <= 0)
+		return -1;
+
+	return atol(buf);
+}
+
 long cg_read_key_long(const char *cgroup, const char *control, const char *key)
 {
 	char buf[PAGE_SIZE];
@@ -183,6 +193,18 @@ int cg_write(const char *cgroup, const char *control, char *buf)
 	return ret == len ? 0 : ret;
 }
 
+/*
+ * Returns fd on success, or -1 on failure.
+ * (fd should be closed with close() as usual)
+ */
+int cg_open(const char *cgroup, const char *control, int flags)
+{
+	char path[PATH_MAX];
+
+	snprintf(path, sizeof(path), "%s/%s", cgroup, control);
+	return open(path, flags);
+}
+
 int cg_write_numeric(const char *cgroup, const char *control, long value)
 {
 	char buf[64];
diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h
index e8d04ac9e3d2..19b131ee7707 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/cgroup_util.h
@@ -34,9 +34,11 @@ extern int cg_read_strcmp(const char *cgroup, const char *control,
 extern int cg_read_strstr(const char *cgroup, const char *control,
 			  const char *needle);
 extern long cg_read_long(const char *cgroup, const char *control);
+extern long cg_read_long_fd(int fd);
 long cg_read_key_long(const char *cgroup, const char *control, const char *key);
 extern long cg_read_lc(const char *cgroup, const char *control);
 extern int cg_write(const char *cgroup, const char *control, char *buf);
+extern int cg_open(const char *cgroup, const char *control, int flags);
 int cg_write_numeric(const char *cgroup, const char *control, long value);
 extern int cg_run(const char *cgroup,
 		  int (*fn)(const char *cgroup, void *arg),
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index 41ae8047b889..5641c1fd6dae 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -161,12 +161,12 @@ static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
 /*
  * This test create a memory cgroup, allocates
  * some anonymous memory and some pagecache
- * and check memory.current and some memory.stat values.
+ * and checks memory.current, memory.peak, and some memory.stat values.
  */
-static int test_memcg_current(const char *root)
+static int test_memcg_current_peak(const char *root)
 {
 	int ret = KSFT_FAIL;
-	long current;
+	long current, peak, peak_reset;
 	char *memcg;
 
 	memcg = cg_name(root, "memcg_test");
@@ -180,15 +180,109 @@ static int test_memcg_current(const char *root)
 	if (current != 0)
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak != 0)
+		goto cleanup;
+
 	if (cg_run(memcg, alloc_anon_50M_check, NULL))
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(50))
+		goto cleanup;
+
+	/*
+	 * We'll open a few FDs for the same memory.peak file to exercise the free-path
+	 * We need at least three to be closed in a different order than writes occurred to test
+	 * the linked-list handling.
+	 */
+	int peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
+
+	if (peak_fd == -1)
+		goto cleanup;
+
+	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
+	int peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
+
+	if (peak_fd2 == -1)
+		goto cleanup;
+
+	int peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
+
+	if (peak_fd3 == -1)
+		goto cleanup;
+
+	static const char reset_string[] = "fd_local_reset\n";
+
+	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
+	if (peak_reset != sizeof(reset_string))
+		goto cleanup;
+
+	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
+	if (peak_reset != sizeof(reset_string))
+		goto cleanup;
+
+	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
+	if (peak_reset != sizeof(reset_string))
+		goto cleanup;
+
+	/* Make sure a completely independent read isn't affected by our  FD-local reset above*/
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(50))
+		goto cleanup;
+
+	fd2_closed = true;
+	if (close(peak_fd2))
+		goto cleanup;
+
+	int peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
+
+	if (peak_fd4 == -1)
+		goto cleanup;
+
+	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
+	if (peak_reset != sizeof(reset_string))
+		goto cleanup;
+
+	peak = cg_read_long_fd(peak_fd);
+	if (peak > MB(30) || peak < 0)
+		goto cleanup;
+
 	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(50))
+		goto cleanup;
+
+	/* Make sure everything is back to normal */
+	peak = cg_read_long_fd(peak_fd);
+	if (peak < MB(50))
+		goto cleanup;
+
+	peak = cg_read_long_fd(peak_fd4);
+	if (peak < MB(50))
+		goto cleanup;
+
+	fd3_closed = true;
+	if (close(peak_fd3))
+		goto cleanup;
+
+	fd4_closed = true;
+	if (close(peak_fd4))
+		goto cleanup;
+
+
 	ret = KSFT_PASS;
 
 cleanup:
+	close(peak_fd);
+	if (!fd2_closed)
+		close(peak_fd2);
+	if (!fd3_closed)
+		close(peak_fd3);
+	if (!fd4_closed)
+		close(peak_fd4);
 	cg_destroy(memcg);
 	free(memcg);
 
@@ -817,13 +911,16 @@ static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
 
 /*
  * This test checks that memory.swap.max limits the amount of
- * anonymous memory which can be swapped out.
+ * anonymous memory which can be swapped out. Additionally, it verifies that
+ * memory.swap.peak reflects the high watermark and can be reset.
  */
-static int test_memcg_swap_max(const char *root)
+static int test_memcg_swap_max_peak(const char *root)
 {
 	int ret = KSFT_FAIL;
 	char *memcg;
-	long max;
+	long max, peak;
+
+	static const char reset_string[] = "fd_local_reset\n";
 
 	if (!is_swap_enabled())
 		return KSFT_SKIP;
@@ -840,6 +937,45 @@ static int test_memcg_swap_max(const char *root)
 		goto cleanup;
 	}
 
+	int swap_peak_fd = cg_open(memcg, "memory.swap.peak",
+				   O_RDWR | O_APPEND | O_CLOEXEC);
+
+	if (swap_peak_fd == -1)
+		goto cleanup;
+
+	int mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);
+
+	if (mem_peak_fd == -1)
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.swap.peak"))
+		goto cleanup;
+
+	if (cg_read_long_fd(swap_peak_fd))
+		goto cleanup;
+
+	/* switch the swap and mem fds into local-peak tracking mode*/
+	int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
+
+	if (peak_reset != sizeof(reset_string))
+		goto cleanup;
+
+	if (cg_read_long_fd(swap_peak_fd))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.peak"))
+		goto cleanup;
+
+	if (cg_read_long_fd(mem_peak_fd))
+		goto cleanup;
+
+	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
+	if (peak_reset != sizeof(reset_string))
+		goto cleanup;
+
+	if (cg_read_long_fd(mem_peak_fd))
+		goto cleanup;
+
 	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
 		goto cleanup;
 
@@ -862,6 +998,61 @@ static int test_memcg_swap_max(const char *root)
 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	peak = cg_read_long(memcg, "memory.swap.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	peak = cg_read_long_fd(mem_peak_fd);
+	if (peak < MB(29))
+		goto cleanup;
+
+	peak = cg_read_long_fd(swap_peak_fd);
+	if (peak < MB(29))
+		goto cleanup;
+
+	/*
+	 * open, reset and close the peak swap on another FD to make sure
+	 * multiple extant fds don't corrupt the linked-list
+	 */
+	peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
+	if (peak_reset)
+		goto cleanup;
+
+	peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
+	if (peak_reset)
+		goto cleanup;
+
+	/* actually reset on the fds */
+	peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
+	if (peak_reset != sizeof(reset_string))
+		goto cleanup;
+
+	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
+	if (peak_reset != sizeof(reset_string))
+		goto cleanup;
+
+	peak = cg_read_long_fd(swap_peak_fd);
+	if (peak > MB(10))
+		goto cleanup;
+
+	/*
+	 * The cgroup is now empty, but there may be a page or two associated
+	 * with the open FD accounted to it.
+	 */
+	peak = cg_read_long_fd(mem_peak_fd);
+	if (peak > MB(1))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.peak") < MB(29))
+		goto cleanup;
+
+	if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
+		goto cleanup;
+
 	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
 		goto cleanup;
 
@@ -869,9 +1060,29 @@ static int test_memcg_swap_max(const char *root)
 	if (max <= 0)
 		goto cleanup;
 
+	peak = cg_read_long(memcg, "memory.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	peak = cg_read_long(memcg, "memory.swap.peak");
+	if (peak < MB(29))
+		goto cleanup;
+
+	peak = cg_read_long_fd(mem_peak_fd);
+	if (peak < MB(29))
+		goto cleanup;
+
+	peak = cg_read_long_fd(swap_peak_fd);
+	if (peak < MB(19))
+		goto cleanup;
+
 	ret = KSFT_PASS;
 
 cleanup:
+	if (close(mem_peak_fd))
+		ret = KSFT_FAIL;
+	if (close(swap_peak_fd))
+		ret = KSFT_FAIL;
 	cg_destroy(memcg);
 	free(memcg);
 
@@ -1295,7 +1506,7 @@ struct memcg_test {
 	const char *name;
 } tests[] = {
 	T(test_memcg_subtree_control),
-	T(test_memcg_current),
+	T(test_memcg_current_peak),
 	T(test_memcg_min),
 	T(test_memcg_low),
 	T(test_memcg_high),
@@ -1303,7 +1514,7 @@ struct memcg_test {
 	T(test_memcg_max),
 	T(test_memcg_reclaim),
 	T(test_memcg_oom_events),
-	T(test_memcg_swap_max),
+	T(test_memcg_swap_max_peak),
 	T(test_memcg_sock),
 	T(test_memcg_oom_group_leaf_events),
 	T(test_memcg_oom_group_parent_events),
-- 
2.40.1



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-22 15:17 ` [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers David Finkel
@ 2024-07-22 18:22   ` Roman Gushchin
  2024-07-22 19:30     ` David Finkel
  0 siblings, 1 reply; 47+ messages in thread
From: Roman Gushchin @ 2024-07-22 18:22 UTC (permalink / raw)
  To: David Finkel
  Cc: Muchun Song, Tejun Heo, Andrew Morton, core-services,
	Jonathan Corbet, Michal Hocko, Shakeel Butt, Shuah Khan,
	Johannes Weiner, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

On Mon, Jul 22, 2024 at 11:17:13AM -0400, David Finkel wrote:
> Other mechanisms for querying the peak memory usage of either a process
> or v1 memory cgroup allow for resetting the high watermark. Restore
> parity with those mechanisms, but with a less racy API.
> 
> For example:
>  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
>    the high watermark.
>  - writing "5" to the clear_refs pseudo-file in a processes's proc
>    directory resets the peak RSS.
> 
> This change is an evolution of a previous patch, which mostly copied the
> cgroup v1 behavior, however, there were concerns about races/ownership
> issues with a global reset, so instead this change makes the reset
> filedescriptor-local.
> 
> Writing a specific string to the memory.peak and memory.swap.peak
> pseudo-files reset the high watermark to the current usage for
> subsequent reads through that same fd.
> 
> Notably, following Johannes's suggestion, this implementation moves the
> O(fds that have written) behavior onto the fd write(2) path. Instead, on
> the page-allocation path, we simply add one additional watermark to
> conditionally bump per-hierarchy level in the page-counter.
> 
> This behavior is particularly useful for work scheduling systems that
> need to track memory usage of worker processes/cgroups per-work-item.
> Since memory can't be squeezed like CPU can (the OOM-killer has
> opinions), these systems need to track the peak memory usage to compute
> system/container fullness when binpacking workitems.
> 
> Most notably, Vimeo's use-case involves a system that's doing global
> binpacking across many Kubernetes pods/containers, and while we can use
> PSI for some local decisions about overload, we strive to avoid packing
> workloads too tightly in the first place. To facilitate this, we track
> the peak memory usage. However, since we run with long-lived workers (to
> amortize startup costs) we need a way to track the high watermark while
> a work-item is executing. Polling runs the risk of missing short spikes
> that last for timescales below the polling interval, and peak memory
> tracking at the cgroup level is otherwise perfect for this use-case.
> 
> As this data is used to ensure that binpacked work ends up with
> sufficient headroom, this use-case mostly avoids the inaccuracies
> surrounding reclaimable memory.
> 
> Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: David Finkel <davidf@vimeo.com>
> ---
>  Documentation/admin-guide/cgroup-v2.rst       |  26 +-
>  include/linux/cgroup.h                        |   7 +
>  include/linux/memcontrol.h                    |   5 +
>  include/linux/page_counter.h                  |   6 +
>  kernel/cgroup/cgroup-internal.h               |   2 +
>  kernel/cgroup/cgroup.c                        |   7 +
>  mm/memcontrol.c                               | 165 ++++++++++++-
>  mm/page_counter.c                             |   4 +
>  tools/testing/selftests/cgroup/cgroup_util.c  |  22 ++
>  tools/testing/selftests/cgroup/cgroup_util.h  |   2 +
>  .../selftests/cgroup/test_memcontrol.c        | 227 +++++++++++++++++-
>  11 files changed, 448 insertions(+), 25 deletions(-)
> 
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 8fbb0519d556..10a2f919128f 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1322,11 +1322,16 @@ PAGE_SIZE multiple when read back.
>  	reclaim induced by memory.reclaim.
>  
>    memory.peak
> -	A read-only single value file which exists on non-root
> -	cgroups.
> +	A read-write single value file which exists on non-root cgroups.
> +
> +	The max memory usage recorded for the cgroup and its descendants since
> +	either the creation of the cgroup or the most recent reset for that fd.
>  
> -	The max memory usage recorded for the cgroup and its
> -	descendants since the creation of the cgroup.
> +	A write of the string "fd_local_reset" to this file resets it to the
> +	current memory usage for subsequent reads through the same

Hi David!

Not a very strong preference, but with the current design, do we really expect
to have a non-local reset? If not, can we agree on a "reset" string instead
for a sake of simplicity?

> +	file descriptor.
> +	Attempts to write any other string will return EINVAL
> +	(modulo leading and trailing whitespace).
>  
>    memory.oom.group
>  	A read-write single value file which exists on non-root
> @@ -1652,11 +1657,16 @@ PAGE_SIZE multiple when read back.
>  	Healthy workloads are not expected to reach this limit.
>  
>    memory.swap.peak
> -	A read-only single value file which exists on non-root
> -	cgroups.
> +	A read-write single value file which exists on non-root cgroups.
> +
> +	The max swap usage recorded for the cgroup and its descendants since
> +	the creation of the cgroup or the most recent reset for that fd.
>  
> -	The max swap usage recorded for the cgroup and its
> -	descendants since the creation of the cgroup.
> +	A write of the string "fd_local_reset" to this file resets it to the
> +	current memory usage for subsequent reads through the same
> +	file descriptor.
> +	Attempts to write any other string will return EINVAL
> +	(modulo leading and trailing whitespace).
>  
>    memory.swap.max
>  	A read-write single value file which exists on non-root
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 2150ca60394b..9bda441227ea 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -855,4 +855,11 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
>  
>  struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
>  
> +struct memcg_peak_mem_ctx {
> +	long				local_watermark;
> +	struct memcg_peak_mem_ctx	*next, *prev;

Please, take a look at include/linux/list.h and use it instead of
re-implementing list operations from scratch.

> +};
> +
> +struct memcg_peak_mem_ctx *memcg_extract_peak_mem_ctx(struct kernfs_open_file *of);
> +
>  #endif /* _LINUX_CGROUP_H */
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 030d34e9d117..6be7507c6fd3 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -198,6 +198,11 @@ struct mem_cgroup {
>  	struct page_counter kmem;		/* v1 only */
>  	struct page_counter tcpmem;		/* v1 only */
>  
> +	/* lists of memcg peak watching contexts on swap and memory */
> +	struct memcg_peak_mem_ctx *peak_memory_local_watermark_watchers;
> +	struct memcg_peak_mem_ctx *peak_swap_local_watermark_watchers;
> +	spinlock_t pagecounter_peak_watchers_lock;
> +
>  	/* Range enforcement for interrupt charges */
>  	struct work_struct high_work;
>  
> diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
> index 8cd858d912c4..047ceaece258 100644
> --- a/include/linux/page_counter.h
> +++ b/include/linux/page_counter.h
> @@ -26,6 +26,7 @@ struct page_counter {
>  	atomic_long_t children_low_usage;
>  
>  	unsigned long watermark;
> +	unsigned long local_watermark; /* track min of fd-local resets */
>  	unsigned long failcnt;
>  
>  	/* Keep all the read most fields in a separete cacheline. */
> @@ -81,4 +82,9 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
>  	counter->watermark = page_counter_read(counter);
>  }
>  
> +static inline void page_counter_reset_local_watermark(struct page_counter *counter)
> +{
> +	counter->local_watermark = page_counter_read(counter);
> +}
> +
>  #endif /* _LINUX_PAGE_COUNTER_H */
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index 520b90dd97ec..5a97ba08e976 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -81,6 +81,8 @@ struct cgroup_file_ctx {
>  	struct {
>  		struct cgroup_pidlist	*pidlist;
>  	} procs1;
> +
> +	struct memcg_peak_mem_ctx peak;
>  };
>  
>  /*
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index e32b6972c478..38b935ffa6cf 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1964,6 +1964,13 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
>  	return -EINVAL;
>  }
>  
> +struct memcg_peak_mem_ctx *memcg_extract_peak_mem_ctx(struct kernfs_open_file *of)
> +{
> +	struct cgroup_file_ctx *ctx = of->priv;
> +
> +	return &ctx->peak;
> +}
> +
>  static void apply_cgroup_root_flags(unsigned int root_flags)
>  {
>  	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 8f2f1bb18c9c..eb6614236371 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -25,6 +25,7 @@
>   * Copyright (C) 2020 Alibaba, Inc, Alex Shi
>   */
>  
> +#include <linux/cgroup-defs.h>
>  #include <linux/page_counter.h>
>  #include <linux/memcontrol.h>
>  #include <linux/cgroup.h>
> @@ -5745,6 +5746,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
>  	vmpressure_init(&memcg->vmpressure);
>  	INIT_LIST_HEAD(&memcg->event_list);
>  	spin_lock_init(&memcg->event_list_lock);
> +	spin_lock_init(&memcg->pagecounter_peak_watchers_lock);
>  	memcg->socket_pressure = jiffies;
>  #ifdef CONFIG_MEMCG_KMEM
>  	memcg->kmemcg_id = -1;
> @@ -6907,12 +6909,130 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
>  	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
>  }
>  
> -static u64 memory_peak_read(struct cgroup_subsys_state *css,
> -			    struct cftype *cft)
> +static struct page_counter *memcg_memory_extract_page_counter(struct mem_cgroup *memcg)
>  {
> +	return &memcg->memory;
> +}
> +
> +static struct memcg_peak_mem_ctx **memcg_memory_extract_peak_watchers(struct mem_cgroup *memcg)
> +{
> +	return &memcg->peak_memory_local_watermark_watchers;
> +}
> +
> +inline int swap_memory_peak_show(
> +	struct seq_file *sf, void *v,
> +	struct page_counter *(*extract_pc)(struct mem_cgroup *memcg))
> +{
> +	struct cgroup_subsys_state *css = seq_css(sf);
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> +	struct page_counter *pc = extract_pc(memcg);
> +
> +	struct kernfs_open_file *of = sf->private;
> +	struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
> +	s64 fd_peak = ctx->local_watermark;
> +
> +	if (fd_peak == -1) {
> +		seq_printf(sf, "%llu\n", (u64)pc->watermark * PAGE_SIZE);
> +		return 0;
> +	}
> +
> +	s64 pc_peak = pc->local_watermark;
> +	s64 wm = fd_peak > pc_peak ? fd_peak : pc_peak;
> +
> +	seq_printf(sf, "%lld\n", wm * PAGE_SIZE);
> +	return 0;
> +}
> +
> +static int memory_peak_show(struct seq_file *sf, void *v)
> +{
> +	return swap_memory_peak_show(sf, v, memcg_memory_extract_page_counter);

I think it's really too complex. Why not pass a single boolean argument
which will define to use memory page_counter or swap page_counter?
It will eliminate a need to pass pointers to functions and also eliminate
a need for introducing these helper functions in general.

> +}
> +
> +static int swap_memory_peak_open(struct kernfs_open_file *of)
> +{
> +	struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
> +
> +	ctx->local_watermark = -1;
> +	return 0;
> +}
> +
> +inline void swap_memory_peak_release(
> +	struct kernfs_open_file *of,
> +	struct memcg_peak_mem_ctx **(*extract_watchers)(struct mem_cgroup *memcg))
> +{
> +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> +	struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
> +
> +	if (ctx->local_watermark == -1) {
> +		/* fast path (no writes on this fd)*/
> +		return;
> +	}
> +	spin_lock(&memcg->pagecounter_peak_watchers_lock);
> +	if (ctx->next) {
> +		ctx->next->prev = ctx->prev;
> +	}
> +	if (ctx->prev) {
> +		ctx->prev->next = ctx->next;
> +	} else {
> +		struct memcg_peak_mem_ctx **watchers = extract_watchers(memcg);
> +
> +		*watchers = ctx->next;
> +	}
> +	spin_unlock(&memcg->pagecounter_peak_watchers_lock);
> +}
>  
> -	return (u64)memcg->memory.watermark * PAGE_SIZE;
> +static void memory_peak_release(struct kernfs_open_file *of)
> +{
> +	swap_memory_peak_release(of, memcg_memory_extract_peak_watchers);
> +}
> +
> +inline ssize_t swap_memory_peak_write(
> +	struct kernfs_open_file *of,
> +	char *buf, size_t nbytes, loff_t off,
> +	struct page_counter* (*extract_pc)(struct mem_cgroup *memcg),
> +	struct memcg_peak_mem_ctx **(*extract_watchers)(struct mem_cgroup *memcg))
> +{
> +	buf = strstrip(buf);
> +	/* Only allow "fd_local_reset" to keep the API clear */
> +	if (strcmp(buf, "fd_local_reset"))
> +		return -EINVAL;
> +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> +	struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);

Please, don't mix variable definitions and code. Also, please, use
scripts/checkpatch.pl for checking the code before submission. I guess
it will raise several issues in this patch.

> +
> +	spin_lock(&memcg->pagecounter_peak_watchers_lock);
> +
> +	struct page_counter *pc = extract_pc(memcg);
> +
> +	page_counter_reset_local_watermark(pc);
> +	const unsigned long cur = pc->local_watermark;
> +	struct memcg_peak_mem_ctx **watchers = extract_watchers(memcg);
> +	struct memcg_peak_mem_ctx *peer_ctx;
> +
> +	for (peer_ctx = *watchers; peer_ctx; peer_ctx = peer_ctx->next) {
> +		if (cur > peer_ctx->local_watermark)
> +			peer_ctx->local_watermark = cur;
> +	}
> +	if (ctx->local_watermark == -1) {
> +		/* only append to the list if we're not already there */
> +		if (peer_ctx) {
> +			ctx->prev = peer_ctx;
> +			peer_ctx->next = ctx;
> +		} else {
> +			*watchers = ctx;
> +		}
> +	}
> +	ctx->local_watermark = cur;
> +	spin_unlock(&memcg->pagecounter_peak_watchers_lock);
> +
> +	return nbytes;
> +}
> +
> +static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf,
> +				 size_t nbytes, loff_t off)
> +{
> +	return swap_memory_peak_write(of, buf, nbytes, off,
> +				      memcg_memory_extract_page_counter,
> +				      memcg_memory_extract_peak_watchers);
>  }
>  
>  static int memory_min_show(struct seq_file *m, void *v)
> @@ -7231,7 +7351,10 @@ static struct cftype memory_files[] = {
>  	{
>  		.name = "peak",
>  		.flags = CFTYPE_NOT_ON_ROOT,
> -		.read_u64 = memory_peak_read,
> +		.open = swap_memory_peak_open,
> +		.release = memory_peak_release,
> +		.seq_show = memory_peak_show,
> +		.write = memory_peak_write,
>  	},
>  	{
>  		.name = "min",
> @@ -8193,14 +8316,35 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
>  	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
>  }
>  
> -static u64 swap_peak_read(struct cgroup_subsys_state *css,
> -			  struct cftype *cft)
> +
> +static struct page_counter *memcg_swap_extract_page_counter(struct mem_cgroup *memcg)
> +{
> +	return &memcg->swap;
> +}
> +
> +static struct memcg_peak_mem_ctx **memcg_swap_extract_peak_watchers(struct mem_cgroup *memcg)
>  {
> -	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> +	return &memcg->peak_swap_local_watermark_watchers;
> +}
> +
> +static int swap_peak_show(struct seq_file *sf, void *v)
> +{
> +	return swap_memory_peak_show(sf, v, memcg_swap_extract_page_counter);
> +}
>  
> -	return (u64)memcg->swap.watermark * PAGE_SIZE;
> +static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf,
> +			       size_t nbytes, loff_t off)
> +{
> +	return swap_memory_peak_write(of, buf, nbytes, off,
> +				      memcg_swap_extract_page_counter,
> +				      memcg_swap_extract_peak_watchers);
> +}
> +static void swap_peak_release(struct kernfs_open_file *of)
> +{
> +	swap_memory_peak_release(of, memcg_swap_extract_peak_watchers);
>  }
>  
> +
>  static int swap_high_show(struct seq_file *m, void *v)
>  {
>  	return seq_puts_memcg_tunable(m,
> @@ -8282,7 +8426,10 @@ static struct cftype swap_files[] = {
>  	{
>  		.name = "swap.peak",
>  		.flags = CFTYPE_NOT_ON_ROOT,
> -		.read_u64 = swap_peak_read,
> +		.open = swap_memory_peak_open,
> +		.release = swap_peak_release,
> +		.seq_show = swap_peak_show,
> +		.write = swap_peak_write,
>  	},
>  	{
>  		.name = "swap.events",
> diff --git a/mm/page_counter.c b/mm/page_counter.c
> index db20d6452b71..40d5f4990218 100644
> --- a/mm/page_counter.c
> +++ b/mm/page_counter.c
> @@ -82,6 +82,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
>  		 */
>  		if (new > READ_ONCE(c->watermark))
>  			WRITE_ONCE(c->watermark, new);
> +		if (new > READ_ONCE(c->local_watermark))
> +			WRITE_ONCE(c->local_watermark, new);

Hm, can't we have a single comparison on the hot path?
Also, we read and write c->local_watermark speculatively here, Idk if it's still
acceptable with an ability to reset watermarks "locally". Maybe it is, but
it definitely deserves at least a comment with an explanation.

And btw thank you for including tests into the commit, it's really great to see.
I'd suggest you to extract them into a separate commit and post it as a series.

Thank you!


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-22 18:22   ` Roman Gushchin
@ 2024-07-22 19:30     ` David Finkel
  2024-07-22 19:47       ` Waiman Long
  0 siblings, 1 reply; 47+ messages in thread
From: David Finkel @ 2024-07-22 19:30 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: Muchun Song, Tejun Heo, Andrew Morton, core-services,
	Jonathan Corbet, Michal Hocko, Shakeel Butt, Shuah Khan,
	Johannes Weiner, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

Roman,

Thanks for the review.

On Mon, Jul 22, 2024 at 2:22 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> On Mon, Jul 22, 2024 at 11:17:13AM -0400, David Finkel wrote:
> > Other mechanisms for querying the peak memory usage of either a process
> > or v1 memory cgroup allow for resetting the high watermark. Restore
> > parity with those mechanisms, but with a less racy API.
> >
> > For example:
> >  - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets
> >    the high watermark.
> >  - writing "5" to the clear_refs pseudo-file in a processes's proc
> >    directory resets the peak RSS.
> >
> > This change is an evolution of a previous patch, which mostly copied the
> > cgroup v1 behavior, however, there were concerns about races/ownership
> > issues with a global reset, so instead this change makes the reset
> > filedescriptor-local.
> >
> > Writing a specific string to the memory.peak and memory.swap.peak
> > pseudo-files reset the high watermark to the current usage for
> > subsequent reads through that same fd.
> >
> > Notably, following Johannes's suggestion, this implementation moves the
> > O(fds that have written) behavior onto the fd write(2) path. Instead, on
> > the page-allocation path, we simply add one additional watermark to
> > conditionally bump per-hierarchy level in the page-counter.
> >
> > This behavior is particularly useful for work scheduling systems that
> > need to track memory usage of worker processes/cgroups per-work-item.
> > Since memory can't be squeezed like CPU can (the OOM-killer has
> > opinions), these systems need to track the peak memory usage to compute
> > system/container fullness when binpacking workitems.
> >
> > Most notably, Vimeo's use-case involves a system that's doing global
> > binpacking across many Kubernetes pods/containers, and while we can use
> > PSI for some local decisions about overload, we strive to avoid packing
> > workloads too tightly in the first place. To facilitate this, we track
> > the peak memory usage. However, since we run with long-lived workers (to
> > amortize startup costs) we need a way to track the high watermark while
> > a work-item is executing. Polling runs the risk of missing short spikes
> > that last for timescales below the polling interval, and peak memory
> > tracking at the cgroup level is otherwise perfect for this use-case.
> >
> > As this data is used to ensure that binpacked work ends up with
> > sufficient headroom, this use-case mostly avoids the inaccuracies
> > surrounding reclaimable memory.
> >
> > Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
> > Signed-off-by: David Finkel <davidf@vimeo.com>
> > ---
> >  Documentation/admin-guide/cgroup-v2.rst       |  26 +-
> >  include/linux/cgroup.h                        |   7 +
> >  include/linux/memcontrol.h                    |   5 +
> >  include/linux/page_counter.h                  |   6 +
> >  kernel/cgroup/cgroup-internal.h               |   2 +
> >  kernel/cgroup/cgroup.c                        |   7 +
> >  mm/memcontrol.c                               | 165 ++++++++++++-
> >  mm/page_counter.c                             |   4 +
> >  tools/testing/selftests/cgroup/cgroup_util.c  |  22 ++
> >  tools/testing/selftests/cgroup/cgroup_util.h  |   2 +
> >  .../selftests/cgroup/test_memcontrol.c        | 227 +++++++++++++++++-
> >  11 files changed, 448 insertions(+), 25 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> > index 8fbb0519d556..10a2f919128f 100644
> > --- a/Documentation/admin-guide/cgroup-v2.rst
> > +++ b/Documentation/admin-guide/cgroup-v2.rst
> > @@ -1322,11 +1322,16 @@ PAGE_SIZE multiple when read back.
> >       reclaim induced by memory.reclaim.
> >
> >    memory.peak
> > -     A read-only single value file which exists on non-root
> > -     cgroups.
> > +     A read-write single value file which exists on non-root cgroups.
> > +
> > +     The max memory usage recorded for the cgroup and its descendants since
> > +     either the creation of the cgroup or the most recent reset for that fd.
> >
> > -     The max memory usage recorded for the cgroup and its
> > -     descendants since the creation of the cgroup.
> > +     A write of the string "fd_local_reset" to this file resets it to the
> > +     current memory usage for subsequent reads through the same
>
> Hi David!
>
> Not a very strong preference, but with the current design, do we really expect
> to have a non-local reset? If not, can we agree on a "reset" string instead
> for a sake of simplicity?

I put in "fd_local_reset" as a bit of a straw-man. I'm fine with
changing it to "reset"
since the consensus seemed to be that a global reset was problematic.

>
> > +     file descriptor.
> > +     Attempts to write any other string will return EINVAL
> > +     (modulo leading and trailing whitespace).
> >
> >    memory.oom.group
> >       A read-write single value file which exists on non-root
> > @@ -1652,11 +1657,16 @@ PAGE_SIZE multiple when read back.
> >       Healthy workloads are not expected to reach this limit.
> >
> >    memory.swap.peak
> > -     A read-only single value file which exists on non-root
> > -     cgroups.
> > +     A read-write single value file which exists on non-root cgroups.
> > +
> > +     The max swap usage recorded for the cgroup and its descendants since
> > +     the creation of the cgroup or the most recent reset for that fd.
> >
> > -     The max swap usage recorded for the cgroup and its
> > -     descendants since the creation of the cgroup.
> > +     A write of the string "fd_local_reset" to this file resets it to the
> > +     current memory usage for subsequent reads through the same
> > +     file descriptor.
> > +     Attempts to write any other string will return EINVAL
> > +     (modulo leading and trailing whitespace).
> >
> >    memory.swap.max
> >       A read-write single value file which exists on non-root
> > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> > index 2150ca60394b..9bda441227ea 100644
> > --- a/include/linux/cgroup.h
> > +++ b/include/linux/cgroup.h
> > @@ -855,4 +855,11 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
> >
> >  struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
> >
> > +struct memcg_peak_mem_ctx {
> > +     long                            local_watermark;
> > +     struct memcg_peak_mem_ctx       *next, *prev;
>
> Please, take a look at include/linux/list.h and use it instead of
> re-implementing list operations from scratch.

I clearly didn't look hard enough for that.
Thanks! I'll port this to use those helpers/macros.

>
> > +};
> > +
> > +struct memcg_peak_mem_ctx *memcg_extract_peak_mem_ctx(struct kernfs_open_file *of);
> > +
> >  #endif /* _LINUX_CGROUP_H */
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 030d34e9d117..6be7507c6fd3 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -198,6 +198,11 @@ struct mem_cgroup {
> >       struct page_counter kmem;               /* v1 only */
> >       struct page_counter tcpmem;             /* v1 only */
> >
> > +     /* lists of memcg peak watching contexts on swap and memory */
> > +     struct memcg_peak_mem_ctx *peak_memory_local_watermark_watchers;
> > +     struct memcg_peak_mem_ctx *peak_swap_local_watermark_watchers;
> > +     spinlock_t pagecounter_peak_watchers_lock;
> > +
> >       /* Range enforcement for interrupt charges */
> >       struct work_struct high_work;
> >
> > diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
> > index 8cd858d912c4..047ceaece258 100644
> > --- a/include/linux/page_counter.h
> > +++ b/include/linux/page_counter.h
> > @@ -26,6 +26,7 @@ struct page_counter {
> >       atomic_long_t children_low_usage;
> >
> >       unsigned long watermark;
> > +     unsigned long local_watermark; /* track min of fd-local resets */
> >       unsigned long failcnt;
> >
> >       /* Keep all the read most fields in a separete cacheline. */
> > @@ -81,4 +82,9 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
> >       counter->watermark = page_counter_read(counter);
> >  }
> >
> > +static inline void page_counter_reset_local_watermark(struct page_counter *counter)
> > +{
> > +     counter->local_watermark = page_counter_read(counter);
> > +}
> > +
> >  #endif /* _LINUX_PAGE_COUNTER_H */
> > diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> > index 520b90dd97ec..5a97ba08e976 100644
> > --- a/kernel/cgroup/cgroup-internal.h
> > +++ b/kernel/cgroup/cgroup-internal.h
> > @@ -81,6 +81,8 @@ struct cgroup_file_ctx {
> >       struct {
> >               struct cgroup_pidlist   *pidlist;
> >       } procs1;
> > +
> > +     struct memcg_peak_mem_ctx peak;
> >  };
> >
> >  /*
> > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> > index e32b6972c478..38b935ffa6cf 100644
> > --- a/kernel/cgroup/cgroup.c
> > +++ b/kernel/cgroup/cgroup.c
> > @@ -1964,6 +1964,13 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
> >       return -EINVAL;
> >  }
> >
> > +struct memcg_peak_mem_ctx *memcg_extract_peak_mem_ctx(struct kernfs_open_file *of)
> > +{
> > +     struct cgroup_file_ctx *ctx = of->priv;
> > +
> > +     return &ctx->peak;
> > +}
> > +
> >  static void apply_cgroup_root_flags(unsigned int root_flags)
> >  {
> >       if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 8f2f1bb18c9c..eb6614236371 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -25,6 +25,7 @@
> >   * Copyright (C) 2020 Alibaba, Inc, Alex Shi
> >   */
> >
> > +#include <linux/cgroup-defs.h>
> >  #include <linux/page_counter.h>
> >  #include <linux/memcontrol.h>
> >  #include <linux/cgroup.h>
> > @@ -5745,6 +5746,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
> >       vmpressure_init(&memcg->vmpressure);
> >       INIT_LIST_HEAD(&memcg->event_list);
> >       spin_lock_init(&memcg->event_list_lock);
> > +     spin_lock_init(&memcg->pagecounter_peak_watchers_lock);
> >       memcg->socket_pressure = jiffies;
> >  #ifdef CONFIG_MEMCG_KMEM
> >       memcg->kmemcg_id = -1;
> > @@ -6907,12 +6909,130 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
> >       return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
> >  }
> >
> > -static u64 memory_peak_read(struct cgroup_subsys_state *css,
> > -                         struct cftype *cft)
> > +static struct page_counter *memcg_memory_extract_page_counter(struct mem_cgroup *memcg)
> >  {
> > +     return &memcg->memory;
> > +}
> > +
> > +static struct memcg_peak_mem_ctx **memcg_memory_extract_peak_watchers(struct mem_cgroup *memcg)
> > +{
> > +     return &memcg->peak_memory_local_watermark_watchers;
> > +}
> > +
> > +inline int swap_memory_peak_show(
> > +     struct seq_file *sf, void *v,
> > +     struct page_counter *(*extract_pc)(struct mem_cgroup *memcg))
> > +{
> > +     struct cgroup_subsys_state *css = seq_css(sf);
> >       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> > +     struct page_counter *pc = extract_pc(memcg);
> > +
> > +     struct kernfs_open_file *of = sf->private;
> > +     struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
> > +     s64 fd_peak = ctx->local_watermark;
> > +
> > +     if (fd_peak == -1) {
> > +             seq_printf(sf, "%llu\n", (u64)pc->watermark * PAGE_SIZE);
> > +             return 0;
> > +     }
> > +
> > +     s64 pc_peak = pc->local_watermark;
> > +     s64 wm = fd_peak > pc_peak ? fd_peak : pc_peak;
> > +
> > +     seq_printf(sf, "%lld\n", wm * PAGE_SIZE);
> > +     return 0;
> > +}
> > +
> > +static int memory_peak_show(struct seq_file *sf, void *v)
> > +{
> > +     return swap_memory_peak_show(sf, v, memcg_memory_extract_page_counter);
>
> I think it's really too complex. Why not pass a single boolean argument
> which will define to use memory page_counter or swap page_counter?
> It will eliminate a need to pass pointers to functions and also eliminate
> a need for introducing these helper functions in general.

Yeah, that's definitely a cleaner option with only two alternatives.
I'll make that change.

>
> > +}
> > +
> > +static int swap_memory_peak_open(struct kernfs_open_file *of)
> > +{
> > +     struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
> > +
> > +     ctx->local_watermark = -1;
> > +     return 0;
> > +}
> > +
> > +inline void swap_memory_peak_release(
> > +     struct kernfs_open_file *of,
> > +     struct memcg_peak_mem_ctx **(*extract_watchers)(struct mem_cgroup *memcg))
> > +{
> > +     struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > +     struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
> > +
> > +     if (ctx->local_watermark == -1) {
> > +             /* fast path (no writes on this fd)*/
> > +             return;
> > +     }
> > +     spin_lock(&memcg->pagecounter_peak_watchers_lock);
> > +     if (ctx->next) {
> > +             ctx->next->prev = ctx->prev;
> > +     }
> > +     if (ctx->prev) {
> > +             ctx->prev->next = ctx->next;
> > +     } else {
> > +             struct memcg_peak_mem_ctx **watchers = extract_watchers(memcg);
> > +
> > +             *watchers = ctx->next;
> > +     }
> > +     spin_unlock(&memcg->pagecounter_peak_watchers_lock);
> > +}
> >
> > -     return (u64)memcg->memory.watermark * PAGE_SIZE;
> > +static void memory_peak_release(struct kernfs_open_file *of)
> > +{
> > +     swap_memory_peak_release(of, memcg_memory_extract_peak_watchers);
> > +}
> > +
> > +inline ssize_t swap_memory_peak_write(
> > +     struct kernfs_open_file *of,
> > +     char *buf, size_t nbytes, loff_t off,
> > +     struct page_counter* (*extract_pc)(struct mem_cgroup *memcg),
> > +     struct memcg_peak_mem_ctx **(*extract_watchers)(struct mem_cgroup *memcg))
> > +{
> > +     buf = strstrip(buf);
> > +     /* Only allow "fd_local_reset" to keep the API clear */
> > +     if (strcmp(buf, "fd_local_reset"))
> > +             return -EINVAL;
> > +     struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> > +     struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
>
> Please, don't mix variable definitions and code. Also, please, use
Ok, I'll factor that out

> scripts/checkpatch.pl for checking the code before submission. I guess
> it will raise several issues in this patch.

Thanks, I did run checkpatch.pl on this patch, but I might have made a
few changes
since my last pass. (it didn't complain about mixing cod and definitions)

I'll make another pass before mailing the next version.
>
> > +
> > +     spin_lock(&memcg->pagecounter_peak_watchers_lock);
> > +
> > +     struct page_counter *pc = extract_pc(memcg);
> > +
> > +     page_counter_reset_local_watermark(pc);
> > +     const unsigned long cur = pc->local_watermark;
> > +     struct memcg_peak_mem_ctx **watchers = extract_watchers(memcg);
> > +     struct memcg_peak_mem_ctx *peer_ctx;
> > +
> > +     for (peer_ctx = *watchers; peer_ctx; peer_ctx = peer_ctx->next) {
> > +             if (cur > peer_ctx->local_watermark)
> > +                     peer_ctx->local_watermark = cur;
> > +     }
> > +     if (ctx->local_watermark == -1) {
> > +             /* only append to the list if we're not already there */
> > +             if (peer_ctx) {
> > +                     ctx->prev = peer_ctx;
> > +                     peer_ctx->next = ctx;
> > +             } else {
> > +                     *watchers = ctx;
> > +             }
> > +     }
> > +     ctx->local_watermark = cur;
> > +     spin_unlock(&memcg->pagecounter_peak_watchers_lock);
> > +
> > +     return nbytes;
> > +}
> > +
> > +static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf,
> > +                              size_t nbytes, loff_t off)
> > +{
> > +     return swap_memory_peak_write(of, buf, nbytes, off,
> > +                                   memcg_memory_extract_page_counter,
> > +                                   memcg_memory_extract_peak_watchers);
> >  }
> >
> >  static int memory_min_show(struct seq_file *m, void *v)
> > @@ -7231,7 +7351,10 @@ static struct cftype memory_files[] = {
> >       {
> >               .name = "peak",
> >               .flags = CFTYPE_NOT_ON_ROOT,
> > -             .read_u64 = memory_peak_read,
> > +             .open = swap_memory_peak_open,
> > +             .release = memory_peak_release,
> > +             .seq_show = memory_peak_show,
> > +             .write = memory_peak_write,
> >       },
> >       {
> >               .name = "min",
> > @@ -8193,14 +8316,35 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
> >       return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
> >  }
> >
> > -static u64 swap_peak_read(struct cgroup_subsys_state *css,
> > -                       struct cftype *cft)
> > +
> > +static struct page_counter *memcg_swap_extract_page_counter(struct mem_cgroup *memcg)
> > +{
> > +     return &memcg->swap;
> > +}
> > +
> > +static struct memcg_peak_mem_ctx **memcg_swap_extract_peak_watchers(struct mem_cgroup *memcg)
> >  {
> > -     struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> > +     return &memcg->peak_swap_local_watermark_watchers;
> > +}
> > +
> > +static int swap_peak_show(struct seq_file *sf, void *v)
> > +{
> > +     return swap_memory_peak_show(sf, v, memcg_swap_extract_page_counter);
> > +}
> >
> > -     return (u64)memcg->swap.watermark * PAGE_SIZE;
> > +static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf,
> > +                            size_t nbytes, loff_t off)
> > +{
> > +     return swap_memory_peak_write(of, buf, nbytes, off,
> > +                                   memcg_swap_extract_page_counter,
> > +                                   memcg_swap_extract_peak_watchers);
> > +}
> > +static void swap_peak_release(struct kernfs_open_file *of)
> > +{
> > +     swap_memory_peak_release(of, memcg_swap_extract_peak_watchers);
> >  }
> >
> > +
> >  static int swap_high_show(struct seq_file *m, void *v)
> >  {
> >       return seq_puts_memcg_tunable(m,
> > @@ -8282,7 +8426,10 @@ static struct cftype swap_files[] = {
> >       {
> >               .name = "swap.peak",
> >               .flags = CFTYPE_NOT_ON_ROOT,
> > -             .read_u64 = swap_peak_read,
> > +             .open = swap_memory_peak_open,
> > +             .release = swap_peak_release,
> > +             .seq_show = swap_peak_show,
> > +             .write = swap_peak_write,
> >       },
> >       {
> >               .name = "swap.events",
> > diff --git a/mm/page_counter.c b/mm/page_counter.c
> > index db20d6452b71..40d5f4990218 100644
> > --- a/mm/page_counter.c
> > +++ b/mm/page_counter.c
> > @@ -82,6 +82,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
> >                */
> >               if (new > READ_ONCE(c->watermark))
> >                       WRITE_ONCE(c->watermark, new);
> > +             if (new > READ_ONCE(c->local_watermark))
> > +                     WRITE_ONCE(c->local_watermark, new);
>
> Hm, can't we have a single comparison on the hot path?
> Also, we read and write c->local_watermark speculatively here, Idk if it's still
> acceptable with an ability to reset watermarks "locally". Maybe it is, but
> it definitely deserves at least a comment with an explanation.

Unfortunately, since the two watermarks may be reset at different
times I don't think we
can consolidate.
e.g. I think that if the usage peaked, dropped down a bit and then was
going back
up again when the "local_watermark" was reset, we'll continue only
bumping local_watermark,
but we don't want to touch "watermark" until we hit that watermark again.

>
> And btw thank you for including tests into the commit, it's really great to see.
> I'd suggest you to extract them into a separate commit and post it as a series.
Sure thing!
I'll split them off into their own commit. (the tests are about half
the line-delta)

>
> Thank you!


Thanks for the prompt review!
I'll try to get another revision out later today.


-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-22 19:30     ` David Finkel
@ 2024-07-22 19:47       ` Waiman Long
  2024-07-22 23:06         ` David Finkel
  0 siblings, 1 reply; 47+ messages in thread
From: Waiman Long @ 2024-07-22 19:47 UTC (permalink / raw)
  To: David Finkel, Roman Gushchin
  Cc: Muchun Song, Tejun Heo, Andrew Morton, core-services,
	Jonathan Corbet, Michal Hocko, Shakeel Butt, Shuah Khan,
	Johannes Weiner, Zefan Li, cgroups, linux-doc, linux-mm,
	linux-kselftest

On 7/22/24 15:30, David Finkel wrote:
>>> diff --git a/mm/page_counter.c b/mm/page_counter.c
>>> index db20d6452b71..40d5f4990218 100644
>>> --- a/mm/page_counter.c
>>> +++ b/mm/page_counter.c
>>> @@ -82,6 +82,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
>>>                 */
>>>                if (new > READ_ONCE(c->watermark))
>>>                        WRITE_ONCE(c->watermark, new);
>>> +             if (new > READ_ONCE(c->local_watermark))
>>> +                     WRITE_ONCE(c->local_watermark, new);
>> Hm, can't we have a single comparison on the hot path?
>> Also, we read and write c->local_watermark speculatively here, Idk if it's still
>> acceptable with an ability to reset watermarks "locally". Maybe it is, but
>> it definitely deserves at least a comment with an explanation.
> Unfortunately, since the two watermarks may be reset at different
> times I don't think we
> can consolidate.
> e.g. I think that if the usage peaked, dropped down a bit and then was
> going back
> up again when the "local_watermark" was reset, we'll continue only
> bumping local_watermark,
> but we don't want to touch "watermark" until we hit that watermark again.
If we make page_counter_reset_watermark() reset the local_watermark as well,
we can guarantee "local_watermark <= watermark" and wrap one check inside
the other.

         if (new > READ_ONCE(c->local_watermark)) {
                 WRITE_ONCE(c->local_watermark, new);
                 if (new > READ_ONCE(c->watermark))
                         WRITE_ONCE(c->watermark, new);
         }

Cheers,
Longman



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers
  2024-07-22 19:47       ` Waiman Long
@ 2024-07-22 23:06         ` David Finkel
  0 siblings, 0 replies; 47+ messages in thread
From: David Finkel @ 2024-07-22 23:06 UTC (permalink / raw)
  To: Waiman Long
  Cc: Roman Gushchin, Muchun Song, Tejun Heo, Andrew Morton,
	core-services, Jonathan Corbet, Michal Hocko, Shakeel Butt,
	Shuah Khan, Johannes Weiner, Zefan Li, cgroups, linux-doc,
	linux-mm, linux-kselftest

On Mon, Jul 22, 2024 at 3:47 PM Waiman Long <longman@redhat.com> wrote:
>
> On 7/22/24 15:30, David Finkel wrote:
> >>> diff --git a/mm/page_counter.c b/mm/page_counter.c
> >>> index db20d6452b71..40d5f4990218 100644
> >>> --- a/mm/page_counter.c
> >>> +++ b/mm/page_counter.c
> >>> @@ -82,6 +82,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
> >>>                 */
> >>>                if (new > READ_ONCE(c->watermark))
> >>>                        WRITE_ONCE(c->watermark, new);
> >>> +             if (new > READ_ONCE(c->local_watermark))
> >>> +                     WRITE_ONCE(c->local_watermark, new);
> >> Hm, can't we have a single comparison on the hot path?
> >> Also, we read and write c->local_watermark speculatively here, Idk if it's still
> >> acceptable with an ability to reset watermarks "locally". Maybe it is, but
> >> it definitely deserves at least a comment with an explanation.
> > Unfortunately, since the two watermarks may be reset at different
> > times I don't think we
> > can consolidate.
> > e.g. I think that if the usage peaked, dropped down a bit and then was
> > going back
> > up again when the "local_watermark" was reset, we'll continue only
> > bumping local_watermark,
> > but we don't want to touch "watermark" until we hit that watermark again.
> If we make page_counter_reset_watermark() reset the local_watermark as well,
> we can guarantee "local_watermark <= watermark" and wrap one check inside
> the other.
>
>          if (new > READ_ONCE(c->local_watermark)) {
>                  WRITE_ONCE(c->local_watermark, new);
>                  if (new > READ_ONCE(c->watermark))
>                          WRITE_ONCE(c->watermark, new);
>          }
>
> Cheers,
> Longman
>

Hmm, yeah, given that we'll only be resetting one of the two, I think I'll
use this option.
The branch predictor should make that second check pretty
much a noop in the common-case when we enter the outer if, too.

Thanks!

-- 
David Finkel
Senior Principal Software Engineer, Core Services


^ permalink raw reply	[flat|nested] 47+ messages in thread

end of thread, other threads:[~2024-07-22 23:06 UTC | newest]

Thread overview: 47+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-04 19:41 [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers David Finkel
2023-12-04 23:33 ` Shakeel Butt
2023-12-05  9:07 ` Michal Hocko
2023-12-05 16:00   ` David Finkel
2023-12-06  8:45     ` Michal Hocko
2024-02-07 21:06 ` David Finkel
2024-07-15 20:36 David Finkel
2024-07-15 20:36 ` David Finkel
2024-07-15 20:42   ` David Finkel
2024-07-15 20:46     ` David Finkel
2024-07-16  7:20       ` Michal Hocko
2024-07-16 12:47         ` David Finkel
2024-07-16 13:19           ` Michal Hocko
2024-07-16 13:39             ` David Finkel
2024-07-16 13:48   ` Michal Hocko
2024-07-16 13:54     ` David Finkel
2024-07-16 16:44     ` Tejun Heo
2024-07-16 17:01       ` Roman Gushchin
2024-07-16 17:20         ` David Finkel
2024-07-16 19:53         ` Tejun Heo
2024-07-16 17:10       ` David Finkel
2024-07-16 19:48         ` Tejun Heo
2024-07-16 20:18           ` David Finkel
2024-07-16 18:00       ` Michal Hocko
2024-07-16 20:00         ` Tejun Heo
2024-07-16 22:06           ` David Finkel
2024-07-17  6:26             ` Michal Hocko
2024-07-17 14:24               ` David Finkel
2024-07-17 15:46                 ` Michal Hocko
2024-07-17  6:23           ` Michal Hocko
2024-07-17 17:04       ` Johannes Weiner
2024-07-17 20:14         ` David Finkel
2024-07-17 20:44           ` Johannes Weiner
2024-07-17 21:13             ` David Finkel
2024-07-17 23:48               ` Waiman Long
2024-07-18  1:24                 ` Tejun Heo
2024-07-18  2:17                   ` Roman Gushchin
2024-07-18  2:22                   ` Waiman Long
2024-07-18  7:21             ` Michal Hocko
2024-07-18 21:49         ` David Finkel
2024-07-19  3:23           ` Waiman Long
2024-07-22 15:18             ` David Finkel
2024-07-22 15:17 [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers (fd-local edition) David Finkel
2024-07-22 15:17 ` [PATCH] mm, memcg: cg2 memory{.swap,}.peak write handlers David Finkel
2024-07-22 18:22   ` Roman Gushchin
2024-07-22 19:30     ` David Finkel
2024-07-22 19:47       ` Waiman Long
2024-07-22 23:06         ` David Finkel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox