linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Donet Tom <donettom@linux.ibm.com>
To: Bing Jiao <bingjiao@google.com>, linux-mm@kvack.org
Cc: Johannes Weiner <hannes@cmpxchg.org>,
	Michal Hocko <mhocko@kernel.org>,
	Roman Gushchin <roman.gushchin@linux.dev>,
	Shakeel Butt <shakeel.butt@linux.dev>,
	Muchun Song <muchun.song@linux.dev>,
	Andrew Morton <akpm@linux-foundation.org>,
	David Rientjes <rientjes@google.com>,
	Yosry Ahmed <yosry@kernel.org>,
	cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
	Chris Li <chrisl@kernel.org>, Kairui Song <kasong@tencent.com>,
	Kemeng Shi <shikemeng@huaweicloud.com>,
	Nhat Pham <nphamcs@gmail.com>, Baoquan He <bhe@redhat.com>,
	Barry Song <baohua@kernel.org>,
	Youngjun Park <youngjun.park@lge.com>,
	David Hildenbrand <david@kernel.org>,
	Qi Zheng <zhengqi.arch@bytedance.com>,
	Lorenzo Stoakes <ljs@kernel.org>,
	Axel Rasmussen <axelrasmussen@google.com>,
	Yuanchu Xie <yuanchu@google.com>, Wei Xu <weixugc@google.com>,
	Joshua Hahn <joshua.hahnjy@gmail.com>
Subject: Re: [PATCH 2/3] mm/memcontrol: disable demotion in memcg direct reclaim
Date: Fri, 20 Mar 2026 18:47:14 +0530	[thread overview]
Message-ID: <380c52cb-fc8d-4fbe-8d2a-f153bd179816@linux.ibm.com> (raw)
In-Reply-To: <20260317230720.990329-3-bingjiao@google.com>

Hi Bing

On 3/18/26 4:37 AM, Bing Jiao wrote:
> NUMA demotion counts towards reclaim targets in shrink_folio_list(), but
> it does not reduce the total memory usage of a memcg. In memcg direct
> reclaim paths (e.g., charge-triggered or manual limit writes), where
> demotion is allowed, this leads to "fake progress" where the reclaim
> loop concludes it has satisfied the memory request without actually
> reducing the cgroup's charge.
>
> This could result in inefficient reclaim loops, CPU waste, moving all
> pages to far-tier nodes, and potentially premature OOM kills when the
> cgroup is under memory pressure but demotion is still possible.
>
> Introduce the MEMCG_RECLAIM_NO_DEMOTION flag to disable demotion in
> these memcg-specific reclaim paths. This ensures that reclaim
> progress is only counted when memory is actually freed or swapped out.

Thanks for the patch. With this change, are we completely disabling 
memory tiering in memcg?

>
> Signed-off-by: Bing Jiao <bingjiao@google.com>
> ---
>   include/linux/swap.h |  1 +
>   mm/memcontrol-v1.c   | 10 ++++++++--
>   mm/memcontrol.c      | 16 +++++++++++-----
>   mm/vmscan.c          |  1 +
>   4 files changed, 21 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 7a09df6977a5..e83897a6dc72 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -356,6 +356,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
>
>   #define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
>   #define MEMCG_RECLAIM_PROACTIVE (1 << 2)
> +#define MEMCG_RECLAIM_NO_DEMOTION (1 << 3)
>   #define MIN_SWAPPINESS 0
>   #define MAX_SWAPPINESS 200
>
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index 433bba9dfe71..3cb600e28e5b 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -1466,6 +1466,10 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
>   	int ret;
>   	bool limits_invariant;
>   	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
> +	unsigned int reclaim_options = MEMCG_RECLAIM_NO_DEMOTION;
> +
> +	if (!memsw)
> +		reclaim_options |= MEMCG_RECLAIM_MAY_SWAP;
>
>   	do {
>   		if (signal_pending(current)) {
> @@ -1500,7 +1504,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
>   		}
>
>   		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
> -				memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
> +						 reclaim_options, NULL)) {
>   			ret = -EBUSY;
>   			break;
>   		}
> @@ -1520,6 +1524,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
>   static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
>   {
>   	int nr_retries = MAX_RECLAIM_RETRIES;
> +	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> +				       MEMCG_RECLAIM_NO_DEMOTION;
>
>   	/* we call try-to-free pages for make this cgroup empty */
>   	lru_add_drain_all();
> @@ -1532,7 +1538,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
>   			return -EINTR;
>
>   		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
> -						  MEMCG_RECLAIM_MAY_SWAP, NULL))
> +						  reclaim_options, NULL))
>   			nr_retries--;
>   	}
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 303ac622d22d..fcf1cd0da643 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2287,6 +2287,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
>   				  gfp_t gfp_mask)
>   {
>   	unsigned long nr_reclaimed = 0;
> +	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> +				       MEMCG_RECLAIM_NO_DEMOTION;
>
>   	do {
>   		unsigned long pflags;
> @@ -2300,7 +2302,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
>   		psi_memstall_enter(&pflags);
>   		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
>   							gfp_mask,
> -							MEMCG_RECLAIM_MAY_SWAP,
> +							reclaim_options,
>   							NULL);
>   		psi_memstall_leave(&pflags);
>   	} while ((memcg = parent_mem_cgroup(memcg)) &&
> @@ -2572,7 +2574,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>   		/* Avoid the refill and flush of the older stock */
>   		batch = nr_pages;
>
> -	reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
> +	reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_NO_DEMOTION;
>   	if (!do_memsw_account() ||
>   	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
>   		if (page_counter_try_charge(&memcg->memory, batch, &counter))
> @@ -2610,7 +2612,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>
>   	psi_memstall_enter(&pflags);
>   	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
> -						    gfp_mask, reclaim_options, NULL);
> +					gfp_mask, reclaim_options, NULL);
>   	psi_memstall_leave(&pflags);
>
>   	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
> @@ -4638,6 +4640,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
>   {
>   	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
>   	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
> +	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> +				       MEMCG_RECLAIM_NO_DEMOTION;
>   	bool drained = false;
>   	unsigned long high;
>   	int err;
> @@ -4669,7 +4673,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
>   		}
>
>   		reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
> -					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
> +					GFP_KERNEL, reclaim_options, NULL);
>
>   		if (!reclaimed && !nr_retries--)
>   			break;
> @@ -4690,6 +4694,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
>   {
>   	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
>   	unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
> +	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> +				       MEMCG_RECLAIM_NO_DEMOTION;
>   	bool drained = false;
>   	unsigned long max;
>   	int err;
> @@ -4721,7 +4727,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
>
>   		if (nr_reclaims) {
>   			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
> -					GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
> +					GFP_KERNEL, reclaim_options, NULL))
>   				nr_reclaims--;
>   			continue;
>   		}
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 33287ba4a500..7a8617ba1748 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -6809,6 +6809,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>   		.may_unmap = 1,
>   		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
>   		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
> +		.no_demotion = !!(reclaim_options & MEMCG_RECLAIM_NO_DEMOTION),
>   	};
>   	/*
>   	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put


Did you run any performance benchmarks with this patch?


This patch looks good to me. Feel free to add

Reviewed-by: Donet Tom <donettom@linux.ibm.com>


> --
> 2.53.0.851.ga537e3e6e9-goog
>
>


  parent reply	other threads:[~2026-03-20 13:17 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-17 23:06 [PATCH 0/3] mm/memcontrol: control demotion in memcg reclaim Bing Jiao
2026-03-17 23:07 ` [PATCH 1/3] mm/memcontrol: fix reclaim_options leak in try_charge_memcg() Bing Jiao
2026-03-17 23:38   ` Yosry Ahmed
2026-03-17 23:07 ` [PATCH 2/3] mm/memcontrol: disable demotion in memcg direct reclaim Bing Jiao
2026-03-17 23:44   ` Yosry Ahmed
2026-03-18 20:57     ` Bing Jiao
2026-03-18 21:56       ` [PATCH v2] mm/memcontrol: fix reclaim_options leak in try_charge_memcg() Bing Jiao
2026-03-18 22:06         ` Yosry Ahmed
2026-03-18 22:19         ` [PATCH v3] " Bing Jiao
2026-03-18 22:54           ` Johannes Weiner
2026-03-18 23:28           ` Shakeel Butt
2026-03-19  9:29           ` Michal Hocko
2026-03-20  3:39             ` Bing Jiao
2026-03-20  9:32               ` Michal Hocko
2026-03-21  3:34           ` [PATCH v4] " Bing Jiao
2026-03-20 13:17   ` Donet Tom [this message]
2026-03-21  4:04     ` [PATCH 2/3] mm/memcontrol: disable demotion in memcg direct reclaim Bing Jiao
2026-03-17 23:07 ` [PATCH 3/3] mm/vmscan: add demote= option to proactive reclaim Bing Jiao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=380c52cb-fc8d-4fbe-8d2a-f153bd179816@linux.ibm.com \
    --to=donettom@linux.ibm.com \
    --cc=akpm@linux-foundation.org \
    --cc=axelrasmussen@google.com \
    --cc=baohua@kernel.org \
    --cc=bhe@redhat.com \
    --cc=bingjiao@google.com \
    --cc=cgroups@vger.kernel.org \
    --cc=chrisl@kernel.org \
    --cc=david@kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=joshua.hahnjy@gmail.com \
    --cc=kasong@tencent.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=mhocko@kernel.org \
    --cc=muchun.song@linux.dev \
    --cc=nphamcs@gmail.com \
    --cc=rientjes@google.com \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    --cc=shikemeng@huaweicloud.com \
    --cc=weixugc@google.com \
    --cc=yosry@kernel.org \
    --cc=youngjun.park@lge.com \
    --cc=yuanchu@google.com \
    --cc=zhengqi.arch@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox