From: Yosry Ahmed <yosry@kernel.org>
To: Bing Jiao <bingjiao@google.com>
Cc: linux-mm@kvack.org, Johannes Weiner <hannes@cmpxchg.org>,
Michal Hocko <mhocko@kernel.org>,
Roman Gushchin <roman.gushchin@linux.dev>,
Shakeel Butt <shakeel.butt@linux.dev>,
Muchun Song <muchun.song@linux.dev>,
Andrew Morton <akpm@linux-foundation.org>,
David Rientjes <rientjes@google.com>,
cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
Chris Li <chrisl@kernel.org>, Kairui Song <kasong@tencent.com>,
Kemeng Shi <shikemeng@huaweicloud.com>,
Nhat Pham <nphamcs@gmail.com>, Baoquan He <bhe@redhat.com>,
Barry Song <baohua@kernel.org>,
Youngjun Park <youngjun.park@lge.com>,
David Hildenbrand <david@kernel.org>,
Qi Zheng <zhengqi.arch@bytedance.com>,
Lorenzo Stoakes <ljs@kernel.org>,
Axel Rasmussen <axelrasmussen@google.com>,
Yuanchu Xie <yuanchu@google.com>, Wei Xu <weixugc@google.com>,
Joshua Hahn <joshua.hahnjy@gmail.com>
Subject: Re: [PATCH 2/3] mm/memcontrol: disable demotion in memcg direct reclaim
Date: Tue, 17 Mar 2026 16:44:34 -0700 [thread overview]
Message-ID: <CAO9r8zP5HmeE1uOZE9WxN1GyC59mM_F2JGaKLEkxzzCvnxpW2g@mail.gmail.com> (raw)
In-Reply-To: <20260317230720.990329-3-bingjiao@google.com>
On Tue, Mar 17, 2026 at 4:07 PM Bing Jiao <bingjiao@google.com> wrote:
>
> NUMA demotion counts towards reclaim targets in shrink_folio_list(), but
> it does not reduce the total memory usage of a memcg. In memcg direct
> reclaim paths (e.g., charge-triggered or manual limit writes), where
> demotion is allowed, this leads to "fake progress" where the reclaim
> loop concludes it has satisfied the memory request without actually
> reducing the cgroup's charge.
>
> This could result in inefficient reclaim loops, CPU waste, moving all
> pages to far-tier nodes, and potentially premature OOM kills when the
> cgroup is under memory pressure but demotion is still possible.
>
> Introduce the MEMCG_RECLAIM_NO_DEMOTION flag to disable demotion in
> these memcg-specific reclaim paths. This ensures that reclaim
> progress is only counted when memory is actually freed or swapped out.
See the discussion at
https://lore.kernel.org/linux-mm/20250909012141.1467-1-cuishw@inspur.com/
and the commits and threads it refers to.
>
> Signed-off-by: Bing Jiao <bingjiao@google.com>
> ---
> include/linux/swap.h | 1 +
> mm/memcontrol-v1.c | 10 ++++++++--
> mm/memcontrol.c | 16 +++++++++++-----
> mm/vmscan.c | 1 +
> 4 files changed, 21 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 7a09df6977a5..e83897a6dc72 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -356,6 +356,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
>
> #define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
> #define MEMCG_RECLAIM_PROACTIVE (1 << 2)
> +#define MEMCG_RECLAIM_NO_DEMOTION (1 << 3)
> #define MIN_SWAPPINESS 0
> #define MAX_SWAPPINESS 200
>
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index 433bba9dfe71..3cb600e28e5b 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -1466,6 +1466,10 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> int ret;
> bool limits_invariant;
> struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
> + unsigned int reclaim_options = MEMCG_RECLAIM_NO_DEMOTION;
> +
> + if (!memsw)
> + reclaim_options |= MEMCG_RECLAIM_MAY_SWAP;
>
> do {
> if (signal_pending(current)) {
> @@ -1500,7 +1504,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> }
>
> if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
> - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
> + reclaim_options, NULL)) {
> ret = -EBUSY;
> break;
> }
> @@ -1520,6 +1524,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
> {
> int nr_retries = MAX_RECLAIM_RETRIES;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
>
> /* we call try-to-free pages for make this cgroup empty */
> lru_add_drain_all();
> @@ -1532,7 +1538,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
> return -EINTR;
>
> if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
> - MEMCG_RECLAIM_MAY_SWAP, NULL))
> + reclaim_options, NULL))
> nr_retries--;
> }
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 303ac622d22d..fcf1cd0da643 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2287,6 +2287,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
> gfp_t gfp_mask)
> {
> unsigned long nr_reclaimed = 0;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
>
> do {
> unsigned long pflags;
> @@ -2300,7 +2302,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
> psi_memstall_enter(&pflags);
> nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
> gfp_mask,
> - MEMCG_RECLAIM_MAY_SWAP,
> + reclaim_options,
> NULL);
> psi_memstall_leave(&pflags);
> } while ((memcg = parent_mem_cgroup(memcg)) &&
> @@ -2572,7 +2574,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> /* Avoid the refill and flush of the older stock */
> batch = nr_pages;
>
> - reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
> + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_NO_DEMOTION;
> if (!do_memsw_account() ||
> page_counter_try_charge(&memcg->memsw, batch, &counter)) {
> if (page_counter_try_charge(&memcg->memory, batch, &counter))
> @@ -2610,7 +2612,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>
> psi_memstall_enter(&pflags);
> nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
> - gfp_mask, reclaim_options, NULL);
> + gfp_mask, reclaim_options, NULL);
> psi_memstall_leave(&pflags);
>
> if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
> @@ -4638,6 +4640,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> unsigned int nr_retries = MAX_RECLAIM_RETRIES;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
> bool drained = false;
> unsigned long high;
> int err;
> @@ -4669,7 +4673,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
> }
>
> reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
> - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
> + GFP_KERNEL, reclaim_options, NULL);
>
> if (!reclaimed && !nr_retries--)
> break;
> @@ -4690,6 +4694,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
> bool drained = false;
> unsigned long max;
> int err;
> @@ -4721,7 +4727,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
>
> if (nr_reclaims) {
> if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
> - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
> + GFP_KERNEL, reclaim_options, NULL))
> nr_reclaims--;
> continue;
> }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 33287ba4a500..7a8617ba1748 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -6809,6 +6809,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> .may_unmap = 1,
> .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
> .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
> + .no_demotion = !!(reclaim_options & MEMCG_RECLAIM_NO_DEMOTION),
> };
> /*
> * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
> --
> 2.53.0.851.ga537e3e6e9-goog
>
next prev parent reply other threads:[~2026-03-17 23:44 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-17 23:06 [PATCH 0/3] mm/memcontrol: control demotion in memcg reclaim Bing Jiao
2026-03-17 23:07 ` [PATCH 1/3] mm/memcontrol: fix reclaim_options leak in try_charge_memcg() Bing Jiao
2026-03-17 23:38 ` Yosry Ahmed
2026-03-17 23:07 ` [PATCH 2/3] mm/memcontrol: disable demotion in memcg direct reclaim Bing Jiao
2026-03-17 23:44 ` Yosry Ahmed [this message]
2026-03-18 20:57 ` Bing Jiao
2026-03-18 21:56 ` [PATCH v2] mm/memcontrol: fix reclaim_options leak in try_charge_memcg() Bing Jiao
2026-03-18 22:06 ` Yosry Ahmed
2026-03-18 22:19 ` [PATCH v3] " Bing Jiao
2026-03-18 22:54 ` Johannes Weiner
2026-03-18 23:28 ` Shakeel Butt
2026-03-19 9:29 ` Michal Hocko
2026-03-20 3:39 ` Bing Jiao
2026-03-20 9:32 ` Michal Hocko
2026-03-21 3:34 ` [PATCH v4] " Bing Jiao
2026-03-20 13:17 ` [PATCH 2/3] mm/memcontrol: disable demotion in memcg direct reclaim Donet Tom
2026-03-21 4:04 ` Bing Jiao
2026-03-17 23:07 ` [PATCH 3/3] mm/vmscan: add demote= option to proactive reclaim Bing Jiao
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAO9r8zP5HmeE1uOZE9WxN1GyC59mM_F2JGaKLEkxzzCvnxpW2g@mail.gmail.com \
--to=yosry@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=axelrasmussen@google.com \
--cc=baohua@kernel.org \
--cc=bhe@redhat.com \
--cc=bingjiao@google.com \
--cc=cgroups@vger.kernel.org \
--cc=chrisl@kernel.org \
--cc=david@kernel.org \
--cc=hannes@cmpxchg.org \
--cc=joshua.hahnjy@gmail.com \
--cc=kasong@tencent.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=mhocko@kernel.org \
--cc=muchun.song@linux.dev \
--cc=nphamcs@gmail.com \
--cc=rientjes@google.com \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=weixugc@google.com \
--cc=youngjun.park@lge.com \
--cc=yuanchu@google.com \
--cc=zhengqi.arch@bytedance.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.