From: Chris Li <chrisl@kernel.org>
To: kasong@tencent.com
Cc: linux-mm@kvack.org, Andrew Morton <akpm@linux-foundation.org>,
	 Kemeng Shi <shikemeng@huaweicloud.com>,
	Nhat Pham <nphamcs@gmail.com>,  Baoquan He <bhe@redhat.com>,
	Barry Song <baohua@kernel.org>,
	Johannes Weiner <hannes@cmpxchg.org>,
	 David Hildenbrand <david@kernel.org>,
	Lorenzo Stoakes <lorenzo.stoakes@oracle.com>,
	 Youngjun Park <youngjun.park@lge.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH v3 05/12] mm/workingset: leave highest bits empty for anon shadow
Date: Wed, 18 Feb 2026 22:56:14 -0800
Message-ID: <CACePvbWtn=5=HCy-3SSKpJoBdY=zSE4Pa0DtfH60wVa6Q0tPHg@mail.gmail.com>
In-Reply-To: <20260218-swap-table-p3-v3-5-f4e34be021a7@tencent.com>

On Tue, Feb 17, 2026 at 12:06 PM Kairui Song via B4 Relay
<devnull+kasong.tencent.com@kernel.org> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> The swap table entry will need 4 bits reserved for the swap count in the
> shadow, so the anon shadow should keep its leading 4 bits as 0.
>
> This should be OK for the foreseeable future. Take 52 bits of physical
> address space as an example: for 4K pages, there would be at most 40
> bits for addressable pages. Currently, we have 36 bits available (64 - 1
> - 16 - 10 - 1, where XA_VALUE takes 1 bit as the marker,
> MEM_CGROUP_ID_SHIFT takes 16 bits, NODES_SHIFT takes <= 10 bits, and
> the WORKINGSET flag takes 1 bit).
>
> So in the worst case, we previously needed to pack the 40 bits of address
> into a 36-bit field using a 64K bucket (bucket_order = 4). After this, the
> bucket grows to 1M, which should be fine: on such large machines, the
> working set size will be far larger than the bucket size.
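
The numbers check out for me. For reference, a quick userspace
back-of-the-envelope sketch using the example constants from the commit
message (not the real config-dependent values):

    /* standalone sketch, not kernel code */
    #include <stdio.h>

    int main(void)
    {
            int addr_bits  = 52 - 12;              /* 40: addressable 4K pages */
            int avail      = 64 - 1 - 16 - 10 - 1; /* 36: bits left for eviction */
            int count_bits = 4;                    /* new swap count reservation */

            int order_old = addr_bits - avail;                /* 4 */
            int order_new = addr_bits - (avail - count_bits); /* 8 */

            printf("bucket: %dK -> %dK\n", 4 << order_old, 4 << order_new);
            return 0;
    }

which prints "bucket: 64K -> 1024K", i.e. the 64K -> 1M described above.
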
>
> And for MGLRU's gen number tracking, it should be more than enough, as
> MGLRU's gen number (max_seq) increments much more slowly than the
> eviction counter (nonresident_age).
>
> After all, both the refault distance and the gen distance are only hints
> that can tolerate inaccuracy just fine.
>
> And the 4 bits can be shrunk to 3, or extended to a higher value if
> needed later.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>

Acked-by: Chris Li <chrisl@kernel.org>

> ---
>  mm/swap_table.h |  4 ++++
>  mm/workingset.c | 49 ++++++++++++++++++++++++++++++-------------------
>  2 files changed, 34 insertions(+), 19 deletions(-)
>
> diff --git a/mm/swap_table.h b/mm/swap_table.h
> index ea244a57a5b7..10e11d1f3b04 100644
> --- a/mm/swap_table.h
> +++ b/mm/swap_table.h
> @@ -12,6 +12,7 @@ struct swap_table {
>  };
>
>  #define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE)
> +#define SWP_TB_COUNT_BITS              4
>
>  /*
>   * A swap table entry represents the status of a swap slot on a swap
> @@ -22,6 +23,9 @@ struct swap_table {
>   * (shadow), or NULL.
>   */
>
> +/* Macro for shadow offset calculation */
> +#define SWAP_COUNT_SHIFT       SWP_TB_COUNT_BITS
> +
>  /*
>   * Helpers for casting one type of info into a swap table entry.
>   */
> diff --git a/mm/workingset.c b/mm/workingset.c
> index 13422d304715..37a94979900f 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -16,6 +16,7 @@
>  #include <linux/dax.h>
>  #include <linux/fs.h>
>  #include <linux/mm.h>
> +#include "swap_table.h"
>  #include "internal.h"
>
>  /*
> @@ -184,7 +185,9 @@
>  #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) +  \
>                          WORKINGSET_SHIFT + NODES_SHIFT + \
>                          MEM_CGROUP_ID_SHIFT)
> +#define EVICTION_SHIFT_ANON    (EVICTION_SHIFT + SWAP_COUNT_SHIFT)
>  #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
> +#define EVICTION_MASK_ANON     (~0UL >> EVICTION_SHIFT_ANON)
>
>  /*
>   * Eviction timestamps need to be able to cover the full range of
> @@ -194,12 +197,12 @@
>   * that case, we have to sacrifice granularity for distance, and group
>   * evictions into coarser buckets by shaving off lower timestamp bits.
>   */
> -static unsigned int bucket_order __read_mostly;
> +static unsigned int bucket_order[ANON_AND_FILE] __read_mostly;
>
>  static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
> -                        bool workingset)
> +                        bool workingset, bool file)
>  {
> -       eviction &= EVICTION_MASK;
> +       eviction &= file ? EVICTION_MASK : EVICTION_MASK_ANON;
>         eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
>         eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
>         eviction = (eviction << WORKINGSET_SHIFT) | workingset;
> @@ -244,7 +247,8 @@ static void *lru_gen_eviction(struct folio *folio)
>         struct mem_cgroup *memcg = folio_memcg(folio);
>         struct pglist_data *pgdat = folio_pgdat(folio);
>
> -       BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
> +       BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH >
> +                    BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON));
>
>         lruvec = mem_cgroup_lruvec(memcg, pgdat);
>         lrugen = &lruvec->lrugen;
> @@ -254,7 +258,7 @@ static void *lru_gen_eviction(struct folio *folio)
>         hist = lru_hist_from_seq(min_seq);
>         atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
>
> -       return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset);
> +       return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset, type);
>  }
>
>  /*
> @@ -262,7 +266,7 @@ static void *lru_gen_eviction(struct folio *folio)
>   * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
>   */
>  static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
> -                               unsigned long *token, bool *workingset)
> +                               unsigned long *token, bool *workingset, bool file)
>  {
>         int memcg_id;
>         unsigned long max_seq;
> @@ -275,7 +279,7 @@ static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
>         *lruvec = mem_cgroup_lruvec(memcg, pgdat);
>
>         max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
> -       max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;
> +       max_seq &= (file ? EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH;

Nitpick: I see this expression used more than once:
"file ? EVICTION_MASK : EVICTION_MASK_ANON"

Maybe make it an inline function or macro?
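Something like this, for example (untested, name just for illustration):

    static inline unsigned long shadow_eviction_mask(bool file)
    {
            return file ? EVICTION_MASK : EVICTION_MASK_ANON;
    }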

>
>         return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
>  }
> @@ -293,7 +297,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
>
>         rcu_read_lock();
>
> -       recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset);
> +       recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset, type);
>         if (lruvec != folio_lruvec(folio))
>                 goto unlock;
>
> @@ -331,7 +335,7 @@ static void *lru_gen_eviction(struct folio *folio)
>  }
>
>  static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
> -                               unsigned long *token, bool *workingset)
> +                               unsigned long *token, bool *workingset, bool file)
>  {
>         return false;
>  }
> @@ -381,6 +385,7 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
>  void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
>  {
>         struct pglist_data *pgdat = folio_pgdat(folio);
> +       int file = folio_is_file_lru(folio);
>         unsigned long eviction;
>         struct lruvec *lruvec;
>         int memcgid;
> @@ -397,10 +402,10 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
>         /* XXX: target_memcg can be NULL, go through lruvec */
>         memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec));
>         eviction = atomic_long_read(&lruvec->nonresident_age);
> -       eviction >>= bucket_order;
> +       eviction >>= bucket_order[file];
>         workingset_age_nonresident(lruvec, folio_nr_pages(folio));
>         return pack_shadow(memcgid, pgdat, eviction,
> -                               folio_test_workingset(folio));
> +                          folio_test_workingset(folio), file);
>  }
>
>  /**
> @@ -431,14 +436,15 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
>                 bool recent;
>
>                 rcu_read_lock();
> -               recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset);
> +               recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction,
> +                                            workingset, file);
>                 rcu_read_unlock();
>                 return recent;
>         }
>
>         rcu_read_lock();
>         unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
> -       eviction <<= bucket_order;
> +       eviction <<= bucket_order[file];
>
>         /*
>          * Look up the memcg associated with the stored ID. It might
> @@ -495,7 +501,8 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset,
>          * longest time, so the occasional inappropriate activation
>          * leading to pressure on the active list is not a problem.
>          */
> -       refault_distance = (refault - eviction) & EVICTION_MASK;
> +       refault_distance = ((refault - eviction) &
> +                           (file ? EVICTION_MASK : EVICTION_MASK_ANON));

Here too.
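With a helper like the one sketched above, this would simply become:

    refault_distance = (refault - eviction) & shadow_eviction_mask(file);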

Chris

>
>         /*
>          * Compare the distance to the existing workingset size. We
> @@ -780,8 +787,8 @@ static struct lock_class_key shadow_nodes_key;
>
>  static int __init workingset_init(void)
>  {
> +       unsigned int timestamp_bits, timestamp_bits_anon;
>         struct shrinker *workingset_shadow_shrinker;
> -       unsigned int timestamp_bits;
>         unsigned int max_order;
>         int ret = -ENOMEM;
>
> @@ -794,11 +801,15 @@ static int __init workingset_init(void)
>          * double the initial memory by using totalram_pages as-is.
>          */
>         timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
> +       timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON;
>         max_order = fls_long(totalram_pages() - 1);
> -       if (max_order > timestamp_bits)
> -               bucket_order = max_order - timestamp_bits;
> -       pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
> -              timestamp_bits, max_order, bucket_order);
> +       if (max_order > (BITS_PER_LONG - EVICTION_SHIFT))
> +               bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits;
> +       if (max_order > timestamp_bits_anon)
> +               bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon;
> +       pr_info("workingset: timestamp_bits=%d (anon: %d) max_order=%d bucket_order=%u (anon: %d)\n",
> +               timestamp_bits, timestamp_bits_anon, max_order,
> +               bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]);
>
>         workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
>                                                     SHRINKER_MEMCG_AWARE,
>
> --
> 2.52.0
>
>

