From: Kairui Song via B4 Relay <devnull+kasong.tencent.com@kernel.org>
To: linux-mm@kvack.org
Cc: Andrew Morton <akpm@linux-foundation.org>,
David Hildenbrand <david@kernel.org>, Zi Yan <ziy@nvidia.com>,
Baolin Wang <baolin.wang@linux.alibaba.com>,
Barry Song <baohua@kernel.org>, Hugh Dickins <hughd@google.com>,
Chris Li <chrisl@kernel.org>,
Kemeng Shi <shikemeng@huaweicloud.com>,
Nhat Pham <nphamcs@gmail.com>, Baoquan He <bhe@redhat.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Youngjun Park <youngjun.park@lge.com>,
Chengming Zhou <chengming.zhou@linux.dev>,
Roman Gushchin <roman.gushchin@linux.dev>,
Shakeel Butt <shakeel.butt@linux.dev>,
Muchun Song <muchun.song@linux.dev>,
Qi Zheng <zhengqi.arch@bytedance.com>,
linux-kernel@vger.kernel.org, cgroups@vger.kernel.org,
Kairui Song <kasong@tencent.com>, Yosry Ahmed <yosry@kernel.org>,
Lorenzo Stoakes <ljs@kernel.org>, Dev Jain <dev.jain@arm.com>,
Lance Yang <lance.yang@linux.dev>,
Michal Hocko <mhocko@suse.com>, Michal Hocko <mhocko@kernel.org>,
Suren Baghdasaryan <surenb@google.com>,
Axel Rasmussen <axelrasmussen@google.com>,
Lorenzo Stoakes <ljs@kernel.org>, Yosry Ahmed <yosry@kernel.org>
Subject: [PATCH v3 10/12] mm/memcg, swap: store cgroup id in cluster table directly
Date: Tue, 21 Apr 2026 14:16:54 +0800 [thread overview]
Message-ID: <20260421-swap-table-p4-v3-10-2f23759a76bc@tencent.com> (raw)
In-Reply-To: <20260421-swap-table-p4-v3-0-2f23759a76bc@tencent.com>
From: Kairui Song <kasong@tencent.com>
Drop the usage of the swap_cgroup_ctrl, and use the dynamic cluster
table instead.
The per-cluster memcg table is 1024 / 512 bytes on most archs, and does
not need RCU protection: the cgroup data is only read and written under
the cluster lock. That keeps things simple, lets the allocation use
plain kmalloc with immediate kfree (no deferred free), and keeps
fragmentation acceptable.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
include/linux/memcontrol.h | 6 ++++--
include/linux/swap.h | 8 +++----
mm/memcontrol-v1.c | 42 +++++++++++++++++++++++-------------
mm/memcontrol.c | 14 +++++++-----
mm/swap.h | 4 ++++
mm/swap_state.c | 6 ++----
mm/swap_table.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++
mm/swapfile.c | 35 +++++++++++++++++++-----------
mm/vmscan.c | 2 +-
9 files changed, 128 insertions(+), 43 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a013f37f24aa..bf1a6e131eca 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -29,6 +29,7 @@ struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;
+struct swap_cluster_info;
/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
@@ -1899,7 +1900,7 @@ static inline void mem_cgroup_exit_user_fault(void)
current->in_user_fault = 0;
}
-void __memcg1_swapout(struct folio *folio);
+void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci);
void memcg1_swapin(struct folio *folio);
#else /* CONFIG_MEMCG_V1 */
@@ -1929,7 +1930,8 @@ static inline void mem_cgroup_exit_user_fault(void)
{
}
-static inline void __memcg1_swapout(struct folio *folio)
+static inline void __memcg1_swapout(struct folio *folio,
+ struct swap_cluster_info *ci)
{
}
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f2949f5844a6..57af4647d432 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -582,12 +582,12 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
return __mem_cgroup_try_charge_swap(folio);
}
-extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
-static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+extern void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages);
+static inline void mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
{
if (mem_cgroup_disabled())
return;
- __mem_cgroup_uncharge_swap(entry, nr_pages);
+ __mem_cgroup_uncharge_swap(id, nr_pages);
}
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
@@ -598,7 +598,7 @@ static inline int mem_cgroup_try_charge_swap(struct folio *folio)
return 0;
}
-static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
+static inline void mem_cgroup_uncharge_swap(unsigned short id,
unsigned int nr_pages)
{
}
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 36c507d81dc5..494e7b9adc60 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -14,6 +14,7 @@
#include "internal.h"
#include "swap.h"
+#include "swap_table.h"
#include "memcontrol-v1.h"
/*
@@ -606,14 +607,15 @@ void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
/**
* __memcg1_swapout - transfer a memsw charge to swap
* @folio: folio whose memsw charge to transfer
+ * @ci: the locked swap cluster holding the swap entries
*
* Transfer the memsw charge of @folio to the swap entry stored in
* folio->swap.
*
- * Context: folio must be isolated, unmapped, locked and is just about
- * to be freed, and caller must disable IRQs.
+ * Context: folio must be isolated, unmapped, locked and is just about to
+ * be freed, and caller must disable IRQs and hold the swap cluster lock.
*/
-void __memcg1_swapout(struct folio *folio)
+void __memcg1_swapout(struct folio *folio, struct swap_cluster_info *ci)
{
struct mem_cgroup *memcg, *swap_memcg;
struct obj_cgroup *objcg;
@@ -646,7 +648,8 @@ void __memcg1_swapout(struct folio *folio)
swap_memcg = mem_cgroup_private_id_get_online(memcg, nr_entries);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- swap_cgroup_record(folio, mem_cgroup_private_id(swap_memcg), folio->swap);
+ __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_entries,
+ mem_cgroup_private_id(swap_memcg));
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
@@ -661,8 +664,7 @@ void __memcg1_swapout(struct folio *folio)
}
/*
- * Interrupts should be disabled here because the caller holds the
- * i_pages lock which is taken with interrupts-off. It is
+ * The caller must hold the swap cluster lock with IRQ off. It is
* important here to have the interrupts disabled because it is the
* only synchronisation we have for updating the per-CPU variables.
*/
@@ -677,7 +679,7 @@ void __memcg1_swapout(struct folio *folio)
}
/**
- * memcg1_swapin - uncharge swap slot
+ * memcg1_swapin - uncharge swap slot on swapin
* @folio: folio being swapped in
*
* Call this function after successfully adding the charged
@@ -687,6 +689,10 @@ void __memcg1_swapout(struct folio *folio)
*/
void memcg1_swapin(struct folio *folio)
{
+ struct swap_cluster_info *ci;
+ unsigned long nr_pages;
+ unsigned short id;
+
VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
@@ -702,14 +708,20 @@ void memcg1_swapin(struct folio *folio)
* correspond 1:1 to page and swap slot lifetimes: we charge the
* page to memory here, and uncharge swap when the slot is freed.
*/
- if (do_memsw_account()) {
- /*
- * The swap entry might not get freed for a long time,
- * let's not wait for it. The page already received a
- * memory+swap charge, drop the swap entry duplicate.
- */
- mem_cgroup_uncharge_swap(folio->swap, folio_nr_pages(folio));
- }
+ if (!do_memsw_account())
+ return;
+
+ /*
+ * The swap entry might not get freed for a long time,
+ * let's not wait for it. The page already received a
+ * memory+swap charge, drop the swap entry duplicate.
+ */
+ nr_pages = folio_nr_pages(folio);
+ ci = swap_cluster_get_and_lock(folio);
+ id = __swap_cgroup_clear(ci, swp_cluster_offset(folio->swap),
+ nr_pages);
+ swap_cluster_unlock(ci);
+ mem_cgroup_uncharge_swap(id, nr_pages);
}
void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 641706fa47bf..193c8eb73be7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -64,6 +64,8 @@
#include <linux/sched/isolation.h>
#include <linux/kmemleak.h>
#include "internal.h"
+#include "swap.h"
+#include "swap_table.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
@@ -5462,6 +5464,7 @@ int __init mem_cgroup_init(void)
int __mem_cgroup_try_charge_swap(struct folio *folio)
{
unsigned int nr_pages = folio_nr_pages(folio);
+ struct swap_cluster_info *ci;
struct page_counter *counter;
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
@@ -5495,22 +5498,23 @@ int __mem_cgroup_try_charge_swap(struct folio *folio)
}
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
- swap_cgroup_record(folio, mem_cgroup_private_id(memcg), folio->swap);
+ ci = swap_cluster_get_and_lock(folio);
+ __swap_cgroup_set(ci, swp_cluster_offset(folio->swap), nr_pages,
+ mem_cgroup_private_id(memcg));
+ swap_cluster_unlock(ci);
return 0;
}
/**
* __mem_cgroup_uncharge_swap - uncharge swap space
- * @entry: swap entry to uncharge
+ * @id: cgroup id to uncharge
* @nr_pages: the amount of swap space to uncharge
*/
-void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(unsigned short id, unsigned int nr_pages)
{
struct mem_cgroup *memcg;
- unsigned short id;
- id = swap_cgroup_clear(entry, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_private_id(id);
if (memcg) {
diff --git a/mm/swap.h b/mm/swap.h
index 80c2f1bf7a57..e4ac7dbc1080 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -5,6 +5,7 @@
#include <linux/atomic.h> /* for atomic_long_t */
struct mempolicy;
struct swap_iocb;
+struct swap_memcg_table;
extern int page_cluster;
@@ -38,6 +39,9 @@ struct swap_cluster_info {
u8 order;
atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */
unsigned int *extend_table; /* For large swap count, protected by ci->lock */
+#ifdef CONFIG_MEMCG
+ struct swap_memcg_table *memcg_table; /* Swap table entries' cgroup record */
+#endif
struct list_head list;
};
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 86d517a33a55..71a3f128fcf0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -176,21 +176,19 @@ static int __swap_cache_add_check(struct swap_cluster_info *ci,
if (shadowp && swp_tb_is_shadow(old_tb))
*shadowp = swp_tb_to_shadow(old_tb);
if (memcg_id)
- *memcg_id = lookup_swap_cgroup_id(targ_entry);
+ *memcg_id = __swap_cgroup_get(ci, ci_off);
if (nr == 1)
return 0;
- targ_entry.val = round_down(targ_entry.val, nr);
ci_off = round_down(ci_off, nr);
ci_end = ci_off + nr;
do {
old_tb = __swap_table_get(ci, ci_off);
if (unlikely(swp_tb_is_folio(old_tb) ||
!__swp_tb_get_count(old_tb) ||
- (memcg_id && *memcg_id != lookup_swap_cgroup_id(targ_entry))))
+ (memcg_id && *memcg_id != __swap_cgroup_get(ci, ci_off))))
return -EBUSY;
- targ_entry.val++;
} while (++ci_off < ci_end);
return 0;
diff --git a/mm/swap_table.h b/mm/swap_table.h
index 8415ffbe2b9c..b2b02ee161b1 100644
--- a/mm/swap_table.h
+++ b/mm/swap_table.h
@@ -11,6 +11,11 @@ struct swap_table {
atomic_long_t entries[SWAPFILE_CLUSTER];
};
+/* For storing memcg private id */
+struct swap_memcg_table {
+ unsigned short id[SWAPFILE_CLUSTER];
+};
+
#define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE)
/*
@@ -247,4 +252,53 @@ static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
return swp_tb;
}
+
+#ifdef CONFIG_MEMCG
+static inline void __swap_cgroup_set(struct swap_cluster_info *ci,
+ unsigned int ci_off, unsigned long nr, unsigned short id)
+{
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER);
+ do {
+ ci->memcg_table->id[ci_off++] = id;
+ } while (--nr);
+}
+
+static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci,
+ unsigned int ci_off)
+{
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(ci_off >= SWAPFILE_CLUSTER);
+ return ci->memcg_table->id[ci_off];
+}
+
+static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci,
+ unsigned int ci_off,
+ unsigned long nr)
+{
+ unsigned short old = ci->memcg_table->id[ci_off];
+
+ __swap_cgroup_set(ci, ci_off, nr, 0);
+ return old;
+}
+#else
+static inline void __swap_cgroup_set(struct swap_cluster_info *ci,
+ unsigned int ci_off, unsigned long nr, unsigned short id)
+{
+}
+
+static inline unsigned short __swap_cgroup_get(struct swap_cluster_info *ci,
+ unsigned int ci_off)
+{
+ return 0;
+}
+
+static inline unsigned short __swap_cgroup_clear(struct swap_cluster_info *ci,
+ unsigned int ci_off,
+ unsigned long nr)
+{
+ return 0;
+}
+#endif
+
#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2d16aa89a4fd..edf4cb36728e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -423,7 +423,12 @@ static void swap_cluster_free_table(struct swap_cluster_info *ci)
{
struct swap_table *table;
- table = (struct swap_table *)rcu_dereference_protected(ci->table, true);
+#ifdef CONFIG_MEMCG
+ kfree(ci->memcg_table);
+ ci->memcg_table = NULL;
+#endif
+
+ table = (struct swap_table *)rcu_access_pointer(ci->table);
if (!table)
return;
@@ -441,6 +446,7 @@ static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp)
{
struct swap_table *table = NULL;
struct folio *folio;
+ int ret = 0;
/* The cluster must be empty and not on any list during allocation. */
VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
@@ -458,7 +464,17 @@ static int swap_cluster_alloc_table(struct swap_cluster_info *ci, gfp_t gfp)
return -ENOMEM;
rcu_assign_pointer(ci->table, table);
- return 0;
+
+#ifdef CONFIG_MEMCG
+ if (!ci->memcg_table)
+ ci->memcg_table = kzalloc(sizeof(*ci->memcg_table), gfp);
+ if (!ci->memcg_table)
+ ret = -ENOMEM;
+#endif
+ if (ret)
+ swap_cluster_free_table(ci);
+
+ return ret;
}
/*
@@ -483,6 +499,7 @@ static void swap_cluster_assert_empty(struct swap_cluster_info *ci,
bad_slots++;
else
WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
+ WARN_ON_ONCE(__swap_cgroup_get(ci, ci_off));
} while (++ci_off < ci_end);
WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0));
@@ -1860,12 +1877,10 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
unsigned int ci_start, unsigned int nr_pages)
{
unsigned long old_tb;
- unsigned int type = si->type;
unsigned short id = 0, id_cur;
unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages;
unsigned long offset = cluster_offset(si, ci);
unsigned int ci_batch = ci_off;
- swp_entry_t entry;
VM_WARN_ON(ci->count < nr_pages);
@@ -1883,21 +1898,17 @@ void __swap_cluster_free_entries(struct swap_info_struct *si,
* Uncharge swap slots by memcg in batches. Consecutive
* slots with the same cgroup id are uncharged together.
*/
- entry = swp_entry(type, offset + ci_off);
- id_cur = lookup_swap_cgroup_id(entry);
+ id_cur = __swap_cgroup_clear(ci, ci_off, 1);
if (id != id_cur) {
if (id)
- mem_cgroup_uncharge_swap(swp_entry(type, offset + ci_batch),
- ci_off - ci_batch);
+ mem_cgroup_uncharge_swap(id, ci_off - ci_batch);
id = id_cur;
ci_batch = ci_off;
}
} while (++ci_off < ci_end);
- if (id) {
- mem_cgroup_uncharge_swap(swp_entry(type, offset + ci_batch),
- ci_off - ci_batch);
- }
+ if (id)
+ mem_cgroup_uncharge_swap(id, ci_off - ci_batch);
swap_range_free(si, offset + ci_start, nr_pages);
swap_cluster_assert_empty(ci, ci_start, nr_pages, false);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 63d06930d8e3..50d87ff58f86 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -739,7 +739,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
- __memcg1_swapout(folio);
+ __memcg1_swapout(folio, ci);
__swap_cache_del_folio(ci, folio, swap, shadow);
swap_cluster_unlock_irq(ci);
} else {
--
2.53.0
next prev parent reply other threads:[~2026-04-21 6:17 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-21 6:16 [PATCH v3 00/12] mm, swap: swap table phase IV: unify allocation and reduce static metadata Kairui Song via B4 Relay
2026-04-21 6:16 ` [PATCH v3 01/12] mm, swap: simplify swap cache allocation helper Kairui Song via B4 Relay
2026-04-21 6:16 ` [PATCH v3 02/12] mm, swap: move common swap cache operations into standalone helpers Kairui Song via B4 Relay
2026-04-21 6:16 ` [PATCH v3 03/12] mm/huge_memory: move THP gfp limit helper into header Kairui Song via B4 Relay
2026-04-21 6:16 ` [PATCH v3 04/12] mm, swap: add support for stable large allocation in swap cache directly Kairui Song via B4 Relay
2026-04-21 6:16 ` [PATCH v3 05/12] mm, swap: unify large folio allocation Kairui Song via B4 Relay
2026-04-21 6:16 ` [PATCH v3 06/12] mm/memcg, swap: tidy up cgroup v1 memsw swap helpers Kairui Song via B4 Relay
2026-04-21 6:16 ` [PATCH v3 07/12] mm, swap: support flexible batch freeing of slots in different memcgs Kairui Song via B4 Relay
2026-04-21 6:16 ` [PATCH v3 08/12] mm, swap: delay and unify memcg lookup and charging for swapin Kairui Song via B4 Relay
2026-04-21 6:16 ` [PATCH v3 09/12] mm, swap: consolidate cluster allocation helpers Kairui Song via B4 Relay
2026-04-21 6:16 ` Kairui Song via B4 Relay [this message]
2026-04-21 6:16 ` [PATCH v3 11/12] mm/memcg: remove no longer used swap cgroup array Kairui Song via B4 Relay
2026-04-21 6:16 ` [PATCH v3 12/12] mm, swap: merge zeromap into swap table Kairui Song via B4 Relay
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260421-swap-table-p4-v3-10-2f23759a76bc@tencent.com \
--to=devnull+kasong.tencent.com@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=axelrasmussen@google.com \
--cc=baohua@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=bhe@redhat.com \
--cc=cgroups@vger.kernel.org \
--cc=chengming.zhou@linux.dev \
--cc=chrisl@kernel.org \
--cc=david@kernel.org \
--cc=dev.jain@arm.com \
--cc=hannes@cmpxchg.org \
--cc=hughd@google.com \
--cc=kasong@tencent.com \
--cc=lance.yang@linux.dev \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=ljs@kernel.org \
--cc=mhocko@kernel.org \
--cc=mhocko@suse.com \
--cc=muchun.song@linux.dev \
--cc=nphamcs@gmail.com \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=surenb@google.com \
--cc=yosry@kernel.org \
--cc=youngjun.park@lge.com \
--cc=zhengqi.arch@bytedance.com \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox