From: Youngjun Park <youngjun.park@lge.com>
To: Andrew Morton <akpm@linux-foundation.org>, linux-mm@kvack.org
Cc: Chris Li <chrisl@kernel.org>, Kairui Song <kasong@tencent.com>,
Kemeng Shi <shikemeng@huaweicloud.com>,
Nhat Pham <nphamcs@gmail.com>, Baoquan He <bhe@redhat.com>,
Barry Song <baohua@kernel.org>,
Johannes Weiner <hannes@cmpxchg.org>,
Michal Hocko <mhocko@kernel.org>,
Roman Gushchin <roman.gushchin@linux.dev>,
Shakeel Butt <shakeel.butt@linux.dev>,
Muchun Song <muchun.song@linux.dev>,
gunho.lee@lge.com, taejoon.song@lge.com, austin.kim@lge.com,
youngjun.park@lge.com
Subject: [RFC PATCH v2 v2 3/5] mm: memcontrol: add interface for swap tier selection
Date: Mon, 26 Jan 2026 15:52:40 +0900 [thread overview]
Message-ID: <20260126065242.1221862-4-youngjun.park@lge.com> (raw)
In-Reply-To: <20260126065242.1221862-1-youngjun.park@lge.com>
This patch integrates the swap tier infrastructure with cgroup,
enabling the selection of specific swap devices per cgroup by
configuring allowed swap tiers.
The new `memory.swap.tiers` interface controls allowed swap tiers via a mask.
By default, the mask is set to include all tiers, allowing specific tiers to
be excluded or restored. Note that effective tiers are calculated separately
using a dedicated mask to respect the cgroup hierarchy. Consequently,
configured tiers may differ from effective ones, as they must be a subset
of the parent's.
Note that cgroups do not pin swap tiers. This is similar to the
`cpuset` controller, which does not prevent CPU hotplug. This
approach ensures flexibility by allowing tier configuration changes
regardless of cgroup usage.
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
---
Documentation/admin-guide/cgroup-v2.rst | 27 +++++++++
include/linux/memcontrol.h | 3 +-
mm/memcontrol.c | 80 +++++++++++++++++++++++++
mm/swap_tier.c | 66 ++++++++++++++++++++
mm/swap_tier.h | 21 +++++++
mm/swapfile.c | 5 ++
6 files changed, 201 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 7f5b59d95fce..776a908ce1b9 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1848,6 +1848,33 @@ The following nested keys are defined.
Swap usage hard limit. If a cgroup's swap usage reaches this
limit, anonymous memory of the cgroup will not be swapped out.
+ memory.swap.tiers
+ A read-write nested-keyed file which exists on non-root
+ cgroups. The default is to enable all tiers.
+
+ This interface allows selecting which swap tiers a cgroup can
+ use for swapping out memory.
+
+ The effective tiers are inherited from the parent. Only tiers
+ effective in the parent can be effective in the child. However,
+ the child can explicitly disable tiers allowed by the parent.
+
+ When read, the file shows two lines:
+ - The first line lists the names of the tiers enabled by
+ this cgroup's own configuration.
+ - The second line lists the names of the effective tiers,
+ i.e. the configured tiers that are also effective in
+ the parent.
+
+ When writing, the format is:
+ (+/-)(TIER_NAME) (+/-)(TIER_NAME) ...
+
+ Writing an empty string re-enables all tiers.
+
+ Valid tier names are those configured in
+ /sys/kernel/mm/swap/tiers.
+
+ Each tier can be prefixed with:
+ + Enable this tier
+ - Disable this tier
+
memory.swap.events
A read-only flat-keyed file which exists on non-root cgroups.
The following entries are defined. Unless specified
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b6c82c8f73e1..542bee1b5f60 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -283,7 +283,8 @@ struct mem_cgroup {
/* per-memcg mm_struct list */
struct lru_gen_mm_list mm_list;
#endif
-
+ int tier_mask;
+ int tier_effective_mask;
#ifdef CONFIG_MEMCG_V1
/* Legacy consumer-oriented counters */
struct page_counter kmem; /* v1 only */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 007413a53b45..c0a0a957a630 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -68,6 +68,7 @@
#include <net/ip.h>
#include "slab.h"
#include "memcontrol-v1.h"
+#include "swap_tier.h"
#include <linux/uaccess.h>
@@ -3691,6 +3692,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
{
lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
+ swap_tiers_memcg_sync_mask(memcg);
__mem_cgroup_free(memcg);
}
@@ -3792,6 +3794,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
WRITE_ONCE(memcg->zswap_writeback, true);
#endif
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
+ memcg->tier_mask = TIER_ALL_MASK;
+ swap_tiers_memcg_inherit_mask(memcg, parent);
+
if (parent) {
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
@@ -5352,6 +5357,75 @@ static int swap_events_show(struct seq_file *m, void *v)
return 0;
}
+/*
+ * seq_show handler for memory.swap.tiers.
+ *
+ * Prints two lines through swap_tiers_mask_show(): first the tiers in the
+ * cgroup's configured mask, then the tiers in its effective mask.
+ * Always returns 0.
+ */
+static int swap_tier_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+	int mask;
+
+	/* Line 1: what was configured on this cgroup. */
+	mask = memcg->tier_mask;
+	swap_tiers_mask_show(m, mask);
+
+	/* Line 2: what is actually usable after applying the hierarchy. */
+	mask = memcg->tier_effective_mask;
+	swap_tiers_mask_show(m, mask);
+
+	return 0;
+}
+
+/*
+ * Write handler for memory.swap.tiers.
+ *
+ * Accepts a whitespace-separated list of "+NAME" / "-NAME" tokens that
+ * enable or disable the named tier in this cgroup's configured mask.  An
+ * empty (or all-whitespace) write resets the configured mask to
+ * TIER_ALL_MASK.
+ *
+ * The request is parsed into a local copy of the mask and committed only
+ * once every token has been validated, so a rejected write leaves both
+ * tier_mask and tier_effective_mask untouched.
+ *
+ * Returns @nbytes on success, or -EINVAL if a token lacks a '+'/'-' prefix
+ * or names a tier that swap_tiers_mask_lookup() does not find.
+ */
+static ssize_t swap_tier_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	char *pos, *token;
+	int new_mask;
+	int ret = 0;
+
+	pos = strstrip(buf);
+
+	spin_lock(&swap_tier_lock);
+	if (!*pos) {
+		new_mask = TIER_ALL_MASK;
+		goto commit;
+	}
+
+	/* Start from the current configuration and edit a private copy. */
+	new_mask = memcg->tier_mask;
+
+	while ((token = strsep(&pos, " \t\n")) != NULL) {
+		int mask;
+
+		/* Consecutive separators yield empty tokens; skip them. */
+		if (!*token)
+			continue;
+
+		if (token[0] != '-' && token[0] != '+') {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		mask = swap_tiers_mask_lookup(token + 1);
+		if (!mask) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		/*
+		 * Only the configured mask is edited here.  A child may
+		 * enable a tier that its parent does not allow; the
+		 * effective mask recomputed below is what keeps the child
+		 * a subset of the parent.
+		 */
+		if (token[0] == '-')
+			new_mask &= ~mask;
+		else
+			new_mask |= mask;
+	}
+
+commit:
+	memcg->tier_mask = new_mask;
+	__swap_tiers_memcg_sync_mask(memcg);
+out:
+	spin_unlock(&swap_tier_lock);
+	return ret ? ret : nbytes;
+}
+
static struct cftype swap_files[] = {
{
.name = "swap.current",
@@ -5384,6 +5458,12 @@ static struct cftype swap_files[] = {
.file_offset = offsetof(struct mem_cgroup, swap_events_file),
.seq_show = swap_events_show,
},
+ {
+ .name = "swap.tiers",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_tier_show,
+ .write = swap_tier_write,
+ },
{ } /* terminate */
};
diff --git a/mm/swap_tier.c b/mm/swap_tier.c
index d90f6eccb908..e860c87292e2 100644
--- a/mm/swap_tier.c
+++ b/mm/swap_tier.c
@@ -384,3 +384,69 @@ bool swap_tiers_update(void)
return true;
}
+
+/**
+ * swap_tiers_mask_show - print the names of the tiers selected by a mask
+ * @m: seq_file to print into
+ * @mask: tier bitmask to decode
+ *
+ * Walks the active tiers under swap_tier_lock and prints the name of every
+ * tier whose TIER_MASK() bit is set in @mask, each followed by a single
+ * space.  The line is then terminated with a newline.
+ */
+void swap_tiers_mask_show(struct seq_file *m, int mask)
+{
+	struct swap_tier *tier;
+
+	spin_lock(&swap_tier_lock);
+	for_each_active_tier(tier) {
+		/* Tier not selected by @mask: nothing to print. */
+		if (!(mask & TIER_MASK(tier)))
+			continue;
+
+		seq_printf(m, "%s ", tier->name);
+	}
+	spin_unlock(&swap_tier_lock);
+
+	seq_puts(m, "\n");
+}
+
+/**
+ * swap_tiers_mask_lookup - translate a tier name into its mask bit
+ * @name: NUL-terminated tier name to search for
+ *
+ * The caller must hold swap_tier_lock.
+ *
+ * Return: TIER_MASK() of the active tier whose name matches @name exactly,
+ * or 0 when no active tier matches.
+ */
+int swap_tiers_mask_lookup(const char *name)
+{
+	struct swap_tier *tier;
+
+	lockdep_assert_held(&swap_tier_lock);
+
+	for_each_active_tier(tier) {
+		if (strcmp(tier->name, name) != 0)
+			continue;
+
+		return TIER_MASK(tier);
+	}
+
+	/* No active tier carries this name. */
+	return 0;
+}
+
+/*
+ * Recompute @memcg's effective tier mask: its configured mask restricted
+ * to whatever is effective in @parent.  A NULL @parent imposes no
+ * restriction (TIER_ALL_MASK).
+ *
+ * Takes no lock itself; the callers in this file invoke it with
+ * swap_tier_lock held.
+ */
+static void __swap_tier_memcg_inherit_mask(struct mem_cgroup *memcg,
+					    struct mem_cgroup *parent)
+{
+	int allowed = TIER_ALL_MASK;
+
+	if (parent)
+		allowed = parent->tier_effective_mask;
+
+	memcg->tier_effective_mask = allowed & memcg->tier_mask;
+}
+
+/**
+ * swap_tiers_memcg_inherit_mask - derive a memcg's effective tier mask
+ * @memcg: cgroup whose tier_effective_mask is recomputed
+ * @parent: parent cgroup, or NULL for no parent restriction
+ *
+ * Locked wrapper around __swap_tier_memcg_inherit_mask(): takes
+ * swap_tier_lock for the duration of the update.
+ */
+void swap_tiers_memcg_inherit_mask(struct mem_cgroup *memcg,
+				   struct mem_cgroup *parent)
+{
+	spin_lock(&swap_tier_lock);
+	__swap_tier_memcg_inherit_mask(memcg, parent);
+	spin_unlock(&swap_tier_lock);
+}
+
+/**
+ * __swap_tiers_memcg_sync_mask - refresh effective tier masks of a subtree
+ * @memcg: root of the subtree to refresh
+ *
+ * The caller must hold swap_tier_lock.  Does nothing when @memcg is
+ * root_mem_cgroup.  Otherwise every cgroup visited by
+ * for_each_mem_cgroup_tree() has its effective mask recomputed from its
+ * own parent.
+ *
+ * NOTE(review): this presumably relies on the walk visiting a parent
+ * before its children so updates propagate downwards - confirm against
+ * the iterator.
+ */
+void __swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *pos;
+
+	lockdep_assert_held(&swap_tier_lock);
+
+	if (memcg != root_mem_cgroup) {
+		for_each_mem_cgroup_tree(pos, memcg) {
+			struct mem_cgroup *parent = parent_mem_cgroup(pos);
+
+			__swap_tier_memcg_inherit_mask(pos, parent);
+		}
+	}
+}
+
+/**
+ * swap_tiers_memcg_sync_mask - reset a memcg's tiers and resync its subtree
+ * @memcg: cgroup to reset
+ *
+ * Despite the name this is not a pure sync: under swap_tier_lock it first
+ * resets @memcg's configured mask to TIER_ALL_MASK and then refreshes the
+ * effective masks via __swap_tiers_memcg_sync_mask().
+ */
+void swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg)
+{
+	spin_lock(&swap_tier_lock);
+
+	memcg->tier_mask = TIER_ALL_MASK;
+	__swap_tiers_memcg_sync_mask(memcg);
+
+	spin_unlock(&swap_tier_lock);
+}
diff --git a/mm/swap_tier.h b/mm/swap_tier.h
index de81d540e3b5..8652a7f993ab 100644
--- a/mm/swap_tier.h
+++ b/mm/swap_tier.h
@@ -46,4 +46,25 @@ bool swap_tiers_update(void);
/* Tier assignment */
void swap_tiers_assign_dev(struct swap_info_struct *swp);
+/* Memcg related functions */
+void swap_tiers_mask_show(struct seq_file *m, int mask);
+void swap_tiers_memcg_inherit_mask(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent);
+void swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg);
+void __swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg);
+
+/* Mask and tier lookup */
+int swap_tiers_mask_lookup(const char *name);
+
+/**
+ * swap_tiers_mask_test - Test whether two tier masks overlap
+ * @tier_mask: The tier mask to check
+ * @mask: The mask to compare against
+ *
+ * Return: true if @tier_mask and @mask have at least one bit in common,
+ * false otherwise
+ */
+static inline bool swap_tiers_mask_test(int tier_mask, int mask)
+{
+	return (tier_mask & mask) != 0;
+}
#endif /* _SWAP_TIER_H */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4f8ce021c5bd..dd97e850ea2c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1348,10 +1348,15 @@ static bool swap_alloc_fast(struct folio *folio)
static void swap_alloc_slow(struct folio *folio)
{
struct swap_info_struct *si, *next;
+ int mask = folio_memcg(folio) ?
+ folio_memcg(folio)->tier_effective_mask : TIER_ALL_MASK;
spin_lock(&swap_avail_lock);
start_over:
plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+ if (!swap_tiers_mask_test(si->tier_mask, mask))
+ continue;
+
/* Rotate the device and switch to a new cluster */
plist_requeue(&si->avail_list, &swap_avail_head);
spin_unlock(&swap_avail_lock);
--
2.34.1
next prev parent reply other threads:[~2026-01-26 6:53 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-26 6:52 [RFC PATCH v2 0/5] mm/swap, memcg: Introduce swap tiers for cgroup based swap control Youngjun Park
2026-01-26 6:52 ` [RFC PATCH v2 v2 1/5] mm: swap: introduce swap tier infrastructure Youngjun Park
2026-02-12 9:07 ` Chris Li
2026-02-13 2:18 ` YoungJun Park
2026-02-13 14:33 ` YoungJun Park
2026-01-26 6:52 ` [RFC PATCH v2 v2 2/5] mm: swap: associate swap devices with tiers Youngjun Park
2026-01-26 6:52 ` Youngjun Park [this message]
2026-01-26 6:52 ` [RFC PATCH v2 v2 4/5] mm, swap: change back to use each swap device's percpu cluster Youngjun Park
2026-02-12 7:37 ` Chris Li
2026-01-26 6:52 ` [RFC PATCH v2 v2 5/5] mm, swap: introduce percpu swap device cache to avoid fragmentation Youngjun Park
2026-02-12 6:12 ` [RFC PATCH v2 0/5] mm/swap, memcg: Introduce swap tiers for cgroup based swap control Chris Li
2026-02-12 9:22 ` Chris Li
2026-02-13 2:26 ` YoungJun Park
2026-02-13 1:59 ` YoungJun Park
2026-02-12 17:57 ` Nhat Pham
2026-02-12 17:58 ` Nhat Pham
2026-02-13 2:43 ` YoungJun Park
2026-02-12 18:33 ` Shakeel Butt
2026-02-13 3:58 ` YoungJun Park
2026-02-21 3:47 ` Shakeel Butt
2026-02-21 6:07 ` Chris Li
2026-02-21 17:44 ` Shakeel Butt
2026-02-22 1:16 ` YoungJun Park
2026-02-21 14:30 ` YoungJun Park
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260126065242.1221862-4-youngjun.park@lge.com \
--to=youngjun.park@lge.com \
--cc=akpm@linux-foundation.org \
--cc=austin.kim@lge.com \
--cc=baohua@kernel.org \
--cc=bhe@redhat.com \
--cc=chrisl@kernel.org \
--cc=gunho.lee@lge.com \
--cc=hannes@cmpxchg.org \
--cc=kasong@tencent.com \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=muchun.song@linux.dev \
--cc=nphamcs@gmail.com \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=shikemeng@huaweicloud.com \
--cc=taejoon.song@lge.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox