linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Youngjun Park <youngjun.park@lge.com>
To: Andrew Morton <akpm@linux-foundation.org>, linux-mm@kvack.org
Cc: Chris Li <chrisl@kernel.org>, Kairui Song <kasong@tencent.com>,
	Kemeng Shi <shikemeng@huaweicloud.com>,
	Nhat Pham <nphamcs@gmail.com>, Baoquan He <bhe@redhat.com>,
	Barry Song <baohua@kernel.org>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Michal Hocko <mhocko@kernel.org>,
	Roman Gushchin <roman.gushchin@linux.dev>,
	Shakeel Butt <shakeel.butt@linux.dev>,
	Muchun Song <muchun.song@linux.dev>,
	gunho.lee@lge.com, taejoon.song@lge.com, austin.kim@lge.com,
	youngjun.park@lge.com
Subject: [RFC PATCH v2 5/5] mm, swap: introduce percpu swap device cache to avoid fragmentation
Date: Mon, 26 Jan 2026 15:52:42 +0900	[thread overview]
Message-ID: <20260126065242.1221862-6-youngjun.park@lge.com> (raw)
In-Reply-To: <20260126065242.1221862-1-youngjun.park@lge.com>

When using per-device percpu clusters (instead of a global one),
a naive allocation logic triggers swap device rotation on every
allocation. This behavior leads to severe fragmentation and performance
regression.

To address this, this patch introduces a per-cpu cache for the swap
device. The allocation logic is updated to prioritize the per-cpu
cluster within the cached swap device, effectively restoring the
traditional fastpath and slowpath flow. This approach minimizes side
effects on the existing fastpath.

With this change, swap device rotation occurs only when the current
cached device is unable to satisfy the allocation, rather than on
every attempt.

Signed-off-by: Youngjun Park <youngjun.park@lge.com>
---
 include/linux/swap.h |  1 -
 mm/swapfile.c        | 78 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 66 insertions(+), 13 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 6921e22b14d3..ac634a21683a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -253,7 +253,6 @@ enum {
   * throughput.
   */
 struct percpu_cluster {
-	local_lock_t lock; /* Protect the percpu_cluster above */
 	unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
 };
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5e3b87799440..0dcd451afee5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -106,6 +106,16 @@ PLIST_HEAD(swap_active_head);
 static PLIST_HEAD(swap_avail_head);
 static DEFINE_SPINLOCK(swap_avail_lock);
 
+struct percpu_swap_device {
+	struct swap_info_struct *si[SWAP_NR_ORDERS];
+	local_lock_t lock;
+};
+
+static DEFINE_PER_CPU(struct percpu_swap_device, percpu_swap_device) = {
+	.si = { NULL },
+	.lock = INIT_LOCAL_LOCK(),
+};
+
 struct swap_info_struct *swap_info[MAX_SWAPFILES];
 
 static struct kmem_cache *swap_table_cachep;
@@ -465,7 +475,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
 	 * Swap allocator uses percpu clusters and holds the local lock.
 	 */
 	lockdep_assert_held(&ci->lock);
-	lockdep_assert_held(this_cpu_ptr(&si->percpu_cluster->lock));
+	lockdep_assert_held(this_cpu_ptr(&percpu_swap_device.lock));
 
 	/* The cluster must be free and was just isolated from the free list. */
 	VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
@@ -484,7 +494,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
 	spin_unlock(&ci->lock);
 	if (!(si->flags & SWP_SOLIDSTATE))
 		spin_unlock(&si->global_cluster->lock);
-	local_unlock(&si->percpu_cluster->lock);
+	local_unlock(&percpu_swap_device.lock);
 
 	table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL);
 
@@ -496,7 +506,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
 	 * could happen with ignoring the percpu cluster is fragmentation,
 	 * which is acceptable since this fallback and race is rare.
 	 */
-	local_lock(&si->percpu_cluster->lock);
+	local_lock(&percpu_swap_device.lock);
 	if (!(si->flags & SWP_SOLIDSTATE))
 		spin_lock(&si->global_cluster->lock);
 	spin_lock(&ci->lock);
@@ -941,9 +951,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
 out:
 	relocate_cluster(si, ci);
 	swap_cluster_unlock(ci);
-	if (si->flags & SWP_SOLIDSTATE)
+	if (si->flags & SWP_SOLIDSTATE) {
 		this_cpu_write(si->percpu_cluster->next[order], next);
-	else
+		this_cpu_write(percpu_swap_device.si[order], si);
+	} else
 		si->global_cluster->next[order] = next;
 
 	return found;
@@ -1041,7 +1052,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
 
 	if (si->flags & SWP_SOLIDSTATE) {
 		/* Fast path using per CPU cluster */
-		local_lock(&si->percpu_cluster->lock);
 		offset = __this_cpu_read(si->percpu_cluster->next[order]);
 	} else {
 		/* Serialize HDD SWAP allocation for each device. */
@@ -1119,9 +1129,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si,
 			goto done;
 	}
 done:
-	if (si->flags & SWP_SOLIDSTATE)
-		local_unlock(&si->percpu_cluster->lock);
-	else
+	if (!(si->flags & SWP_SOLIDSTATE))
 		spin_unlock(&si->global_cluster->lock);
 
 	return found;
@@ -1303,8 +1311,27 @@ static bool get_swap_device_info(struct swap_info_struct *si)
 	return true;
 }
 
+static bool swap_alloc_fast(struct folio *folio)
+{
+	unsigned int order = folio_order(folio);
+	struct swap_info_struct *si;
+
+	/*
+	 * Once allocated, swap_info_struct will never be completely freed,
+	 * so checking its liveness by get_swap_device_info is enough.
+	 */
+	si = this_cpu_read(percpu_swap_device.si[order]);
+	if (!si || !get_swap_device_info(si))
+		return false;
+
+	cluster_alloc_swap_entry(si, folio);
+	put_swap_device(si);
+
+	return folio_test_swapcache(folio);
+}
+
 /* Rotate the device and switch to a new cluster */
-static void swap_alloc_entry(struct folio *folio)
+static void swap_alloc_slow(struct folio *folio)
 {
 	struct swap_info_struct *si, *next;
 	int mask = folio_memcg(folio) ?
@@ -1482,7 +1509,11 @@ int folio_alloc_swap(struct folio *folio)
 	}
 
 again:
-	swap_alloc_entry(folio);
+	local_lock(&percpu_swap_device.lock);
+	if (!swap_alloc_fast(folio))
+		swap_alloc_slow(folio);
+	local_unlock(&percpu_swap_device.lock);
+
 	if (!order && unlikely(!folio_test_swapcache(folio))) {
 		if (swap_sync_discard())
 			goto again;
@@ -1901,7 +1932,9 @@ swp_entry_t swap_alloc_hibernation_slot(int type)
 			 * Grab the local lock to be compliant
 			 * with swap table allocation.
 			 */
+			local_lock(&percpu_swap_device.lock);
 			offset = cluster_alloc_swap_entry(si, NULL);
+			local_unlock(&percpu_swap_device.lock);
 			if (offset)
 				entry = swp_entry(si->type, offset);
 		}
@@ -2705,6 +2738,27 @@ static void free_cluster_info(struct swap_cluster_info *cluster_info,
 	kvfree(cluster_info);
 }
 
+/*
+ * Called after swap device's reference count is dead, so
+ * neither scan nor allocation will use it.
+ */
+static void flush_percpu_swap_device(struct swap_info_struct *si)
+{
+	int cpu, i;
+	struct swap_info_struct **pcp_si;
+
+	for_each_possible_cpu(cpu) {
+		pcp_si = per_cpu_ptr(percpu_swap_device.si, cpu);
+		/*
+		 * Invalidate the percpu swap device cache, si->users
+		 * is dead, so no new user will point to it, just flush
+		 * any existing user.
+		 */
+		for (i = 0; i < SWAP_NR_ORDERS; i++)
+			cmpxchg(&pcp_si[i], si, NULL);
+	}
+}
+
 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
 	struct swap_info_struct *p = NULL;
@@ -2788,6 +2842,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
 	flush_work(&p->discard_work);
 	flush_work(&p->reclaim_work);
+	flush_percpu_swap_device(p);
 
 	destroy_swap_extents(p);
 	if (p->flags & SWP_CONTINUED)
@@ -3222,7 +3277,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 			cluster = per_cpu_ptr(si->percpu_cluster, cpu);
 			for (i = 0; i < SWAP_NR_ORDERS; i++)
 				cluster->next[i] = SWAP_ENTRY_INVALID;
-			local_lock_init(&cluster->lock);
 		}
 	} else {
 		si->global_cluster = kmalloc(sizeof(*si->global_cluster),
-- 
2.34.1



  parent reply	other threads:[~2026-01-26  7:08 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-26  6:52 [RFC PATCH v2 0/5] mm/swap, memcg: Introduce swap tiers for cgroup based swap control Youngjun Park
2026-01-26  6:52 ` [RFC PATCH v2 v2 1/5] mm: swap: introduce swap tier infrastructure Youngjun Park
2026-02-12  9:07   ` Chris Li
2026-02-13  2:18     ` YoungJun Park
2026-02-13 14:33     ` YoungJun Park
2026-01-26  6:52 ` [RFC PATCH v2 v2 2/5] mm: swap: associate swap devices with tiers Youngjun Park
2026-01-26  6:52 ` [RFC PATCH v2 v2 3/5] mm: memcontrol: add interface for swap tier selection Youngjun Park
2026-01-26  6:52 ` [RFC PATCH v2 v2 4/5] mm, swap: change back to use each swap device's percpu cluster Youngjun Park
2026-02-12  7:37   ` Chris Li
2026-01-26  6:52 ` Youngjun Park [this message]
2026-02-12  6:12 ` [RFC PATCH v2 0/5] mm/swap, memcg: Introduce swap tiers for cgroup based swap control Chris Li
2026-02-12  9:22   ` Chris Li
2026-02-13  2:26     ` YoungJun Park
2026-02-13  1:59   ` YoungJun Park
2026-02-12 17:57 ` Nhat Pham
2026-02-12 17:58   ` Nhat Pham
2026-02-13  2:43   ` YoungJun Park
2026-02-12 18:33 ` Shakeel Butt
2026-02-13  3:58   ` YoungJun Park
2026-02-21  3:47     ` Shakeel Butt
2026-02-21  6:07       ` Chris Li
2026-02-21 17:44         ` Shakeel Butt
2026-02-22  1:16           ` YoungJun Park
2026-02-21 14:30       ` YoungJun Park

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260126065242.1221862-6-youngjun.park@lge.com \
    --to=youngjun.park@lge.com \
    --cc=akpm@linux-foundation.org \
    --cc=austin.kim@lge.com \
    --cc=baohua@kernel.org \
    --cc=bhe@redhat.com \
    --cc=chrisl@kernel.org \
    --cc=gunho.lee@lge.com \
    --cc=hannes@cmpxchg.org \
    --cc=kasong@tencent.com \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=muchun.song@linux.dev \
    --cc=nphamcs@gmail.com \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    --cc=shikemeng@huaweicloud.com \
    --cc=taejoon.song@lge.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox