From: Kairui Song <ryncsn@gmail.com>
To: linux-mm@kvack.org
Cc: Andrew Morton <akpm@linux-foundation.org>,
Chris Li <chrisl@kernel.org>, Barry Song <v-songbaohua@oppo.com>,
Ryan Roberts <ryan.roberts@arm.com>,
Hugh Dickins <hughd@google.com>,
Yosry Ahmed <yosryahmed@google.com>,
"Huang, Ying" <ying.huang@linux.alibaba.com>,
Nhat Pham <nphamcs@gmail.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Kalesh Singh <kaleshsingh@google.com>,
linux-kernel@vger.kernel.org, Kairui Song <kasong@tencent.com>
Subject: [PATCH v3 12/13] mm, swap: use a global swap cluster for rotating devices
Date: Tue, 31 Dec 2024 01:46:20 +0800 [thread overview]
Message-ID: <20241230174621.61185-13-ryncsn@gmail.com> (raw)
In-Reply-To: <20241230174621.61185-1-ryncsn@gmail.com>
From: Kairui Song <kasong@tencent.com>
Non-rotational devices (SSD / ZRAM) can tolerate fragmentation, so the
goal of the SWAP allocator is to avoid contention for clusters. It uses
a per-CPU cluster design, and each CPU will use a different cluster as
much as possible.
However, HDDs are very sensitive to fragmentation; contention is trivial
in comparison. Therefore, we use one global cluster instead. This ensures
that each order will be written to the same cluster as much as possible,
which helps make the I/O more continuous.
This ensures that the performance of the cluster allocator is as good as
that of the old allocator. Tests after this commit compared to those
before this series:
Tested using 'make -j32' with tinyconfig, a 1G memcg limit, and HDD swap:
Before this series:
114.44user 29.11system 39:42.90elapsed 6%CPU (0avgtext+0avgdata 157284maxresident)k
2901232inputs+0outputs (238877major+4227640minor)pagefaults
After this commit:
113.90user 23.81system 38:11.77elapsed 6%CPU (0avgtext+0avgdata 157260maxresident)k
2548728inputs+0outputs (235471major+4238110minor)pagefaults
Suggested-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Kairui Song <kasong@tencent.com>
---
include/linux/swap.h | 2 ++
mm/swapfile.c | 51 ++++++++++++++++++++++++++++++++------------
2 files changed, 39 insertions(+), 14 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4c1d2e69689f..b13b72645db3 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -318,6 +318,8 @@ struct swap_info_struct {
unsigned int pages; /* total of usable pages of swap */
atomic_long_t inuse_pages; /* number of those currently in use */
struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
+ struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */
+ spinlock_t global_cluster_lock; /* Serialize usage of global cluster */
struct rb_root swap_extent_root;/* root of the swap extent rbtree */
struct block_device *bdev; /* swap device or bdev of swap file */
struct file *swap_file; /* seldom referenced */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a3d1239d944b..e57e5453a25b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -814,7 +814,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
out:
relocate_cluster(si, ci);
unlock_cluster(ci);
- __this_cpu_write(si->percpu_cluster->next[order], next);
+ if (si->flags & SWP_SOLIDSTATE)
+ __this_cpu_write(si->percpu_cluster->next[order], next);
+ else
+ si->global_cluster->next[order] = next;
return found;
}
@@ -875,9 +878,16 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
struct swap_cluster_info *ci;
unsigned int offset, found = 0;
- /* Fast path using per CPU cluster */
- local_lock(&si->percpu_cluster->lock);
- offset = __this_cpu_read(si->percpu_cluster->next[order]);
+ if (si->flags & SWP_SOLIDSTATE) {
+ /* Fast path using per CPU cluster */
+ local_lock(&si->percpu_cluster->lock);
+ offset = __this_cpu_read(si->percpu_cluster->next[order]);
+ } else {
+ /* Serialize HDD SWAP allocation for each device. */
+ spin_lock(&si->global_cluster_lock);
+ offset = si->global_cluster->next[order];
+ }
+
if (offset) {
ci = lock_cluster(si, offset);
/* Cluster could have been used by another order */
@@ -972,8 +982,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
}
}
done:
- local_unlock(&si->percpu_cluster->lock);
-
+ if (si->flags & SWP_SOLIDSTATE)
+ local_unlock(&si->percpu_cluster->lock);
+ else
+ spin_unlock(&si->global_cluster_lock);
return found;
}
@@ -2778,6 +2790,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
+ kfree(p->global_cluster);
+ p->global_cluster = NULL;
vfree(swap_map);
kvfree(zeromap);
kvfree(cluster_info);
@@ -3183,17 +3197,24 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- si->percpu_cluster = alloc_percpu(struct percpu_cluster);
- if (!si->percpu_cluster)
- goto err_free;
+ if (si->flags & SWP_SOLIDSTATE) {
+ si->percpu_cluster = alloc_percpu(struct percpu_cluster);
+ if (!si->percpu_cluster)
+ goto err_free;
- for_each_possible_cpu(cpu) {
- struct percpu_cluster *cluster;
+ for_each_possible_cpu(cpu) {
+ struct percpu_cluster *cluster;
- cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+ cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cluster->next[i] = SWAP_ENTRY_INVALID;
+ local_lock_init(&cluster->lock);
+ }
+ } else {
+ si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL);
for (i = 0; i < SWAP_NR_ORDERS; i++)
- cluster->next[i] = SWAP_ENTRY_INVALID;
- local_lock_init(&cluster->lock);
+ si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
+ spin_lock_init(&si->global_cluster_lock);
}
/*
@@ -3467,6 +3488,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
bad_swap:
free_percpu(si->percpu_cluster);
si->percpu_cluster = NULL;
+ kfree(si->global_cluster);
+ si->global_cluster = NULL;
inode = NULL;
destroy_swap_extents(si);
swap_cgroup_swapoff(si->type);
--
2.47.1
next prev parent reply other threads:[~2024-12-30 17:47 UTC|newest]
Thread overview: 35+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-12-30 17:46 [PATCH v3 00/13] mm, swap: rework of swap allocator locks Kairui Song
2024-12-30 17:46 ` [PATCH v3 01/13] mm, swap: minor clean up for swap entry allocation Kairui Song
2025-01-09 4:04 ` Baoquan He
2024-12-30 17:46 ` [PATCH v3 02/13] mm, swap: fold swap_info_get_cont in the only caller Kairui Song
2025-01-09 4:05 ` Baoquan He
2024-12-30 17:46 ` [PATCH v3 03/13] mm, swap: remove old allocation path for HDD Kairui Song
2025-01-09 4:06 ` Baoquan He
2024-12-30 17:46 ` [PATCH v3 04/13] mm, swap: use cluster lock " Kairui Song
2025-01-09 4:07 ` Baoquan He
2024-12-30 17:46 ` [PATCH v3 05/13] mm, swap: clean up device availability check Kairui Song
2025-01-09 4:08 ` Baoquan He
2024-12-30 17:46 ` [PATCH v3 06/13] mm, swap: clean up plist removal and adding Kairui Song
2025-01-02 8:59 ` Baoquan He
2025-01-03 8:07 ` Kairui Song
2024-12-30 17:46 ` [PATCH v3 07/13] mm, swap: hold a reference during scan and cleanup flag usage Kairui Song
2025-01-04 5:46 ` Baoquan He
2025-01-13 5:34 ` Kairui Song
2025-01-20 2:39 ` Baoquan He
2025-01-27 9:19 ` Kairui Song
2025-02-05 9:18 ` Baoquan He
2024-12-30 17:46 ` [PATCH v3 08/13] mm, swap: use an enum to define all cluster flags and wrap flags changes Kairui Song
2025-01-06 8:43 ` Baoquan He
2025-01-13 5:49 ` Kairui Song
2024-12-30 17:46 ` [PATCH v3 09/13] mm, swap: reduce contention on device lock Kairui Song
2025-01-06 10:12 ` Baoquan He
2025-01-08 11:09 ` Baoquan He
2025-01-09 2:15 ` Kairui Song
2025-01-10 11:23 ` Baoquan He
2025-01-13 6:33 ` Kairui Song
2025-01-13 8:07 ` Kairui Song
2024-12-30 17:46 ` [PATCH v3 10/13] mm, swap: simplify percpu cluster updating Kairui Song
2025-01-09 2:07 ` Baoquan He
2024-12-30 17:46 ` [PATCH v3 11/13] mm, swap: introduce a helper for retrieving cluster from offset Kairui Song
2024-12-30 17:46 ` Kairui Song [this message]
2024-12-30 17:46 ` [PATCH v3 13/13] mm, swap_slots: remove slot cache for freeing path Kairui Song
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241230174621.61185-13-ryncsn@gmail.com \
--to=ryncsn@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=chrisl@kernel.org \
--cc=hannes@cmpxchg.org \
--cc=hughd@google.com \
--cc=kaleshsingh@google.com \
--cc=kasong@tencent.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=nphamcs@gmail.com \
--cc=ryan.roberts@arm.com \
--cc=v-songbaohua@oppo.com \
--cc=ying.huang@linux.alibaba.com \
--cc=yosryahmed@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox