From: Chris Li <chrisl@kernel.org>
To: kasong@tencent.com
Cc: linux-mm@kvack.org, Andrew Morton <akpm@linux-foundation.org>,
Kemeng Shi <shikemeng@huaweicloud.com>,
Nhat Pham <nphamcs@gmail.com>, Baoquan He <bhe@redhat.com>,
Barry Song <baohua@kernel.org>,
Johannes Weiner <hannes@cmpxchg.org>,
David Hildenbrand <david@kernel.org>,
Lorenzo Stoakes <lorenzo.stoakes@oracle.com>,
Youngjun Park <youngjun.park@lge.com>,
linux-kernel@vger.kernel.org
Subject: Re: [PATCH v3 02/12] mm, swap: clean up swapon process and locking
Date: Wed, 18 Feb 2026 22:45:50 -0800 [thread overview]
Message-ID: <CACePvbXnEU+VsE_wWMfaPsQCAmbZiFJjddsHdhLDnbGL_1P8WA@mail.gmail.com> (raw)
In-Reply-To: <20260218-swap-table-p3-v3-2-f4e34be021a7@tencent.com>
Acked-by: Chris Li <chrisl@kernel.org>
On Tue, Feb 17, 2026 at 12:06 PM Kairui Song via B4 Relay
<devnull+kasong.tencent.com@kernel.org> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Slightly clean up the swapon process. Add comments about what swap_lock
> protects, introduce and rename helpers that wrap swap_map and
> cluster_info setup, and do that setup outside of swap_lock.
>
> This lock protection is not needed for swap_map and cluster_info setup
> because all swap users must either hold the percpu ref or hold a stable
> allocated swap entry (e.g., locking a folio in the swap cache) before
> accessing. So before the swap device is exposed by enable_swap_info,
> nothing would use the swap device's map or cluster.
>
> So we are safe to allocate and set up swap data freely first, then
> expose the swap device and set the SWP_WRITEOK flag.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
> mm/swapfile.c | 87 ++++++++++++++++++++++++++++++++---------------------------
> 1 file changed, 48 insertions(+), 39 deletions(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 25dfe992538d..8fc35b316ade 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -65,6 +65,13 @@ static void move_cluster(struct swap_info_struct *si,
> struct swap_cluster_info *ci, struct list_head *list,
> enum swap_cluster_flags new_flags);
>
> +/*
> + * Protects the swap_info array, and the SWP_USED flag. swap_info contains
> + * lazily allocated & freed swap device info struts, and SWP_USED indicates
Is "struts" a typo for "struct"?
Chris
> + * which device is used; ~SWP_USED devices can be reused.
> + *
> + * Also protects swap_active_head, total_swap_pages, and the SWP_WRITEOK flag.
> + */
> static DEFINE_SPINLOCK(swap_lock);
> static unsigned int nr_swapfiles;
> atomic_long_t nr_swap_pages;
> @@ -2657,8 +2664,6 @@ static int setup_swap_extents(struct swap_info_struct *sis,
> }
>
> static void setup_swap_info(struct swap_info_struct *si, int prio,
> - unsigned char *swap_map,
> - struct swap_cluster_info *cluster_info,
> unsigned long *zeromap)
> {
> si->prio = prio;
> @@ -2668,8 +2673,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
> */
> si->list.prio = -si->prio;
> si->avail_list.prio = -si->prio;
> - si->swap_map = swap_map;
> - si->cluster_info = cluster_info;
> si->zeromap = zeromap;
> }
>
> @@ -2687,13 +2690,11 @@ static void _enable_swap_info(struct swap_info_struct *si)
> }
>
> static void enable_swap_info(struct swap_info_struct *si, int prio,
> - unsigned char *swap_map,
> - struct swap_cluster_info *cluster_info,
> - unsigned long *zeromap)
> + unsigned long *zeromap)
> {
> spin_lock(&swap_lock);
> spin_lock(&si->lock);
> - setup_swap_info(si, prio, swap_map, cluster_info, zeromap);
> + setup_swap_info(si, prio, zeromap);
> spin_unlock(&si->lock);
> spin_unlock(&swap_lock);
> /*
> @@ -2711,7 +2712,7 @@ static void reinsert_swap_info(struct swap_info_struct *si)
> {
> spin_lock(&swap_lock);
> spin_lock(&si->lock);
> - setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap);
> + setup_swap_info(si, si->prio, si->zeromap);
> _enable_swap_info(si);
> spin_unlock(&si->lock);
> spin_unlock(&swap_lock);
> @@ -2735,8 +2736,8 @@ static void wait_for_allocation(struct swap_info_struct *si)
> }
> }
>
> -static void free_cluster_info(struct swap_cluster_info *cluster_info,
> - unsigned long maxpages)
> +static void free_swap_cluster_info(struct swap_cluster_info *cluster_info,
> + unsigned long maxpages)
> {
> struct swap_cluster_info *ci;
> int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
> @@ -2894,7 +2895,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
> p->global_cluster = NULL;
> vfree(swap_map);
> kvfree(zeromap);
> - free_cluster_info(cluster_info, maxpages);
> + free_swap_cluster_info(cluster_info, maxpages);
> /* Destroy swap account information */
> swap_cgroup_swapoff(p->type);
>
> @@ -3243,10 +3244,15 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>
> static int setup_swap_map(struct swap_info_struct *si,
> union swap_header *swap_header,
> - unsigned char *swap_map,
> unsigned long maxpages)
> {
> unsigned long i;
> + unsigned char *swap_map;
> +
> + swap_map = vzalloc(maxpages);
> + si->swap_map = swap_map;
> + if (!swap_map)
> + return -ENOMEM;
>
> swap_map[0] = SWAP_MAP_BAD; /* omit header page */
> for (i = 0; i < swap_header->info.nr_badpages; i++) {
> @@ -3267,9 +3273,9 @@ static int setup_swap_map(struct swap_info_struct *si,
> return 0;
> }
>
> -static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
> - union swap_header *swap_header,
> - unsigned long maxpages)
> +static int setup_swap_clusters_info(struct swap_info_struct *si,
> + union swap_header *swap_header,
> + unsigned long maxpages)
> {
> unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
> struct swap_cluster_info *cluster_info;
> @@ -3339,10 +3345,11 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
> }
> }
>
> - return cluster_info;
> + si->cluster_info = cluster_info;
> + return 0;
> err:
> - free_cluster_info(cluster_info, maxpages);
> - return ERR_PTR(err);
> + free_swap_cluster_info(cluster_info, maxpages);
> + return err;
> }
>
> SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
> @@ -3358,9 +3365,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
> int nr_extents;
> sector_t span;
> unsigned long maxpages;
> - unsigned char *swap_map = NULL;
> unsigned long *zeromap = NULL;
> - struct swap_cluster_info *cluster_info = NULL;
> struct folio *folio = NULL;
> struct inode *inode = NULL;
> bool inced_nr_rotate_swap = false;
> @@ -3371,6 +3376,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
> if (!capable(CAP_SYS_ADMIN))
> return -EPERM;
>
> + /*
> + * Allocate or reuse existing !SWP_USED swap_info. The returned
> + * si will stay in a dying status, so nothing will access its content
> + * until enable_swap_info resurrects its percpu ref and exposes it.
> + */
> si = alloc_swap_info();
> if (IS_ERR(si))
> return PTR_ERR(si);
> @@ -3453,18 +3463,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>
> maxpages = si->max;
>
> - /* OK, set up the swap map and apply the bad block list */
> - swap_map = vzalloc(maxpages);
> - if (!swap_map) {
> - error = -ENOMEM;
> + /* Set up the swap map and apply the bad block list */
> + error = setup_swap_map(si, swap_header, maxpages);
> + if (error)
> goto bad_swap_unlock_inode;
> - }
>
> - error = swap_cgroup_swapon(si->type, maxpages);
> + /* Set up the swap cluster info */
> + error = setup_swap_clusters_info(si, swap_header, maxpages);
> if (error)
> goto bad_swap_unlock_inode;
>
> - error = setup_swap_map(si, swap_header, swap_map, maxpages);
> + error = swap_cgroup_swapon(si->type, maxpages);
> if (error)
> goto bad_swap_unlock_inode;
>
> @@ -3492,13 +3501,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
> inced_nr_rotate_swap = true;
> }
>
> - cluster_info = setup_clusters(si, swap_header, maxpages);
> - if (IS_ERR(cluster_info)) {
> - error = PTR_ERR(cluster_info);
> - cluster_info = NULL;
> - goto bad_swap_unlock_inode;
> - }
> -
> if ((swap_flags & SWAP_FLAG_DISCARD) &&
> si->bdev && bdev_max_discard_sectors(si->bdev)) {
> /*
> @@ -3551,7 +3553,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
> prio = swap_flags & SWAP_FLAG_PRIO_MASK;
>
> si->swap_file = swap_file;
> - enable_swap_info(si, prio, swap_map, cluster_info, zeromap);
> +
> + /* Set SWP_WRITEOK, resurrect the percpu ref, and expose the swap device */
> + enable_swap_info(si, prio, zeromap);
>
> pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n",
> K(si->pages), name->name, si->prio, nr_extents,
> @@ -3577,13 +3581,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
> inode = NULL;
> destroy_swap_extents(si, swap_file);
> swap_cgroup_swapoff(si->type);
> + vfree(si->swap_map);
> + si->swap_map = NULL;
> + free_swap_cluster_info(si->cluster_info, si->max);
> + si->cluster_info = NULL;
> + /*
> + * Clear the SWP_USED flag after all resources are freed so
> + * alloc_swap_info can reuse this si safely.
> + */
> spin_lock(&swap_lock);
> si->flags = 0;
> spin_unlock(&swap_lock);
> - vfree(swap_map);
> kvfree(zeromap);
> - if (cluster_info)
> - free_cluster_info(cluster_info, maxpages);
> if (inced_nr_rotate_swap)
> atomic_dec(&nr_rotate_swap);
> if (swap_file)
>
> --
> 2.52.0
>
>
next prev parent reply other threads:[~2026-02-19 6:46 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-17 20:06 [PATCH v3 00/12] mm, swap: swap table phase III: remove swap_map Kairui Song via B4 Relay
2026-02-17 20:06 ` [PATCH v3 01/12] mm, swap: protect si->swap_file properly and use as a mount indicator Kairui Song via B4 Relay
2026-02-19 6:36 ` Chris Li
2026-02-17 20:06 ` [PATCH v3 02/12] mm, swap: clean up swapon process and locking Kairui Song via B4 Relay
2026-02-19 6:45 ` Chris Li [this message]
2026-02-17 20:06 ` [PATCH v3 03/12] mm, swap: remove redundant arguments and locking for enabling a device Kairui Song via B4 Relay
2026-02-19 6:48 ` Chris Li
2026-02-17 20:06 ` [PATCH v3 04/12] mm, swap: consolidate bad slots setup and make it more robust Kairui Song via B4 Relay
2026-02-19 6:51 ` Chris Li
2026-02-17 20:06 ` [PATCH v3 05/12] mm/workingset: leave highest bits empty for anon shadow Kairui Song via B4 Relay
2026-02-19 6:56 ` Chris Li
2026-02-17 20:06 ` [PATCH v3 06/12] mm, swap: implement helpers for reserving data in the swap table Kairui Song via B4 Relay
2026-02-19 7:00 ` Chris Li
2026-02-17 20:06 ` [PATCH v3 07/12] mm, swap: mark bad slots in swap table directly Kairui Song via B4 Relay
2026-02-19 7:01 ` Chris Li
2026-02-17 20:06 ` [PATCH v3 08/12] mm, swap: simplify swap table sanity range check Kairui Song via B4 Relay
2026-02-19 7:02 ` Chris Li
2026-02-17 20:06 ` [PATCH v3 09/12] mm, swap: use the swap table to track the swap count Kairui Song via B4 Relay
2026-02-18 10:40 ` kernel test robot
2026-02-18 12:22 ` Kairui Song
2026-02-19 7:06 ` Chris Li
2026-02-17 20:06 ` [PATCH v3 10/12] mm, swap: no need to truncate the scan border Kairui Song via B4 Relay
2026-02-19 7:10 ` Chris Li
2026-02-17 20:06 ` [PATCH v3 11/12] mm, swap: simplify checking if a folio is swapped Kairui Song via B4 Relay
2026-02-19 7:18 ` Chris Li
2026-02-17 20:06 ` [PATCH v3 12/12] mm, swap: no need to clear the shadow explicitly Kairui Song via B4 Relay
2026-02-19 7:19 ` Chris Li
2026-02-17 20:10 ` [PATCH v3 00/12] mm, swap: swap table phase III: remove swap_map Kairui Song
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CACePvbXnEU+VsE_wWMfaPsQCAmbZiFJjddsHdhLDnbGL_1P8WA@mail.gmail.com \
--to=chrisl@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=baohua@kernel.org \
--cc=bhe@redhat.com \
--cc=david@kernel.org \
--cc=hannes@cmpxchg.org \
--cc=kasong@tencent.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=nphamcs@gmail.com \
--cc=shikemeng@huaweicloud.com \
--cc=youngjun.park@lge.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox