linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Kairui Song via B4 Relay <devnull+kasong.tencent.com@kernel.org>
To: linux-mm@kvack.org
Cc: Andrew Morton <akpm@linux-foundation.org>,
	 Kemeng Shi <shikemeng@huaweicloud.com>,
	Nhat Pham <nphamcs@gmail.com>,  Baoquan He <bhe@redhat.com>,
	Barry Song <baohua@kernel.org>,
	 Johannes Weiner <hannes@cmpxchg.org>,
	David Hildenbrand <david@kernel.org>,
	 Lorenzo Stoakes <lorenzo.stoakes@oracle.com>,
	 Youngjun Park <youngjun.park@lge.com>,
	linux-kernel@vger.kernel.org,  Chris Li <chrisl@kernel.org>,
	Kairui Song <kasong@tencent.com>
Subject: [PATCH v3 02/12] mm, swap: clean up swapon process and locking
Date: Wed, 18 Feb 2026 04:06:27 +0800	[thread overview]
Message-ID: <20260218-swap-table-p3-v3-2-f4e34be021a7@tencent.com> (raw)
In-Reply-To: <20260218-swap-table-p3-v3-0-f4e34be021a7@tencent.com>

From: Kairui Song <kasong@tencent.com>

Slightly clean up the swapon process. Add comments about what swap_lock
protects, introduce and rename helpers that wrap swap_map and
cluster_info setup, and do it outside of the swap_lock lock.

This lock protection is not needed for swap_map and cluster_info setup
because all swap users must either hold the percpu ref or hold a stable
allocated swap entry (e.g., locking a folio in the swap cache) before
accessing. So before the swap device is exposed by enable_swap_info,
nothing would use the swap device's map or cluster.

So we are safe to allocate and set up swap data freely first, then
expose the swap device and set the SWP_WRITEOK flag.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/swapfile.c | 87 ++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 48 insertions(+), 39 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 25dfe992538d..8fc35b316ade 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -65,6 +65,13 @@ static void move_cluster(struct swap_info_struct *si,
 			 struct swap_cluster_info *ci, struct list_head *list,
 			 enum swap_cluster_flags new_flags);
 
+/*
+ * Protects the swap_info array, and the SWP_USED flag. swap_info contains
+ * lazily allocated & freed swap device info struts, and SWP_USED indicates
+ * which device is used, ~SWP_USED devices and can be reused.
+ *
+ * Also protects swap_active_head total_swap_pages, and the SWP_WRITEOK flag.
+ */
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 atomic_long_t nr_swap_pages;
@@ -2657,8 +2664,6 @@ static int setup_swap_extents(struct swap_info_struct *sis,
 }
 
 static void setup_swap_info(struct swap_info_struct *si, int prio,
-			    unsigned char *swap_map,
-			    struct swap_cluster_info *cluster_info,
 			    unsigned long *zeromap)
 {
 	si->prio = prio;
@@ -2668,8 +2673,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
 	 */
 	si->list.prio = -si->prio;
 	si->avail_list.prio = -si->prio;
-	si->swap_map = swap_map;
-	si->cluster_info = cluster_info;
 	si->zeromap = zeromap;
 }
 
@@ -2687,13 +2690,11 @@ static void _enable_swap_info(struct swap_info_struct *si)
 }
 
 static void enable_swap_info(struct swap_info_struct *si, int prio,
-				unsigned char *swap_map,
-				struct swap_cluster_info *cluster_info,
-				unsigned long *zeromap)
+			     unsigned long *zeromap)
 {
 	spin_lock(&swap_lock);
 	spin_lock(&si->lock);
-	setup_swap_info(si, prio, swap_map, cluster_info, zeromap);
+	setup_swap_info(si, prio, zeromap);
 	spin_unlock(&si->lock);
 	spin_unlock(&swap_lock);
 	/*
@@ -2711,7 +2712,7 @@ static void reinsert_swap_info(struct swap_info_struct *si)
 {
 	spin_lock(&swap_lock);
 	spin_lock(&si->lock);
-	setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap);
+	setup_swap_info(si, si->prio, si->zeromap);
 	_enable_swap_info(si);
 	spin_unlock(&si->lock);
 	spin_unlock(&swap_lock);
@@ -2735,8 +2736,8 @@ static void wait_for_allocation(struct swap_info_struct *si)
 	}
 }
 
-static void free_cluster_info(struct swap_cluster_info *cluster_info,
-			      unsigned long maxpages)
+static void free_swap_cluster_info(struct swap_cluster_info *cluster_info,
+				   unsigned long maxpages)
 {
 	struct swap_cluster_info *ci;
 	int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
@@ -2894,7 +2895,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	p->global_cluster = NULL;
 	vfree(swap_map);
 	kvfree(zeromap);
-	free_cluster_info(cluster_info, maxpages);
+	free_swap_cluster_info(cluster_info, maxpages);
 	/* Destroy swap account information */
 	swap_cgroup_swapoff(p->type);
 
@@ -3243,10 +3244,15 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
 
 static int setup_swap_map(struct swap_info_struct *si,
 			  union swap_header *swap_header,
-			  unsigned char *swap_map,
 			  unsigned long maxpages)
 {
 	unsigned long i;
+	unsigned char *swap_map;
+
+	swap_map = vzalloc(maxpages);
+	si->swap_map = swap_map;
+	if (!swap_map)
+		return -ENOMEM;
 
 	swap_map[0] = SWAP_MAP_BAD; /* omit header page */
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
@@ -3267,9 +3273,9 @@ static int setup_swap_map(struct swap_info_struct *si,
 	return 0;
 }
 
-static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
-						union swap_header *swap_header,
-						unsigned long maxpages)
+static int setup_swap_clusters_info(struct swap_info_struct *si,
+				    union swap_header *swap_header,
+				    unsigned long maxpages)
 {
 	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 	struct swap_cluster_info *cluster_info;
@@ -3339,10 +3345,11 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 		}
 	}
 
-	return cluster_info;
+	si->cluster_info = cluster_info;
+	return 0;
 err:
-	free_cluster_info(cluster_info, maxpages);
-	return ERR_PTR(err);
+	free_swap_cluster_info(cluster_info, maxpages);
+	return err;
 }
 
 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
@@ -3358,9 +3365,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	int nr_extents;
 	sector_t span;
 	unsigned long maxpages;
-	unsigned char *swap_map = NULL;
 	unsigned long *zeromap = NULL;
-	struct swap_cluster_info *cluster_info = NULL;
 	struct folio *folio = NULL;
 	struct inode *inode = NULL;
 	bool inced_nr_rotate_swap = false;
@@ -3371,6 +3376,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	/*
+	 * Allocate or reuse existing !SWP_USED swap_info. The returned
+	 * si will stay in a dying status, so nothing will access its content
+	 * until enable_swap_info resurrects its percpu ref and expose it.
+	 */
 	si = alloc_swap_info();
 	if (IS_ERR(si))
 		return PTR_ERR(si);
@@ -3453,18 +3463,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	maxpages = si->max;
 
-	/* OK, set up the swap map and apply the bad block list */
-	swap_map = vzalloc(maxpages);
-	if (!swap_map) {
-		error = -ENOMEM;
+	/* Setup the swap map and apply bad block */
+	error = setup_swap_map(si, swap_header, maxpages);
+	if (error)
 		goto bad_swap_unlock_inode;
-	}
 
-	error = swap_cgroup_swapon(si->type, maxpages);
+	/* Set up the swap cluster info */
+	error = setup_swap_clusters_info(si, swap_header, maxpages);
 	if (error)
 		goto bad_swap_unlock_inode;
 
-	error = setup_swap_map(si, swap_header, swap_map, maxpages);
+	error = swap_cgroup_swapon(si->type, maxpages);
 	if (error)
 		goto bad_swap_unlock_inode;
 
@@ -3492,13 +3501,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		inced_nr_rotate_swap = true;
 	}
 
-	cluster_info = setup_clusters(si, swap_header, maxpages);
-	if (IS_ERR(cluster_info)) {
-		error = PTR_ERR(cluster_info);
-		cluster_info = NULL;
-		goto bad_swap_unlock_inode;
-	}
-
 	if ((swap_flags & SWAP_FLAG_DISCARD) &&
 	    si->bdev && bdev_max_discard_sectors(si->bdev)) {
 		/*
@@ -3551,7 +3553,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		prio = swap_flags & SWAP_FLAG_PRIO_MASK;
 
 	si->swap_file = swap_file;
-	enable_swap_info(si, prio, swap_map, cluster_info, zeromap);
+
+	/* Sets SWP_WRITEOK, resurrect the percpu ref, expose the swap device */
+	enable_swap_info(si, prio, zeromap);
 
 	pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s\n",
 		K(si->pages), name->name, si->prio, nr_extents,
@@ -3577,13 +3581,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	inode = NULL;
 	destroy_swap_extents(si, swap_file);
 	swap_cgroup_swapoff(si->type);
+	vfree(si->swap_map);
+	si->swap_map = NULL;
+	free_swap_cluster_info(si->cluster_info, si->max);
+	si->cluster_info = NULL;
+	/*
+	 * Clear the SWP_USED flag after all resources are freed so
+	 * alloc_swap_info can reuse this si safely.
+	 */
 	spin_lock(&swap_lock);
 	si->flags = 0;
 	spin_unlock(&swap_lock);
-	vfree(swap_map);
 	kvfree(zeromap);
-	if (cluster_info)
-		free_cluster_info(cluster_info, maxpages);
 	if (inced_nr_rotate_swap)
 		atomic_dec(&nr_rotate_swap);
 	if (swap_file)

-- 
2.52.0




  parent reply	other threads:[~2026-02-17 20:06 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-17 20:06 [PATCH v3 00/12] mm, swap: swap table phase III: remove swap_map Kairui Song via B4 Relay
2026-02-17 20:06 ` [PATCH v3 01/12] mm, swap: protect si->swap_file properly and use as a mount indicator Kairui Song via B4 Relay
2026-02-19  6:36   ` Chris Li
2026-02-17 20:06 ` Kairui Song via B4 Relay [this message]
2026-02-19  6:45   ` [PATCH v3 02/12] mm, swap: clean up swapon process and locking Chris Li
2026-02-17 20:06 ` [PATCH v3 03/12] mm, swap: remove redundant arguments and locking for enabling a device Kairui Song via B4 Relay
2026-02-19  6:48   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 04/12] mm, swap: consolidate bad slots setup and make it more robust Kairui Song via B4 Relay
2026-02-19  6:51   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 05/12] mm/workingset: leave highest bits empty for anon shadow Kairui Song via B4 Relay
2026-02-19  6:56   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 06/12] mm, swap: implement helpers for reserving data in the swap table Kairui Song via B4 Relay
2026-02-19  7:00   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 07/12] mm, swap: mark bad slots in swap table directly Kairui Song via B4 Relay
2026-02-19  7:01   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 08/12] mm, swap: simplify swap table sanity range check Kairui Song via B4 Relay
2026-02-19  7:02   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 09/12] mm, swap: use the swap table to track the swap count Kairui Song via B4 Relay
2026-02-18 10:40   ` kernel test robot
2026-02-18 12:22     ` Kairui Song
2026-02-19  7:06       ` Chris Li
2026-02-17 20:06 ` [PATCH v3 10/12] mm, swap: no need to truncate the scan border Kairui Song via B4 Relay
2026-02-19  7:10   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 11/12] mm, swap: simplify checking if a folio is swapped Kairui Song via B4 Relay
2026-02-19  7:18   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 12/12] mm, swap: no need to clear the shadow explicitly Kairui Song via B4 Relay
2026-02-19  7:19   ` Chris Li
2026-02-17 20:10 ` [PATCH v3 00/12] mm, swap: swap table phase III: remove swap_map Kairui Song

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260218-swap-table-p3-v3-2-f4e34be021a7@tencent.com \
    --to=devnull+kasong.tencent.com@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=baohua@kernel.org \
    --cc=bhe@redhat.com \
    --cc=chrisl@kernel.org \
    --cc=david@kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=kasong@tencent.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lorenzo.stoakes@oracle.com \
    --cc=nphamcs@gmail.com \
    --cc=shikemeng@huaweicloud.com \
    --cc=youngjun.park@lge.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox