linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Nhat Pham <nphamcs@gmail.com>
To: linux-mm@kvack.org
Cc: akpm@linux-foundation.org, hannes@cmpxchg.org, hughd@google.com,
	yosry.ahmed@linux.dev, mhocko@kernel.org,
	roman.gushchin@linux.dev, shakeel.butt@linux.dev,
	muchun.song@linux.dev, len.brown@intel.com,
	chengming.zhou@linux.dev, kasong@tencent.com, chrisl@kernel.org,
	huang.ying.caritas@gmail.com, ryan.roberts@arm.com,
	shikemeng@huaweicloud.com, viro@zeniv.linux.org.uk,
	baohua@kernel.org, bhe@redhat.com, osalvador@suse.de,
	lorenzo.stoakes@oracle.com, christophe.leroy@csgroup.eu,
	pavel@kernel.org, kernel-team@meta.com,
	linux-kernel@vger.kernel.org, cgroups@vger.kernel.org,
	linux-pm@vger.kernel.org, peterx@redhat.com, riel@surriel.com,
	joshua.hahnjy@gmail.com, npache@redhat.com, gourry@gourry.net,
	axelrasmussen@google.com, yuanchu@google.com, weixugc@google.com,
	rafael@kernel.org, jannh@google.com, pfalcato@suse.de,
	zhengqi.arch@bytedance.com
Subject: [PATCH v3 20/20] swapfile: replace the swap map with bitmaps
Date: Sun,  8 Feb 2026 13:58:33 -0800	[thread overview]
Message-ID: <20260208215839.87595-21-nphamcs@gmail.com> (raw)
In-Reply-To: <20260208215839.87595-1-nphamcs@gmail.com>

Now that we have moved the swap count state to virtual swap layer, each
swap map entry only has 3 possible states: free, allocated, and bad.
Replace the swap map with 2 bitmaps (one for allocated state and one for
bad state), saving 6 bits per swap entry.

Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
 include/linux/swap.h |  3 +-
 mm/swapfile.c        | 81 +++++++++++++++++++++++---------------------
 2 files changed, 44 insertions(+), 40 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index a30d382fb5ee1..a02ce3fb2358b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -259,7 +259,8 @@ struct swap_info_struct {
 	struct plist_node list;		/* entry in swap_active_head */
 	signed char	type;		/* strange name for an index */
 	unsigned int	max;		/* extent of the swap_map */
-	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
+	unsigned long *swap_map;	/* bitmap for allocated state */
+	unsigned long *bad_map;		/* bitmap for bad state */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
 	struct list_head free_clusters; /* free clusters list */
 	struct list_head full_clusters; /* full clusters list */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9478707ce3ffa..b7661ffa312be 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -760,25 +760,19 @@ static bool cluster_reclaim_range(struct swap_info_struct *si,
 				  struct swap_cluster_info *ci,
 				  unsigned long start, unsigned long end)
 {
-	unsigned char *map = si->swap_map;
 	unsigned long offset = start;
 	int nr_reclaim;
 
 	spin_unlock(&ci->lock);
 	do {
-		switch (READ_ONCE(map[offset])) {
-		case 0:
+		if (!test_bit(offset, si->swap_map)) {
 			offset++;
-			break;
-		case SWAP_MAP_ALLOCATED:
+		} else {
 			nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
 			if (nr_reclaim > 0)
 				offset += nr_reclaim;
 			else
 				goto out;
-			break;
-		default:
-			goto out;
 		}
 	} while (offset < end);
 out:
@@ -787,11 +781,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si,
 	 * Recheck the range no matter reclaim succeeded or not, the slot
 	 * could have been be freed while we are not holding the lock.
 	 */
-	for (offset = start; offset < end; offset++)
-		if (READ_ONCE(map[offset]))
-			return false;
-
-	return true;
+	return find_next_bit(si->swap_map, end, start) >= end;
 }
 
 static bool cluster_scan_range(struct swap_info_struct *si,
@@ -800,15 +790,16 @@ static bool cluster_scan_range(struct swap_info_struct *si,
 			       bool *need_reclaim)
 {
 	unsigned long offset, end = start + nr_pages;
-	unsigned char *map = si->swap_map;
-	unsigned char count;
 
 	if (cluster_is_empty(ci))
 		return true;
 
 	for (offset = start; offset < end; offset++) {
-		count = READ_ONCE(map[offset]);
-		if (!count)
+		/* Bad slots cannot be used for allocation */
+		if (test_bit(offset, si->bad_map))
+			return false;
+
+		if (!test_bit(offset, si->swap_map))
 			continue;
 
 		if (swap_cache_only(si, offset)) {
@@ -841,7 +832,7 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
 	if (cluster_is_empty(ci))
 		ci->order = order;
 
-	memset(si->swap_map + start, usage, nr_pages);
+	bitmap_set(si->swap_map, start, nr_pages);
 	swap_range_alloc(si, nr_pages);
 	ci->count += nr_pages;
 
@@ -1404,7 +1395,7 @@ static struct swap_info_struct *_swap_info_get(swp_slot_t slot)
 	offset = swp_slot_offset(slot);
 	if (offset >= si->max)
 		goto bad_offset;
-	if (data_race(!si->swap_map[swp_slot_offset(slot)]))
+	if (data_race(!test_bit(offset, si->swap_map)))
 		goto bad_free;
 	return si;
 
@@ -1518,8 +1509,7 @@ static void swap_slots_free(struct swap_info_struct *si,
 			      swp_slot_t slot, unsigned int nr_pages)
 {
 	unsigned long offset = swp_slot_offset(slot);
-	unsigned char *map = si->swap_map + offset;
-	unsigned char *map_end = map + nr_pages;
+	unsigned long end = offset + nr_pages;
 
 	/* It should never free entries across different clusters */
 	VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1));
@@ -1527,10 +1517,8 @@ static void swap_slots_free(struct swap_info_struct *si,
 	VM_BUG_ON(ci->count < nr_pages);
 
 	ci->count -= nr_pages;
-	do {
-		VM_BUG_ON(!swap_is_last_ref(*map));
-		*map = 0;
-	} while (++map < map_end);
+	VM_BUG_ON(find_next_zero_bit(si->swap_map, end, offset) < end);
+	bitmap_clear(si->swap_map, offset, nr_pages);
 
 	swap_range_free(si, offset, nr_pages);
 
@@ -1741,9 +1729,7 @@ unsigned int count_swap_pages(int type, int free)
 static bool swap_slot_allocated(struct swap_info_struct *si,
 		unsigned long offset)
 {
-	unsigned char count = READ_ONCE(si->swap_map[offset]);
-
-	return count && swap_count(count) != SWAP_MAP_BAD;
+	return test_bit(offset, si->swap_map);
 }
 
 /*
@@ -2064,7 +2050,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 }
 
 static void setup_swap_info(struct swap_info_struct *si, int prio,
-			    unsigned char *swap_map,
+			    unsigned long *swap_map,
 			    struct swap_cluster_info *cluster_info)
 {
 	si->prio = prio;
@@ -2092,7 +2078,7 @@ static void _enable_swap_info(struct swap_info_struct *si)
 }
 
 static void enable_swap_info(struct swap_info_struct *si, int prio,
-				unsigned char *swap_map,
+				unsigned long *swap_map,
 				struct swap_cluster_info *cluster_info)
 {
 	spin_lock(&swap_lock);
@@ -2185,7 +2171,8 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si)
 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
 	struct swap_info_struct *p = NULL;
-	unsigned char *swap_map;
+	unsigned long *swap_map;
+	unsigned long *bad_map;
 	struct swap_cluster_info *cluster_info;
 	struct file *swap_file, *victim;
 	struct address_space *mapping;
@@ -2280,6 +2267,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	p->swap_file = NULL;
 	swap_map = p->swap_map;
 	p->swap_map = NULL;
+	bad_map = p->bad_map;
+	p->bad_map = NULL;
 	maxpages = p->max;
 	cluster_info = p->cluster_info;
 	p->max = 0;
@@ -2290,7 +2279,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	mutex_unlock(&swapon_mutex);
 	kfree(p->global_cluster);
 	p->global_cluster = NULL;
-	vfree(swap_map);
+	kvfree(swap_map);
+	kvfree(bad_map);
 	free_cluster_info(cluster_info, maxpages);
 
 	inode = mapping->host;
@@ -2638,18 +2628,20 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
 
 static int setup_swap_map(struct swap_info_struct *si,
 			  union swap_header *swap_header,
-			  unsigned char *swap_map,
+			  unsigned long *swap_map,
+			  unsigned long *bad_map,
 			  unsigned long maxpages)
 {
 	unsigned long i;
 
-	swap_map[0] = SWAP_MAP_BAD; /* omit header page */
+	set_bit(0, bad_map); /* omit header page */
+
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
 		unsigned int page_nr = swap_header->info.badpages[i];
 		if (page_nr == 0 || page_nr > swap_header->info.last_page)
 			return -EINVAL;
 		if (page_nr < maxpages) {
-			swap_map[page_nr] = SWAP_MAP_BAD;
+			set_bit(page_nr, bad_map);
 			si->pages--;
 		}
 	}
@@ -2753,7 +2745,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	int nr_extents;
 	sector_t span;
 	unsigned long maxpages;
-	unsigned char *swap_map = NULL;
+	unsigned long *swap_map = NULL, *bad_map = NULL;
 	struct swap_cluster_info *cluster_info = NULL;
 	struct folio *folio = NULL;
 	struct inode *inode = NULL;
@@ -2849,16 +2841,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	maxpages = si->max;
 
 	/* OK, set up the swap map and apply the bad block list */
-	swap_map = vzalloc(maxpages);
+	swap_map = kvcalloc(BITS_TO_LONGS(maxpages), sizeof(long), GFP_KERNEL);
 	if (!swap_map) {
 		error = -ENOMEM;
 		goto bad_swap_unlock_inode;
 	}
 
-	error = setup_swap_map(si, swap_header, swap_map, maxpages);
+	bad_map = kvcalloc(BITS_TO_LONGS(maxpages), sizeof(long), GFP_KERNEL);
+	if (!bad_map) {
+		error = -ENOMEM;
+		goto bad_swap_unlock_inode;
+	}
+
+	error = setup_swap_map(si, swap_header, swap_map, bad_map, maxpages);
 	if (error)
 		goto bad_swap_unlock_inode;
 
+	si->bad_map = bad_map;
+
 	if (si->bdev && bdev_stable_writes(si->bdev))
 		si->flags |= SWP_STABLE_WRITES;
 
@@ -2952,7 +2952,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	si->swap_file = NULL;
 	si->flags = 0;
 	spin_unlock(&swap_lock);
-	vfree(swap_map);
+	if (swap_map)
+		kvfree(swap_map);
+	if (bad_map)
+		kvfree(bad_map);
 	if (cluster_info)
 		free_cluster_info(cluster_info, maxpages);
 	if (inced_nr_rotate_swap)
-- 
2.47.3



  parent reply	other threads:[~2026-02-08 21:59 UTC|newest]

Thread overview: 52+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-08 21:58 [PATCH v3 00/20] Virtual Swap Space Nhat Pham
2026-02-08 21:58 ` [PATCH v3 01/20] mm/swap: decouple swap cache from physical swap infrastructure Nhat Pham
2026-02-08 22:26   ` [PATCH v3 00/20] Virtual Swap Space Nhat Pham
2026-02-10 17:59     ` Kairui Song
2026-02-10 18:52       ` Johannes Weiner
2026-02-10 19:11       ` Nhat Pham
2026-02-10 19:23         ` Nhat Pham
2026-02-12  5:07         ` Chris Li
2026-02-17 23:36         ` Nhat Pham
2026-02-10 21:58       ` Chris Li
2026-02-20 21:05       ` [PATCH] vswap: fix poor batching behavior of vswap free path Nhat Pham
2026-02-08 22:31   ` [PATCH v3 00/20] Virtual Swap Space Nhat Pham
2026-02-09 12:20     ` Chris Li
2026-02-10  2:36       ` Johannes Weiner
2026-02-10 21:24         ` Chris Li
2026-02-10 23:01           ` Johannes Weiner
2026-02-10 18:00       ` Nhat Pham
2026-02-10 23:17         ` Chris Li
2026-02-08 22:39   ` Nhat Pham
2026-02-09  2:22   ` [PATCH v3 01/20] mm/swap: decouple swap cache from physical swap infrastructure kernel test robot
2026-02-08 21:58 ` [PATCH v3 02/20] swap: rearrange the swap header file Nhat Pham
2026-02-08 21:58 ` [PATCH v3 03/20] mm: swap: add an abstract API for locking out swapoff Nhat Pham
2026-02-08 21:58 ` [PATCH v3 04/20] zswap: add new helpers for zswap entry operations Nhat Pham
2026-02-08 21:58 ` [PATCH v3 05/20] mm/swap: add a new function to check if a swap entry is in swap cached Nhat Pham
2026-02-08 21:58 ` [PATCH v3 06/20] mm: swap: add a separate type for physical swap slots Nhat Pham
2026-02-08 21:58 ` [PATCH v3 07/20] mm: create scaffolds for the new virtual swap implementation Nhat Pham
2026-02-08 21:58 ` [PATCH v3 08/20] zswap: prepare zswap for swap virtualization Nhat Pham
2026-02-08 21:58 ` [PATCH v3 09/20] mm: swap: allocate a virtual swap slot for each swapped out page Nhat Pham
2026-02-09 17:12   ` kernel test robot
2026-02-11 13:42   ` kernel test robot
2026-02-08 21:58 ` [PATCH v3 10/20] swap: move swap cache to virtual swap descriptor Nhat Pham
2026-02-08 21:58 ` [PATCH v3 11/20] zswap: move zswap entry management to the " Nhat Pham
2026-02-08 21:58 ` [PATCH v3 12/20] swap: implement the swap_cgroup API using virtual swap Nhat Pham
2026-02-08 21:58 ` [PATCH v3 13/20] swap: manage swap entry lifecycle at the virtual swap layer Nhat Pham
2026-02-08 21:58 ` [PATCH v3 14/20] mm: swap: decouple virtual swap slot from backing store Nhat Pham
2026-02-10  6:31   ` Dan Carpenter
2026-02-08 21:58 ` [PATCH v3 15/20] zswap: do not start zswap shrinker if there is no physical swap slots Nhat Pham
2026-02-08 21:58 ` [PATCH v3 16/20] swap: do not unnecesarily pin readahead swap entries Nhat Pham
2026-02-08 21:58 ` [PATCH v3 17/20] swapfile: remove zeromap bitmap Nhat Pham
2026-02-08 21:58 ` [PATCH v3 18/20] memcg: swap: only charge physical swap slots Nhat Pham
2026-02-09  2:01   ` kernel test robot
2026-02-09  2:12   ` kernel test robot
2026-02-08 21:58 ` [PATCH v3 19/20] swap: simplify swapoff using virtual swap Nhat Pham
2026-02-08 21:58 ` Nhat Pham [this message]
2026-02-08 22:51 ` [PATCH v3 00/20] Virtual Swap Space Nhat Pham
2026-02-12 12:23   ` David Hildenbrand (Arm)
2026-02-12 17:29     ` Nhat Pham
2026-02-12 17:39       ` Nhat Pham
2026-02-12 20:11         ` David Hildenbrand (Arm)
2026-02-12 17:41       ` David Hildenbrand (Arm)
2026-02-12 17:45         ` Nhat Pham
2026-02-10 15:45 ` [syzbot ci] " syzbot ci

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260208215839.87595-21-nphamcs@gmail.com \
    --to=nphamcs@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=axelrasmussen@google.com \
    --cc=baohua@kernel.org \
    --cc=bhe@redhat.com \
    --cc=cgroups@vger.kernel.org \
    --cc=chengming.zhou@linux.dev \
    --cc=chrisl@kernel.org \
    --cc=christophe.leroy@csgroup.eu \
    --cc=gourry@gourry.net \
    --cc=hannes@cmpxchg.org \
    --cc=huang.ying.caritas@gmail.com \
    --cc=hughd@google.com \
    --cc=jannh@google.com \
    --cc=joshua.hahnjy@gmail.com \
    --cc=kasong@tencent.com \
    --cc=kernel-team@meta.com \
    --cc=len.brown@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-pm@vger.kernel.org \
    --cc=lorenzo.stoakes@oracle.com \
    --cc=mhocko@kernel.org \
    --cc=muchun.song@linux.dev \
    --cc=npache@redhat.com \
    --cc=osalvador@suse.de \
    --cc=pavel@kernel.org \
    --cc=peterx@redhat.com \
    --cc=pfalcato@suse.de \
    --cc=rafael@kernel.org \
    --cc=riel@surriel.com \
    --cc=roman.gushchin@linux.dev \
    --cc=ryan.roberts@arm.com \
    --cc=shakeel.butt@linux.dev \
    --cc=shikemeng@huaweicloud.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=weixugc@google.com \
    --cc=yosry.ahmed@linux.dev \
    --cc=yuanchu@google.com \
    --cc=zhengqi.arch@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox