linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Chris Li <chrisl@kernel.org>
To: kasong@tencent.com
Cc: linux-mm@kvack.org, Andrew Morton <akpm@linux-foundation.org>,
	 Kemeng Shi <shikemeng@huaweicloud.com>,
	Nhat Pham <nphamcs@gmail.com>,  Baoquan He <bhe@redhat.com>,
	Barry Song <baohua@kernel.org>,
	Johannes Weiner <hannes@cmpxchg.org>,
	 David Hildenbrand <david@kernel.org>,
	Lorenzo Stoakes <lorenzo.stoakes@oracle.com>,
	 Youngjun Park <youngjun.park@lge.com>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH v3 02/12] mm, swap: clean up swapon process and locking
Date: Wed, 18 Feb 2026 22:45:50 -0800	[thread overview]
Message-ID: <CACePvbXnEU+VsE_wWMfaPsQCAmbZiFJjddsHdhLDnbGL_1P8WA@mail.gmail.com> (raw)
In-Reply-To: <20260218-swap-table-p3-v3-2-f4e34be021a7@tencent.com>

Acked-by: Chris Li <chrisl@kernel.org>

On Tue, Feb 17, 2026 at 12:06 PM Kairui Song via B4 Relay
<devnull+kasong.tencent.com@kernel.org> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Slightly clean up the swapon process. Add comments about what swap_lock
> protects, introduce and rename helpers that wrap swap_map and
> cluster_info setup, and do it outside of the swap_lock lock.
>
> This lock protection is not needed for swap_map and cluster_info setup
> because all swap users must either hold the percpu ref or hold a stable
> allocated swap entry (e.g., locking a folio in the swap cache) before
> accessing. So before the swap device is exposed by enable_swap_info,
> nothing would use the swap device's map or cluster.
>
> So we are safe to allocate and set up swap data freely first, then
> expose the swap device and set the SWP_WRITEOK flag.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  mm/swapfile.c | 87 ++++++++++++++++++++++++++++++++---------------------------
>  1 file changed, 48 insertions(+), 39 deletions(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 25dfe992538d..8fc35b316ade 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -65,6 +65,13 @@ static void move_cluster(struct swap_info_struct *si,
>                          struct swap_cluster_info *ci, struct list_head *list,
>                          enum swap_cluster_flags new_flags);
>
> +/*
> + * Protects the swap_info array, and the SWP_USED flag. swap_info contains
> + * lazily allocated & freed swap device info struts, and SWP_USED indicates

Is "struts" a typo for "struct"?

Chris

> + * which device is used; ~SWP_USED devices can be reused.
> + *
> + * Also protects swap_active_head, total_swap_pages, and the SWP_WRITEOK flag.
> + */
>  static DEFINE_SPINLOCK(swap_lock);
>  static unsigned int nr_swapfiles;
>  atomic_long_t nr_swap_pages;
> @@ -2657,8 +2664,6 @@ static int setup_swap_extents(struct swap_info_struct *sis,
>  }
>
>  static void setup_swap_info(struct swap_info_struct *si, int prio,
> -                           unsigned char *swap_map,
> -                           struct swap_cluster_info *cluster_info,
>                             unsigned long *zeromap)
>  {
>         si->prio = prio;
> @@ -2668,8 +2673,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
>          */
>         si->list.prio = -si->prio;
>         si->avail_list.prio = -si->prio;
> -       si->swap_map = swap_map;
> -       si->cluster_info = cluster_info;
>         si->zeromap = zeromap;
>  }
>
> @@ -2687,13 +2690,11 @@ static void _enable_swap_info(struct swap_info_struct *si)
>  }
>
>  static void enable_swap_info(struct swap_info_struct *si, int prio,
> -                               unsigned char *swap_map,
> -                               struct swap_cluster_info *cluster_info,
> -                               unsigned long *zeromap)
> +                            unsigned long *zeromap)
>  {
>         spin_lock(&swap_lock);
>         spin_lock(&si->lock);
> -       setup_swap_info(si, prio, swap_map, cluster_info, zeromap);
> +       setup_swap_info(si, prio, zeromap);
>         spin_unlock(&si->lock);
>         spin_unlock(&swap_lock);
>         /*
> @@ -2711,7 +2712,7 @@ static void reinsert_swap_info(struct swap_info_struct *si)
>  {
>         spin_lock(&swap_lock);
>         spin_lock(&si->lock);
> -       setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap);
> +       setup_swap_info(si, si->prio, si->zeromap);
>         _enable_swap_info(si);
>         spin_unlock(&si->lock);
>         spin_unlock(&swap_lock);
> @@ -2735,8 +2736,8 @@ static void wait_for_allocation(struct swap_info_struct *si)
>         }
>  }
>
> -static void free_cluster_info(struct swap_cluster_info *cluster_info,
> -                             unsigned long maxpages)
> +static void free_swap_cluster_info(struct swap_cluster_info *cluster_info,
> +                                  unsigned long maxpages)
>  {
>         struct swap_cluster_info *ci;
>         int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
> @@ -2894,7 +2895,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>         p->global_cluster = NULL;
>         vfree(swap_map);
>         kvfree(zeromap);
> -       free_cluster_info(cluster_info, maxpages);
> +       free_swap_cluster_info(cluster_info, maxpages);
>         /* Destroy swap account information */
>         swap_cgroup_swapoff(p->type);
>
> @@ -3243,10 +3244,15 @@ static unsigned long read_swap_header(struct swap_info_struct *si,
>
>  static int setup_swap_map(struct swap_info_struct *si,
>                           union swap_header *swap_header,
> -                         unsigned char *swap_map,
>                           unsigned long maxpages)
>  {
>         unsigned long i;
> +       unsigned char *swap_map;
> +
> +       swap_map = vzalloc(maxpages);
> +       si->swap_map = swap_map;
> +       if (!swap_map)
> +               return -ENOMEM;
>
>         swap_map[0] = SWAP_MAP_BAD; /* omit header page */
>         for (i = 0; i < swap_header->info.nr_badpages; i++) {
> @@ -3267,9 +3273,9 @@ static int setup_swap_map(struct swap_info_struct *si,
>         return 0;
>  }
>
> -static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
> -                                               union swap_header *swap_header,
> -                                               unsigned long maxpages)
> +static int setup_swap_clusters_info(struct swap_info_struct *si,
> +                                   union swap_header *swap_header,
> +                                   unsigned long maxpages)
>  {
>         unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
>         struct swap_cluster_info *cluster_info;
> @@ -3339,10 +3345,11 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
>                 }
>         }
>
> -       return cluster_info;
> +       si->cluster_info = cluster_info;
> +       return 0;
>  err:
> -       free_cluster_info(cluster_info, maxpages);
> -       return ERR_PTR(err);
> +       free_swap_cluster_info(cluster_info, maxpages);
> +       return err;
>  }
>
>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
> @@ -3358,9 +3365,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>         int nr_extents;
>         sector_t span;
>         unsigned long maxpages;
> -       unsigned char *swap_map = NULL;
>         unsigned long *zeromap = NULL;
> -       struct swap_cluster_info *cluster_info = NULL;
>         struct folio *folio = NULL;
>         struct inode *inode = NULL;
>         bool inced_nr_rotate_swap = false;
> @@ -3371,6 +3376,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>         if (!capable(CAP_SYS_ADMIN))
>                 return -EPERM;
>
> +       /*
> +        * Allocate or reuse existing !SWP_USED swap_info. The returned
> +        * si will stay in a dying status, so nothing will access its content
> +        * until enable_swap_info resurrects its percpu ref and exposes it.
> +        */
>         si = alloc_swap_info();
>         if (IS_ERR(si))
>                 return PTR_ERR(si);
> @@ -3453,18 +3463,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>
>         maxpages = si->max;
>
> -       /* OK, set up the swap map and apply the bad block list */
> -       swap_map = vzalloc(maxpages);
> -       if (!swap_map) {
> -               error = -ENOMEM;
> +       /* Set up the swap map and apply the bad block list */
> +       error = setup_swap_map(si, swap_header, maxpages);
> +       if (error)
>                 goto bad_swap_unlock_inode;
> -       }
>
> -       error = swap_cgroup_swapon(si->type, maxpages);
> +       /* Set up the swap cluster info */
> +       error = setup_swap_clusters_info(si, swap_header, maxpages);
>         if (error)
>                 goto bad_swap_unlock_inode;
>
> -       error = setup_swap_map(si, swap_header, swap_map, maxpages);
> +       error = swap_cgroup_swapon(si->type, maxpages);
>         if (error)
>                 goto bad_swap_unlock_inode;
>
> @@ -3492,13 +3501,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>                 inced_nr_rotate_swap = true;
>         }
>
> -       cluster_info = setup_clusters(si, swap_header, maxpages);
> -       if (IS_ERR(cluster_info)) {
> -               error = PTR_ERR(cluster_info);
> -               cluster_info = NULL;
> -               goto bad_swap_unlock_inode;
> -       }
> -
>         if ((swap_flags & SWAP_FLAG_DISCARD) &&
>             si->bdev && bdev_max_discard_sectors(si->bdev)) {
>                 /*
> @@ -3551,7 +3553,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>                 prio = swap_flags & SWAP_FLAG_PRIO_MASK;
>
>         si->swap_file = swap_file;
> -       enable_swap_info(si, prio, swap_map, cluster_info, zeromap);
> +
> +       /* Set SWP_WRITEOK, resurrect the percpu ref, and expose the swap device */
> +       enable_swap_info(si, prio, zeromap);
>
>         pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s\n",
>                 K(si->pages), name->name, si->prio, nr_extents,
> @@ -3577,13 +3581,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>         inode = NULL;
>         destroy_swap_extents(si, swap_file);
>         swap_cgroup_swapoff(si->type);
> +       vfree(si->swap_map);
> +       si->swap_map = NULL;
> +       free_swap_cluster_info(si->cluster_info, si->max);
> +       si->cluster_info = NULL;
> +       /*
> +        * Clear the SWP_USED flag after all resources are freed so
> +        * alloc_swap_info can reuse this si safely.
> +        */
>         spin_lock(&swap_lock);
>         si->flags = 0;
>         spin_unlock(&swap_lock);
> -       vfree(swap_map);
>         kvfree(zeromap);
> -       if (cluster_info)
> -               free_cluster_info(cluster_info, maxpages);
>         if (inced_nr_rotate_swap)
>                 atomic_dec(&nr_rotate_swap);
>         if (swap_file)
>
> --
> 2.52.0
>
>


  reply	other threads:[~2026-02-19  6:46 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-17 20:06 [PATCH v3 00/12] mm, swap: swap table phase III: remove swap_map Kairui Song via B4 Relay
2026-02-17 20:06 ` [PATCH v3 01/12] mm, swap: protect si->swap_file properly and use as a mount indicator Kairui Song via B4 Relay
2026-02-19  6:36   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 02/12] mm, swap: clean up swapon process and locking Kairui Song via B4 Relay
2026-02-19  6:45   ` Chris Li [this message]
2026-02-17 20:06 ` [PATCH v3 03/12] mm, swap: remove redundant arguments and locking for enabling a device Kairui Song via B4 Relay
2026-02-19  6:48   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 04/12] mm, swap: consolidate bad slots setup and make it more robust Kairui Song via B4 Relay
2026-02-19  6:51   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 05/12] mm/workingset: leave highest bits empty for anon shadow Kairui Song via B4 Relay
2026-02-19  6:56   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 06/12] mm, swap: implement helpers for reserving data in the swap table Kairui Song via B4 Relay
2026-02-19  7:00   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 07/12] mm, swap: mark bad slots in swap table directly Kairui Song via B4 Relay
2026-02-19  7:01   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 08/12] mm, swap: simplify swap table sanity range check Kairui Song via B4 Relay
2026-02-19  7:02   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 09/12] mm, swap: use the swap table to track the swap count Kairui Song via B4 Relay
2026-02-18 10:40   ` kernel test robot
2026-02-18 12:22     ` Kairui Song
2026-02-19  7:06       ` Chris Li
2026-02-17 20:06 ` [PATCH v3 10/12] mm, swap: no need to truncate the scan border Kairui Song via B4 Relay
2026-02-19  7:10   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 11/12] mm, swap: simplify checking if a folio is swapped Kairui Song via B4 Relay
2026-02-19  7:18   ` Chris Li
2026-02-17 20:06 ` [PATCH v3 12/12] mm, swap: no need to clear the shadow explicitly Kairui Song via B4 Relay
2026-02-19  7:19   ` Chris Li
2026-02-17 20:10 ` [PATCH v3 00/12] mm, swap: swap table phase III: remove swap_map Kairui Song

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CACePvbXnEU+VsE_wWMfaPsQCAmbZiFJjddsHdhLDnbGL_1P8WA@mail.gmail.com \
    --to=chrisl@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=baohua@kernel.org \
    --cc=bhe@redhat.com \
    --cc=david@kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=kasong@tencent.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lorenzo.stoakes@oracle.com \
    --cc=nphamcs@gmail.com \
    --cc=shikemeng@huaweicloud.com \
    --cc=youngjun.park@lge.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox