* Re: [PATCH] mm: attempt to batch free swap entries for zap_pte_range()
2024-08-06 7:23 [PATCH] mm: attempt to batch free swap entries for zap_pte_range() zhiguojiang
@ 2024-08-06 6:38 ` Barry Song
0 siblings, 0 replies; 4+ messages in thread
From: Barry Song @ 2024-08-06 6:38 UTC (permalink / raw)
To: 20240806012409.61962-1-21cnbao
Cc: akpm, linux-mm, linux-kernel, Barry Song, Kairui Song, Chris Li,
Huang, Ying, Hugh Dickins, Kalesh Singh, Ryan Roberts,
David Hildenbrand
Next time, please use "> ", ">> ", etc. to reply to emails.
On Tue, Aug 6, 2024 at 3:23 PM zhiguojiang <justinjiang@vivo.com> wrote:
>
> From: Barry Song <v-songbaohua@oppo.com>
>
> Zhiguo reported that swap release could be a serious bottleneck
> during process exits[1]. With mTHP, we have the opportunity to
> batch free swaps.
> Thanks to the work of Chris and Kairui[2], I was able to achieve
> this optimization with minimal code changes by building on their
> efforts.
> If swap_count is 1, which is likely true as most anon memory is
> private, we can free all contiguous swap slots together.
>
> Ran the below test program for measuring the bandwidth of munmap
> using zRAM and 64KiB mTHP:
>
> #include <sys/mman.h>
> #include <sys/time.h>
> #include <stdio.h>
> #include <string.h>
> #include <stdlib.h>
>
> unsigned long long tv_to_ms(struct timeval tv)
> {
>         return tv.tv_sec * 1000 + tv.tv_usec / 1000;
> }
>
> int main(void)
> {
>         struct timeval tv_b, tv_e;
>
> #define SIZE (1024 * 1024 * 1024)
>         void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
>                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>         if (p == MAP_FAILED) {
>                 perror("fail to get memory");
>                 exit(-1);
>         }
>
>         madvise(p, SIZE, MADV_HUGEPAGE);
>         memset(p, 0x11, SIZE); /* write to get mem */
>
>         madvise(p, SIZE, MADV_PAGEOUT);
>
>         gettimeofday(&tv_b, NULL);
>         munmap(p, SIZE);
>         gettimeofday(&tv_e, NULL);
>
>         printf("munmap bandwidth: %llu bytes/ms\n",
>                SIZE / (tv_to_ms(tv_e) - tv_to_ms(tv_b)));
> }
>
> The result is as below (munmap bandwidth):
>            mm-unstable    mm-unstable-with-patch
> round1        21053761                  63161283
> round2        21053761                  63161283
> round3        21053761                  63161283
> round4        20648881                  67108864
> round5        20648881                  67108864
>
> munmap bandwidth becomes 3X faster.
>
> [1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
> [2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/
>
> Cc: Kairui Song <kasong@tencent.com>
> Cc: Chris Li <chrisl@kernel.org>
> Cc: "Huang, Ying" <ying.huang@intel.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Kalesh Singh <kaleshsingh@google.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: David Hildenbrand <david@redhat.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
> mm/swapfile.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 61 insertions(+)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index ea023fc25d08..ed872a186e81 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
> return true;
> }
>
> +static bool swap_is_last_map(struct swap_info_struct *si,
> + unsigned long offset, int nr_pages,
> + bool *has_cache)
> +{
> + unsigned char *map = si->swap_map + offset;
> + unsigned char *map_end = map + nr_pages;
> + bool cached = false;
> +
> + do {
> + if ((*map & ~SWAP_HAS_CACHE) != 1)
> + return false;
> + if (*map & SWAP_HAS_CACHE)
> + cached = true;
> + } while (++map < map_end);
> +
> + *has_cache = cached;
> + return true;
> +}
> +
> /*
> * returns number of pages in the folio that backs the swap entry. If positive,
> * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
> @@ -1469,6 +1488,39 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
> return usage;
> }
>
> +static bool try_batch_swap_entries_free(struct swap_info_struct *p,
> + swp_entry_t entry, int nr, bool *any_only_cache)
> +{
> + unsigned long offset = swp_offset(entry);
> + struct swap_cluster_info *ci;
> + bool has_cache = false;
> + bool can_batch;
> + int i;
> +
> + /* cross into another cluster */
> + if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
> + return false;
> My understanding of mTHP swap entries allocated by cluster_alloc_swap()
> is that they belong to the same cluster in the same swap_info, so
> theoretically (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
> should never occur?
> Can you help confirm?
zap_pte_range() has no concept of folios (mTHP), as the folios could
already be gone. You could have this case:

folio1: last 16 slots of cluster1
folio2: first 16 slots of cluster2

folio1 and folio2 are within the same PMD and virtually contiguous
before they are unmapped. Once both folio1 and folio2 are gone,
zap_pte_range()'s
        nr = swap_pte_batch(pte, max_nr, ptent);
returns nr = 32. "mTHP swap entries allocated by cluster_alloc_swap()
belong to the same cluster" is correct, but by the time zap_pte_range()
runs, your mTHPs could already be gone.
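
To make the boundary case concrete, here is a minimal userspace sketch of
the "cross into another cluster" check quoted above (SWAPFILE_CLUSTER is
assumed to be 512 here, and the offset is made up to model folio1/folio2):

#include <stdbool.h>
#include <stdio.h>

#define SWAPFILE_CLUSTER 512    /* assumed value, for illustration only */

/* same condition as the "cross into another cluster" check in the patch */
static bool fits_in_one_cluster(unsigned long offset, int nr)
{
        return nr <= SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;
}

int main(void)
{
        /*
         * folio1 used the last 16 slots of one cluster, folio2 the first 16
         * slots of the next; after both folios are gone, swap_pte_batch()
         * can still hand zap_pte_range() a batch of nr = 32.
         */
        unsigned long offset = SWAPFILE_CLUSTER - 16;
        int nr = 32;

        printf("batch fits in one cluster: %s\n",
               fits_in_one_cluster(offset, nr) ? "yes" : "no"); /* prints "no" */
        return 0;
}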
>
> + ci = lock_cluster_or_swap_info(p, offset);
> + can_batch = swap_is_last_map(p, offset, nr, &has_cache);
> + if (can_batch) {
> + for (i = 0; i < nr; i++)
> + WRITE_ONCE(p->swap_map[offset + i], SWAP_HAS_CACHE);
> + }
> + unlock_cluster_or_swap_info(p, ci);
> +
> + /* all swap_maps have count==1 and have no swapcache */
> + if (!can_batch)
> + goto out;
> + if (!has_cache) {
> + spin_lock(&p->lock);
> + swap_entry_range_free(p, entry, nr);
> + spin_unlock(&p->lock);
> + }
> + *any_only_cache = has_cache;
> +out:
> + return can_batch;
> +}
> +
> /*
> * Drop the last HAS_CACHE flag of swap entries, caller have to
> * ensure all entries belong to the same cgroup.
> @@ -1797,6 +1849,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> bool any_only_cache = false;
> unsigned long offset;
> unsigned char count;
> + bool batched;
>
> if (non_swap_entry(entry))
> return;
> @@ -1808,6 +1861,13 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> if (WARN_ON(end_offset > si->max))
> goto out;
>
> + if (nr > 1 && swap_count(data_race(si->swap_map[start_offset])) == 1) {
> + batched = try_batch_swap_entries_free(si, entry, nr,
> + &any_only_cache);
> + if (batched)
> + goto reclaim;
> + }
> The mTHP swap entries are batch freed as a whole, directly skipping the
> percpu swp_slots caches instead of freeing every swap entry separately,
> which can accelerate the release of mTHP swap entries. I think it is
> valuable.
Yes, I have seen a 3X performance improvement.
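
For reference, the 3X follows directly from the round1 numbers in the
changelog table; a trivial userspace check with the values copied from above:

#include <stdio.h>

int main(void)
{
        /* round1 munmap bandwidth in bytes/ms, taken from the table above */
        const double before = 21053761.0;       /* mm-unstable */
        const double after  = 63161283.0;       /* mm-unstable-with-patch */

        printf("speedup: %.2fx\n", after / before);     /* prints ~3.00x */
        return 0;
}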
>
> +
> /*
> * First free all entries in the range.
> */
> @@ -1821,6 +1881,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> }
> }
>
> +reclaim:
> /*
> * Short-circuit the below loop if none of the entries had their
> * reference drop to zero.
> --
> 2.34.1
>
> Thanks
> Zhiguo
>
>
>
Thanks
Barry
* [PATCH] mm: attempt to batch free swap entries for zap_pte_range()
@ 2024-08-06 7:23 zhiguojiang
2024-08-06 6:38 ` Barry Song
0 siblings, 1 reply; 4+ messages in thread
From: zhiguojiang @ 2024-08-06 7:23 UTC (permalink / raw)
To: Barry Song, akpm, linux-mm
Cc: linux-kernel, Barry Song, Kairui Song, Chris Li, Huang, Ying,
Hugh Dickins, Kalesh Singh, Ryan Roberts, David Hildenbrand
From: Barry Song <v-songbaohua@oppo.com>
Zhiguo reported that swap release could be a serious bottleneck
during process exits[1]. With mTHP, we have the opportunity to
batch free swaps.

Thanks to the work of Chris and Kairui[2], I was able to achieve
this optimization with minimal code changes by building on their
efforts.

If swap_count is 1, which is likely true as most anon memory is
private, we can free all contiguous swap slots together.

Ran the below test program for measuring the bandwidth of munmap
using zRAM and 64KiB mTHP:

#include <sys/mman.h>
#include <sys/time.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

unsigned long long tv_to_ms(struct timeval tv)
{
        return tv.tv_sec * 1000 + tv.tv_usec / 1000;
}

int main(void)
{
        struct timeval tv_b, tv_e;

#define SIZE (1024 * 1024 * 1024)
        void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("fail to get memory");
                exit(-1);
        }

        madvise(p, SIZE, MADV_HUGEPAGE);
        memset(p, 0x11, SIZE); /* write to get mem */

        madvise(p, SIZE, MADV_PAGEOUT);

        gettimeofday(&tv_b, NULL);
        munmap(p, SIZE);
        gettimeofday(&tv_e, NULL);

        printf("munmap bandwidth: %llu bytes/ms\n",
               SIZE / (tv_to_ms(tv_e) - tv_to_ms(tv_b)));
}

The result is as below (munmap bandwidth):
           mm-unstable    mm-unstable-with-patch
round1        21053761                  63161283
round2        21053761                  63161283
round3        21053761                  63161283
round4        20648881                  67108864
round5        20648881                  67108864

munmap bandwidth becomes 3X faster.

[1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
[2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/
Cc: Kairui Song <kasong@tencent.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
mm/swapfile.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 61 insertions(+)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ea023fc25d08..ed872a186e81 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
return true;
}
+static bool swap_is_last_map(struct swap_info_struct *si,
+ unsigned long offset, int nr_pages,
+ bool *has_cache)
+{
+ unsigned char *map = si->swap_map + offset;
+ unsigned char *map_end = map + nr_pages;
+ bool cached = false;
+
+ do {
+ if ((*map & ~SWAP_HAS_CACHE) != 1)
+ return false;
+ if (*map & SWAP_HAS_CACHE)
+ cached = true;
+ } while (++map < map_end);
+
+ *has_cache = cached;
+ return true;
+}
+
/*
* returns number of pages in the folio that backs the swap entry. If positive,
* the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
@@ -1469,6 +1488,39 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
return usage;
}
+static bool try_batch_swap_entries_free(struct swap_info_struct *p,
+ swp_entry_t entry, int nr, bool *any_only_cache)
+{
+ unsigned long offset = swp_offset(entry);
+ struct swap_cluster_info *ci;
+ bool has_cache = false;
+ bool can_batch;
+ int i;
+
+ /* cross into another cluster */
+ if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
+ return false;
My understanding of mTHP swap entries allocated by cluster_alloc_swap()
is that they belong to the same cluster in the same swap_info, so
theoretically (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
should never occur?
Can you help confirm?
+ ci = lock_cluster_or_swap_info(p, offset);
+ can_batch = swap_is_last_map(p, offset, nr, &has_cache);
+ if (can_batch) {
+ for (i = 0; i < nr; i++)
+ WRITE_ONCE(p->swap_map[offset + i], SWAP_HAS_CACHE);
+ }
+ unlock_cluster_or_swap_info(p, ci);
+
+ /* all swap_maps have count==1 and have no swapcache */
+ if (!can_batch)
+ goto out;
+ if (!has_cache) {
+ spin_lock(&p->lock);
+ swap_entry_range_free(p, entry, nr);
+ spin_unlock(&p->lock);
+ }
+ *any_only_cache = has_cache;
+out:
+ return can_batch;
+}
+
/*
* Drop the last HAS_CACHE flag of swap entries, caller have to
* ensure all entries belong to the same cgroup.
@@ -1797,6 +1849,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
bool any_only_cache = false;
unsigned long offset;
unsigned char count;
+ bool batched;
if (non_swap_entry(entry))
return;
@@ -1808,6 +1861,13 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
if (WARN_ON(end_offset > si->max))
goto out;
+ if (nr > 1 && swap_count(data_race(si->swap_map[start_offset])) == 1) {
+ batched = try_batch_swap_entries_free(si, entry, nr,
+ &any_only_cache);
+ if (batched)
+ goto reclaim;
+ }
The mTHP swap entries are batch freed as a whole, directly skipping the
percpu swp_slots caches instead of freeing every swap entry separately,
which can accelerate the release of mTHP swap entries. I think it is
valuable.
+
/*
* First free all entries in the range.
*/
@@ -1821,6 +1881,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
}
}
+reclaim:
/*
* Short-circuit the below loop if none of the entries had their
* reference drop to zero.
--
2.34.1
Thanks
Zhiguo
* Re: [PATCH] mm: attempt to batch free swap entries for zap_pte_range()
2024-08-06 1:24 Barry Song
@ 2024-08-06 12:56 ` David Hildenbrand
0 siblings, 0 replies; 4+ messages in thread
From: David Hildenbrand @ 2024-08-06 12:56 UTC (permalink / raw)
To: Barry Song, akpm, linux-mm
Cc: linux-kernel, Barry Song, Kairui Song, Chris Li, Huang, Ying,
Hugh Dickins, Kalesh Singh, Ryan Roberts
On 06.08.24 03:24, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
>
> Zhiguo reported that swap release could be a serious bottleneck
> during process exits[1]. With mTHP, we have the opportunity to
> batch free swaps.
> Thanks to the work of Chris and Kairui[2], I was able to achieve
> this optimization with minimal code changes by building on their
> efforts.
> If swap_count is 1, which is likely true as most anon memory is
> private, we can free all contiguous swap slots together.
>
> Ran the below test program for measuring the bandwidth of munmap
> using zRAM and 64KiB mTHP:
>
> #include <sys/mman.h>
> #include <sys/time.h>
> #include <stdio.h>
> #include <string.h>
> #include <stdlib.h>
>
> unsigned long long tv_to_ms(struct timeval tv)
> {
>         return tv.tv_sec * 1000 + tv.tv_usec / 1000;
> }
>
> int main(void)
> {
>         struct timeval tv_b, tv_e;
>
> #define SIZE (1024 * 1024 * 1024)
>         void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
>                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>         if (p == MAP_FAILED) {
>                 perror("fail to get memory");
>                 exit(-1);
>         }
>
>         madvise(p, SIZE, MADV_HUGEPAGE);
>         memset(p, 0x11, SIZE); /* write to get mem */
>
>         madvise(p, SIZE, MADV_PAGEOUT);
>
>         gettimeofday(&tv_b, NULL);
>         munmap(p, SIZE);
>         gettimeofday(&tv_e, NULL);
>
>         printf("munmap bandwidth: %llu bytes/ms\n",
>                SIZE / (tv_to_ms(tv_e) - tv_to_ms(tv_b)));
> }
>
> The result is as below (munmap bandwidth):
>            mm-unstable    mm-unstable-with-patch
> round1        21053761                  63161283
> round2        21053761                  63161283
> round3        21053761                  63161283
> round4        20648881                  67108864
> round5        20648881                  67108864
>
> munmap bandwidth becomes 3X faster.
>
> [1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
> [2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/
>
> Cc: Kairui Song <kasong@tencent.com>
> Cc: Chris Li <chrisl@kernel.org>
> Cc: "Huang, Ying" <ying.huang@intel.com>
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Kalesh Singh <kaleshsingh@google.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: David Hildenbrand <david@redhat.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
> mm/swapfile.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 61 insertions(+)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index ea023fc25d08..ed872a186e81 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
> return true;
> }
>
> +static bool swap_is_last_map(struct swap_info_struct *si,
> + unsigned long offset, int nr_pages,
> + bool *has_cache)
> +{
> + unsigned char *map = si->swap_map + offset;
> + unsigned char *map_end = map + nr_pages;
> + bool cached = false;
> +
> + do {
> + if ((*map & ~SWAP_HAS_CACHE) != 1)
> + return false;
> + if (*map & SWAP_HAS_CACHE)
> + cached = true;
> + } while (++map < map_end);
> +
> + *has_cache = cached;
> + return true;
> +}
> +
> /*
> * returns number of pages in the folio that backs the swap entry. If positive,
> * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
> @@ -1469,6 +1488,39 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
> return usage;
> }
>
> +static bool try_batch_swap_entries_free(struct swap_info_struct *p,
Why call it "p" here and not "si" like in the other code you are touching?
> + swp_entry_t entry, int nr, bool *any_only_cache)
> +{
> + unsigned long offset = swp_offset(entry);
> + struct swap_cluster_info *ci;
> + bool has_cache = false;
> + bool can_batch;
> + int i;
> +
> + /* cross into another cluster */
> + if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
> + return false;
> + ci = lock_cluster_or_swap_info(p, offset);
> + can_batch = swap_is_last_map(p, offset, nr, &has_cache);
> + if (can_batch) {
> + for (i = 0; i < nr; i++)
> + WRITE_ONCE(p->swap_map[offset + i], SWAP_HAS_CACHE);
> + }
> + unlock_cluster_or_swap_info(p, ci);
> +
> + /* all swap_maps have count==1 and have no swapcache */
> + if (!can_batch)
> + goto out;
> + if (!has_cache) {
> + spin_lock(&p->lock);
> + swap_entry_range_free(p, entry, nr);
> + spin_unlock(&p->lock);
> + }
> + *any_only_cache = has_cache;
> +out:
> + return can_batch;
> +}
> +
> /*
> * Drop the last HAS_CACHE flag of swap entries, caller have to
> * ensure all entries belong to the same cgroup.
> @@ -1797,6 +1849,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> bool any_only_cache = false;
> unsigned long offset;
> unsigned char count;
> + bool batched;
>
> if (non_swap_entry(entry))
> return;
> @@ -1808,6 +1861,13 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> if (WARN_ON(end_offset > si->max))
> goto out;
>
> + if (nr > 1 && swap_count(data_race(si->swap_map[start_offset])) == 1) {
> + batched = try_batch_swap_entries_free(si, entry, nr,
> + &any_only_cache);
> + if (batched)
> + goto reclaim;
> + }
> +
I'm wondering if we could find a way to clean this up to achieve
something like this here:

        if (WARN_ON(end_offset > si->max))
                goto out;

        /*
         * First free all entries in the range.
         */
        any_only_cache = __free_swap_entries(si, entry, nr);

        /*
         * Short-circuit the below loop if none of the entries had their
         * reference drop to zero.
         */
        if (!any_only_cache)
                goto out;

whereby we'd move the fallback loop into that new function:

static bool __free_swap_entries(struct swap_info_struct *si,
                                swp_entry_t entry, int nr)
{
        const unsigned long start_offset = swp_offset(entry);
        const unsigned long end_offset = start_offset + nr;
        bool any_only_cache = false;

        if (nr > 1 && swap_count(data_race(si->swap_map[start_offset])) == 1) {
                [... what try_batch_swap_entries_free() would do ...]
        }

fallback:
        for (offset = start_offset; offset < end_offset; offset++) {
                if (data_race(si->swap_map[offset])) {
                        [... what the fallback code would do ...]
                }
        }

        return any_only_cache;
}
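
Purely to illustrate the control flow, a rough userspace model of that
__free_swap_entries() split (a sketch only: the swap_map contents, the
SWAP_HAS_CACHE value and the printf() stand-ins for the real freeing
helpers are mocked; only the fast-path/fallback structure mirrors the
sketch above):

#include <stdbool.h>
#include <stdio.h>

#define SWAP_HAS_CACHE 0x40     /* mocked flag; only the masking matters here */

static unsigned char swap_map[8] = { 1, 1, 1, 1, 1 | SWAP_HAS_CACHE, 1, 1, 1 };

/* true if every entry has count == 1; notes whether any still has swapcache */
static bool batch_is_last_map(unsigned long off, int nr, bool *any_only_cache)
{
        for (int i = 0; i < nr; i++) {
                if ((swap_map[off + i] & ~SWAP_HAS_CACHE) != 1)
                        return false;
                if (swap_map[off + i] & SWAP_HAS_CACHE)
                        *any_only_cache = true;
        }
        return true;
}

static bool free_swap_entries(unsigned long off, int nr)
{
        bool any_only_cache = false;

        if (nr > 1 && batch_is_last_map(off, nr, &any_only_cache)) {
                /* fast path: drop the whole contiguous range in one go */
                printf("batch free %d entries starting at offset %lu\n", nr, off);
                return any_only_cache;
        }

        /* fallback: handle each entry individually */
        for (int i = 0; i < nr; i++)
                printf("free entry at offset %lu\n", off + i);
        return any_only_cache;
}

int main(void)
{
        free_swap_entries(0, 8);        /* takes the fast path here */
        return 0;
}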
> /*
> * First free all entries in the range.
> */
> @@ -1821,6 +1881,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
> }
> }
>
> +reclaim:
> /*
> * Short-circuit the below loop if none of the entries had their
> * reference drop to zero.
--
Cheers,
David / dhildenb
* [PATCH] mm: attempt to batch free swap entries for zap_pte_range()
@ 2024-08-06 1:24 Barry Song
2024-08-06 12:56 ` David Hildenbrand
0 siblings, 1 reply; 4+ messages in thread
From: Barry Song @ 2024-08-06 1:24 UTC (permalink / raw)
To: akpm, linux-mm
Cc: linux-kernel, Barry Song, Kairui Song, Chris Li, Huang, Ying,
Hugh Dickins, Kalesh Singh, Ryan Roberts, David Hildenbrand
From: Barry Song <v-songbaohua@oppo.com>
Zhiguo reported that swap release could be a serious bottleneck
during process exits[1]. With mTHP, we have the opportunity to
batch free swaps.

Thanks to the work of Chris and Kairui[2], I was able to achieve
this optimization with minimal code changes by building on their
efforts.

If swap_count is 1, which is likely true as most anon memory is
private, we can free all contiguous swap slots together.

Ran the below test program for measuring the bandwidth of munmap
using zRAM and 64KiB mTHP:

#include <sys/mman.h>
#include <sys/time.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

unsigned long long tv_to_ms(struct timeval tv)
{
        return tv.tv_sec * 1000 + tv.tv_usec / 1000;
}

int main(void)
{
        struct timeval tv_b, tv_e;

#define SIZE (1024 * 1024 * 1024)
        void *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("fail to get memory");
                exit(-1);
        }

        madvise(p, SIZE, MADV_HUGEPAGE);
        memset(p, 0x11, SIZE); /* write to get mem */

        madvise(p, SIZE, MADV_PAGEOUT);

        gettimeofday(&tv_b, NULL);
        munmap(p, SIZE);
        gettimeofday(&tv_e, NULL);

        printf("munmap bandwidth: %llu bytes/ms\n",
               SIZE / (tv_to_ms(tv_e) - tv_to_ms(tv_b)));
}

The result is as below (munmap bandwidth):
           mm-unstable    mm-unstable-with-patch
round1        21053761                  63161283
round2        21053761                  63161283
round3        21053761                  63161283
round4        20648881                  67108864
round5        20648881                  67108864

munmap bandwidth becomes 3X faster.

[1] https://lore.kernel.org/linux-mm/20240731133318.527-1-justinjiang@vivo.com/
[2] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/
Cc: Kairui Song <kasong@tencent.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
mm/swapfile.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 61 insertions(+)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ea023fc25d08..ed872a186e81 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -156,6 +156,25 @@ static bool swap_is_has_cache(struct swap_info_struct *si,
return true;
}
+static bool swap_is_last_map(struct swap_info_struct *si,
+ unsigned long offset, int nr_pages,
+ bool *has_cache)
+{
+ unsigned char *map = si->swap_map + offset;
+ unsigned char *map_end = map + nr_pages;
+ bool cached = false;
+
+ do {
+ if ((*map & ~SWAP_HAS_CACHE) != 1)
+ return false;
+ if (*map & SWAP_HAS_CACHE)
+ cached = true;
+ } while (++map < map_end);
+
+ *has_cache = cached;
+ return true;
+}
+
/*
* returns number of pages in the folio that backs the swap entry. If positive,
* the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
@@ -1469,6 +1488,39 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
return usage;
}
+static bool try_batch_swap_entries_free(struct swap_info_struct *p,
+ swp_entry_t entry, int nr, bool *any_only_cache)
+{
+ unsigned long offset = swp_offset(entry);
+ struct swap_cluster_info *ci;
+ bool has_cache = false;
+ bool can_batch;
+ int i;
+
+ /* cross into another cluster */
+ if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
+ return false;
+ ci = lock_cluster_or_swap_info(p, offset);
+ can_batch = swap_is_last_map(p, offset, nr, &has_cache);
+ if (can_batch) {
+ for (i = 0; i < nr; i++)
+ WRITE_ONCE(p->swap_map[offset + i], SWAP_HAS_CACHE);
+ }
+ unlock_cluster_or_swap_info(p, ci);
+
+ /* all swap_maps have count==1 and have no swapcache */
+ if (!can_batch)
+ goto out;
+ if (!has_cache) {
+ spin_lock(&p->lock);
+ swap_entry_range_free(p, entry, nr);
+ spin_unlock(&p->lock);
+ }
+ *any_only_cache = has_cache;
+out:
+ return can_batch;
+}
+
/*
* Drop the last HAS_CACHE flag of swap entries, caller have to
* ensure all entries belong to the same cgroup.
@@ -1797,6 +1849,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
bool any_only_cache = false;
unsigned long offset;
unsigned char count;
+ bool batched;
if (non_swap_entry(entry))
return;
@@ -1808,6 +1861,13 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
if (WARN_ON(end_offset > si->max))
goto out;
+ if (nr > 1 && swap_count(data_race(si->swap_map[start_offset])) == 1) {
+ batched = try_batch_swap_entries_free(si, entry, nr,
+ &any_only_cache);
+ if (batched)
+ goto reclaim;
+ }
+
/*
* First free all entries in the range.
*/
@@ -1821,6 +1881,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
}
}
+reclaim:
/*
* Short-circuit the below loop if none of the entries had their
* reference drop to zero.
--
2.34.1