linux-mm.kvack.org archive mirror
* [PATCH] mm: swap: mTHP frees entries as a whole
@ 2024-08-05 16:07 Zhiguo Jiang
  2024-08-05 22:09 ` Barry Song
  0 siblings, 1 reply; 7+ messages in thread
From: Zhiguo Jiang @ 2024-08-05 16:07 UTC (permalink / raw)
  To: Andrew Morton, linux-mm, linux-kernel, Barry Song, Chris Li,
	Zhiguo Jiang
  Cc: opensource.kernel

Support mTHP's attempt to free swap entries as a whole, which avoids
frequent swap_info locking for every individual entry in
swapcache_free_entries(). When the swap_map count values of all
contiguous entries are zero apart from SWAP_HAS_CACHE, the entries are
freed directly, skipping the percpu swp_slots caches.

Signed-off-by: Zhiguo Jiang <justinjiang@vivo.com>
---
 mm/swapfile.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index ea023fc25d08..829fb4cfb6ec
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1493,6 +1493,58 @@ static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry,
 	swap_range_free(p, offset, nr_pages);
 }
 
+/*
+ * Free the contiguous swap entries as a whole; the caller has to
+ * ensure all entries belong to the same folio.
+ */
+static void swap_entry_range_check_and_free(struct swap_info_struct *p,
+				  swp_entry_t entry, int nr, bool *any_only_cache)
+{
+	const unsigned long start_offset = swp_offset(entry);
+	const unsigned long end_offset = start_offset + nr;
+	unsigned long offset;
+	DECLARE_BITMAP(to_free, SWAPFILE_CLUSTER) = { 0 };
+	struct swap_cluster_info *ci;
+	int i = 0, nr_setbits = 0;
+	unsigned char count;
+
+	/*
+	 * Free and check swap_map count values corresponding to all contiguous
+	 * entries in the whole folio range.
+	 */
+	WARN_ON_ONCE(nr > SWAPFILE_CLUSTER);
+	ci = lock_cluster_or_swap_info(p, start_offset);
+	for (offset = start_offset; offset < end_offset; offset++, i++) {
+		if (data_race(p->swap_map[offset])) {
+			count = __swap_entry_free_locked(p, offset, 1);
+			if (!count) {
+				bitmap_set(to_free, i, 1);
+				nr_setbits++;
+			} else if (count == SWAP_HAS_CACHE) {
+				*any_only_cache = true;
+			}
+		} else {
+			WARN_ON_ONCE(1);
+		}
+	}
+	unlock_cluster_or_swap_info(p, ci);
+
+	/*
+	 * If the swap_map count values corresponding to all contiguous entries are
+	 * all zero excluding SWAP_HAS_CACHE, the entries will be freed directly by
+	 * skipping percpu swp_slots caches, which can avoid frequent swap_info
+	 * locking for every individual entry.
+	 */
+	if (nr > 1 && nr_setbits == nr) {
+		spin_lock(&p->lock);
+		swap_entry_range_free(p, entry, nr);
+		spin_unlock(&p->lock);
+	} else {
+		for_each_set_bit(i, to_free, SWAPFILE_CLUSTER)
+			free_swap_slot(swp_entry(p->type, start_offset + i));
+	}
+}
+
 static void cluster_swap_free_nr(struct swap_info_struct *sis,
 		unsigned long offset, int nr_pages,
 		unsigned char usage)
@@ -1808,6 +1860,14 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 	if (WARN_ON(end_offset > si->max))
 		goto out;
 
+	/*
+	 * Try to free all contiguous entries of an mTHP as a whole.
+	 */
+	if (IS_ENABLED(CONFIG_THP_SWAP) && nr > 1) {
+		swap_entry_range_check_and_free(si, entry, nr, &any_only_cache);
+		goto free_cache;
+	}
+
 	/*
 	 * First free all entries in the range.
 	 */
@@ -1821,6 +1881,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 		}
 	}
 
+free_cache:
 	/*
 	 * Short-circuit the below loop if none of the entries had their
 	 * reference drop to zero.
-- 
2.39.0



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mm: swap: mTHP frees entries as a whole
  2024-08-05 16:07 [PATCH] mm: swap: mTHP frees entries as a whole Zhiguo Jiang
@ 2024-08-05 22:09 ` Barry Song
  2024-08-06  2:01   ` zhiguojiang
  0 siblings, 1 reply; 7+ messages in thread
From: Barry Song @ 2024-08-05 22:09 UTC (permalink / raw)
  To: Zhiguo Jiang
  Cc: Andrew Morton, linux-mm, linux-kernel, Chris Li, opensource.kernel

On Tue, Aug 6, 2024 at 4:08 AM Zhiguo Jiang <justinjiang@vivo.com> wrote:
>
> Support mTHP's attempt to free swap entries as a whole, which avoids
> frequent swap_info locking for every individual entry in
> swapcache_free_entries(). When the swap_map count values of all
> contiguous entries are zero apart from SWAP_HAS_CACHE, the entries are
> freed directly, skipping the percpu swp_slots caches.
>

No, this isn't quite good. Please review the work done by Chris and Kairui[1];
they have handled it better. On a different note, I have a patch that can
handle zap_pte_range() for swap entries in batches[2][3].

[1] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-5-cb9c148b9297@kernel.org/
[2] https://lore.kernel.org/linux-mm/20240803091118.84274-1-21cnbao@gmail.com/
[3] https://lore.kernel.org/linux-mm/CAGsJ_4wPnQqKOHx6iQcwO8bQzoBXKr2qY2AgSxMwTQCj3-8YWw@mail.gmail.com/
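
(For illustration only, a rough sketch of the batching idea behind
[2]/[3], not the actual code of those patches: the zap path first counts
the contiguous swap PTEs and then frees them with a single call instead
of once per PTE.)

	/* hypothetical sketch, not the patch itself */
	nr = swap_pte_batch(pte, max_nr, ptent);   /* contiguous swap PTEs */
	free_swap_and_cache_nr(entry, nr);         /* free the whole batch */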



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mm: swap: mTHP frees entries as a whole
  2024-08-05 22:09 ` Barry Song
@ 2024-08-06  2:01   ` zhiguojiang
  2024-08-06  2:07     ` Barry Song
  0 siblings, 1 reply; 7+ messages in thread
From: zhiguojiang @ 2024-08-06  2:01 UTC (permalink / raw)
  To: Barry Song
  Cc: Andrew Morton, linux-mm, linux-kernel, Chris Li, opensource.kernel



On 2024/8/6 6:09, Barry Song wrote:
> On Tue, Aug 6, 2024 at 4:08 AM Zhiguo Jiang <justinjiang@vivo.com> wrote:
>> Support mTHP's attempt to free swap entries as a whole, which avoids
>> frequent swap_info locking for every individual entry in
>> swapcache_free_entries(). When the swap_map count values of all
>> contiguous entries are zero apart from SWAP_HAS_CACHE, the entries are
>> freed directly, skipping the percpu swp_slots caches.
>>
> No, this isn't quite good. Please review the work done by Chris and Kairui[1];
> they have handled it better. On a different note, I have a patch that can
> handle zap_pte_range() for swap entries in batches[2][3].
I'm glad to see your optimized submission on batch freeing swap entries
for zap_pte_range(); sorry, I didn't see it before. This patch of mine
can be ignored.

Thanks
Zhiguo

>
> [1] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-5-cb9c148b9297@kernel.org/
> [2] https://lore.kernel.org/linux-mm/20240803091118.84274-1-21cnbao@gmail.com/
> [3] https://lore.kernel.org/linux-mm/CAGsJ_4wPnQqKOHx6iQcwO8bQzoBXKr2qY2AgSxMwTQCj3-8YWw@mail.gmail.com/
>



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mm: swap: mTHP frees entries as a whole
  2024-08-06  2:01   ` zhiguojiang
@ 2024-08-06  2:07     ` Barry Song
  2024-08-06  7:40       ` zhiguojiang
  0 siblings, 1 reply; 7+ messages in thread
From: Barry Song @ 2024-08-06  2:07 UTC (permalink / raw)
  To: zhiguojiang
  Cc: Andrew Morton, linux-mm, linux-kernel, Chris Li, opensource.kernel

On Tue, Aug 6, 2024 at 2:01 PM zhiguojiang <justinjiang@vivo.com> wrote:
>
>
>
> On 2024/8/6 6:09, Barry Song wrote:
> > On Tue, Aug 6, 2024 at 4:08 AM Zhiguo Jiang <justinjiang@vivo.com> wrote:
> >> Support mTHP's attempt to free swap entries as a whole, which avoids
> >> frequent swap_info locking for every individual entry in
> >> swapcache_free_entries(). When the swap_map count values of all
> >> contiguous entries are zero apart from SWAP_HAS_CACHE, the entries are
> >> freed directly, skipping the percpu swp_slots caches.
> >>
> > No, this isn't quite good. Please review the work done by Chris and Kairui[1];
> > they have handled it better. On a different note, I have a patch that can
> > handle zap_pte_range() for swap entries in batches[2][3].
> I'm glad to see your optimized submission on batch freeing swap entries
> for zap_pte_range(); sorry, I didn't see it before. This patch of mine
> can be ignored.

no worries, please help test and review the formal patch I sent:
https://lore.kernel.org/linux-mm/20240806012409.61962-1-21cnbao@gmail.com/

Please note that I didn't use a bitmap, to avoid a large stack, and
there is a real possibility that the case below can occur; your patch
can crash if the following is true:
nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER

Additionally, I quickly skip the case where
swap_count(data_race(si->swap_map[start_offset])) != 1 to avoid
regressions in cases that can't be batched.
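
Roughly, those two checks amount to something like the sketch below
(names are taken from this thread; this is not the exact code of the
linked patch):

	/* sketch only: bail out of batching in the uncommon cases */
	offset = swp_offset(entry);
	/* cross into another cluster */
	if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
		return false;
	/* only the plain swap_count == 1 case is batched */
	if (swap_count(data_race(si->swap_map[offset])) != 1)
		return false;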

>
> Thanks
> Zhiguo
>
> >
> > [1] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-5-cb9c148b9297@kernel.org/
> > [2] https://lore.kernel.org/linux-mm/20240803091118.84274-1-21cnbao@gmail.com/
> > [3] https://lore.kernel.org/linux-mm/CAGsJ_4wPnQqKOHx6iQcwO8bQzoBXKr2qY2AgSxMwTQCj3-8YWw@mail.gmail.com/
> >
Thanks
Barry


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mm: swap: mTHP frees entries as a whole
  2024-08-06  7:40       ` zhiguojiang
@ 2024-08-06  6:48         ` Barry Song
  2024-08-06  8:12           ` zhiguojiang
  0 siblings, 1 reply; 7+ messages in thread
From: Barry Song @ 2024-08-06  6:48 UTC (permalink / raw)
  To: zhiguojiang
  Cc: Andrew Morton, linux-mm, linux-kernel, Chris Li, opensource.kernel

On Tue, Aug 6, 2024 at 3:41 PM zhiguojiang <justinjiang@vivo.com> wrote:
>
>
>
> On 2024/8/6 10:07, Barry Song wrote:
> > On Tue, Aug 6, 2024 at 2:01 PM zhiguojiang <justinjiang@vivo.com> wrote:
> >>
> >>
> >> On 2024/8/6 6:09, Barry Song wrote:
> >>> On Tue, Aug 6, 2024 at 4:08 AM Zhiguo Jiang <justinjiang@vivo.com> wrote:
> >>>> Support mTHP's attempt to free swap entries as a whole, which avoids
> >>>> frequent swap_info locking for every individual entry in
> >>>> swapcache_free_entries(). When the swap_map count values of all
> >>>> contiguous entries are zero apart from SWAP_HAS_CACHE, the entries are
> >>>> freed directly, skipping the percpu swp_slots caches.
> >>>>
> >>> No, this isn't quite good. Please review the work done by Chris and Kairui[1];
> >>> they have handled it better. On a different note, I have a patch that can
> >>> handle zap_pte_range() for swap entries in batches[2][3].
> >> I'm glad to see your optimized submission on batch freeing swap entries
> >> for zap_pte_range(); sorry, I didn't see it before. This patch of mine
> >> can be ignored.
> > no worries, please help test and review the formal patch I sent:
> > https://lore.kernel.org/linux-mm/20240806012409.61962-1-21cnbao@gmail.com/
> I believe it's OK and valuable. Looking forward to it being merged soon.

Zhiguo, you are absolutely breaking the threading on lkml in another thread.
https://lore.kernel.org/linux-mm/b2ea5b52-311d-4c6c-a59d-9982b8723738@vivo.com/

Allow me to address your question from that thread here:

> +       /* cross into another cluster */
> +       if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
> +               return false;
> My understanding of mTHP swap entries allocated by cluster_alloc_swap()
> is that they belong to the same cluster in the same swapinfo, so
> theoretically the case
> (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) will not occur?
> Can you help confirm?

zap_pte_range() could have no concept of folios (mTHP), as the folios
could have gone. You could have the case:

folio1:  last 16 slots of cluster1
folio2:  first 16 slots of cluster2.

folio1 and folio2 are within the same PMD and virtually contiguous
before they are unmapped.

When both folio1 and folio2 have been released and all 32 PTEs are swap
entries, zap_pte_range()'s

nr = swap_pte_batch(pte, max_nr, ptent);

nr will be 32. "mTHP swap entries allocated by cluster_alloc_swap() belong
to the same cluster" is correct, but by the time you zap_pte_range(), your
mTHPs could have been freed. swap_pte_batch() just returns the number of
swap entries. These 32 entries cross the boundary of one cluster.
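
As a concrete illustration (assuming SWAPFILE_CLUSTER is 512, e.g. 4K
pages with CONFIG_THP_SWAP):

	offset % SWAPFILE_CLUSTER == 496  /* folio1: last 16 slots of cluster1 */
	SWAPFILE_CLUSTER - 496    == 16   /* room left in cluster1 */
	nr                        == 32   /* 32 > 16: crosses into cluster2 */

which is exactly the case the check above rejects.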

> >
> > Please note that I didn't use a bitmap, to avoid a large stack, and
> > there is a real possibility that the case below can occur; your patch
> > can crash if the following is true:
> > nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER
> >
> > Additionally, I quickly skip the case where
> > swap_count(data_race(si->swap_map[start_offset])) != 1 to avoid
> > regressions in cases that can't be batched.
> >
> >> Thanks
> >> Zhiguo
> >>
> >>> [1] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-5-cb9c148b9297@kernel.org/
> >>> [2] https://lore.kernel.org/linux-mm/20240803091118.84274-1-21cnbao@gmail.com/
> >>> [3] https://lore.kernel.org/linux-mm/CAGsJ_4wPnQqKOHx6iQcwO8bQzoBXKr2qY2AgSxMwTQCj3-8YWw@mail.gmail.com/
> >>>
Thanks
 Barry


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mm: swap: mTHP frees entries as a whole
  2024-08-06  2:07     ` Barry Song
@ 2024-08-06  7:40       ` zhiguojiang
  2024-08-06  6:48         ` Barry Song
  0 siblings, 1 reply; 7+ messages in thread
From: zhiguojiang @ 2024-08-06  7:40 UTC (permalink / raw)
  To: Barry Song
  Cc: Andrew Morton, linux-mm, linux-kernel, Chris Li, opensource.kernel



On 2024/8/6 10:07, Barry Song wrote:
> On Tue, Aug 6, 2024 at 2:01 PM zhiguojiang <justinjiang@vivo.com> wrote:
>>
>>
>> On 2024/8/6 6:09, Barry Song wrote:
>>> On Tue, Aug 6, 2024 at 4:08 AM Zhiguo Jiang <justinjiang@vivo.com> wrote:
>>>> Support mTHP's attempt to free swap entries as a whole, which avoids
>>>> frequent swap_info locking for every individual entry in
>>>> swapcache_free_entries(). When the swap_map count values of all
>>>> contiguous entries are zero apart from SWAP_HAS_CACHE, the entries are
>>>> freed directly, skipping the percpu swp_slots caches.
>>>>
>>> No, this isn't quite good. Please review the work done by Chris and Kairui[1];
>>> they have handled it better. On a different note, I have a patch that can
>>> handle zap_pte_range() for swap entries in batches[2][3].
>> I'm glad to see your optimized submission on batch freeing swap entries
>> for zap_pte_range(); sorry, I didn't see it before. This patch of mine
>> can be ignored.
> no worries, please help test and review the formal patch I sent:
> https://lore.kernel.org/linux-mm/20240806012409.61962-1-21cnbao@gmail.com/
I believe it's OK and valuable. Looking forward to it being merged soon.
>
> Please note that I didn't use a bitmap, to avoid a large stack, and
> there is a real possibility that the case below can occur; your patch
> can crash if the following is true:
> nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER
>
> Additionally, I quickly skip the case where
> swap_count(data_race(si->swap_map[start_offset])) != 1 to avoid
> regressions in cases that can't be batched.
>
>> Thanks
>> Zhiguo
>>
>>> [1] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-5-cb9c148b9297@kernel.org/
>>> [2] https://lore.kernel.org/linux-mm/20240803091118.84274-1-21cnbao@gmail.com/
>>> [3] https://lore.kernel.org/linux-mm/CAGsJ_4wPnQqKOHx6iQcwO8bQzoBXKr2qY2AgSxMwTQCj3-8YWw@mail.gmail.com/
>>>
> Thanks
> Barry
Thanks
Zhiguo


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] mm: swap: mTHP frees entries as a whole
  2024-08-06  6:48         ` Barry Song
@ 2024-08-06  8:12           ` zhiguojiang
  0 siblings, 0 replies; 7+ messages in thread
From: zhiguojiang @ 2024-08-06  8:12 UTC (permalink / raw)
  To: Barry Song
  Cc: Andrew Morton, linux-mm, linux-kernel, Chris Li, opensource.kernel



On 2024/8/6 14:48, Barry Song wrote:
> On Tue, Aug 6, 2024 at 3:41 PM zhiguojiang <justinjiang@vivo.com> wrote:
>>
>>
>> On 2024/8/6 10:07, Barry Song wrote:
>>> On Tue, Aug 6, 2024 at 2:01 PM zhiguojiang <justinjiang@vivo.com> wrote:
>>>>
>>>> On 2024/8/6 6:09, Barry Song wrote:
>>>>> On Tue, Aug 6, 2024 at 4:08 AM Zhiguo Jiang <justinjiang@vivo.com> wrote:
>>>>>> Support mTHP's attempt to free swap entries as a whole, which avoids
>>>>>> frequent swap_info locking for every individual entry in
>>>>>> swapcache_free_entries(). When the swap_map count values of all
>>>>>> contiguous entries are zero apart from SWAP_HAS_CACHE, the entries are
>>>>>> freed directly, skipping the percpu swp_slots caches.
>>>>>>
>>>>> No, this isn't quite good. Please review the work done by Chris and Kairui[1];
>>>>> they have handled it better. On a different note, I have a patch that can
>>>>> handle zap_pte_range() for swap entries in batches[2][3].
>>>> I'm glad to see your optimized submission on batch freeing swap entries
>>>> for zap_pte_range(); sorry, I didn't see it before. This patch of mine
>>>> can be ignored.
>>> no worries, please help test and review the formal patch I sent:
>>> https://lore.kernel.org/linux-mm/20240806012409.61962-1-21cnbao@gmail.com/
>> I believe it's OK and valuable. Looking forward to it being merged soon.
> Zhiguo, you are absolutely breaking the threading on lkml in another thread.
> https://lore.kernel.org/linux-mm/b2ea5b52-311d-4c6c-a59d-9982b8723738@vivo.com/
Sorry, I am not yet proficient in this operation. Could you please fix 
it? Thanks
> Allow me to address your question from that thread here:
>
>> +       /* cross into another cluster */
>> +       if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
>> +               return false;
>> My understanding of mTHP swap entries allocated by cluster_alloc_swap()
>> is that they belong to the same cluster in the same swapinfo, so
>> theoretically the case
>> (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) will not occur?
>> Can you help confirm?
>> zap_pte_range() could have no concept of folios (mTHP), as the folios
>> could have gone. You could have the case:
>
> folio1:  last 16 slots of cluster1
> folio2:  first 16 slots of cluster2.
>
> folio1 and folio2 are within the same PMD and virtually contiguous
> before they are unmapped.
>
>> When both folio1 and folio2 have been released and all 32 PTEs are swap
>> entries, zap_pte_range()'s
>
> nr = swap_pte_batch(pte, max_nr, ptent);
>
>> nr will be 32. "mTHP swap entries allocated by cluster_alloc_swap() belong
>> to the same cluster" is correct, but by the time you zap_pte_range(), your
>> mTHPs could have been freed. swap_pte_batch() just returns the number of
>> swap entries. These 32 entries cross the boundary of one cluster.
I got it, thank you very much for the explanation.
>>> Please note that I didn't use a bitmap, to avoid a large stack, and
>>> there is a real possibility that the case below can occur; your patch
>>> can crash if the following is true:
>>> nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER
>>>
>>> Additionally, I quickly skip the case where
>>> swap_count(data_race(si->swap_map[start_offset])) != 1 to avoid
>>> regressions in cases that can't be batched.
>>>
>>>> Thanks
>>>> Zhiguo
>>>>
>>>>> [1] https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-5-cb9c148b9297@kernel.org/
>>>>> [2] https://lore.kernel.org/linux-mm/20240803091118.84274-1-21cnbao@gmail.com/
>>>>> [3] https://lore.kernel.org/linux-mm/CAGsJ_4wPnQqKOHx6iQcwO8bQzoBXKr2qY2AgSxMwTQCj3-8YWw@mail.gmail.com/
>>>>>
> Thanks
>   Barry
Thanks
Zhiguo



^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-08-06  8:12 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-08-05 16:07 [PATCH] mm: swap: mTHP frees entries as a whole Zhiguo Jiang
2024-08-05 22:09 ` Barry Song
2024-08-06  2:01   ` zhiguojiang
2024-08-06  2:07     ` Barry Song
2024-08-06  7:40       ` zhiguojiang
2024-08-06  6:48         ` Barry Song
2024-08-06  8:12           ` zhiguojiang
