linux-mm.kvack.org archive mirror
* [PATCH v2 0/2] Compute contiguous empty PTEs for mTHP efficiently
@ 2024-09-16 11:07 Dev Jain
  2024-09-16 11:07 ` [PATCH v2 1/2] mm: Make pte_range_none() return number of empty PTEs Dev Jain
  2024-09-16 11:07 ` [PATCH v2 2/2] mm: Compute first_set_pte to eliminate evaluating redundant ranges Dev Jain
  0 siblings, 2 replies; 9+ messages in thread
From: Dev Jain @ 2024-09-16 11:07 UTC (permalink / raw)
  To: akpm, david, willy
  Cc: ryan.roberts, anshuman.khandual, baohua, hughd, ioworker0,
	wangkefeng.wang, baolin.wang, gshan, linux-kernel, linux-mm,
	Dev Jain

We use pte_range_none() to determine whether a contiguous range of PTEs
is empty for an mTHP allocation. Instead of scanning each range from
scratch on every iteration of the while loop, use the position of the
first non-empty PTE found in the previous iteration to eliminate some
cases: a smaller nested range ending at or before that PTE is known to
be empty, and one containing it is known to fail, so neither needs
rescanning.

v1->v2:
 - Break into two patches

v1: https://lore.kernel.org/all/20240913091902.1160520-1-dev.jain@arm.com/

Dev Jain (2):
  mm: Make pte_range_none() return number of empty PTEs
  mm: Compute first_set_pte to eliminate evaluating redundant ranges

 mm/memory.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

-- 
2.30.2




* [PATCH v2 1/2] mm: Make pte_range_none() return number of empty PTEs
  2024-09-16 11:07 [PATCH v2 0/2] Compute contiguous empty PTEs for mTHP efficiently Dev Jain
@ 2024-09-16 11:07 ` Dev Jain
  2024-09-18  7:03   ` Baolin Wang
  2024-09-19  1:38   ` Barry Song
  2024-09-16 11:07 ` [PATCH v2 2/2] mm: Compute first_set_pte to eliminate evaluating redundant ranges Dev Jain
  1 sibling, 2 replies; 9+ messages in thread
From: Dev Jain @ 2024-09-16 11:07 UTC (permalink / raw)
  To: akpm, david, willy
  Cc: ryan.roberts, anshuman.khandual, baohua, hughd, ioworker0,
	wangkefeng.wang, baolin.wang, gshan, linux-kernel, linux-mm,
	Dev Jain

In preparation for the second patch, make pte_range_none() return
the number of contiguous empty PTEs.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/memory.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 6469ac99f2f7..8bb1236de93c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4617,16 +4617,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	return ret;
 }
 
-static bool pte_range_none(pte_t *pte, int nr_pages)
+static int pte_range_none(pte_t *pte, int nr_pages)
 {
 	int i;
 
 	for (i = 0; i < nr_pages; i++) {
 		if (!pte_none(ptep_get_lockless(pte + i)))
-			return false;
+			return i;
 	}
 
-	return true;
+	return nr_pages;
 }
 
 static struct folio *alloc_anon_folio(struct vm_fault *vmf)
@@ -4671,7 +4671,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 	order = highest_order(orders);
 	while (orders) {
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
-		if (pte_range_none(pte + pte_index(addr), 1 << order))
+		if (pte_range_none(pte + pte_index(addr), 1 << order) == 1 << order)
 			break;
 		order = next_order(&orders, order);
 	}
@@ -4787,7 +4787,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 	if (nr_pages == 1 && vmf_pte_changed(vmf)) {
 		update_mmu_tlb(vma, addr, vmf->pte);
 		goto release;
-	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
+	} else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) {
 		update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
 		goto release;
 	}
@@ -5121,7 +5121,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 		update_mmu_tlb(vma, addr, vmf->pte);
 		ret = VM_FAULT_NOPAGE;
 		goto unlock;
-	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
+	} else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) {
 		update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
 		ret = VM_FAULT_NOPAGE;
 		goto unlock;
-- 
2.30.2




* [PATCH v2 2/2] mm: Compute first_set_pte to eliminate evaluating redundant ranges
  2024-09-16 11:07 [PATCH v2 0/2] Compute contiguous empty PTEs for mTHP efficiently Dev Jain
  2024-09-16 11:07 ` [PATCH v2 1/2] mm: Make pte_range_none() return number of empty PTEs Dev Jain
@ 2024-09-16 11:07 ` Dev Jain
  2024-09-19  1:34   ` Barry Song
  1 sibling, 1 reply; 9+ messages in thread
From: Dev Jain @ 2024-09-16 11:07 UTC (permalink / raw)
  To: akpm, david, willy
  Cc: ryan.roberts, anshuman.khandual, baohua, hughd, ioworker0,
	wangkefeng.wang, baolin.wang, gshan, linux-kernel, linux-mm,
	Dev Jain

For an mTHP allocation, we need to check, for every candidate order,
whether enough contiguous PTEs are empty for that order. Instead of
scanning each range from scratch on every iteration of the while loop,
use the first set PTE found in the previous iteration to eliminate some
cases. The key to the correctness of the patch is that the ranges we
examine form a strictly decreasing sequence of nested intervals, all
containing the faulting address.
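
For example (indices within the scanned PTE table, purely
illustrative): with orders 4, 3 and 2 enabled and a fault at PTE index
6, the order-4 range covers indices 0-15, the order-3 range 0-7 and
the order-2 range 4-7, each nested in the previous one. If the order-4
scan finds the first set PTE at index 9, the order-3 range ends before
it and is known to be empty without rescanning. If instead the first
set PTE is at index 3, the order-3 range contains it and can be
skipped outright; only the order-2 range, which starts after index 3,
needs a fresh pte_range_none() scan.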

Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/memory.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 8bb1236de93c..e81c6abe09ce 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4633,10 +4633,11 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	pte_t *first_set_pte = NULL, *align_pte, *pte;
 	unsigned long orders;
 	struct folio *folio;
 	unsigned long addr;
-	pte_t *pte;
+	int max_empty;
 	gfp_t gfp;
 	int order;
 
@@ -4671,8 +4672,23 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 	order = highest_order(orders);
 	while (orders) {
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
-		if (pte_range_none(pte + pte_index(addr), 1 << order) == 1 << order)
+		align_pte = pte + pte_index(addr);
+
+		/* Range to be scanned known to be empty */
+		if (align_pte + (1 << order) <= first_set_pte)
+			break;
+
+		/* Range to be scanned contains first_set_pte */
+		if (align_pte <= first_set_pte)
+			goto repeat;
+
+		/* align_pte > first_set_pte, so need to check properly */
+		max_empty = pte_range_none(align_pte, 1 << order);
+		if (max_empty == 1 << order)
 			break;
+
+		first_set_pte = align_pte + max_empty;
+repeat:
 		order = next_order(&orders, order);
 	}
 
-- 
2.30.2




* Re: [PATCH v2 1/2] mm: Make pte_range_none() return number of empty PTEs
  2024-09-16 11:07 ` [PATCH v2 1/2] mm: Make pte_range_none() return number of empty PTEs Dev Jain
@ 2024-09-18  7:03   ` Baolin Wang
  2024-09-19  1:38   ` Barry Song
  1 sibling, 0 replies; 9+ messages in thread
From: Baolin Wang @ 2024-09-18  7:03 UTC (permalink / raw)
  To: Dev Jain, akpm, david, willy
  Cc: ryan.roberts, anshuman.khandual, baohua, hughd, ioworker0,
	wangkefeng.wang, gshan, linux-kernel, linux-mm



On 2024/9/16 19:07, Dev Jain wrote:
> In preparation for the second patch, make pte_range_none() return
> the number of contiguous empty PTEs.
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>

LGTM.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>

> ---
>   mm/memory.c | 12 ++++++------
>   1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 6469ac99f2f7..8bb1236de93c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4617,16 +4617,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>   	return ret;
>   }
>   
> -static bool pte_range_none(pte_t *pte, int nr_pages)
> +static int pte_range_none(pte_t *pte, int nr_pages)
>   {
>   	int i;
>   
>   	for (i = 0; i < nr_pages; i++) {
>   		if (!pte_none(ptep_get_lockless(pte + i)))
> -			return false;
> +			return i;
>   	}
>   
> -	return true;
> +	return nr_pages;
>   }
>   
>   static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> @@ -4671,7 +4671,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>   	order = highest_order(orders);
>   	while (orders) {
>   		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> -		if (pte_range_none(pte + pte_index(addr), 1 << order))
> +		if (pte_range_none(pte + pte_index(addr), 1 << order) == 1 << order)
>   			break;
>   		order = next_order(&orders, order);
>   	}
> @@ -4787,7 +4787,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>   	if (nr_pages == 1 && vmf_pte_changed(vmf)) {
>   		update_mmu_tlb(vma, addr, vmf->pte);
>   		goto release;
> -	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
> +	} else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) {
>   		update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
>   		goto release;
>   	}
> @@ -5121,7 +5121,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>   		update_mmu_tlb(vma, addr, vmf->pte);
>   		ret = VM_FAULT_NOPAGE;
>   		goto unlock;
> -	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
> +	} else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) {
>   		update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
>   		ret = VM_FAULT_NOPAGE;
>   		goto unlock;



* Re: [PATCH v2 2/2] mm: Compute first_set_pte to eliminate evaluating redundant ranges
  2024-09-16 11:07 ` [PATCH v2 2/2] mm: Compute first_set_pte to eliminate evaluating redundant ranges Dev Jain
@ 2024-09-19  1:34   ` Barry Song
  2024-09-19  8:40     ` Dev Jain
  0 siblings, 1 reply; 9+ messages in thread
From: Barry Song @ 2024-09-19  1:34 UTC (permalink / raw)
  To: Dev Jain
  Cc: akpm, david, willy, ryan.roberts, anshuman.khandual, hughd,
	ioworker0, wangkefeng.wang, baolin.wang, gshan, linux-kernel,
	linux-mm

On Mon, Sep 16, 2024 at 11:08 PM Dev Jain <dev.jain@arm.com> wrote:
>
> For an mTHP allocation, we need to check, for every order, whether for
> that order, we have enough number of contiguous PTEs empty. Instead of
> iterating the while loop for every order, use some information, which
> is the first set PTE found, from the previous iteration to eliminate
> some cases. The key to understanding the correctness of the patch
> is that the ranges we want to examine form a strictly decreasing
> sequence of nested intervals.

Could we include some benchmark data here, as suggested by Ryan in this thread?

https://lore.kernel.org/linux-mm/58f91a56-890a-45d0-8b1f-47c4c70c9600@arm.com/

>
> Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>

> ---
>  mm/memory.c | 20 ++++++++++++++++++--
>  1 file changed, 18 insertions(+), 2 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 8bb1236de93c..e81c6abe09ce 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4633,10 +4633,11 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>  {
>         struct vm_area_struct *vma = vmf->vma;
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +       pte_t *first_set_pte = NULL, *align_pte, *pte;
>         unsigned long orders;
>         struct folio *folio;
>         unsigned long addr;
> -       pte_t *pte;
> +       int max_empty;
>         gfp_t gfp;
>         int order;
>
> @@ -4671,8 +4672,23 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>         order = highest_order(orders);
>         while (orders) {
>                 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> -               if (pte_range_none(pte + pte_index(addr), 1 << order) == 1 << order)
> +               align_pte = pte + pte_index(addr);
> +
> +               /* Range to be scanned known to be empty */
> +               if (align_pte + (1 << order) <= first_set_pte)
> +                       break;
> +
> +               /* Range to be scanned contains first_set_pte */
> +               if (align_pte <= first_set_pte)
> +                       goto repeat;
> +
> +               /* align_pte > first_set_pte, so need to check properly */
> +               max_empty = pte_range_none(align_pte, 1 << order);
> +               if (max_empty == 1 << order)
>                         break;
> +
> +               first_set_pte = align_pte + max_empty;
> +repeat:
>                 order = next_order(&orders, order);
>         }
>
> --
> 2.30.2
>

Thanks
barry



* Re: [PATCH v2 1/2] mm: Make pte_range_none() return number of empty PTEs
  2024-09-16 11:07 ` [PATCH v2 1/2] mm: Make pte_range_none() return number of empty PTEs Dev Jain
  2024-09-18  7:03   ` Baolin Wang
@ 2024-09-19  1:38   ` Barry Song
  1 sibling, 0 replies; 9+ messages in thread
From: Barry Song @ 2024-09-19  1:38 UTC (permalink / raw)
  To: Dev Jain
  Cc: akpm, david, willy, ryan.roberts, anshuman.khandual, hughd,
	ioworker0, wangkefeng.wang, baolin.wang, gshan, linux-kernel,
	linux-mm

On Mon, Sep 16, 2024 at 11:08 PM Dev Jain <dev.jain@arm.com> wrote:
>
> In preparation for the second patch, make pte_range_none() return
> the number of contiguous empty PTEs.
>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  mm/memory.c | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 6469ac99f2f7..8bb1236de93c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4617,16 +4617,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>         return ret;
>  }
>
> -static bool pte_range_none(pte_t *pte, int nr_pages)
> +static int pte_range_none(pte_t *pte, int nr_pages)
>  {
>         int i;
>
>         for (i = 0; i < nr_pages; i++) {
>                 if (!pte_none(ptep_get_lockless(pte + i)))
> -                       return false;
> +                       return i;
>         }
>
> -       return true;
> +       return nr_pages;
>  }
>
>  static struct folio *alloc_anon_folio(struct vm_fault *vmf)
> @@ -4671,7 +4671,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>         order = highest_order(orders);
>         while (orders) {
>                 addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
> -               if (pte_range_none(pte + pte_index(addr), 1 << order))
> +               if (pte_range_none(pte + pte_index(addr), 1 << order) == 1 << order)

Minor suggestion: it's a bit odd that we're doing 1 << order twice.
Perhaps consider
introducing a local variable, nr_pages, for clarity.
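
Something like this, perhaps (untested, just to illustrate):

	order = highest_order(orders);
	while (orders) {
		int nr_pages = 1 << order;

		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
		if (pte_range_none(pte + pte_index(addr), nr_pages) == nr_pages)
			break;
		order = next_order(&orders, order);
	}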

Otherwise,

Reviewed-by: Barry Song <baohua@kernel.org>


>                         break;
>                 order = next_order(&orders, order);
>         }
> @@ -4787,7 +4787,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>         if (nr_pages == 1 && vmf_pte_changed(vmf)) {
>                 update_mmu_tlb(vma, addr, vmf->pte);
>                 goto release;
> -       } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
> +       } else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) {
>                 update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
>                 goto release;
>         }
> @@ -5121,7 +5121,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>                 update_mmu_tlb(vma, addr, vmf->pte);
>                 ret = VM_FAULT_NOPAGE;
>                 goto unlock;
> -       } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
> +       } else if (nr_pages > 1 && pte_range_none(vmf->pte, nr_pages) != nr_pages) {
>                 update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
>                 ret = VM_FAULT_NOPAGE;
>                 goto unlock;
> --
> 2.30.2
>

Thanks
Barry



* Re: [PATCH v2 2/2] mm: Compute first_set_pte to eliminate evaluating redundant ranges
  2024-09-19  1:34   ` Barry Song
@ 2024-09-19  8:40     ` Dev Jain
  2024-09-19 16:55       ` Ryan Roberts
  0 siblings, 1 reply; 9+ messages in thread
From: Dev Jain @ 2024-09-19  8:40 UTC (permalink / raw)
  To: Barry Song
  Cc: akpm, david, willy, ryan.roberts, anshuman.khandual, hughd,
	ioworker0, wangkefeng.wang, baolin.wang, gshan, linux-kernel,
	linux-mm


On 9/19/24 07:04, Barry Song wrote:
> On Mon, Sep 16, 2024 at 11:08 PM Dev Jain <dev.jain@arm.com> wrote:
>> For an mTHP allocation, we need to check, for every order, whether for
>> that order, we have enough number of contiguous PTEs empty. Instead of
>> iterating the while loop for every order, use some information, which
>> is the first set PTE found, from the previous iteration to eliminate
>> some cases. The key to understanding the correctness of the patch
>> is that the ranges we want to examine form a strictly decreasing
>> sequence of nested intervals.
> Could we include some benchmark data here, as suggested by Ryan in this thread?
>
> https://lore.kernel.org/linux-mm/58f91a56-890a-45d0-8b1f-47c4c70c9600@arm.com/

Can you please verify and get some numbers for the following program,
because if I am doing this correctly, it would be a regression :)
https://www.codedump.xyz/cpp/Zuvf8FwvRPH21UO2

The program does this: disable THP completely -> mmap a 1G VMA -> touch
the last page of every 32K-sized block. That is, within each block,
pages 0 through (32K/4K - 2) are left empty while page (32K/4K - 1) is
touched -> madvise the entire VMA -> enable all THP sizes except 2M ->
touch all pages.

Therefore, PTEs 0 - 6 of each block are empty and the 7th is filled, and
so on. Eventually, the kernel falls back to finding 4 contiguous empty
PTEs and allocates a 4K * 4 = 16K mTHP.
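
In case the paste link rots, here is roughly what the program does (a
sketch reconstructed from the description above; the sysfs writes are
folded into a helper, error handling is dropped, and the exact set of
per-size knobs is assumed):

#include <stdio.h>
#include <sys/mman.h>

#define PAGE_SZ 4096UL

static void thp_set(const char *knob, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/mm/transparent_hugepage/%s", knob);
	f = fopen(path, "w");
	if (f) {
		fputs(val, f);
		fclose(f);
	}
}

int main(void)
{
	unsigned long size = 1UL << 30;		/* 1G VMA */
	unsigned long border = 32 * 1024;	/* 32K blocks */
	char *addr;

	thp_set("enabled", "never");		/* disable THP completely */

	addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* touch the last 4K page of each 32K block */
	for (unsigned long i = 1; (i * border) < size; ++i)
		addr[i * border - PAGE_SZ] = 1;

	madvise(addr, size, MADV_HUGEPAGE);	/* madvise the entire VMA */

	/* enable all mTHP sizes except 2M */
	thp_set("hugepages-16kB/enabled", "always");
	thp_set("hugepages-32kB/enabled", "always");
	thp_set("hugepages-64kB/enabled", "always");
	/* ... likewise up to 1024kB, leaving hugepages-2048kB at never */

	/* touch all pages */
	for (unsigned long i = 0; i < size; i += PAGE_SZ)
		addr[i] = 1;

	munmap(addr, size);
	return 0;
}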

The result without the patches:

real: 8.250s
user: 0.941s
sys: 7.077s

real: 8.175s
user: 0.939s
sys: 7.021s

With the patches:

real: 8.584s
user: 1.089s
sys: 7.234s

real: 8.429s
user: 0.954s
sys: 7.220s

You can increase the #iterations in the for loop to magnify this;
surprisingly, the current code wins.


>
>> Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   mm/memory.c | 20 ++++++++++++++++++--
>>   1 file changed, 18 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 8bb1236de93c..e81c6abe09ce 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4633,10 +4633,11 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>   {
>>          struct vm_area_struct *vma = vmf->vma;
>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +       pte_t *first_set_pte = NULL, *align_pte, *pte;
>>          unsigned long orders;
>>          struct folio *folio;
>>          unsigned long addr;
>> -       pte_t *pte;
>> +       int max_empty;
>>          gfp_t gfp;
>>          int order;
>>
>> @@ -4671,8 +4672,23 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
>>          order = highest_order(orders);
>>          while (orders) {
>>                  addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>> -               if (pte_range_none(pte + pte_index(addr), 1 << order) == 1 << order)
>> +               align_pte = pte + pte_index(addr);
>> +
>> +               /* Range to be scanned known to be empty */
>> +               if (align_pte + (1 << order) <= first_set_pte)
>> +                       break;
>> +
>> +               /* Range to be scanned contains first_set_pte */
>> +               if (align_pte <= first_set_pte)
>> +                       goto repeat;
>> +
>> +               /* align_pte > first_set_pte, so need to check properly */
>> +               max_empty = pte_range_none(align_pte, 1 << order);
>> +               if (max_empty == 1 << order)
>>                          break;
>> +
>> +               first_set_pte = align_pte + max_empty;
>> +repeat:
>>                  order = next_order(&orders, order);
>>          }
>>
>> --
>> 2.30.2
>>
> Thanks
> barry



* Re: [PATCH v2 2/2] mm: Compute first_set_pte to eliminate evaluating redundant ranges
  2024-09-19  8:40     ` Dev Jain
@ 2024-09-19 16:55       ` Ryan Roberts
  2024-09-20  4:04         ` Dev Jain
  0 siblings, 1 reply; 9+ messages in thread
From: Ryan Roberts @ 2024-09-19 16:55 UTC (permalink / raw)
  To: Dev Jain, Barry Song
  Cc: akpm, david, willy, anshuman.khandual, hughd, ioworker0,
	wangkefeng.wang, baolin.wang, gshan, linux-kernel, linux-mm

On 19/09/2024 09:40, Dev Jain wrote:
> 
> On 9/19/24 07:04, Barry Song wrote:
>> On Mon, Sep 16, 2024 at 11:08 PM Dev Jain <dev.jain@arm.com> wrote:
>>> For an mTHP allocation, we need to check, for every order, whether for
>>> that order, we have enough number of contiguous PTEs empty. Instead of
>>> iterating the while loop for every order, use some information, which
>>> is the first set PTE found, from the previous iteration to eliminate
>>> some cases. The key to understanding the correctness of the patch
>>> is that the ranges we want to examine form a strictly decreasing
>>> sequence of nested intervals.
>> Could we include some benchmark data here, as suggested by Ryan in this thread?
>>
>> https://lore.kernel.org/linux-mm/58f91a56-890a-45d0-8b1f-47c4c70c9600@arm.com/
> 
> Can you please verify and get some numbers for the following program,
> because if I am doing this correctly, it would be a regression :)
> https://www.codedump.xyz/cpp/Zuvf8FwvRPH21UO2

Some brief comments on the test code:

- You don't need to enable/disable the top-level control. Regardless, I don't
think this breaks the benchmark.

- I think you have an off-by-1 in your for loop condition:

for (unsigned long i = 1; (i * border) < size; ++i) {

I think this needs to be:

for (unsigned long i = 1; (i * border) <= size; ++i) {

It just means that the final 32K block will get a single 32K mapping.

- You're measuring the whole program, including mmap/munmap and
enabling/disabling mTHP. It would be much better to just measure the
loop that writes to each page after mTHP is enabled.


I modified the code to iterate for 10 seconds and on each iteration, measure
only the time spent in the interesting loop (the measurement is sketched
after the numbers below). Running on Apple M2 VM:

Before the change:

ubuntu@ubuntuvm:~/scan-pte$ sudo ./scan-pte
Average: 0.070028 seconds per GB (iterations=98)
ubuntu@ubuntuvm:~/scan-pte$ sudo ./scan-pte
Average: 0.068495 seconds per GB (iterations=96)
ubuntu@ubuntuvm:~/scan-pte$ sudo ./scan-pte
Average: 0.070207 seconds per GB (iterations=93)

After the change:

ubuntu@ubuntuvm:~/scan-pte$ sudo ./scan-pte
Average: 0.076923 seconds per GB (iterations=88)
ubuntu@ubuntuvm:~/scan-pte$ sudo ./scan-pte
Average: 0.072206 seconds per GB (iterations=96)
ubuntu@ubuntuvm:~/scan-pte$ sudo ./scan-pte
Average: 0.072397 seconds per GB (iterations=89)

So this looks pretty clearly slower to me, and I suggest we shouldn't take
this patch.
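
For reference, the per-iteration measurement was along these lines (a
sketch only; the mmap/touch/madvise setup and teardown between
iterations are elided, and size is the 1G VMA size):

	#include <stdio.h>
	#include <time.h>

	static double since(const struct timespec *s)
	{
		struct timespec now;

		clock_gettime(CLOCK_MONOTONIC, &now);
		return (now.tv_sec - s->tv_sec) +
		       (now.tv_nsec - s->tv_nsec) / 1e9;
	}

	...
		struct timespec wall, t;
		double total = 0;
		int iters = 0;

		clock_gettime(CLOCK_MONOTONIC, &wall);
		while (since(&wall) < 10.0) {
			/* mmap, sparse touch, madvise, enable mTHP */
			clock_gettime(CLOCK_MONOTONIC, &t);
			for (unsigned long i = 0; i < size; i += 4096)
				addr[i] = 1;	/* only this is timed */
			total += since(&t);
			iters++;
			/* munmap, reset mTHP controls */
		}
		printf("Average: %f seconds per GB (iterations=%d)\n",
		       total / iters, iters);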


> 
> The program does this: disable THP completely -> mmap 1G VMA -> touch the last
> page of a 32K sized boundary. That is, 0th till 32K/4K - 2 pages are
> empty, while the 32K/4K - 1'th page is touched, and so on -> madvise
> the entire VMA -> enable all THPs except 2M -> touch all pages.
> 
> Therefore, we have 0 - 6 PTEs empty, 7th is filled, and so on. Eventually,
> kernel will fall down to finding 4 contiguous PTEs empty and allocate
> 4K * 4 = 16K mTHP.
> 
> The result without the patches:
> 
> real: 8.250s
> user: 0.941s
> sys: 7.077s
> 
> real: 8.175s
> user: 0.939s
> sys: 7.021s
> 
> With the patches:
> 
> real: 8.584s
> user: 1.089s
> sys: 7.234s
> 
> real: 8.429s
> user: 0.954s
> sys: 7.220s

What HW did you measure this on? I'm guessing this is measuring multiple
iterations, otherwise it looks extremely slow. If you were measuring on FVP, for
example, that would not give representative performance numbers.

Thanks,
Ryan

> 
> You can change the #iterations in the for loop to magnify this,
> and the current code surprisingly wins.
> 
> 
>>
>>> Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>> ---
>>>   mm/memory.c | 20 ++++++++++++++++++--
>>>   1 file changed, 18 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 8bb1236de93c..e81c6abe09ce 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -4633,10 +4633,11 @@ static struct folio *alloc_anon_folio(struct vm_fault
>>> *vmf)
>>>   {
>>>          struct vm_area_struct *vma = vmf->vma;
>>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> +       pte_t *first_set_pte = NULL, *align_pte, *pte;
>>>          unsigned long orders;
>>>          struct folio *folio;
>>>          unsigned long addr;
>>> -       pte_t *pte;
>>> +       int max_empty;
>>>          gfp_t gfp;
>>>          int order;
>>>
>>> @@ -4671,8 +4672,23 @@ static struct folio *alloc_anon_folio(struct vm_fault
>>> *vmf)
>>>          order = highest_order(orders);
>>>          while (orders) {
>>>                  addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>> -               if (pte_range_none(pte + pte_index(addr), 1 << order) == 1 <<
>>> order)
>>> +               align_pte = pte + pte_index(addr);
>>> +
>>> +               /* Range to be scanned known to be empty */
>>> +               if (align_pte + (1 << order) <= first_set_pte)
>>> +                       break;
>>> +
>>> +               /* Range to be scanned contains first_set_pte */
>>> +               if (align_pte <= first_set_pte)
>>> +                       goto repeat;
>>> +
>>> +               /* align_pte > first_set_pte, so need to check properly */
>>> +               max_empty = pte_range_none(align_pte, 1 << order);
>>> +               if (max_empty == 1 << order)
>>>                          break;
>>> +
>>> +               first_set_pte = align_pte + max_empty;
>>> +repeat:
>>>                  order = next_order(&orders, order);
>>>          }
>>>
>>> -- 
>>> 2.30.2
>>>
>> Thanks
>> barry




* Re: [PATCH v2 2/2] mm: Compute first_set_pte to eliminate evaluating redundant ranges
  2024-09-19 16:55       ` Ryan Roberts
@ 2024-09-20  4:04         ` Dev Jain
  0 siblings, 0 replies; 9+ messages in thread
From: Dev Jain @ 2024-09-20  4:04 UTC (permalink / raw)
  To: Ryan Roberts, Barry Song
  Cc: akpm, david, willy, anshuman.khandual, hughd, ioworker0,
	wangkefeng.wang, baolin.wang, gshan, linux-kernel, linux-mm


On 9/19/24 22:25, Ryan Roberts wrote:
> On 19/09/2024 09:40, Dev Jain wrote:
>> On 9/19/24 07:04, Barry Song wrote:
>>> On Mon, Sep 16, 2024 at 11:08 PM Dev Jain <dev.jain@arm.com> wrote:
>>>> For an mTHP allocation, we need to check, for every order, whether for
>>>> that order, we have enough number of contiguous PTEs empty. Instead of
>>>> iterating the while loop for every order, use some information, which
>>>> is the first set PTE found, from the previous iteration to eliminate
>>>> some cases. The key to understanding the correctness of the patch
>>>> is that the ranges we want to examine form a strictly decreasing
>>>> sequence of nested intervals.
>>> Could we include some benchmark data here, as suggested by Ryan in this thread?
>>>
>>> https://lore.kernel.org/linux-mm/58f91a56-890a-45d0-8b1f-47c4c70c9600@arm.com/
>> Can you please verify and get some numbers for the following program,
>> because if I am doing this correctly, it would be a regression :)
>> https://www.codedump.xyz/cpp/Zuvf8FwvRPH21UO2
> Some brief comments on the test code:
>
> - You don't need to enable/disable the top-level control. Regardless, I don't
> think this breaks the benchmark.
>
> - I think you have an off-by-1 in your for loop condition:
>
> for (unsigned long i = 1; (i * border) < size; ++i) {
>
> I think this needs to be:
>
> for (unsigned long i = 1; (i * border) <= size; ++i) {
>
> It just means that the final 32K block will get a single 32K mapping.
>
> - You're measuring the whole program; including mmap/munmap and
> enabling/disabling mTHP. It would be much better to just measure the loop that
> writes to each page after mTHP is enabled.
>
>
> I modified the code to iterate for 10 seconds and on each iteration, measure
> only the time spent in the interesting loop. Running on Apple M2 VM:
>
> Before the change:
>
> ubuntu@ubuntuvm:~/scan-pte$ sudo ./scan-pte
> Average: 0.070028 seconds per GB (iterations=98)
> ubuntu@ubuntuvm:~/scan-pte$ sudo ./scan-pte
> Average: 0.068495 seconds per GB (iterations=96)
> ubuntu@ubuntuvm:~/scan-pte$ sudo ./scan-pte
> Average: 0.070207 seconds per GB (iterations=93)
>
> After the change:
>
> ubuntu@ubuntuvm:~/scan-pte$ sudo ./scan-pte
> Average: 0.076923 seconds per GB (iterations=88)
> ubuntu@ubuntuvm:~/scan-pte$ sudo ./scan-pte
> Average: 0.072206 seconds per GB (iterations=96)
> ubuntu@ubuntuvm:~/scan-pte$ sudo ./scan-pte
> Average: 0.072397 seconds per GB (iterations=89)
>
> So this looks pretty clearly slower to me. So suggest we shouldn't take this patch.

Thanks for testing!

>
>
>> The program does this: disable THP completely -> mmap 1G VMA -> touch the last
>> page of a 32K sized boundary. That is, 0th till 32K/4K - 2 pages are
>> empty, while the 32K/4K - 1'th page is touched, and so on -> madvise
>> the entire VMA -> enable all THPs except 2M -> touch all pages.
>>
>> Therefore, we have 0 - 6 PTEs empty, 7th is filled, and so on. Eventually,
>> kernel will fall down to finding 4 contiguous PTEs empty and allocate
>> 4K * 4 = 16K mTHP.
>>
>> The result without the patches:
>>
>> real: 8.250s
>> user: 0.941s
>> sys: 7.077s
>>
>> real: 8.175s
>> user: 0.939s
>> sys: 7.021s
>>
>> With the patches:
>>
>> real: 8.584s
>> user: 1.089s
>> sys: 7.234s
>>
>> real: 8.429s
>> user: 0.954s
>> sys: 7.220s
> What HW did you measure this on? I'm guessing this is measuring multiple
> iterations, otherwise it looks extremely slow. If you were measuring on FVP, for
> example, that would not give representative performance numbers.

I measured with qemu.

>
> Thanks,
> Ryan
>
>> You can change the #iterations in the for loop to magnify this,
>> and the current code surprisingly wins.
>>
>>
>>>> Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
>>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>>> ---
>>>>    mm/memory.c | 20 ++++++++++++++++++--
>>>>    1 file changed, 18 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index 8bb1236de93c..e81c6abe09ce 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -4633,10 +4633,11 @@ static struct folio *alloc_anon_folio(struct vm_fault
>>>> *vmf)
>>>>    {
>>>>           struct vm_area_struct *vma = vmf->vma;
>>>>    #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>> +       pte_t *first_set_pte = NULL, *align_pte, *pte;
>>>>           unsigned long orders;
>>>>           struct folio *folio;
>>>>           unsigned long addr;
>>>> -       pte_t *pte;
>>>> +       int max_empty;
>>>>           gfp_t gfp;
>>>>           int order;
>>>>
>>>> @@ -4671,8 +4672,23 @@ static struct folio *alloc_anon_folio(struct vm_fault
>>>> *vmf)
>>>>           order = highest_order(orders);
>>>>           while (orders) {
>>>>                   addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
>>>> -               if (pte_range_none(pte + pte_index(addr), 1 << order) == 1 <<
>>>> order)
>>>> +               align_pte = pte + pte_index(addr);
>>>> +
>>>> +               /* Range to be scanned known to be empty */
>>>> +               if (align_pte + (1 << order) <= first_set_pte)
>>>> +                       break;
>>>> +
>>>> +               /* Range to be scanned contains first_set_pte */
>>>> +               if (align_pte <= first_set_pte)
>>>> +                       goto repeat;
>>>> +
>>>> +               /* align_pte > first_set_pte, so need to check properly */
>>>> +               max_empty = pte_range_none(align_pte, 1 << order);
>>>> +               if (max_empty == 1 << order)
>>>>                           break;
>>>> +
>>>> +               first_set_pte = align_pte + max_empty;
>>>> +repeat:
>>>>                   order = next_order(&orders, order);
>>>>           }
>>>>
>>>> -- 
>>>> 2.30.2
>>>>
>>> Thanks
>>> barry


