linux-mm.kvack.org archive mirror
* [PATCH] shmem: support huge_fault to avoid pmd split
@ 2022-07-26 12:43 Liu Zixian
  2022-07-26 13:09 ` Matthew Wilcox
  2022-07-26 17:54 ` Yang Shi
  0 siblings, 2 replies; 5+ messages in thread
From: Liu Zixian @ 2022-07-26 12:43 UTC (permalink / raw)
  To: hughd, akpm, linux-mm; +Cc: linfeilong, liuzixian4

Transparent hugepages on tmpfs are useful for reducing TLB misses, but
they get split during a CoW memory fault.
This happens if we mprotect and rewrite a code segment (which is a
private file mapping) to hotpatch a running process.
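
For illustration (not part of this patch), a minimal sketch of such a
hotpatch sequence is below; the function and lengths are hypothetical
and error handling is omitted. The point is that the memcpy()
write-faults the private mapping and, without this patch, splits the
huge PMD backing it:

	#include <string.h>
	#include <sys/mman.h>

	/* text: a MAP_PRIVATE, PROT_READ|PROT_EXEC file mapping of the code */
	static void hotpatch(void *text, size_t len,
			     const void *new_code, size_t code_len)
	{
		mprotect(text, len, PROT_READ | PROT_WRITE);
		/* write fault -> CoW of the huge page */
		memcpy(text, new_code, code_len);
		mprotect(text, len, PROT_READ | PROT_EXEC);
	}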

We can avoid the splitting by adding a huge_fault function.

Signed-off-by: Liu Zixian <liuzixian4@huawei.com>
---
 mm/shmem.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index a6f565308..12b2b5140 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2120,6 +2120,51 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 	return ret;
 }
 
+static vm_fault_t shmem_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
+{
+	vm_fault_t ret = VM_FAULT_FALLBACK;
+	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	struct page *old_page, *new_page;
+	int gfp_flags = GFP_HIGHUSER_MOVABLE | __GFP_COMP;
+
+	/* read or shared fault will not split huge pmd */
+	if (!(vmf->flags & FAULT_FLAG_WRITE)
+			|| (vmf->vma->vm_flags & VM_SHARED))
+		return VM_FAULT_FALLBACK;
+	if (pe_size != PE_SIZE_PMD)
+		return VM_FAULT_FALLBACK;
+
+	if (pmd_none(*vmf->pmd)) {
+		if (shmem_fault(vmf) & VM_FAULT_ERROR)
+			goto out;
+		if (!PageTransHuge(vmf->page))
+			goto out;
+		old_page = vmf->page;
+	} else {
+		old_page = pmd_page(*vmf->pmd);
+		page_remove_rmap(old_page, vmf->vma, true);
+		pmdp_huge_clear_flush(vmf->vma, haddr, vmf->pmd);
+		add_mm_counter(vmf->vma->vm_mm, MM_SHMEMPAGES, -HPAGE_PMD_NR);
+	}
+
+	new_page = &vma_alloc_folio(gfp_flags, HPAGE_PMD_ORDER,
+			vmf->vma, haddr, true)->page;
+	if (!new_page)
+		goto out;
+	prep_transhuge_page(new_page);
+	copy_user_huge_page(new_page, old_page, haddr, vmf->vma, HPAGE_PMD_NR);
+	__SetPageUptodate(new_page);
+
+	ret = do_set_pmd(vmf, new_page);
+
+out:
+	if (vmf->page) {
+		unlock_page(vmf->page);
+		put_page(vmf->page);
+	}
+	return ret;
+}
+
 unsigned long shmem_get_unmapped_area(struct file *file,
 				      unsigned long uaddr, unsigned long len,
 				      unsigned long pgoff, unsigned long flags)
@@ -3884,6 +3929,7 @@ static const struct super_operations shmem_ops = {
 
 static const struct vm_operations_struct shmem_vm_ops = {
 	.fault		= shmem_fault,
+	.huge_fault	= shmem_huge_fault,
 	.map_pages	= filemap_map_pages,
 #ifdef CONFIG_NUMA
 	.set_policy     = shmem_set_policy,
-- 
2.33.0




* Re: [PATCH] shmem: support huge_fault to avoid pmd split
  2022-07-26 12:43 [PATCH] shmem: support huge_fault to avoid pmd split Liu Zixian
@ 2022-07-26 13:09 ` Matthew Wilcox
  2022-07-26 13:28   ` Liu Zixian
  2022-07-26 14:31   ` Kefeng Wang
  2022-07-26 17:54 ` Yang Shi
  1 sibling, 2 replies; 5+ messages in thread
From: Matthew Wilcox @ 2022-07-26 13:09 UTC (permalink / raw)
  To: Liu Zixian; +Cc: hughd, akpm, linux-mm, linfeilong

On Tue, Jul 26, 2022 at 08:43:15PM +0800, Liu Zixian wrote:
> Transparent hugepages on tmpfs are useful for reducing TLB misses, but
> they get split during a CoW memory fault.

That's intentional.  Possibly misguided, but there's a tradeoff to
be made between memory consumption and using large pages.

> This happens if we mprotect and rewrite a code segment (which is a
> private file mapping) to hotpatch a running process.
> 
> We can avoid the splitting by adding a huge_fault function.
> 
> Signed-off-by: Liu Zixian <liuzixian4@huawei.com>
> ---
>  mm/shmem.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 46 insertions(+)
> 
> diff --git a/mm/shmem.c b/mm/shmem.c
> index a6f565308..12b2b5140 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2120,6 +2120,51 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
>  	return ret;
>  }
>  
> +static vm_fault_t shmem_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
> +{
> +	vm_fault_t ret = VM_FAULT_FALLBACK;
> +	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> +	struct page *old_page, *new_page;
> +	int gfp_flags = GFP_HIGHUSER_MOVABLE | __GFP_COMP;
> +
> +	/* read or shared fault will not split huge pmd */
> +	if (!(vmf->flags & FAULT_FLAG_WRITE)
> +			|| (vmf->vma->vm_flags & VM_SHARED))
> +		return VM_FAULT_FALLBACK;
> +	if (pe_size != PE_SIZE_PMD)
> +		return VM_FAULT_FALLBACK;
> +
> +	if (pmd_none(*vmf->pmd)) {
> +		if (shmem_fault(vmf) & VM_FAULT_ERROR)
> +			goto out;
> +		if (!PageTransHuge(vmf->page))
> +			goto out;
> +		old_page = vmf->page;
> +	} else {
> +		old_page = pmd_page(*vmf->pmd);
> +		page_remove_rmap(old_page, vmf->vma, true);
> +		pmdp_huge_clear_flush(vmf->vma, haddr, vmf->pmd);
> +		add_mm_counter(vmf->vma->vm_mm, MM_SHMEMPAGES, -HPAGE_PMD_NR);
> +	}
> +
> +	new_page = &vma_alloc_folio(gfp_flags, HPAGE_PMD_ORDER,
> +			vmf->vma, haddr, true)->page;
> +	if (!new_page)
> +		goto out;
> +	prep_transhuge_page(new_page);

vma_alloc_folio() does the prep_transhuge_page() for you.

> +	copy_user_huge_page(new_page, old_page, haddr, vmf->vma, HPAGE_PMD_NR);
> +	__SetPageUptodate(new_page);
> +
> +	ret = do_set_pmd(vmf, new_page);
> +
> +out:
> +	if (vmf->page) {
> +		unlock_page(vmf->page);
> +		put_page(vmf->page);
> +	}
> +	return ret;
> +}
> +
>  unsigned long shmem_get_unmapped_area(struct file *file,
>  				      unsigned long uaddr, unsigned long len,
>  				      unsigned long pgoff, unsigned long flags)
> @@ -3884,6 +3929,7 @@ static const struct super_operations shmem_ops = {
>  
>  static const struct vm_operations_struct shmem_vm_ops = {
>  	.fault		= shmem_fault,
> +	.huge_fault	= shmem_huge_fault,
>  	.map_pages	= filemap_map_pages,
>  #ifdef CONFIG_NUMA
>  	.set_policy     = shmem_set_policy,
> -- 
> 2.33.0
> 
> 



* Re: Re: [PATCH] shmem: support huge_fault to avoid pmd split
  2022-07-26 13:09 ` Matthew Wilcox
@ 2022-07-26 13:28   ` Liu Zixian
  2022-07-26 14:31   ` Kefeng Wang
  1 sibling, 0 replies; 5+ messages in thread
From: Liu Zixian @ 2022-07-26 13:28 UTC (permalink / raw)
  To: willy; +Cc: akpm, hughd, linfeilong, linux-mm, liuzixian4

Thank you for your review! I've sent a v2 patch to remove prep_transhuge_page.

> That's intentional.  Possibly misguided, but there's a tradeoff to be made between memory consumption and using large pages.

Our tmpfs users have seen a ~5% performance degradation after hotpatching, and they think it's unacceptable.



* Re: [PATCH] shmem: support huge_fault to avoid pmd split
  2022-07-26 13:09 ` Matthew Wilcox
  2022-07-26 13:28   ` Liu Zixian
@ 2022-07-26 14:31   ` Kefeng Wang
  1 sibling, 0 replies; 5+ messages in thread
From: Kefeng Wang @ 2022-07-26 14:31 UTC (permalink / raw)
  To: Matthew Wilcox, Liu Zixian; +Cc: hughd, akpm, linux-mm, linfeilong


On 2022/7/26 21:09, Matthew Wilcox wrote:
> On Tue, Jul 26, 2022 at 08:43:15PM +0800, Liu Zixian wrote:
>> Transparent hugepages on tmpfs are useful for reducing TLB misses, but
>> they get split during a CoW memory fault.
> That's intentional.  Possibly misguided, but there's a tradeoff to
> be made between memory consumption and using large pages.
>
>> This happens if we mprotect and rewrite a code segment (which is a
>> private file mapping) to hotpatch a running process.
>>
>> We can avoid the splitting by adding a huge_fault function.
>>
>> Signed-off-by: Liu Zixian <liuzixian4@huawei.com>
>> ---
>>   mm/shmem.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 46 insertions(+)
>>
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index a6f565308..12b2b5140 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -2120,6 +2120,51 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
>>   	return ret;
>>   }
>>   
>> +static vm_fault_t shmem_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
>> +{
>> +	vm_fault_t ret = VM_FAULT_FALLBACK;
>> +	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>> +	struct page *old_page, *new_page;
>> +	int gfp_flags = GFP_HIGHUSER_MOVABLE | __GFP_COMP;

There are many uses of vmf->vma, so it would be better to add
'struct vm_area_struct *vma = vmf->vma;' at the top and use vma
directly, as in the sketch below.
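
For instance (a sketch of the suggested cleanup, not tested):

	struct vm_area_struct *vma = vmf->vma;

	/* read or shared fault will not split huge pmd */
	if (!(vmf->flags & FAULT_FLAG_WRITE) || (vma->vm_flags & VM_SHARED))
		return VM_FAULT_FALLBACK;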

>> +
>> +	/* read or shared fault will not split huge pmd */
>> +	if (!(vmf->flags & FAULT_FLAG_WRITE)
>> +			|| (vmf->vma->vm_flags & VM_SHARED))
>> +		return VM_FAULT_FALLBACK;
>> +	if (pe_size != PE_SIZE_PMD)
>> +		return VM_FAULT_FALLBACK;
return ret;
>> +
>> +	if (pmd_none(*vmf->pmd)) {
>> +		if (shmem_fault(vmf) & VM_FAULT_ERROR)
>> +			goto out;
>> +		if (!PageTransHuge(vmf->page))
>> +			goto out;
>> +		old_page = vmf->page;
>> +	} else {
>> +		old_page = pmd_page(*vmf->pmd);
>> +		page_remove_rmap(old_page, vmf->vma, true);
>> +		pmdp_huge_clear_flush(vmf->vma, haddr, vmf->pmd);
>> +		add_mm_counter(vmf->vma->vm_mm, MM_SHMEMPAGES, -HPAGE_PMD_NR);

MM_SHMEMPAGES -> mm_counter_file(page)
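
i.e. something like (a sketch of the suggested replacement):

	add_mm_counter(vmf->vma->vm_mm, mm_counter_file(old_page), -HPAGE_PMD_NR);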

>> +	}
>> +
directly use GFP_TRANSHUGE_LIGHT?
>> +	new_page = &vma_alloc_folio(gfp_flags, HPAGE_PMD_ORDER,
>> +			vmf->vma, haddr, true)->page;
>> +	if (!new_page)
add   count_vm_event(THP_FAULT_FALLBACK);
>> +		goto out;
>> +	prep_transhuge_page(new_page);
> vma_alloc_folio() does the prep_transhuge_page() for you.
>
>> +	copy_user_huge_page(new_page, old_page, haddr, vmf->vma, HPAGE_PMD_NR);
>> +	__SetPageUptodate(new_page);
>> +
>> +	ret = do_set_pmd(vmf, new_page);
>> +
>> +out:
>> +	if (vmf->page) {
>> +		unlock_page(vmf->page);
>> +		put_page(vmf->page);
>> +	}
>> +	return ret;
>> +}
>> +
>>   unsigned long shmem_get_unmapped_area(struct file *file,
>>   				      unsigned long uaddr, unsigned long len,
>>   				      unsigned long pgoff, unsigned long flags)
>> @@ -3884,6 +3929,7 @@ static const struct super_operations shmem_ops = {
>>   
>>   static const struct vm_operations_struct shmem_vm_ops = {
>>   	.fault		= shmem_fault,
>> +	.huge_fault	= shmem_huge_fault,
>>   	.map_pages	= filemap_map_pages,
>>   #ifdef CONFIG_NUMA
>>   	.set_policy     = shmem_set_policy,
>> -- 
>> 2.33.0
>>
>>
> .



* Re: [PATCH] shmem: support huge_fault to avoid pmd split
  2022-07-26 12:43 [PATCH] shmem: support huge_fault to avoid pmd split Liu Zixian
  2022-07-26 13:09 ` Matthew Wilcox
@ 2022-07-26 17:54 ` Yang Shi
  1 sibling, 0 replies; 5+ messages in thread
From: Yang Shi @ 2022-07-26 17:54 UTC (permalink / raw)
  To: Liu Zixian, Kirill A. Shutemov; +Cc: hughd, akpm, linux-mm, linfeilong

On Tue, Jul 26, 2022 at 5:43 AM Liu Zixian <liuzixian4@huawei.com> wrote:
>
> Transparent hugepages on tmpfs are useful for reducing TLB misses, but
> they get split during a CoW memory fault.
> This happens if we mprotect and rewrite a code segment (which is a
> private file mapping) to hotpatch a running process.

As Matthew said, it is intentional and a tradeoff between memory
consumption and performance. Other than that, file COW is more
complicated, and THP is actually not supported for private mappings
AFAIK, since those are anonymous pages mapped into a file VMA. So
private mapping THP support must be added before supporting your
use case.

>
> We can avoid the splitting by adding a huge_fault function.
>
> Signed-off-by: Liu Zixian <liuzixian4@huawei.com>
> ---
>  mm/shmem.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 46 insertions(+)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index a6f565308..12b2b5140 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2120,6 +2120,51 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
>         return ret;
>  }
>
> +static vm_fault_t shmem_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
> +{
> +       vm_fault_t ret = VM_FAULT_FALLBACK;
> +       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> +       struct page *old_page, *new_page;
> +       int gfp_flags = GFP_HIGHUSER_MOVABLE | __GFP_COMP;
> +
> +       /* read or shared fault will not split huge pmd */
> +       if (!(vmf->flags & FAULT_FLAG_WRITE)
> +                       || (vmf->vma->vm_flags & VM_SHARED))
> +               return VM_FAULT_FALLBACK;
> +       if (pe_size != PE_SIZE_PMD)
> +               return VM_FAULT_FALLBACK;
> +
> +       if (pmd_none(*vmf->pmd)) {
> +               if (shmem_fault(vmf) & VM_FAULT_ERROR)
> +                       goto out;
> +               if (!PageTransHuge(vmf->page))
> +                       goto out;
> +               old_page = vmf->page;
> +       } else {
> +               old_page = pmd_page(*vmf->pmd);
> +               page_remove_rmap(old_page, vmf->vma, true);
> +               pmdp_huge_clear_flush(vmf->vma, haddr, vmf->pmd);
> +               add_mm_counter(vmf->vma->vm_mm, MM_SHMEMPAGES, -HPAGE_PMD_NR);
> +       }
> +
> +       new_page = &vma_alloc_folio(gfp_flags, HPAGE_PMD_ORDER,
> +                       vmf->vma, haddr, true)->page;
> +       if (!new_page)
> +               goto out;
> +       prep_transhuge_page(new_page);
> +       copy_user_huge_page(new_page, old_page, haddr, vmf->vma, HPAGE_PMD_NR);
> +       __SetPageUptodate(new_page);
> +
> +       ret = do_set_pmd(vmf, new_page);

This is also totally wrong IIUC. You are actually allocating anonymous
pages, but do_set_pmd() is used for file pages. So all the
manipulations of anonymous rmap, accounting, LRU, memcg, etc. are
actually missing.
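
For reference, the anonymous huge-PMD install path (see
__do_huge_pmd_anonymous_page() in mm/huge_memory.c) does roughly the
following; this is only an outline, not compilable as-is, and exact
helper names and signatures differ between kernel versions:

	pmd_t entry;

	/* charge the new page to the memcg before mapping it */
	mem_cgroup_charge(page_folio(new_page), vma->vm_mm, gfp_flags);
	/* anonymous rmap, not page_add_file_rmap() */
	page_add_new_anon_rmap(new_page, vma, haddr);
	/* make the new page visible on the LRU */
	lru_cache_add_inactive_or_unevictable(new_page, vma);
	/* build and install the huge PMD entry */
	entry = mk_huge_pmd(new_page, vma->vm_page_prot);
	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
	/* anonymous, not shmem/file, accounting */
	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
	/* plus pgtable deposit, mm_inc_nr_ptes(), THP_FAULT_ALLOC counting */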

> +
> +out:
> +       if (vmf->page) {
> +               unlock_page(vmf->page);
> +               put_page(vmf->page);
> +       }
> +       return ret;
> +}
> +
>  unsigned long shmem_get_unmapped_area(struct file *file,
>                                       unsigned long uaddr, unsigned long len,
>                                       unsigned long pgoff, unsigned long flags)
> @@ -3884,6 +3929,7 @@ static const struct super_operations shmem_ops = {
>
>  static const struct vm_operations_struct shmem_vm_ops = {
>         .fault          = shmem_fault,
> +       .huge_fault     = shmem_huge_fault,
>         .map_pages      = filemap_map_pages,
>  #ifdef CONFIG_NUMA
>         .set_policy     = shmem_set_policy,
> --
> 2.33.0
>
>


