From: Aneesh Kumar K V <aneesh.kumar@linux.ibm.com>
To: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>,
	linux-mm@kvack.org, akpm@linux-foundation.org,
	mpe@ellerman.id.au, linuxppc-dev@lists.ozlabs.org,
	npiggin@gmail.com, christophe.leroy@csgroup.eu
Cc: Oscar Salvador <osalvador@suse.de>,
	Mike Kravetz <mike.kravetz@oracle.com>,
	Dan Williams <dan.j.williams@intel.com>,
	Joao Martins <joao.m.martins@oracle.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Muchun Song <muchun.song@linux.dev>,
	Will Deacon <will@kernel.org>
Subject: Re: [PATCH v2 13/16] powerpc/book3s64/mm: Enable transparent pud hugepage
Date: Wed, 28 Jun 2023 09:02:21 +0530	[thread overview]
Message-ID: <0d8633f2-69de-c178-fe03-49742b063962@linux.ibm.com> (raw)
In-Reply-To: <87ttusnzv5.fsf@doe.com>

On 6/28/23 6:53 AM, Ritesh Harjani (IBM) wrote:
> "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

....

>>  
>>  static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
>> @@ -292,9 +320,18 @@ static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
>>  	return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP));
>>  }
>>  
>> +static inline pud_t radix__pud_mkdevmap(pud_t pud)
>> +{
>> +	return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP));
>> +}
>> +
>> +struct vmem_altmap;
>> +struct dev_pagemap;
> 
> Minor nit.
> I guess this struct dev_pagemap is meant to be added in a later patch.

Moved that change to a later patch.

> 
>>  extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
>>  					     unsigned long page_size,
>>  					     unsigned long phys);
>> +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
>> +				      int node, struct vmem_altmap *altmap);
>>  extern void radix__vmemmap_remove_mapping(unsigned long start,
>>  				    unsigned long page_size);
>>  
>> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
>> index 77797a2a82eb..a38542259fab 100644
>> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
>> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
>> @@ -68,6 +68,8 @@ void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
>>  				      unsigned long end, int psize);
>>  extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
>>  				       unsigned long start, unsigned long end);
>> +extern void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
>> +				       unsigned long start, unsigned long end);
>>  extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
>>  			    unsigned long end);
>>  extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
>> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
>> index 0d0c1447ecf0..a01c20a8fbf7 100644
>> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
>> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
>> @@ -50,6 +50,14 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma,
>>  		radix__flush_pmd_tlb_range(vma, start, end);
>>  }
>>  
>> +#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE
>> +static inline void flush_pud_tlb_range(struct vm_area_struct *vma,
>> +				       unsigned long start, unsigned long end)
>> +{
>> +	if (radix_enabled())
>> +		radix__flush_pud_tlb_range(vma, start, end);
>> +}
>> +
>>  #define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
>>  static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
>>  					   unsigned long start,
>> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
>> index 85c84e89e3ea..9e5f01a1738c 100644
>> --- a/arch/powerpc/mm/book3s64/pgtable.c
>> +++ b/arch/powerpc/mm/book3s64/pgtable.c
>> @@ -64,11 +64,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
>>  	return changed;
>>  }
>>  
>> +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
>> +			  pud_t *pudp, pud_t entry, int dirty)
>> +{
>> +	int changed;
>> +#ifdef CONFIG_DEBUG_VM
>> +	WARN_ON(!pud_devmap(*pudp));
> 
> just a query -
> for pmdp_set_access_flags() we had
> WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
> 
> so why don't we require the same check here?
> 

Because we don't support generic anon PUD-level THP, and we want to catch wrong usage.

>> +	assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));
>> +#endif
>> +	changed = !pud_same(*(pudp), entry);
>> +	if (changed) {
>> +		/*
>> +		 * We can use MMU_PAGE_2M here, because only radix
> 
> s/MMU_PAGE_2M/MMU_PAGE_1G
> 

Updated.

>> +		 * path look at the psize.
>> +		 */
>> +		__ptep_set_access_flags(vma, pudp_ptep(pudp),
>> +					pud_pte(entry), address, MMU_PAGE_1G);
>> +	}
>> +	return changed;
>> +}
>> +
>> +
>>  int pmdp_test_and_clear_young(struct vm_area_struct *vma,
>>  			      unsigned long address, pmd_t *pmdp)
>>  {
>>  	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
>>  }
>> +
>> +int pudp_test_and_clear_young(struct vm_area_struct *vma,
>> +			      unsigned long address, pud_t *pudp)
>> +{
>> +	return __pudp_test_and_clear_young(vma->vm_mm, address, pudp);
>> +}
>> +
>>  /*
>>   * set a new huge pmd. We should not be called for updating
>>   * an existing pmd entry. That should go via pmd_hugepage_update.
>> @@ -90,6 +118,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>>  	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
>>  }
>>  
>> +void set_pud_at(struct mm_struct *mm, unsigned long addr,
>> +		pud_t *pudp, pud_t pud)
>> +{
>> +#ifdef CONFIG_DEBUG_VM
>> +	/*
>> +	 * Make sure hardware valid bit is not set. We don't do
>> +	 * tlb flush for this update.
>> +	 */
>> +
>> +	WARN_ON(pte_hw_valid(pud_pte(*pudp)));
> 
> For set_pmd_at() we had
> WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
> 
> Could you please help me understand why is it different for set_pud_at()
> 
> 

There is no NUMA fault support for devdax pages, and we want to catch wrong usage.

>> +	assert_spin_locked(pud_lockptr(mm, pudp));
>> +	WARN_ON(!(pud_large(pud)));
>> +#endif
>> +	trace_hugepage_set_pud(addr, pud_val(pud));
>> +	return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud));
>> +}
>> +
>>  static void do_serialize(void *arg)
>>  {
>>  	/* We've taken the IPI, so try to trim the mask while here */
>> @@ -147,11 +192,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
>>  	return pmd;
>>  }
>>  
>> +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
>> +				   unsigned long addr, pud_t *pudp, int full)
>> +{
>> +	pud_t pud;
>> +
>> +	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
>> +	VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) ||
>> +		  !pud_present(*pudp));
>> +	pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp);
>> +	/*
>> +	 * if it not a fullmm flush, then we can possibly end up converting
>> +	 * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
>> +	 * Make sure we flush the tlb in this case.
>> +	 */
>> +	if (!full)
>> +		flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE);
>> +	return pud;
>> +}
>> +
>>  static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
>>  {
>>  	return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
>>  }
>>  
>> +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot)
>> +{
>> +	return __pud(pud_val(pud) | pgprot_val(pgprot));
>> +}
>> +
>>  /*
>>   * At some point we should be able to get rid of
>>   * pmd_mkhuge() and mk_huge_pmd() when we update all the
>> @@ -166,6 +235,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
>>  	return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot));
>>  }
>>  
>> +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
>> +{
>> +	unsigned long pudv;
>> +
>> +	pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK;
>> +
>> +	return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
>> +}
>> +
>>  pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
>>  {
>>  	return pfn_pmd(page_to_pfn(page), pgprot);
>> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> index 76f6a1f3b9d8..d7e2dd3d4add 100644
>> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
>> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> @@ -965,6 +965,23 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
>>  	return old;
>>  }
>>  
>> +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
>> +					 pud_t *pudp, unsigned long clr,
>> +					 unsigned long set)
>> +{
>> +	unsigned long old;
>> +
>> +#ifdef CONFIG_DEBUG_VM
>> +	WARN_ON(!pud_devmap(*pudp));
>> +	assert_spin_locked(pud_lockptr(mm, pudp));
>> +#endif
>> +
>> +	old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
>> +	trace_hugepage_update(addr, old, clr, set);
> 
> Here, we are using the same trace event for both pmd and pud update.
> See the comment in the end.
> 


Updated.

>> +
>> +	return old;
>> +}
>> +
>>  pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
>>  			pmd_t *pmdp)
>>  
>> @@ -1041,6 +1058,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
>>  	return old_pmd;
>>  }
>>  
>> +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
>> +				     unsigned long addr, pud_t *pudp)
>> +{
>> +	pud_t old_pud;
>> +	unsigned long old;
>> +
>> +	old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
>> +	old_pud = __pud(old);
>> +	return old_pud;
>> +}
>> +
>>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>>  
>>  void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
>> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
>> index ce804b7bf84e..a18f7d2c9f63 100644
>> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
>> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
>> @@ -1453,6 +1453,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
>>  }
>>  EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
>>  
>> +void radix__flush_pud_tlb_range(struct vm_area_struct *vma,
>> +				unsigned long start, unsigned long end)
>> +{
>> +	radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G);
>> +}
>> +EXPORT_SYMBOL(radix__flush_pud_tlb_range);
>> +
>>  void radix__flush_tlb_all(void)
>>  {
>>  	unsigned long rb,prs,r,rs;
>> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
>> index 45fd975ef521..340b86ef7284 100644
>> --- a/arch/powerpc/platforms/Kconfig.cputype
>> +++ b/arch/powerpc/platforms/Kconfig.cputype
>> @@ -94,6 +94,7 @@ config PPC_BOOK3S_64
>>  	select PPC_FPU
>>  	select PPC_HAVE_PMU_SUPPORT
>>  	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
>> +	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>>  	select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
>>  	select ARCH_ENABLE_SPLIT_PMD_PTLOCK
>>  	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
>> diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h
>> index 202b3e3e67ff..a919d943d106 100644
>> --- a/include/trace/events/thp.h
>> +++ b/include/trace/events/thp.h
>> @@ -25,6 +25,23 @@ TRACE_EVENT(hugepage_set_pmd,
>>  	    TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd)
>>  );
>>  
>> +TRACE_EVENT(hugepage_set_pud,
> 
> Given we have trace_hugepage_set_pud and trace_hugepage_set_pmd, doing
> the exact same thing, we can as well have a
> DECLARE_EVENT_CLASS(hugepage_set) and then DEFINE_EVENT using this event class.
> 
>> +
>> +	    TP_PROTO(unsigned long addr, unsigned long pud),
>> +	    TP_ARGS(addr, pud),
>> +	    TP_STRUCT__entry(
>> +		    __field(unsigned long, addr)
>> +		    __field(unsigned long, pud)
>> +			    ),
>> +
>> +	    TP_fast_assign(
>> +		    __entry->addr = addr;
>> +		    __entry->pud = pud;
>> +		    ),
>> +
>> +	    TP_printk("Set pud with 0x%lx with 0x%lx", __entry->addr, __entry->pud)
>> +	);
>> +
>>  
>>  TRACE_EVENT(hugepage_update,
> 
> Same here. We can have a DECLARE_EVENT_CLASS(hugepage_update) and then
> DEFINE_EVENT for hugepage_update_pmd and hugepage_update_pud.

Updated. I created a patch that first switches the existing tracepoints to DECLARE_EVENT_CLASS.
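
For reference, roughly what I have in mind for the set_pmd/set_pud case (a sketch only; the exact class and field names in the final patch may differ):

/* Shared event class; pmd and pud variants become DEFINE_EVENTs. */
DECLARE_EVENT_CLASS(hugepage_set,

	    TP_PROTO(unsigned long addr, unsigned long pte),
	    TP_ARGS(addr, pte),
	    TP_STRUCT__entry(
		    __field(unsigned long, addr)
		    __field(unsigned long, pte)
	    ),

	    TP_fast_assign(
		    __entry->addr = addr;
		    __entry->pte = pte;
	    ),

	    TP_printk("Set page table entry with 0x%lx with 0x%lx",
		      __entry->addr, __entry->pte)
);

DEFINE_EVENT(hugepage_set, hugepage_set_pmd,
	     TP_PROTO(unsigned long addr, unsigned long pmd),
	     TP_ARGS(addr, pmd));

DEFINE_EVENT(hugepage_set, hugepage_set_pud,
	     TP_PROTO(unsigned long addr, unsigned long pud),
	     TP_ARGS(addr, pud));

A hugepage_update event class for the pmd/pud update tracepoints would follow the same pattern.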

-aneesh




Thread overview: 29+ messages
2023-06-16 11:08 [PATCH v2 00/16] Add support for DAX vmemmap optimization for ppc64 Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 01/16] powerpc/mm/book3s64: Use pmdp_ptep helper instead of typecasting Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 02/16] powerpc/book3s64/mm: mmu_vmemmap_psize is used by radix Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 03/16] powerpc/book3s64/mm: Fix DirectMap stats in /proc/meminfo Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 04/16] powerpc/book3s64/mm: Use PAGE_KERNEL instead of opencoding Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 05/16] powerpc/mm/dax: Fix the condition when checking if altmap vmemap can cross-boundary Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 06/16] mm/hugepage pud: Allow arch-specific helper function to check huge page pud support Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 07/16] mm: Change pudp_huge_get_and_clear_full take vm_area_struct as arg Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 08/16] mm/vmemmap: Improve vmemmap_can_optimize and allow architectures to override Aneesh Kumar K.V
2023-06-20 11:53   ` Joao Martins
2023-06-20 14:29     ` Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 09/16] mm/vmemmap: Allow architectures to override how vmemmap optimization works Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 10/16] mm: Add __HAVE_ARCH_PUD_SAME similar to __HAVE_ARCH_P4D_SAME Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 11/16] mm/huge pud: Use transparent huge pud helpers only with CONFIG_TRANSPARENT_HUGEPAGE Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 12/16] mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization Aneesh Kumar K.V
2023-06-28  1:09   ` Ritesh Harjani
2023-06-28  3:01     ` Aneesh Kumar K V
2023-06-16 11:08 ` [PATCH v2 13/16] powerpc/book3s64/mm: Enable transparent pud hugepage Aneesh Kumar K.V
2023-06-28  1:23   ` Ritesh Harjani
2023-06-28  3:32     ` Aneesh Kumar K V [this message]
2023-06-16 11:08 ` [PATCH v2 14/16] powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function Aneesh Kumar K.V
2023-06-28  1:33   ` Ritesh Harjani
2023-06-28  3:37     ` Aneesh Kumar K V
2023-06-16 11:08 ` [PATCH v2 15/16] powerpc/book3s64/radix: Add support for vmemmap optimization for radix Aneesh Kumar K.V
2023-06-16 11:08 ` [PATCH v2 16/16] powerpc/book3s64/radix: Remove mmu_vmemmap_psize Aneesh Kumar K.V
2023-06-18 11:54 ` [PATCH v2 00/16] Add support for DAX vmemmap optimization for ppc64 Sachin Sant
2023-06-24 14:52 ` Aneesh Kumar K.V
2023-06-24 17:22   ` Andrew Morton
2023-07-03  5:26 ` (subset) " Michael Ellerman
