linux-mm.kvack.org archive mirror
* [PATCH 1/2] mm: khugepaged: convert set_huge_pmd() to take a folio
@ 2025-05-07  9:26 Baolin Wang
  2025-05-07  9:26 ` [PATCH 2/2] mm: convert do_set_pmd() " Baolin Wang
  2025-05-07 12:04 ` [PATCH 1/2] mm: khugepaged: convert set_huge_pmd() " Matthew Wilcox
  0 siblings, 2 replies; 14+ messages in thread
From: Baolin Wang @ 2025-05-07  9:26 UTC (permalink / raw)
  To: akpm, willy, david
  Cc: hannes, lorenzo.stoakes, Liam.Howlett, npache, ryan.roberts,
	dev.jain, ziy, vbabka, rppt, surenb, mhocko, baolin.wang,
	linux-mm, linux-kernel

We have already obtained the stable, locked folio in collapse_pte_mapped_thp(),
so just pass the folio to set_huge_pmd() to set the PMD entry, which is more
straightforward.

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 mm/khugepaged.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b04b6a770afe..0215ef6095d9 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1467,7 +1467,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
 #ifdef CONFIG_SHMEM
 /* hpage must be locked, and mmap_lock must be held */
 static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
-			pmd_t *pmdp, struct page *hpage)
+			pmd_t *pmdp, struct folio *folio)
 {
 	struct vm_fault vmf = {
 		.vma = vma,
@@ -1476,13 +1476,13 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
 		.pmd = pmdp,
 	};
 
-	VM_BUG_ON(!PageTransHuge(hpage));
+	VM_BUG_ON(!folio_test_large(folio));
 	mmap_assert_locked(vma->vm_mm);
 
-	if (do_set_pmd(&vmf, hpage))
+	if (do_set_pmd(&vmf, &folio->page))
 		return SCAN_FAIL;
 
-	get_page(hpage);
+	folio_get(folio);
 	return SCAN_SUCCEED;
 }
 
@@ -1689,7 +1689,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 maybe_install_pmd:
 	/* step 5: install pmd entry */
 	result = install_pmd
-			? set_huge_pmd(vma, haddr, pmd, &folio->page)
+			? set_huge_pmd(vma, haddr, pmd, folio)
 			: SCAN_SUCCEED;
 	goto drop_folio;
 abort:
-- 
2.43.5




* [PATCH 2/2] mm: convert do_set_pmd() to take a folio
  2025-05-07  9:26 [PATCH 1/2] mm: khugepaged: convert set_huge_pmd() to take a folio Baolin Wang
@ 2025-05-07  9:26 ` Baolin Wang
  2025-05-07 12:10   ` Matthew Wilcox
  2025-05-07 12:04 ` [PATCH 1/2] mm: khugepaged: convert set_huge_pmd() " Matthew Wilcox
  1 sibling, 1 reply; 14+ messages in thread
From: Baolin Wang @ 2025-05-07  9:26 UTC (permalink / raw)
  To: akpm, willy, david
  Cc: hannes, lorenzo.stoakes, Liam.Howlett, npache, ryan.roberts,
	dev.jain, ziy, vbabka, rppt, surenb, mhocko, baolin.wang,
	linux-mm, linux-kernel

In do_set_pmd(), we always use the folio->page to build PMD mappings for
the entire folio. Since all callers of do_set_pmd() already hold a stable
folio, converting do_set_pmd() to take a folio is safe and more straightforward.

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 include/linux/mm.h |  2 +-
 mm/filemap.c       |  3 +--
 mm/khugepaged.c    |  2 +-
 mm/memory.c        | 12 ++++++------
 4 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 43748c8f3454..e00d81b9eb2a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1237,7 +1237,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio);
 void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 		struct page *page, unsigned int nr, unsigned long addr);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 7b90cbeb4a1a..fbf8d7b76ab7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3532,8 +3532,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
 	}
 
 	if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
-		struct page *page = folio_file_page(folio, start);
-		vm_fault_t ret = do_set_pmd(vmf, page);
+		vm_fault_t ret = do_set_pmd(vmf, folio);
 		if (!ret) {
 			/* The page is mapped successfully, reference consumed. */
 			folio_unlock(folio);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0215ef6095d9..575a3384d046 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1479,7 +1479,7 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
 	VM_BUG_ON(!folio_test_large(folio));
 	mmap_assert_locked(vma->vm_mm);
 
-	if (do_set_pmd(&vmf, &folio->page))
+	if (do_set_pmd(&vmf, folio))
 		return SCAN_FAIL;
 
 	folio_get(folio);
diff --git a/mm/memory.c b/mm/memory.c
index 68c1d962d0ad..0f5c8d8d59b0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5176,14 +5176,14 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
 	vmf->prealloc_pte = NULL;
 }
 
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio)
 {
-	struct folio *folio = page_folio(page);
 	struct vm_area_struct *vma = vmf->vma;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 	pmd_t entry;
 	vm_fault_t ret = VM_FAULT_FALLBACK;
+	struct page *page;
 
 	/*
 	 * It is too late to allocate a small folio, we already have a large
@@ -5251,7 +5251,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 	return ret;
 }
 #else
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio)
 {
 	return VM_FAULT_FALLBACK;
 }
@@ -5345,6 +5345,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	else
 		page = vmf->page;
 
+	folio = page_folio(page);
 	/*
 	 * check even for read faults because we might have lost our CoWed
 	 * page
@@ -5356,8 +5357,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	}
 
 	if (pmd_none(*vmf->pmd)) {
-		if (PageTransCompound(page)) {
-			ret = do_set_pmd(vmf, page);
+		if (folio_test_pmd_mappable(folio)) {
+			ret = do_set_pmd(vmf, folio);
 			if (ret != VM_FAULT_FALLBACK)
 				return ret;
 		}
@@ -5368,7 +5369,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 			return VM_FAULT_OOM;
 	}
 
-	folio = page_folio(page);
 	nr_pages = folio_nr_pages(folio);
 
 	/*
-- 
2.43.5




* Re: [PATCH 1/2] mm: khugepaged: convert set_huge_pmd() to take a folio
  2025-05-07  9:26 [PATCH 1/2] mm: khugepaged: convert set_huge_pmd() to take a folio Baolin Wang
  2025-05-07  9:26 ` [PATCH 2/2] mm: convert do_set_pmd() " Baolin Wang
@ 2025-05-07 12:04 ` Matthew Wilcox
  2025-05-07 21:19   ` David Hildenbrand
  1 sibling, 1 reply; 14+ messages in thread
From: Matthew Wilcox @ 2025-05-07 12:04 UTC (permalink / raw)
  To: Baolin Wang
  Cc: akpm, david, hannes, lorenzo.stoakes, Liam.Howlett, npache,
	ryan.roberts, dev.jain, ziy, vbabka, rppt, surenb, mhocko,
	linux-mm, linux-kernel

On Wed, May 07, 2025 at 05:26:12PM +0800, Baolin Wang wrote:
> @@ -1476,13 +1476,13 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
>  		.pmd = pmdp,
>  	};
>  
> -	VM_BUG_ON(!PageTransHuge(hpage));
> +	VM_BUG_ON(!folio_test_large(folio));

I don't think this is a terribly useful assertion to keep in a static
function with one caller, do you?




* Re: [PATCH 2/2] mm: convert do_set_pmd() to take a folio
  2025-05-07  9:26 ` [PATCH 2/2] mm: convert do_set_pmd() " Baolin Wang
@ 2025-05-07 12:10   ` Matthew Wilcox
  2025-05-07 12:36     ` Baolin Wang
  2025-05-07 21:24     ` David Hildenbrand
  0 siblings, 2 replies; 14+ messages in thread
From: Matthew Wilcox @ 2025-05-07 12:10 UTC (permalink / raw)
  To: Baolin Wang
  Cc: akpm, david, hannes, lorenzo.stoakes, Liam.Howlett, npache,
	ryan.roberts, dev.jain, ziy, vbabka, rppt, surenb, mhocko,
	linux-mm, linux-kernel

On Wed, May 07, 2025 at 05:26:13PM +0800, Baolin Wang wrote:
> In do_set_pmd(), we always use the folio->page to build PMD mappings for
> the entire folio. Since all callers of do_set_pmd() already hold a stable
> folio, converting do_set_pmd() to take a folio is safe and more straightforward.

What testing did you do of this?

> -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
> +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio)
>  {
> -	struct folio *folio = page_folio(page);
>  	struct vm_area_struct *vma = vmf->vma;
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
>  	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>  	pmd_t entry;
>  	vm_fault_t ret = VM_FAULT_FALLBACK;
> +	struct page *page;

Because I see nowhere in this patch that you initialise 'page'.

And that's really the important part.  You seem to be assuming that a
folio will never be larger than PMD size, and I'm not comfortable with
that assumption.  It's a limitation I put in place a few years ago so we
didn't have to find and fix all those assumptions immediately, but I
imagine that some day we'll want to have larger folios.

So unless you can derive _which_ page in the folio we want to map from
the vmf, NACK this patch.



* Re: [PATCH 2/2] mm: convert do_set_pmd() to take a folio
  2025-05-07 12:10   ` Matthew Wilcox
@ 2025-05-07 12:36     ` Baolin Wang
  2025-05-07 16:47       ` Matthew Wilcox
  2025-05-07 21:24     ` David Hildenbrand
  1 sibling, 1 reply; 14+ messages in thread
From: Baolin Wang @ 2025-05-07 12:36 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: akpm, david, hannes, lorenzo.stoakes, Liam.Howlett, npache,
	ryan.roberts, dev.jain, ziy, vbabka, rppt, surenb, mhocko,
	linux-mm, linux-kernel



On 2025/5/7 20:10, Matthew Wilcox wrote:
> On Wed, May 07, 2025 at 05:26:13PM +0800, Baolin Wang wrote:
>> In do_set_pmd(), we always use the folio->page to build PMD mappings for
>> the entire folio. Since all callers of do_set_pmd() already hold a stable
>> folio, converting do_set_pmd() to take a folio is safe and more straightforward.
> 
> What testing did you do of this?

I ran the mm selftests and tmpfs/xfs PMD-sized mmap() tests.

> 
>> -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>> +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio)
>>   {
>> -	struct folio *folio = page_folio(page);
>>   	struct vm_area_struct *vma = vmf->vma;
>>   	bool write = vmf->flags & FAULT_FLAG_WRITE;
>>   	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>   	pmd_t entry;
>>   	vm_fault_t ret = VM_FAULT_FALLBACK;
>> +	struct page *page;
> 
> Because I see nowhere in this patch that you initialise 'page'.

Please look at the following code in do_set_pmd(); the 'page' will be
initialized before use.

         if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
                 return ret;

         if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
                 return ret;

         if (folio_order(folio) != HPAGE_PMD_ORDER)
                 return ret;
         page = &folio->page;


> And that's really the important part.  You seem to be assuming that a
> folio will never be larger than PMD size, and I'm not comfortable with

No, I don't make that assumption. But do_set_pmd() is used to establish PMD
mappings for PMD-sized folios, and we already have PMD-sized checks
to validate the folio size:

         if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
                 return ret;

         if (folio_order(folio) != HPAGE_PMD_ORDER)
                 return ret;

> that assumption.  It's a limitation I put in place a few years ago so we
> didn't have to find and fix all those assumptions immediately, but I
> imagine that some day we'll want to have larger folios.
> 
> So unless you can derive _which_ page in the folio we want to map from

IMO, for PMD mapping of a PMD-sized folio, we do not need to know 
_which_ page in the folio we want to map, because we'll always map the 
entire PMD-sized folio.

> the vmf, NACK this patch.



* Re: [PATCH 2/2] mm: convert do_set_pmd() to take a folio
  2025-05-07 12:36     ` Baolin Wang
@ 2025-05-07 16:47       ` Matthew Wilcox
  2025-05-08  2:23         ` Baolin Wang
  0 siblings, 1 reply; 14+ messages in thread
From: Matthew Wilcox @ 2025-05-07 16:47 UTC (permalink / raw)
  To: Baolin Wang
  Cc: akpm, david, hannes, lorenzo.stoakes, Liam.Howlett, npache,
	ryan.roberts, dev.jain, ziy, vbabka, rppt, surenb, mhocko,
	linux-mm, linux-kernel

On Wed, May 07, 2025 at 08:36:54PM +0800, Baolin Wang wrote:
> On 2025/5/7 20:10, Matthew Wilcox wrote:
> > Because I see nowhere in this patch that you initialise 'page'.
> 
> Please look at the following code in do_set_pmd(); the 'page' will be
> initialized before use.
> 
>         if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
>                 return ret;
> 
>         if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
>                 return ret;
> 
>         if (folio_order(folio) != HPAGE_PMD_ORDER)
>                 return ret;
>         page = &folio->page;

Ah, fair, I missed that.

> > And that's really the important part.  You seem to be assuming that a
> > folio will never be larger than PMD size, and I'm not comfortable with
> 
> No, I have no this assumption. But do_set_pmd() is used to establish PMD
> mappings for the PMD-sized folios, and we already have PMD-sized checks to
> validate the folio size:
> 
>         if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
>                 return ret;
> 
>         if (folio_order(folio) != HPAGE_PMD_ORDER)
>                 return ret;
> 
> > that assumption.  It's a limitation I put in place a few years ago so we
> > didn't have to find and fix all those assumptions immediately, but I
> > imagine that some day we'll want to have larger folios.
> > 
> > So unless you can derive _which_ page in the folio we want to map from
> 
> IMO, for PMD mapping of a PMD-sized folio, we do not need to know _which_
> page in the folio we want to map, because we'll always map the entire
> PMD-sized folio.

There's a difference between "Assert that the folio is PMD sized" inside
the function because we know there are still problems, and "Change the
interface so we can't specify which page inside the folio is the one
we're actually interested in".

I reiterate the NACK to this patch.



* Re: [PATCH 1/2] mm: khugepaged: convert set_huge_pmd() to take a folio
  2025-05-07 12:04 ` [PATCH 1/2] mm: khugepaged: convert set_huge_pmd() " Matthew Wilcox
@ 2025-05-07 21:19   ` David Hildenbrand
  2025-05-08  2:08     ` Baolin Wang
  0 siblings, 1 reply; 14+ messages in thread
From: David Hildenbrand @ 2025-05-07 21:19 UTC (permalink / raw)
  To: Matthew Wilcox, Baolin Wang
  Cc: akpm, hannes, lorenzo.stoakes, Liam.Howlett, npache,
	ryan.roberts, dev.jain, ziy, vbabka, rppt, surenb, mhocko,
	linux-mm, linux-kernel

On 07.05.25 14:04, Matthew Wilcox wrote:
> On Wed, May 07, 2025 at 05:26:12PM +0800, Baolin Wang wrote:
>> @@ -1476,13 +1476,13 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
>>   		.pmd = pmdp,
>>   	};
>>   
>> -	VM_BUG_ON(!PageTransHuge(hpage));
>> +	VM_BUG_ON(!folio_test_large(folio));
> 
> I don't think this is a terribly useful assertion to keep in a static
> function with one caller, do you?
> 

I'll note that -- whatever we do here -- we can (finally) get rid of 
PageTransHuge.

-- 
Cheers,

David / dhildenb




* Re: [PATCH 2/2] mm: convert do_set_pmd() to take a folio
  2025-05-07 12:10   ` Matthew Wilcox
  2025-05-07 12:36     ` Baolin Wang
@ 2025-05-07 21:24     ` David Hildenbrand
  2025-05-07 23:46       ` Zi Yan
  1 sibling, 1 reply; 14+ messages in thread
From: David Hildenbrand @ 2025-05-07 21:24 UTC (permalink / raw)
  To: Matthew Wilcox, Baolin Wang
  Cc: akpm, hannes, lorenzo.stoakes, Liam.Howlett, npache,
	ryan.roberts, dev.jain, ziy, vbabka, rppt, surenb, mhocko,
	linux-mm, linux-kernel

On 07.05.25 14:10, Matthew Wilcox wrote:
> On Wed, May 07, 2025 at 05:26:13PM +0800, Baolin Wang wrote:
>> In do_set_pmd(), we always use the folio->page to build PMD mappings for
>> the entire folio. Since all callers of do_set_pmd() already hold a stable
>> folio, converting do_set_pmd() to take a folio is safe and more straightforward.
> 
> What testing did you do of this?
> 
>> -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>> +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio)
>>   {
>> -	struct folio *folio = page_folio(page);
>>   	struct vm_area_struct *vma = vmf->vma;
>>   	bool write = vmf->flags & FAULT_FLAG_WRITE;
>>   	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>   	pmd_t entry;
>>   	vm_fault_t ret = VM_FAULT_FALLBACK;
>> +	struct page *page;
> 
> Because I see nowhere in this patch that you initialise 'page'.
> 
> And that's really the important part.  You seem to be assuming that a
> folio will never be larger than PMD size, and I'm not comfortable with
> that assumption.  It's a limitation I put in place a few years ago so we
> didn't have to find and fix all those assumptions immediately, but I
> imagine that some day we'll want to have larger folios.
> 
> So unless you can derive _which_ page in the folio we want to map from
> the vmf, NACK this patch.

Agreed. Probably folio + idx is our best bet.

Which raises an interesting question: I assume in the future, when we 
have a 4 MiB folio on x86-64 that is *misaligned* in VA space regarding 
PMDs (e.g., aligned to 1 MiB but not 2 MiB), we could still allow using 
a PMD for the middle part.

So idx need not be aligned to PMDs in the future.

For now, we could sanity-check that idx is always 0.

But the rmap sanity checks in folio_add_file_rmap_pmd() will already 
catch that for us.

-- 
Cheers,

David / dhildenb




* Re: [PATCH 2/2] mm: convert do_set_pmd() to take a folio
  2025-05-07 21:24     ` David Hildenbrand
@ 2025-05-07 23:46       ` Zi Yan
  2025-05-08  7:36         ` David Hildenbrand
  0 siblings, 1 reply; 14+ messages in thread
From: Zi Yan @ 2025-05-07 23:46 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Matthew Wilcox, Baolin Wang, akpm, hannes, lorenzo.stoakes,
	Liam.Howlett, npache, ryan.roberts, dev.jain, vbabka, rppt,
	surenb, mhocko, linux-mm, linux-kernel

On 7 May 2025, at 17:24, David Hildenbrand wrote:

> On 07.05.25 14:10, Matthew Wilcox wrote:
>> On Wed, May 07, 2025 at 05:26:13PM +0800, Baolin Wang wrote:
>>> In do_set_pmd(), we always use the folio->page to build PMD mappings for
>>> the entire folio. Since all callers of do_set_pmd() already hold a stable
>>> folio, converting do_set_pmd() to take a folio is safe and more straightforward.
>>
>> What testing did you do of this?
>>
>>> -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>>> +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio)
>>>   {
>>> -	struct folio *folio = page_folio(page);
>>>   	struct vm_area_struct *vma = vmf->vma;
>>>   	bool write = vmf->flags & FAULT_FLAG_WRITE;
>>>   	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>   	pmd_t entry;
>>>   	vm_fault_t ret = VM_FAULT_FALLBACK;
>>> +	struct page *page;
>>
>> Because I see nowhere in this patch that you initialise 'page'.
>>
>> And that's really the important part.  You seem to be assuming that a
>> folio will never be larger than PMD size, and I'm not comfortable with
>> that assumption.  It's a limitation I put in place a few years ago so we
>> didn't have to find and fix all those assumptions immediately, but I
>> imagine that some day we'll want to have larger folios.
>>
>> So unless you can derive _which_ page in the folio we want to map from
>> the vmf, NACK this patch.
>
> Agreed. Probably folio + idx is our best bet.
>
> Which raises an interesting question: I assume in the future, when we have a 4 MiB folio on x86-64 that is *misaligned* in VA space regarding PMDs (e.g., aligned to 1 MiB but not 2 MiB), we could still allow using a PMD for the middle part.

It might not be possible if the folio comes from the buddy allocator due to how
the buddy allocator merges a PFN with its buddy (see __find_buddy_pfn() in mm/internal.h).
A 4MB folio will always be two 2MB-aligned parts. In addition, VA and PA need
to have the same lower 9+12 bits for a PMD mapping. So PMD mappings for
a 4MB folio would always be two PMDs. Let me know if I missed anything.

Of course, if the folio comes from alloc_contig_range() or we add support for
in-place folio promotion, the situation you are talking about would be possible.
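
As a simplified sketch of the merge rule above (modeled on __find_buddy_pfn();
the helper name below is illustrative, not the actual mm/internal.h code):

        /*
         * Simplified sketch of the buddy merge rule: the buddy of an order-N
         * block differs from it only in PFN bit N, so any merged order-(N+1)
         * block is aligned to 2^(N+1) pages.  Hence an order-10 (4MB) folio
         * from the buddy allocator is always made up of two 2MB-aligned,
         * PMD-sized (order-9) halves.
         */
        static inline unsigned long buddy_pfn(unsigned long pfn, unsigned int order)
        {
                return pfn ^ (1UL << order);
        }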

>
> So idx need not be aligned to PMDs in the future.
>
> For now, we could sanity-check that idx is always 0.
>
> But the rmap sanity checks in folio_add_file_rmap_pmd() will already catch that for us.



--
Best Regards,
Yan, Zi



* Re: [PATCH 1/2] mm: khugepaged: convert set_huge_pmd() to take a folio
  2025-05-07 21:19   ` David Hildenbrand
@ 2025-05-08  2:08     ` Baolin Wang
  0 siblings, 0 replies; 14+ messages in thread
From: Baolin Wang @ 2025-05-08  2:08 UTC (permalink / raw)
  To: David Hildenbrand, Matthew Wilcox
  Cc: akpm, hannes, lorenzo.stoakes, Liam.Howlett, npache,
	ryan.roberts, dev.jain, ziy, vbabka, rppt, surenb, mhocko,
	linux-mm, linux-kernel



On 2025/5/8 05:19, David Hildenbrand wrote:
> On 07.05.25 14:04, Matthew Wilcox wrote:
>> On Wed, May 07, 2025 at 05:26:12PM +0800, Baolin Wang wrote:
>>> @@ -1476,13 +1476,13 @@ static int set_huge_pmd(struct vm_area_struct 
>>> *vma, unsigned long addr,
>>>           .pmd = pmdp,
>>>       };
>>> -    VM_BUG_ON(!PageTransHuge(hpage));
>>> +    VM_BUG_ON(!folio_test_large(folio));
>>
>> I don't think this is a terribly useful assertion to keep in a static
>> function with one caller, do you?

Yes. Will remove this VM_BUG_ON.

> I'll note that -- whatever we do here -- we can (finally) get rid of 
> PageTransHuge.

Right. I will remove PageTransHuge in the next version.



* Re: [PATCH 2/2] mm: convert do_set_pmd() to take a folio
  2025-05-07 16:47       ` Matthew Wilcox
@ 2025-05-08  2:23         ` Baolin Wang
  2025-05-08  7:37           ` David Hildenbrand
  0 siblings, 1 reply; 14+ messages in thread
From: Baolin Wang @ 2025-05-08  2:23 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: akpm, david, hannes, lorenzo.stoakes, Liam.Howlett, npache,
	ryan.roberts, dev.jain, ziy, vbabka, rppt, surenb, mhocko,
	linux-mm, linux-kernel



On 2025/5/8 00:47, Matthew Wilcox wrote:
> On Wed, May 07, 2025 at 08:36:54PM +0800, Baolin Wang wrote:
>> On 2025/5/7 20:10, Matthew Wilcox wrote:
>>> Because I see nowhere in this patch that you initialise 'page'.
>>
>> Please look at the following code in do_set_pmd(); the 'page' will be
>> initialized before use.
>>
>>          if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
>>                  return ret;
>>
>>          if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
>>                  return ret;
>>
>>          if (folio_order(folio) != HPAGE_PMD_ORDER)
>>                  return ret;
>>          page = &folio->page;
> 
> Ah, fair, I missed that.
> 
>>> And that's really the important part.  You seem to be assuming that a
>>> folio will never be larger than PMD size, and I'm not comfortable with
>>
>> No, I don't make that assumption. But do_set_pmd() is used to establish PMD
>> mappings for PMD-sized folios, and we already have PMD-sized checks to
>> validate the folio size:
>>
>>          if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
>>                  return ret;
>>
>>          if (folio_order(folio) != HPAGE_PMD_ORDER)
>>                  return ret;
>>
>>> that assumption.  It's a limitation I put in place a few years ago so we
>>> didn't have to find and fix all those assumptions immediately, but I
>>> imagine that some day we'll want to have larger folios.
>>>
>>> So unless you can derive _which_ page in the folio we want to map from
>>
>> IMO, for PMD mapping of a PMD-sized folio, we do not need to know _which_
>> page in the folio we want to map, because we'll always map the entire
>> PMD-sized folio.
> 
> There's a difference between "Assert that the folio is PMD sized" inside
> the function because we know there are still problems, and "Change the
> interface so we can't specify which page inside the folio is the one
> we're actually interested in".

Fair enough. So how about adding a new 'folio' parameter to 
do_set_pmd(), similar to the set_pte_range() function prototype?

vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio,
                      struct page *page)

> I reiterate the NACK to this patch.



* Re: [PATCH 2/2] mm: convert do_set_pmd() to take a folio
  2025-05-07 23:46       ` Zi Yan
@ 2025-05-08  7:36         ` David Hildenbrand
  2025-05-08 13:12           ` Matthew Wilcox
  0 siblings, 1 reply; 14+ messages in thread
From: David Hildenbrand @ 2025-05-08  7:36 UTC (permalink / raw)
  To: Zi Yan
  Cc: Matthew Wilcox, Baolin Wang, akpm, hannes, lorenzo.stoakes,
	Liam.Howlett, npache, ryan.roberts, dev.jain, vbabka, rppt,
	surenb, mhocko, linux-mm, linux-kernel

On 08.05.25 01:46, Zi Yan wrote:
> On 7 May 2025, at 17:24, David Hildenbrand wrote:
> 
>> On 07.05.25 14:10, Matthew Wilcox wrote:
>>> On Wed, May 07, 2025 at 05:26:13PM +0800, Baolin Wang wrote:
>>>> In do_set_pmd(), we always use the folio->page to build PMD mappings for
>>>> the entire folio. Since all callers of do_set_pmd() already hold a stable
>>>> folio, converting do_set_pmd() to take a folio is safe and more straightforward.
>>>
>>> What testing did you do of this?
>>>
>>>> -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>>>> +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio)
>>>>    {
>>>> -	struct folio *folio = page_folio(page);
>>>>    	struct vm_area_struct *vma = vmf->vma;
>>>>    	bool write = vmf->flags & FAULT_FLAG_WRITE;
>>>>    	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>>    	pmd_t entry;
>>>>    	vm_fault_t ret = VM_FAULT_FALLBACK;
>>>> +	struct page *page;
>>>
>>> Because I see nowhere in this patch that you initialise 'page'.
>>>
>>> And that's really the important part.  You seem to be assuming that a
>>> folio will never be larger than PMD size, and I'm not comfortable with
>>> that assumption.  It's a limitation I put in place a few years ago so we
>>> didn't have to find and fix all those assumptions immediately, but I
>>> imagine that some day we'll want to have larger folios.
>>>
>>> So unless you can derive _which_ page in the folio we want to map from
>>> the vmf, NACK this patch.
>>
>> Agreed. Probably folio + idx is our best bet.
>>
>> Which raises an interesting question: I assume in the future, when we have a 4 MiB folio on x86-64 that is *misaligned* in VA space regarding PMDs (e.g., aligned to 1 MiB but not 2 MiB), we could still allow using a PMD for the middle part.
> 
> It might not be possible if the folio comes from the buddy allocator due to how
> the buddy allocator merges a PFN with its buddy (see __find_buddy_pfn() in mm/internal.h).
> A 4MB folio will always be two 2MB-aligned parts. In addition, VA and PA need
> to have the same lower 9+12 bits for a PMD mapping. So PMD mappings for
> a 4MB folio would always be two PMDs. Let me know if I missed anything.

PA is clear. But is mis-alignment in VA space impossible on all 
architectures? I certainly remember it being impossible on x86-64 and 
s390x (remaining PMD entry bits used for something else).

-- 
Cheers,

David / dhildenb




* Re: [PATCH 2/2] mm: convert do_set_pmd() to take a folio
  2025-05-08  2:23         ` Baolin Wang
@ 2025-05-08  7:37           ` David Hildenbrand
  0 siblings, 0 replies; 14+ messages in thread
From: David Hildenbrand @ 2025-05-08  7:37 UTC (permalink / raw)
  To: Baolin Wang, Matthew Wilcox
  Cc: akpm, hannes, lorenzo.stoakes, Liam.Howlett, npache,
	ryan.roberts, dev.jain, ziy, vbabka, rppt, surenb, mhocko,
	linux-mm, linux-kernel

On 08.05.25 04:23, Baolin Wang wrote:
> 
> 
> On 2025/5/8 00:47, Matthew Wilcox wrote:
>> On Wed, May 07, 2025 at 08:36:54PM +0800, Baolin Wang wrote:
>>> On 2025/5/7 20:10, Matthew Wilcox wrote:
>>>> Because I see nowhere in this patch that you initialise 'page'.
>>>
>>> Please look at the following code in do_set_pmd(); the 'page' will be
>>> initialized before use.
>>>
>>>           if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
>>>                   return ret;
>>>
>>>           if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
>>>                   return ret;
>>>
>>>           if (folio_order(folio) != HPAGE_PMD_ORDER)
>>>                   return ret;
>>>           page = &folio->page;
>>
>> Ah, fair, I missed that.
>>
>>>> And that's really the important part.  You seem to be assuming that a
>>>> folio will never be larger than PMD size, and I'm not comfortable with
>>>
>>> No, I don't make that assumption. But do_set_pmd() is used to establish PMD
>>> mappings for PMD-sized folios, and we already have PMD-sized checks to
>>> validate the folio size:
>>>
>>>           if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
>>>                   return ret;
>>>
>>>           if (folio_order(folio) != HPAGE_PMD_ORDER)
>>>                   return ret;
>>>
>>>> that assumption.  It's a limitation I put in place a few years ago so we
>>>> didn't have to find and fix all those assumptions immediately, but I
>>>> imagine that some day we'll want to have larger folios.
>>>>
>>>> So unless you can derive _which_ page in the folio we want to map from
>>>
>>> IMO, for PMD mapping of a PMD-sized folio, we do not need to know _which_
>>> page in the folio we want to map, because we'll always map the entire
>>> PMD-sized folio.
>>
>> There's a difference between "Assert that the folio is PMD sized" inside
>> the function because we know there are still problems, and "Change the
>> interface so we can't specify which page inside the folio is the one
>> we're actually interested in".
> 
> Fair enough. So how about adding a new 'folio' parameter to
> do_set_pmd(), similar to the set_pte_range() function prototype?
> 
> vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio,
>                       struct page *page)

That's what I used for the rmap functions. *Maybe* folio+idx is better: it
might avoid having to look up the page in some cases (probably in the
future).
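
For illustration only (no signature was settled in this thread), the two forms
being compared would look roughly like:

        /* Option A: folio + page, mirroring the set_pte_range() prototype. */
        vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio,
                              struct page *page);

        /*
         * Option B: folio + idx.  The page, when actually needed, would be
         * folio_page(folio, idx); for now idx would always be 0 for a
         * PMD-sized folio.
         */
        vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio,
                              unsigned long idx);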

-- 
Cheers,

David / dhildenb




* Re: [PATCH 2/2] mm: convert do_set_pmd() to take a folio
  2025-05-08  7:36         ` David Hildenbrand
@ 2025-05-08 13:12           ` Matthew Wilcox
  0 siblings, 0 replies; 14+ messages in thread
From: Matthew Wilcox @ 2025-05-08 13:12 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Zi Yan, Baolin Wang, akpm, hannes, lorenzo.stoakes, Liam.Howlett,
	npache, ryan.roberts, dev.jain, vbabka, rppt, surenb, mhocko,
	linux-mm, linux-kernel

On Thu, May 08, 2025 at 09:36:02AM +0200, David Hildenbrand wrote:
> > > Which raises an interesting question: I assume in the future, when we have a 4 MiB folio on x86-64 that is *misaligned* in VA space regarding PMDs (e.g., aligned to 1 MiB but not 2 MiB), we could still allow using a PMD for the middle part.
> > 
> > It might not be possible if the folio comes from the buddy allocator due to how
> > the buddy allocator merges a PFN with its buddy (see __find_buddy_pfn() in mm/internal.h).
> > A 4MB folio will always be two 2MB-aligned parts. In addition, VA and PA need
> > to have the same lower 9+12 bits for a PMD mapping. So PMD mappings for
> > a 4MB folio would always be two PMDs. Let me know if I missed anything.
> 
> PA is clear. But is mis-alignment in VA space impossible on all
> architectures? I certainly remember it being impossible on x86-64 and s390x
> (remaining PMD entry bits used for something else).

Yeah, that's not possible; the PMD is always PMD-aligned.
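
In other words (an illustrative restatement, not code from this thread): a
single PMD entry can only translate a PMD_SIZE-aligned virtual range to a
PMD_SIZE-aligned physical range, which is roughly what the
thp_vma_suitable_order(vma, haddr, PMD_ORDER) check quoted earlier enforces on
the VA side before do_set_pmd() maps anything:

        /*
         * Illustrative only: a 2MB chunk of a folio can be mapped by a
         * single PMD entry only if both its virtual and physical start
         * addresses are PMD_SIZE-aligned.
         */
        static inline bool pmd_mappable_here(unsigned long vaddr, phys_addr_t paddr)
        {
                return IS_ALIGNED(vaddr, PMD_SIZE) && IS_ALIGNED(paddr, PMD_SIZE);
        }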

