[PATCH 1/4] mm/migrate: Add migrate_device_split

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 1/4] mm/migrate: Add migrate_device_split_page
       [not found] <20251216201206.1660899-1-francois.dugast@intel.com>
@ 2025-12-16 20:10 ` Francois Dugast
  2025-12-16 20:34   ` Matthew Wilcox
  2026-01-07 20:20   ` Zi Yan
  0 siblings, 2 replies; 15+ messages in thread
From: Francois Dugast @ 2025-12-16 20:10 UTC (permalink / raw)
  To: intel-xe
  Cc: dri-devel, Matthew Brost, Andrew Morton, Balbir Singh, linux-mm,
	Francois Dugast

From: Matthew Brost <matthew.brost@intel.com>

Introduce migrate_device_split_page() to split a device page into
lower-order pages. Used when a folio allocated as higher-order is freed
and later reallocated at a smaller order by the driver memory manager.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Balbir Singh <balbirs@nvidia.com>
Cc: dri-devel@lists.freedesktop.org
Cc: linux-mm@kvack.org
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Francois Dugast <francois.dugast@intel.com>
---
 include/linux/huge_mm.h |  3 +++
 include/linux/migrate.h |  1 +
 mm/huge_memory.c        |  6 ++---
 mm/migrate_device.c     | 49 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a4d9f964dfde..6ad8f359bc0d 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -374,6 +374,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
 int folio_split_unmapped(struct folio *folio, unsigned int new_order);
 unsigned int min_order_for_split(struct folio *folio);
 int split_folio_to_list(struct folio *folio, struct list_head *list);
+int __split_unmapped_folio(struct folio *folio, int new_order,
+			   struct page *split_at, struct xa_state *xas,
+			   struct address_space *mapping, enum split_type split_type);
 int folio_check_splittable(struct folio *folio, unsigned int new_order,
 			   enum split_type split_type);
 int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 26ca00c325d9..ec65e4fd5f88 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -192,6 +192,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
 			unsigned long npages);
 void migrate_device_finalize(unsigned long *src_pfns,
 			unsigned long *dst_pfns, unsigned long npages);
+int migrate_device_split_page(struct page *page);
 
 #endif /* CONFIG_MIGRATION */
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40cf59301c21..7ded35a3ecec 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3621,9 +3621,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
  * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
  * split but not to @new_order, the caller needs to check)
  */
-static int __split_unmapped_folio(struct folio *folio, int new_order,
-		struct page *split_at, struct xa_state *xas,
-		struct address_space *mapping, enum split_type split_type)
+int __split_unmapped_folio(struct folio *folio, int new_order,
+			   struct page *split_at, struct xa_state *xas,
+			   struct address_space *mapping, enum split_type split_type)
 {
 	const bool is_anon = folio_test_anon(folio);
 	int old_order = folio_order(folio);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 23379663b1e1..eb0f0e938947 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -775,6 +775,49 @@ int migrate_vma_setup(struct migrate_vma *args)
 EXPORT_SYMBOL(migrate_vma_setup);
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+/**
+ * migrate_device_split_page() - Split device page
+ * @page: Device page to split
+ *
+ * Splits a device page into smaller pages. Typically called when reallocating a
+ * folio to a smaller size. Inherently racy—only safe if the caller ensures
+ * mutual exclusion within the page's folio (i.e., no other threads are using
+ * pages within the folio). Expected to be called a free device page and
+ * restores all split out pages to a free state.
+ */
+int migrate_device_split_page(struct page *page)
+{
+	struct folio *folio = page_folio(page);
+	struct dev_pagemap *pgmap = folio->pgmap;
+	struct page *unlock_page = folio_page(folio, 0);
+	unsigned int order = folio_order(folio), i;
+	int ret = 0;
+
+	VM_BUG_ON_FOLIO(!order, folio);
+	VM_BUG_ON_FOLIO(!folio_is_device_private(folio), folio);
+	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+
+	folio_lock(folio);
+
+	ret = __split_unmapped_folio(folio, 0, page, NULL, NULL, SPLIT_TYPE_UNIFORM);
+	if (ret) {
+	       /*
+		* We can't fail here unless the caller doesn't know what they
+		* are doing.
+		*/
+		VM_BUG_ON_FOLIO(ret, folio);
+
+		return ret;
+	}
+
+	for (i = 0; i < 0x1 << order; ++i, ++unlock_page) {
+		page_folio(unlock_page)->pgmap = pgmap;
+		folio_unlock(page_folio(unlock_page));
+	}
+
+	return 0;
+}
+
 /**
  * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm
  * at @addr. folio is already allocated as a part of the migration process with
@@ -927,6 +970,11 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
 	return ret;
 }
 #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+int migrate_device_split_page(struct page *page)
+{
+	return 0;
+}
+
 static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
 					 unsigned long addr,
 					 struct page *page,
@@ -943,6 +991,7 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
 	return 0;
 }
 #endif
+EXPORT_SYMBOL(migrate_device_split_page);
 
 static unsigned long migrate_vma_nr_pages(unsigned long *src)
 {
-- 
2.43.0



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2025-12-16 20:10 ` [PATCH 1/4] mm/migrate: Add migrate_device_split_page Francois Dugast
@ 2025-12-16 20:34   ` Matthew Wilcox
  2025-12-16 21:39     ` Matthew Brost
  2026-01-07 20:20   ` Zi Yan
  1 sibling, 1 reply; 15+ messages in thread
From: Matthew Wilcox @ 2025-12-16 20:34 UTC (permalink / raw)
  To: Francois Dugast
  Cc: intel-xe, dri-devel, Matthew Brost, Andrew Morton, Balbir Singh,
	linux-mm

On Tue, Dec 16, 2025 at 09:10:11PM +0100, Francois Dugast wrote:
> +	ret = __split_unmapped_folio(folio, 0, page, NULL, NULL, SPLIT_TYPE_UNIFORM);

We're trying to get rid of uniform splits.  Why do you need this to be
uniform?


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2025-12-16 20:34   ` Matthew Wilcox
@ 2025-12-16 21:39     ` Matthew Brost
  2026-01-06  2:39       ` Matthew Brost
  0 siblings, 1 reply; 15+ messages in thread
From: Matthew Brost @ 2025-12-16 21:39 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Francois Dugast, intel-xe, dri-devel, Andrew Morton,
	Balbir Singh, linux-mm

On Tue, Dec 16, 2025 at 08:34:46PM +0000, Matthew Wilcox wrote:
> On Tue, Dec 16, 2025 at 09:10:11PM +0100, Francois Dugast wrote:
> > +	ret = __split_unmapped_folio(folio, 0, page, NULL, NULL, SPLIT_TYPE_UNIFORM);
> 
> We're trying to get rid of uniform splits.  Why do you need this to be
> uniform?

It’s very possible we’re doing this incorrectly due to a lack of core MM
experience. I believe Zi Yan suggested this approach (use
__split_unmapped_folio) a while back.

Let me start by explaining what we’re trying to do and see if there’s a
better suggestion for how to accomplish it.

Would SPLIT_TYPE_NON_UNIFORM split work here? Or do you have another
suggestion on how to split the folio aside from __split_unmapped_folio?

This covers the case where a GPU device page was allocated as a THP
(e.g., we call zone_device_folio_init with an order of 9). Later, this
page is freed/unmapped and then reallocated for a CPU VMA that is
smaller than a THP (e.g., we’d allocate either 4KB or 64KB based on
CPU VMA size alignment). At this point, we need to split the device
folio so we can migrate data into 4KB device pages.

Would SPLIT_TYPE_NON_UNIFORM work here? Or do you have another
suggestion for splitting the folio aside from __split_unmapped_folio?

Matt

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2025-12-16 21:39     ` Matthew Brost
@ 2026-01-06  2:39       ` Matthew Brost
  2026-01-07 20:15         ` Zi Yan
  0 siblings, 1 reply; 15+ messages in thread
From: Matthew Brost @ 2026-01-06  2:39 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Francois Dugast, intel-xe, dri-devel, Andrew Morton,
	Balbir Singh, linux-mm

On Tue, Dec 16, 2025 at 01:39:50PM -0800, Matthew Brost wrote:
> On Tue, Dec 16, 2025 at 08:34:46PM +0000, Matthew Wilcox wrote:
> > On Tue, Dec 16, 2025 at 09:10:11PM +0100, Francois Dugast wrote:
> > > +	ret = __split_unmapped_folio(folio, 0, page, NULL, NULL, SPLIT_TYPE_UNIFORM);
> > 
> > We're trying to get rid of uniform splits.  Why do you need this to be
> > uniform?

I looked into this bit more - we do want a uniform split here. What we
want is to split the THP into 512 4k pages here.

Per the doc for __split_unmapped_folio:

3590  * @split_at: in buddy allocator like split, the folio containing @split_at
3591  *            will be split until its order becomes @new_order.

I think this implies some of the pages may still be a higher order which
is not desired behavior for this usage.

Matt

> 
> It’s very possible we’re doing this incorrectly due to a lack of core MM
> experience. I believe Zi Yan suggested this approach (use
> __split_unmapped_folio) a while back.
> 
> Let me start by explaining what we’re trying to do and see if there’s a
> better suggestion for how to accomplish it.
> 
> Would SPLIT_TYPE_NON_UNIFORM split work here? Or do you have another
> suggestion on how to split the folio aside from __split_unmapped_folio?
> 
> This covers the case where a GPU device page was allocated as a THP
> (e.g., we call zone_device_folio_init with an order of 9). Later, this
> page is freed/unmapped and then reallocated for a CPU VMA that is
> smaller than a THP (e.g., we’d allocate either 4KB or 64KB based on
> CPU VMA size alignment). At this point, we need to split the device
> folio so we can migrate data into 4KB device pages.
> 
> Would SPLIT_TYPE_NON_UNIFORM work here? Or do you have another
> suggestion for splitting the folio aside from __split_unmapped_folio?
> 
> Matt


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2026-01-06  2:39       ` Matthew Brost
@ 2026-01-07 20:15         ` Zi Yan
  0 siblings, 0 replies; 15+ messages in thread
From: Zi Yan @ 2026-01-07 20:15 UTC (permalink / raw)
  To: Matthew Brost
  Cc: Matthew Wilcox, Francois Dugast, intel-xe, dri-devel,
	Andrew Morton, Balbir Singh, linux-mm

On 5 Jan 2026, at 21:39, Matthew Brost wrote:

> On Tue, Dec 16, 2025 at 01:39:50PM -0800, Matthew Brost wrote:
>> On Tue, Dec 16, 2025 at 08:34:46PM +0000, Matthew Wilcox wrote:
>>> On Tue, Dec 16, 2025 at 09:10:11PM +0100, Francois Dugast wrote:
>>>> +	ret = __split_unmapped_folio(folio, 0, page, NULL, NULL, SPLIT_TYPE_UNIFORM);
>>>
>>> We're trying to get rid of uniform splits.  Why do you need this to be
>>> uniform?
>
> I looked into this bit more - we do want a uniform split here. What we
> want is to split the THP into 512 4k pages here.
>
> Per the doc for __split_unmapped_folio:
>
> 3590  * @split_at: in buddy allocator like split, the folio containing @split_at
> 3591  *            will be split until its order becomes @new_order.
>
> I think this implies some of the pages may still be a higher order which
> is not desired behavior for this usage.

IIUC, this is because there is no mTHP support in device private folio yet
and device private folio can only be order-0 or order-9. But after adding
mTHP support, non uniform split should work, since as you said below,
only 4KB or 64KB is reallocated in CPU memory.

In terms of mTHP support in device private folio, how much effort will it
take? Maybe add a TODO in migrate_device_split_page(), saying move to
NON_UNIFORM when mTHP support is ready.

>
> Matt
>
>>
>> It’s very possible we’re doing this incorrectly due to a lack of core MM
>> experience. I believe Zi Yan suggested this approach (use
>> __split_unmapped_folio) a while back.
>>
>> Let me start by explaining what we’re trying to do and see if there’s a
>> better suggestion for how to accomplish it.
>>
>> Would SPLIT_TYPE_NON_UNIFORM split work here? Or do you have another
>> suggestion on how to split the folio aside from __split_unmapped_folio?
>>
>> This covers the case where a GPU device page was allocated as a THP
>> (e.g., we call zone_device_folio_init with an order of 9). Later, this
>> page is freed/unmapped and then reallocated for a CPU VMA that is
>> smaller than a THP (e.g., we’d allocate either 4KB or 64KB based on
>> CPU VMA size alignment). At this point, we need to split the device
>> folio so we can migrate data into 4KB device pages.
>>
>> Would SPLIT_TYPE_NON_UNIFORM work here? Or do you have another
>> suggestion for splitting the folio aside from __split_unmapped_folio?
>>
>> Matt


Best Regards,
Yan, Zi


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2025-12-16 20:10 ` [PATCH 1/4] mm/migrate: Add migrate_device_split_page Francois Dugast
  2025-12-16 20:34   ` Matthew Wilcox
@ 2026-01-07 20:20   ` Zi Yan
  2026-01-07 20:38     ` Zi Yan
  1 sibling, 1 reply; 15+ messages in thread
From: Zi Yan @ 2026-01-07 20:20 UTC (permalink / raw)
  To: Francois Dugast
  Cc: intel-xe, dri-devel, Matthew Brost, Andrew Morton, Balbir Singh,
	linux-mm, David Hildenbrand, Lorenzo Stoakes, Baolin Wang,
	Liam R. Howlett, Nico Pache, Ryan Roberts, Dev Jain, Barry Song,
	Lance Yang

+THP folks

On 16 Dec 2025, at 15:10, Francois Dugast wrote:

> From: Matthew Brost <matthew.brost@intel.com>
>
> Introduce migrate_device_split_page() to split a device page into
> lower-order pages. Used when a folio allocated as higher-order is freed
> and later reallocated at a smaller order by the driver memory manager.
>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Balbir Singh <balbirs@nvidia.com>
> Cc: dri-devel@lists.freedesktop.org
> Cc: linux-mm@kvack.org
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
> ---
>  include/linux/huge_mm.h |  3 +++
>  include/linux/migrate.h |  1 +
>  mm/huge_memory.c        |  6 ++---
>  mm/migrate_device.c     | 49 +++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 56 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index a4d9f964dfde..6ad8f359bc0d 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -374,6 +374,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
>  int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>  unsigned int min_order_for_split(struct folio *folio);
>  int split_folio_to_list(struct folio *folio, struct list_head *list);
> +int __split_unmapped_folio(struct folio *folio, int new_order,
> +			   struct page *split_at, struct xa_state *xas,
> +			   struct address_space *mapping, enum split_type split_type);
>  int folio_check_splittable(struct folio *folio, unsigned int new_order,
>  			   enum split_type split_type);
>  int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index 26ca00c325d9..ec65e4fd5f88 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -192,6 +192,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
>  			unsigned long npages);
>  void migrate_device_finalize(unsigned long *src_pfns,
>  			unsigned long *dst_pfns, unsigned long npages);
> +int migrate_device_split_page(struct page *page);
>
>  #endif /* CONFIG_MIGRATION */
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 40cf59301c21..7ded35a3ecec 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3621,9 +3621,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
>   * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
>   * split but not to @new_order, the caller needs to check)
>   */
> -static int __split_unmapped_folio(struct folio *folio, int new_order,
> -		struct page *split_at, struct xa_state *xas,
> -		struct address_space *mapping, enum split_type split_type)
> +int __split_unmapped_folio(struct folio *folio, int new_order,
> +			   struct page *split_at, struct xa_state *xas,
> +			   struct address_space *mapping, enum split_type split_type)
>  {
>  	const bool is_anon = folio_test_anon(folio);
>  	int old_order = folio_order(folio);
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index 23379663b1e1..eb0f0e938947 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -775,6 +775,49 @@ int migrate_vma_setup(struct migrate_vma *args)
>  EXPORT_SYMBOL(migrate_vma_setup);
>
>  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +/**
> + * migrate_device_split_page() - Split device page
> + * @page: Device page to split
> + *
> + * Splits a device page into smaller pages. Typically called when reallocating a
> + * folio to a smaller size. Inherently racy—only safe if the caller ensures
> + * mutual exclusion within the page's folio (i.e., no other threads are using
> + * pages within the folio). Expected to be called a free device page and
> + * restores all split out pages to a free state.
> + */
> +int migrate_device_split_page(struct page *page)
> +{
> +	struct folio *folio = page_folio(page);
> +	struct dev_pagemap *pgmap = folio->pgmap;
> +	struct page *unlock_page = folio_page(folio, 0);
> +	unsigned int order = folio_order(folio), i;
> +	int ret = 0;
> +
> +	VM_BUG_ON_FOLIO(!order, folio);
> +	VM_BUG_ON_FOLIO(!folio_is_device_private(folio), folio);
> +	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
> +
> +	folio_lock(folio);
> +
> +	ret = __split_unmapped_folio(folio, 0, page, NULL, NULL, SPLIT_TYPE_UNIFORM);
> +	if (ret) {
> +	       /*
> +		* We can't fail here unless the caller doesn't know what they
> +		* are doing.
> +		*/
> +		VM_BUG_ON_FOLIO(ret, folio);
> +
> +		return ret;
> +	}
> +
> +	for (i = 0; i < 0x1 << order; ++i, ++unlock_page) {
> +		page_folio(unlock_page)->pgmap = pgmap;
> +		folio_unlock(page_folio(unlock_page));
> +	}
> +
> +	return 0;
> +}
> +
>  /**
>   * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm
>   * at @addr. folio is already allocated as a part of the migration process with
> @@ -927,6 +970,11 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
>  	return ret;
>  }
>  #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
> +int migrate_device_split_page(struct page *page)
> +{
> +	return 0;
> +}
> +
>  static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
>  					 unsigned long addr,
>  					 struct page *page,
> @@ -943,6 +991,7 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
>  	return 0;
>  }
>  #endif
> +EXPORT_SYMBOL(migrate_device_split_page);
>
>  static unsigned long migrate_vma_nr_pages(unsigned long *src)
>  {
> -- 
> 2.43.0


Best Regards,
Yan, Zi


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2026-01-07 20:20   ` Zi Yan
@ 2026-01-07 20:38     ` Zi Yan
  2026-01-07 21:15       ` Matthew Brost
  0 siblings, 1 reply; 15+ messages in thread
From: Zi Yan @ 2026-01-07 20:38 UTC (permalink / raw)
  To: Francois Dugast
  Cc: intel-xe, dri-devel, Matthew Brost, Andrew Morton, Balbir Singh,
	linux-mm, David Hildenbrand, Lorenzo Stoakes, Baolin Wang,
	Liam R. Howlett, Nico Pache, Ryan Roberts, Dev Jain, Barry Song,
	Lance Yang, Matthew Wilcox

On 7 Jan 2026, at 15:20, Zi Yan wrote:

> +THP folks

+willy, since he commented in another thread.

>
> On 16 Dec 2025, at 15:10, Francois Dugast wrote:
>
>> From: Matthew Brost <matthew.brost@intel.com>
>>
>> Introduce migrate_device_split_page() to split a device page into
>> lower-order pages. Used when a folio allocated as higher-order is freed
>> and later reallocated at a smaller order by the driver memory manager.
>>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Balbir Singh <balbirs@nvidia.com>
>> Cc: dri-devel@lists.freedesktop.org
>> Cc: linux-mm@kvack.org
>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
>> ---
>>  include/linux/huge_mm.h |  3 +++
>>  include/linux/migrate.h |  1 +
>>  mm/huge_memory.c        |  6 ++---
>>  mm/migrate_device.c     | 49 +++++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 56 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index a4d9f964dfde..6ad8f359bc0d 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -374,6 +374,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
>>  int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>>  unsigned int min_order_for_split(struct folio *folio);
>>  int split_folio_to_list(struct folio *folio, struct list_head *list);
>> +int __split_unmapped_folio(struct folio *folio, int new_order,
>> +			   struct page *split_at, struct xa_state *xas,
>> +			   struct address_space *mapping, enum split_type split_type);
>>  int folio_check_splittable(struct folio *folio, unsigned int new_order,
>>  			   enum split_type split_type);
>>  int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>> index 26ca00c325d9..ec65e4fd5f88 100644
>> --- a/include/linux/migrate.h
>> +++ b/include/linux/migrate.h
>> @@ -192,6 +192,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
>>  			unsigned long npages);
>>  void migrate_device_finalize(unsigned long *src_pfns,
>>  			unsigned long *dst_pfns, unsigned long npages);
>> +int migrate_device_split_page(struct page *page);
>>
>>  #endif /* CONFIG_MIGRATION */
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 40cf59301c21..7ded35a3ecec 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3621,9 +3621,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
>>   * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
>>   * split but not to @new_order, the caller needs to check)
>>   */
>> -static int __split_unmapped_folio(struct folio *folio, int new_order,
>> -		struct page *split_at, struct xa_state *xas,
>> -		struct address_space *mapping, enum split_type split_type)
>> +int __split_unmapped_folio(struct folio *folio, int new_order,
>> +			   struct page *split_at, struct xa_state *xas,
>> +			   struct address_space *mapping, enum split_type split_type)
>>  {
>>  	const bool is_anon = folio_test_anon(folio);
>>  	int old_order = folio_order(folio);
>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> index 23379663b1e1..eb0f0e938947 100644
>> --- a/mm/migrate_device.c
>> +++ b/mm/migrate_device.c
>> @@ -775,6 +775,49 @@ int migrate_vma_setup(struct migrate_vma *args)
>>  EXPORT_SYMBOL(migrate_vma_setup);
>>
>>  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
>> +/**
>> + * migrate_device_split_page() - Split device page
>> + * @page: Device page to split
>> + *
>> + * Splits a device page into smaller pages. Typically called when reallocating a
>> + * folio to a smaller size. Inherently racy—only safe if the caller ensures
>> + * mutual exclusion within the page's folio (i.e., no other threads are using
>> + * pages within the folio). Expected to be called a free device page and
>> + * restores all split out pages to a free state.
>> + */

Do you mind explaining why __split_unmapped_folio() is needed for a free device
page? A free page is not supposed to be a large folio, at least from a core
MM point of view. __split_unmapped_folio() is intended to work on large folios
(or compound pages), even if the input folio has refcount == 0 (because it is
frozen).

>> +int migrate_device_split_page(struct page *page)
>> +{
>> +	struct folio *folio = page_folio(page);
>> +	struct dev_pagemap *pgmap = folio->pgmap;
>> +	struct page *unlock_page = folio_page(folio, 0);
>> +	unsigned int order = folio_order(folio), i;
>> +	int ret = 0;
>> +
>> +	VM_BUG_ON_FOLIO(!order, folio);
>> +	VM_BUG_ON_FOLIO(!folio_is_device_private(folio), folio);
>> +	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);

Please use VM_WARN_ON_FOLIO() instead to catch errors. There is no need to crash
the kernel

>> +
>> +	folio_lock(folio);
>> +
>> +	ret = __split_unmapped_folio(folio, 0, page, NULL, NULL, SPLIT_TYPE_UNIFORM);
>> +	if (ret) {
>> +	       /*
>> +		* We can't fail here unless the caller doesn't know what they
>> +		* are doing.
>> +		*/
>> +		VM_BUG_ON_FOLIO(ret, folio);

Same here.

>> +
>> +		return ret;
>> +	}
>> +
>> +	for (i = 0; i < 0x1 << order; ++i, ++unlock_page) {
>> +		page_folio(unlock_page)->pgmap = pgmap;
>> +		folio_unlock(page_folio(unlock_page));
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  /**
>>   * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm
>>   * at @addr. folio is already allocated as a part of the migration process with
>> @@ -927,6 +970,11 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
>>  	return ret;
>>  }
>>  #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
>> +int migrate_device_split_page(struct page *page)
>> +{
>> +	return 0;
>> +}
>> +
>>  static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
>>  					 unsigned long addr,
>>  					 struct page *page,
>> @@ -943,6 +991,7 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
>>  	return 0;
>>  }
>>  #endif
>> +EXPORT_SYMBOL(migrate_device_split_page);
>>
>>  static unsigned long migrate_vma_nr_pages(unsigned long *src)
>>  {
>> -- 
>> 2.43.0
>
>
> Best Regards,
> Yan, Zi


Best Regards,
Yan, Zi


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2026-01-07 20:38     ` Zi Yan
@ 2026-01-07 21:15       ` Matthew Brost
  2026-01-07 22:03         ` Zi Yan
  0 siblings, 1 reply; 15+ messages in thread
From: Matthew Brost @ 2026-01-07 21:15 UTC (permalink / raw)
  To: Zi Yan
  Cc: Francois Dugast, intel-xe, dri-devel, Andrew Morton,
	Balbir Singh, linux-mm, David Hildenbrand, Lorenzo Stoakes,
	Baolin Wang, Liam R. Howlett, Nico Pache, Ryan Roberts, Dev Jain,
	Barry Song, Lance Yang, Matthew Wilcox

On Wed, Jan 07, 2026 at 03:38:35PM -0500, Zi Yan wrote:
> On 7 Jan 2026, at 15:20, Zi Yan wrote:
> 
> > +THP folks
> 
> +willy, since he commented in another thread.
> 
> >
> > On 16 Dec 2025, at 15:10, Francois Dugast wrote:
> >
> >> From: Matthew Brost <matthew.brost@intel.com>
> >>
> >> Introduce migrate_device_split_page() to split a device page into
> >> lower-order pages. Used when a folio allocated as higher-order is freed
> >> and later reallocated at a smaller order by the driver memory manager.
> >>
> >> Cc: Andrew Morton <akpm@linux-foundation.org>
> >> Cc: Balbir Singh <balbirs@nvidia.com>
> >> Cc: dri-devel@lists.freedesktop.org
> >> Cc: linux-mm@kvack.org
> >> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> >> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
> >> ---
> >>  include/linux/huge_mm.h |  3 +++
> >>  include/linux/migrate.h |  1 +
> >>  mm/huge_memory.c        |  6 ++---
> >>  mm/migrate_device.c     | 49 +++++++++++++++++++++++++++++++++++++++++
> >>  4 files changed, 56 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >> index a4d9f964dfde..6ad8f359bc0d 100644
> >> --- a/include/linux/huge_mm.h
> >> +++ b/include/linux/huge_mm.h
> >> @@ -374,6 +374,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
> >>  int folio_split_unmapped(struct folio *folio, unsigned int new_order);
> >>  unsigned int min_order_for_split(struct folio *folio);
> >>  int split_folio_to_list(struct folio *folio, struct list_head *list);
> >> +int __split_unmapped_folio(struct folio *folio, int new_order,
> >> +			   struct page *split_at, struct xa_state *xas,
> >> +			   struct address_space *mapping, enum split_type split_type);
> >>  int folio_check_splittable(struct folio *folio, unsigned int new_order,
> >>  			   enum split_type split_type);
> >>  int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
> >> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> >> index 26ca00c325d9..ec65e4fd5f88 100644
> >> --- a/include/linux/migrate.h
> >> +++ b/include/linux/migrate.h
> >> @@ -192,6 +192,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
> >>  			unsigned long npages);
> >>  void migrate_device_finalize(unsigned long *src_pfns,
> >>  			unsigned long *dst_pfns, unsigned long npages);
> >> +int migrate_device_split_page(struct page *page);
> >>
> >>  #endif /* CONFIG_MIGRATION */
> >>
> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index 40cf59301c21..7ded35a3ecec 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -3621,9 +3621,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
> >>   * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
> >>   * split but not to @new_order, the caller needs to check)
> >>   */
> >> -static int __split_unmapped_folio(struct folio *folio, int new_order,
> >> -		struct page *split_at, struct xa_state *xas,
> >> -		struct address_space *mapping, enum split_type split_type)
> >> +int __split_unmapped_folio(struct folio *folio, int new_order,
> >> +			   struct page *split_at, struct xa_state *xas,
> >> +			   struct address_space *mapping, enum split_type split_type)
> >>  {
> >>  	const bool is_anon = folio_test_anon(folio);
> >>  	int old_order = folio_order(folio);
> >> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> >> index 23379663b1e1..eb0f0e938947 100644
> >> --- a/mm/migrate_device.c
> >> +++ b/mm/migrate_device.c
> >> @@ -775,6 +775,49 @@ int migrate_vma_setup(struct migrate_vma *args)
> >>  EXPORT_SYMBOL(migrate_vma_setup);
> >>
> >>  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> >> +/**
> >> + * migrate_device_split_page() - Split device page
> >> + * @page: Device page to split
> >> + *
> >> + * Splits a device page into smaller pages. Typically called when reallocating a
> >> + * folio to a smaller size. Inherently racy—only safe if the caller ensures
> >> + * mutual exclusion within the page's folio (i.e., no other threads are using
> >> + * pages within the folio). Expected to be called a free device page and
> >> + * restores all split out pages to a free state.
> >> + */
> 
> Do you mind explaining why __split_unmapped_folio() is needed for a free device
> page? A free page is not supposed to be a large folio, at least from a core
> MM point of view. __split_unmapped_folio() is intended to work on large folios
> (or compound pages), even if the input folio has refcount == 0 (because it is
> frozen).
> 

Well, then maybe this is a bug in core MM where the freed page is still
a THP. Let me explain the scenario and why this is needed from my POV.

Our VRAM allocator in Xe (and several other DRM drivers) is DRM buddy.
This is a shared pool between traditional DRM GEMs (buffer objects) and
SVM allocations (pages). It doesn’t have any view of the page backing—it
basically just hands back a pointer to VRAM space that we allocate from.
From that, if it’s an SVM allocation, we can derive the device pages.

What I see happening is: a 2M buddy allocation occurs, we make the
backing device pages a large folio, and sometime later the folio
refcount goes to zero and we free the buddy allocation. Later, the buddy
allocation is reused for a smaller allocation (e.g., 4K or 64K), but the
backing pages are still a large folio. Here is where we need to split
the folio into 4K pages so we can properly migrate the pages via the
migrate_vma_* calls. Also note: if you call zone_device_page_init with
an order of zero on a large device folio, that also blows up.

Open to other ideas here for how to handle this scenario.

> >> +int migrate_device_split_page(struct page *page)
> >> +{
> >> +	struct folio *folio = page_folio(page);
> >> +	struct dev_pagemap *pgmap = folio->pgmap;
> >> +	struct page *unlock_page = folio_page(folio, 0);
> >> +	unsigned int order = folio_order(folio), i;
> >> +	int ret = 0;
> >> +
> >> +	VM_BUG_ON_FOLIO(!order, folio);
> >> +	VM_BUG_ON_FOLIO(!folio_is_device_private(folio), folio);
> >> +	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
> 
> Please use VM_WARN_ON_FOLIO() instead to catch errors. There is no need to crash
> the kernel
> 

Sure.

> >> +
> >> +	folio_lock(folio);
> >> +
> >> +	ret = __split_unmapped_folio(folio, 0, page, NULL, NULL, SPLIT_TYPE_UNIFORM);
> >> +	if (ret) {
> >> +	       /*
> >> +		* We can't fail here unless the caller doesn't know what they
> >> +		* are doing.
> >> +		*/
> >> +		VM_BUG_ON_FOLIO(ret, folio);
> 
> Same here.
> 

Will do.

Matt

> >> +
> >> +		return ret;
> >> +	}
> >> +
> >> +	for (i = 0; i < 0x1 << order; ++i, ++unlock_page) {
> >> +		page_folio(unlock_page)->pgmap = pgmap;
> >> +		folio_unlock(page_folio(unlock_page));
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>  /**
> >>   * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm
> >>   * at @addr. folio is already allocated as a part of the migration process with
> >> @@ -927,6 +970,11 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
> >>  	return ret;
> >>  }
> >>  #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
> >> +int migrate_device_split_page(struct page *page)
> >> +{
> >> +	return 0;
> >> +}
> >> +
> >>  static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
> >>  					 unsigned long addr,
> >>  					 struct page *page,
> >> @@ -943,6 +991,7 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
> >>  	return 0;
> >>  }
> >>  #endif
> >> +EXPORT_SYMBOL(migrate_device_split_page);
> >>
> >>  static unsigned long migrate_vma_nr_pages(unsigned long *src)
> >>  {
> >> -- 
> >> 2.43.0
> >
> >
> > Best Regards,
> > Yan, Zi
> 
> 
> Best Regards,
> Yan, Zi


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2026-01-07 21:15       ` Matthew Brost
@ 2026-01-07 22:03         ` Zi Yan
  2026-01-08  0:56           ` Balbir Singh
  0 siblings, 1 reply; 15+ messages in thread
From: Zi Yan @ 2026-01-07 22:03 UTC (permalink / raw)
  To: Matthew Brost, Balbir Singh, Alistair Popple
  Cc: Francois Dugast, intel-xe, dri-devel, Andrew Morton, linux-mm,
	David Hildenbrand, Lorenzo Stoakes, Baolin Wang, Liam R. Howlett,
	Nico Pache, Ryan Roberts, Dev Jain, Barry Song, Lance Yang,
	Matthew Wilcox

On 7 Jan 2026, at 16:15, Matthew Brost wrote:

> On Wed, Jan 07, 2026 at 03:38:35PM -0500, Zi Yan wrote:
>> On 7 Jan 2026, at 15:20, Zi Yan wrote:
>>
>>> +THP folks
>>
>> +willy, since he commented in another thread.
>>
>>>
>>> On 16 Dec 2025, at 15:10, Francois Dugast wrote:
>>>
>>>> From: Matthew Brost <matthew.brost@intel.com>
>>>>
>>>> Introduce migrate_device_split_page() to split a device page into
>>>> lower-order pages. Used when a folio allocated as higher-order is freed
>>>> and later reallocated at a smaller order by the driver memory manager.
>>>>
>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>> Cc: Balbir Singh <balbirs@nvidia.com>
>>>> Cc: dri-devel@lists.freedesktop.org
>>>> Cc: linux-mm@kvack.org
>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
>>>> ---
>>>>  include/linux/huge_mm.h |  3 +++
>>>>  include/linux/migrate.h |  1 +
>>>>  mm/huge_memory.c        |  6 ++---
>>>>  mm/migrate_device.c     | 49 +++++++++++++++++++++++++++++++++++++++++
>>>>  4 files changed, 56 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index a4d9f964dfde..6ad8f359bc0d 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -374,6 +374,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
>>>>  int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>>>>  unsigned int min_order_for_split(struct folio *folio);
>>>>  int split_folio_to_list(struct folio *folio, struct list_head *list);
>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
>>>> +			   struct page *split_at, struct xa_state *xas,
>>>> +			   struct address_space *mapping, enum split_type split_type);
>>>>  int folio_check_splittable(struct folio *folio, unsigned int new_order,
>>>>  			   enum split_type split_type);
>>>>  int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
>>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>>>> index 26ca00c325d9..ec65e4fd5f88 100644
>>>> --- a/include/linux/migrate.h
>>>> +++ b/include/linux/migrate.h
>>>> @@ -192,6 +192,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
>>>>  			unsigned long npages);
>>>>  void migrate_device_finalize(unsigned long *src_pfns,
>>>>  			unsigned long *dst_pfns, unsigned long npages);
>>>> +int migrate_device_split_page(struct page *page);
>>>>
>>>>  #endif /* CONFIG_MIGRATION */
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 40cf59301c21..7ded35a3ecec 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -3621,9 +3621,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
>>>>   * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
>>>>   * split but not to @new_order, the caller needs to check)
>>>>   */
>>>> -static int __split_unmapped_folio(struct folio *folio, int new_order,
>>>> -		struct page *split_at, struct xa_state *xas,
>>>> -		struct address_space *mapping, enum split_type split_type)
>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
>>>> +			   struct page *split_at, struct xa_state *xas,
>>>> +			   struct address_space *mapping, enum split_type split_type)
>>>>  {
>>>>  	const bool is_anon = folio_test_anon(folio);
>>>>  	int old_order = folio_order(folio);
>>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>>>> index 23379663b1e1..eb0f0e938947 100644
>>>> --- a/mm/migrate_device.c
>>>> +++ b/mm/migrate_device.c
>>>> @@ -775,6 +775,49 @@ int migrate_vma_setup(struct migrate_vma *args)
>>>>  EXPORT_SYMBOL(migrate_vma_setup);
>>>>
>>>>  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
>>>> +/**
>>>> + * migrate_device_split_page() - Split device page
>>>> + * @page: Device page to split
>>>> + *
>>>> + * Splits a device page into smaller pages. Typically called when reallocating a
>>>> + * folio to a smaller size. Inherently racy—only safe if the caller ensures
>>>> + * mutual exclusion within the page's folio (i.e., no other threads are using
>>>> + * pages within the folio). Expected to be called a free device page and
>>>> + * restores all split out pages to a free state.
>>>> + */
>>
>> Do you mind explaining why __split_unmapped_folio() is needed for a free device
>> page? A free page is not supposed to be a large folio, at least from a core
>> MM point of view. __split_unmapped_folio() is intended to work on large folios
>> (or compound pages), even if the input folio has refcount == 0 (because it is
>> frozen).
>>
>
> Well, then maybe this is a bug in core MM where the freed page is still
> a THP. Let me explain the scenario and why this is needed from my POV.
>
> Our VRAM allocator in Xe (and several other DRM drivers) is DRM buddy.
> This is a shared pool between traditional DRM GEMs (buffer objects) and
> SVM allocations (pages). It doesn’t have any view of the page backing—it
> basically just hands back a pointer to VRAM space that we allocate from.
> From that, if it’s an SVM allocation, we can derive the device pages.
>
> What I see happening is: a 2M buddy allocation occurs, we make the
> backing device pages a large folio, and sometime later the folio
> refcount goes to zero and we free the buddy allocation. Later, the buddy
> allocation is reused for a smaller allocation (e.g., 4K or 64K), but the
> backing pages are still a large folio. Here is where we need to split

I agree with you that it might be a bug in free_zone_device_folio() based
on my understanding. Since zone_device_page_init() calls prep_compound_page()
for >0 orders, but free_zone_device_folio() never reverse the process.

Balbir and Alistair might be able to help here.

I cherry picked the code from __free_frozen_pages() to reverse the process.
Can you give it a try to see if it solve the above issue? Thanks.

From 3aa03baa39b7e62ea079e826de6ed5aab3061e46 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Wed, 7 Jan 2026 16:49:52 -0500
Subject: [PATCH] mm/memremap: free device private folio fix
Content-Type: text/plain; charset="utf-8"

Signed-off-by: Zi Yan <ziy@nvidia.com>
---
 mm/memremap.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/mm/memremap.c b/mm/memremap.c
index 63c6ab4fdf08..483666ff7271 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -475,6 +475,21 @@ void free_zone_device_folio(struct folio *folio)
 		pgmap->ops->folio_free(folio);
 		break;
 	}
+
+	if (nr > 1) {
+		struct page *head = folio_page(folio, 0);
+
+		head[1].flags.f &= ~PAGE_FLAGS_SECOND;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+		folio->_nr_pages = 0;
+#endif
+		for (i = 1; i < nr; i++) {
+			(head + i)->mapping = NULL;
+			clear_compound_head(head + i);
+		}
+		folio->mapping = NULL;
+		head->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
+	}
 }

 void zone_device_page_init(struct page *page, unsigned int order)
-- 
2.51.0



> the folio into 4K pages so we can properly migrate the pages via the
> migrate_vma_* calls. Also note: if you call zone_device_page_init with
> an order of zero on a large device folio, that also blows up.
>
> Open to other ideas here for how to handle this scenario.
>
>>>> +int migrate_device_split_page(struct page *page)
>>>> +{
>>>> +	struct folio *folio = page_folio(page);
>>>> +	struct dev_pagemap *pgmap = folio->pgmap;
>>>> +	struct page *unlock_page = folio_page(folio, 0);
>>>> +	unsigned int order = folio_order(folio), i;
>>>> +	int ret = 0;
>>>> +
>>>> +	VM_BUG_ON_FOLIO(!order, folio);
>>>> +	VM_BUG_ON_FOLIO(!folio_is_device_private(folio), folio);
>>>> +	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
>>
>> Please use VM_WARN_ON_FOLIO() instead to catch errors. There is no need to crash
>> the kernel
>>
>
> Sure.
>
>>>> +
>>>> +	folio_lock(folio);
>>>> +
>>>> +	ret = __split_unmapped_folio(folio, 0, page, NULL, NULL, SPLIT_TYPE_UNIFORM);
>>>> +	if (ret) {
>>>> +	       /*
>>>> +		* We can't fail here unless the caller doesn't know what they
>>>> +		* are doing.
>>>> +		*/
>>>> +		VM_BUG_ON_FOLIO(ret, folio);
>>
>> Same here.
>>
>
> Will do.
>
> Matt
>
>>>> +
>>>> +		return ret;
>>>> +	}
>>>> +
>>>> +	for (i = 0; i < 0x1 << order; ++i, ++unlock_page) {
>>>> +		page_folio(unlock_page)->pgmap = pgmap;
>>>> +		folio_unlock(page_folio(unlock_page));
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>>  /**
>>>>   * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm
>>>>   * at @addr. folio is already allocated as a part of the migration process with
>>>> @@ -927,6 +970,11 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
>>>>  	return ret;
>>>>  }
>>>>  #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
>>>> +int migrate_device_split_page(struct page *page)
>>>> +{
>>>> +	return 0;
>>>> +}
>>>> +
>>>>  static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
>>>>  					 unsigned long addr,
>>>>  					 struct page *page,
>>>> @@ -943,6 +991,7 @@ static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
>>>>  	return 0;
>>>>  }
>>>>  #endif
>>>> +EXPORT_SYMBOL(migrate_device_split_page);
>>>>
>>>>  static unsigned long migrate_vma_nr_pages(unsigned long *src)
>>>>  {
>>>> -- 
>>>> 2.43.0
>>>
>>>
>>> Best Regards,
>>> Yan, Zi
>>
>>
>> Best Regards,
>> Yan, Zi


Best Regards,
Yan, Zi


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2026-01-07 22:03         ` Zi Yan
@ 2026-01-08  0:56           ` Balbir Singh
  2026-01-08  2:17             ` Matthew Brost
  0 siblings, 1 reply; 15+ messages in thread
From: Balbir Singh @ 2026-01-08  0:56 UTC (permalink / raw)
  To: Zi Yan, Matthew Brost, Alistair Popple
  Cc: Francois Dugast, intel-xe, dri-devel, Andrew Morton, linux-mm,
	David Hildenbrand, Lorenzo Stoakes, Baolin Wang, Liam R. Howlett,
	Nico Pache, Ryan Roberts, Dev Jain, Barry Song, Lance Yang,
	Matthew Wilcox

On 1/8/26 08:03, Zi Yan wrote:
> On 7 Jan 2026, at 16:15, Matthew Brost wrote:
> 
>> On Wed, Jan 07, 2026 at 03:38:35PM -0500, Zi Yan wrote:
>>> On 7 Jan 2026, at 15:20, Zi Yan wrote:
>>>
>>>> +THP folks
>>>
>>> +willy, since he commented in another thread.
>>>
>>>>
>>>> On 16 Dec 2025, at 15:10, Francois Dugast wrote:
>>>>
>>>>> From: Matthew Brost <matthew.brost@intel.com>
>>>>>
>>>>> Introduce migrate_device_split_page() to split a device page into
>>>>> lower-order pages. Used when a folio allocated as higher-order is freed
>>>>> and later reallocated at a smaller order by the driver memory manager.
>>>>>
>>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>>> Cc: Balbir Singh <balbirs@nvidia.com>
>>>>> Cc: dri-devel@lists.freedesktop.org
>>>>> Cc: linux-mm@kvack.org
>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
>>>>> ---
>>>>>  include/linux/huge_mm.h |  3 +++
>>>>>  include/linux/migrate.h |  1 +
>>>>>  mm/huge_memory.c        |  6 ++---
>>>>>  mm/migrate_device.c     | 49 +++++++++++++++++++++++++++++++++++++++++
>>>>>  4 files changed, 56 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>> index a4d9f964dfde..6ad8f359bc0d 100644
>>>>> --- a/include/linux/huge_mm.h
>>>>> +++ b/include/linux/huge_mm.h
>>>>> @@ -374,6 +374,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
>>>>>  int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>>>>>  unsigned int min_order_for_split(struct folio *folio);
>>>>>  int split_folio_to_list(struct folio *folio, struct list_head *list);
>>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
>>>>> +			   struct page *split_at, struct xa_state *xas,
>>>>> +			   struct address_space *mapping, enum split_type split_type);
>>>>>  int folio_check_splittable(struct folio *folio, unsigned int new_order,
>>>>>  			   enum split_type split_type);
>>>>>  int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
>>>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>>>>> index 26ca00c325d9..ec65e4fd5f88 100644
>>>>> --- a/include/linux/migrate.h
>>>>> +++ b/include/linux/migrate.h
>>>>> @@ -192,6 +192,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
>>>>>  			unsigned long npages);
>>>>>  void migrate_device_finalize(unsigned long *src_pfns,
>>>>>  			unsigned long *dst_pfns, unsigned long npages);
>>>>> +int migrate_device_split_page(struct page *page);
>>>>>
>>>>>  #endif /* CONFIG_MIGRATION */
>>>>>
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index 40cf59301c21..7ded35a3ecec 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -3621,9 +3621,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
>>>>>   * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
>>>>>   * split but not to @new_order, the caller needs to check)
>>>>>   */
>>>>> -static int __split_unmapped_folio(struct folio *folio, int new_order,
>>>>> -		struct page *split_at, struct xa_state *xas,
>>>>> -		struct address_space *mapping, enum split_type split_type)
>>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
>>>>> +			   struct page *split_at, struct xa_state *xas,
>>>>> +			   struct address_space *mapping, enum split_type split_type)
>>>>>  {
>>>>>  	const bool is_anon = folio_test_anon(folio);
>>>>>  	int old_order = folio_order(folio);
>>>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>>>>> index 23379663b1e1..eb0f0e938947 100644
>>>>> --- a/mm/migrate_device.c
>>>>> +++ b/mm/migrate_device.c
>>>>> @@ -775,6 +775,49 @@ int migrate_vma_setup(struct migrate_vma *args)
>>>>>  EXPORT_SYMBOL(migrate_vma_setup);
>>>>>
>>>>>  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
>>>>> +/**
>>>>> + * migrate_device_split_page() - Split device page
>>>>> + * @page: Device page to split
>>>>> + *
>>>>> + * Splits a device page into smaller pages. Typically called when reallocating a
>>>>> + * folio to a smaller size. Inherently racy—only safe if the caller ensures
>>>>> + * mutual exclusion within the page's folio (i.e., no other threads are using
>>>>> + * pages within the folio). Expected to be called a free device page and
>>>>> + * restores all split out pages to a free state.
>>>>> + */
>>>
>>> Do you mind explaining why __split_unmapped_folio() is needed for a free device
>>> page? A free page is not supposed to be a large folio, at least from a core
>>> MM point of view. __split_unmapped_folio() is intended to work on large folios
>>> (or compound pages), even if the input folio has refcount == 0 (because it is
>>> frozen).
>>>
>>
>> Well, then maybe this is a bug in core MM where the freed page is still
>> a THP. Let me explain the scenario and why this is needed from my POV.
>>
>> Our VRAM allocator in Xe (and several other DRM drivers) is DRM buddy.
>> This is a shared pool between traditional DRM GEMs (buffer objects) and
>> SVM allocations (pages). It doesn’t have any view of the page backing—it
>> basically just hands back a pointer to VRAM space that we allocate from.
>> From that, if it’s an SVM allocation, we can derive the device pages.
>>
>> What I see happening is: a 2M buddy allocation occurs, we make the
>> backing device pages a large folio, and sometime later the folio
>> refcount goes to zero and we free the buddy allocation. Later, the buddy
>> allocation is reused for a smaller allocation (e.g., 4K or 64K), but the
>> backing pages are still a large folio. Here is where we need to split
> 
> I agree with you that it might be a bug in free_zone_device_folio() based
> on my understanding. Since zone_device_page_init() calls prep_compound_page()
> for >0 orders, but free_zone_device_folio() never reverse the process.
> 
> Balbir and Alistair might be able to help here.

I agree it's an API limitation

> 
> I cherry picked the code from __free_frozen_pages() to reverse the process.
> Can you give it a try to see if it solve the above issue? Thanks.
> 
> From 3aa03baa39b7e62ea079e826de6ed5aab3061e46 Mon Sep 17 00:00:00 2001
> From: Zi Yan <ziy@nvidia.com>
> Date: Wed, 7 Jan 2026 16:49:52 -0500
> Subject: [PATCH] mm/memremap: free device private folio fix
> Content-Type: text/plain; charset="utf-8"
> 
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
>  mm/memremap.c | 15 +++++++++++++++
>  1 file changed, 15 insertions(+)
> 
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 63c6ab4fdf08..483666ff7271 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -475,6 +475,21 @@ void free_zone_device_folio(struct folio *folio)
>  		pgmap->ops->folio_free(folio);
>  		break;
>  	}
> +
> +	if (nr > 1) {
> +		struct page *head = folio_page(folio, 0);
> +
> +		head[1].flags.f &= ~PAGE_FLAGS_SECOND;
> +#ifdef NR_PAGES_IN_LARGE_FOLIO
> +		folio->_nr_pages = 0;
> +#endif
> +		for (i = 1; i < nr; i++) {
> +			(head + i)->mapping = NULL;
> +			clear_compound_head(head + i);

I see that your skipping the checks in free_page_tail_prepare()? IIUC, we should be able
to invoke it even for zone device private pages

> +		}
> +		folio->mapping = NULL;

This is already done in free_zone_device_folio()

> +		head->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;

I don't think this is required for zone device private folios, but I suppose it
keeps the code generic

> +	}
>  }
> 
>  void zone_device_page_init(struct page *page, unsigned int order)


Otherwise, it seems like the right way to solve the issue.

Balbir


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2026-01-08  0:56           ` Balbir Singh
@ 2026-01-08  2:17             ` Matthew Brost
  2026-01-08  2:53               ` Zi Yan
  0 siblings, 1 reply; 15+ messages in thread
From: Matthew Brost @ 2026-01-08  2:17 UTC (permalink / raw)
  To: Balbir Singh
  Cc: Zi Yan, Alistair Popple, Francois Dugast, intel-xe, dri-devel,
	Andrew Morton, linux-mm, David Hildenbrand, Lorenzo Stoakes,
	Baolin Wang, Liam R. Howlett, Nico Pache, Ryan Roberts, Dev Jain,
	Barry Song, Lance Yang, Matthew Wilcox

On Thu, Jan 08, 2026 at 11:56:03AM +1100, Balbir Singh wrote:
> On 1/8/26 08:03, Zi Yan wrote:
> > On 7 Jan 2026, at 16:15, Matthew Brost wrote:
> > 
> >> On Wed, Jan 07, 2026 at 03:38:35PM -0500, Zi Yan wrote:
> >>> On 7 Jan 2026, at 15:20, Zi Yan wrote:
> >>>
> >>>> +THP folks
> >>>
> >>> +willy, since he commented in another thread.
> >>>
> >>>>
> >>>> On 16 Dec 2025, at 15:10, Francois Dugast wrote:
> >>>>
> >>>>> From: Matthew Brost <matthew.brost@intel.com>
> >>>>>
> >>>>> Introduce migrate_device_split_page() to split a device page into
> >>>>> lower-order pages. Used when a folio allocated as higher-order is freed
> >>>>> and later reallocated at a smaller order by the driver memory manager.
> >>>>>
> >>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
> >>>>> Cc: Balbir Singh <balbirs@nvidia.com>
> >>>>> Cc: dri-devel@lists.freedesktop.org
> >>>>> Cc: linux-mm@kvack.org
> >>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> >>>>> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
> >>>>> ---
> >>>>>  include/linux/huge_mm.h |  3 +++
> >>>>>  include/linux/migrate.h |  1 +
> >>>>>  mm/huge_memory.c        |  6 ++---
> >>>>>  mm/migrate_device.c     | 49 +++++++++++++++++++++++++++++++++++++++++
> >>>>>  4 files changed, 56 insertions(+), 3 deletions(-)
> >>>>>
> >>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>>> index a4d9f964dfde..6ad8f359bc0d 100644
> >>>>> --- a/include/linux/huge_mm.h
> >>>>> +++ b/include/linux/huge_mm.h
> >>>>> @@ -374,6 +374,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
> >>>>>  int folio_split_unmapped(struct folio *folio, unsigned int new_order);
> >>>>>  unsigned int min_order_for_split(struct folio *folio);
> >>>>>  int split_folio_to_list(struct folio *folio, struct list_head *list);
> >>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
> >>>>> +			   struct page *split_at, struct xa_state *xas,
> >>>>> +			   struct address_space *mapping, enum split_type split_type);
> >>>>>  int folio_check_splittable(struct folio *folio, unsigned int new_order,
> >>>>>  			   enum split_type split_type);
> >>>>>  int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
> >>>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> >>>>> index 26ca00c325d9..ec65e4fd5f88 100644
> >>>>> --- a/include/linux/migrate.h
> >>>>> +++ b/include/linux/migrate.h
> >>>>> @@ -192,6 +192,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
> >>>>>  			unsigned long npages);
> >>>>>  void migrate_device_finalize(unsigned long *src_pfns,
> >>>>>  			unsigned long *dst_pfns, unsigned long npages);
> >>>>> +int migrate_device_split_page(struct page *page);
> >>>>>
> >>>>>  #endif /* CONFIG_MIGRATION */
> >>>>>
> >>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>>> index 40cf59301c21..7ded35a3ecec 100644
> >>>>> --- a/mm/huge_memory.c
> >>>>> +++ b/mm/huge_memory.c
> >>>>> @@ -3621,9 +3621,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
> >>>>>   * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
> >>>>>   * split but not to @new_order, the caller needs to check)
> >>>>>   */
> >>>>> -static int __split_unmapped_folio(struct folio *folio, int new_order,
> >>>>> -		struct page *split_at, struct xa_state *xas,
> >>>>> -		struct address_space *mapping, enum split_type split_type)
> >>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
> >>>>> +			   struct page *split_at, struct xa_state *xas,
> >>>>> +			   struct address_space *mapping, enum split_type split_type)
> >>>>>  {
> >>>>>  	const bool is_anon = folio_test_anon(folio);
> >>>>>  	int old_order = folio_order(folio);
> >>>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> >>>>> index 23379663b1e1..eb0f0e938947 100644
> >>>>> --- a/mm/migrate_device.c
> >>>>> +++ b/mm/migrate_device.c
> >>>>> @@ -775,6 +775,49 @@ int migrate_vma_setup(struct migrate_vma *args)
> >>>>>  EXPORT_SYMBOL(migrate_vma_setup);
> >>>>>
> >>>>>  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> >>>>> +/**
> >>>>> + * migrate_device_split_page() - Split device page
> >>>>> + * @page: Device page to split
> >>>>> + *
> >>>>> + * Splits a device page into smaller pages. Typically called when reallocating a
> >>>>> + * folio to a smaller size. Inherently racy—only safe if the caller ensures
> >>>>> + * mutual exclusion within the page's folio (i.e., no other threads are using
> >>>>> + * pages within the folio). Expected to be called a free device page and
> >>>>> + * restores all split out pages to a free state.
> >>>>> + */
> >>>
> >>> Do you mind explaining why __split_unmapped_folio() is needed for a free device
> >>> page? A free page is not supposed to be a large folio, at least from a core
> >>> MM point of view. __split_unmapped_folio() is intended to work on large folios
> >>> (or compound pages), even if the input folio has refcount == 0 (because it is
> >>> frozen).
> >>>
> >>
> >> Well, then maybe this is a bug in core MM where the freed page is still
> >> a THP. Let me explain the scenario and why this is needed from my POV.
> >>
> >> Our VRAM allocator in Xe (and several other DRM drivers) is DRM buddy.
> >> This is a shared pool between traditional DRM GEMs (buffer objects) and
> >> SVM allocations (pages). It doesn’t have any view of the page backing—it
> >> basically just hands back a pointer to VRAM space that we allocate from.
> >> From that, if it’s an SVM allocation, we can derive the device pages.
> >>
> >> What I see happening is: a 2M buddy allocation occurs, we make the
> >> backing device pages a large folio, and sometime later the folio
> >> refcount goes to zero and we free the buddy allocation. Later, the buddy
> >> allocation is reused for a smaller allocation (e.g., 4K or 64K), but the
> >> backing pages are still a large folio. Here is where we need to split
> > 
> > I agree with you that it might be a bug in free_zone_device_folio() based
> > on my understanding. Since zone_device_page_init() calls prep_compound_page()
> > for >0 orders, but free_zone_device_folio() never reverse the process.
> > 
> > Balbir and Alistair might be able to help here.
> 
> I agree it's an API limitation
> 
> > 
> > I cherry picked the code from __free_frozen_pages() to reverse the process.
> > Can you give it a try to see if it solve the above issue? Thanks.
> > 
> > From 3aa03baa39b7e62ea079e826de6ed5aab3061e46 Mon Sep 17 00:00:00 2001
> > From: Zi Yan <ziy@nvidia.com>
> > Date: Wed, 7 Jan 2026 16:49:52 -0500
> > Subject: [PATCH] mm/memremap: free device private folio fix
> > Content-Type: text/plain; charset="utf-8"
> > 
> > Signed-off-by: Zi Yan <ziy@nvidia.com>
> > ---
> >  mm/memremap.c | 15 +++++++++++++++
> >  1 file changed, 15 insertions(+)
> > 
> > diff --git a/mm/memremap.c b/mm/memremap.c
> > index 63c6ab4fdf08..483666ff7271 100644
> > --- a/mm/memremap.c
> > +++ b/mm/memremap.c
> > @@ -475,6 +475,21 @@ void free_zone_device_folio(struct folio *folio)
> >  		pgmap->ops->folio_free(folio);
> >  		break;
> >  	}
> > +
> > +	if (nr > 1) {
> > +		struct page *head = folio_page(folio, 0);
> > +
> > +		head[1].flags.f &= ~PAGE_FLAGS_SECOND;
> > +#ifdef NR_PAGES_IN_LARGE_FOLIO
> > +		folio->_nr_pages = 0;
> > +#endif
> > +		for (i = 1; i < nr; i++) {
> > +			(head + i)->mapping = NULL;
> > +			clear_compound_head(head + i);
> 
> I see that your skipping the checks in free_page_tail_prepare()? IIUC, we should be able
> to invoke it even for zone device private pages
> 
> > +		}
> > +		folio->mapping = NULL;
> 
> This is already done in free_zone_device_folio()
> 
> > +		head->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
> 
> I don't think this is required for zone device private folios, but I suppose it
> keeps the code generic
> 

Well, the above code doesn’t work, but I think it’s the right idea.
clear_compound_head aliases to pgmap, which we don’t want to be NULL. I
believe the individual pages likely need their flags cleared (?), and
this step must be done before calling folio_free and include a barrier,
as the page can be immediately reallocated.

So here’s what I came up with, and it seems to work (for Xe, GPU SVM):

 mm/memremap.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/mm/memremap.c b/mm/memremap.c
index 63c6ab4fdf08..ac20abb6a441 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -448,6 +448,27 @@ void free_zone_device_folio(struct folio *folio)
            pgmap->type != MEMORY_DEVICE_GENERIC)
                folio->mapping = NULL;

+       if (nr > 1) {
+               struct page *head = folio_page(folio, 0);
+
+               head[1].flags.f &= ~PAGE_FLAGS_SECOND;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+               folio->_nr_pages = 0;
+#endif
+               for (i = 1; i < nr; i++) {
+                       struct folio *new_folio = (struct folio *)(head + i);
+
+                       (head + i)->mapping = NULL;
+                       (head + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
+
+                       /* Overwrite compound_head with pgmap */
+                       new_folio->pgmap = pgmap;
+               }
+
+               head->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
+               smp_wmb();	/* Changes but be visable before freeing folio */
+       }
+
        switch (pgmap->type) {
        case MEMORY_DEVICE_PRIVATE:
        case MEMORY_DEVICE_COHERENT:

> > +	}
> >  }
> > 
> >  void zone_device_page_init(struct page *page, unsigned int order)
> 
> 
> Otherwise, it seems like the right way to solve the issue.
> 

My question is: why isn’t Nouveau hitting this issue, or your Nvidia
out-of-tree driver (lack of testing, Xe's test suite coverage is quite
good at finding corners).

Also, will this change in behavior break either ofthose drivers?

Matt

> Balbir


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2026-01-08  2:17             ` Matthew Brost
@ 2026-01-08  2:53               ` Zi Yan
  2026-01-08  3:14                 ` Alistair Popple
  0 siblings, 1 reply; 15+ messages in thread
From: Zi Yan @ 2026-01-08  2:53 UTC (permalink / raw)
  To: Matthew Brost, Balbir Singh
  Cc: Alistair Popple, Francois Dugast, intel-xe, dri-devel,
	Andrew Morton, linux-mm, David Hildenbrand, Lorenzo Stoakes,
	Baolin Wang, Liam R. Howlett, Nico Pache, Ryan Roberts, Dev Jain,
	Barry Song, Lance Yang, Matthew Wilcox

On 7 Jan 2026, at 21:17, Matthew Brost wrote:

> On Thu, Jan 08, 2026 at 11:56:03AM +1100, Balbir Singh wrote:
>> On 1/8/26 08:03, Zi Yan wrote:
>>> On 7 Jan 2026, at 16:15, Matthew Brost wrote:
>>>
>>>> On Wed, Jan 07, 2026 at 03:38:35PM -0500, Zi Yan wrote:
>>>>> On 7 Jan 2026, at 15:20, Zi Yan wrote:
>>>>>
>>>>>> +THP folks
>>>>>
>>>>> +willy, since he commented in another thread.
>>>>>
>>>>>>
>>>>>> On 16 Dec 2025, at 15:10, Francois Dugast wrote:
>>>>>>
>>>>>>> From: Matthew Brost <matthew.brost@intel.com>
>>>>>>>
>>>>>>> Introduce migrate_device_split_page() to split a device page into
>>>>>>> lower-order pages. Used when a folio allocated as higher-order is freed
>>>>>>> and later reallocated at a smaller order by the driver memory manager.
>>>>>>>
>>>>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>>>>> Cc: Balbir Singh <balbirs@nvidia.com>
>>>>>>> Cc: dri-devel@lists.freedesktop.org
>>>>>>> Cc: linux-mm@kvack.org
>>>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>>>> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
>>>>>>> ---
>>>>>>>  include/linux/huge_mm.h |  3 +++
>>>>>>>  include/linux/migrate.h |  1 +
>>>>>>>  mm/huge_memory.c        |  6 ++---
>>>>>>>  mm/migrate_device.c     | 49 +++++++++++++++++++++++++++++++++++++++++
>>>>>>>  4 files changed, 56 insertions(+), 3 deletions(-)
>>>>>>>
>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>>>> index a4d9f964dfde..6ad8f359bc0d 100644
>>>>>>> --- a/include/linux/huge_mm.h
>>>>>>> +++ b/include/linux/huge_mm.h
>>>>>>> @@ -374,6 +374,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
>>>>>>>  int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>>>>>>>  unsigned int min_order_for_split(struct folio *folio);
>>>>>>>  int split_folio_to_list(struct folio *folio, struct list_head *list);
>>>>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
>>>>>>> +			   struct page *split_at, struct xa_state *xas,
>>>>>>> +			   struct address_space *mapping, enum split_type split_type);
>>>>>>>  int folio_check_splittable(struct folio *folio, unsigned int new_order,
>>>>>>>  			   enum split_type split_type);
>>>>>>>  int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
>>>>>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>>>>>>> index 26ca00c325d9..ec65e4fd5f88 100644
>>>>>>> --- a/include/linux/migrate.h
>>>>>>> +++ b/include/linux/migrate.h
>>>>>>> @@ -192,6 +192,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
>>>>>>>  			unsigned long npages);
>>>>>>>  void migrate_device_finalize(unsigned long *src_pfns,
>>>>>>>  			unsigned long *dst_pfns, unsigned long npages);
>>>>>>> +int migrate_device_split_page(struct page *page);
>>>>>>>
>>>>>>>  #endif /* CONFIG_MIGRATION */
>>>>>>>
>>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>>>> index 40cf59301c21..7ded35a3ecec 100644
>>>>>>> --- a/mm/huge_memory.c
>>>>>>> +++ b/mm/huge_memory.c
>>>>>>> @@ -3621,9 +3621,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
>>>>>>>   * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
>>>>>>>   * split but not to @new_order, the caller needs to check)
>>>>>>>   */
>>>>>>> -static int __split_unmapped_folio(struct folio *folio, int new_order,
>>>>>>> -		struct page *split_at, struct xa_state *xas,
>>>>>>> -		struct address_space *mapping, enum split_type split_type)
>>>>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
>>>>>>> +			   struct page *split_at, struct xa_state *xas,
>>>>>>> +			   struct address_space *mapping, enum split_type split_type)
>>>>>>>  {
>>>>>>>  	const bool is_anon = folio_test_anon(folio);
>>>>>>>  	int old_order = folio_order(folio);
>>>>>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>>>>>>> index 23379663b1e1..eb0f0e938947 100644
>>>>>>> --- a/mm/migrate_device.c
>>>>>>> +++ b/mm/migrate_device.c
>>>>>>> @@ -775,6 +775,49 @@ int migrate_vma_setup(struct migrate_vma *args)
>>>>>>>  EXPORT_SYMBOL(migrate_vma_setup);
>>>>>>>
>>>>>>>  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
>>>>>>> +/**
>>>>>>> + * migrate_device_split_page() - Split device page
>>>>>>> + * @page: Device page to split
>>>>>>> + *
>>>>>>> + * Splits a device page into smaller pages. Typically called when reallocating a
>>>>>>> + * folio to a smaller size. Inherently racy—only safe if the caller ensures
>>>>>>> + * mutual exclusion within the page's folio (i.e., no other threads are using
>>>>>>> + * pages within the folio). Expected to be called a free device page and
>>>>>>> + * restores all split out pages to a free state.
>>>>>>> + */
>>>>>
>>>>> Do you mind explaining why __split_unmapped_folio() is needed for a free device
>>>>> page? A free page is not supposed to be a large folio, at least from a core
>>>>> MM point of view. __split_unmapped_folio() is intended to work on large folios
>>>>> (or compound pages), even if the input folio has refcount == 0 (because it is
>>>>> frozen).
>>>>>
>>>>
>>>> Well, then maybe this is a bug in core MM where the freed page is still
>>>> a THP. Let me explain the scenario and why this is needed from my POV.
>>>>
>>>> Our VRAM allocator in Xe (and several other DRM drivers) is DRM buddy.
>>>> This is a shared pool between traditional DRM GEMs (buffer objects) and
>>>> SVM allocations (pages). It doesn’t have any view of the page backing—it
>>>> basically just hands back a pointer to VRAM space that we allocate from.
>>>> From that, if it’s an SVM allocation, we can derive the device pages.
>>>>
>>>> What I see happening is: a 2M buddy allocation occurs, we make the
>>>> backing device pages a large folio, and sometime later the folio
>>>> refcount goes to zero and we free the buddy allocation. Later, the buddy
>>>> allocation is reused for a smaller allocation (e.g., 4K or 64K), but the
>>>> backing pages are still a large folio. Here is where we need to split
>>>
>>> I agree with you that it might be a bug in free_zone_device_folio() based
>>> on my understanding. Since zone_device_page_init() calls prep_compound_page()
>>> for >0 orders, but free_zone_device_folio() never reverse the process.
>>>
>>> Balbir and Alistair might be able to help here.
>>
>> I agree it's an API limitation

I am not sure. If free_zone_device_folio() does not get rid of large folio
metadata, there is no guarantee that a freed large device private folio will
be reallocated as a large device private folio. And when mTHP support is
added, the folio order might change too. That can cause issues when
compound_head() is called on a tail page of a previously large folio, since
compound_head() will return the old head page instead of the tail page itself.

>>
>>>
>>> I cherry picked the code from __free_frozen_pages() to reverse the process.
>>> Can you give it a try to see if it solve the above issue? Thanks.
>>>
>>> From 3aa03baa39b7e62ea079e826de6ed5aab3061e46 Mon Sep 17 00:00:00 2001
>>> From: Zi Yan <ziy@nvidia.com>
>>> Date: Wed, 7 Jan 2026 16:49:52 -0500
>>> Subject: [PATCH] mm/memremap: free device private folio fix
>>> Content-Type: text/plain; charset="utf-8"
>>>
>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>> ---
>>>  mm/memremap.c | 15 +++++++++++++++
>>>  1 file changed, 15 insertions(+)
>>>
>>> diff --git a/mm/memremap.c b/mm/memremap.c
>>> index 63c6ab4fdf08..483666ff7271 100644
>>> --- a/mm/memremap.c
>>> +++ b/mm/memremap.c
>>> @@ -475,6 +475,21 @@ void free_zone_device_folio(struct folio *folio)
>>>  		pgmap->ops->folio_free(folio);
>>>  		break;
>>>  	}
>>> +
>>> +	if (nr > 1) {
>>> +		struct page *head = folio_page(folio, 0);
>>> +
>>> +		head[1].flags.f &= ~PAGE_FLAGS_SECOND;
>>> +#ifdef NR_PAGES_IN_LARGE_FOLIO
>>> +		folio->_nr_pages = 0;
>>> +#endif
>>> +		for (i = 1; i < nr; i++) {
>>> +			(head + i)->mapping = NULL;
>>> +			clear_compound_head(head + i);
>>
>> I see that your skipping the checks in free_page_tail_prepare()? IIUC, we should be able
>> to invoke it even for zone device private pages

I am not sure about what part of compound page is also used in device private folio.
Yes, it is better to add right checks.

>>
>>> +		}
>>> +		folio->mapping = NULL;
>>
>> This is already done in free_zone_device_folio()
>>
>>> +		head->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
>>
>> I don't think this is required for zone device private folios, but I suppose it
>> keeps the code generic
>>
>
> Well, the above code doesn’t work, but I think it’s the right idea.
> clear_compound_head aliases to pgmap, which we don’t want to be NULL. I

Thank you for pointing it out. I am not familiar with device private page code.

> believe the individual pages likely need their flags cleared (?), and

Yes, I missed the tail page flag clearing part.

> this step must be done before calling folio_free and include a barrier,
> as the page can be immediately reallocated.
>
> So here’s what I came up with, and it seems to work (for Xe, GPU SVM):
>
>  mm/memremap.c | 21 +++++++++++++++++++++
>  1 file changed, 21 insertions(+)
>
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 63c6ab4fdf08..ac20abb6a441 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -448,6 +448,27 @@ void free_zone_device_folio(struct folio *folio)
>             pgmap->type != MEMORY_DEVICE_GENERIC)
>                 folio->mapping = NULL;
>
> +       if (nr > 1) {
> +               struct page *head = folio_page(folio, 0);
> +
> +               head[1].flags.f &= ~PAGE_FLAGS_SECOND;
> +#ifdef NR_PAGES_IN_LARGE_FOLIO
> +               folio->_nr_pages = 0;
> +#endif
> +               for (i = 1; i < nr; i++) {
> +                       struct folio *new_folio = (struct folio *)(head + i);
> +
> +                       (head + i)->mapping = NULL;
> +                       (head + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
> +
> +                       /* Overwrite compound_head with pgmap */
> +                       new_folio->pgmap = pgmap;
> +               }
> +
> +               head->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
> +               smp_wmb();	/* Changes but be visable before freeing folio */
> +       }
> +
>         switch (pgmap->type) {
>         case MEMORY_DEVICE_PRIVATE:
>         case MEMORY_DEVICE_COHERENT:
>

It looks good to me, but I am very likely missing the detail on device private
pages. Like Balbir pointed out above, for tail pages, calling
free_tail_page_prepare() might be better to get sanity checks like normal
large folio, although you will need to set ->pgmap after it.

It is better to send it as a proper patch and get reviews from other
MM folks.

>>> +	}
>>>  }
>>>
>>>  void zone_device_page_init(struct page *page, unsigned int order)
>>
>>
>> Otherwise, it seems like the right way to solve the issue.
>>
>
> My question is: why isn’t Nouveau hitting this issue, or your Nvidia
> out-of-tree driver (lack of testing, Xe's test suite coverage is quite
> good at finding corners).
>
> Also, will this change in behavior break either ofthose drivers?
>
> Matt
>
>> Balbir


Best Regards,
Yan, Zi


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2026-01-08  2:53               ` Zi Yan
@ 2026-01-08  3:14                 ` Alistair Popple
  2026-01-08  3:42                   ` Matthew Brost
  0 siblings, 1 reply; 15+ messages in thread
From: Alistair Popple @ 2026-01-08  3:14 UTC (permalink / raw)
  To: Zi Yan
  Cc: Matthew Brost, Balbir Singh, Francois Dugast, intel-xe,
	dri-devel, Andrew Morton, linux-mm, David Hildenbrand,
	Lorenzo Stoakes, Baolin Wang, Liam R. Howlett, Nico Pache,
	Ryan Roberts, Dev Jain, Barry Song, Lance Yang, Matthew Wilcox

On 2026-01-08 at 13:53 +1100, Zi Yan <ziy@nvidia.com> wrote...
> On 7 Jan 2026, at 21:17, Matthew Brost wrote:
> 
> > On Thu, Jan 08, 2026 at 11:56:03AM +1100, Balbir Singh wrote:
> >> On 1/8/26 08:03, Zi Yan wrote:
> >>> On 7 Jan 2026, at 16:15, Matthew Brost wrote:
> >>>
> >>>> On Wed, Jan 07, 2026 at 03:38:35PM -0500, Zi Yan wrote:
> >>>>> On 7 Jan 2026, at 15:20, Zi Yan wrote:
> >>>>>
> >>>>>> +THP folks
> >>>>>
> >>>>> +willy, since he commented in another thread.
> >>>>>
> >>>>>>
> >>>>>> On 16 Dec 2025, at 15:10, Francois Dugast wrote:
> >>>>>>
> >>>>>>> From: Matthew Brost <matthew.brost@intel.com>
> >>>>>>>
> >>>>>>> Introduce migrate_device_split_page() to split a device page into
> >>>>>>> lower-order pages. Used when a folio allocated as higher-order is freed
> >>>>>>> and later reallocated at a smaller order by the driver memory manager.
> >>>>>>>
> >>>>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
> >>>>>>> Cc: Balbir Singh <balbirs@nvidia.com>
> >>>>>>> Cc: dri-devel@lists.freedesktop.org
> >>>>>>> Cc: linux-mm@kvack.org
> >>>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> >>>>>>> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
> >>>>>>> ---
> >>>>>>>  include/linux/huge_mm.h |  3 +++
> >>>>>>>  include/linux/migrate.h |  1 +
> >>>>>>>  mm/huge_memory.c        |  6 ++---
> >>>>>>>  mm/migrate_device.c     | 49 +++++++++++++++++++++++++++++++++++++++++
> >>>>>>>  4 files changed, 56 insertions(+), 3 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>>>>> index a4d9f964dfde..6ad8f359bc0d 100644
> >>>>>>> --- a/include/linux/huge_mm.h
> >>>>>>> +++ b/include/linux/huge_mm.h
> >>>>>>> @@ -374,6 +374,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
> >>>>>>>  int folio_split_unmapped(struct folio *folio, unsigned int new_order);
> >>>>>>>  unsigned int min_order_for_split(struct folio *folio);
> >>>>>>>  int split_folio_to_list(struct folio *folio, struct list_head *list);
> >>>>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
> >>>>>>> +			   struct page *split_at, struct xa_state *xas,
> >>>>>>> +			   struct address_space *mapping, enum split_type split_type);
> >>>>>>>  int folio_check_splittable(struct folio *folio, unsigned int new_order,
> >>>>>>>  			   enum split_type split_type);
> >>>>>>>  int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
> >>>>>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> >>>>>>> index 26ca00c325d9..ec65e4fd5f88 100644
> >>>>>>> --- a/include/linux/migrate.h
> >>>>>>> +++ b/include/linux/migrate.h
> >>>>>>> @@ -192,6 +192,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
> >>>>>>>  			unsigned long npages);
> >>>>>>>  void migrate_device_finalize(unsigned long *src_pfns,
> >>>>>>>  			unsigned long *dst_pfns, unsigned long npages);
> >>>>>>> +int migrate_device_split_page(struct page *page);
> >>>>>>>
> >>>>>>>  #endif /* CONFIG_MIGRATION */
> >>>>>>>
> >>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>>>>> index 40cf59301c21..7ded35a3ecec 100644
> >>>>>>> --- a/mm/huge_memory.c
> >>>>>>> +++ b/mm/huge_memory.c
> >>>>>>> @@ -3621,9 +3621,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
> >>>>>>>   * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
> >>>>>>>   * split but not to @new_order, the caller needs to check)
> >>>>>>>   */
> >>>>>>> -static int __split_unmapped_folio(struct folio *folio, int new_order,
> >>>>>>> -		struct page *split_at, struct xa_state *xas,
> >>>>>>> -		struct address_space *mapping, enum split_type split_type)
> >>>>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
> >>>>>>> +			   struct page *split_at, struct xa_state *xas,
> >>>>>>> +			   struct address_space *mapping, enum split_type split_type)
> >>>>>>>  {
> >>>>>>>  	const bool is_anon = folio_test_anon(folio);
> >>>>>>>  	int old_order = folio_order(folio);
> >>>>>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> >>>>>>> index 23379663b1e1..eb0f0e938947 100644
> >>>>>>> --- a/mm/migrate_device.c
> >>>>>>> +++ b/mm/migrate_device.c
> >>>>>>> @@ -775,6 +775,49 @@ int migrate_vma_setup(struct migrate_vma *args)
> >>>>>>>  EXPORT_SYMBOL(migrate_vma_setup);
> >>>>>>>
> >>>>>>>  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> >>>>>>> +/**
> >>>>>>> + * migrate_device_split_page() - Split device page
> >>>>>>> + * @page: Device page to split
> >>>>>>> + *
> >>>>>>> + * Splits a device page into smaller pages. Typically called when reallocating a
> >>>>>>> + * folio to a smaller size. Inherently racy—only safe if the caller ensures
> >>>>>>> + * mutual exclusion within the page's folio (i.e., no other threads are using
> >>>>>>> + * pages within the folio). Expected to be called a free device page and
> >>>>>>> + * restores all split out pages to a free state.
> >>>>>>> + */
> >>>>>
> >>>>> Do you mind explaining why __split_unmapped_folio() is needed for a free device
> >>>>> page? A free page is not supposed to be a large folio, at least from a core
> >>>>> MM point of view. __split_unmapped_folio() is intended to work on large folios
> >>>>> (or compound pages), even if the input folio has refcount == 0 (because it is
> >>>>> frozen).
> >>>>>
> >>>>
> >>>> Well, then maybe this is a bug in core MM where the freed page is still
> >>>> a THP. Let me explain the scenario and why this is needed from my POV.
> >>>>
> >>>> Our VRAM allocator in Xe (and several other DRM drivers) is DRM buddy.
> >>>> This is a shared pool between traditional DRM GEMs (buffer objects) and
> >>>> SVM allocations (pages). It doesn’t have any view of the page backing—it
> >>>> basically just hands back a pointer to VRAM space that we allocate from.
> >>>> From that, if it’s an SVM allocation, we can derive the device pages.
> >>>>
> >>>> What I see happening is: a 2M buddy allocation occurs, we make the
> >>>> backing device pages a large folio, and sometime later the folio
> >>>> refcount goes to zero and we free the buddy allocation. Later, the buddy
> >>>> allocation is reused for a smaller allocation (e.g., 4K or 64K), but the
> >>>> backing pages are still a large folio. Here is where we need to split
> >>>
> >>> I agree with you that it might be a bug in free_zone_device_folio() based
> >>> on my understanding. Since zone_device_page_init() calls prep_compound_page()
> >>> for >0 orders, but free_zone_device_folio() never reverse the process.
> >>>
> >>> Balbir and Alistair might be able to help here.

Just catching up after the Christmas break.

> >>
> >> I agree it's an API limitation
> 
> I am not sure. If free_zone_device_folio() does not get rid of large folio
> metadata, there is no guarantee that a freed large device private folio will
> be reallocated as a large device private folio. And when mTHP support is
> added, the folio order might change too. That can cause issues when
> compound_head() is called on a tail page of a previously large folio, since
> compound_head() will return the old head page instead of the tail page itself.

I agree that freeing the device folio should get rid of the large folio. That
would also keep it consistent with what we do for FS DAX for example.

> >>
> >>>
> >>> I cherry picked the code from __free_frozen_pages() to reverse the process.
> >>> Can you give it a try to see if it solve the above issue? Thanks.

It would be nice if this could be a common helper for freeing compound
ZONE_DEVICE pages. FS DAX already has this for example:

static inline unsigned long dax_folio_put(struct folio *folio)
{
	unsigned long ref;
	int order, i;

	if (!dax_folio_is_shared(folio))
		ref = 0;
	else
		ref = --folio->share;

	if (ref)
		return ref;

	folio->mapping = NULL;
	order = folio_order(folio);
	if (!order)
		return 0;
	folio_reset_order(folio);

	for (i = 0; i < (1UL << order); i++) {
		struct dev_pagemap *pgmap = page_pgmap(&folio->page);
		struct page *page = folio_page(folio, i);
		struct folio *new_folio = (struct folio *)page;

		ClearPageHead(page);
		clear_compound_head(page);

		new_folio->mapping = NULL;
		/*
		 * Reset pgmap which was over-written by
		 * prep_compound_page().
		 */
		new_folio->pgmap = pgmap;
		new_folio->share = 0;
		WARN_ON_ONCE(folio_ref_count(new_folio));
	}

	return ref;
}

Aside from the weird refcount checks that FS DAX needs to at the start of this
function I don't think there is anything specific to DEVICE_PRIVATE pages there.

> >>>
> >>> From 3aa03baa39b7e62ea079e826de6ed5aab3061e46 Mon Sep 17 00:00:00 2001
> >>> From: Zi Yan <ziy@nvidia.com>
> >>> Date: Wed, 7 Jan 2026 16:49:52 -0500
> >>> Subject: [PATCH] mm/memremap: free device private folio fix
> >>> Content-Type: text/plain; charset="utf-8"
> >>>
> >>> Signed-off-by: Zi Yan <ziy@nvidia.com>
> >>> ---
> >>>  mm/memremap.c | 15 +++++++++++++++
> >>>  1 file changed, 15 insertions(+)
> >>>
> >>> diff --git a/mm/memremap.c b/mm/memremap.c
> >>> index 63c6ab4fdf08..483666ff7271 100644
> >>> --- a/mm/memremap.c
> >>> +++ b/mm/memremap.c
> >>> @@ -475,6 +475,21 @@ void free_zone_device_folio(struct folio *folio)
> >>>  		pgmap->ops->folio_free(folio);
> >>>  		break;
> >>>  	}
> >>> +
> >>> +	if (nr > 1) {
> >>> +		struct page *head = folio_page(folio, 0);
> >>> +
> >>> +		head[1].flags.f &= ~PAGE_FLAGS_SECOND;
> >>> +#ifdef NR_PAGES_IN_LARGE_FOLIO
> >>> +		folio->_nr_pages = 0;
> >>> +#endif
> >>> +		for (i = 1; i < nr; i++) {
> >>> +			(head + i)->mapping = NULL;
> >>> +			clear_compound_head(head + i);
> >>
> >> I see that your skipping the checks in free_page_tail_prepare()? IIUC, we should be able
> >> to invoke it even for zone device private pages
> 
> I am not sure about what part of compound page is also used in device private folio.
> Yes, it is better to add right checks.
> 
> >>
> >>> +		}
> >>> +		folio->mapping = NULL;
> >>
> >> This is already done in free_zone_device_folio()
> >>
> >>> +		head->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
> >>
> >> I don't think this is required for zone device private folios, but I suppose it
> >> keeps the code generic
> >>
> >
> > Well, the above code doesn’t work, but I think it’s the right idea.
> > clear_compound_head aliases to pgmap, which we don’t want to be NULL. I
> 
> Thank you for pointing it out. I am not familiar with device private page code.
> 
> > believe the individual pages likely need their flags cleared (?), and
> 
> Yes, I missed the tail page flag clearing part.
> 
> > this step must be done before calling folio_free and include a barrier,
> > as the page can be immediately reallocated.
> >
> > So here’s what I came up with, and it seems to work (for Xe, GPU SVM):
> >
> >  mm/memremap.c | 21 +++++++++++++++++++++
> >  1 file changed, 21 insertions(+)
> >
> > diff --git a/mm/memremap.c b/mm/memremap.c
> > index 63c6ab4fdf08..ac20abb6a441 100644
> > --- a/mm/memremap.c
> > +++ b/mm/memremap.c
> > @@ -448,6 +448,27 @@ void free_zone_device_folio(struct folio *folio)
> >             pgmap->type != MEMORY_DEVICE_GENERIC)
> >                 folio->mapping = NULL;
> >
> > +       if (nr > 1) {
> > +               struct page *head = folio_page(folio, 0);
> > +
> > +               head[1].flags.f &= ~PAGE_FLAGS_SECOND;
> > +#ifdef NR_PAGES_IN_LARGE_FOLIO
> > +               folio->_nr_pages = 0;
> > +#endif
> > +               for (i = 1; i < nr; i++) {
> > +                       struct folio *new_folio = (struct folio *)(head + i);
> > +
> > +                       (head + i)->mapping = NULL;
> > +                       (head + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
> > +
> > +                       /* Overwrite compound_head with pgmap */
> > +                       new_folio->pgmap = pgmap;
> > +               }
> > +
> > +               head->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
> > +               smp_wmb();	/* Changes but be visable before freeing folio */
> > +       }
> > +
> >         switch (pgmap->type) {
> >         case MEMORY_DEVICE_PRIVATE:
> >         case MEMORY_DEVICE_COHERENT:
> >
> 
> It looks good to me, but I am very likely missing the detail on device private
> pages. Like Balbir pointed out above, for tail pages, calling
> free_tail_page_prepare() might be better to get sanity checks like normal
> large folio, although you will need to set ->pgmap after it.
> 
> It is better to send it as a proper patch and get reviews from other
> MM folks.
> 
> >>> +	}
> >>>  }
> >>>
> >>>  void zone_device_page_init(struct page *page, unsigned int order)
> >>
> >>
> >> Otherwise, it seems like the right way to solve the issue.
> >>
> >
> > My question is: why isn’t Nouveau hitting this issue, or your Nvidia
> > out-of-tree driver (lack of testing, Xe's test suite coverage is quite
> > good at finding corners).
> >
> > Also, will this change in behavior break either ofthose drivers?
> >
> > Matt
> >
> >> Balbir
> 
> 
> Best Regards,
> Yan, Zi


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2026-01-08  3:14                 ` Alistair Popple
@ 2026-01-08  3:42                   ` Matthew Brost
  2026-01-08  4:47                     ` Balbir Singh
  0 siblings, 1 reply; 15+ messages in thread
From: Matthew Brost @ 2026-01-08  3:42 UTC (permalink / raw)
  To: Alistair Popple
  Cc: Zi Yan, Balbir Singh, Francois Dugast, intel-xe, dri-devel,
	Andrew Morton, linux-mm, David Hildenbrand, Lorenzo Stoakes,
	Baolin Wang, Liam R. Howlett, Nico Pache, Ryan Roberts, Dev Jain,
	Barry Song, Lance Yang, Matthew Wilcox

On Thu, Jan 08, 2026 at 02:14:28PM +1100, Alistair Popple wrote:
> On 2026-01-08 at 13:53 +1100, Zi Yan <ziy@nvidia.com> wrote...
> > On 7 Jan 2026, at 21:17, Matthew Brost wrote:
> > 
> > > On Thu, Jan 08, 2026 at 11:56:03AM +1100, Balbir Singh wrote:
> > >> On 1/8/26 08:03, Zi Yan wrote:
> > >>> On 7 Jan 2026, at 16:15, Matthew Brost wrote:
> > >>>
> > >>>> On Wed, Jan 07, 2026 at 03:38:35PM -0500, Zi Yan wrote:
> > >>>>> On 7 Jan 2026, at 15:20, Zi Yan wrote:
> > >>>>>
> > >>>>>> +THP folks
> > >>>>>
> > >>>>> +willy, since he commented in another thread.
> > >>>>>
> > >>>>>>
> > >>>>>> On 16 Dec 2025, at 15:10, Francois Dugast wrote:
> > >>>>>>
> > >>>>>>> From: Matthew Brost <matthew.brost@intel.com>
> > >>>>>>>
> > >>>>>>> Introduce migrate_device_split_page() to split a device page into
> > >>>>>>> lower-order pages. Used when a folio allocated as higher-order is freed
> > >>>>>>> and later reallocated at a smaller order by the driver memory manager.
> > >>>>>>>
> > >>>>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
> > >>>>>>> Cc: Balbir Singh <balbirs@nvidia.com>
> > >>>>>>> Cc: dri-devel@lists.freedesktop.org
> > >>>>>>> Cc: linux-mm@kvack.org
> > >>>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > >>>>>>> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
> > >>>>>>> ---
> > >>>>>>>  include/linux/huge_mm.h |  3 +++
> > >>>>>>>  include/linux/migrate.h |  1 +
> > >>>>>>>  mm/huge_memory.c        |  6 ++---
> > >>>>>>>  mm/migrate_device.c     | 49 +++++++++++++++++++++++++++++++++++++++++
> > >>>>>>>  4 files changed, 56 insertions(+), 3 deletions(-)
> > >>>>>>>
> > >>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > >>>>>>> index a4d9f964dfde..6ad8f359bc0d 100644
> > >>>>>>> --- a/include/linux/huge_mm.h
> > >>>>>>> +++ b/include/linux/huge_mm.h
> > >>>>>>> @@ -374,6 +374,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
> > >>>>>>>  int folio_split_unmapped(struct folio *folio, unsigned int new_order);
> > >>>>>>>  unsigned int min_order_for_split(struct folio *folio);
> > >>>>>>>  int split_folio_to_list(struct folio *folio, struct list_head *list);
> > >>>>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
> > >>>>>>> +			   struct page *split_at, struct xa_state *xas,
> > >>>>>>> +			   struct address_space *mapping, enum split_type split_type);
> > >>>>>>>  int folio_check_splittable(struct folio *folio, unsigned int new_order,
> > >>>>>>>  			   enum split_type split_type);
> > >>>>>>>  int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
> > >>>>>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> > >>>>>>> index 26ca00c325d9..ec65e4fd5f88 100644
> > >>>>>>> --- a/include/linux/migrate.h
> > >>>>>>> +++ b/include/linux/migrate.h
> > >>>>>>> @@ -192,6 +192,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
> > >>>>>>>  			unsigned long npages);
> > >>>>>>>  void migrate_device_finalize(unsigned long *src_pfns,
> > >>>>>>>  			unsigned long *dst_pfns, unsigned long npages);
> > >>>>>>> +int migrate_device_split_page(struct page *page);
> > >>>>>>>
> > >>>>>>>  #endif /* CONFIG_MIGRATION */
> > >>>>>>>
> > >>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > >>>>>>> index 40cf59301c21..7ded35a3ecec 100644
> > >>>>>>> --- a/mm/huge_memory.c
> > >>>>>>> +++ b/mm/huge_memory.c
> > >>>>>>> @@ -3621,9 +3621,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
> > >>>>>>>   * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
> > >>>>>>>   * split but not to @new_order, the caller needs to check)
> > >>>>>>>   */
> > >>>>>>> -static int __split_unmapped_folio(struct folio *folio, int new_order,
> > >>>>>>> -		struct page *split_at, struct xa_state *xas,
> > >>>>>>> -		struct address_space *mapping, enum split_type split_type)
> > >>>>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
> > >>>>>>> +			   struct page *split_at, struct xa_state *xas,
> > >>>>>>> +			   struct address_space *mapping, enum split_type split_type)
> > >>>>>>>  {
> > >>>>>>>  	const bool is_anon = folio_test_anon(folio);
> > >>>>>>>  	int old_order = folio_order(folio);
> > >>>>>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> > >>>>>>> index 23379663b1e1..eb0f0e938947 100644
> > >>>>>>> --- a/mm/migrate_device.c
> > >>>>>>> +++ b/mm/migrate_device.c
> > >>>>>>> @@ -775,6 +775,49 @@ int migrate_vma_setup(struct migrate_vma *args)
> > >>>>>>>  EXPORT_SYMBOL(migrate_vma_setup);
> > >>>>>>>
> > >>>>>>>  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> > >>>>>>> +/**
> > >>>>>>> + * migrate_device_split_page() - Split device page
> > >>>>>>> + * @page: Device page to split
> > >>>>>>> + *
> > >>>>>>> + * Splits a device page into smaller pages. Typically called when reallocating a
> > >>>>>>> + * folio to a smaller size. Inherently racy—only safe if the caller ensures
> > >>>>>>> + * mutual exclusion within the page's folio (i.e., no other threads are using
> > >>>>>>> + * pages within the folio). Expected to be called a free device page and
> > >>>>>>> + * restores all split out pages to a free state.
> > >>>>>>> + */
> > >>>>>
> > >>>>> Do you mind explaining why __split_unmapped_folio() is needed for a free device
> > >>>>> page? A free page is not supposed to be a large folio, at least from a core
> > >>>>> MM point of view. __split_unmapped_folio() is intended to work on large folios
> > >>>>> (or compound pages), even if the input folio has refcount == 0 (because it is
> > >>>>> frozen).
> > >>>>>
> > >>>>
> > >>>> Well, then maybe this is a bug in core MM where the freed page is still
> > >>>> a THP. Let me explain the scenario and why this is needed from my POV.
> > >>>>
> > >>>> Our VRAM allocator in Xe (and several other DRM drivers) is DRM buddy.
> > >>>> This is a shared pool between traditional DRM GEMs (buffer objects) and
> > >>>> SVM allocations (pages). It doesn’t have any view of the page backing—it
> > >>>> basically just hands back a pointer to VRAM space that we allocate from.
> > >>>> From that, if it’s an SVM allocation, we can derive the device pages.
> > >>>>
> > >>>> What I see happening is: a 2M buddy allocation occurs, we make the
> > >>>> backing device pages a large folio, and sometime later the folio
> > >>>> refcount goes to zero and we free the buddy allocation. Later, the buddy
> > >>>> allocation is reused for a smaller allocation (e.g., 4K or 64K), but the
> > >>>> backing pages are still a large folio. Here is where we need to split
> > >>>
> > >>> I agree with you that it might be a bug in free_zone_device_folio() based
> > >>> on my understanding. Since zone_device_page_init() calls prep_compound_page()
> > >>> for >0 orders, but free_zone_device_folio() never reverse the process.
> > >>>
> > >>> Balbir and Alistair might be able to help here.
> 
> Just catching up after the Christmas break.
> 

I think everyone is and scrambling for the release PR. :)

> > >>
> > >> I agree it's an API limitation
> > 
> > I am not sure. If free_zone_device_folio() does not get rid of large folio
> > metadata, there is no guarantee that a freed large device private folio will
> > be reallocated as a large device private folio. And when mTHP support is
> > added, the folio order might change too. That can cause issues when
> > compound_head() is called on a tail page of a previously large folio, since
> > compound_head() will return the old head page instead of the tail page itself.
> 
> I agree that freeing the device folio should get rid of the large folio. That
> would also keep it consistent with what we do for FS DAX for example.
> 

+1

> > >>
> > >>>
> > >>> I cherry picked the code from __free_frozen_pages() to reverse the process.
> > >>> Can you give it a try to see if it solve the above issue? Thanks.
> 
> It would be nice if this could be a common helper for freeing compound
> ZONE_DEVICE pages. FS DAX already has this for example:
> 
> static inline unsigned long dax_folio_put(struct folio *folio)
> {
> 	unsigned long ref;
> 	int order, i;
> 
> 	if (!dax_folio_is_shared(folio))
> 		ref = 0;
> 	else
> 		ref = --folio->share;
> 
> 	if (ref)
> 		return ref;
> 
> 	folio->mapping = NULL;
> 	order = folio_order(folio);
> 	if (!order)
> 		return 0;
> 	folio_reset_order(folio);
> 
> 	for (i = 0; i < (1UL << order); i++) {
> 		struct dev_pagemap *pgmap = page_pgmap(&folio->page);
> 		struct page *page = folio_page(folio, i);
> 		struct folio *new_folio = (struct folio *)page;
> 
> 		ClearPageHead(page);
> 		clear_compound_head(page);
> 
> 		new_folio->mapping = NULL;
> 		/*
> 		 * Reset pgmap which was over-written by
> 		 * prep_compound_page().
> 		 */
> 		new_folio->pgmap = pgmap;
> 		new_folio->share = 0;
> 		WARN_ON_ONCE(folio_ref_count(new_folio));
> 	}
> 
> 	return ref;
> }
> 
> Aside from the weird refcount checks that FS DAX needs to at the start of this
> function I don't think there is anything specific to DEVICE_PRIVATE pages there.
> 

Thanks for the reference, Alistair. This looks roughly like what I
hacked together in an effort to just get something working. I believe a
common helper can be made to work. Let me churn on this tomorrow and put
together a proper patch.

> > >>>
> > >>> From 3aa03baa39b7e62ea079e826de6ed5aab3061e46 Mon Sep 17 00:00:00 2001
> > >>> From: Zi Yan <ziy@nvidia.com>
> > >>> Date: Wed, 7 Jan 2026 16:49:52 -0500
> > >>> Subject: [PATCH] mm/memremap: free device private folio fix
> > >>> Content-Type: text/plain; charset="utf-8"
> > >>>
> > >>> Signed-off-by: Zi Yan <ziy@nvidia.com>
> > >>> ---
> > >>>  mm/memremap.c | 15 +++++++++++++++
> > >>>  1 file changed, 15 insertions(+)
> > >>>
> > >>> diff --git a/mm/memremap.c b/mm/memremap.c
> > >>> index 63c6ab4fdf08..483666ff7271 100644
> > >>> --- a/mm/memremap.c
> > >>> +++ b/mm/memremap.c
> > >>> @@ -475,6 +475,21 @@ void free_zone_device_folio(struct folio *folio)
> > >>>  		pgmap->ops->folio_free(folio);
> > >>>  		break;
> > >>>  	}
> > >>> +
> > >>> +	if (nr > 1) {
> > >>> +		struct page *head = folio_page(folio, 0);
> > >>> +
> > >>> +		head[1].flags.f &= ~PAGE_FLAGS_SECOND;
> > >>> +#ifdef NR_PAGES_IN_LARGE_FOLIO
> > >>> +		folio->_nr_pages = 0;
> > >>> +#endif
> > >>> +		for (i = 1; i < nr; i++) {
> > >>> +			(head + i)->mapping = NULL;
> > >>> +			clear_compound_head(head + i);
> > >>
> > >> I see that your skipping the checks in free_page_tail_prepare()? IIUC, we should be able
> > >> to invoke it even for zone device private pages
> > 
> > I am not sure about what part of compound page is also used in device private folio.
> > Yes, it is better to add right checks.
> > 
> > >>
> > >>> +		}
> > >>> +		folio->mapping = NULL;
> > >>
> > >> This is already done in free_zone_device_folio()
> > >>
> > >>> +		head->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
> > >>
> > >> I don't think this is required for zone device private folios, but I suppose it
> > >> keeps the code generic
> > >>
> > >
> > > Well, the above code doesn’t work, but I think it’s the right idea.
> > > clear_compound_head aliases to pgmap, which we don’t want to be NULL. I
> > 
> > Thank you for pointing it out. I am not familiar with device private page code.
> > 
> > > believe the individual pages likely need their flags cleared (?), and
> > 
> > Yes, I missed the tail page flag clearing part.
> > 

I think the page head is the only thing that really needs to be cleared,
though I could be wrong.

> > > this step must be done before calling folio_free and include a barrier,
> > > as the page can be immediately reallocated.
> > >
> > > So here’s what I came up with, and it seems to work (for Xe, GPU SVM):
> > >
> > >  mm/memremap.c | 21 +++++++++++++++++++++
> > >  1 file changed, 21 insertions(+)
> > >
> > > diff --git a/mm/memremap.c b/mm/memremap.c
> > > index 63c6ab4fdf08..ac20abb6a441 100644
> > > --- a/mm/memremap.c
> > > +++ b/mm/memremap.c
> > > @@ -448,6 +448,27 @@ void free_zone_device_folio(struct folio *folio)
> > >             pgmap->type != MEMORY_DEVICE_GENERIC)
> > >                 folio->mapping = NULL;
> > >
> > > +       if (nr > 1) {
> > > +               struct page *head = folio_page(folio, 0);
> > > +
> > > +               head[1].flags.f &= ~PAGE_FLAGS_SECOND;
> > > +#ifdef NR_PAGES_IN_LARGE_FOLIO
> > > +               folio->_nr_pages = 0;
> > > +#endif
> > > +               for (i = 1; i < nr; i++) {
> > > +                       struct folio *new_folio = (struct folio *)(head + i);
> > > +
> > > +                       (head + i)->mapping = NULL;
> > > +                       (head + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
> > > +
> > > +                       /* Overwrite compound_head with pgmap */
> > > +                       new_folio->pgmap = pgmap;
> > > +               }
> > > +
> > > +               head->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
> > > +               smp_wmb();	/* Changes but be visable before freeing folio */
> > > +       }
> > > +
> > >         switch (pgmap->type) {
> > >         case MEMORY_DEVICE_PRIVATE:
> > >         case MEMORY_DEVICE_COHERENT:
> > >
> > 
> > It looks good to me, but I am very likely missing the detail on device private
> > pages. Like Balbir pointed out above, for tail pages, calling
> > free_tail_page_prepare() might be better to get sanity checks like normal
> > large folio, although you will need to set ->pgmap after it.
> > 
> > It is better to send it as a proper patch and get reviews from other
> > MM folks.
> > 

Yes, agreed. See above—I’ll work on a proper patch tomorrow and CC all
the correct MM folks. Aiming to have something ready for the next
release PR.

Matt

> > >>> +	}
> > >>>  }
> > >>>
> > >>>  void zone_device_page_init(struct page *page, unsigned int order)
> > >>
> > >>
> > >> Otherwise, it seems like the right way to solve the issue.
> > >>
> > >
> > > My question is: why isn’t Nouveau hitting this issue, or your Nvidia
> > > out-of-tree driver (lack of testing, Xe's test suite coverage is quite
> > > good at finding corners).
> > >
> > > Also, will this change in behavior break either ofthose drivers?
> > >
> > > Matt
> > >
> > >> Balbir
> > 
> > 
> > Best Regards,
> > Yan, Zi


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] mm/migrate: Add migrate_device_split_page
  2026-01-08  3:42                   ` Matthew Brost
@ 2026-01-08  4:47                     ` Balbir Singh
  0 siblings, 0 replies; 15+ messages in thread
From: Balbir Singh @ 2026-01-08  4:47 UTC (permalink / raw)
  To: Matthew Brost, Alistair Popple
  Cc: Zi Yan, Francois Dugast, intel-xe, dri-devel, Andrew Morton,
	linux-mm, David Hildenbrand, Lorenzo Stoakes, Baolin Wang,
	Liam R. Howlett, Nico Pache, Ryan Roberts, Dev Jain, Barry Song,
	Lance Yang, Matthew Wilcox

On 1/8/26 13:42, Matthew Brost wrote:
> On Thu, Jan 08, 2026 at 02:14:28PM +1100, Alistair Popple wrote:
>> On 2026-01-08 at 13:53 +1100, Zi Yan <ziy@nvidia.com> wrote...
>>> On 7 Jan 2026, at 21:17, Matthew Brost wrote:
>>>
>>>> On Thu, Jan 08, 2026 at 11:56:03AM +1100, Balbir Singh wrote:
>>>>> On 1/8/26 08:03, Zi Yan wrote:
>>>>>> On 7 Jan 2026, at 16:15, Matthew Brost wrote:
>>>>>>
>>>>>>> On Wed, Jan 07, 2026 at 03:38:35PM -0500, Zi Yan wrote:
>>>>>>>> On 7 Jan 2026, at 15:20, Zi Yan wrote:
>>>>>>>>
>>>>>>>>> +THP folks
>>>>>>>>
>>>>>>>> +willy, since he commented in another thread.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 16 Dec 2025, at 15:10, Francois Dugast wrote:
>>>>>>>>>
>>>>>>>>>> From: Matthew Brost <matthew.brost@intel.com>
>>>>>>>>>>
>>>>>>>>>> Introduce migrate_device_split_page() to split a device page into
>>>>>>>>>> lower-order pages. Used when a folio allocated as higher-order is freed
>>>>>>>>>> and later reallocated at a smaller order by the driver memory manager.
>>>>>>>>>>
>>>>>>>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>>>>>>>> Cc: Balbir Singh <balbirs@nvidia.com>
>>>>>>>>>> Cc: dri-devel@lists.freedesktop.org
>>>>>>>>>> Cc: linux-mm@kvack.org
>>>>>>>>>> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
>>>>>>>>>> Signed-off-by: Francois Dugast <francois.dugast@intel.com>
>>>>>>>>>> ---
>>>>>>>>>>  include/linux/huge_mm.h |  3 +++
>>>>>>>>>>  include/linux/migrate.h |  1 +
>>>>>>>>>>  mm/huge_memory.c        |  6 ++---
>>>>>>>>>>  mm/migrate_device.c     | 49 +++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>>  4 files changed, 56 insertions(+), 3 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>>>>>>> index a4d9f964dfde..6ad8f359bc0d 100644
>>>>>>>>>> --- a/include/linux/huge_mm.h
>>>>>>>>>> +++ b/include/linux/huge_mm.h
>>>>>>>>>> @@ -374,6 +374,9 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list
>>>>>>>>>>  int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>>>>>>>>>>  unsigned int min_order_for_split(struct folio *folio);
>>>>>>>>>>  int split_folio_to_list(struct folio *folio, struct list_head *list);
>>>>>>>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
>>>>>>>>>> +			   struct page *split_at, struct xa_state *xas,
>>>>>>>>>> +			   struct address_space *mapping, enum split_type split_type);
>>>>>>>>>>  int folio_check_splittable(struct folio *folio, unsigned int new_order,
>>>>>>>>>>  			   enum split_type split_type);
>>>>>>>>>>  int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
>>>>>>>>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>>>>>>>>>> index 26ca00c325d9..ec65e4fd5f88 100644
>>>>>>>>>> --- a/include/linux/migrate.h
>>>>>>>>>> +++ b/include/linux/migrate.h
>>>>>>>>>> @@ -192,6 +192,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
>>>>>>>>>>  			unsigned long npages);
>>>>>>>>>>  void migrate_device_finalize(unsigned long *src_pfns,
>>>>>>>>>>  			unsigned long *dst_pfns, unsigned long npages);
>>>>>>>>>> +int migrate_device_split_page(struct page *page);
>>>>>>>>>>
>>>>>>>>>>  #endif /* CONFIG_MIGRATION */
>>>>>>>>>>
>>>>>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>>>>>>> index 40cf59301c21..7ded35a3ecec 100644
>>>>>>>>>> --- a/mm/huge_memory.c
>>>>>>>>>> +++ b/mm/huge_memory.c
>>>>>>>>>> @@ -3621,9 +3621,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
>>>>>>>>>>   * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
>>>>>>>>>>   * split but not to @new_order, the caller needs to check)
>>>>>>>>>>   */
>>>>>>>>>> -static int __split_unmapped_folio(struct folio *folio, int new_order,
>>>>>>>>>> -		struct page *split_at, struct xa_state *xas,
>>>>>>>>>> -		struct address_space *mapping, enum split_type split_type)
>>>>>>>>>> +int __split_unmapped_folio(struct folio *folio, int new_order,
>>>>>>>>>> +			   struct page *split_at, struct xa_state *xas,
>>>>>>>>>> +			   struct address_space *mapping, enum split_type split_type)
>>>>>>>>>>  {
>>>>>>>>>>  	const bool is_anon = folio_test_anon(folio);
>>>>>>>>>>  	int old_order = folio_order(folio);
>>>>>>>>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>>>>>>>>>> index 23379663b1e1..eb0f0e938947 100644
>>>>>>>>>> --- a/mm/migrate_device.c
>>>>>>>>>> +++ b/mm/migrate_device.c
>>>>>>>>>> @@ -775,6 +775,49 @@ int migrate_vma_setup(struct migrate_vma *args)
>>>>>>>>>>  EXPORT_SYMBOL(migrate_vma_setup);
>>>>>>>>>>
>>>>>>>>>>  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
>>>>>>>>>> +/**
>>>>>>>>>> + * migrate_device_split_page() - Split device page
>>>>>>>>>> + * @page: Device page to split
>>>>>>>>>> + *
>>>>>>>>>> + * Splits a device page into smaller pages. Typically called when reallocating a
>>>>>>>>>> + * folio to a smaller size. Inherently racy—only safe if the caller ensures
>>>>>>>>>> + * mutual exclusion within the page's folio (i.e., no other threads are using
>>>>>>>>>> + * pages within the folio). Expected to be called a free device page and
>>>>>>>>>> + * restores all split out pages to a free state.
>>>>>>>>>> + */
>>>>>>>>
>>>>>>>> Do you mind explaining why __split_unmapped_folio() is needed for a free device
>>>>>>>> page? A free page is not supposed to be a large folio, at least from a core
>>>>>>>> MM point of view. __split_unmapped_folio() is intended to work on large folios
>>>>>>>> (or compound pages), even if the input folio has refcount == 0 (because it is
>>>>>>>> frozen).
>>>>>>>>
>>>>>>>
>>>>>>> Well, then maybe this is a bug in core MM where the freed page is still
>>>>>>> a THP. Let me explain the scenario and why this is needed from my POV.
>>>>>>>
>>>>>>> Our VRAM allocator in Xe (and several other DRM drivers) is DRM buddy.
>>>>>>> This is a shared pool between traditional DRM GEMs (buffer objects) and
>>>>>>> SVM allocations (pages). It doesn’t have any view of the page backing—it
>>>>>>> basically just hands back a pointer to VRAM space that we allocate from.
>>>>>>> From that, if it’s an SVM allocation, we can derive the device pages.
>>>>>>>
>>>>>>> What I see happening is: a 2M buddy allocation occurs, we make the
>>>>>>> backing device pages a large folio, and sometime later the folio
>>>>>>> refcount goes to zero and we free the buddy allocation. Later, the buddy
>>>>>>> allocation is reused for a smaller allocation (e.g., 4K or 64K), but the
>>>>>>> backing pages are still a large folio. Here is where we need to split
>>>>>>
>>>>>> I agree with you that it might be a bug in free_zone_device_folio() based
>>>>>> on my understanding. Since zone_device_page_init() calls prep_compound_page()
>>>>>> for >0 orders, but free_zone_device_folio() never reverse the process.
>>>>>>
>>>>>> Balbir and Alistair might be able to help here.
>>
>> Just catching up after the Christmas break.
>>
> 
> I think everyone is and scrambling for the release PR. :)
> 
>>>>>
>>>>> I agree it's an API limitation
>>>
>>> I am not sure. If free_zone_device_folio() does not get rid of large folio
>>> metadata, there is no guarantee that a freed large device private folio will
>>> be reallocated as a large device private folio. And when mTHP support is
>>> added, the folio order might change too. That can cause issues when
>>> compound_head() is called on a tail page of a previously large folio, since
>>> compound_head() will return the old head page instead of the tail page itself.
>>
>> I agree that freeing the device folio should get rid of the large folio. That
>> would also keep it consistent with what we do for FS DAX for example.
>>
> 
> +1
> 
>>>>>
>>>>>>
>>>>>> I cherry picked the code from __free_frozen_pages() to reverse the process.
>>>>>> Can you give it a try to see if it solve the above issue? Thanks.
>>
>> It would be nice if this could be a common helper for freeing compound
>> ZONE_DEVICE pages. FS DAX already has this for example:
>>
>> static inline unsigned long dax_folio_put(struct folio *folio)
>> {
>> 	unsigned long ref;
>> 	int order, i;
>>
>> 	if (!dax_folio_is_shared(folio))
>> 		ref = 0;
>> 	else
>> 		ref = --folio->share;
>>
>> 	if (ref)
>> 		return ref;
>>
>> 	folio->mapping = NULL;
>> 	order = folio_order(folio);
>> 	if (!order)
>> 		return 0;
>> 	folio_reset_order(folio);
>>
>> 	for (i = 0; i < (1UL << order); i++) {
>> 		struct dev_pagemap *pgmap = page_pgmap(&folio->page);
>> 		struct page *page = folio_page(folio, i);
>> 		struct folio *new_folio = (struct folio *)page;
>>
>> 		ClearPageHead(page);
>> 		clear_compound_head(page);
>>
>> 		new_folio->mapping = NULL;
>> 		/*
>> 		 * Reset pgmap which was over-written by
>> 		 * prep_compound_page().
>> 		 */
>> 		new_folio->pgmap = pgmap;
>> 		new_folio->share = 0;
>> 		WARN_ON_ONCE(folio_ref_count(new_folio));
>> 	}
>>
>> 	return ref;
>> }
>>
>> Aside from the weird refcount checks that FS DAX needs to at the start of this
>> function I don't think there is anything specific to DEVICE_PRIVATE pages there.
>>
> 
> Thanks for the reference, Alistair. This looks roughly like what I
> hacked together in an effort to just get something working. I believe a
> common helper can be made to work. Let me churn on this tomorrow and put
> together a proper patch.
> 
>>>>>>
>>>>>> From 3aa03baa39b7e62ea079e826de6ed5aab3061e46 Mon Sep 17 00:00:00 2001
>>>>>> From: Zi Yan <ziy@nvidia.com>
>>>>>> Date: Wed, 7 Jan 2026 16:49:52 -0500
>>>>>> Subject: [PATCH] mm/memremap: free device private folio fix
>>>>>> Content-Type: text/plain; charset="utf-8"
>>>>>>
>>>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>>>>> ---
>>>>>>  mm/memremap.c | 15 +++++++++++++++
>>>>>>  1 file changed, 15 insertions(+)
>>>>>>
>>>>>> diff --git a/mm/memremap.c b/mm/memremap.c
>>>>>> index 63c6ab4fdf08..483666ff7271 100644
>>>>>> --- a/mm/memremap.c
>>>>>> +++ b/mm/memremap.c
>>>>>> @@ -475,6 +475,21 @@ void free_zone_device_folio(struct folio *folio)
>>>>>>  		pgmap->ops->folio_free(folio);
>>>>>>  		break;
>>>>>>  	}
>>>>>> +
>>>>>> +	if (nr > 1) {
>>>>>> +		struct page *head = folio_page(folio, 0);
>>>>>> +
>>>>>> +		head[1].flags.f &= ~PAGE_FLAGS_SECOND;
>>>>>> +#ifdef NR_PAGES_IN_LARGE_FOLIO
>>>>>> +		folio->_nr_pages = 0;
>>>>>> +#endif
>>>>>> +		for (i = 1; i < nr; i++) {
>>>>>> +			(head + i)->mapping = NULL;
>>>>>> +			clear_compound_head(head + i);
>>>>>
>>>>> I see that your skipping the checks in free_page_tail_prepare()? IIUC, we should be able
>>>>> to invoke it even for zone device private pages
>>>
>>> I am not sure about what part of compound page is also used in device private folio.
>>> Yes, it is better to add right checks.
>>>
>>>>>
>>>>>> +		}
>>>>>> +		folio->mapping = NULL;
>>>>>
>>>>> This is already done in free_zone_device_folio()
>>>>>
>>>>>> +		head->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
>>>>>
>>>>> I don't think this is required for zone device private folios, but I suppose it
>>>>> keeps the code generic
>>>>>
>>>>
>>>> Well, the above code doesn’t work, but I think it’s the right idea.
>>>> clear_compound_head aliases to pgmap, which we don’t want to be NULL. I
>>>
>>> Thank you for pointing it out. I am not familiar with device private page code.
>>>
>>>> believe the individual pages likely need their flags cleared (?), and
>>>
>>> Yes, I missed the tail page flag clearing part.
>>>
> 
> I think the page head is the only thing that really needs to be cleared,
> though I could be wrong.
> 
>>>> this step must be done before calling folio_free and include a barrier,
>>>> as the page can be immediately reallocated.
>>>>
>>>> So here’s what I came up with, and it seems to work (for Xe, GPU SVM):
>>>>
>>>>  mm/memremap.c | 21 +++++++++++++++++++++
>>>>  1 file changed, 21 insertions(+)
>>>>
>>>> diff --git a/mm/memremap.c b/mm/memremap.c
>>>> index 63c6ab4fdf08..ac20abb6a441 100644
>>>> --- a/mm/memremap.c
>>>> +++ b/mm/memremap.c
>>>> @@ -448,6 +448,27 @@ void free_zone_device_folio(struct folio *folio)
>>>>             pgmap->type != MEMORY_DEVICE_GENERIC)
>>>>                 folio->mapping = NULL;
>>>>
>>>> +       if (nr > 1) {
>>>> +               struct page *head = folio_page(folio, 0);
>>>> +
>>>> +               head[1].flags.f &= ~PAGE_FLAGS_SECOND;
>>>> +#ifdef NR_PAGES_IN_LARGE_FOLIO
>>>> +               folio->_nr_pages = 0;
>>>> +#endif
>>>> +               for (i = 1; i < nr; i++) {
>>>> +                       struct folio *new_folio = (struct folio *)(head + i);
>>>> +
>>>> +                       (head + i)->mapping = NULL;
>>>> +                       (head + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
>>>> +
>>>> +                       /* Overwrite compound_head with pgmap */
>>>> +                       new_folio->pgmap = pgmap;
>>>> +               }
>>>> +
>>>> +               head->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
>>>> +               smp_wmb();	/* Changes but be visable before freeing folio */
>>>> +       }
>>>> +
>>>>         switch (pgmap->type) {
>>>>         case MEMORY_DEVICE_PRIVATE:
>>>>         case MEMORY_DEVICE_COHERENT:
>>>>
>>>
>>> It looks good to me, but I am very likely missing the detail on device private
>>> pages. Like Balbir pointed out above, for tail pages, calling
>>> free_tail_page_prepare() might be better to get sanity checks like normal
>>> large folio, although you will need to set ->pgmap after it.
>>>
>>> It is better to send it as a proper patch and get reviews from other
>>> MM folks.
>>>
> 
> Yes, agreed. See above—I’ll work on a proper patch tomorrow and CC all
> the correct MM folks. Aiming to have something ready for the next
> release PR.
> 

Yes, please!

Thanks,
Balbir



^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2026-01-08  4:47 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20251216201206.1660899-1-francois.dugast@intel.com>
2025-12-16 20:10 ` [PATCH 1/4] mm/migrate: Add migrate_device_split_page Francois Dugast
2025-12-16 20:34   ` Matthew Wilcox
2025-12-16 21:39     ` Matthew Brost
2026-01-06  2:39       ` Matthew Brost
2026-01-07 20:15         ` Zi Yan
2026-01-07 20:20   ` Zi Yan
2026-01-07 20:38     ` Zi Yan
2026-01-07 21:15       ` Matthew Brost
2026-01-07 22:03         ` Zi Yan
2026-01-08  0:56           ` Balbir Singh
2026-01-08  2:17             ` Matthew Brost
2026-01-08  2:53               ` Zi Yan
2026-01-08  3:14                 ` Alistair Popple
2026-01-08  3:42                   ` Matthew Brost
2026-01-08  4:47                     ` Balbir Singh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox