From: Raghavendra K T <raghavendra.kt@amd.com>
To: 李喆 <lizhe.67@bytedance.com>,
	muchun.song@linux.dev, osalvador@suse.de, david@kernel.org,
	akpm@linux-foundation.org, fvdl@google.com
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH 1/8] mm/hugetlb: add pre-zeroed framework
Date: Fri, 26 Dec 2025 14:54:17 +0530
Message-ID: <46bf07b6-633f-43b8-8e2b-b08d437494b9@amd.com>
In-Reply-To: <20251225082059.1632-2-lizhe.67@bytedance.com>

On 12/25/2025 1:50 PM, 李喆 wrote:
> From: Li Zhe <lizhe.67@bytedance.com>
> 
> This patch establishes a pre-zeroing framework by introducing two new
> hugetlb page flags and extends the code at every point where these flags
> may later be required. The roles of the two flags are as follows.
> 
> (1) HPG_zeroed – indicates that the huge folio has already been
>      zeroed
> (2) HPG_zeroing – marks that the huge folio is currently being zeroed
> 
> No functional change, as nothing sets the flags yet.
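
Just to restate my understanding for the archive: the consumer side, as
implemented in hugetlb_zero_folio() below, roughly boils down to

	if (!folio_test_hugetlb_zeroed(folio))
		folio_zero_user(folio, addr);

i.e. the explicit clear is skipped when the pre-zero thread has already
zeroed the folio, which matches the flag description above.
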
> 
> Co-developed-by: Frank van der Linden <fvdl@google.com>
> Signed-off-by: Frank van der Linden <fvdl@google.com>
> Signed-off-by: Li Zhe <lizhe.67@bytedance.com>
> ---
>   fs/hugetlbfs/inode.c    |   3 +-
>   include/linux/hugetlb.h |  26 +++++++++
>   mm/hugetlb.c            | 113 +++++++++++++++++++++++++++++++++++++---
>   3 files changed, 133 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 3b4c152c5c73..be6b32ab3ca8 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -828,8 +828,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
>   			error = PTR_ERR(folio);
>   			goto out;
>   		}
> -		folio_zero_user(folio, addr);
> -		__folio_mark_uptodate(folio);
> +		hugetlb_zero_folio(folio, addr);
>   		error = hugetlb_add_to_page_cache(folio, mapping, index);
>   		if (unlikely(error)) {
>   			restore_reserve_on_error(h, &pseudo_vma, addr, folio);
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 019a1c5281e4..2daf4422a17d 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -584,6 +584,17 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
>    * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
>    * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page
>    *     that is not tracked by raw_hwp_page list.
> + * HPG_zeroed - page was pre-zeroed.
> + *	Synchronization: hugetlb_lock held when set by pre-zero thread.
> + *	Only valid to read outside hugetlb_lock once the page is off
> + *	the freelist, and HPG_zeroing is clear. Always cleared when a
> + *	page is put (back) on the freelist.
> + * HPG_zeroing - page is being zeroed by the pre-zero thread.
> + *	Synchronization: set and cleared by the pre-zero thread with
> + *	hugetlb_lock held. Access by others is read-only. Once the page
> + *	is off the freelist, this can only change from set -> clear,
> + *	which the new page owner must wait for. Always cleared
> + *	when a page is put (back) on the freelist.
>    */
>   enum hugetlb_page_flags {
>   	HPG_restore_reserve = 0,
> @@ -593,6 +604,8 @@ enum hugetlb_page_flags {
>   	HPG_vmemmap_optimized,
>   	HPG_raw_hwp_unreliable,
>   	HPG_cma,
> +	HPG_zeroed,
> +	HPG_zeroing,
>   	__NR_HPAGEFLAGS,
>   };
>   
> @@ -653,6 +666,8 @@ HPAGEFLAG(Freed, freed)
>   HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
>   HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)
>   HPAGEFLAG(Cma, cma)
> +HPAGEFLAG(Zeroed, zeroed)
> +HPAGEFLAG(Zeroing, zeroing)
>   
>   #ifdef CONFIG_HUGETLB_PAGE
>   
> @@ -678,6 +693,12 @@ struct hstate {
>   	unsigned int nr_huge_pages_node[MAX_NUMNODES];
>   	unsigned int free_huge_pages_node[MAX_NUMNODES];
>   	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
> +
> +	unsigned int free_huge_pages_zero_node[MAX_NUMNODES];
> +
> +	/* Queue to wait for a hugetlb folio that is being prezeroed */
> +	wait_queue_head_t dqzero_wait[MAX_NUMNODES];
> +
>   	char name[HSTATE_NAME_LEN];
>   };
>   
> @@ -711,6 +732,7 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping
>   			pgoff_t idx);
>   void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
>   				unsigned long address, struct folio *folio);
> +void hugetlb_zero_folio(struct folio *folio, unsigned long address);
>   
>   /* arch callback */
>   int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
> @@ -1303,6 +1325,10 @@ static inline bool hugetlb_bootmem_allocated(void)
>   {
>   	return false;
>   }
> +
> +static inline void hugetlb_zero_folio(struct folio *folio, unsigned long address)
> +{
> +}
>   #endif	/* CONFIG_HUGETLB_PAGE */
>   
>   static inline spinlock_t *huge_pte_lock(struct hstate *h,
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 51273baec9e5..d20614b1c927 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -93,6 +93,8 @@ static int hugetlb_param_index __initdata;
>   static __init int hugetlb_add_param(char *s, int (*setup)(char *val));
>   static __init void hugetlb_parse_params(void);
>   
> +static void hpage_wait_zeroing(struct hstate *h, struct folio *folio);
> +
>   #define hugetlb_early_param(str, func) \
>   static __init int func##args(char *s) \
>   { \
> @@ -1292,21 +1294,33 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
>   	hugetlb_dup_vma_private(vma);
>   }
>   
> +/*
> + * Clear flags for either a fresh page or one that is being
> + * added to the free list.
> + */
> +static inline void prep_clear_zeroed(struct folio *folio)
> +{
> +	folio_clear_hugetlb_zeroed(folio);
> +	folio_clear_hugetlb_zeroing(folio);
> +}
> +
>   static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
>   {
>   	int nid = folio_nid(folio);
>   
>   	lockdep_assert_held(&hugetlb_lock);
>   	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
> +	VM_WARN_ON_FOLIO(folio_test_hugetlb_zeroing(folio), folio);
>   
>   	list_move(&folio->lru, &h->hugepage_freelists[nid]);
>   	h->free_huge_pages++;
>   	h->free_huge_pages_node[nid]++;
> +	prep_clear_zeroed(folio);
>   	folio_set_hugetlb_freed(folio);
>   }
>   
> -static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
> -								int nid)
> +static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, int nid,
> +		gfp_t gfp_mask)
>   {
>   	struct folio *folio;
>   	bool pin = !!(current->flags & PF_MEMALLOC_PIN);
> @@ -1316,6 +1330,16 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
>   		if (pin && !folio_is_longterm_pinnable(folio))
>   			continue;
>   
> +		/*
> +		 * This shouldn't happen, as hugetlb pages are never allocated
> +		 * with GFP_ATOMIC. But be paranoid and check for it, as
> +		 * a zero_busy page might cause a sleep later in
> +		 * hpage_wait_zeroing().
> +		 */
> +		if (WARN_ON_ONCE(folio_test_hugetlb_zeroing(folio) &&
> +					!gfpflags_allow_blocking(gfp_mask)))
> +			continue;
> +
>   		if (folio_test_hwpoison(folio))
>   			continue;
>   
> @@ -1327,6 +1351,10 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
>   		folio_clear_hugetlb_freed(folio);
>   		h->free_huge_pages--;
>   		h->free_huge_pages_node[nid]--;
> +		if (folio_test_hugetlb_zeroed(folio) ||
> +		    folio_test_hugetlb_zeroing(folio))
> +			h->free_huge_pages_zero_node[nid]--;
> +
>   		return folio;
>   	}
>   
> @@ -1363,7 +1391,7 @@ static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_
>   			continue;
>   		node = zone_to_nid(zone);
>   
> -		folio = dequeue_hugetlb_folio_node_exact(h, node);
> +		folio = dequeue_hugetlb_folio_node_exact(h, node, gfp_mask);
>   		if (folio)
>   			return folio;
>   	}
> @@ -1490,7 +1518,16 @@ void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
>   		folio_clear_hugetlb_freed(folio);
>   		h->free_huge_pages--;
>   		h->free_huge_pages_node[nid]--;
> +		folio_clear_hugetlb_freed(folio);
>   	}
> +	/*
> +	 * Adjust the zero page counters now. Note that
> +	 * if a page is currently being zeroed, that
> +	 * will be waited for in update_and_free_page()
> +	 */
> +	if (folio_test_hugetlb_zeroed(folio) ||
> +	    folio_test_hugetlb_zeroing(folio))
> +		h->free_huge_pages_zero_node[nid]--;
>   	if (adjust_surplus) {
>   		h->surplus_huge_pages--;
>   		h->surplus_huge_pages_node[nid]--;
> @@ -1543,6 +1580,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
>   {
>   	bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);
>   
> +	VM_WARN_ON_FOLIO(folio_test_hugetlb_zeroing(folio), folio);
> +
>   	if (hstate_is_gigantic_no_runtime(h))
>   		return;
>   
> @@ -1627,6 +1666,7 @@ static void free_hpage_workfn(struct work_struct *work)
>   		 */
>   		h = size_to_hstate(folio_size(folio));
>   
> +		hpage_wait_zeroing(h, folio);
>   		__update_and_free_hugetlb_folio(h, folio);
>   
>   		cond_resched();
> @@ -1643,7 +1683,8 @@ static inline void flush_free_hpage_work(struct hstate *h)
>   static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
>   				 bool atomic)
>   {
> -	if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
> +	if ((!folio_test_hugetlb_zeroing(folio) &&
> +	     !folio_test_hugetlb_vmemmap_optimized(folio)) || !atomic) {
>   		__update_and_free_hugetlb_folio(h, folio);
>   		return;
>   	}
> @@ -1840,6 +1881,13 @@ static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio)
>   	h->nr_huge_pages_node[folio_nid(folio)]++;
>   }
>   
> +static void prep_new_hugetlb_folio(struct folio *folio)
> +{
> +	lockdep_assert_held(&hugetlb_lock);
> +	folio_clear_hugetlb_freed(folio);
> +	prep_clear_zeroed(folio);
> +}
> +
>   void init_new_hugetlb_folio(struct folio *folio)
>   {
>   	__folio_set_hugetlb(folio);
> @@ -1964,6 +2012,7 @@ void prep_and_add_allocated_folios(struct hstate *h,
>   	/* Add all new pool pages to free lists in one lock cycle */
>   	spin_lock_irqsave(&hugetlb_lock, flags);
>   	list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
> +		prep_new_hugetlb_folio(folio);
>   		account_new_hugetlb_folio(h, folio);
>   		enqueue_hugetlb_folio(h, folio);
>   	}
> @@ -2171,6 +2220,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
>   		return NULL;
>   
>   	spin_lock_irq(&hugetlb_lock);
> +	prep_new_hugetlb_folio(folio);
>   	/*
>   	 * nr_huge_pages needs to be adjusted within the same lock cycle
>   	 * as surplus_pages, otherwise it might confuse
> @@ -2214,6 +2264,7 @@ static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mas
>   		return NULL;
>   
>   	spin_lock_irq(&hugetlb_lock);
> +	prep_new_hugetlb_folio(folio);
>   	account_new_hugetlb_folio(h, folio);
>   	spin_unlock_irq(&hugetlb_lock);
>   
> @@ -2289,6 +2340,13 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
>   						preferred_nid, nmask);
>   		if (folio) {
>   			spin_unlock_irq(&hugetlb_lock);
> +			/*
> +			 * The contents of this page will be completely
> +			 * overwritten immediately, as its a migration
> +			 * target, so no clearing is needed. Do wait in
> +			 * case pre-zero thread was working on it, though.
> +			 */
> +			hpage_wait_zeroing(h, folio);
>   			return folio;
>   		}
>   	}
> @@ -2779,6 +2837,7 @@ static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio,
>   		 */
>   		remove_hugetlb_folio(h, old_folio, false);
>   
> +		prep_new_hugetlb_folio(new_folio);
>   		/*
>   		 * Ref count on new_folio is already zero as it was dropped
>   		 * earlier.  It can be directly added to the pool free list.
> @@ -2999,6 +3058,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>   
>   	spin_unlock_irq(&hugetlb_lock);
>   
> +	hpage_wait_zeroing(h, folio);
> +
>   	hugetlb_set_folio_subpool(folio, spool);
>   
>   	if (map_chg != MAP_CHG_ENFORCED) {
> @@ -3257,6 +3318,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
>   		hugetlb_bootmem_init_migratetype(folio, h);
>   		/* Subdivide locks to achieve better parallel performance */
>   		spin_lock_irqsave(&hugetlb_lock, flags);
> +		prep_new_hugetlb_folio(folio);
>   		account_new_hugetlb_folio(h, folio);
>   		enqueue_hugetlb_folio(h, folio);
>   		spin_unlock_irqrestore(&hugetlb_lock, flags);
> @@ -4190,6 +4252,42 @@ bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
>   	return size == HPAGE_SIZE;
>   }
>   
> +/*
> + * Zero a hugetlb page.
> + *
> + * The caller has already made sure that the page is not
> + * being actively zeroed out in the background.
> + *
> + * If it wasn't zeroed out, do it ourselves.
> + */
> +void hugetlb_zero_folio(struct folio *folio, unsigned long address)
> +{
> +	if (!folio_test_hugetlb_zeroed(folio))
> +		folio_zero_user(folio, address);
> +
> +	__folio_mark_uptodate(folio);
> +}
> +
> +/*
> + * Once a page has been taken off the freelist, the new page owner
> + * must wait for the pre-zero thread to finish if it happens
> + * to be working on this page (which should be rare).
> + */
> +static void hpage_wait_zeroing(struct hstate *h, struct folio *folio)
> +{
> +	if (!folio_test_hugetlb_zeroing(folio))
> +		return;
> +
> +	spin_lock_irq(&hugetlb_lock);
> +
> +	wait_event_cmd(h->dqzero_wait[folio_nid(folio)],
> +		       !folio_test_hugetlb_zeroing(folio),
> +		       spin_unlock_irq(&hugetlb_lock),
> +		       spin_lock_irq(&hugetlb_lock));
> +
> +	spin_unlock_irq(&hugetlb_lock);
> +}
> +

nit:
The chunk above may be simple enough to switch to guard() for hugetlb_lock.
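
Roughly something like this, perhaps (untested sketch; wait_event_cmd()
still needs the explicit unlock/relock commands, so only the outer
lock/unlock pair goes away):

static void hpage_wait_zeroing(struct hstate *h, struct folio *folio)
{
	if (!folio_test_hugetlb_zeroing(folio))
		return;

	/* Drops hugetlb_lock automatically on every return path. */
	guard(spinlock_irq)(&hugetlb_lock);

	wait_event_cmd(h->dqzero_wait[folio_nid(folio)],
		       !folio_test_hugetlb_zeroing(folio),
		       spin_unlock_irq(&hugetlb_lock),
		       spin_lock_irq(&hugetlb_lock));
}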

[...]

Regards
- Raghu


Thread overview: 22+ messages
2025-12-25  8:20 [PATCH 0/8] Introduce a huge-page pre-zeroing mechanism 李喆
2025-12-25  8:20 ` [PATCH 1/8] mm/hugetlb: add pre-zeroed framework 李喆
2025-12-26  9:24   ` Raghavendra K T [this message]
2025-12-26  9:48     ` Li Zhe
2025-12-25  8:20 ` [PATCH 2/8] mm/hugetlb: convert to prep_account_new_hugetlb_folio() 李喆
2025-12-25  8:20 ` [PATCH 3/8] mm/hugetlb: move the huge folio to the end of the list during enqueue 李喆
2025-12-25  8:20 ` [PATCH 4/8] mm/hugetlb: introduce per-node sysfs interface "zeroable_hugepages" 李喆
2025-12-26 18:51   ` Frank van der Linden
2025-12-29 12:25     ` Li Zhe
2025-12-29 18:57       ` Frank van der Linden
2025-12-30  2:41         ` Li Zhe
2025-12-25  8:20 ` [PATCH 5/8] mm/hugetlb: simplify function hugetlb_sysfs_add_hstate() 李喆
2025-12-25  8:20 ` [PATCH 6/8] mm/hugetlb: relocate the per-hstate struct kobject pointer 李喆
2025-12-25  8:20 ` [PATCH 7/8] mm/hugetlb: add epoll support for interface "zeroable_hugepages" 李喆
2025-12-25  8:20 ` [PATCH 8/8] mm/hugetlb: limit event generation frequency of function do_zero_free_notify() 李喆
2025-12-26 18:32 ` [PATCH 0/8] Introduce a huge-page pre-zeroing mechanism Frank van der Linden
2025-12-26 21:42   ` Frank van der Linden
2025-12-29 12:28     ` Li Zhe
2025-12-27  7:21 ` Mateusz Guzik
2025-12-29 12:31   ` Li Zhe
2025-12-28 21:44 ` Andrew Morton
2025-12-29 12:34   ` Li Zhe
