linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
@ 2025-12-16 21:56 Jane Chu
  2025-12-18  8:41 ` David Hildenbrand (Red Hat)
  0 siblings, 1 reply; 6+ messages in thread
From: Jane Chu @ 2025-12-16 21:56 UTC (permalink / raw)
  To: muchun.song, osalvador, david, linmiaohe, jiaqiyan,
	william.roche, rientjes, akpm, lorenzo.stoakes, Liam.Howlett,
	rppt, surenb, mhocko, linux-mm, linux-kernel

When a newly poisoned subpage ends up in an already poisoned hugetlb
folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
is not. Fix the inconsistency by designating action_result() to update
them both.

Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats to pglist_data")
Cc: <stable@vger.kernel.org>
Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
 include/linux/hugetlb.h |  4 ++--
 include/linux/mm.h      |  4 ++--
 mm/hugetlb.c            |  4 ++--
 mm/memory-failure.c     | 22 +++++++++++++---------
 4 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8e63e46b8e1f..2e6690c9df96 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
 int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
 int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-				bool *migratable_cleared);
+				bool *migratable_cleared, bool *samepg);
 void folio_putback_hugetlb(struct folio *folio);
 void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
 void hugetlb_fix_reserve_counts(struct inode *inode);
@@ -420,7 +420,7 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
 }
 
 static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared)
+					bool *migratable_cleared, bool *samepg)
 {
 	return 0;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7c79b3369b82..68b1812e9c0a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long pfn, int flags);
 extern const struct attribute_group memory_failure_attr_group;
 extern void memory_failure_queue(unsigned long pfn, int flags);
 extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared);
+					bool *migratable_cleared, bool *samepg);
 void num_poisoned_pages_inc(unsigned long pfn);
 void num_poisoned_pages_sub(unsigned long pfn, long i);
 #else
@@ -4045,7 +4045,7 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
 }
 
 static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared)
+					bool *migratable_cleared, bool *samepg)
 {
 	return 0;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0455119716ec..f78562a578e5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
 }
 
 int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-				bool *migratable_cleared)
+				bool *migratable_cleared, bool *samepg)
 {
 	int ret;
 
 	spin_lock_irq(&hugetlb_lock);
-	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
+	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared, samepg);
 	spin_unlock_irq(&hugetlb_lock);
 	return ret;
 }
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3edebb0cda30..070f43bb110a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1873,7 +1873,8 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
 	return count;
 }
 
-static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
+static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page,
+					bool *samepg)
 {
 	struct llist_head *head;
 	struct raw_hwp_page *raw_hwp;
@@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
 		return -EHWPOISON;
 	head = raw_hwp_list_head(folio);
 	llist_for_each_entry(p, head->first, node) {
-		if (p->page == page)
+		if (p->page == page) {
+			*samepg = true;
 			return -EHWPOISON;
+		}
 	}
 
 	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
 	if (raw_hwp) {
 		raw_hwp->page = page;
 		llist_add(&raw_hwp->node, head);
-		/* the first error event will be counted in action_result(). */
-		if (ret)
-			num_poisoned_pages_inc(page_to_pfn(page));
 	} else {
 		/*
 		 * Failed to save raw error info.  We no longer trace all
@@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
  *   -EHWPOISON    - the hugepage is already hwpoisoned
  */
 int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-				 bool *migratable_cleared)
+				 bool *migratable_cleared, bool *samepg)
 {
 	struct page *page = pfn_to_page(pfn);
 	struct folio *folio = page_folio(page);
@@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 			goto out;
 	}
 
-	if (folio_set_hugetlb_hwpoison(folio, page)) {
+	if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
 		ret = -EHWPOISON;
 		goto out;
 	}
@@ -2014,11 +2014,12 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 	struct page *p = pfn_to_page(pfn);
 	struct folio *folio;
 	unsigned long page_flags;
+	bool samepg = false;
 	bool migratable_cleared = false;
 
 	*hugetlb = 1;
 retry:
-	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
+	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared, &samepg);
 	if (res == 2) { /* fallback to normal page handling */
 		*hugetlb = 0;
 		return 0;
@@ -2027,7 +2028,10 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 			folio = page_folio(p);
 			res = kill_accessing_process(current, folio_pfn(folio), flags);
 		}
-		action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
+		if (samepg)
+			action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
+		else
+			action_result(pfn, MF_MSG_HUGE, MF_FAILED);
 		return res;
 	} else if (res == -EBUSY) {
 		if (!(flags & MF_NO_RETRY)) {
-- 
2.43.5



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
  2025-12-16 21:56 [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison Jane Chu
@ 2025-12-18  8:41 ` David Hildenbrand (Red Hat)
  2025-12-18 19:01   ` jane.chu
  0 siblings, 1 reply; 6+ messages in thread
From: David Hildenbrand (Red Hat) @ 2025-12-18  8:41 UTC (permalink / raw)
  To: Jane Chu, muchun.song, osalvador, linmiaohe, jiaqiyan,
	william.roche, rientjes, akpm, lorenzo.stoakes, Liam.Howlett,
	rppt, surenb, mhocko, linux-mm, linux-kernel

On 12/16/25 22:56, Jane Chu wrote:
> When a newly poisoned subpage ends up in an already poisoned hugetlb

The concept of subpages does not exist. It's a page of a hugetlb folio.

> folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
> is not. Fix the inconsistency by designating action_result() to update
> them both.

What is the user-visible result of that?

> 
> Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats to pglist_data")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Jane Chu <jane.chu@oracle.com>
> ---
>   include/linux/hugetlb.h |  4 ++--
>   include/linux/mm.h      |  4 ++--
>   mm/hugetlb.c            |  4 ++--
>   mm/memory-failure.c     | 22 +++++++++++++---------
>   4 files changed, 19 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 8e63e46b8e1f..2e6690c9df96 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
>   bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
>   int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
>   int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -				bool *migratable_cleared);
> +				bool *migratable_cleared, bool *samepg);
>   void folio_putback_hugetlb(struct folio *folio);
>   void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
>   void hugetlb_fix_reserve_counts(struct inode *inode);
> @@ -420,7 +420,7 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>   }
>   
>   static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -					bool *migratable_cleared)
> +					bool *migratable_cleared, bool *samepg)
>   {
>   	return 0;
>   }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7c79b3369b82..68b1812e9c0a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long pfn, int flags);
>   extern const struct attribute_group memory_failure_attr_group;
>   extern void memory_failure_queue(unsigned long pfn, int flags);
>   extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -					bool *migratable_cleared);
> +					bool *migratable_cleared, bool *samepg);
>   void num_poisoned_pages_inc(unsigned long pfn);
>   void num_poisoned_pages_sub(unsigned long pfn, long i);
>   #else
> @@ -4045,7 +4045,7 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
>   }
>   
>   static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -					bool *migratable_cleared)
> +					bool *migratable_cleared, bool *samepg)
>   {
>   	return 0;
>   }
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 0455119716ec..f78562a578e5 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
>   }
>   
>   int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -				bool *migratable_cleared)
> +				bool *migratable_cleared, bool *samepg)
>   {
>   	int ret;
>   
>   	spin_lock_irq(&hugetlb_lock);
> -	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
> +	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared, samepg);
>   	spin_unlock_irq(&hugetlb_lock);
>   	return ret;
>   }
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 3edebb0cda30..070f43bb110a 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1873,7 +1873,8 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
>   	return count;
>   }
>   
> -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page,
> +					bool *samepg)
>   {
>   	struct llist_head *head;
>   	struct raw_hwp_page *raw_hwp;
> @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
>   		return -EHWPOISON;
>   	head = raw_hwp_list_head(folio);
>   	llist_for_each_entry(p, head->first, node) {
> -		if (p->page == page)
> +		if (p->page == page) {
> +			*samepg = true;
>   			return -EHWPOISON;
> +		}
>   	}
>   
>   	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
>   	if (raw_hwp) {
>   		raw_hwp->page = page;
>   		llist_add(&raw_hwp->node, head);
> -		/* the first error event will be counted in action_result(). */
> -		if (ret)
> -			num_poisoned_pages_inc(page_to_pfn(page));
>   	} else {
>   		/*
>   		 * Failed to save raw error info.  We no longer trace all
> @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
>    *   -EHWPOISON    - the hugepage is already hwpoisoned
>    */
>   int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> -				 bool *migratable_cleared)
> +				 bool *migratable_cleared, bool *samepg)
>   {
>   	struct page *page = pfn_to_page(pfn);
>   	struct folio *folio = page_folio(page);
> @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>   			goto out;
>   	}
>   
> -	if (folio_set_hugetlb_hwpoison(folio, page)) {
> +	if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
>   		ret = -EHWPOISON;
>   		goto out;
>   	}
> @@ -2014,11 +2014,12 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
>   	struct page *p = pfn_to_page(pfn);
>   	struct folio *folio;
>   	unsigned long page_flags;
> +	bool samepg = false;
>   	bool migratable_cleared = false;
>   
>   	*hugetlb = 1;
>   retry:
> -	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
> +	res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared, &samepg);
>   	if (res == 2) { /* fallback to normal page handling */
>   		*hugetlb = 0;
>   		return 0;
> @@ -2027,7 +2028,10 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
>   			folio = page_folio(p);
>   			res = kill_accessing_process(current, folio_pfn(folio), flags);
>   		}
> -		action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> +		if (samepg)
> +			action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> +		else
> +			action_result(pfn, MF_MSG_HUGE, MF_FAILED);

Can't we somehow return that result from get_huge_page_for_hwpoison() 
... folio_set_hugetlb_hwpoison() differently? E.g., return an enum 
instead of "-EHWPOISON" or magic value "2".

"samepg" is petty much unreadable. Same with what?

What you really mean is "page was already hwpoisoned".

In an enum you might be better able to describe the various scenarios.

-- 
Cheers

David


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
  2025-12-18  8:41 ` David Hildenbrand (Red Hat)
@ 2025-12-18 19:01   ` jane.chu
  2025-12-18 20:26     ` Liam R. Howlett
  0 siblings, 1 reply; 6+ messages in thread
From: jane.chu @ 2025-12-18 19:01 UTC (permalink / raw)
  To: David Hildenbrand (Red Hat),
	muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche,
	rientjes, akpm, lorenzo.stoakes, Liam.Howlett, rppt, surenb,
	mhocko, linux-mm, linux-kernel

Hi, David,

Thanks for the review.

On 12/18/2025 12:41 AM, David Hildenbrand (Red Hat) wrote:
> On 12/16/25 22:56, Jane Chu wrote:
>> When a newly poisoned subpage ends up in an already poisoned hugetlb
> 
> The concept of subpages does not exist. It's a page of a hugetlb folio.

Okay.

> 
>> folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
>> is not. Fix the inconsistency by designating action_result() to update
>> them both.
> 
> What is the user-visible result of that?

For the purpose of observation, and potential action afterwards.

# cat /proc/meminfo | grep HardwareCorrupted
shows 'num_poisoned_pages', the global count of poisoned pages.

# ls /sys/devices/system/node/node0/memory_failure
delayed  failed  ignored  recovered  total
these fields show the per node ->mf_stats, that is the MF handling results.

> 
>>
>> Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats 
>> to pglist_data")
>> Cc: <stable@vger.kernel.org>
>> Signed-off-by: Jane Chu <jane.chu@oracle.com>
>> ---
>>   include/linux/hugetlb.h |  4 ++--
>>   include/linux/mm.h      |  4 ++--
>>   mm/hugetlb.c            |  4 ++--
>>   mm/memory-failure.c     | 22 +++++++++++++---------
>>   4 files changed, 19 insertions(+), 15 deletions(-)
>>
>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>> index 8e63e46b8e1f..2e6690c9df96 100644
>> --- a/include/linux/hugetlb.h
>> +++ b/include/linux/hugetlb.h
>> @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode *inode, 
>> long start, long end,
>>   bool folio_isolate_hugetlb(struct folio *folio, struct list_head 
>> *list);
>>   int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, 
>> bool unpoison);
>>   int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> -                bool *migratable_cleared);
>> +                bool *migratable_cleared, bool *samepg);
>>   void folio_putback_hugetlb(struct folio *folio);
>>   void move_hugetlb_state(struct folio *old_folio, struct folio 
>> *new_folio, int reason);
>>   void hugetlb_fix_reserve_counts(struct inode *inode);
>> @@ -420,7 +420,7 @@ static inline int 
>> get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>>   }
>>   static inline int get_huge_page_for_hwpoison(unsigned long pfn, int 
>> flags,
>> -                    bool *migratable_cleared)
>> +                    bool *migratable_cleared, bool *samepg)
>>   {
>>       return 0;
>>   }
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 7c79b3369b82..68b1812e9c0a 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long pfn, 
>> int flags);
>>   extern const struct attribute_group memory_failure_attr_group;
>>   extern void memory_failure_queue(unsigned long pfn, int flags);
>>   extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> -                    bool *migratable_cleared);
>> +                    bool *migratable_cleared, bool *samepg);
>>   void num_poisoned_pages_inc(unsigned long pfn);
>>   void num_poisoned_pages_sub(unsigned long pfn, long i);
>>   #else
>> @@ -4045,7 +4045,7 @@ static inline void memory_failure_queue(unsigned 
>> long pfn, int flags)
>>   }
>>   static inline int __get_huge_page_for_hwpoison(unsigned long pfn, 
>> int flags,
>> -                    bool *migratable_cleared)
>> +                    bool *migratable_cleared, bool *samepg)
>>   {
>>       return 0;
>>   }
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 0455119716ec..f78562a578e5 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio 
>> *folio, bool *hugetlb, bool unpoison
>>   }
>>   int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> -                bool *migratable_cleared)
>> +                bool *migratable_cleared, bool *samepg)
>>   {
>>       int ret;
>>       spin_lock_irq(&hugetlb_lock);
>> -    ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
>> +    ret = __get_huge_page_for_hwpoison(pfn, flags, 
>> migratable_cleared, samepg);
>>       spin_unlock_irq(&hugetlb_lock);
>>       return ret;
>>   }
>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>> index 3edebb0cda30..070f43bb110a 100644
>> --- a/mm/memory-failure.c
>> +++ b/mm/memory-failure.c
>> @@ -1873,7 +1873,8 @@ static unsigned long __folio_free_raw_hwp(struct 
>> folio *folio, bool move_flag)
>>       return count;
>>   }
>> -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct 
>> page *page)
>> +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct 
>> page *page,
>> +                    bool *samepg)
>>   {
>>       struct llist_head *head;
>>       struct raw_hwp_page *raw_hwp;
>> @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct 
>> folio *folio, struct page *page)
>>           return -EHWPOISON;
>>       head = raw_hwp_list_head(folio);
>>       llist_for_each_entry(p, head->first, node) {
>> -        if (p->page == page)
>> +        if (p->page == page) {
>> +            *samepg = true;
>>               return -EHWPOISON;
>> +        }
>>       }
>>       raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
>>       if (raw_hwp) {
>>           raw_hwp->page = page;
>>           llist_add(&raw_hwp->node, head);
>> -        /* the first error event will be counted in action_result(). */
>> -        if (ret)
>> -            num_poisoned_pages_inc(page_to_pfn(page));
>>       } else {
>>           /*
>>            * Failed to save raw error info.  We no longer trace all
>> @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio 
>> *folio)
>>    *   -EHWPOISON    - the hugepage is already hwpoisoned
>>    */
>>   int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> -                 bool *migratable_cleared)
>> +                 bool *migratable_cleared, bool *samepg)
>>   {
>>       struct page *page = pfn_to_page(pfn);
>>       struct folio *folio = page_folio(page);
>> @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long 
>> pfn, int flags,
>>               goto out;
>>       }
>> -    if (folio_set_hugetlb_hwpoison(folio, page)) {
>> +    if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
>>           ret = -EHWPOISON;
>>           goto out;
>>       }
>> @@ -2014,11 +2014,12 @@ static int try_memory_failure_hugetlb(unsigned 
>> long pfn, int flags, int *hugetlb
>>       struct page *p = pfn_to_page(pfn);
>>       struct folio *folio;
>>       unsigned long page_flags;
>> +    bool samepg = false;
>>       bool migratable_cleared = false;
>>       *hugetlb = 1;
>>   retry:
>> -    res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
>> +    res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared, 
>> &samepg);
>>       if (res == 2) { /* fallback to normal page handling */
>>           *hugetlb = 0;
>>           return 0;
>> @@ -2027,7 +2028,10 @@ static int try_memory_failure_hugetlb(unsigned 
>> long pfn, int flags, int *hugetlb
>>               folio = page_folio(p);
>>               res = kill_accessing_process(current, folio_pfn(folio), 
>> flags);
>>           }
>> -        action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
>> +        if (samepg)
>> +            action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
>> +        else
>> +            action_result(pfn, MF_MSG_HUGE, MF_FAILED);
> 
> Can't we somehow return that result from 
> get_huge_page_for_hwpoison() ... folio_set_hugetlb_hwpoison() 
> differently? E.g., return an enum instead of "-EHWPOISON" or magic value 
> "2".

This is an option. The existing return codes are as follow.
__get_huge_page_for_hwpoison():
  * Return values:
  *   0             - free hugepage
  *   1             - in-use hugepage
  *   2             - not a hugepage
  *   -EBUSY        - the hugepage is busy (try to retry)
  *   -EHWPOISON    - the hugepage is already hwpoisoned

folio_set_hugetlb_hwpoison()
returns
	0:		folio was not poisoned before
	-EHWPOISON:	folio was poisoned before

To get rid of 'samepg', how about

__get_huge_page_for_hwpoison():
  * Return values:
  *   0             - free hugepage
  *   1             - in-use hugepage
  *   2             - not a hugepage
  *   3		   - the hugepage is already hwpoisoned in different page
  *   4  	   - the hugepage is already hwpoisoned in the same page
  *   -EBUSY        - the hugepage is busy (try to retry)

folio_set_hugetlb_hwpoison()
returns
	0:		folio was not poisoned before
	1:	        folio was poisoned before in different page
	2:	        folio was poisoned before in the same page

The whole point about identifying the same page is so that the re-poison
event is not doubled counted.

> 
> "samepg" is petty much unreadable. Same with what?
> 
> What you really mean is "page was already hwpoisoned".
> 

For example, a previously poison-free hugetlb folio is detected that its 
3rd tail page is poisoned, MF handler marks the folio as poisoned and
record the 3rd tail.  Sometime later, the same hugetlb folio again is 
detected being poisoned, this time, the poisoned page might be the 3rd 
tail page(indicated by 'samepg' = true) in which case the MF handler 
does not update the stats. Or, the poisoned page might be a new tail 
page, say the 7th tail, in which case, the MF handler records the 7th 
tail, and update the user-visible stats.

I have struggled a bit about the naming, wondering whether "repoison" 
might work better, or, a longer name like "same-page-repoisoned"?

> In an enum you might be better able to describe the various scenarios.
> 

Thanks!

-jane


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
  2025-12-18 19:01   ` jane.chu
@ 2025-12-18 20:26     ` Liam R. Howlett
  2025-12-18 23:10       ` jane.chu
  0 siblings, 1 reply; 6+ messages in thread
From: Liam R. Howlett @ 2025-12-18 20:26 UTC (permalink / raw)
  To: jane.chu
  Cc: David Hildenbrand (Red Hat),
	muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche,
	rientjes, akpm, lorenzo.stoakes, rppt, surenb, mhocko, linux-mm,
	linux-kernel

* jane.chu@oracle.com <jane.chu@oracle.com> [251218 14:01]:
> Hi, David,
> 
> Thanks for the review.
> 
> On 12/18/2025 12:41 AM, David Hildenbrand (Red Hat) wrote:
> > On 12/16/25 22:56, Jane Chu wrote:
> > > When a newly poisoned subpage ends up in an already poisoned hugetlb
> > 
> > The concept of subpages does not exist. It's a page of a hugetlb folio.
> 
> Okay.
> 
> > 
> > > folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
> > > is not. Fix the inconsistency by designating action_result() to update
> > > them both.
> > 
> > What is the user-visible result of that?
> 
> For the purpose of observation, and potential action afterwards.
> 
> # cat /proc/meminfo | grep HardwareCorrupted
> shows 'num_poisoned_pages', the global count of poisoned pages.
> 
> # ls /sys/devices/system/node/node0/memory_failure
> delayed  failed  ignored  recovered  total
> these fields show the per node ->mf_stats, that is the MF handling results.
> 
> > 
> > > 
> > > Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats
> > > to pglist_data")
> > > Cc: <stable@vger.kernel.org>
> > > Signed-off-by: Jane Chu <jane.chu@oracle.com>
> > > ---
> > >   include/linux/hugetlb.h |  4 ++--
> > >   include/linux/mm.h      |  4 ++--
> > >   mm/hugetlb.c            |  4 ++--
> > >   mm/memory-failure.c     | 22 +++++++++++++---------
> > >   4 files changed, 19 insertions(+), 15 deletions(-)
> > > 
> > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > > index 8e63e46b8e1f..2e6690c9df96 100644
> > > --- a/include/linux/hugetlb.h
> > > +++ b/include/linux/hugetlb.h
> > > @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode
> > > *inode, long start, long end,
> > >   bool folio_isolate_hugetlb(struct folio *folio, struct list_head
> > > *list);
> > >   int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
> > > bool unpoison);
> > >   int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> > > -                bool *migratable_cleared);
> > > +                bool *migratable_cleared, bool *samepg);
> > >   void folio_putback_hugetlb(struct folio *folio);
> > >   void move_hugetlb_state(struct folio *old_folio, struct folio
> > > *new_folio, int reason);
> > >   void hugetlb_fix_reserve_counts(struct inode *inode);
> > > @@ -420,7 +420,7 @@ static inline int
> > > get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
> > >   }
> > >   static inline int get_huge_page_for_hwpoison(unsigned long pfn,
> > > int flags,
> > > -                    bool *migratable_cleared)
> > > +                    bool *migratable_cleared, bool *samepg)
> > >   {
> > >       return 0;
> > >   }
> > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > index 7c79b3369b82..68b1812e9c0a 100644
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long
> > > pfn, int flags);
> > >   extern const struct attribute_group memory_failure_attr_group;
> > >   extern void memory_failure_queue(unsigned long pfn, int flags);
> > >   extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> > > -                    bool *migratable_cleared);
> > > +                    bool *migratable_cleared, bool *samepg);
> > >   void num_poisoned_pages_inc(unsigned long pfn);
> > >   void num_poisoned_pages_sub(unsigned long pfn, long i);
> > >   #else
> > > @@ -4045,7 +4045,7 @@ static inline void
> > > memory_failure_queue(unsigned long pfn, int flags)
> > >   }
> > >   static inline int __get_huge_page_for_hwpoison(unsigned long pfn,
> > > int flags,
> > > -                    bool *migratable_cleared)
> > > +                    bool *migratable_cleared, bool *samepg)
> > >   {
> > >       return 0;
> > >   }
> > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > > index 0455119716ec..f78562a578e5 100644
> > > --- a/mm/hugetlb.c
> > > +++ b/mm/hugetlb.c
> > > @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio
> > > *folio, bool *hugetlb, bool unpoison
> > >   }
> > >   int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> > > -                bool *migratable_cleared)
> > > +                bool *migratable_cleared, bool *samepg)
> > >   {
> > >       int ret;
> > >       spin_lock_irq(&hugetlb_lock);
> > > -    ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
> > > +    ret = __get_huge_page_for_hwpoison(pfn, flags,
> > > migratable_cleared, samepg);
> > >       spin_unlock_irq(&hugetlb_lock);
> > >       return ret;
> > >   }
> > > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > > index 3edebb0cda30..070f43bb110a 100644
> > > --- a/mm/memory-failure.c
> > > +++ b/mm/memory-failure.c
> > > @@ -1873,7 +1873,8 @@ static unsigned long
> > > __folio_free_raw_hwp(struct folio *folio, bool move_flag)
> > >       return count;
> > >   }
> > > -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
> > > page *page)
> > > +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
> > > page *page,
> > > +                    bool *samepg)
> > >   {
> > >       struct llist_head *head;
> > >       struct raw_hwp_page *raw_hwp;
> > > @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct
> > > folio *folio, struct page *page)
> > >           return -EHWPOISON;
> > >       head = raw_hwp_list_head(folio);
> > >       llist_for_each_entry(p, head->first, node) {
> > > -        if (p->page == page)
> > > +        if (p->page == page) {
> > > +            *samepg = true;
> > >               return -EHWPOISON;
> > > +        }
> > >       }
> > >       raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
> > >       if (raw_hwp) {
> > >           raw_hwp->page = page;
> > >           llist_add(&raw_hwp->node, head);
> > > -        /* the first error event will be counted in action_result(). */
> > > -        if (ret)
> > > -            num_poisoned_pages_inc(page_to_pfn(page));
> > >       } else {
> > >           /*
> > >            * Failed to save raw error info.  We no longer trace all
> > > @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio
> > > *folio)
> > >    *   -EHWPOISON    - the hugepage is already hwpoisoned
> > >    */
> > >   int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> > > -                 bool *migratable_cleared)
> > > +                 bool *migratable_cleared, bool *samepg)
> > >   {
> > >       struct page *page = pfn_to_page(pfn);
> > >       struct folio *folio = page_folio(page);
> > > @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long
> > > pfn, int flags,
> > >               goto out;
> > >       }
> > > -    if (folio_set_hugetlb_hwpoison(folio, page)) {
> > > +    if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
> > >           ret = -EHWPOISON;
> > >           goto out;
> > >       }
> > > @@ -2014,11 +2014,12 @@ static int
> > > try_memory_failure_hugetlb(unsigned long pfn, int flags, int
> > > *hugetlb
> > >       struct page *p = pfn_to_page(pfn);
> > >       struct folio *folio;
> > >       unsigned long page_flags;
> > > +    bool samepg = false;
> > >       bool migratable_cleared = false;
> > >       *hugetlb = 1;
> > >   retry:
> > > -    res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
> > > +    res = get_huge_page_for_hwpoison(pfn, flags,
> > > &migratable_cleared, &samepg);
> > >       if (res == 2) { /* fallback to normal page handling */
> > >           *hugetlb = 0;
> > >           return 0;
> > > @@ -2027,7 +2028,10 @@ static int
> > > try_memory_failure_hugetlb(unsigned long pfn, int flags, int
> > > *hugetlb
> > >               folio = page_folio(p);
> > >               res = kill_accessing_process(current,
> > > folio_pfn(folio), flags);
> > >           }
> > > -        action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> > > +        if (samepg)
> > > +            action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> > > +        else
> > > +            action_result(pfn, MF_MSG_HUGE, MF_FAILED);
> > 
> > Can't we somehow return that result from get_huge_page_for_hwpoison()
> > ... folio_set_hugetlb_hwpoison() differently? E.g., return an enum
> > instead of "-EHWPOISON" or magic value "2".
> 
> This is an option. The existing return codes are as follow.
> __get_huge_page_for_hwpoison():
>  * Return values:
>  *   0             - free hugepage
>  *   1             - in-use hugepage
>  *   2             - not a hugepage
>  *   -EBUSY        - the hugepage is busy (try to retry)
>  *   -EHWPOISON    - the hugepage is already hwpoisoned
> 
> folio_set_hugetlb_hwpoison()
> returns
> 	0:		folio was not poisoned before
> 	-EHWPOISON:	folio was poisoned before
> 
> To get rid of 'samepg', how about
> 
> __get_huge_page_for_hwpoison():
>  * Return values:
>  *   0             - free hugepage
>  *   1             - in-use hugepage
>  *   2             - not a hugepage
>  *   3		   - the hugepage is already hwpoisoned in different page
>  *   4  	   - the hugepage is already hwpoisoned in the same page
>  *   -EBUSY        - the hugepage is busy (try to retry)
> 
> folio_set_hugetlb_hwpoison()
> returns
> 	0:		folio was not poisoned before
> 	1:	        folio was poisoned before in different page
> 	2:	        folio was poisoned before in the same page
> 
> The whole point about identifying the same page is so that the re-poison
> event is not doubled counted.

This means folio_set_hugetlb_hwpoison() returns 0 on success but
positives on error.. this seems to be going further away from the
standard way of doing things?

It would actually be good to remove all magic values instead of
expanding them.

I think what David was trying to say is to have a local enum that states
what these numbers mean so that the code reads more cleanly, instead of
digging for the right comment to decode it.

For example, in try_memory_failure_hugetlb():

if (res == 2) { /* fallback to normal page handling */

vs:

if (res == MEMORY_FAILURE_NOT_HUGEPAGE) { /* fallback to normal page handling */

You could spell out your other options as well.  Maybe something like
MEMORY_FAILURE_HWPOISONED_ALREADY_COUNTED
MEMORY_FAILURE_HWPOISONED

This would avoid adding more magic values and increase readability.

If you changed try_memory_failure_hugetlb() to use a switch statement,
then the compiler can catch unchecked enums for us too.

If you don't want to go the enum route, then you could use a different
error code and propagate it through, like -EEXISTS for the new case?
That way the return is still 0 on success and less than 0 on failure,
but I think the enum idea has a number of advantages.

Thanks,
Liam



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
  2025-12-18 20:26     ` Liam R. Howlett
@ 2025-12-18 23:10       ` jane.chu
  2025-12-19  4:09         ` Liam R. Howlett
  0 siblings, 1 reply; 6+ messages in thread
From: jane.chu @ 2025-12-18 23:10 UTC (permalink / raw)
  To: Liam R. Howlett, David Hildenbrand (Red Hat),
	muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche,
	rientjes, akpm, lorenzo.stoakes, rppt, surenb, mhocko, linux-mm,
	linux-kernel

Hi, Liam,

Thanks!  My reply towards the end.

On 12/18/2025 12:26 PM, Liam R. Howlett wrote:
> * jane.chu@oracle.com <jane.chu@oracle.com> [251218 14:01]:
>> Hi, David,
>>
>> Thanks for the review.
>>
>> On 12/18/2025 12:41 AM, David Hildenbrand (Red Hat) wrote:
>>> On 12/16/25 22:56, Jane Chu wrote:
>>>> When a newly poisoned subpage ends up in an already poisoned hugetlb
>>>
>>> The concept of subpages does not exist. It's a page of a hugetlb folio.
>>
>> Okay.
>>
>>>
>>>> folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
>>>> is not. Fix the inconsistency by designating action_result() to update
>>>> them both.
>>>
>>> What is the user-visible result of that?
>>
>> For the purpose of observation, and potential action afterwards.
>>
>> # cat /proc/meminfo | grep HardwareCorrupted
>> shows 'num_poisoned_pages', the global count of poisoned pages.
>>
>> # ls /sys/devices/system/node/node0/memory_failure
>> delayed  failed  ignored  recovered  total
>> these fields show the per node ->mf_stats, that is the MF handling results.
>>
>>>
>>>>
>>>> Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats
>>>> to pglist_data")
>>>> Cc: <stable@vger.kernel.org>
>>>> Signed-off-by: Jane Chu <jane.chu@oracle.com>
>>>> ---
>>>>    include/linux/hugetlb.h |  4 ++--
>>>>    include/linux/mm.h      |  4 ++--
>>>>    mm/hugetlb.c            |  4 ++--
>>>>    mm/memory-failure.c     | 22 +++++++++++++---------
>>>>    4 files changed, 19 insertions(+), 15 deletions(-)
>>>>
>>>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>>>> index 8e63e46b8e1f..2e6690c9df96 100644
>>>> --- a/include/linux/hugetlb.h
>>>> +++ b/include/linux/hugetlb.h
>>>> @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode
>>>> *inode, long start, long end,
>>>>    bool folio_isolate_hugetlb(struct folio *folio, struct list_head
>>>> *list);
>>>>    int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>>>> bool unpoison);
>>>>    int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>>>> -                bool *migratable_cleared);
>>>> +                bool *migratable_cleared, bool *samepg);
>>>>    void folio_putback_hugetlb(struct folio *folio);
>>>>    void move_hugetlb_state(struct folio *old_folio, struct folio
>>>> *new_folio, int reason);
>>>>    void hugetlb_fix_reserve_counts(struct inode *inode);
>>>> @@ -420,7 +420,7 @@ static inline int
>>>> get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>>>>    }
>>>>    static inline int get_huge_page_for_hwpoison(unsigned long pfn,
>>>> int flags,
>>>> -                    bool *migratable_cleared)
>>>> +                    bool *migratable_cleared, bool *samepg)
>>>>    {
>>>>        return 0;
>>>>    }
>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>> index 7c79b3369b82..68b1812e9c0a 100644
>>>> --- a/include/linux/mm.h
>>>> +++ b/include/linux/mm.h
>>>> @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long
>>>> pfn, int flags);
>>>>    extern const struct attribute_group memory_failure_attr_group;
>>>>    extern void memory_failure_queue(unsigned long pfn, int flags);
>>>>    extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>>>> -                    bool *migratable_cleared);
>>>> +                    bool *migratable_cleared, bool *samepg);
>>>>    void num_poisoned_pages_inc(unsigned long pfn);
>>>>    void num_poisoned_pages_sub(unsigned long pfn, long i);
>>>>    #else
>>>> @@ -4045,7 +4045,7 @@ static inline void
>>>> memory_failure_queue(unsigned long pfn, int flags)
>>>>    }
>>>>    static inline int __get_huge_page_for_hwpoison(unsigned long pfn,
>>>> int flags,
>>>> -                    bool *migratable_cleared)
>>>> +                    bool *migratable_cleared, bool *samepg)
>>>>    {
>>>>        return 0;
>>>>    }
>>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>>> index 0455119716ec..f78562a578e5 100644
>>>> --- a/mm/hugetlb.c
>>>> +++ b/mm/hugetlb.c
>>>> @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio
>>>> *folio, bool *hugetlb, bool unpoison
>>>>    }
>>>>    int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>>>> -                bool *migratable_cleared)
>>>> +                bool *migratable_cleared, bool *samepg)
>>>>    {
>>>>        int ret;
>>>>        spin_lock_irq(&hugetlb_lock);
>>>> -    ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
>>>> +    ret = __get_huge_page_for_hwpoison(pfn, flags,
>>>> migratable_cleared, samepg);
>>>>        spin_unlock_irq(&hugetlb_lock);
>>>>        return ret;
>>>>    }
>>>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>>>> index 3edebb0cda30..070f43bb110a 100644
>>>> --- a/mm/memory-failure.c
>>>> +++ b/mm/memory-failure.c
>>>> @@ -1873,7 +1873,8 @@ static unsigned long
>>>> __folio_free_raw_hwp(struct folio *folio, bool move_flag)
>>>>        return count;
>>>>    }
>>>> -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
>>>> page *page)
>>>> +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
>>>> page *page,
>>>> +                    bool *samepg)
>>>>    {
>>>>        struct llist_head *head;
>>>>        struct raw_hwp_page *raw_hwp;
>>>> @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct
>>>> folio *folio, struct page *page)
>>>>            return -EHWPOISON;
>>>>        head = raw_hwp_list_head(folio);
>>>>        llist_for_each_entry(p, head->first, node) {
>>>> -        if (p->page == page)
>>>> +        if (p->page == page) {
>>>> +            *samepg = true;
>>>>                return -EHWPOISON;
>>>> +        }
>>>>        }
>>>>        raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
>>>>        if (raw_hwp) {
>>>>            raw_hwp->page = page;
>>>>            llist_add(&raw_hwp->node, head);
>>>> -        /* the first error event will be counted in action_result(). */
>>>> -        if (ret)
>>>> -            num_poisoned_pages_inc(page_to_pfn(page));
>>>>        } else {
>>>>            /*
>>>>             * Failed to save raw error info.  We no longer trace all
>>>> @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio
>>>> *folio)
>>>>     *   -EHWPOISON    - the hugepage is already hwpoisoned
>>>>     */
>>>>    int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>>>> -                 bool *migratable_cleared)
>>>> +                 bool *migratable_cleared, bool *samepg)
>>>>    {
>>>>        struct page *page = pfn_to_page(pfn);
>>>>        struct folio *folio = page_folio(page);
>>>> @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long
>>>> pfn, int flags,
>>>>                goto out;
>>>>        }
>>>> -    if (folio_set_hugetlb_hwpoison(folio, page)) {
>>>> +    if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
>>>>            ret = -EHWPOISON;
>>>>            goto out;
>>>>        }
>>>> @@ -2014,11 +2014,12 @@ static int
>>>> try_memory_failure_hugetlb(unsigned long pfn, int flags, int
>>>> *hugetlb
>>>>        struct page *p = pfn_to_page(pfn);
>>>>        struct folio *folio;
>>>>        unsigned long page_flags;
>>>> +    bool samepg = false;
>>>>        bool migratable_cleared = false;
>>>>        *hugetlb = 1;
>>>>    retry:
>>>> -    res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
>>>> +    res = get_huge_page_for_hwpoison(pfn, flags,
>>>> &migratable_cleared, &samepg);
>>>>        if (res == 2) { /* fallback to normal page handling */
>>>>            *hugetlb = 0;
>>>>            return 0;
>>>> @@ -2027,7 +2028,10 @@ static int
>>>> try_memory_failure_hugetlb(unsigned long pfn, int flags, int
>>>> *hugetlb
>>>>                folio = page_folio(p);
>>>>                res = kill_accessing_process(current,
>>>> folio_pfn(folio), flags);
>>>>            }
>>>> -        action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
>>>> +        if (samepg)
>>>> +            action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
>>>> +        else
>>>> +            action_result(pfn, MF_MSG_HUGE, MF_FAILED);
>>>
>>> Can't we somehow return that result from get_huge_page_for_hwpoison()
>>> ... folio_set_hugetlb_hwpoison() differently? E.g., return an enum
>>> instead of "-EHWPOISON" or magic value "2".
>>
>> This is an option. The existing return codes are as follow.
>> __get_huge_page_for_hwpoison():
>>   * Return values:
>>   *   0             - free hugepage
>>   *   1             - in-use hugepage
>>   *   2             - not a hugepage
>>   *   -EBUSY        - the hugepage is busy (try to retry)
>>   *   -EHWPOISON    - the hugepage is already hwpoisoned
>>
>> folio_set_hugetlb_hwpoison()
>> returns
>> 	0:		folio was not poisoned before
>> 	-EHWPOISON:	folio was poisoned before
>>
>> To get rid of 'samepg', how about
>>
>> __get_huge_page_for_hwpoison():
>>   * Return values:
>>   *   0             - free hugepage
>>   *   1             - in-use hugepage
>>   *   2             - not a hugepage
>>   *   3		   - the hugepage is already hwpoisoned in different page
>>   *   4  	   - the hugepage is already hwpoisoned in the same page
>>   *   -EBUSY        - the hugepage is busy (try to retry)
>>
>> folio_set_hugetlb_hwpoison()
>> returns
>> 	0:		folio was not poisoned before
>> 	1:	        folio was poisoned before in different page
>> 	2:	        folio was poisoned before in the same page
>>
>> The whole point about identifying the same page is so that the re-poison
>> event is not doubled counted.
> 
> This means folio_set_hugetlb_hwpoison() returns 0 on success but
> positives on error.. this seems to be going further away from the
> standard way of doing things?

Yes.
 > > It would actually be good to remove all magic values instead of
> expanding them.
> 
> I think what David was trying to say is to have a local enum that states
> what these numbers mean so that the code reads more cleanly, instead of
> digging for the right comment to decode it.
> 
> For example, in try_memory_failure_hugetlb():
> 
> if (res == 2) { /* fallback to normal page handling */
> 
> vs:
> 
> if (res == MEMORY_FAILURE_NOT_HUGEPAGE) { /* fallback to normal page handling */
> 
> You could spell out your other options as well.  Maybe something like
> MEMORY_FAILURE_HWPOISONED_ALREADY_COUNTED
> MEMORY_FAILURE_HWPOISONED
> 
> This would avoid adding more magic values and increase readability.
> 
> If you changed try_memory_failure_hugetlb() to use a switch statement,
> then the compiler can catch unchecked enums for us too.
> 
> If you don't want to go the enum route, then you could use a different
> error code and propagate it through, like -EEXISTS for the new case?
> That way the return is still 0 on success and less than 0 on failure,
> but I think the enum idea has a number of advantages.

I am open, actually prefer enum with switch statement as you suggested 
above.

What about folio_set_hugetlb_hwpoison()?
Indeed the conventional way of folio_set_X_Y() returns only two possible
values, but we need three.
How about changing the function name to set_hugetlb_hwpoison() to 
deviate from the convention?  afterall, the function does more than 
conventional bit setting, it maintains a per folio raw-error linked list
to track the poisoned pages within.

Thanks!
-jane

> 
> Thanks,
> Liam
> 



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
  2025-12-18 23:10       ` jane.chu
@ 2025-12-19  4:09         ` Liam R. Howlett
  0 siblings, 0 replies; 6+ messages in thread
From: Liam R. Howlett @ 2025-12-19  4:09 UTC (permalink / raw)
  To: jane.chu
  Cc: David Hildenbrand (Red Hat),
	muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche,
	rientjes, akpm, lorenzo.stoakes, rppt, surenb, mhocko, linux-mm,
	linux-kernel

* jane.chu@oracle.com <jane.chu@oracle.com> [251218 18:10]:
...
> > * jane.chu@oracle.com <jane.chu@oracle.com> [251218 14:01]:
...
> > > On 12/18/2025 12:41 AM, David Hildenbrand (Red Hat) wrote:
> > > > On 12/16/25 22:56, Jane Chu wrote:
...

> > > > > -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
> > > > > page *page)
> > > > > +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
> > > > > page *page,
> > > > > +                    bool *samepg)
> > > > >    {
> > > > >        struct llist_head *head;
> > > > >        struct raw_hwp_page *raw_hwp;
> > > > > @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct
> > > > > folio *folio, struct page *page)
> > > > >            return -EHWPOISON;
> > > > >        head = raw_hwp_list_head(folio);
> > > > >        llist_for_each_entry(p, head->first, node) {
> > > > > -        if (p->page == page)
> > > > > +        if (p->page == page) {
> > > > > +            *samepg = true;
> > > > >                return -EHWPOISON;
> > > > > +        }
> > > > >        }
> > > > >        raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
> > > > >        if (raw_hwp) {
> > > > >            raw_hwp->page = page;
> > > > >            llist_add(&raw_hwp->node, head);
> > > > > -        /* the first error event will be counted in action_result(). */
> > > > > -        if (ret)
> > > > > -            num_poisoned_pages_inc(page_to_pfn(page));
> > > > >        } else {
> > > > >            /*
> > > > >             * Failed to save raw error info.  We no longer trace all
...

> > > > > try_memory_failure_hugetlb(unsigned long pfn, int flags, int
> > > > > *hugetlb
> > > > >                folio = page_folio(p);
> > > > >                res = kill_accessing_process(current,
> > > > > folio_pfn(folio), flags);
> > > > >            }
> > > > > -        action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> > > > > +        if (samepg)
> > > > > +            action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> > > > > +        else
> > > > > +            action_result(pfn, MF_MSG_HUGE, MF_FAILED);
> > > > 
> > > > Can't we somehow return that result from get_huge_page_for_hwpoison()
> > > > ... folio_set_hugetlb_hwpoison() differently? E.g., return an enum
> > > > instead of "-EHWPOISON" or magic value "2".
> > > 
> > > This is an option. The existing return codes are as follow.
> > > __get_huge_page_for_hwpoison():
> > >   * Return values:
> > >   *   0             - free hugepage
> > >   *   1             - in-use hugepage
> > >   *   2             - not a hugepage
> > >   *   -EBUSY        - the hugepage is busy (try to retry)
> > >   *   -EHWPOISON    - the hugepage is already hwpoisoned
> > > 
> > > folio_set_hugetlb_hwpoison()
> > > returns
> > > 	0:		folio was not poisoned before
> > > 	-EHWPOISON:	folio was poisoned before
> > > 
> > > To get rid of 'samepg', how about
> > > 
> > > __get_huge_page_for_hwpoison():
> > >   * Return values:
> > >   *   0             - free hugepage
> > >   *   1             - in-use hugepage
> > >   *   2             - not a hugepage
> > >   *   3		   - the hugepage is already hwpoisoned in different page
> > >   *   4  	   - the hugepage is already hwpoisoned in the same page
> > >   *   -EBUSY        - the hugepage is busy (try to retry)
> > > 
> > > folio_set_hugetlb_hwpoison()
> > > returns
> > > 	0:		folio was not poisoned before
> > > 	1:	        folio was poisoned before in different page
> > > 	2:	        folio was poisoned before in the same page
> > > 
> > > The whole point about identifying the same page is so that the re-poison
> > > event is not doubled counted.
> > 
> > This means folio_set_hugetlb_hwpoison() returns 0 on success but
> > positives on error.. this seems to be going further away from the
> > standard way of doing things?
> 
> Yes.
> > > It would actually be good to remove all magic values instead of
> > expanding them.
> > 
> > I think what David was trying to say is to have a local enum that states
> > what these numbers mean so that the code reads more cleanly, instead of
> > digging for the right comment to decode it.
> > 
> > For example, in try_memory_failure_hugetlb():
> > 
> > if (res == 2) { /* fallback to normal page handling */
> > 
> > vs:
> > 
> > if (res == MEMORY_FAILURE_NOT_HUGEPAGE) { /* fallback to normal page handling */
> > 
> > You could spell out your other options as well.  Maybe something like
> > MEMORY_FAILURE_HWPOISONED_ALREADY_COUNTED
> > MEMORY_FAILURE_HWPOISONED
> > 
> > This would avoid adding more magic values and increase readability.
> > 
> > If you changed try_memory_failure_hugetlb() to use a switch statement,
> > then the compiler can catch unchecked enums for us too.
> > 
> > If you don't want to go the enum route, then you could use a different
> > error code and propagate it through, like -EEXISTS for the new case?
> > That way the return is still 0 on success and less than 0 on failure,
> > but I think the enum idea has a number of advantages.
> 
> I am open, actually prefer enum with switch statement as you suggested
> above.

It's David's suggestion, really :)

> 
> What about folio_set_hugetlb_hwpoison()?
> Indeed the conventional way of folio_set_X_Y() returns only two possible
> values, but we need three.

I am having a hard time finding any folio_set_* that returns anything.

$ git grep int\ folio_set_|wc -l
1
$ git grep void\ folio_set_|wc -l
20

> How about changing the function name to set_hugetlb_hwpoison() to deviate
> from the convention?  afterall, the function does more than conventional bit
> setting, it maintains a per folio raw-error linked list
> to track the poisoned pages within.

I think that's a good idea considering it seems like folio_set_ implies
that it will be setting something unconditionally, and this function
does not always set something and does even more work.

In fact the names make no sense to begin with.
get_huge_page_for_hwpoison() ends up actually calling
folio_set_hugetlb_hwpoison(), so it's not getting the huge page at all,
it's doing (at least some of) the work.

So, yeah, renaming it isn't going to make things worse.  I'd try to
indicate that folio_set_hugetlb_hwpoison() might not do anything.  It's
static and has hugetlb in the name, so we're pretty safe with anything.

Maybe update_hugetlb_hwpoison_list() ?

Thanks,
Liam



^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-12-19  4:10 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-12-16 21:56 [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison Jane Chu
2025-12-18  8:41 ` David Hildenbrand (Red Hat)
2025-12-18 19:01   ` jane.chu
2025-12-18 20:26     ` Liam R. Howlett
2025-12-18 23:10       ` jane.chu
2025-12-19  4:09         ` Liam R. Howlett

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox