* [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
@ 2025-12-16 21:56 Jane Chu
2025-12-18 8:41 ` David Hildenbrand (Red Hat)
0 siblings, 1 reply; 6+ messages in thread
From: Jane Chu @ 2025-12-16 21:56 UTC (permalink / raw)
To: muchun.song, osalvador, david, linmiaohe, jiaqiyan,
william.roche, rientjes, akpm, lorenzo.stoakes, Liam.Howlett,
rppt, surenb, mhocko, linux-mm, linux-kernel
When a newly poisoned subpage ends up in an already poisoned hugetlb
folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
is not. Fix the inconsistency by designating action_result() to update
them both.
Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats to pglist_data")
Cc: <stable@vger.kernel.org>
Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
include/linux/hugetlb.h | 4 ++--
include/linux/mm.h | 4 ++--
mm/hugetlb.c | 4 ++--
mm/memory-failure.c | 22 +++++++++++++---------
4 files changed, 19 insertions(+), 15 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8e63e46b8e1f..2e6690c9df96 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared);
+ bool *migratable_cleared, bool *samepg);
void folio_putback_hugetlb(struct folio *folio);
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
void hugetlb_fix_reserve_counts(struct inode *inode);
@@ -420,7 +420,7 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
}
static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
+ bool *migratable_cleared, bool *samepg)
{
return 0;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7c79b3369b82..68b1812e9c0a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long pfn, int flags);
extern const struct attribute_group memory_failure_attr_group;
extern void memory_failure_queue(unsigned long pfn, int flags);
extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared);
+ bool *migratable_cleared, bool *samepg);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
#else
@@ -4045,7 +4045,7 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
}
static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
+ bool *migratable_cleared, bool *samepg)
{
return 0;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0455119716ec..f78562a578e5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
}
int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
+ bool *migratable_cleared, bool *samepg)
{
int ret;
spin_lock_irq(&hugetlb_lock);
- ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
+ ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared, samepg);
spin_unlock_irq(&hugetlb_lock);
return ret;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3edebb0cda30..070f43bb110a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1873,7 +1873,8 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
return count;
}
-static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
+static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page,
+ bool *samepg)
{
struct llist_head *head;
struct raw_hwp_page *raw_hwp;
@@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
return -EHWPOISON;
head = raw_hwp_list_head(folio);
llist_for_each_entry(p, head->first, node) {
- if (p->page == page)
+ if (p->page == page) {
+ *samepg = true;
return -EHWPOISON;
+ }
}
raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
if (raw_hwp) {
raw_hwp->page = page;
llist_add(&raw_hwp->node, head);
- /* the first error event will be counted in action_result(). */
- if (ret)
- num_poisoned_pages_inc(page_to_pfn(page));
} else {
/*
* Failed to save raw error info. We no longer trace all
@@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
* -EHWPOISON - the hugepage is already hwpoisoned
*/
int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
+ bool *migratable_cleared, bool *samepg)
{
struct page *page = pfn_to_page(pfn);
struct folio *folio = page_folio(page);
@@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
goto out;
}
- if (folio_set_hugetlb_hwpoison(folio, page)) {
+ if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
ret = -EHWPOISON;
goto out;
}
@@ -2014,11 +2014,12 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
struct page *p = pfn_to_page(pfn);
struct folio *folio;
unsigned long page_flags;
+ bool samepg = false;
bool migratable_cleared = false;
*hugetlb = 1;
retry:
- res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
+ res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared, &samepg);
if (res == 2) { /* fallback to normal page handling */
*hugetlb = 0;
return 0;
@@ -2027,7 +2028,10 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
folio = page_folio(p);
res = kill_accessing_process(current, folio_pfn(folio), flags);
}
- action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
+ if (samepg)
+ action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
+ else
+ action_result(pfn, MF_MSG_HUGE, MF_FAILED);
return res;
} else if (res == -EBUSY) {
if (!(flags & MF_NO_RETRY)) {
--
2.43.5
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
2025-12-16 21:56 [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison Jane Chu
@ 2025-12-18 8:41 ` David Hildenbrand (Red Hat)
2025-12-18 19:01 ` jane.chu
0 siblings, 1 reply; 6+ messages in thread
From: David Hildenbrand (Red Hat) @ 2025-12-18 8:41 UTC (permalink / raw)
To: Jane Chu, muchun.song, osalvador, linmiaohe, jiaqiyan,
william.roche, rientjes, akpm, lorenzo.stoakes, Liam.Howlett,
rppt, surenb, mhocko, linux-mm, linux-kernel
On 12/16/25 22:56, Jane Chu wrote:
> When a newly poisoned subpage ends up in an already poisoned hugetlb
The concept of subpages does not exist. It's a page of a hugetlb folio.
> folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
> is not. Fix the inconsistency by designating action_result() to update
> them both.
What is the user-visible result of that?
>
> Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats to pglist_data")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Jane Chu <jane.chu@oracle.com>
> ---
> include/linux/hugetlb.h | 4 ++--
> include/linux/mm.h | 4 ++--
> mm/hugetlb.c | 4 ++--
> mm/memory-failure.c | 22 +++++++++++++---------
> 4 files changed, 19 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 8e63e46b8e1f..2e6690c9df96 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
> bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
> int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
> int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> - bool *migratable_cleared);
> + bool *migratable_cleared, bool *samepg);
> void folio_putback_hugetlb(struct folio *folio);
> void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
> void hugetlb_fix_reserve_counts(struct inode *inode);
> @@ -420,7 +420,7 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
> }
>
> static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> - bool *migratable_cleared)
> + bool *migratable_cleared, bool *samepg)
> {
> return 0;
> }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7c79b3369b82..68b1812e9c0a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long pfn, int flags);
> extern const struct attribute_group memory_failure_attr_group;
> extern void memory_failure_queue(unsigned long pfn, int flags);
> extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> - bool *migratable_cleared);
> + bool *migratable_cleared, bool *samepg);
> void num_poisoned_pages_inc(unsigned long pfn);
> void num_poisoned_pages_sub(unsigned long pfn, long i);
> #else
> @@ -4045,7 +4045,7 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
> }
>
> static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> - bool *migratable_cleared)
> + bool *migratable_cleared, bool *samepg)
> {
> return 0;
> }
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 0455119716ec..f78562a578e5 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
> }
>
> int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> - bool *migratable_cleared)
> + bool *migratable_cleared, bool *samepg)
> {
> int ret;
>
> spin_lock_irq(&hugetlb_lock);
> - ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
> + ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared, samepg);
> spin_unlock_irq(&hugetlb_lock);
> return ret;
> }
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 3edebb0cda30..070f43bb110a 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1873,7 +1873,8 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
> return count;
> }
>
> -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page,
> + bool *samepg)
> {
> struct llist_head *head;
> struct raw_hwp_page *raw_hwp;
> @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> return -EHWPOISON;
> head = raw_hwp_list_head(folio);
> llist_for_each_entry(p, head->first, node) {
> - if (p->page == page)
> + if (p->page == page) {
> + *samepg = true;
> return -EHWPOISON;
> + }
> }
>
> raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
> if (raw_hwp) {
> raw_hwp->page = page;
> llist_add(&raw_hwp->node, head);
> - /* the first error event will be counted in action_result(). */
> - if (ret)
> - num_poisoned_pages_inc(page_to_pfn(page));
> } else {
> /*
> * Failed to save raw error info. We no longer trace all
> @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
> * -EHWPOISON - the hugepage is already hwpoisoned
> */
> int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> - bool *migratable_cleared)
> + bool *migratable_cleared, bool *samepg)
> {
> struct page *page = pfn_to_page(pfn);
> struct folio *folio = page_folio(page);
> @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> goto out;
> }
>
> - if (folio_set_hugetlb_hwpoison(folio, page)) {
> + if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
> ret = -EHWPOISON;
> goto out;
> }
> @@ -2014,11 +2014,12 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
> struct page *p = pfn_to_page(pfn);
> struct folio *folio;
> unsigned long page_flags;
> + bool samepg = false;
> bool migratable_cleared = false;
>
> *hugetlb = 1;
> retry:
> - res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
> + res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared, &samepg);
> if (res == 2) { /* fallback to normal page handling */
> *hugetlb = 0;
> return 0;
> @@ -2027,7 +2028,10 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
> folio = page_folio(p);
> res = kill_accessing_process(current, folio_pfn(folio), flags);
> }
> - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> + if (samepg)
> + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> + else
> + action_result(pfn, MF_MSG_HUGE, MF_FAILED);
Can't we somehow return that result from get_huge_page_for_hwpoison()
... folio_set_hugetlb_hwpoison() differently? E.g., return an enum
instead of "-EHWPOISON" or magic value "2".
"samepg" is pretty much unreadable. Same with what?
What you really mean is "page was already hwpoisoned".
In an enum you might be better able to describe the various scenarios.
--
Cheers
David
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
2025-12-18 8:41 ` David Hildenbrand (Red Hat)
@ 2025-12-18 19:01 ` jane.chu
2025-12-18 20:26 ` Liam R. Howlett
0 siblings, 1 reply; 6+ messages in thread
From: jane.chu @ 2025-12-18 19:01 UTC (permalink / raw)
To: David Hildenbrand (Red Hat),
muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche,
rientjes, akpm, lorenzo.stoakes, Liam.Howlett, rppt, surenb,
mhocko, linux-mm, linux-kernel
Hi, David,
Thanks for the review.
On 12/18/2025 12:41 AM, David Hildenbrand (Red Hat) wrote:
> On 12/16/25 22:56, Jane Chu wrote:
>> When a newly poisoned subpage ends up in an already poisoned hugetlb
>
> The concept of subpages does not exist. It's a page of a hugetlb folio.
Okay.
>
>> folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
>> is not. Fix the inconsistency by designating action_result() to update
>> them both.
>
> What is the user-visible result of that?
For the purpose of observation, and potential action afterwards.
# cat /proc/meminfo | grep HardwareCorrupted
shows 'num_poisoned_pages', the global count of poisoned pages.
# ls /sys/devices/system/node/node0/memory_failure
delayed failed ignored recovered total
These fields show the per-node ->mf_stats, i.e. the MF handling results.
>
>>
>> Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats
>> to pglist_data")
>> Cc: <stable@vger.kernel.org>
>> Signed-off-by: Jane Chu <jane.chu@oracle.com>
>> ---
>> include/linux/hugetlb.h | 4 ++--
>> include/linux/mm.h | 4 ++--
>> mm/hugetlb.c | 4 ++--
>> mm/memory-failure.c | 22 +++++++++++++---------
>> 4 files changed, 19 insertions(+), 15 deletions(-)
>>
>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>> index 8e63e46b8e1f..2e6690c9df96 100644
>> --- a/include/linux/hugetlb.h
>> +++ b/include/linux/hugetlb.h
>> @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode *inode,
>> long start, long end,
>> bool folio_isolate_hugetlb(struct folio *folio, struct list_head
>> *list);
>> int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>> bool unpoison);
>> int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> - bool *migratable_cleared);
>> + bool *migratable_cleared, bool *samepg);
>> void folio_putback_hugetlb(struct folio *folio);
>> void move_hugetlb_state(struct folio *old_folio, struct folio
>> *new_folio, int reason);
>> void hugetlb_fix_reserve_counts(struct inode *inode);
>> @@ -420,7 +420,7 @@ static inline int
>> get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>> }
>> static inline int get_huge_page_for_hwpoison(unsigned long pfn, int
>> flags,
>> - bool *migratable_cleared)
>> + bool *migratable_cleared, bool *samepg)
>> {
>> return 0;
>> }
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 7c79b3369b82..68b1812e9c0a 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long pfn,
>> int flags);
>> extern const struct attribute_group memory_failure_attr_group;
>> extern void memory_failure_queue(unsigned long pfn, int flags);
>> extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> - bool *migratable_cleared);
>> + bool *migratable_cleared, bool *samepg);
>> void num_poisoned_pages_inc(unsigned long pfn);
>> void num_poisoned_pages_sub(unsigned long pfn, long i);
>> #else
>> @@ -4045,7 +4045,7 @@ static inline void memory_failure_queue(unsigned
>> long pfn, int flags)
>> }
>> static inline int __get_huge_page_for_hwpoison(unsigned long pfn,
>> int flags,
>> - bool *migratable_cleared)
>> + bool *migratable_cleared, bool *samepg)
>> {
>> return 0;
>> }
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 0455119716ec..f78562a578e5 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio
>> *folio, bool *hugetlb, bool unpoison
>> }
>> int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> - bool *migratable_cleared)
>> + bool *migratable_cleared, bool *samepg)
>> {
>> int ret;
>> spin_lock_irq(&hugetlb_lock);
>> - ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
>> + ret = __get_huge_page_for_hwpoison(pfn, flags,
>> migratable_cleared, samepg);
>> spin_unlock_irq(&hugetlb_lock);
>> return ret;
>> }
>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>> index 3edebb0cda30..070f43bb110a 100644
>> --- a/mm/memory-failure.c
>> +++ b/mm/memory-failure.c
>> @@ -1873,7 +1873,8 @@ static unsigned long __folio_free_raw_hwp(struct
>> folio *folio, bool move_flag)
>> return count;
>> }
>> -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
>> page *page)
>> +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
>> page *page,
>> + bool *samepg)
>> {
>> struct llist_head *head;
>> struct raw_hwp_page *raw_hwp;
>> @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct
>> folio *folio, struct page *page)
>> return -EHWPOISON;
>> head = raw_hwp_list_head(folio);
>> llist_for_each_entry(p, head->first, node) {
>> - if (p->page == page)
>> + if (p->page == page) {
>> + *samepg = true;
>> return -EHWPOISON;
>> + }
>> }
>> raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
>> if (raw_hwp) {
>> raw_hwp->page = page;
>> llist_add(&raw_hwp->node, head);
>> - /* the first error event will be counted in action_result(). */
>> - if (ret)
>> - num_poisoned_pages_inc(page_to_pfn(page));
>> } else {
>> /*
>> * Failed to save raw error info. We no longer trace all
>> @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio
>> *folio)
>> * -EHWPOISON - the hugepage is already hwpoisoned
>> */
>> int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>> - bool *migratable_cleared)
>> + bool *migratable_cleared, bool *samepg)
>> {
>> struct page *page = pfn_to_page(pfn);
>> struct folio *folio = page_folio(page);
>> @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long
>> pfn, int flags,
>> goto out;
>> }
>> - if (folio_set_hugetlb_hwpoison(folio, page)) {
>> + if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
>> ret = -EHWPOISON;
>> goto out;
>> }
>> @@ -2014,11 +2014,12 @@ static int try_memory_failure_hugetlb(unsigned
>> long pfn, int flags, int *hugetlb
>> struct page *p = pfn_to_page(pfn);
>> struct folio *folio;
>> unsigned long page_flags;
>> + bool samepg = false;
>> bool migratable_cleared = false;
>> *hugetlb = 1;
>> retry:
>> - res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
>> + res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared,
>> &samepg);
>> if (res == 2) { /* fallback to normal page handling */
>> *hugetlb = 0;
>> return 0;
>> @@ -2027,7 +2028,10 @@ static int try_memory_failure_hugetlb(unsigned
>> long pfn, int flags, int *hugetlb
>> folio = page_folio(p);
>> res = kill_accessing_process(current, folio_pfn(folio),
>> flags);
>> }
>> - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
>> + if (samepg)
>> + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
>> + else
>> + action_result(pfn, MF_MSG_HUGE, MF_FAILED);
>
> Can't we somehow return that result from
> get_huge_page_for_hwpoison() ... folio_set_hugetlb_hwpoison()
> differently? E.g., return an enum instead of "-EHWPOISON" or magic value
> "2".
This is an option. The existing return codes are as follows.
__get_huge_page_for_hwpoison():
* Return values:
* 0 - free hugepage
* 1 - in-use hugepage
* 2 - not a hugepage
* -EBUSY - the hugepage is busy (try to retry)
* -EHWPOISON - the hugepage is already hwpoisoned
folio_set_hugetlb_hwpoison()
returns
0: folio was not poisoned before
-EHWPOISON: folio was poisoned before
To get rid of 'samepg', how about
__get_huge_page_for_hwpoison():
* Return values:
* 0 - free hugepage
* 1 - in-use hugepage
* 2 - not a hugepage
* 3 - the hugepage is already hwpoisoned in different page
* 4 - the hugepage is already hwpoisoned in the same page
* -EBUSY - the hugepage is busy (try to retry)
folio_set_hugetlb_hwpoison()
returns
0: folio was not poisoned before
1: folio was poisoned before in different page
2: folio was poisoned before in the same page
The whole point of identifying the same page is so that the re-poison
event is not double counted.
>
> "samepg" is petty much unreadable. Same with what?
>
> What you really mean is "page was already hwpoisoned".
>
For example, a previously poison-free hugetlb folio is detected with its
3rd tail page poisoned; the MF handler marks the folio as poisoned and
records the 3rd tail page. Sometime later, the same hugetlb folio is again
detected as poisoned. This time, the poisoned page might be the 3rd
tail page (indicated by 'samepg' = true), in which case the MF handler
does not update the stats. Or, the poisoned page might be a new tail
page, say the 7th tail page, in which case the MF handler records the 7th
tail page and updates the user-visible stats.
I have struggled a bit with the naming, wondering whether "repoison"
might work better, or a longer name like "same-page-repoisoned"?
> In an enum you might be better able to describe the various scenarios.
>
Thanks!
-jane
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
2025-12-18 19:01 ` jane.chu
@ 2025-12-18 20:26 ` Liam R. Howlett
2025-12-18 23:10 ` jane.chu
0 siblings, 1 reply; 6+ messages in thread
From: Liam R. Howlett @ 2025-12-18 20:26 UTC (permalink / raw)
To: jane.chu
Cc: David Hildenbrand (Red Hat),
muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche,
rientjes, akpm, lorenzo.stoakes, rppt, surenb, mhocko, linux-mm,
linux-kernel
* jane.chu@oracle.com <jane.chu@oracle.com> [251218 14:01]:
> Hi, David,
>
> Thanks for the review.
>
> On 12/18/2025 12:41 AM, David Hildenbrand (Red Hat) wrote:
> > On 12/16/25 22:56, Jane Chu wrote:
> > > When a newly poisoned subpage ends up in an already poisoned hugetlb
> >
> > The concept of subpages does not exist. It's a page of a hugetlb folio.
>
> Okay.
>
> >
> > > folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
> > > is not. Fix the inconsistency by designating action_result() to update
> > > them both.
> >
> > What is the user-visible result of that?
>
> For the purpose of observation, and potential action afterwards.
>
> # cat /proc/meminfo | grep HardwareCorrupted
> shows 'num_poisoned_pages', the global count of poisoned pages.
>
> # ls /sys/devices/system/node/node0/memory_failure
> delayed failed ignored recovered total
> these fields show the per node ->mf_stats, that is the MF handling results.
>
> >
> > >
> > > Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats
> > > to pglist_data")
> > > Cc: <stable@vger.kernel.org>
> > > Signed-off-by: Jane Chu <jane.chu@oracle.com>
> > > ---
> > > include/linux/hugetlb.h | 4 ++--
> > > include/linux/mm.h | 4 ++--
> > > mm/hugetlb.c | 4 ++--
> > > mm/memory-failure.c | 22 +++++++++++++---------
> > > 4 files changed, 19 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > > index 8e63e46b8e1f..2e6690c9df96 100644
> > > --- a/include/linux/hugetlb.h
> > > +++ b/include/linux/hugetlb.h
> > > @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode
> > > *inode, long start, long end,
> > > bool folio_isolate_hugetlb(struct folio *folio, struct list_head
> > > *list);
> > > int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
> > > bool unpoison);
> > > int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> > > - bool *migratable_cleared);
> > > + bool *migratable_cleared, bool *samepg);
> > > void folio_putback_hugetlb(struct folio *folio);
> > > void move_hugetlb_state(struct folio *old_folio, struct folio
> > > *new_folio, int reason);
> > > void hugetlb_fix_reserve_counts(struct inode *inode);
> > > @@ -420,7 +420,7 @@ static inline int
> > > get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
> > > }
> > > static inline int get_huge_page_for_hwpoison(unsigned long pfn,
> > > int flags,
> > > - bool *migratable_cleared)
> > > + bool *migratable_cleared, bool *samepg)
> > > {
> > > return 0;
> > > }
> > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > index 7c79b3369b82..68b1812e9c0a 100644
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long
> > > pfn, int flags);
> > > extern const struct attribute_group memory_failure_attr_group;
> > > extern void memory_failure_queue(unsigned long pfn, int flags);
> > > extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> > > - bool *migratable_cleared);
> > > + bool *migratable_cleared, bool *samepg);
> > > void num_poisoned_pages_inc(unsigned long pfn);
> > > void num_poisoned_pages_sub(unsigned long pfn, long i);
> > > #else
> > > @@ -4045,7 +4045,7 @@ static inline void
> > > memory_failure_queue(unsigned long pfn, int flags)
> > > }
> > > static inline int __get_huge_page_for_hwpoison(unsigned long pfn,
> > > int flags,
> > > - bool *migratable_cleared)
> > > + bool *migratable_cleared, bool *samepg)
> > > {
> > > return 0;
> > > }
> > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > > index 0455119716ec..f78562a578e5 100644
> > > --- a/mm/hugetlb.c
> > > +++ b/mm/hugetlb.c
> > > @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio
> > > *folio, bool *hugetlb, bool unpoison
> > > }
> > > int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> > > - bool *migratable_cleared)
> > > + bool *migratable_cleared, bool *samepg)
> > > {
> > > int ret;
> > > spin_lock_irq(&hugetlb_lock);
> > > - ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
> > > + ret = __get_huge_page_for_hwpoison(pfn, flags,
> > > migratable_cleared, samepg);
> > > spin_unlock_irq(&hugetlb_lock);
> > > return ret;
> > > }
> > > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > > index 3edebb0cda30..070f43bb110a 100644
> > > --- a/mm/memory-failure.c
> > > +++ b/mm/memory-failure.c
> > > @@ -1873,7 +1873,8 @@ static unsigned long
> > > __folio_free_raw_hwp(struct folio *folio, bool move_flag)
> > > return count;
> > > }
> > > -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
> > > page *page)
> > > +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
> > > page *page,
> > > + bool *samepg)
> > > {
> > > struct llist_head *head;
> > > struct raw_hwp_page *raw_hwp;
> > > @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct
> > > folio *folio, struct page *page)
> > > return -EHWPOISON;
> > > head = raw_hwp_list_head(folio);
> > > llist_for_each_entry(p, head->first, node) {
> > > - if (p->page == page)
> > > + if (p->page == page) {
> > > + *samepg = true;
> > > return -EHWPOISON;
> > > + }
> > > }
> > > raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
> > > if (raw_hwp) {
> > > raw_hwp->page = page;
> > > llist_add(&raw_hwp->node, head);
> > > - /* the first error event will be counted in action_result(). */
> > > - if (ret)
> > > - num_poisoned_pages_inc(page_to_pfn(page));
> > > } else {
> > > /*
> > > * Failed to save raw error info. We no longer trace all
> > > @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio
> > > *folio)
> > > * -EHWPOISON - the hugepage is already hwpoisoned
> > > */
> > > int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> > > - bool *migratable_cleared)
> > > + bool *migratable_cleared, bool *samepg)
> > > {
> > > struct page *page = pfn_to_page(pfn);
> > > struct folio *folio = page_folio(page);
> > > @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long
> > > pfn, int flags,
> > > goto out;
> > > }
> > > - if (folio_set_hugetlb_hwpoison(folio, page)) {
> > > + if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
> > > ret = -EHWPOISON;
> > > goto out;
> > > }
> > > @@ -2014,11 +2014,12 @@ static int
> > > try_memory_failure_hugetlb(unsigned long pfn, int flags, int
> > > *hugetlb
> > > struct page *p = pfn_to_page(pfn);
> > > struct folio *folio;
> > > unsigned long page_flags;
> > > + bool samepg = false;
> > > bool migratable_cleared = false;
> > > *hugetlb = 1;
> > > retry:
> > > - res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
> > > + res = get_huge_page_for_hwpoison(pfn, flags,
> > > &migratable_cleared, &samepg);
> > > if (res == 2) { /* fallback to normal page handling */
> > > *hugetlb = 0;
> > > return 0;
> > > @@ -2027,7 +2028,10 @@ static int
> > > try_memory_failure_hugetlb(unsigned long pfn, int flags, int
> > > *hugetlb
> > > folio = page_folio(p);
> > > res = kill_accessing_process(current,
> > > folio_pfn(folio), flags);
> > > }
> > > - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> > > + if (samepg)
> > > + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> > > + else
> > > + action_result(pfn, MF_MSG_HUGE, MF_FAILED);
> >
> > Can't we somehow return that result from get_huge_page_for_hwpoison()
> > ... folio_set_hugetlb_hwpoison() differently? E.g., return an enum
> > instead of "-EHWPOISON" or magic value "2".
>
> This is an option. The existing return codes are as follow.
> __get_huge_page_for_hwpoison():
> * Return values:
> * 0 - free hugepage
> * 1 - in-use hugepage
> * 2 - not a hugepage
> * -EBUSY - the hugepage is busy (try to retry)
> * -EHWPOISON - the hugepage is already hwpoisoned
>
> folio_set_hugetlb_hwpoison()
> returns
> 0: folio was not poisoned before
> -EHWPOISON: folio was poisoned before
>
> To get rid of 'samepg', how about
>
> __get_huge_page_for_hwpoison():
> * Return values:
> * 0 - free hugepage
> * 1 - in-use hugepage
> * 2 - not a hugepage
> * 3 - the hugepage is already hwpoisoned in different page
> * 4 - the hugepage is already hwpoisoned in the same page
> * -EBUSY - the hugepage is busy (try to retry)
>
> folio_set_hugetlb_hwpoison()
> returns
> 0: folio was not poisoned before
> 1: folio was poisoned before in different page
> 2: folio was poisoned before in the same page
>
> The whole point about identifying the same page is so that the re-poison
> event is not doubled counted.
This means folio_set_hugetlb_hwpoison() returns 0 on success but
positive values on error. This seems to be moving further away from the
standard way of doing things?
It would actually be good to remove all magic values instead of
expanding them.
I think what David was trying to say is to have a local enum that states
what these numbers mean so that the code reads more cleanly, instead of
digging for the right comment to decode it.
For example, in try_memory_failure_hugetlb():
if (res == 2) { /* fallback to normal page handling */
vs:
if (res == MEMORY_FAILURE_NOT_HUGEPAGE) { /* fallback to normal page handling */
You could spell out your other options as well. Maybe something like
MEMORY_FAILURE_HWPOISONED_ALREADY_COUNTED
MEMORY_FAILURE_HWPOISONED
This would avoid adding more magic values and increase readability.
If you changed try_memory_failure_hugetlb() to use a switch statement,
then the compiler can catch unchecked enums for us too.
If you don't want to go the enum route, then you could use a different
error code and propagate it through, like -EEXIST for the new case?
That way the return is still 0 on success and less than 0 on failure,
but I think the enum idea has a number of advantages.
Thanks,
Liam
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
2025-12-18 20:26 ` Liam R. Howlett
@ 2025-12-18 23:10 ` jane.chu
2025-12-19 4:09 ` Liam R. Howlett
0 siblings, 1 reply; 6+ messages in thread
From: jane.chu @ 2025-12-18 23:10 UTC (permalink / raw)
To: Liam R. Howlett, David Hildenbrand (Red Hat),
muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche,
rientjes, akpm, lorenzo.stoakes, rppt, surenb, mhocko, linux-mm,
linux-kernel
Hi, Liam,
Thanks! My reply towards the end.
On 12/18/2025 12:26 PM, Liam R. Howlett wrote:
> * jane.chu@oracle.com <jane.chu@oracle.com> [251218 14:01]:
>> Hi, David,
>>
>> Thanks for the review.
>>
>> On 12/18/2025 12:41 AM, David Hildenbrand (Red Hat) wrote:
>>> On 12/16/25 22:56, Jane Chu wrote:
>>>> When a newly poisoned subpage ends up in an already poisoned hugetlb
>>>
>>> The concept of subpages does not exist. It's a page of a hugetlb folio.
>>
>> Okay.
>>
>>>
>>>> folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
>>>> is not. Fix the inconsistency by designating action_result() to update
>>>> them both.
>>>
>>> What is the user-visible result of that?
>>
>> For the purpose of observation, and potential action afterwards.
>>
>> # cat /proc/meminfo | grep HardwareCorrupted
>> shows 'num_poisoned_pages', the global count of poisoned pages.
>>
>> # ls /sys/devices/system/node/node0/memory_failure
>> delayed failed ignored recovered total
>> these fields show the per node ->mf_stats, that is the MF handling results.
>>
>>>
>>>>
>>>> Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats
>>>> to pglist_data")
>>>> Cc: <stable@vger.kernel.org>
>>>> Signed-off-by: Jane Chu <jane.chu@oracle.com>
>>>> ---
>>>> include/linux/hugetlb.h | 4 ++--
>>>> include/linux/mm.h | 4 ++--
>>>> mm/hugetlb.c | 4 ++--
>>>> mm/memory-failure.c | 22 +++++++++++++---------
>>>> 4 files changed, 19 insertions(+), 15 deletions(-)
>>>>
>>>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>>>> index 8e63e46b8e1f..2e6690c9df96 100644
>>>> --- a/include/linux/hugetlb.h
>>>> +++ b/include/linux/hugetlb.h
>>>> @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode
>>>> *inode, long start, long end,
>>>> bool folio_isolate_hugetlb(struct folio *folio, struct list_head
>>>> *list);
>>>> int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>>>> bool unpoison);
>>>> int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>>>> - bool *migratable_cleared);
>>>> + bool *migratable_cleared, bool *samepg);
>>>> void folio_putback_hugetlb(struct folio *folio);
>>>> void move_hugetlb_state(struct folio *old_folio, struct folio
>>>> *new_folio, int reason);
>>>> void hugetlb_fix_reserve_counts(struct inode *inode);
>>>> @@ -420,7 +420,7 @@ static inline int
>>>> get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
>>>> }
>>>> static inline int get_huge_page_for_hwpoison(unsigned long pfn,
>>>> int flags,
>>>> - bool *migratable_cleared)
>>>> + bool *migratable_cleared, bool *samepg)
>>>> {
>>>> return 0;
>>>> }
>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>> index 7c79b3369b82..68b1812e9c0a 100644
>>>> --- a/include/linux/mm.h
>>>> +++ b/include/linux/mm.h
>>>> @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long
>>>> pfn, int flags);
>>>> extern const struct attribute_group memory_failure_attr_group;
>>>> extern void memory_failure_queue(unsigned long pfn, int flags);
>>>> extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>>>> - bool *migratable_cleared);
>>>> + bool *migratable_cleared, bool *samepg);
>>>> void num_poisoned_pages_inc(unsigned long pfn);
>>>> void num_poisoned_pages_sub(unsigned long pfn, long i);
>>>> #else
>>>> @@ -4045,7 +4045,7 @@ static inline void
>>>> memory_failure_queue(unsigned long pfn, int flags)
>>>> }
>>>> static inline int __get_huge_page_for_hwpoison(unsigned long pfn,
>>>> int flags,
>>>> - bool *migratable_cleared)
>>>> + bool *migratable_cleared, bool *samepg)
>>>> {
>>>> return 0;
>>>> }
>>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>>> index 0455119716ec..f78562a578e5 100644
>>>> --- a/mm/hugetlb.c
>>>> +++ b/mm/hugetlb.c
>>>> @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio
>>>> *folio, bool *hugetlb, bool unpoison
>>>> }
>>>> int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>>>> - bool *migratable_cleared)
>>>> + bool *migratable_cleared, bool *samepg)
>>>> {
>>>> int ret;
>>>> spin_lock_irq(&hugetlb_lock);
>>>> - ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
>>>> + ret = __get_huge_page_for_hwpoison(pfn, flags,
>>>> migratable_cleared, samepg);
>>>> spin_unlock_irq(&hugetlb_lock);
>>>> return ret;
>>>> }
>>>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>>>> index 3edebb0cda30..070f43bb110a 100644
>>>> --- a/mm/memory-failure.c
>>>> +++ b/mm/memory-failure.c
>>>> @@ -1873,7 +1873,8 @@ static unsigned long
>>>> __folio_free_raw_hwp(struct folio *folio, bool move_flag)
>>>> return count;
>>>> }
>>>> -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
>>>> page *page)
>>>> +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
>>>> page *page,
>>>> + bool *samepg)
>>>> {
>>>> struct llist_head *head;
>>>> struct raw_hwp_page *raw_hwp;
>>>> @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct
>>>> folio *folio, struct page *page)
>>>> return -EHWPOISON;
>>>> head = raw_hwp_list_head(folio);
>>>> llist_for_each_entry(p, head->first, node) {
>>>> - if (p->page == page)
>>>> + if (p->page == page) {
>>>> + *samepg = true;
>>>> return -EHWPOISON;
>>>> + }
>>>> }
>>>> raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
>>>> if (raw_hwp) {
>>>> raw_hwp->page = page;
>>>> llist_add(&raw_hwp->node, head);
>>>> - /* the first error event will be counted in action_result(). */
>>>> - if (ret)
>>>> - num_poisoned_pages_inc(page_to_pfn(page));
>>>> } else {
>>>> /*
>>>> * Failed to save raw error info. We no longer trace all
>>>> @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio
>>>> *folio)
>>>> * -EHWPOISON - the hugepage is already hwpoisoned
>>>> */
>>>> int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
>>>> - bool *migratable_cleared)
>>>> + bool *migratable_cleared, bool *samepg)
>>>> {
>>>> struct page *page = pfn_to_page(pfn);
>>>> struct folio *folio = page_folio(page);
>>>> @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long
>>>> pfn, int flags,
>>>> goto out;
>>>> }
>>>> - if (folio_set_hugetlb_hwpoison(folio, page)) {
>>>> + if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
>>>> ret = -EHWPOISON;
>>>> goto out;
>>>> }
>>>> @@ -2014,11 +2014,12 @@ static int
>>>> try_memory_failure_hugetlb(unsigned long pfn, int flags, int
>>>> *hugetlb
>>>> struct page *p = pfn_to_page(pfn);
>>>> struct folio *folio;
>>>> unsigned long page_flags;
>>>> + bool samepg = false;
>>>> bool migratable_cleared = false;
>>>> *hugetlb = 1;
>>>> retry:
>>>> - res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
>>>> + res = get_huge_page_for_hwpoison(pfn, flags,
>>>> &migratable_cleared, &samepg);
>>>> if (res == 2) { /* fallback to normal page handling */
>>>> *hugetlb = 0;
>>>> return 0;
>>>> @@ -2027,7 +2028,10 @@ static int
>>>> try_memory_failure_hugetlb(unsigned long pfn, int flags, int
>>>> *hugetlb
>>>> folio = page_folio(p);
>>>> res = kill_accessing_process(current,
>>>> folio_pfn(folio), flags);
>>>> }
>>>> - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
>>>> + if (samepg)
>>>> + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
>>>> + else
>>>> + action_result(pfn, MF_MSG_HUGE, MF_FAILED);
>>>
>>> Can't we somehow return that result from get_huge_page_for_hwpoison()
>>> ... folio_set_hugetlb_hwpoison() differently? E.g., return an enum
>>> instead of "-EHWPOISON" or magic value "2".
>>
>> This is an option. The existing return codes are as follow.
>> __get_huge_page_for_hwpoison():
>> * Return values:
>> * 0 - free hugepage
>> * 1 - in-use hugepage
>> * 2 - not a hugepage
>> * -EBUSY - the hugepage is busy (try to retry)
>> * -EHWPOISON - the hugepage is already hwpoisoned
>>
>> folio_set_hugetlb_hwpoison()
>> returns
>> 0: folio was not poisoned before
>> -EHWPOISON: folio was poisoned before
>>
>> To get rid of 'samepg', how about
>>
>> __get_huge_page_for_hwpoison():
>> * Return values:
>> * 0 - free hugepage
>> * 1 - in-use hugepage
>> * 2 - not a hugepage
>> * 3 - the hugepage is already hwpoisoned in different page
>> * 4 - the hugepage is already hwpoisoned in the same page
>> * -EBUSY - the hugepage is busy (try to retry)
>>
>> folio_set_hugetlb_hwpoison()
>> returns
>> 0: folio was not poisoned before
>> 1: folio was poisoned before in different page
>> 2: folio was poisoned before in the same page
>>
>> The whole point about identifying the same page is so that the re-poison
>> event is not doubled counted.
>
> This means folio_set_hugetlb_hwpoison() returns 0 on success but
> positives on error.. this seems to be going further away from the
> standard way of doing things?
Yes.
> It would actually be good to remove all magic values instead of
> expanding them.
>
> I think what David was trying to say is to have a local enum that states
> what these numbers mean so that the code reads more cleanly, instead of
> digging for the right comment to decode it.
>
> For example, in try_memory_failure_hugetlb():
>
> if (res == 2) { /* fallback to normal page handling */
>
> vs:
>
> if (res == MEMORY_FAILURE_NOT_HUGEPAGE) { /* fallback to normal page handling */
>
> You could spell out your other options as well. Maybe something like
> MEMORY_FAILURE_HWPOISONED_ALREADY_COUNTED
> MEMORY_FAILURE_HWPOISONED
>
> This would avoid adding more magic values and increase readability.
>
> If you changed try_memory_failure_hugetlb() to use a switch statement,
> then the compiler can catch unchecked enums for us too.
>
> If you don't want to go the enum route, then you could use a different
> error code and propagate it through, like -EEXIST for the new case?
> That way the return is still 0 on success and less than 0 on failure,
> but I think the enum idea has a number of advantages.
I am open to that, and actually prefer the enum with a switch statement,
as you suggested above.
What about folio_set_hugetlb_hwpoison()?
Indeed the conventional way of folio_set_X_Y() returns only two possible
values, but we need three.
How about changing the function name to set_hugetlb_hwpoison() to
deviate from the convention? After all, the function does more than
conventional bit setting: it maintains a per-folio raw-error linked list
to track the poisoned pages within.
Thanks!
-jane
>
> Thanks,
> Liam
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
2025-12-18 23:10 ` jane.chu
@ 2025-12-19 4:09 ` Liam R. Howlett
0 siblings, 0 replies; 6+ messages in thread
From: Liam R. Howlett @ 2025-12-19 4:09 UTC (permalink / raw)
To: jane.chu
Cc: David Hildenbrand (Red Hat),
muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche,
rientjes, akpm, lorenzo.stoakes, rppt, surenb, mhocko, linux-mm,
linux-kernel
* jane.chu@oracle.com <jane.chu@oracle.com> [251218 18:10]:
...
> > * jane.chu@oracle.com <jane.chu@oracle.com> [251218 14:01]:
...
> > > On 12/18/2025 12:41 AM, David Hildenbrand (Red Hat) wrote:
> > > > On 12/16/25 22:56, Jane Chu wrote:
...
> > > > > -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
> > > > > page *page)
> > > > > +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct
> > > > > page *page,
> > > > > + bool *samepg)
> > > > > {
> > > > > struct llist_head *head;
> > > > > struct raw_hwp_page *raw_hwp;
> > > > > @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct
> > > > > folio *folio, struct page *page)
> > > > > return -EHWPOISON;
> > > > > head = raw_hwp_list_head(folio);
> > > > > llist_for_each_entry(p, head->first, node) {
> > > > > - if (p->page == page)
> > > > > + if (p->page == page) {
> > > > > + *samepg = true;
> > > > > return -EHWPOISON;
> > > > > + }
> > > > > }
> > > > > raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
> > > > > if (raw_hwp) {
> > > > > raw_hwp->page = page;
> > > > > llist_add(&raw_hwp->node, head);
> > > > > - /* the first error event will be counted in action_result(). */
> > > > > - if (ret)
> > > > > - num_poisoned_pages_inc(page_to_pfn(page));
> > > > > } else {
> > > > > /*
> > > > > * Failed to save raw error info. We no longer trace all
...
> > > > > try_memory_failure_hugetlb(unsigned long pfn, int flags, int
> > > > > *hugetlb
> > > > > folio = page_folio(p);
> > > > > res = kill_accessing_process(current,
> > > > > folio_pfn(folio), flags);
> > > > > }
> > > > > - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> > > > > + if (samepg)
> > > > > + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
> > > > > + else
> > > > > + action_result(pfn, MF_MSG_HUGE, MF_FAILED);
> > > >
> > > > Can't we somehow return that result from get_huge_page_for_hwpoison()
> > > > ... folio_set_hugetlb_hwpoison() differently? E.g., return an enum
> > > > instead of "-EHWPOISON" or magic value "2".
> > >
> > > This is an option. The existing return codes are as follow.
> > > __get_huge_page_for_hwpoison():
> > > * Return values:
> > > * 0 - free hugepage
> > > * 1 - in-use hugepage
> > > * 2 - not a hugepage
> > > * -EBUSY - the hugepage is busy (try to retry)
> > > * -EHWPOISON - the hugepage is already hwpoisoned
> > >
> > > folio_set_hugetlb_hwpoison()
> > > returns
> > > 0: folio was not poisoned before
> > > -EHWPOISON: folio was poisoned before
> > >
> > > To get rid of 'samepg', how about
> > >
> > > __get_huge_page_for_hwpoison():
> > > * Return values:
> > > * 0 - free hugepage
> > > * 1 - in-use hugepage
> > > * 2 - not a hugepage
> > > * 3 - the hugepage is already hwpoisoned in different page
> > > * 4 - the hugepage is already hwpoisoned in the same page
> > > * -EBUSY - the hugepage is busy (try to retry)
> > >
> > > folio_set_hugetlb_hwpoison()
> > > returns
> > > 0: folio was not poisoned before
> > > 1: folio was poisoned before in different page
> > > 2: folio was poisoned before in the same page
> > >
> > > The whole point about identifying the same page is so that the re-poison
> > > event is not doubled counted.
> >
> > This means folio_set_hugetlb_hwpoison() returns 0 on success but
> > positives on error.. this seems to be going further away from the
> > standard way of doing things?
>
> Yes.
> > It would actually be good to remove all magic values instead of
> > expanding them.
> >
> > I think what David was trying to say is to have a local enum that states
> > what these numbers mean so that the code reads more cleanly, instead of
> > digging for the right comment to decode it.
> >
> > For example, in try_memory_failure_hugetlb():
> >
> > if (res == 2) { /* fallback to normal page handling */
> >
> > vs:
> >
> > if (res == MEMORY_FAILURE_NOT_HUGEPAGE) { /* fallback to normal page handling */
> >
> > You could spell out your other options as well. Maybe something like
> > MEMORY_FAILURE_HWPOISONED_ALREADY_COUNTED
> > MEMORY_FAILURE_HWPOISONED
> >
> > This would avoid adding more magic values and increase readability.
> >
> > If you changed try_memory_failure_hugetlb() to use a switch statement,
> > then the compiler can catch unchecked enums for us too.
> >
> > If you don't want to go the enum route, then you could use a different
> > error code and propagate it through, like -EEXIST for the new case?
> > That way the return is still 0 on success and less than 0 on failure,
> > but I think the enum idea has a number of advantages.
>
> I am open, actually prefer enum with switch statement as you suggested
> above.
It's David's suggestion, really :)
>
> What about folio_set_hugetlb_hwpoison()?
> Indeed the conventional way of folio_set_X_Y() returns only two possible
> values, but we need three.
I am having a hard time finding any folio_set_* that returns anything.
$ git grep int\ folio_set_|wc -l
1
$ git grep void\ folio_set_|wc -l
20
> How about changing the function name to set_hugetlb_hwpoison() to deviate
> from the convention? After all, the function does more than conventional bit
> setting: it maintains a per-folio raw-error linked list
> to track the poisoned pages within.
I think that's a good idea considering it seems like folio_set_ implies
that it will be setting something unconditionally, and this function
does not always set something and does even more work.
In fact the names make no sense to begin with.
get_huge_page_for_hwpoison() ends up actually calling
folio_set_hugetlb_hwpoison(), so it's not getting the huge page at all,
it's doing (at least some of) the work.
So, yeah, renaming it isn't going to make things worse. I'd try to
indicate that folio_set_hugetlb_hwpoison() might not do anything. It's
static and has hugetlb in the name, so we're pretty safe with anything.
Maybe update_hugetlb_hwpoison_list() ?
Thanks,
Liam
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2025-12-19 4:10 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-12-16 21:56 [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison Jane Chu
2025-12-18 8:41 ` David Hildenbrand (Red Hat)
2025-12-18 19:01 ` jane.chu
2025-12-18 20:26 ` Liam R. Howlett
2025-12-18 23:10 ` jane.chu
2025-12-19 4:09 ` Liam R. Howlett
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox