* [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison
@ 2025-12-16 21:56 Jane Chu
2025-12-18 8:41 ` David Hildenbrand (Red Hat)
0 siblings, 1 reply; 6+ messages in thread
From: Jane Chu @ 2025-12-16 21:56 UTC (permalink / raw)
To: muchun.song, osalvador, david, linmiaohe, jiaqiyan,
william.roche, rientjes, akpm, lorenzo.stoakes, Liam.Howlett,
rppt, surenb, mhocko, linux-mm, linux-kernel
When a newly poisoned subpage ends up in an already poisoned hugetlb
folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats
is not. Fix the inconsistency by designating action_result() to update
them both.
Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats to pglist_data")
Cc: <stable@vger.kernel.org>
Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
include/linux/hugetlb.h | 4 ++--
include/linux/mm.h | 4 ++--
mm/hugetlb.c | 4 ++--
mm/memory-failure.c | 22 +++++++++++++---------
4 files changed, 19 insertions(+), 15 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8e63e46b8e1f..2e6690c9df96 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared);
+ bool *migratable_cleared, bool *samepg);
void folio_putback_hugetlb(struct folio *folio);
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
void hugetlb_fix_reserve_counts(struct inode *inode);
@@ -420,7 +420,7 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
}
static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
+ bool *migratable_cleared, bool *samepg)
{
return 0;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7c79b3369b82..68b1812e9c0a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long pfn, int flags);
extern const struct attribute_group memory_failure_attr_group;
extern void memory_failure_queue(unsigned long pfn, int flags);
extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared);
+ bool *migratable_cleared, bool *samepg);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
#else
@@ -4045,7 +4045,7 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
}
static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
+ bool *migratable_cleared, bool *samepg)
{
return 0;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0455119716ec..f78562a578e5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
}
int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
+ bool *migratable_cleared, bool *samepg)
{
int ret;
spin_lock_irq(&hugetlb_lock);
- ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
+ ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared, samepg);
spin_unlock_irq(&hugetlb_lock);
return ret;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3edebb0cda30..070f43bb110a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1873,7 +1873,8 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
return count;
}
-static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
+static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page,
+ bool *samepg)
{
struct llist_head *head;
struct raw_hwp_page *raw_hwp;
@@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
return -EHWPOISON;
head = raw_hwp_list_head(folio);
llist_for_each_entry(p, head->first, node) {
- if (p->page == page)
+ if (p->page == page) {
+ *samepg = true;
return -EHWPOISON;
+ }
}
raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
if (raw_hwp) {
raw_hwp->page = page;
llist_add(&raw_hwp->node, head);
- /* the first error event will be counted in action_result(). */
- if (ret)
- num_poisoned_pages_inc(page_to_pfn(page));
} else {
/*
* Failed to save raw error info. We no longer trace all
@@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
* -EHWPOISON - the hugepage is already hwpoisoned
*/
int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
- bool *migratable_cleared)
+ bool *migratable_cleared, bool *samepg)
{
struct page *page = pfn_to_page(pfn);
struct folio *folio = page_folio(page);
@@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
goto out;
}
- if (folio_set_hugetlb_hwpoison(folio, page)) {
+ if (folio_set_hugetlb_hwpoison(folio, page, samepg)) {
ret = -EHWPOISON;
goto out;
}
@@ -2014,11 +2014,12 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
struct page *p = pfn_to_page(pfn);
struct folio *folio;
unsigned long page_flags;
+ bool samepg = false;
bool migratable_cleared = false;
*hugetlb = 1;
retry:
- res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
+ res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared, &samepg);
if (res == 2) { /* fallback to normal page handling */
*hugetlb = 0;
return 0;
@@ -2027,7 +2028,10 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
folio = page_folio(p);
res = kill_accessing_process(current, folio_pfn(folio), flags);
}
- action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
+ if (samepg)
+ action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
+ else
+ action_result(pfn, MF_MSG_HUGE, MF_FAILED);
return res;
} else if (res == -EBUSY) {
if (!(flags & MF_NO_RETRY)) {
--
2.43.5
^ permalink raw reply [flat|nested] 6+ messages in thread* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison 2025-12-16 21:56 [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison Jane Chu @ 2025-12-18 8:41 ` David Hildenbrand (Red Hat) 2025-12-18 19:01 ` jane.chu 0 siblings, 1 reply; 6+ messages in thread From: David Hildenbrand (Red Hat) @ 2025-12-18 8:41 UTC (permalink / raw) To: Jane Chu, muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche, rientjes, akpm, lorenzo.stoakes, Liam.Howlett, rppt, surenb, mhocko, linux-mm, linux-kernel On 12/16/25 22:56, Jane Chu wrote: > When a newly poisoned subpage ends up in an already poisoned hugetlb The concept of subpages does not exist. It's a page of a hugetlb folio. > folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats > is not. Fix the inconsistency by designating action_result() to update > them both. What is the user-visible result of that? > > Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats to pglist_data") > Cc: <stable@vger.kernel.org> > Signed-off-by: Jane Chu <jane.chu@oracle.com> > --- > include/linux/hugetlb.h | 4 ++-- > include/linux/mm.h | 4 ++-- > mm/hugetlb.c | 4 ++-- > mm/memory-failure.c | 22 +++++++++++++--------- > 4 files changed, 19 insertions(+), 15 deletions(-) > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index 8e63e46b8e1f..2e6690c9df96 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, > bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); > int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); > int get_huge_page_for_hwpoison(unsigned long pfn, int flags, > - bool *migratable_cleared); > + bool *migratable_cleared, bool *samepg); > void folio_putback_hugetlb(struct folio *folio); > void move_hugetlb_state(struct folio 
*old_folio, struct folio *new_folio, int reason); > void hugetlb_fix_reserve_counts(struct inode *inode); > @@ -420,7 +420,7 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, > } > > static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, > - bool *migratable_cleared) > + bool *migratable_cleared, bool *samepg) > { > return 0; > } > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 7c79b3369b82..68b1812e9c0a 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long pfn, int flags); > extern const struct attribute_group memory_failure_attr_group; > extern void memory_failure_queue(unsigned long pfn, int flags); > extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, > - bool *migratable_cleared); > + bool *migratable_cleared, bool *samepg); > void num_poisoned_pages_inc(unsigned long pfn); > void num_poisoned_pages_sub(unsigned long pfn, long i); > #else > @@ -4045,7 +4045,7 @@ static inline void memory_failure_queue(unsigned long pfn, int flags) > } > > static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, > - bool *migratable_cleared) > + bool *migratable_cleared, bool *samepg) > { > return 0; > } > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index 0455119716ec..f78562a578e5 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison > } > > int get_huge_page_for_hwpoison(unsigned long pfn, int flags, > - bool *migratable_cleared) > + bool *migratable_cleared, bool *samepg) > { > int ret; > > spin_lock_irq(&hugetlb_lock); > - ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); > + ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared, samepg); > spin_unlock_irq(&hugetlb_lock); > return ret; > } > diff --git a/mm/memory-failure.c b/mm/memory-failure.c > index 
3edebb0cda30..070f43bb110a 100644 > --- a/mm/memory-failure.c > +++ b/mm/memory-failure.c > @@ -1873,7 +1873,8 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) > return count; > } > > -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) > +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page, > + bool *samepg) > { > struct llist_head *head; > struct raw_hwp_page *raw_hwp; > @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) > return -EHWPOISON; > head = raw_hwp_list_head(folio); > llist_for_each_entry(p, head->first, node) { > - if (p->page == page) > + if (p->page == page) { > + *samepg = true; > return -EHWPOISON; > + } > } > > raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC); > if (raw_hwp) { > raw_hwp->page = page; > llist_add(&raw_hwp->node, head); > - /* the first error event will be counted in action_result(). */ > - if (ret) > - num_poisoned_pages_inc(page_to_pfn(page)); > } else { > /* > * Failed to save raw error info. 
We no longer trace all > @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio) > * -EHWPOISON - the hugepage is already hwpoisoned > */ > int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, > - bool *migratable_cleared) > + bool *migratable_cleared, bool *samepg) > { > struct page *page = pfn_to_page(pfn); > struct folio *folio = page_folio(page); > @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, > goto out; > } > > - if (folio_set_hugetlb_hwpoison(folio, page)) { > + if (folio_set_hugetlb_hwpoison(folio, page, samepg)) { > ret = -EHWPOISON; > goto out; > } > @@ -2014,11 +2014,12 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb > struct page *p = pfn_to_page(pfn); > struct folio *folio; > unsigned long page_flags; > + bool samepg = false; > bool migratable_cleared = false; > > *hugetlb = 1; > retry: > - res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); > + res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared, &samepg); > if (res == 2) { /* fallback to normal page handling */ > *hugetlb = 0; > return 0; > @@ -2027,7 +2028,10 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb > folio = page_folio(p); > res = kill_accessing_process(current, folio_pfn(folio), flags); > } > - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); > + if (samepg) > + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); > + else > + action_result(pfn, MF_MSG_HUGE, MF_FAILED); Can't we somehow return that result from get_huge_page_for_hwpoison() ... folio_set_hugetlb_hwpoison() differently? E.g., return an enum instead of "-EHWPOISON" or magic value "2". "samepg" is pretty much unreadable. Same with what? What you really mean is "page was already hwpoisoned". In an enum you might be better able to describe the various scenarios. -- Cheers David ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison 2025-12-18 8:41 ` David Hildenbrand (Red Hat) @ 2025-12-18 19:01 ` jane.chu 2025-12-18 20:26 ` Liam R. Howlett 0 siblings, 1 reply; 6+ messages in thread From: jane.chu @ 2025-12-18 19:01 UTC (permalink / raw) To: David Hildenbrand (Red Hat), muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche, rientjes, akpm, lorenzo.stoakes, Liam.Howlett, rppt, surenb, mhocko, linux-mm, linux-kernel Hi, David, Thanks for the review. On 12/18/2025 12:41 AM, David Hildenbrand (Red Hat) wrote: > On 12/16/25 22:56, Jane Chu wrote: >> When a newly poisoned subpage ends up in an already poisoned hugetlb > > The concept of subpages does not exist. It's a page of a hugetlb folio. Okay. > >> folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats >> is not. Fix the inconsistency by designating action_result() to update >> them both. > > What is the user-visible result of that? For the purpose of observation, and potential action afterwards. # cat /proc/meminfo | grep HardwareCorrupted shows 'num_poisoned_pages', the global count of poisoned pages. # ls /sys/devices/system/node/node0/memory_failure delayed failed ignored recovered total these fields show the per node ->mf_stats, that is the MF handling results. 
> >> >> Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats >> to pglist_data") >> Cc: <stable@vger.kernel.org> >> Signed-off-by: Jane Chu <jane.chu@oracle.com> >> --- >> include/linux/hugetlb.h | 4 ++-- >> include/linux/mm.h | 4 ++-- >> mm/hugetlb.c | 4 ++-- >> mm/memory-failure.c | 22 +++++++++++++--------- >> 4 files changed, 19 insertions(+), 15 deletions(-) >> >> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h >> index 8e63e46b8e1f..2e6690c9df96 100644 >> --- a/include/linux/hugetlb.h >> +++ b/include/linux/hugetlb.h >> @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode *inode, >> long start, long end, >> bool folio_isolate_hugetlb(struct folio *folio, struct list_head >> *list); >> int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, >> bool unpoison); >> int get_huge_page_for_hwpoison(unsigned long pfn, int flags, >> - bool *migratable_cleared); >> + bool *migratable_cleared, bool *samepg); >> void folio_putback_hugetlb(struct folio *folio); >> void move_hugetlb_state(struct folio *old_folio, struct folio >> *new_folio, int reason); >> void hugetlb_fix_reserve_counts(struct inode *inode); >> @@ -420,7 +420,7 @@ static inline int >> get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, >> } >> static inline int get_huge_page_for_hwpoison(unsigned long pfn, int >> flags, >> - bool *migratable_cleared) >> + bool *migratable_cleared, bool *samepg) >> { >> return 0; >> } >> diff --git a/include/linux/mm.h b/include/linux/mm.h >> index 7c79b3369b82..68b1812e9c0a 100644 >> --- a/include/linux/mm.h >> +++ b/include/linux/mm.h >> @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long pfn, >> int flags); >> extern const struct attribute_group memory_failure_attr_group; >> extern void memory_failure_queue(unsigned long pfn, int flags); >> extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, >> - bool *migratable_cleared); >> + bool *migratable_cleared, bool *samepg); >> void 
num_poisoned_pages_inc(unsigned long pfn); >> void num_poisoned_pages_sub(unsigned long pfn, long i); >> #else >> @@ -4045,7 +4045,7 @@ static inline void memory_failure_queue(unsigned >> long pfn, int flags) >> } >> static inline int __get_huge_page_for_hwpoison(unsigned long pfn, >> int flags, >> - bool *migratable_cleared) >> + bool *migratable_cleared, bool *samepg) >> { >> return 0; >> } >> diff --git a/mm/hugetlb.c b/mm/hugetlb.c >> index 0455119716ec..f78562a578e5 100644 >> --- a/mm/hugetlb.c >> +++ b/mm/hugetlb.c >> @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio >> *folio, bool *hugetlb, bool unpoison >> } >> int get_huge_page_for_hwpoison(unsigned long pfn, int flags, >> - bool *migratable_cleared) >> + bool *migratable_cleared, bool *samepg) >> { >> int ret; >> spin_lock_irq(&hugetlb_lock); >> - ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); >> + ret = __get_huge_page_for_hwpoison(pfn, flags, >> migratable_cleared, samepg); >> spin_unlock_irq(&hugetlb_lock); >> return ret; >> } >> diff --git a/mm/memory-failure.c b/mm/memory-failure.c >> index 3edebb0cda30..070f43bb110a 100644 >> --- a/mm/memory-failure.c >> +++ b/mm/memory-failure.c >> @@ -1873,7 +1873,8 @@ static unsigned long __folio_free_raw_hwp(struct >> folio *folio, bool move_flag) >> return count; >> } >> -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct >> page *page) >> +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct >> page *page, >> + bool *samepg) >> { >> struct llist_head *head; >> struct raw_hwp_page *raw_hwp; >> @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct >> folio *folio, struct page *page) >> return -EHWPOISON; >> head = raw_hwp_list_head(folio); >> llist_for_each_entry(p, head->first, node) { >> - if (p->page == page) >> + if (p->page == page) { >> + *samepg = true; >> return -EHWPOISON; >> + } >> } >> raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC); >> if (raw_hwp) { >> 
raw_hwp->page = page; >> llist_add(&raw_hwp->node, head); >> - /* the first error event will be counted in action_result(). */ >> - if (ret) >> - num_poisoned_pages_inc(page_to_pfn(page)); >> } else { >> /* >> * Failed to save raw error info. We no longer trace all >> @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio >> *folio) >> * -EHWPOISON - the hugepage is already hwpoisoned >> */ >> int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, >> - bool *migratable_cleared) >> + bool *migratable_cleared, bool *samepg) >> { >> struct page *page = pfn_to_page(pfn); >> struct folio *folio = page_folio(page); >> @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long >> pfn, int flags, >> goto out; >> } >> - if (folio_set_hugetlb_hwpoison(folio, page)) { >> + if (folio_set_hugetlb_hwpoison(folio, page, samepg)) { >> ret = -EHWPOISON; >> goto out; >> } >> @@ -2014,11 +2014,12 @@ static int try_memory_failure_hugetlb(unsigned >> long pfn, int flags, int *hugetlb >> struct page *p = pfn_to_page(pfn); >> struct folio *folio; >> unsigned long page_flags; >> + bool samepg = false; >> bool migratable_cleared = false; >> *hugetlb = 1; >> retry: >> - res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); >> + res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared, >> &samepg); >> if (res == 2) { /* fallback to normal page handling */ >> *hugetlb = 0; >> return 0; >> @@ -2027,7 +2028,10 @@ static int try_memory_failure_hugetlb(unsigned >> long pfn, int flags, int *hugetlb >> folio = page_folio(p); >> res = kill_accessing_process(current, folio_pfn(folio), >> flags); >> } >> - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); >> + if (samepg) >> + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); >> + else >> + action_result(pfn, MF_MSG_HUGE, MF_FAILED); > > Can't we somehow return that result from > get_huge_page_for_hwpoison() ... folio_set_hugetlb_hwpoison() > differently? 
E.g., return an enum instead of "-EHWPOISON" or magic value > "2". This is an option. The existing return codes are as follows. __get_huge_page_for_hwpoison(): * Return values: * 0 - free hugepage * 1 - in-use hugepage * 2 - not a hugepage * -EBUSY - the hugepage is busy (try to retry) * -EHWPOISON - the hugepage is already hwpoisoned folio_set_hugetlb_hwpoison() returns 0: folio was not poisoned before -EHWPOISON: folio was poisoned before To get rid of 'samepg', how about __get_huge_page_for_hwpoison(): * Return values: * 0 - free hugepage * 1 - in-use hugepage * 2 - not a hugepage * 3 - the hugepage is already hwpoisoned in different page * 4 - the hugepage is already hwpoisoned in the same page * -EBUSY - the hugepage is busy (try to retry) folio_set_hugetlb_hwpoison() returns 0: folio was not poisoned before 1: folio was poisoned before in different page 2: folio was poisoned before in the same page The whole point of identifying the same page is so that the re-poison event is not double counted. > > "samepg" is petty much unreadable. Same with what? > > What you really mean is "page was already hwpoisoned". > For example, the 3rd tail page of a previously poison-free hugetlb folio is detected as poisoned; the MF handler marks the folio as poisoned and records the 3rd tail page. Sometime later, the same hugetlb folio is again detected as poisoned. This time, the poisoned page might be the 3rd tail page (indicated by 'samepg' = true), in which case the MF handler does not update the stats. Or, the poisoned page might be a new tail page, say the 7th tail page, in which case the MF handler records the 7th tail page and updates the user-visible stats. I have struggled a bit with the naming, wondering whether "repoison" might work better, or, a longer name like "same-page-repoisoned"? > In an enum you might be better able to describe the various scenarios. > Thanks! -jane ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison 2025-12-18 19:01 ` jane.chu @ 2025-12-18 20:26 ` Liam R. Howlett 2025-12-18 23:10 ` jane.chu 0 siblings, 1 reply; 6+ messages in thread From: Liam R. Howlett @ 2025-12-18 20:26 UTC (permalink / raw) To: jane.chu Cc: David Hildenbrand (Red Hat), muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche, rientjes, akpm, lorenzo.stoakes, rppt, surenb, mhocko, linux-mm, linux-kernel * jane.chu@oracle.com <jane.chu@oracle.com> [251218 14:01]: > Hi, David, > > Thanks for the review. > > On 12/18/2025 12:41 AM, David Hildenbrand (Red Hat) wrote: > > On 12/16/25 22:56, Jane Chu wrote: > > > When a newly poisoned subpage ends up in an already poisoned hugetlb > > > > The concept of subpages does not exist. It's a page of a hugetlb folio. > > Okay. > > > > > > folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats > > > is not. Fix the inconsistency by designating action_result() to update > > > them both. > > > > What is the user-visible result of that? > > For the purpose of observation, and potential action afterwards. > > # cat /proc/meminfo | grep HardwareCorrupted > shows 'num_poisoned_pages', the global count of poisoned pages. > > # ls /sys/devices/system/node/node0/memory_failure > delayed failed ignored recovered total > these fields show the per node ->mf_stats, that is the MF handling results. 
> > > > > > > > > Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats > > > to pglist_data") > > > Cc: <stable@vger.kernel.org> > > > Signed-off-by: Jane Chu <jane.chu@oracle.com> > > > --- > > > include/linux/hugetlb.h | 4 ++-- > > > include/linux/mm.h | 4 ++-- > > > mm/hugetlb.c | 4 ++-- > > > mm/memory-failure.c | 22 +++++++++++++--------- > > > 4 files changed, 19 insertions(+), 15 deletions(-) > > > > > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > > > index 8e63e46b8e1f..2e6690c9df96 100644 > > > --- a/include/linux/hugetlb.h > > > +++ b/include/linux/hugetlb.h > > > @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode > > > *inode, long start, long end, > > > bool folio_isolate_hugetlb(struct folio *folio, struct list_head > > > *list); > > > int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, > > > bool unpoison); > > > int get_huge_page_for_hwpoison(unsigned long pfn, int flags, > > > - bool *migratable_cleared); > > > + bool *migratable_cleared, bool *samepg); > > > void folio_putback_hugetlb(struct folio *folio); > > > void move_hugetlb_state(struct folio *old_folio, struct folio > > > *new_folio, int reason); > > > void hugetlb_fix_reserve_counts(struct inode *inode); > > > @@ -420,7 +420,7 @@ static inline int > > > get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, > > > } > > > static inline int get_huge_page_for_hwpoison(unsigned long pfn, > > > int flags, > > > - bool *migratable_cleared) > > > + bool *migratable_cleared, bool *samepg) > > > { > > > return 0; > > > } > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > > index 7c79b3369b82..68b1812e9c0a 100644 > > > --- a/include/linux/mm.h > > > +++ b/include/linux/mm.h > > > @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long > > > pfn, int flags); > > > extern const struct attribute_group memory_failure_attr_group; > > > extern void memory_failure_queue(unsigned long pfn, int flags); > > > extern int 
__get_huge_page_for_hwpoison(unsigned long pfn, int flags, > > > - bool *migratable_cleared); > > > + bool *migratable_cleared, bool *samepg); > > > void num_poisoned_pages_inc(unsigned long pfn); > > > void num_poisoned_pages_sub(unsigned long pfn, long i); > > > #else > > > @@ -4045,7 +4045,7 @@ static inline void > > > memory_failure_queue(unsigned long pfn, int flags) > > > } > > > static inline int __get_huge_page_for_hwpoison(unsigned long pfn, > > > int flags, > > > - bool *migratable_cleared) > > > + bool *migratable_cleared, bool *samepg) > > > { > > > return 0; > > > } > > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > > > index 0455119716ec..f78562a578e5 100644 > > > --- a/mm/hugetlb.c > > > +++ b/mm/hugetlb.c > > > @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio > > > *folio, bool *hugetlb, bool unpoison > > > } > > > int get_huge_page_for_hwpoison(unsigned long pfn, int flags, > > > - bool *migratable_cleared) > > > + bool *migratable_cleared, bool *samepg) > > > { > > > int ret; > > > spin_lock_irq(&hugetlb_lock); > > > - ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); > > > + ret = __get_huge_page_for_hwpoison(pfn, flags, > > > migratable_cleared, samepg); > > > spin_unlock_irq(&hugetlb_lock); > > > return ret; > > > } > > > diff --git a/mm/memory-failure.c b/mm/memory-failure.c > > > index 3edebb0cda30..070f43bb110a 100644 > > > --- a/mm/memory-failure.c > > > +++ b/mm/memory-failure.c > > > @@ -1873,7 +1873,8 @@ static unsigned long > > > __folio_free_raw_hwp(struct folio *folio, bool move_flag) > > > return count; > > > } > > > -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct > > > page *page) > > > +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct > > > page *page, > > > + bool *samepg) > > > { > > > struct llist_head *head; > > > struct raw_hwp_page *raw_hwp; > > > @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct > > > folio *folio, struct page *page) > 
> > return -EHWPOISON; > > > head = raw_hwp_list_head(folio); > > > llist_for_each_entry(p, head->first, node) { > > > - if (p->page == page) > > > + if (p->page == page) { > > > + *samepg = true; > > > return -EHWPOISON; > > > + } > > > } > > > raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC); > > > if (raw_hwp) { > > > raw_hwp->page = page; > > > llist_add(&raw_hwp->node, head); > > > - /* the first error event will be counted in action_result(). */ > > > - if (ret) > > > - num_poisoned_pages_inc(page_to_pfn(page)); > > > } else { > > > /* > > > * Failed to save raw error info. We no longer trace all > > > @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio > > > *folio) > > > * -EHWPOISON - the hugepage is already hwpoisoned > > > */ > > > int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, > > > - bool *migratable_cleared) > > > + bool *migratable_cleared, bool *samepg) > > > { > > > struct page *page = pfn_to_page(pfn); > > > struct folio *folio = page_folio(page); > > > @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long > > > pfn, int flags, > > > goto out; > > > } > > > - if (folio_set_hugetlb_hwpoison(folio, page)) { > > > + if (folio_set_hugetlb_hwpoison(folio, page, samepg)) { > > > ret = -EHWPOISON; > > > goto out; > > > } > > > @@ -2014,11 +2014,12 @@ static int > > > try_memory_failure_hugetlb(unsigned long pfn, int flags, int > > > *hugetlb > > > struct page *p = pfn_to_page(pfn); > > > struct folio *folio; > > > unsigned long page_flags; > > > + bool samepg = false; > > > bool migratable_cleared = false; > > > *hugetlb = 1; > > > retry: > > > - res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); > > > + res = get_huge_page_for_hwpoison(pfn, flags, > > > &migratable_cleared, &samepg); > > > if (res == 2) { /* fallback to normal page handling */ > > > *hugetlb = 0; > > > return 0; > > > @@ -2027,7 +2028,10 @@ static int > > > try_memory_failure_hugetlb(unsigned long pfn, int flags, 
int > > > *hugetlb > > > folio = page_folio(p); > > > res = kill_accessing_process(current, > > > folio_pfn(folio), flags); > > > } > > > - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); > > > + if (samepg) > > > + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); > > > + else > > > + action_result(pfn, MF_MSG_HUGE, MF_FAILED); > > > > Can't we somehow return that result from get_huge_page_for_hwpoison() > > ... folio_set_hugetlb_hwpoison() differently? E.g., return an enum > > instead of "-EHWPOISON" or magic value "2". > > This is an option. The existing return codes are as follow. > __get_huge_page_for_hwpoison(): > * Return values: > * 0 - free hugepage > * 1 - in-use hugepage > * 2 - not a hugepage > * -EBUSY - the hugepage is busy (try to retry) > * -EHWPOISON - the hugepage is already hwpoisoned > > folio_set_hugetlb_hwpoison() > returns > 0: folio was not poisoned before > -EHWPOISON: folio was poisoned before > > To get rid of 'samepg', how about > > __get_huge_page_for_hwpoison(): > * Return values: > * 0 - free hugepage > * 1 - in-use hugepage > * 2 - not a hugepage > * 3 - the hugepage is already hwpoisoned in different page > * 4 - the hugepage is already hwpoisoned in the same page > * -EBUSY - the hugepage is busy (try to retry) > > folio_set_hugetlb_hwpoison() > returns > 0: folio was not poisoned before > 1: folio was poisoned before in different page > 2: folio was poisoned before in the same page > > The whole point about identifying the same page is so that the re-poison > event is not doubled counted. This means folio_set_hugetlb_hwpoison() returns 0 on success but positives on error.. this seems to be going further away from the standard way of doing things? It would actually be good to remove all magic values instead of expanding them. I think what David was trying to say is to have a local enum that states what these numbers mean so that the code reads more cleanly, instead of digging for the right comment to decode it. 
For example, in try_memory_failure_hugetlb(): if (res == 2) { /* fallback to normal page handling */ vs: if (res == MEMORY_FAILURE_NOT_HUGEPAGE) { /* fallback to normal page handling */ You could spell out your other options as well. Maybe something like MEMORY_FAILURE_HWPOISONED_ALREADY_COUNTED MEMORY_FAILURE_HWPOISONED This would avoid adding more magic values and increase readability. If you changed try_memory_failure_hugetlb() to use a switch statement, then the compiler can catch unhandled enum values for us too. If you don't want to go the enum route, then you could use a different error code and propagate it through, like -EEXIST for the new case? That way the return is still 0 on success and less than 0 on failure, but I think the enum idea has a number of advantages. Thanks, Liam ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison 2025-12-18 20:26 ` Liam R. Howlett @ 2025-12-18 23:10 ` jane.chu 2025-12-19 4:09 ` Liam R. Howlett 0 siblings, 1 reply; 6+ messages in thread From: jane.chu @ 2025-12-18 23:10 UTC (permalink / raw) To: Liam R. Howlett, David Hildenbrand (Red Hat), muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche, rientjes, akpm, lorenzo.stoakes, rppt, surenb, mhocko, linux-mm, linux-kernel Hi, Liam, Thanks! My reply towards the end. On 12/18/2025 12:26 PM, Liam R. Howlett wrote: > * jane.chu@oracle.com <jane.chu@oracle.com> [251218 14:01]: >> Hi, David, >> >> Thanks for the review. >> >> On 12/18/2025 12:41 AM, David Hildenbrand (Red Hat) wrote: >>> On 12/16/25 22:56, Jane Chu wrote: >>>> When a newly poisoned subpage ends up in an already poisoned hugetlb >>> >>> The concept of subpages does not exist. It's a page of a hugetlb folio. >> >> Okay. >> >>> >>>> folio, 'num_poisoned_pages' is incremented, but the per node ->mf_stats >>>> is not. Fix the inconsistency by designating action_result() to update >>>> them both. >>> >>> What is the user-visible result of that? >> >> For the purpose of observation, and potential action afterwards. >> >> # cat /proc/meminfo | grep HardwareCorrupted >> shows 'num_poisoned_pages', the global count of poisoned pages. >> >> # ls /sys/devices/system/node/node0/memory_failure >> delayed failed ignored recovered total >> these fields show the per node ->mf_stats, that is the MF handling results. 
>> >>> >>>> >>>> Fixes: 18f41fa616ee4 ("mm: memory-failure: bump memory failure stats >>>> to pglist_data") >>>> Cc: <stable@vger.kernel.org> >>>> Signed-off-by: Jane Chu <jane.chu@oracle.com> >>>> --- >>>> include/linux/hugetlb.h | 4 ++-- >>>> include/linux/mm.h | 4 ++-- >>>> mm/hugetlb.c | 4 ++-- >>>> mm/memory-failure.c | 22 +++++++++++++--------- >>>> 4 files changed, 19 insertions(+), 15 deletions(-) >>>> >>>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h >>>> index 8e63e46b8e1f..2e6690c9df96 100644 >>>> --- a/include/linux/hugetlb.h >>>> +++ b/include/linux/hugetlb.h >>>> @@ -157,7 +157,7 @@ long hugetlb_unreserve_pages(struct inode >>>> *inode, long start, long end, >>>> bool folio_isolate_hugetlb(struct folio *folio, struct list_head >>>> *list); >>>> int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, >>>> bool unpoison); >>>> int get_huge_page_for_hwpoison(unsigned long pfn, int flags, >>>> - bool *migratable_cleared); >>>> + bool *migratable_cleared, bool *samepg); >>>> void folio_putback_hugetlb(struct folio *folio); >>>> void move_hugetlb_state(struct folio *old_folio, struct folio >>>> *new_folio, int reason); >>>> void hugetlb_fix_reserve_counts(struct inode *inode); >>>> @@ -420,7 +420,7 @@ static inline int >>>> get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, >>>> } >>>> static inline int get_huge_page_for_hwpoison(unsigned long pfn, >>>> int flags, >>>> - bool *migratable_cleared) >>>> + bool *migratable_cleared, bool *samepg) >>>> { >>>> return 0; >>>> } >>>> diff --git a/include/linux/mm.h b/include/linux/mm.h >>>> index 7c79b3369b82..68b1812e9c0a 100644 >>>> --- a/include/linux/mm.h >>>> +++ b/include/linux/mm.h >>>> @@ -4036,7 +4036,7 @@ extern int soft_offline_page(unsigned long >>>> pfn, int flags); >>>> extern const struct attribute_group memory_failure_attr_group; >>>> extern void memory_failure_queue(unsigned long pfn, int flags); >>>> extern int __get_huge_page_for_hwpoison(unsigned long 
pfn, int flags, >>>> - bool *migratable_cleared); >>>> + bool *migratable_cleared, bool *samepg); >>>> void num_poisoned_pages_inc(unsigned long pfn); >>>> void num_poisoned_pages_sub(unsigned long pfn, long i); >>>> #else >>>> @@ -4045,7 +4045,7 @@ static inline void >>>> memory_failure_queue(unsigned long pfn, int flags) >>>> } >>>> static inline int __get_huge_page_for_hwpoison(unsigned long pfn, >>>> int flags, >>>> - bool *migratable_cleared) >>>> + bool *migratable_cleared, bool *samepg) >>>> { >>>> return 0; >>>> } >>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c >>>> index 0455119716ec..f78562a578e5 100644 >>>> --- a/mm/hugetlb.c >>>> +++ b/mm/hugetlb.c >>>> @@ -7818,12 +7818,12 @@ int get_hwpoison_hugetlb_folio(struct folio >>>> *folio, bool *hugetlb, bool unpoison >>>> } >>>> int get_huge_page_for_hwpoison(unsigned long pfn, int flags, >>>> - bool *migratable_cleared) >>>> + bool *migratable_cleared, bool *samepg) >>>> { >>>> int ret; >>>> spin_lock_irq(&hugetlb_lock); >>>> - ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); >>>> + ret = __get_huge_page_for_hwpoison(pfn, flags, >>>> migratable_cleared, samepg); >>>> spin_unlock_irq(&hugetlb_lock); >>>> return ret; >>>> } >>>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c >>>> index 3edebb0cda30..070f43bb110a 100644 >>>> --- a/mm/memory-failure.c >>>> +++ b/mm/memory-failure.c >>>> @@ -1873,7 +1873,8 @@ static unsigned long >>>> __folio_free_raw_hwp(struct folio *folio, bool move_flag) >>>> return count; >>>> } >>>> -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct >>>> page *page) >>>> +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct >>>> page *page, >>>> + bool *samepg) >>>> { >>>> struct llist_head *head; >>>> struct raw_hwp_page *raw_hwp; >>>> @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct >>>> folio *folio, struct page *page) >>>> return -EHWPOISON; >>>> head = raw_hwp_list_head(folio); >>>> llist_for_each_entry(p, 
head->first, node) { >>>> - if (p->page == page) >>>> + if (p->page == page) { >>>> + *samepg = true; >>>> return -EHWPOISON; >>>> + } >>>> } >>>> raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC); >>>> if (raw_hwp) { >>>> raw_hwp->page = page; >>>> llist_add(&raw_hwp->node, head); >>>> - /* the first error event will be counted in action_result(). */ >>>> - if (ret) >>>> - num_poisoned_pages_inc(page_to_pfn(page)); >>>> } else { >>>> /* >>>> * Failed to save raw error info. We no longer trace all >>>> @@ -1956,7 +1956,7 @@ void folio_clear_hugetlb_hwpoison(struct folio >>>> *folio) >>>> * -EHWPOISON - the hugepage is already hwpoisoned >>>> */ >>>> int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, >>>> - bool *migratable_cleared) >>>> + bool *migratable_cleared, bool *samepg) >>>> { >>>> struct page *page = pfn_to_page(pfn); >>>> struct folio *folio = page_folio(page); >>>> @@ -1981,7 +1981,7 @@ int __get_huge_page_for_hwpoison(unsigned long >>>> pfn, int flags, >>>> goto out; >>>> } >>>> - if (folio_set_hugetlb_hwpoison(folio, page)) { >>>> + if (folio_set_hugetlb_hwpoison(folio, page, samepg)) { >>>> ret = -EHWPOISON; >>>> goto out; >>>> } >>>> @@ -2014,11 +2014,12 @@ static int >>>> try_memory_failure_hugetlb(unsigned long pfn, int flags, int >>>> *hugetlb >>>> struct page *p = pfn_to_page(pfn); >>>> struct folio *folio; >>>> unsigned long page_flags; >>>> + bool samepg = false; >>>> bool migratable_cleared = false; >>>> *hugetlb = 1; >>>> retry: >>>> - res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); >>>> + res = get_huge_page_for_hwpoison(pfn, flags, >>>> &migratable_cleared, &samepg); >>>> if (res == 2) { /* fallback to normal page handling */ >>>> *hugetlb = 0; >>>> return 0; >>>> @@ -2027,7 +2028,10 @@ static int >>>> try_memory_failure_hugetlb(unsigned long pfn, int flags, int >>>> *hugetlb >>>> folio = page_folio(p); >>>> res = kill_accessing_process(current, >>>> folio_pfn(folio), flags); >>>> } >>>> - 
action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); >>>> + if (samepg) >>>> + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); >>>> + else >>>> + action_result(pfn, MF_MSG_HUGE, MF_FAILED); >>> >>> Can't we somehow return that result from get_huge_page_for_hwpoison() >>> ... folio_set_hugetlb_hwpoison() differently? E.g., return an enum >>> instead of "-EHWPOISON" or magic value "2". >> >> This is an option. The existing return codes are as follow. >> __get_huge_page_for_hwpoison(): >> * Return values: >> * 0 - free hugepage >> * 1 - in-use hugepage >> * 2 - not a hugepage >> * -EBUSY - the hugepage is busy (try to retry) >> * -EHWPOISON - the hugepage is already hwpoisoned >> >> folio_set_hugetlb_hwpoison() >> returns >> 0: folio was not poisoned before >> -EHWPOISON: folio was poisoned before >> >> To get rid of 'samepg', how about >> >> __get_huge_page_for_hwpoison(): >> * Return values: >> * 0 - free hugepage >> * 1 - in-use hugepage >> * 2 - not a hugepage >> * 3 - the hugepage is already hwpoisoned in different page >> * 4 - the hugepage is already hwpoisoned in the same page >> * -EBUSY - the hugepage is busy (try to retry) >> >> folio_set_hugetlb_hwpoison() >> returns >> 0: folio was not poisoned before >> 1: folio was poisoned before in different page >> 2: folio was poisoned before in the same page >> >> The whole point about identifying the same page is so that the re-poison >> event is not doubled counted. > > This means folio_set_hugetlb_hwpoison() returns 0 on success but > positives on error.. this seems to be going further away from the > standard way of doing things? Yes. > > It would actually be good to remove all magic values instead of > expanding them. > > I think what David was trying to say is to have a local enum that states > what these numbers mean so that the code reads more cleanly, instead of > digging for the right comment to decode it. 
> > For example, in try_memory_failure_hugetlb(): > > if (res == 2) { /* fallback to normal page handling */ > > vs: > > if (res == MEMORY_FAILURE_NOT_HUGEPAGE) { /* fallback to normal page handling */ > > You could spell out your other options as well. Maybe something like > MEMORY_FAILURE_HWPOISONED_ALREADY_COUNTED > MEMORY_FAILURE_HWPOISONED > > This would avoid adding more magic values and increase readability. > > If you changed try_memory_failure_hugetlb() to use a switch statement, > then the compiler can catch unchecked enums for us too. > > If you don't want to go the enum route, then you could use a different > error code and propagate it through, like -EEXISTS for the new case? > That way the return is still 0 on success and less than 0 on failure, > but I think the enum idea has a number of advantages. I am open, actually prefer enum with switch statement as you suggested above. What about folio_set_hugetlb_hwpoison()? Indeed the conventional way of folio_set_X_Y() returns only two possible values, but we need three. How about changing the function name to set_hugetlb_hwpoison() to deviate from the convention? afterall, the function does more than conventional bit setting, it maintains a per folio raw-error linked list to track the poisoned pages within. Thanks! -jane > > Thanks, > Liam > ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison 2025-12-18 23:10 ` jane.chu @ 2025-12-19 4:09 ` Liam R. Howlett 0 siblings, 0 replies; 6+ messages in thread From: Liam R. Howlett @ 2025-12-19 4:09 UTC (permalink / raw) To: jane.chu Cc: David Hildenbrand (Red Hat), muchun.song, osalvador, linmiaohe, jiaqiyan, william.roche, rientjes, akpm, lorenzo.stoakes, rppt, surenb, mhocko, linux-mm, linux-kernel * jane.chu@oracle.com <jane.chu@oracle.com> [251218 18:10]: ... > > * jane.chu@oracle.com <jane.chu@oracle.com> [251218 14:01]: ... > > > On 12/18/2025 12:41 AM, David Hildenbrand (Red Hat) wrote: > > > > On 12/16/25 22:56, Jane Chu wrote: ... > > > > > -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct > > > > > page *page) > > > > > +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct > > > > > page *page, > > > > > + bool *samepg) > > > > > { > > > > > struct llist_head *head; > > > > > struct raw_hwp_page *raw_hwp; > > > > > @@ -1889,17 +1890,16 @@ static int folio_set_hugetlb_hwpoison(struct > > > > > folio *folio, struct page *page) > > > > > return -EHWPOISON; > > > > > head = raw_hwp_list_head(folio); > > > > > llist_for_each_entry(p, head->first, node) { > > > > > - if (p->page == page) > > > > > + if (p->page == page) { > > > > > + *samepg = true; > > > > > return -EHWPOISON; > > > > > + } > > > > > } > > > > > raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC); > > > > > if (raw_hwp) { > > > > > raw_hwp->page = page; > > > > > llist_add(&raw_hwp->node, head); > > > > > - /* the first error event will be counted in action_result(). */ > > > > > - if (ret) > > > > > - num_poisoned_pages_inc(page_to_pfn(page)); > > > > > } else { > > > > > /* > > > > > * Failed to save raw error info. We no longer trace all ... 
> > > > > try_memory_failure_hugetlb(unsigned long pfn, int flags, int > > > > > *hugetlb > > > > > folio = page_folio(p); > > > > > res = kill_accessing_process(current, > > > > > folio_pfn(folio), flags); > > > > > } > > > > > - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); > > > > > + if (samepg) > > > > > + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); > > > > > + else > > > > > + action_result(pfn, MF_MSG_HUGE, MF_FAILED); > > > > > > > > Can't we somehow return that result from get_huge_page_for_hwpoison() > > > > ... folio_set_hugetlb_hwpoison() differently? E.g., return an enum > > > > instead of "-EHWPOISON" or magic value "2". > > > > > > This is an option. The existing return codes are as follow. > > > __get_huge_page_for_hwpoison(): > > > * Return values: > > > * 0 - free hugepage > > > * 1 - in-use hugepage > > > * 2 - not a hugepage > > > * -EBUSY - the hugepage is busy (try to retry) > > > * -EHWPOISON - the hugepage is already hwpoisoned > > > > > > folio_set_hugetlb_hwpoison() > > > returns > > > 0: folio was not poisoned before > > > -EHWPOISON: folio was poisoned before > > > > > > To get rid of 'samepg', how about > > > > > > __get_huge_page_for_hwpoison(): > > > * Return values: > > > * 0 - free hugepage > > > * 1 - in-use hugepage > > > * 2 - not a hugepage > > > * 3 - the hugepage is already hwpoisoned in different page > > > * 4 - the hugepage is already hwpoisoned in the same page > > > * -EBUSY - the hugepage is busy (try to retry) > > > > > > folio_set_hugetlb_hwpoison() > > > returns > > > 0: folio was not poisoned before > > > 1: folio was poisoned before in different page > > > 2: folio was poisoned before in the same page > > > > > > The whole point about identifying the same page is so that the re-poison > > > event is not doubled counted. > > > > This means folio_set_hugetlb_hwpoison() returns 0 on success but > > positives on error.. this seems to be going further away from the > > standard way of doing things? 
> > Yes. > > > It would actually be good to remove all magic values instead of > > expanding them. > > > > I think what David was trying to say is to have a local enum that states > > what these numbers mean so that the code reads more cleanly, instead of > > digging for the right comment to decode it. > > > > For example, in try_memory_failure_hugetlb(): > > > > if (res == 2) { /* fallback to normal page handling */ > > > > vs: > > > > if (res == MEMORY_FAILURE_NOT_HUGEPAGE) { /* fallback to normal page handling */ > > > > You could spell out your other options as well. Maybe something like > > MEMORY_FAILURE_HWPOISONED_ALREADY_COUNTED > > MEMORY_FAILURE_HWPOISONED > > > > This would avoid adding more magic values and increase readability. > > > > If you changed try_memory_failure_hugetlb() to use a switch statement, > > then the compiler can catch unchecked enums for us too. > > > > If you don't want to go the enum route, then you could use a different > > error code and propagate it through, like -EEXISTS for the new case? > > That way the return is still 0 on success and less than 0 on failure, > > but I think the enum idea has a number of advantages. > > I am open, actually prefer enum with switch statement as you suggested > above. It's David's suggestion, really :) > > What about folio_set_hugetlb_hwpoison()? > Indeed the conventional way of folio_set_X_Y() returns only two possible > values, but we need three. I am having a hard time finding any folio_set_* that returns anything. $ git grep int\ folio_set_|wc -l 1 $ git grep void\ folio_set_|wc -l 20 > How about changing the function name to set_hugetlb_hwpoison() to deviate > from the convention? afterall, the function does more than conventional bit > setting, it maintains a per folio raw-error linked list > to track the poisoned pages within. 
I think that's a good idea: the folio_set_ prefix implies that the function sets something unconditionally, whereas this function does not always set anything and also does additional work. In fact, the names make no sense to begin with. get_huge_page_for_hwpoison() ends up actually calling folio_set_hugetlb_hwpoison(), so it's not just getting the huge page, it's doing (at least some of) the work. So, yeah, renaming it isn't going to make things worse. I'd try to pick a name that indicates that folio_set_hugetlb_hwpoison() might not do anything. It's static and has hugetlb in the name, so we're pretty safe with anything. Maybe update_hugetlb_hwpoison_list()? Thanks, Liam ^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2025-12-19 4:10 UTC | newest] Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2025-12-16 21:56 [PATCH] mm/memory-failure: fix missing ->mf_stats count in hugetlb poison Jane Chu 2025-12-18 8:41 ` David Hildenbrand (Red Hat) 2025-12-18 19:01 ` jane.chu 2025-12-18 20:26 ` Liam R. Howlett 2025-12-18 23:10 ` jane.chu 2025-12-19 4:09 ` Liam R. Howlett
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox.