* Re: [PATCH v2 1/3] mm: memfd/hugetlb: introduce memfd-based userspace MFR policy
2025-11-16 1:32 ` [PATCH v2 1/3] mm: memfd/hugetlb: introduce memfd-based userspace MFR policy Jiaqi Yan
@ 2025-11-25 21:47 ` William Roche
2025-11-25 22:04 ` William Roche
2025-12-03 4:11 ` jane.chu
2 siblings, 0 replies; 10+ messages in thread
From: William Roche @ 2025-11-25 21:47 UTC (permalink / raw)
To: Jiaqi Yan, nao.horiguchi, linmiaohe, harry.yoo
Cc: tony.luck, wangkefeng.wang, willy, jane.chu, akpm, osalvador,
rientjes, duenwen, jthoughton, jgg, ankita, peterx,
sidhartha.kumar, ziy, david, dave.hansen, muchun.song, linux-mm,
linux-kernel, linux-fsdevel
Hello Jiaqi,
Here is a summary of a few nits in this code:
- Some function declarations look problematic to me
- The parameter test used to activate the feature looks incorrect
- The function signature changes are probably not necessary
- Maybe we should wait for an agreement on your other proposal:
[PATCH v1 0/2] Only free healthy pages in high-order HWPoison folio
The last item is not a nit, but your above proposal may require keeping
all the data of a hugetlb folio in order to recycle it correctly
(especially the list of poisoned sub-pages), and to avoid the race
condition of returning poisoned pages to the freelist right before
removing them, so you may need to change some aspects of the current code.
On 11/16/25 02:32, Jiaqi Yan wrote:
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 8e63e46b8e1f0..b7733ef5ee917 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -871,10 +871,17 @@ int dissolve_free_hugetlb_folios(unsigned long start_pfn,
>
> #ifdef CONFIG_MEMORY_FAILURE
> extern void folio_clear_hugetlb_hwpoison(struct folio *folio);
> +extern bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
> + struct address_space *mapping);
> #else
> static inline void folio_clear_hugetlb_hwpoison(struct folio *folio)
> {
> }
> +static inline bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio
> + struct address_space *mapping)
> +{
> + return false;
> +}
> #endif
>
You are conditionally declaring this hugetlb_should_keep_hwpoison_mapped()
function and implementing it in mm/hugetlb.c, but this file is compiled in
both cases (CONFIG_MEMORY_FAILURE enabled or not). So you either need a
single declaration, consistent with the implementation, and use something
like this:

bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
					 struct address_space *mapping)
{
+#ifdef CONFIG_MEMORY_FAILURE
	if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
		return false;
@@ -6087,6 +6088,9 @@ bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
		return false;
	return mapping_mf_keep_ue_mapped(mapping);
+#else
+	return false;
+#endif
}

Or keep your double declaration and only compile the implementation when
CONFIG_MEMORY_FAILURE is enabled:

+#ifdef CONFIG_MEMORY_FAILURE
bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
					 struct address_space *mapping)
{
	if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
		return false;
@@ -6087,6 +6088,9 @@ bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
		return false;
	return mapping_mf_keep_ue_mapped(mapping);
}
+#endif
> #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 09b581c1d878d..9ad511aacde7c 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -213,6 +213,8 @@ enum mapping_flags {
> AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9,
> AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't
> account usage to user cgroups */
> + /* For MFD_MF_KEEP_UE_MAPPED. */
> + AS_MF_KEEP_UE_MAPPED = 11,
> /* Bits 16-25 are used for FOLIO_ORDER */
> AS_FOLIO_ORDER_BITS = 5,
> AS_FOLIO_ORDER_MIN = 16,
> @@ -348,6 +350,16 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres
> return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
> }
>
> +static inline bool mapping_mf_keep_ue_mapped(const struct address_space *mapping)
> +{
> + return test_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
> +}
> +
> +static inline void mapping_set_mf_keep_ue_mapped(struct address_space *mapping)
> +{
> + set_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
> +}
> +
> static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
> {
> return mapping->gfp_mask;
> @@ -1274,6 +1286,18 @@ void replace_page_cache_folio(struct folio *old, struct folio *new);
> void delete_from_page_cache_batch(struct address_space *mapping,
> struct folio_batch *fbatch);
> bool filemap_release_folio(struct folio *folio, gfp_t gfp);
> +#ifdef CONFIG_MEMORY_FAILURE
> +/*
> + * Provided by memory failure to offline HWPoison-ed folio managed by memfd.
> + */
> +void filemap_offline_hwpoison_folio(struct address_space *mapping,
> + struct folio *folio);
> +#else
> +void filemap_offline_hwpoison_folio(struct address_space *mapping,
> + struct folio *folio)
> +{
> +}
> +#endif
> loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
> int whence);
This filemap_offline_hwpoison_folio() declaration is also problematic in
the case without CONFIG_MEMORY_FAILURE: as written, we would define a
non-static filemap_offline_hwpoison_folio() function in every file that
includes this "pagemap.h" header, leading to multiple definitions.
This could be solved by using "static inline" in this second case.
> diff --git a/mm/memfd.c b/mm/memfd.c
> index 1d109c1acf211..bfdde4cf90500 100644
> --- a/mm/memfd.c
> +++ b/mm/memfd.c
> @@ -313,7 +313,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
> #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
> #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
>
> -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
> +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \
> + MFD_NOEXEC_SEAL | MFD_EXEC | MFD_MF_KEEP_UE_MAPPED)
>
> static int check_sysctl_memfd_noexec(unsigned int *flags)
> {
> @@ -387,6 +388,8 @@ static int sanitize_flags(unsigned int *flags_ptr)
> if (!(flags & MFD_HUGETLB)) {
> if (flags & ~MFD_ALL_FLAGS)
> return -EINVAL;
> + if (flags & MFD_MF_KEEP_UE_MAPPED)
> + return -EINVAL;
> } else {
> /* Allow huge page size encoding in flags. */
> if (flags & ~(MFD_ALL_FLAGS |
> @@ -447,6 +450,16 @@ static struct file *alloc_file(const char *name, unsigned int flags)
> file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
> file->f_flags |= O_LARGEFILE;
>
> + /*
> + * MFD_MF_KEEP_UE_MAPPED can only be specified in memfd_create; no API
> + * to update it once memfd is created. MFD_MF_KEEP_UE_MAPPED is not
> + * seal-able.
> + *
> + * For now MFD_MF_KEEP_UE_MAPPED is only supported by HugeTLBFS.
> + */
> + if (flags & (MFD_HUGETLB | MFD_MF_KEEP_UE_MAPPED))
> + mapping_set_mf_keep_ue_mapped(file->f_mapping);
The flag test used to set the "keep" value on the address space should
check MFD_MF_KEEP_UE_MAPPED alone: "flags & (MFD_HUGETLB |
MFD_MF_KEEP_UE_MAPPED)" is also true for a hugetlb memfd created without
the new flag, and sanitize_flags() has already verified that
MFD_MF_KEEP_UE_MAPPED is only given combined with MFD_HUGETLB. This is a
nit identified by Harry Yoo during our internal conversations. Thanks Harry!
> +
> if (flags & MFD_NOEXEC_SEAL) {
> struct inode *inode = file_inode(file);
>
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 3edebb0cda30b..c5e3e28872797 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -373,11 +373,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
> * Schedule a process for later kill.
> * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
> */
> -static void __add_to_kill(struct task_struct *tsk, const struct page *p,
> +static void __add_to_kill(struct task_struct *tsk, struct page *p,
> struct vm_area_struct *vma, struct list_head *to_kill,
> unsigned long addr)
Is there any reason to remove the "const" on the page structure in the
signature? It looks like you only do that for the new call to
page_folio(p), but the page itself is never modified.
> {
> struct to_kill *tk;
> + struct folio *folio;
You could use a "const" struct folio *folio too.
> + struct address_space *mapping;
>
> tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
> if (!tk) {
> @@ -388,8 +390,19 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
> tk->addr = addr;
> if (is_zone_device_page(p))
> tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
> - else
> - tk->size_shift = folio_shift(page_folio(p));
> + else {
> + folio = page_folio(p);
Now with both folio and p being "const", the code should work.
> + mapping = folio_mapping(folio);
> + if (mapping && mapping_mf_keep_ue_mapped(mapping))
> + /*
> + * Let userspace know the radius of HWPoison is
> + * the size of raw page; accessing other pages
> + * inside the folio is still ok.
> + */
> + tk->size_shift = PAGE_SHIFT;
> + else
> + tk->size_shift = folio_shift(folio);
> + }
>
> /*
> * Send SIGKILL if "tk->addr == -EFAULT". Also, as
> @@ -414,7 +427,7 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
> list_add_tail(&tk->nd, to_kill);
> }
>
> -static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p,
> +static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
No need to change the signature here either (otherwise add_to_kill_fsdax()
and add_to_kill_ksm() would have needed the same change, and both were
left untouched).
> struct vm_area_struct *vma, struct list_head *to_kill,
> unsigned long addr)
> {
> @@ -535,7 +548,7 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
> * Collect processes when the error hit an anonymous page.
> */
> static void collect_procs_anon(const struct folio *folio,
> - const struct page *page, struct list_head *to_kill,
> + struct page *page, struct list_head *to_kill,
No need to change
> int force_early)
> {
> struct task_struct *tsk;
> @@ -573,7 +586,7 @@ static void collect_procs_anon(const struct folio *folio,
> * Collect processes when the error hit a file mapped page.
> */
> static void collect_procs_file(const struct folio *folio,
> - const struct page *page, struct list_head *to_kill,
> + struct page *page, struct list_head *to_kill,
> int force_early)
No need to change
> {
> struct vm_area_struct *vma;
> @@ -655,7 +668,7 @@ static void collect_procs_fsdax(const struct page *page,
> /*
> * Collect the processes who have the corrupted page mapped to kill.
> */
> -static void collect_procs(const struct folio *folio, const struct page *page,
> +static void collect_procs(const struct folio *folio, struct page *page,
No need to change
> struct list_head *tokill, int force_early)
> {
> if (!folio->mapping)
> @@ -1173,6 +1186,13 @@ static int me_huge_page(struct page_state *ps, struct page *p)
> }
> }
>
> + /*
> + * MF still needs to holds a refcount for the deferred actions in
> + * filemap_offline_hwpoison_folio.
> + */
> + if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
> + return res;
> +
> if (has_extra_refcount(ps, p, extra_pins))
> res = MF_FAILED;
>
> @@ -1569,6 +1589,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> {
> LIST_HEAD(tokill);
> bool unmap_success;
> + bool keep_mapped;
> int forcekill;
> bool mlocked = folio_test_mlocked(folio);
>
> @@ -1596,8 +1617,12 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> */
> collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
>
> - unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
> - if (!unmap_success)
> + keep_mapped = hugetlb_should_keep_hwpoison_mapped(folio, folio->mapping);
> + if (!keep_mapped)
> + unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
> +
> + unmap_success = !folio_mapped(folio);
> + if (!keep_mapped && !unmap_success)
> pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
> pfn, folio_mapcount(folio));
>
> @@ -1622,7 +1647,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> !unmap_success;
> kill_procs(&tokill, forcekill, pfn, flags);
>
> - return unmap_success;
> + return unmap_success || keep_mapped;
> }
>
> static int identify_page_state(unsigned long pfn, struct page *p,
> @@ -1862,6 +1887,13 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
> unsigned long count = 0;
>
> head = llist_del_all(raw_hwp_list_head(folio));
> + /*
> + * If filemap_offline_hwpoison_folio_hugetlb is handling this folio,
> + * it has already taken off the head of the llist.
> + */
> + if (head == NULL)
> + return 0;
> +
This may not be necessary depending on how we recycle hugetlb pages --
see below too.
> llist_for_each_entry_safe(p, next, head, node) {
> if (move_flag)
> SetPageHWPoison(p->page);
> @@ -1878,7 +1910,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> struct llist_head *head;
> struct raw_hwp_page *raw_hwp;
> struct raw_hwp_page *p;
> - int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
> + struct address_space *mapping = folio->mapping;
> + bool has_hwpoison = folio_test_set_hwpoison(folio);
>
> /*
> * Once the hwpoison hugepage has lost reliable raw error info,
> @@ -1897,8 +1930,15 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> if (raw_hwp) {
> raw_hwp->page = page;
> llist_add(&raw_hwp->node, head);
> + if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
> + /*
> + * A new raw HWPoison page. Don't return HWPOISON.
> + * Error event will be counted in action_result().
> + */
> + return 0;
> +
> /* the first error event will be counted in action_result(). */
> - if (ret)
> + if (has_hwpoison)
> num_poisoned_pages_inc(page_to_pfn(page));
> } else {
> /*
> @@ -1913,7 +1953,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> */
> __folio_free_raw_hwp(folio, false);
> }
> - return ret;
> +
> + return has_hwpoison ? -EHWPOISON : 0;
> }
>
> static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
> @@ -2002,6 +2043,63 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> return ret;
> }
>
> +static void filemap_offline_hwpoison_folio_hugetlb(struct folio *folio)
> +{
> + int ret;
> + struct llist_node *head;
> + struct raw_hwp_page *curr, *next;
> + struct page *page;
> + unsigned long pfn;
> +
> + /*
> + * Since folio is still in the folio_batch, drop the refcount
> + * elevated by filemap_get_folios.
> + */
> + folio_put_refs(folio, 1);
> + head = llist_del_all(raw_hwp_list_head(folio));
In my opinion we should wait until your other patch set is approved to
decide whether the folio raw_hwp_list has to be removed from the folio or
whether it should be left in place so that the recycling of this huge page
works correctly...
> +
> + /*
> + * Release refcounts held by try_memory_failure_hugetlb, one per
> + * HWPoison-ed page in the raw hwp list.
> + */
> + llist_for_each_entry(curr, head, node) {
> + SetPageHWPoison(curr->page);
> + folio_put(folio);
> + }
> +
> + /* Refcount now should be zero and ready to dissolve folio. */
> + ret = dissolve_free_hugetlb_folio(folio);
> + if (ret) {
> + pr_err("failed to dissolve hugetlb folio: %d\n", ret);
> + return;
> + }
> +
> + llist_for_each_entry_safe(curr, next, head, node) {
> + page = curr->page;
> + pfn = page_to_pfn(page);
> + drain_all_pages(page_zone(page));
> + if (!take_page_off_buddy(page))
> + pr_err("%#lx: unable to take off buddy allocator\n", pfn);
> +
> + page_ref_inc(page);
> + kfree(curr);
> + pr_info("%#lx: pending hard offline completed\n", pfn);
> + }
> +}
Let's revisit the function above when an agreement is reached on the
hugetlb page recycling proposal.
> +
> +void filemap_offline_hwpoison_folio(struct address_space *mapping,
> + struct folio *folio)
> +{
> + WARN_ON_ONCE(!mapping);
> +
> + if (!folio_test_hwpoison(folio))
> + return;
> +
> + /* Pending MFR currently only exist for hugetlb. */
> + if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
> + filemap_offline_hwpoison_folio_hugetlb(folio);
> +}
> +
> /*
> * Taking refcount of hugetlb pages needs extra care about race conditions
> * with basic operations like hugepage allocation/free/demotion.
HTH
Best regards,
William.
^ permalink raw reply [flat|nested] 10+ messages in thread

* Re: [PATCH v2 1/3] mm: memfd/hugetlb: introduce memfd-based userspace MFR policy
2025-11-16 1:32 ` [PATCH v2 1/3] mm: memfd/hugetlb: introduce memfd-based userspace MFR policy Jiaqi Yan
2025-11-25 21:47 ` William Roche
@ 2025-11-25 22:04 ` William Roche
2025-12-03 4:11 ` jane.chu
2 siblings, 0 replies; 10+ messages in thread
From: William Roche @ 2025-11-25 22:04 UTC (permalink / raw)
To: Jiaqi Yan, nao.horiguchi, linmiaohe, harry.yoo
Cc: tony.luck, wangkefeng.wang, willy, jane.chu, akpm, osalvador,
rientjes, duenwen, jthoughton, jgg, ankita, peterx,
sidhartha.kumar, ziy, david, dave.hansen, muchun.song, linux-mm,
linux-kernel, linux-fsdevel
Sorry, resending for the non-HTML version.
--
Hello Jiaqi,
Here is a summary of a few nits in this code:
- Some function declarations look problematic to me
- The parameter test used to activate the feature looks incorrect
- The function signature changes are probably not necessary
- Maybe we should wait for an agreement on your other proposal:
[PATCH v1 0/2] Only free healthy pages in high-order HWPoison folio
The last item is not a nit, but your above proposal may require keeping
all the data of a hugetlb folio in order to recycle it correctly
(especially the list of poisoned sub-pages), and to avoid the race
condition of returning poisoned pages to the freelist right before
removing them, so you may need to change some aspects of the current code.
On 11/16/25 02:32, Jiaqi Yan wrote:
> [...]
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 8e63e46b8e1f0..b7733ef5ee917 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -871,10 +871,17 @@ int dissolve_free_hugetlb_folios(unsigned long start_pfn,
>
> #ifdef CONFIG_MEMORY_FAILURE
> extern void folio_clear_hugetlb_hwpoison(struct folio *folio);
> +extern bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
> + struct address_space *mapping);
> #else
> static inline void folio_clear_hugetlb_hwpoison(struct folio *folio)
> {
> }
> +static inline bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio
> + struct address_space *mapping)
> +{
> + return false;
> +}
> #endif
You are conditionally declaring this hugetlb_should_keep_hwpoison_mapped()
function and implementing it in mm/hugetlb.c, but this file is compiled in
both cases (CONFIG_MEMORY_FAILURE enabled or not). So you either need a
single declaration, consistent with the implementation, and use something
like this:
bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
					 struct address_space *mapping)
{
+#ifdef CONFIG_MEMORY_FAILURE
	if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
		return false;
@@ -6087,6 +6088,9 @@ bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
		return false;
	return mapping_mf_keep_ue_mapped(mapping);
+#else
+	return false;
+#endif
}
Or keep your double declaration and only compile the implementation when
CONFIG_MEMORY_FAILURE is enabled:
+#ifdef CONFIG_MEMORY_FAILURE
bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
					 struct address_space *mapping)
{
	if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
		return false;
@@ -6087,6 +6088,9 @@ bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
		return false;
	return mapping_mf_keep_ue_mapped(mapping);
}
+#endif
>
> #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 09b581c1d878d..9ad511aacde7c 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -213,6 +213,8 @@ enum mapping_flags {
> AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9,
> AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't
> account usage to user cgroups */
> + /* For MFD_MF_KEEP_UE_MAPPED. */
> + AS_MF_KEEP_UE_MAPPED = 11,
> /* Bits 16-25 are used for FOLIO_ORDER */
> AS_FOLIO_ORDER_BITS = 5,
> AS_FOLIO_ORDER_MIN = 16,
> @@ -348,6 +350,16 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres
> return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
> }
>
> +static inline bool mapping_mf_keep_ue_mapped(const struct address_space *mapping)
> +{
> + return test_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
> +}
> +
> +static inline void mapping_set_mf_keep_ue_mapped(struct address_space *mapping)
> +{
> + set_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
> +}
> +
> static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
> {
> return mapping->gfp_mask;
> @@ -1274,6 +1286,18 @@ void replace_page_cache_folio(struct folio *old, struct folio *new);
> void delete_from_page_cache_batch(struct address_space *mapping,
> struct folio_batch *fbatch);
> bool filemap_release_folio(struct folio *folio, gfp_t gfp);
> +#ifdef CONFIG_MEMORY_FAILURE
> +/*
> + * Provided by memory failure to offline HWPoison-ed folio managed by memfd.
> + */
> +void filemap_offline_hwpoison_folio(struct address_space *mapping,
> + struct folio *folio);
> +#else
> +void filemap_offline_hwpoison_folio(struct address_space *mapping,
> + struct folio *folio)
> +{
> +}
> +#endif
> loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
> int whence);
>
This filemap_offline_hwpoison_folio() declaration is also problematic in
the case without CONFIG_MEMORY_FAILURE: as written, we would define a
non-static filemap_offline_hwpoison_folio() function in every file that
includes this "pagemap.h" header, leading to multiple definitions.
This could be solved by using "static inline" in this second case.
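In other words, something like this for the !CONFIG_MEMORY_FAILURE branch
(just a sketch of the stub):

+#else
+static inline void filemap_offline_hwpoison_folio(struct address_space *mapping,
+						   struct folio *folio)
+{
+}
+#endif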
> diff --git a/mm/memfd.c b/mm/memfd.c
> index 1d109c1acf211..bfdde4cf90500 100644
> --- a/mm/memfd.c
> +++ b/mm/memfd.c
> @@ -313,7 +313,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
> #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
> #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
>
> -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
> +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \
> + MFD_NOEXEC_SEAL | MFD_EXEC | MFD_MF_KEEP_UE_MAPPED)
>
> static int check_sysctl_memfd_noexec(unsigned int *flags)
> {
> @@ -387,6 +388,8 @@ static int sanitize_flags(unsigned int *flags_ptr)
> if (!(flags & MFD_HUGETLB)) {
> if (flags & ~MFD_ALL_FLAGS)
> return -EINVAL;
> + if (flags & MFD_MF_KEEP_UE_MAPPED)
> + return -EINVAL;
> } else {
> /* Allow huge page size encoding in flags. */
> if (flags & ~(MFD_ALL_FLAGS |
> @@ -447,6 +450,16 @@ static struct file *alloc_file(const char *name, unsigned int flags)
> file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
> file->f_flags |= O_LARGEFILE;
>
> + /*
> + * MFD_MF_KEEP_UE_MAPPED can only be specified in memfd_create; no API
> + * to update it once memfd is created. MFD_MF_KEEP_UE_MAPPED is not
> + * seal-able.
> + *
> + * For now MFD_MF_KEEP_UE_MAPPED is only supported by HugeTLBFS.
> + */
> + if (flags & (MFD_HUGETLB | MFD_MF_KEEP_UE_MAPPED))
> + mapping_set_mf_keep_ue_mapped(file->f_mapping);
> +
The flag test used to set the "keep" value on the address space should
check MFD_MF_KEEP_UE_MAPPED alone: "flags & (MFD_HUGETLB |
MFD_MF_KEEP_UE_MAPPED)" is also true for a hugetlb memfd created without
the new flag, and sanitize_flags() has already verified that
MFD_MF_KEEP_UE_MAPPED is only given combined with MFD_HUGETLB.
This is a nit identified by Harry Yoo during our internal conversations.
Thanks Harry!
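In other words, something like (untested):

	if (flags & MFD_MF_KEEP_UE_MAPPED)
		mapping_set_mf_keep_ue_mapped(file->f_mapping);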
> if (flags & MFD_NOEXEC_SEAL) {
> struct inode *inode = file_inode(file);
>
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 3edebb0cda30b..c5e3e28872797 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -373,11 +373,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
> * Schedule a process for later kill.
> * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
> */
> -static void __add_to_kill(struct task_struct *tsk, const struct page *p,
> +static void __add_to_kill(struct task_struct *tsk, struct page *p,
> struct vm_area_struct *vma, struct list_head *to_kill,
> unsigned long addr)
Is there any reason to remove the "const" on the page structure in the
signature? It looks like you only do that for the new call to
page_folio(p), but the page itself is never modified.
> {
> struct to_kill *tk;
> + struct folio *folio;
You could use a "const" struct folio *folio too.
> + struct address_space *mapping;
>
> tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
> if (!tk) {
> @@ -388,8 +390,19 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
> tk->addr = addr;
> if (is_zone_device_page(p))
> tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
> - else
> - tk->size_shift = folio_shift(page_folio(p));
> + else {
> + folio = page_folio(p);
Now with both folio and p being "const", the code should work.
> + mapping = folio_mapping(folio);
> + if (mapping && mapping_mf_keep_ue_mapped(mapping))
> + /*
> + * Let userspace know the radius of HWPoison is
> + * the size of raw page; accessing other pages
> + * inside the folio is still ok.
> + */
> + tk->size_shift = PAGE_SHIFT;
> + else
> + tk->size_shift = folio_shift(folio);
> + }
>
> /*
> * Send SIGKILL if "tk->addr == -EFAULT". Also, as
> @@ -414,7 +427,7 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
> list_add_tail(&tk->nd, to_kill);
> }
>
> -static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p,
> +static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
No need to change the signature here either (otherwise add_to_kill_fsdax()
and add_to_kill_ksm() would have needed the same change, and both were
left untouched).
> struct vm_area_struct *vma, struct list_head *to_kill,
> unsigned long addr)
> {
> @@ -535,7 +548,7 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
> * Collect processes when the error hit an anonymous page.
> */
> static void collect_procs_anon(const struct folio *folio,
> - const struct page *page, struct list_head *to_kill,
> + struct page *page, struct list_head *to_kill,
No need to change
> int force_early)
> {
> struct task_struct *tsk;
> @@ -573,7 +586,7 @@ static void collect_procs_anon(const struct folio *folio,
> * Collect processes when the error hit a file mapped page.
> */
> static void collect_procs_file(const struct folio *folio,
> - const struct page *page, struct list_head *to_kill,
> + struct page *page, struct list_head *to_kill,
> int force_early)
No need to change
> {
> struct vm_area_struct *vma;
> @@ -655,7 +668,7 @@ static void collect_procs_fsdax(const struct page *page,
> /*
> * Collect the processes who have the corrupted page mapped to kill.
> */
> -static void collect_procs(const struct folio *folio, const struct page *page,
> +static void collect_procs(const struct folio *folio, struct page *page,
> struct list_head *tokill, int force_early)
> {
> if (!folio->mapping)
> @@ -1173,6 +1186,13 @@ static int me_huge_page(struct page_state *ps, struct page *p)
> }
> }
>
> + /*
> + * MF still needs to holds a refcount for the deferred actions in
> + * filemap_offline_hwpoison_folio.
> + */
> + if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
> + return res;
> +
> if (has_extra_refcount(ps, p, extra_pins))
> res = MF_FAILED;
>
> @@ -1569,6 +1589,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> {
> LIST_HEAD(tokill);
> bool unmap_success;
> + bool keep_mapped;
> int forcekill;
> bool mlocked = folio_test_mlocked(folio);
>
> @@ -1596,8 +1617,12 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> */
> collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
>
> - unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
> - if (!unmap_success)
> + keep_mapped = hugetlb_should_keep_hwpoison_mapped(folio, folio->mapping);
> + if (!keep_mapped)
> + unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
> +
> + unmap_success = !folio_mapped(folio);
> + if (!keep_mapped && !unmap_success)
> pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
> pfn, folio_mapcount(folio));
>
> @@ -1622,7 +1647,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> !unmap_success;
> kill_procs(&tokill, forcekill, pfn, flags);
>
> - return unmap_success;
> + return unmap_success || keep_mapped;
> }
>
> static int identify_page_state(unsigned long pfn, struct page *p,
> @@ -1862,6 +1887,13 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
> unsigned long count = 0;
>
> head = llist_del_all(raw_hwp_list_head(folio));
> + /*
> + * If filemap_offline_hwpoison_folio_hugetlb is handling this folio,
> + * it has already taken off the head of the llist.
> + */
> + if (head == NULL)
> + return 0;
> +
This may not be necessary depending on how we recycle hugetlb pages --
see below too.
> llist_for_each_entry_safe(p, next, head, node) {
> if (move_flag)
> SetPageHWPoison(p->page);
> @@ -1878,7 +1910,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> struct llist_head *head;
> struct raw_hwp_page *raw_hwp;
> struct raw_hwp_page *p;
> - int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
> + struct address_space *mapping = folio->mapping;
> + bool has_hwpoison = folio_test_set_hwpoison(folio);
>
> /*
> * Once the hwpoison hugepage has lost reliable raw error info,
> @@ -1897,8 +1930,15 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> if (raw_hwp) {
> raw_hwp->page = page;
> llist_add(&raw_hwp->node, head);
> + if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
> + /*
> + * A new raw HWPoison page. Don't return HWPOISON.
> + * Error event will be counted in action_result().
> + */
> + return 0;
> +
> /* the first error event will be counted in action_result(). */
> - if (ret)
> + if (has_hwpoison)
> num_poisoned_pages_inc(page_to_pfn(page));
> } else {
> /*
> @@ -1913,7 +1953,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> */
> __folio_free_raw_hwp(folio, false);
> }
> - return ret;
> +
> + return has_hwpoison ? -EHWPOISON : 0;
> }
>
> static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
> @@ -2002,6 +2043,63 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> return ret;
> }
>
> +static void filemap_offline_hwpoison_folio_hugetlb(struct folio *folio)
> +{
> + int ret;
> + struct llist_node *head;
> + struct raw_hwp_page *curr, *next;
> + struct page *page;
> + unsigned long pfn;
> +
> + /*
> + * Since folio is still in the folio_batch, drop the refcount
> + * elevated by filemap_get_folios.
> + */
> + folio_put_refs(folio, 1);
> + head = llist_del_all(raw_hwp_list_head(folio));
In my opinion we should wait until your other patch set is approved to
decide whether the folio raw_hwp_list has to be removed from the folio or
whether it should be left in place so that the recycling of this huge page
works correctly...
> +
> + /*
> + * Release refcounts held by try_memory_failure_hugetlb, one per
> + * HWPoison-ed page in the raw hwp list.
> + */
> + llist_for_each_entry(curr, head, node) {
> + SetPageHWPoison(curr->page);
> + folio_put(folio);
> + }
> +
> + /* Refcount now should be zero and ready to dissolve folio. */
> + ret = dissolve_free_hugetlb_folio(folio);
> + if (ret) {
> + pr_err("failed to dissolve hugetlb folio: %d\n", ret);
> + return;
> + }
> +
> + llist_for_each_entry_safe(curr, next, head, node) {
> + page = curr->page;
> + pfn = page_to_pfn(page);
> + drain_all_pages(page_zone(page));
> + if (!take_page_off_buddy(page))
> + pr_err("%#lx: unable to take off buddy allocator\n", pfn);
> +
> + page_ref_inc(page);
> + kfree(curr);
> + pr_info("%#lx: pending hard offline completed\n", pfn);
> + }
> +}
Let's revisit the function above when an agreement is reached on the
hugetlb page recycling proposal.
> +
> +void filemap_offline_hwpoison_folio(struct address_space *mapping,
> + struct folio *folio)
> +{
> + WARN_ON_ONCE(!mapping);
> +
> + if (!folio_test_hwpoison(folio))
> + return;
> +
> + /* Pending MFR currently only exist for hugetlb. */
> + if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
> + filemap_offline_hwpoison_folio_hugetlb(folio);
> +}
> +
> /*
> * Taking refcount of hugetlb pages needs extra care about race conditions
> * with basic operations like hugepage allocation/free/demotion.
HTH
Best regards,
William.
^ permalink raw reply [flat|nested] 10+ messages in thread

* Re: [PATCH v2 1/3] mm: memfd/hugetlb: introduce memfd-based userspace MFR policy
2025-11-16 1:32 ` [PATCH v2 1/3] mm: memfd/hugetlb: introduce memfd-based userspace MFR policy Jiaqi Yan
2025-11-25 21:47 ` William Roche
2025-11-25 22:04 ` William Roche
@ 2025-12-03 4:11 ` jane.chu
2025-12-03 19:41 ` Jiaqi Yan
2 siblings, 1 reply; 10+ messages in thread
From: jane.chu @ 2025-12-03 4:11 UTC (permalink / raw)
To: Jiaqi Yan, nao.horiguchi, linmiaohe, william.roche, harry.yoo
Cc: tony.luck, wangkefeng.wang, willy, akpm, osalvador, rientjes,
duenwen, jthoughton, jgg, ankita, peterx, sidhartha.kumar, ziy,
david, dave.hansen, muchun.song, linux-mm, linux-kernel,
linux-fsdevel
Hi, Jiaqi,
Thanks for the work, my comments inline.
On 11/15/2025 5:32 PM, Jiaqi Yan wrote:
> Sometimes immediately hard offlining a large chunk of contigous memory
> having uncorrected memory errors (UE) may not be the best option.
> Cloud providers usually serve capacity- and performance-critical guest
> memory with 1G HugeTLB hugepages, as this significantly reduces the
> overhead associated with managing page tables and TLB misses. However,
> for today's HugeTLB system, once a byte of memory in a hugepage is
> hardware corrupted, the kernel discards the whole hugepage, including
> the healthy portion. Customer workload running in the VM can hardly
> recover from such a great loss of memory.
>
> Therefore keeping or discarding a large chunk of contiguous memory
> owned by userspace (particularly to serve guest memory) due to
> recoverable UE may better be controlled by userspace process
> that owns the memory, e.g. VMM in Cloud environment.
>
> Introduce a memfd-based userspace memory failure (MFR) policy,
> MFD_MF_KEEP_UE_MAPPED. It is intended to be supported for other memfd,
> but the current implementation only covers HugeTLB.
>
> For any hugepage associated with the MFD_MF_KEEP_UE_MAPPED enabled memfd,
> whenever it runs into a UE, MFR doesn't hard offline the HWPoison-ed
> huge folio. IOW the HWPoison-ed memory remains accessible via the memory
> mapping created with that memfd. MFR still sends SIGBUS to the process
> as required. MFR also still maintains HWPoison metadata for the hugepage
> having the UE.
>
> A HWPoison-ed hugepage will be immediately isolated and prevented from
> future allocation once userspace truncates it via the memfd, or the
> owning memfd is closed.
>
> By default MFD_MF_KEEP_UE_MAPPED is not set, and MFR hard offlines
> hugepages having UEs.
>
> Tested with selftest in the follow-up commit.
>
> Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
> Tested-by: William Roche <william.roche@oracle.com>
> ---
> fs/hugetlbfs/inode.c | 25 +++++++-
> include/linux/hugetlb.h | 7 +++
> include/linux/pagemap.h | 24 +++++++
> include/uapi/linux/memfd.h | 6 ++
> mm/hugetlb.c | 20 +++++-
> mm/memfd.c | 15 ++++-
> mm/memory-failure.c | 124 +++++++++++++++++++++++++++++++++----
> 7 files changed, 202 insertions(+), 19 deletions(-)
>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index f42548ee9083c..f8a5aa091d51d 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -532,6 +532,18 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
> }
>
> folio_unlock(folio);
> +
> + /*
> + * There may be pending HWPoison-ed folios when a memfd is being
> + * removed or part of it is being truncated.
> + *
> + * HugeTLBFS' error_remove_folio keeps the HWPoison-ed folios in
> + * page cache until mm wants to drop the folio at the end of the
> + * of the filemap. At this point, if memory failure was delayed
> + * by MFD_MF_KEEP_UE_MAPPED in the past, we can now deal with it.
> + */
> + filemap_offline_hwpoison_folio(mapping, folio);
> +
> return ret;
> }
Looks okay.
>
> @@ -563,13 +575,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
> const pgoff_t end = lend >> PAGE_SHIFT;
> struct folio_batch fbatch;
> pgoff_t next, index;
> - int i, freed = 0;
> + int i, j, freed = 0;
> bool truncate_op = (lend == LLONG_MAX);
>
> folio_batch_init(&fbatch);
> next = lstart >> PAGE_SHIFT;
> while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
> - for (i = 0; i < folio_batch_count(&fbatch); ++i) {
> + for (i = 0, j = 0; i < folio_batch_count(&fbatch); ++i) {
> struct folio *folio = fbatch.folios[i];
> u32 hash = 0;
>
> @@ -584,8 +596,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
> index, truncate_op))
> freed++;
>
> + /*
> + * Skip HWPoison-ed hugepages, which should no
> + * longer be hugetlb if successfully dissolved.
> + */
> + if (folio_test_hugetlb(folio))
> + fbatch.folios[j++] = folio;
> +
> mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> }
> + fbatch.nr = j;
> +
> folio_batch_release(&fbatch);
> cond_resched();
> }
Looks okay.
But this reminds me that for now remove_inode_single_folio() has no path
that returns 'false' anyway, and if it ever does, remove_inode_hugepages()
will be broken since it has no logic to account for folios that failed to
be removed. Do you mind making remove_inode_single_folio() a void function
in order to avoid the confusion?
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 8e63e46b8e1f0..b7733ef5ee917 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -871,10 +871,17 @@ int dissolve_free_hugetlb_folios(unsigned long start_pfn,
>
> #ifdef CONFIG_MEMORY_FAILURE
> extern void folio_clear_hugetlb_hwpoison(struct folio *folio);
> +extern bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
> + struct address_space *mapping);
> #else
> static inline void folio_clear_hugetlb_hwpoison(struct folio *folio)
> {
> }
> +static inline bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio
> + struct address_space *mapping)
> +{
> + return false;
> +}
> #endif
It appears that hugetlb_should_keep_hwpoison_mapped() is only called
within mm/memory-failure.c. How about moving it there?
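Something like this, as a sketch (the body being the hugetlb.c
implementation from this patch, moved over as a static helper in
mm/memory-failure.c, assuming mapping_mf_keep_ue_mapped() is visible
there via linux/pagemap.h):

static bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
						struct address_space *mapping)
{
	if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
		return false;

	if (!mapping)
		return false;

	return mapping_mf_keep_ue_mapped(mapping);
}

That would also sidestep the declaration issue William pointed out, since
mm/memory-failure.c is only built when CONFIG_MEMORY_FAILURE is enabled.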
>
> #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 09b581c1d878d..9ad511aacde7c 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -213,6 +213,8 @@ enum mapping_flags {
> AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9,
> AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't
> account usage to user cgroups */
> + /* For MFD_MF_KEEP_UE_MAPPED. */
> + AS_MF_KEEP_UE_MAPPED = 11,
> /* Bits 16-25 are used for FOLIO_ORDER */
> AS_FOLIO_ORDER_BITS = 5,
> AS_FOLIO_ORDER_MIN = 16,
> @@ -348,6 +350,16 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres
> return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
> }
>
Okay.
> +static inline bool mapping_mf_keep_ue_mapped(const struct address_space *mapping)
> +{
> + return test_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
> +}
> +
> +static inline void mapping_set_mf_keep_ue_mapped(struct address_space *mapping)
> +{
> + set_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
> +}
> +
> static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
> {
> return mapping->gfp_mask;
> @@ -1274,6 +1286,18 @@ void replace_page_cache_folio(struct folio *old, struct folio *new);
> void delete_from_page_cache_batch(struct address_space *mapping,
> struct folio_batch *fbatch);
> bool filemap_release_folio(struct folio *folio, gfp_t gfp);
> +#ifdef CONFIG_MEMORY_FAILURE
> +/*
> + * Provided by memory failure to offline HWPoison-ed folio managed by memfd.
> + */
> +void filemap_offline_hwpoison_folio(struct address_space *mapping,
> + struct folio *folio);
> +#else
> +void filemap_offline_hwpoison_folio(struct address_space *mapping,
> + struct folio *folio)
> +{
> +}
> +#endif
Okay.
> loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
> int whence);
>
> diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
> index 273a4e15dfcff..d9875da551b7f 100644
> --- a/include/uapi/linux/memfd.h
> +++ b/include/uapi/linux/memfd.h
> @@ -12,6 +12,12 @@
> #define MFD_NOEXEC_SEAL 0x0008U
> /* executable */
> #define MFD_EXEC 0x0010U
> +/*
> + * Keep owned folios mapped when uncorrectable memory errors (UE) causes
> + * memory failure (MF) within the folio. Only at the end of the mapping
> + * will its HWPoison-ed folios be dealt with.
> + */
> +#define MFD_MF_KEEP_UE_MAPPED 0x0020U
>
> /*
> * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 0455119716ec0..dd3bc0b75e059 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6415,6 +6415,18 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned
> return same;
> }
>
> +bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
> + struct address_space *mapping)
> +{
> + if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
> + return false;
> +
> + if (!mapping)
> + return false;
> +
> + return mapping_mf_keep_ue_mapped(mapping);
> +}
> +
Okay.
> static vm_fault_t hugetlb_no_page(struct address_space *mapping,
> struct vm_fault *vmf)
> {
> @@ -6537,9 +6549,11 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
> * So we need to block hugepage fault by PG_hwpoison bit check.
> */
> if (unlikely(folio_test_hwpoison(folio))) {
> - ret = VM_FAULT_HWPOISON_LARGE |
> - VM_FAULT_SET_HINDEX(hstate_index(h));
> - goto backout_unlocked;
> + if (!mapping_mf_keep_ue_mapped(mapping)) {
> + ret = VM_FAULT_HWPOISON_LARGE |
> + VM_FAULT_SET_HINDEX(hstate_index(h));
> + goto backout_unlocked;
> + }
> }
>
Looks okay, but I am curious about Miaohe's and others' take.
Allowing a known poisoned hugetlb page to be faulted in preserves
capacity, versus a SIGBUS from the MF handler indicating a disruption
and loss of both data and capacity.
No strong opinion here, just wondering if there is any merit in limiting
the scope to the MF handler only.
> /* Check for page in userfault range. */
> diff --git a/mm/memfd.c b/mm/memfd.c
> index 1d109c1acf211..bfdde4cf90500 100644
> --- a/mm/memfd.c
> +++ b/mm/memfd.c
> @@ -313,7 +313,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
> #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
> #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
>
> -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
> +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \
> + MFD_NOEXEC_SEAL | MFD_EXEC | MFD_MF_KEEP_UE_MAPPED)
>
> static int check_sysctl_memfd_noexec(unsigned int *flags)
> {
> @@ -387,6 +388,8 @@ static int sanitize_flags(unsigned int *flags_ptr)
> if (!(flags & MFD_HUGETLB)) {
> if (flags & ~MFD_ALL_FLAGS)
> return -EINVAL;
> + if (flags & MFD_MF_KEEP_UE_MAPPED)
> + return -EINVAL;
> } else {
> /* Allow huge page size encoding in flags. */
> if (flags & ~(MFD_ALL_FLAGS |
> @@ -447,6 +450,16 @@ static struct file *alloc_file(const char *name, unsigned int flags)
> file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
> file->f_flags |= O_LARGEFILE;
>
> + /*
> + * MFD_MF_KEEP_UE_MAPPED can only be specified in memfd_create; no API
> + * to update it once memfd is created. MFD_MF_KEEP_UE_MAPPED is not
> + * seal-able.
> + *
> + * For now MFD_MF_KEEP_UE_MAPPED is only supported by HugeTLBFS.
> + */
> + if (flags & (MFD_HUGETLB | MFD_MF_KEEP_UE_MAPPED))
> + mapping_set_mf_keep_ue_mapped(file->f_mapping);
> +
> if (flags & MFD_NOEXEC_SEAL) {
> struct inode *inode = file_inode(file);
>
Okay.
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 3edebb0cda30b..c5e3e28872797 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -373,11 +373,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
> * Schedule a process for later kill.
> * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
> */
> -static void __add_to_kill(struct task_struct *tsk, const struct page *p,
> +static void __add_to_kill(struct task_struct *tsk, struct page *p,
> struct vm_area_struct *vma, struct list_head *to_kill,
> unsigned long addr)
> {
> struct to_kill *tk;
> + struct folio *folio;
> + struct address_space *mapping;
>
> tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
> if (!tk) {
> @@ -388,8 +390,19 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
> tk->addr = addr;
> if (is_zone_device_page(p))
> tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
> - else
> - tk->size_shift = folio_shift(page_folio(p));
> + else {
> + folio = page_folio(p);
> + mapping = folio_mapping(folio);
> + if (mapping && mapping_mf_keep_ue_mapped(mapping))
> + /*
> + * Let userspace know the radius of HWPoison is
> + * the size of raw page; accessing other pages
> + * inside the folio is still ok.
> + */
> + tk->size_shift = PAGE_SHIFT;
> + else
> + tk->size_shift = folio_shift(folio);
> + }
>
> /*
> * Send SIGKILL if "tk->addr == -EFAULT". Also, as
> @@ -414,7 +427,7 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
> list_add_tail(&tk->nd, to_kill);
> }
>
> -static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p,
> +static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
> struct vm_area_struct *vma, struct list_head *to_kill,
> unsigned long addr)
> {
> @@ -535,7 +548,7 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
> * Collect processes when the error hit an anonymous page.
> */
> static void collect_procs_anon(const struct folio *folio,
> - const struct page *page, struct list_head *to_kill,
> + struct page *page, struct list_head *to_kill,
> int force_early)
> {
> struct task_struct *tsk;
> @@ -573,7 +586,7 @@ static void collect_procs_anon(const struct folio *folio,
> * Collect processes when the error hit a file mapped page.
> */
> static void collect_procs_file(const struct folio *folio,
> - const struct page *page, struct list_head *to_kill,
> + struct page *page, struct list_head *to_kill,
> int force_early)
> {
> struct vm_area_struct *vma;
> @@ -655,7 +668,7 @@ static void collect_procs_fsdax(const struct page *page,
> /*
> * Collect the processes who have the corrupted page mapped to kill.
> */
> -static void collect_procs(const struct folio *folio, const struct page *page,
> +static void collect_procs(const struct folio *folio, struct page *page,
> struct list_head *tokill, int force_early)
> {
> if (!folio->mapping)
> @@ -1173,6 +1186,13 @@ static int me_huge_page(struct page_state *ps, struct page *p)
> }
> }
>
> + /*
> + * MF still needs to holds a refcount for the deferred actions in
> + * filemap_offline_hwpoison_folio.
> + */
> + if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
> + return res;
> +
Okay.
> if (has_extra_refcount(ps, p, extra_pins))
> res = MF_FAILED;
>
> @@ -1569,6 +1589,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> {
> LIST_HEAD(tokill);
> bool unmap_success;
> + bool keep_mapped;
> int forcekill;
> bool mlocked = folio_test_mlocked(folio);
>
> @@ -1596,8 +1617,12 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> */
> collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
>
> - unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
> - if (!unmap_success)
> + keep_mapped = hugetlb_should_keep_hwpoison_mapped(folio, folio->mapping);
> + if (!keep_mapped)
> + unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
> +
> + unmap_success = !folio_mapped(folio);
> + if (!keep_mapped && !unmap_success)
> pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
> pfn, folio_mapcount(folio));
>
> @@ -1622,7 +1647,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> !unmap_success;
> kill_procs(&tokill, forcekill, pfn, flags);
>
> - return unmap_success;
> + return unmap_success || keep_mapped;
> }
Okay.
>
> static int identify_page_state(unsigned long pfn, struct page *p,
> @@ -1862,6 +1887,13 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
> unsigned long count = 0;
>
> head = llist_del_all(raw_hwp_list_head(folio));
> + /*
> + * If filemap_offline_hwpoison_folio_hugetlb is handling this folio,
> + * it has already taken off the head of the llist.
> + */
> + if (head == NULL)
> + return 0;
> +
> llist_for_each_entry_safe(p, next, head, node) {
> if (move_flag)
> SetPageHWPoison(p->page);
> @@ -1878,7 +1910,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> struct llist_head *head;
> struct raw_hwp_page *raw_hwp;
> struct raw_hwp_page *p;
> - int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
> + struct address_space *mapping = folio->mapping;
> + bool has_hwpoison = folio_test_set_hwpoison(folio);
>
> /*
> * Once the hwpoison hugepage has lost reliable raw error info,
> @@ -1897,8 +1930,15 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> if (raw_hwp) {
> raw_hwp->page = page;
> llist_add(&raw_hwp->node, head);
> + if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
> + /*
> + * A new raw HWPoison page. Don't return HWPOISON.
> + * Error event will be counted in action_result().
> + */
> + return 0;
> +
> /* the first error event will be counted in action_result(). */
> - if (ret)
> + if (has_hwpoison)
> num_poisoned_pages_inc(page_to_pfn(page));
> } else {
> /*
> @@ -1913,7 +1953,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> */
> __folio_free_raw_hwp(folio, false);
> }
> - return ret;
> +
> + return has_hwpoison ? -EHWPOISON : 0;
> }
Okay.
>
> static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
> @@ -2002,6 +2043,63 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> return ret;
> }
>
> +static void filemap_offline_hwpoison_folio_hugetlb(struct folio *folio)
> +{
> + int ret;
> + struct llist_node *head;
> + struct raw_hwp_page *curr, *next;
> + struct page *page;
> + unsigned long pfn;
> +
> + /*
> + * Since folio is still in the folio_batch, drop the refcount
> + * elevated by filemap_get_folios.
> + */
> + folio_put_refs(folio, 1);
> + head = llist_del_all(raw_hwp_list_head(folio));
> +
> + /*
> + * Release refcounts held by try_memory_failure_hugetlb, one per
> + * HWPoison-ed page in the raw hwp list.
> + */
> + llist_for_each_entry(curr, head, node) {
> + SetPageHWPoison(curr->page);
> + folio_put(folio);
> + }
> +
> + /* Refcount now should be zero and ready to dissolve folio. */
> + ret = dissolve_free_hugetlb_folio(folio);
> + if (ret) {
> + pr_err("failed to dissolve hugetlb folio: %d\n", ret);
> + return;
> + }
> +
> + llist_for_each_entry_safe(curr, next, head, node) {
> + page = curr->page;
> + pfn = page_to_pfn(page);
> + drain_all_pages(page_zone(page));
> + if (!take_page_off_buddy(page))
> + pr_err("%#lx: unable to take off buddy allocator\n", pfn);
> +
> + page_ref_inc(page);
> + kfree(curr);
> + pr_info("%#lx: pending hard offline completed\n", pfn);
> + }
> +}
> +
> +void filemap_offline_hwpoison_folio(struct address_space *mapping,
> + struct folio *folio)
> +{
> + WARN_ON_ONCE(!mapping);
> +
> + if (!folio_test_hwpoison(folio))
> + return;
> +
> + /* Pending MFR currently only exist for hugetlb. */
> + if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
> + filemap_offline_hwpoison_folio_hugetlb(folio);
> +}
> +
> /*
> * Taking refcount of hugetlb pages needs extra care about race conditions
> * with basic operations like hugepage allocation/free/demotion.
Looks good.
thanks,
-jane
^ permalink raw reply [flat|nested] 10+ messages in thread

* Re: [PATCH v2 1/3] mm: memfd/hugetlb: introduce memfd-based userspace MFR policy
2025-12-03 4:11 ` jane.chu
@ 2025-12-03 19:41 ` Jiaqi Yan
0 siblings, 0 replies; 10+ messages in thread
From: Jiaqi Yan @ 2025-12-03 19:41 UTC (permalink / raw)
To: jane.chu, william.roche
Cc: nao.horiguchi, linmiaohe, harry.yoo, tony.luck, wangkefeng.wang,
willy, akpm, osalvador, rientjes, duenwen, jthoughton, jgg,
ankita, peterx, sidhartha.kumar, ziy, david, dave.hansen,
muchun.song, linux-mm, linux-kernel, linux-fsdevel
On Tue, Dec 2, 2025 at 8:11 PM <jane.chu@oracle.com> wrote:
>
> Hi, Jiaqi,
>
> Thanks for the work, my comments inline.
Thank you both for the thorough and helpful reviews, Jane and William!
I plan to first rework "[PATCH v1 0/2] Only free healthy pages in
high-order HWPoison folio", given it is key to the concerns you have
with this patch. Then I will address your comments on code
quality/readability for this patch.
>
> On 11/15/2025 5:32 PM, Jiaqi Yan wrote:
> > Sometimes immediately hard offlining a large chunk of contigous memory
> > having uncorrected memory errors (UE) may not be the best option.
> > Cloud providers usually serve capacity- and performance-critical guest
> > memory with 1G HugeTLB hugepages, as this significantly reduces the
> > overhead associated with managing page tables and TLB misses. However,
> > for today's HugeTLB system, once a byte of memory in a hugepage is
> > hardware corrupted, the kernel discards the whole hugepage, including
> > the healthy portion. Customer workload running in the VM can hardly
> > recover from such a great loss of memory.
> >
> > Therefore keeping or discarding a large chunk of contiguous memory
> > owned by userspace (particularly to serve guest memory) due to
> > recoverable UE may better be controlled by userspace process
> > that owns the memory, e.g. VMM in Cloud environment.
> >
> > Introduce a memfd-based userspace memory failure (MFR) policy,
> > MFD_MF_KEEP_UE_MAPPED. It is intended to be supported for other memfd,
> > but the current implementation only covers HugeTLB.
> >
> > For any hugepage associated with the MFD_MF_KEEP_UE_MAPPED enabled memfd,
> > whenever it runs into a UE, MFR doesn't hard offline the HWPoison-ed
> > huge folio. IOW the HWPoison-ed memory remains accessible via the memory
> > mapping created with that memfd. MFR still sends SIGBUS to the process
> > as required. MFR also still maintains HWPoison metadata for the hugepage
> > having the UE.
> >
> > A HWPoison-ed hugepage will be immediately isolated and prevented from
> > future allocation once userspace truncates it via the memfd, or the
> > owning memfd is closed.
> >
> > By default MFD_MF_KEEP_UE_MAPPED is not set, and MFR hard offlines
> > hugepages having UEs.
> >
> > Tested with selftest in the follow-up commit.
> >
> > Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
> > Tested-by: William Roche <william.roche@oracle.com>
> > ---
> > fs/hugetlbfs/inode.c | 25 +++++++-
> > include/linux/hugetlb.h | 7 +++
> > include/linux/pagemap.h | 24 +++++++
> > include/uapi/linux/memfd.h | 6 ++
> > mm/hugetlb.c | 20 +++++-
> > mm/memfd.c | 15 ++++-
> > mm/memory-failure.c | 124 +++++++++++++++++++++++++++++++++----
> > 7 files changed, 202 insertions(+), 19 deletions(-)
> >
> > diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> > index f42548ee9083c..f8a5aa091d51d 100644
> > --- a/fs/hugetlbfs/inode.c
> > +++ b/fs/hugetlbfs/inode.c
> > @@ -532,6 +532,18 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
> > }
> >
> > folio_unlock(folio);
> > +
> > + /*
> > + * There may be pending HWPoison-ed folios when a memfd is being
> > + * removed or part of it is being truncated.
> > + *
> > + * HugeTLBFS' error_remove_folio keeps the HWPoison-ed folios in
> > + * page cache until mm wants to drop the folio at the end of the
> > + * of the filemap. At this point, if memory failure was delayed
> > + * by MFD_MF_KEEP_UE_MAPPED in the past, we can now deal with it.
> > + */
> > + filemap_offline_hwpoison_folio(mapping, folio);
> > +
> > return ret;
> > }
>
> Looks okay.
>
> >
> > @@ -563,13 +575,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
> > const pgoff_t end = lend >> PAGE_SHIFT;
> > struct folio_batch fbatch;
> > pgoff_t next, index;
> > - int i, freed = 0;
> > + int i, j, freed = 0;
> > bool truncate_op = (lend == LLONG_MAX);
> >
> > folio_batch_init(&fbatch);
> > next = lstart >> PAGE_SHIFT;
> > while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
> > - for (i = 0; i < folio_batch_count(&fbatch); ++i) {
> > + for (i = 0, j = 0; i < folio_batch_count(&fbatch); ++i) {
> > struct folio *folio = fbatch.folios[i];
> > u32 hash = 0;
> >
> > @@ -584,8 +596,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
> > index, truncate_op))
> > freed++;
> >
> > + /*
> > + * Skip HWPoison-ed hugepages, which should no
> > + * longer be hugetlb if successfully dissolved.
> > + */
> > + if (folio_test_hugetlb(folio))
> > + fbatch.folios[j++] = folio;
> > +
> > mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> > }
> > + fbatch.nr = j;
> > +
> > folio_batch_release(&fbatch);
> > cond_resched();
> > }
>
> Looks okay.
>
> But this reminds me that for now remove_inode_single_folio() has no path
> to return 'false' anyway, and if it ever does, remove_inode_hugepages()
> will be broken since it has no logic to account for folios that failed
> to be removed. Do you mind making remove_inode_single_folio() a void
> function to avoid the confusion?
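
For illustration only, a rough sketch of that change (parameter list taken
from the call site quoted above, everything else left as in this patch):

static void remove_inode_single_folio(struct hstate *h, struct inode *inode,
				      struct address_space *mapping,
				      struct folio *folio, pgoff_t index,
				      bool truncate_op);

and in remove_inode_hugepages():

			/* Removal cannot fail, so count unconditionally. */
			remove_inode_single_folio(h, inode, mapping, folio,
						  index, truncate_op);
			freed++;
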
>
>
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 8e63e46b8e1f0..b7733ef5ee917 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -871,10 +871,17 @@ int dissolve_free_hugetlb_folios(unsigned long start_pfn,
> >
> > #ifdef CONFIG_MEMORY_FAILURE
> > extern void folio_clear_hugetlb_hwpoison(struct folio *folio);
> > +extern bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
> > + struct address_space *mapping);
> > #else
> > static inline void folio_clear_hugetlb_hwpoison(struct folio *folio)
> > {
> > }
> > +static inline bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
> > + struct address_space *mapping)
> > +{
> > + return false;
> > +}
> > #endif
>
> It appears that hugetlb_should_keep_hwpoison_mapped() is only called
> within mm/memory-failure.c. How about moving it there?
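
I.e. something along these lines, kept static in mm/memory-failure.c (that
file is only built with CONFIG_MEMORY_FAILURE=y, so the
!CONFIG_MEMORY_FAILURE stub in include/linux/hugetlb.h would no longer be
needed). Sketch only, body copied from the mm/hugetlb.c hunk further down:

static bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
						struct address_space *mapping)
{
	if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
		return false;

	if (!mapping)
		return false;

	return mapping_mf_keep_ue_mapped(mapping);
}
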
>
> >
> > #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
> > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> > index 09b581c1d878d..9ad511aacde7c 100644
> > --- a/include/linux/pagemap.h
> > +++ b/include/linux/pagemap.h
> > @@ -213,6 +213,8 @@ enum mapping_flags {
> > AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9,
> > AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't
> > account usage to user cgroups */
> > + /* For MFD_MF_KEEP_UE_MAPPED. */
> > + AS_MF_KEEP_UE_MAPPED = 11,
> > /* Bits 16-25 are used for FOLIO_ORDER */
> > AS_FOLIO_ORDER_BITS = 5,
> > AS_FOLIO_ORDER_MIN = 16,
> > @@ -348,6 +350,16 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres
> > return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
> > }
> >
> Okay.
>
> > +static inline bool mapping_mf_keep_ue_mapped(const struct address_space *mapping)
> > +{
> > + return test_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
> > +}
> > +
> > +static inline void mapping_set_mf_keep_ue_mapped(struct address_space *mapping)
> > +{
> > + set_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
> > +}
> > +
> > static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
> > {
> > return mapping->gfp_mask;
> > @@ -1274,6 +1286,18 @@ void replace_page_cache_folio(struct folio *old, struct folio *new);
> > void delete_from_page_cache_batch(struct address_space *mapping,
> > struct folio_batch *fbatch);
> > bool filemap_release_folio(struct folio *folio, gfp_t gfp);
> > +#ifdef CONFIG_MEMORY_FAILURE
> > +/*
> > + * Provided by memory failure to offline HWPoison-ed folio managed by memfd.
> > + */
> > +void filemap_offline_hwpoison_folio(struct address_space *mapping,
> > + struct folio *folio);
> > +#else
> > +static inline void filemap_offline_hwpoison_folio(struct address_space *mapping,
> > + struct folio *folio)
> > +{
> > +}
> > +#endif
>
> Okay.
>
> > loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
> > int whence);
> >
> > diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
> > index 273a4e15dfcff..d9875da551b7f 100644
> > --- a/include/uapi/linux/memfd.h
> > +++ b/include/uapi/linux/memfd.h
> > @@ -12,6 +12,12 @@
> > #define MFD_NOEXEC_SEAL 0x0008U
> > /* executable */
> > #define MFD_EXEC 0x0010U
> > +/*
> > + * Keep owned folios mapped when an uncorrectable memory error (UE) causes
> > + * memory failure (MF) within the folio. Only at the end of the mapping
> > + * will its HWPoison-ed folios be dealt with.
> > + */
> > +#define MFD_MF_KEEP_UE_MAPPED 0x0020U
> >
> > /*
> > * Huge page size encoding when MFD_HUGETLB is specified, and a huge page
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 0455119716ec0..dd3bc0b75e059 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -6415,6 +6415,18 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned
> > return same;
> > }
> >
> > +bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
> > + struct address_space *mapping)
> > +{
> > + if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
> > + return false;
> > +
> > + if (!mapping)
> > + return false;
> > +
> > + return mapping_mf_keep_ue_mapped(mapping);
> > +}
> > +
>
> Okay.
>
> > static vm_fault_t hugetlb_no_page(struct address_space *mapping,
> > struct vm_fault *vmf)
> > {
> > @@ -6537,9 +6549,11 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
> > * So we need to block hugepage fault by PG_hwpoison bit check.
> > */
> > if (unlikely(folio_test_hwpoison(folio))) {
> > - ret = VM_FAULT_HWPOISON_LARGE |
> > - VM_FAULT_SET_HINDEX(hstate_index(h));
> > - goto backout_unlocked;
> > + if (!mapping_mf_keep_ue_mapped(mapping)) {
> > + ret = VM_FAULT_HWPOISON_LARGE |
> > + VM_FAULT_SET_HINDEX(hstate_index(h));
> > + goto backout_unlocked;
> > + }
> > }
> >
>
> Looks okay, but am curious about Miaohe's and others' take.
>
> Allowing a known poisoned hugetlb page to be faulted in is done for the
> sake of capacity, versus a SIGBUS from the MF handler indicating a
> disruption and loss of both data and capacity.
> No strong opinion here, just wondering if there is any merit in limiting
> the scope to the MF handler only.
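
In case it helps the discussion: one way to read "limit the scope to the MF
handler only" would be to drop the hugetlb_no_page() hunk above entirely,
i.e. keep the existing check so that a later fault on a known-poisoned
folio still fails (quoting the current code unchanged, just to make the
alternative concrete):

	if (unlikely(folio_test_hwpoison(folio))) {
		ret = VM_FAULT_HWPOISON_LARGE |
		      VM_FAULT_SET_HINDEX(hstate_index(h));
		goto backout_unlocked;
	}

while the hwpoison_user_mappings() change below would still skip the unmap
for MFD_MF_KEEP_UE_MAPPED mappings.
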
>
> > /* Check for page in userfault range. */
> > diff --git a/mm/memfd.c b/mm/memfd.c
> > index 1d109c1acf211..bfdde4cf90500 100644
> > --- a/mm/memfd.c
> > +++ b/mm/memfd.c
> > @@ -313,7 +313,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
> > #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
> > #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
> >
> > -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
> > +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \
> > + MFD_NOEXEC_SEAL | MFD_EXEC | MFD_MF_KEEP_UE_MAPPED)
> >
> > static int check_sysctl_memfd_noexec(unsigned int *flags)
> > {
> > @@ -387,6 +388,8 @@ static int sanitize_flags(unsigned int *flags_ptr)
> > if (!(flags & MFD_HUGETLB)) {
> > if (flags & ~MFD_ALL_FLAGS)
> > return -EINVAL;
> > + if (flags & MFD_MF_KEEP_UE_MAPPED)
> > + return -EINVAL;
> > } else {
> > /* Allow huge page size encoding in flags. */
> > if (flags & ~(MFD_ALL_FLAGS |
> > @@ -447,6 +450,16 @@ static struct file *alloc_file(const char *name, unsigned int flags)
> > file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
> > file->f_flags |= O_LARGEFILE;
> >
> > + /*
> > + * MFD_MF_KEEP_UE_MAPPED can only be specified in memfd_create; no API
> > + * to update it once memfd is created. MFD_MF_KEEP_UE_MAPPED is not
> > + * seal-able.
> > + *
> > + * For now MFD_MF_KEEP_UE_MAPPED is only supported by HugeTLBFS.
> > + */
> > + if (flags & (MFD_HUGETLB | MFD_MF_KEEP_UE_MAPPED))
> > + mapping_set_mf_keep_ue_mapped(file->f_mapping);
> > +
> > if (flags & MFD_NOEXEC_SEAL) {
> > struct inode *inode = file_inode(file);
> >
>
> Okay.
>
> > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > index 3edebb0cda30b..c5e3e28872797 100644
> > --- a/mm/memory-failure.c
> > +++ b/mm/memory-failure.c
> > @@ -373,11 +373,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
> > * Schedule a process for later kill.
> > * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
> > */
> > -static void __add_to_kill(struct task_struct *tsk, const struct page *p,
> > +static void __add_to_kill(struct task_struct *tsk, struct page *p,
> > struct vm_area_struct *vma, struct list_head *to_kill,
> > unsigned long addr)
> > {
> > struct to_kill *tk;
> > + struct folio *folio;
> > + struct address_space *mapping;
> >
> > tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
> > if (!tk) {
> > @@ -388,8 +390,19 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
> > tk->addr = addr;
> > if (is_zone_device_page(p))
> > tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
> > - else
> > - tk->size_shift = folio_shift(page_folio(p));
> > + else {
> > + folio = page_folio(p);
> > + mapping = folio_mapping(folio);
> > + if (mapping && mapping_mf_keep_ue_mapped(mapping))
> > + /*
> > + * Let userspace know the radius of HWPoison is
> > + * the size of a raw page; accessing other pages
> > + * inside the folio is still ok.
> > + */
> > + tk->size_shift = PAGE_SHIFT;
> > + else
> > + tk->size_shift = folio_shift(folio);
> > + }
> >
> > /*
> > * Send SIGKILL if "tk->addr == -EFAULT". Also, as
> > @@ -414,7 +427,7 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
> > list_add_tail(&tk->nd, to_kill);
> > }
> >
> > -static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p,
> > +static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
> > struct vm_area_struct *vma, struct list_head *to_kill,
> > unsigned long addr)
> > {
> > @@ -535,7 +548,7 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
> > * Collect processes when the error hit an anonymous page.
> > */
> > static void collect_procs_anon(const struct folio *folio,
> > - const struct page *page, struct list_head *to_kill,
> > + struct page *page, struct list_head *to_kill,
> > int force_early)
> > {
> > struct task_struct *tsk;
> > @@ -573,7 +586,7 @@ static void collect_procs_anon(const struct folio *folio,
> > * Collect processes when the error hit a file mapped page.
> > */
> > static void collect_procs_file(const struct folio *folio,
> > - const struct page *page, struct list_head *to_kill,
> > + struct page *page, struct list_head *to_kill,
> > int force_early)
> > {
> > struct vm_area_struct *vma;
> > @@ -655,7 +668,7 @@ static void collect_procs_fsdax(const struct page *page,
> > /*
> > * Collect the processes who have the corrupted page mapped to kill.
> > */
> > -static void collect_procs(const struct folio *folio, const struct page *page,
> > +static void collect_procs(const struct folio *folio, struct page *page,
> > struct list_head *tokill, int force_early)
> > {
> > if (!folio->mapping)
> > @@ -1173,6 +1186,13 @@ static int me_huge_page(struct page_state *ps, struct page *p)
> > }
> > }
> >
> > + /*
> > + * MF still needs to hold a refcount for the deferred actions in
> > + * filemap_offline_hwpoison_folio.
> > + */
> > + if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
> > + return res;
> > +
>
> Okay.
>
> > if (has_extra_refcount(ps, p, extra_pins))
> > res = MF_FAILED;
> >
> > @@ -1569,6 +1589,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> > {
> > LIST_HEAD(tokill);
> > bool unmap_success;
> > + bool keep_mapped;
> > int forcekill;
> > bool mlocked = folio_test_mlocked(folio);
> >
> > @@ -1596,8 +1617,12 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> > */
> > collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
> >
> > - unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
> > - if (!unmap_success)
> > + keep_mapped = hugetlb_should_keep_hwpoison_mapped(folio, folio->mapping);
> > + if (!keep_mapped)
> > + unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
> > +
> > + unmap_success = !folio_mapped(folio);
> > + if (!keep_mapped && !unmap_success)
> > pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
> > pfn, folio_mapcount(folio));
> >
> > @@ -1622,7 +1647,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
> > !unmap_success;
> > kill_procs(&tokill, forcekill, pfn, flags);
> >
> > - return unmap_success;
> > + return unmap_success || keep_mapped;
> > }
>
> Okay.
>
> >
> > static int identify_page_state(unsigned long pfn, struct page *p,
> > @@ -1862,6 +1887,13 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
> > unsigned long count = 0;
> >
> > head = llist_del_all(raw_hwp_list_head(folio));
> > + /*
> > + * If filemap_offline_hwpoison_folio_hugetlb is handling this folio,
> > + * it has already taken the whole list off the llist head.
> > + */
> > + if (head == NULL)
> > + return 0;
> > +
> > llist_for_each_entry_safe(p, next, head, node) {
> > if (move_flag)
> > SetPageHWPoison(p->page);
> > @@ -1878,7 +1910,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> > struct llist_head *head;
> > struct raw_hwp_page *raw_hwp;
> > struct raw_hwp_page *p;
> > - int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
> > + struct address_space *mapping = folio->mapping;
> > + bool has_hwpoison = folio_test_set_hwpoison(folio);
> >
> > /*
> > * Once the hwpoison hugepage has lost reliable raw error info,
> > @@ -1897,8 +1930,15 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> > if (raw_hwp) {
> > raw_hwp->page = page;
> > llist_add(&raw_hwp->node, head);
> > + if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
> > + /*
> > + * A new raw HWPoison page. Don't return HWPOISON.
> > + * Error event will be counted in action_result().
> > + */
> > + return 0;
> > +
> > /* the first error event will be counted in action_result(). */
> > - if (ret)
> > + if (has_hwpoison)
> > num_poisoned_pages_inc(page_to_pfn(page));
> > } else {
> > /*
> > @@ -1913,7 +1953,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
> > */
> > __folio_free_raw_hwp(folio, false);
> > }
> > - return ret;
> > +
> > + return has_hwpoison ? -EHWPOISON : 0;
> > }
>
> Okay.
>
> >
> > static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
> > @@ -2002,6 +2043,63 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> > return ret;
> > }
> >
> > +static void filemap_offline_hwpoison_folio_hugetlb(struct folio *folio)
> > +{
> > + int ret;
> > + struct llist_node *head;
> > + struct raw_hwp_page *curr, *next;
> > + struct page *page;
> > + unsigned long pfn;
> > +
> > + /*
> > + * Since folio is still in the folio_batch, drop the refcount
> > + * elevated by filemap_get_folios.
> > + */
> > + folio_put_refs(folio, 1);
> > + head = llist_del_all(raw_hwp_list_head(folio));
> > +
> > + /*
> > + * Release refcounts held by try_memory_failure_hugetlb, one per
> > + * HWPoison-ed page in the raw hwp list.
> > + */
> > + llist_for_each_entry(curr, head, node) {
> > + SetPageHWPoison(curr->page);
> > + folio_put(folio);
> > + }
> > +
> > + /* Refcount should now be zero; the folio is ready to be dissolved. */
> > + ret = dissolve_free_hugetlb_folio(folio);
> > + if (ret) {
> > + pr_err("failed to dissolve hugetlb folio: %d\n", ret);
> > + return;
> > + }
> > +
> > + llist_for_each_entry_safe(curr, next, head, node) {
> > + page = curr->page;
> > + pfn = page_to_pfn(page);
> > + drain_all_pages(page_zone(page));
> > + if (!take_page_off_buddy(page))
> > + pr_err("%#lx: unable to take off buddy allocator\n", pfn);
> > +
> > + page_ref_inc(page);
> > + kfree(curr);
> > + pr_info("%#lx: pending hard offline completed\n", pfn);
> > + }
> > +}
> > +
> > +void filemap_offline_hwpoison_folio(struct address_space *mapping,
> > + struct folio *folio)
> > +{
> > + WARN_ON_ONCE(!mapping);
> > +
> > + if (!folio_test_hwpoison(folio))
> > + return;
> > +
> > + /* Pending MFR currently only exists for hugetlb. */
> > + if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
> > + filemap_offline_hwpoison_folio_hugetlb(folio);
> > +}
> > +
> > /*
> > * Taking refcount of hugetlb pages needs extra care about race conditions
> > * with basic operations like hugepage allocation/free/demotion.
>
>
> Looks good.
>
> thanks,
> -jane
^ permalink raw reply [flat|nested] 10+ messages in thread