Hello Jiaqi,

Here is a summary of a few nits in this code:

 - Some function declarations look problematic to me
 - The flag test used to activate the feature looks incorrect
 - The function signature change is probably not necessary
 - Maybe we should wait for an agreement on your other proposal:
[PATCH v1 0/2] Only free healthy pages in high-order HWPoison folio

The last item is not a nit: since your proposal above may require keeping all the data of a
hugetlb folio in order to recycle it correctly (especially the list of poisoned sub-pages),
and avoiding the race condition of returning poisoned pages to the freelist right before
removing them, you may need to change some aspects of the current code.


On 11/16/25 02:32, Jiaqi Yan wrote:

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8e63e46b8e1f0..b7733ef5ee917 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -871,10 +871,17 @@ int dissolve_free_hugetlb_folios(unsigned long start_pfn,
 
 #ifdef CONFIG_MEMORY_FAILURE
 extern void folio_clear_hugetlb_hwpoison(struct folio *folio);
+extern bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
+						struct address_space *mapping);
 #else
 static inline void folio_clear_hugetlb_hwpoison(struct folio *folio)
 {
 }
+static inline bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
+						       struct address_space *mapping)
+{
+	return false;
+}
 #endif
 
You are conditionally declaring this hugetlb_should_keep_hwpoison_mapped() function and implementing it in mm/hugetlb.c, but that file is compiled in both cases (CONFIG_MEMORY_FAILURE enabled or not). So you either need a single declaration consistent with the implementation, guarding the body instead, like this:

 bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
					  struct address_space *mapping)
 {
+#ifdef CONFIG_MEMORY_FAILURE
 	if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
 		return false;
@@ -6087,6 +6088,9 @@ bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
 		return false;
 
 	return mapping_mf_keep_ue_mapped(mapping);
+#else
+	return false;
+#endif
 }

Or keep your double declaration and compile the implementation only when CONFIG_MEMORY_FAILURE is enabled:

+#ifdef CONFIG_MEMORY_FAILURE
 bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
					  struct address_space *mapping)
 {
 	if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
 		return false;
@@ -6087,6 +6088,9 @@ bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
 		return false;
 
 	return mapping_mf_keep_ue_mapped(mapping);
 }
+#endif

 #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 09b581c1d878d..9ad511aacde7c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -213,6 +213,8 @@ enum mapping_flags {
 	AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9,
 	AS_KERNEL_FILE = 10,	/* mapping for a fake kernel file that shouldn't
 				   account usage to user cgroups */
+	/* For MFD_MF_KEEP_UE_MAPPED. */
+	AS_MF_KEEP_UE_MAPPED = 11,
 	/* Bits 16-25 are used for FOLIO_ORDER */
 	AS_FOLIO_ORDER_BITS = 5,
 	AS_FOLIO_ORDER_MIN = 16,
@@ -348,6 +350,16 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres
 	return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags);
 }
 
+static inline bool mapping_mf_keep_ue_mapped(const struct address_space *mapping)
+{
+	return test_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
+}
+
+static inline void mapping_set_mf_keep_ue_mapped(struct address_space *mapping)
+{
+	set_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
+}
+
 static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
 {
 	return mapping->gfp_mask;
@@ -1274,6 +1286,18 @@ void replace_page_cache_folio(struct folio *old, struct folio *new);
 void delete_from_page_cache_batch(struct address_space *mapping,
 				  struct folio_batch *fbatch);
 bool filemap_release_folio(struct folio *folio, gfp_t gfp);
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Provided by memory failure to offline HWPoison-ed folio managed by memfd.
+ */
+void filemap_offline_hwpoison_folio(struct address_space *mapping,
+				    struct folio *folio);
+#else
+void filemap_offline_hwpoison_folio(struct address_space *mapping,
+				    struct folio *folio)
+{
+}
+#endif
 loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
 		int whence);

This filemap_offline_hwpoison_folio() declaration is also problematic in the !CONFIG_MEMORY_FAILURE case: the #else branch defines a public (non-inline) filemap_offline_hwpoison_folio() in every file that includes this "pagemap.h" header, which will lead to multiple-definition link errors.

This could be solved by using "static inline" for the stub in this second case.
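For example (same declarations as in your hunk, only the !CONFIG_MEMORY_FAILURE stub is made static inline):

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Provided by memory failure to offline HWPoison-ed folio managed by memfd.
 */
void filemap_offline_hwpoison_folio(struct address_space *mapping,
				    struct folio *folio);
#else
static inline void filemap_offline_hwpoison_folio(struct address_space *mapping,
						  struct folio *folio)
{
}
#endif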

diff --git a/mm/memfd.c b/mm/memfd.c
index 1d109c1acf211..bfdde4cf90500 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -313,7 +313,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
 #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
 #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
 
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \
+		       MFD_NOEXEC_SEAL | MFD_EXEC | MFD_MF_KEEP_UE_MAPPED)
 
 static int check_sysctl_memfd_noexec(unsigned int *flags)
 {
@@ -387,6 +388,8 @@ static int sanitize_flags(unsigned int *flags_ptr)
 	if (!(flags & MFD_HUGETLB)) {
 		if (flags & ~MFD_ALL_FLAGS)
 			return -EINVAL;
+		if (flags & MFD_MF_KEEP_UE_MAPPED)
+			return -EINVAL;
 	} else {
 		/* Allow huge page size encoding in flags. */
 		if (flags & ~(MFD_ALL_FLAGS |
@@ -447,6 +450,16 @@ static struct file *alloc_file(const char *name, unsigned int flags)
 	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
 	file->f_flags |= O_LARGEFILE;
 
+	/*
+	 * MFD_MF_KEEP_UE_MAPPED can only be specified in memfd_create; no API
+	 * to update it once memfd is created. MFD_MF_KEEP_UE_MAPPED is not
+	 * seal-able.
+	 *
+	 * For now MFD_MF_KEEP_UE_MAPPED is only supported by HugeTLBFS.
+	 */
+	if (flags & (MFD_HUGETLB | MFD_MF_KEEP_UE_MAPPED))
+		mapping_set_mf_keep_ue_mapped(file->f_mapping);

The flag test here looks wrong: "flags & (MFD_HUGETLB | MFD_MF_KEEP_UE_MAPPED)" is true whenever either bit is set, so a plain MFD_HUGETLB memfd would also get the "keep" flag on its address space. The test should be on MFD_MF_KEEP_UE_MAPPED alone, as we already verified that this flag is only given combined with MFD_HUGETLB. This is a nit identified by Harry Yoo during our internal conversations. Thanks Harry!
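Something like this should do (untested sketch, relying on the check already done in sanitize_flags()):

	/*
	 * sanitize_flags() has already rejected MFD_MF_KEEP_UE_MAPPED
	 * without MFD_HUGETLB, so testing this single bit is enough.
	 */
	if (flags & MFD_MF_KEEP_UE_MAPPED)
		mapping_set_mf_keep_ue_mapped(file->f_mapping);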

+
 	if (flags & MFD_NOEXEC_SEAL) {
 		struct inode *inode = file_inode(file);
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3edebb0cda30b..c5e3e28872797 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -373,11 +373,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
  * Schedule a process for later kill.
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
  */
-static void __add_to_kill(struct task_struct *tsk, const struct page *p,
+static void __add_to_kill(struct task_struct *tsk, struct page *p,
 			  struct vm_area_struct *vma, struct list_head *to_kill,
 			  unsigned long addr)

Is there any reason to remove the "const" on the page structure in the signature? It looks like you only did that for the new call to page_folio(p), but page_folio() preserves constness and we never modify the page here.

 {
 	struct to_kill *tk;
+	struct folio *folio;
You could use a "const struct folio *folio" here too.
+	struct address_space *mapping;
 
 	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
 	if (!tk) {
@@ -388,8 +390,19 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
 	tk->addr = addr;
 	if (is_zone_device_page(p))
 		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
-	else
-		tk->size_shift = folio_shift(page_folio(p));
+	else {
+		folio = page_folio(p);

Now, with both folio and p being "const", the code should still work (see the sketch after the hunk below).


+		mapping = folio_mapping(folio);
+		if (mapping && mapping_mf_keep_ue_mapped(mapping))
+			/*
+			 * Let userspace know the radius of HWPoison is
+			 * the size of raw page; accessing other pages
+			 * inside the folio is still ok.
+			 */
+			tk->size_shift = PAGE_SHIFT;
+		else
+			tk->size_shift = folio_shift(folio);
+	}
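To make that concrete, here is an (untested) sketch of the same else branch with everything kept const; it assumes folio_mapping() accepts a const folio pointer (if it does not in your tree, constifying it would be a small preliminary change):

	else {
		const struct folio *folio = page_folio(p);
		const struct address_space *mapping = folio_mapping(folio);

		if (mapping && mapping_mf_keep_ue_mapped(mapping))
			/* The HWPoison radius is a single raw page. */
			tk->size_shift = PAGE_SHIFT;
		else
			tk->size_shift = folio_shift(folio);
	}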
 
 	/*
 	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -414,7 +427,7 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
 	list_add_tail(&tk->nd, to_kill);
 }
 
-static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p,
+static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
No need to change the signature here either (otherwise you would also have had to change both add_to_kill_fsdax() and add_to_kill_ksm(), which this patch misses).

 		struct vm_area_struct *vma, struct list_head *to_kill,
 		unsigned long addr)
 {
@@ -535,7 +548,7 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
  * Collect processes when the error hit an anonymous page.
  */
 static void collect_procs_anon(const struct folio *folio,
-		const struct page *page, struct list_head *to_kill,
+		struct page *page, struct list_head *to_kill,

No need to change


 		int force_early)
 {
 	struct task_struct *tsk;
@@ -573,7 +586,7 @@ static void collect_procs_anon(const struct folio *folio,
  * Collect processes when the error hit a file mapped page.
  */
 static void collect_procs_file(const struct folio *folio,
-		const struct page *page, struct list_head *to_kill,
+		struct page *page, struct list_head *to_kill,
 		int force_early)
No need to change

 {
 	struct vm_area_struct *vma;
@@ -655,7 +668,7 @@ static void collect_procs_fsdax(const struct page *page,
 /*
  * Collect the processes who have the corrupted page mapped to kill.
  */
-static void collect_procs(const struct folio *folio, const struct page *page,
+static void collect_procs(const struct folio *folio, struct page *page,

No need to change

 		struct list_head *tokill, int force_early)
 {
 	if (!folio->mapping)
@@ -1173,6 +1186,13 @@ static int me_huge_page(struct page_state *ps, struct page *p)
 		}
 	}
 
+	/*
+	 * MF still needs to hold a refcount for the deferred actions in
+	 * filemap_offline_hwpoison_folio.
+	 */
+	if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
+		return res;
+
 	if (has_extra_refcount(ps, p, extra_pins))
 		res = MF_FAILED;
 
@@ -1569,6 +1589,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
 {
 	LIST_HEAD(tokill);
 	bool unmap_success;
+	bool keep_mapped;
 	int forcekill;
 	bool mlocked = folio_test_mlocked(folio);
 
@@ -1596,8 +1617,12 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
 	 */
 	collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
 
-	unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
-	if (!unmap_success)
+	keep_mapped = hugetlb_should_keep_hwpoison_mapped(folio, folio->mapping);
+	if (!keep_mapped)
+		unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
+
+	unmap_success = !folio_mapped(folio);
+	if (!keep_mapped && !unmap_success)
 		pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
 		       pfn, folio_mapcount(folio));
 
@@ -1622,7 +1647,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
 		    !unmap_success;
 	kill_procs(&tokill, forcekill, pfn, flags);
 
-	return unmap_success;
+	return unmap_success || keep_mapped;
 }
 
 static int identify_page_state(unsigned long pfn, struct page *p,
@@ -1862,6 +1887,13 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
 	unsigned long count = 0;
 
 	head = llist_del_all(raw_hwp_list_head(folio));
+	/*
+	 * If filemap_offline_hwpoison_folio_hugetlb is handling this folio,
+	 * it has already taken off the head of the llist.
+	 */
+	if (head == NULL)
+		return 0;
+

This may not be necessary depending on how we recycle hugetlb pages -- see below too.


 	llist_for_each_entry_safe(p, next, head, node) {
 		if (move_flag)
 			SetPageHWPoison(p->page);
@@ -1878,7 +1910,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
 	struct llist_head *head;
 	struct raw_hwp_page *raw_hwp;
 	struct raw_hwp_page *p;
-	int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
+	struct address_space *mapping = folio->mapping;
+	bool has_hwpoison = folio_test_set_hwpoison(folio);
 
 	/*
 	 * Once the hwpoison hugepage has lost reliable raw error info,
@@ -1897,8 +1930,15 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
 	if (raw_hwp) {
 		raw_hwp->page = page;
 		llist_add(&raw_hwp->node, head);
+		if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
+			/*
+			 * A new raw HWPoison page. Don't return HWPOISON.
+			 * Error event will be counted in action_result().
+			 */
+			return 0;
+
 		/* the first error event will be counted in action_result(). */
-		if (ret)
+		if (has_hwpoison)
 			num_poisoned_pages_inc(page_to_pfn(page));
 	} else {
 		/*
@@ -1913,7 +1953,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
 		 */
 		__folio_free_raw_hwp(folio, false);
 	}
-	return ret;
+
+	return has_hwpoison ? -EHWPOISON : 0;
 }
 
 static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
@@ -2002,6 +2043,63 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 	return ret;
 }
 
+static void filemap_offline_hwpoison_folio_hugetlb(struct folio *folio)
+{
+	int ret;
+	struct llist_node *head;
+	struct raw_hwp_page *curr, *next;
+	struct page *page;
+	unsigned long pfn;
+
+	/*
+	 * Since folio is still in the folio_batch, drop the refcount
+	 * elevated by filemap_get_folios.
+	 */
+	folio_put_refs(folio, 1);
+	head = llist_del_all(raw_hwp_list_head(folio));
In my opinion, we should wait until your other patch set is approved before deciding whether the folio's raw_hwp_list
has to be removed from the folio here, or whether it should be left in place so that the recycling of this huge page
works correctly...


+
+	/*
+	 * Release refcounts held by try_memory_failure_hugetlb, one per
+	 * HWPoison-ed page in the raw hwp list.
+	 */
+	llist_for_each_entry(curr, head, node) {
+		SetPageHWPoison(curr->page);
+		folio_put(folio);
+	}
+
+	/* Refcount now should be zero and ready to dissolve folio. */
+	ret = dissolve_free_hugetlb_folio(folio);
+	if (ret) {
+		pr_err("failed to dissolve hugetlb folio: %d\n", ret);
+		return;
+	}
+
+	llist_for_each_entry_safe(curr, next, head, node) {
+		page = curr->page;
+		pfn = page_to_pfn(page);
+		drain_all_pages(page_zone(page));
+		if (!take_page_off_buddy(page))
+			pr_err("%#lx: unable to take off buddy allocator\n", pfn);
+
+		page_ref_inc(page);
+		kfree(curr);
+		pr_info("%#lx: pending hard offline completed\n", pfn);
+	}
+}

Let's revisit the function above once an agreement is reached on the hugetlb page recycling proposal.


+
+void filemap_offline_hwpoison_folio(struct address_space *mapping,
+				    struct folio *folio)
+{
+	WARN_ON_ONCE(!mapping);
+
+	if (!folio_test_hwpoison(folio))
+		return;
+
+	/* Pending MFR currently only exist for hugetlb. */
+	if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
+		filemap_offline_hwpoison_folio_hugetlb(folio);
+}
+
 /*
  * Taking refcount of hugetlb pages needs extra care about race conditions
  * with basic operations like hugepage allocation/free/demotion.


HTH

Best regards,
William.