linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Jiaqi Yan <jiaqiyan@google.com>
To: mike.kravetz@oracle.com, peterx@redhat.com, naoya.horiguchi@nec.com
Cc: songmuchun@bytedance.com, duenwen@google.com,
	axelrasmussen@google.com,  jthoughton@google.com,
	rientjes@google.com, linmiaohe@huawei.com,  shy828301@gmail.com,
	baolin.wang@linux.alibaba.com,  wangkefeng.wang@huawei.com,
	akpm@linux-foundation.org, linux-mm@kvack.org,
	 linux-kernel@vger.kernel.org, Jiaqi Yan <jiaqiyan@google.com>
Subject: [RFC PATCH v1 5/7] hugetlb: only VM_FAULT_HWPOISON_LARGE raw page
Date: Fri, 28 Apr 2023 00:41:37 +0000	[thread overview]
Message-ID: <20230428004139.2899856-6-jiaqiyan@google.com> (raw)
In-Reply-To: <20230428004139.2899856-1-jiaqiyan@google.com>

Raw pages can become HWPOISON between the time userspace maps a
hugepage and the time userspace faults in that hugepage.

Today when hugetlb faults somewhere in a hugepage containing
HWPOISON raw pages, the result is a VM_FAULT_HWPOISON_LARGE.

This commit teaches the hugetlb page fault handler to return
VM_FAULT_HWPOISON_LARGE only if the faulting address is within a
HWPOISON raw page; otherwise, the fault handler can continue to fault
in healthy raw pages.

Signed-off-by: Jiaqi Yan <jiaqiyan@google.com>
---
 include/linux/mm.h  |   2 +
 mm/hugetlb.c        | 129 ++++++++++++++++++++++++++++++++++++++++++--
 mm/memory-failure.c |   1 +
 3 files changed, 127 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc192f98cb1d..7caa4530953f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3531,6 +3531,7 @@ extern const struct attribute_group memory_failure_attr_group;
  * @nr_expected_unmaps: if a VMA that maps @page when detected is eligible
  *   for high granularity mapping, @page is expected to be unmapped.
  * @nr_actual_unmaps: how many times the raw page is actually unmapped.
+ * @index: index of the poisoned subpage in the folio.
  */
 struct raw_hwp_page {
 	struct llist_node node;
@@ -3538,6 +3539,7 @@ struct raw_hwp_page {
 	int nr_vmas_mapped;
 	int nr_expected_unmaps;
 	int nr_actual_unmaps;
+	unsigned long index;
 };
 
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1419176b7e51..f8ddf04ae0c4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6158,6 +6158,30 @@ static struct folio *hugetlb_try_find_lock_folio(struct address_space *mapping,
 	return folio;
 }
 
+static vm_fault_t hugetlb_no_page_hwpoison(struct mm_struct *mm,
+					   struct vm_area_struct *vma,
+					   struct folio *folio,
+					   unsigned long address,
+					   struct hugetlb_pte *hpte,
+					   unsigned int flags);
+
+#ifndef CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING /* no HGM: whole-folio check */
+static vm_fault_t hugetlb_no_page_hwpoison(struct mm_struct *mm,
+					   struct vm_area_struct *vma,
+					   struct folio *folio,
+					   unsigned long address,
+					   struct hugetlb_pte *hpte,
+					   unsigned int flags)
+{
+	if (unlikely(folio_test_hwpoison(folio))) {
+		return VM_FAULT_HWPOISON_LARGE |
+		       VM_FAULT_SET_HINDEX(hstate_index(hstate_vma(vma)));
+	}
+
+	return 0; /* folio is not flagged hwpoison; only @folio and @vma are used */
+}
+#endif /* !CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING */
+
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			struct vm_area_struct *vma,
 			struct address_space *mapping, pgoff_t idx,
@@ -6287,13 +6311,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 		/*
 		 * If memory error occurs between mmap() and fault, some process
 		 * don't have hwpoisoned swap entry for errored virtual address.
-		 * So we need to block hugepage fault by PG_hwpoison bit check.
+		 * So we need to block hugepage fault by hwpoison check:
+		 * - without HGM, the check is based on PG_hwpoison
+		 * - with HGM, check if the raw page for address is poisoned
 		 */
-		if (unlikely(folio_test_hwpoison(folio))) {
-			ret = VM_FAULT_HWPOISON_LARGE |
-				VM_FAULT_SET_HINDEX(hstate_index(h));
+		ret = hugetlb_no_page_hwpoison(mm, vma, folio, address, hpte, flags);
+		if (unlikely(ret))
 			goto backout_unlocked;
-		}
 
 		/* Check for page in userfault range. */
 		if (userfaultfd_minor(vma)) {
@@ -8426,6 +8450,11 @@ int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * the allocated PTEs created before splitting fails.
 	 */
 
+	/*
+	 * For none and UFFD_WP marker PTEs, given try_to_unmap_one doesn't
+	 * unmap them, delay the splitting until page fault happens. See the
+	 * hugetlb_no_page_hwpoison check in hugetlb_no_page.
+	 */
 	if (unlikely(huge_pte_none_mostly(old_entry))) {
 		ret = -EAGAIN;
 		goto skip;
@@ -8479,6 +8508,96 @@ int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 }
 
+/*
+ * Given a hugetlb PTE, return the size to use for an allocating HGM walk that
+ * splits it one level down: the first for_each_hgm_shift() size below its own.
+ * Returns -EINVAL if none is smaller, e.g. the PTE is already PAGE_SIZE.
+ */
+static int hgm_next_size(struct vm_area_struct *vma, struct hugetlb_pte *hpte)
+{
+	struct hstate *h = hstate_vma(vma), *tmp_h;
+	unsigned int shift;
+	unsigned long curr_size = hugetlb_pte_size(hpte);
+	unsigned long next_size;
+
+	for_each_hgm_shift(h, tmp_h, shift) { /* presumably largest first; TODO confirm */
+		next_size = 1UL << shift;
+		if (next_size < curr_size)
+			return next_size; /* NOTE(review): unsigned long narrowed to int */
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Check if address is in the range of a HWPOISON raw page of the folio.
+ * During checking, the hugetlb PTE may be split into smaller hugetlb PTEs.
+ */
+static vm_fault_t hugetlb_no_page_hwpoison(struct mm_struct *mm,
+					   struct vm_area_struct *vma,
+					   struct folio *folio,
+					   unsigned long address,
+					   struct hugetlb_pte *hpte,
+					   unsigned int flags)
+{
+	unsigned long range_start, range_end;
+	unsigned long start_index, end_index;
+	unsigned long folio_start = vma_address(folio_page(folio, 0), vma);
+	struct llist_node *t, *tnode;
+	struct llist_head *raw_hwp_head = raw_hwp_list_head(folio);
+	struct raw_hwp_page *p = NULL;
+	bool contain_hwpoison = false;
+	int hgm_size;
+	int hgm_ret = 0;
+
+	if (likely(!folio_test_hwpoison(folio)))
+		return 0;
+
+	if (hugetlb_enable_hgm_vma(vma)) /* presumably non-zero means no HGM; TODO confirm */
+		return VM_FAULT_HWPOISON_LARGE |
+		       VM_FAULT_SET_HINDEX(hstate_index(hstate_vma(vma)));
+
+recheck: /* re-entered after each successful hugetlb_full_walk_alloc() */
+	range_start = address & hugetlb_pte_mask(hpte);
+	range_end = range_start + hugetlb_pte_size(hpte); /* NOTE(review): never read */
+	start_index = (range_start - folio_start) / PAGE_SIZE;
+	end_index = start_index + hugetlb_pte_size(hpte) / PAGE_SIZE;
+
+	contain_hwpoison = false;
+	llist_for_each_safe(tnode, t, raw_hwp_head->first) {
+		p = container_of(tnode, struct raw_hwp_page, node);
+		if (start_index <= p->index && p->index < end_index) {
+			contain_hwpoison = true;
+			break;
+		}
+	}
+
+	if (!contain_hwpoison)
+		return 0;
+
+	if (hugetlb_pte_size(hpte) == PAGE_SIZE)
+		return VM_FAULT_HWPOISON;
+
+	/*
+	 * hugetlb_fault already ensured hugetlb_vma_lock_read.
+	 * We also checked hugetlb_pte_size(hpte) != PAGE_SIZE,
+	 * so hgm_size must be something meaningful to HGM.
+	 */
+	hgm_size = hgm_next_size(vma, hpte);
+	VM_BUG_ON(hgm_size == -EINVAL);
+	hgm_ret = hugetlb_full_walk_alloc(hpte, vma, address, hgm_size);
+	if (hgm_ret) {
+		WARN_ON_ONCE(hgm_ret); /* always fires: hgm_ret is non-zero here */
+		/*
+		 * When splitting using HGM fails, fail the fault as if
+		 * HGM were not eligible or enabled.
+		 */
+		return VM_FAULT_HWPOISON_LARGE |
+		       VM_FAULT_SET_HINDEX(hstate_index(hstate_vma(vma)));
+	}
+	goto recheck;
+}
+
 #endif /* CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING */
 
 /*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 47b935918ceb..9093ba53feed 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1957,6 +1957,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
 		raw_hwp->nr_vmas_mapped = 0;
 		raw_hwp->nr_expected_unmaps = 0;
 		raw_hwp->nr_actual_unmaps = 0;
+		raw_hwp->index = folio_page_idx(folio, page);
 		llist_add(&raw_hwp->node, head);
 		if (hgm_enabled)
 			/*
-- 
2.40.1.495.gc816e09b53d-goog



  parent reply	other threads:[~2023-04-28  0:41 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-04-28  0:41 [RFC PATCH v1 0/7] PAGE_SIZE Unmapping in Memory Failure Recovery for HugeTLB Pages Jiaqi Yan
2023-04-28  0:41 ` [RFC PATCH v1 1/7] hugetlb: add HugeTLB splitting functionality Jiaqi Yan
2023-04-28  0:41 ` [RFC PATCH v1 2/7] hugetlb: create PTE level mapping when possible Jiaqi Yan
2023-04-28  0:41 ` [RFC PATCH v1 3/7] mm: publish raw_hwp_page in mm.h Jiaqi Yan
2023-04-28  0:41 ` [RFC PATCH v1 4/7] mm/memory_failure: unmap raw HWPoison PTEs when possible Jiaqi Yan
     [not found]   ` <20230530022456.GA1434147@hori.linux.bs1.fc.nec.co.jp>
2023-05-30 21:31     ` Jiaqi Yan
2023-05-30 22:11       ` Jiaqi Yan
2023-04-28  0:41 ` Jiaqi Yan [this message]
2023-04-28  0:41 ` [RFC PATCH v1 6/7] selftest/mm: test PAGESIZE unmapping HWPOISON pages Jiaqi Yan
2023-04-28  0:41 ` [RFC PATCH v1 7/7] selftest/mm: test PAGESIZE unmapping UFFD WP marker " Jiaqi Yan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230428004139.2899856-6-jiaqiyan@google.com \
    --to=jiaqiyan@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=axelrasmussen@google.com \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=duenwen@google.com \
    --cc=jthoughton@google.com \
    --cc=linmiaohe@huawei.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mike.kravetz@oracle.com \
    --cc=naoya.horiguchi@nec.com \
    --cc=peterx@redhat.com \
    --cc=rientjes@google.com \
    --cc=shy828301@gmail.com \
    --cc=songmuchun@bytedance.com \
    --cc=wangkefeng.wang@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox