linux-mm.kvack.org archive mirror
From: Ryan Roberts <ryan.roberts@arm.com>
To: Andrew Morton <akpm@linux-foundation.org>,
	"Matthew Wilcox (Oracle)" <willy@infradead.org>,
	Yu Zhao <yuzhao@google.com>,
	"Yin, Fengwei" <fengwei.yin@intel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>,
	linux-mm@kvack.org, linux-arm-kernel@lists.infradead.org
Subject: [RFC v2 PATCH 15/17] mm: Convert zero page to large folios on write
Date: Fri, 14 Apr 2023 14:03:01 +0100
Message-ID: <20230414130303.2345383-16-ryan.roberts@arm.com>
In-Reply-To: <20230414130303.2345383-1-ryan.roberts@arm.com>

A read fault causes the zero page to be mapped read-only. A subsequent
write fault causes the zero page to be replaced with a zero-filled
private anonymous page. Change the write fault behaviour to replace the
zero page with a large anonymous folio, allocated using the same policy
as if the write fault had happened without the previous read fault.

Experimentation shows that reading multiple contiguous pages is
extremely rare without interleaved writes, so we don't bother to map a
large zero page. We just use the small zero page as a marker and expand
the allocation at the write fault.
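
As a rough illustration (not part of the patch), the access pattern this
change targets looks like the userspace sketch below; the mapping size and
the values written are arbitrary:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL * 1024 * 1024;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Read fault: this page is mapped read-only to the shared zero page. */
	volatile char sink = buf[0];
	(void)sink;

	/*
	 * Write fault: previously this replaced the zero page with a single
	 * zero-filled anonymous page; with this patch the kernel may instead
	 * install a large anonymous folio, sized by the same policy as a
	 * write fault with no preceding read.
	 */
	buf[0] = 1;

	memset(buf, 2, len);
	munmap(buf, len);
	return 0;
}

Depending on the folio order granted, later writes that fall within the same
large folio then no longer need separate write faults.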

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 mm/memory.c | 115 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 80 insertions(+), 35 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 61cec97a57f3..fac686e9f895 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3110,6 +3110,23 @@ static inline int check_ptes_contig_ro(pte_t *pte, int nr, unsigned long pfn)
 	return nr;
 }

+/*
+ * Checks that all ptes are none except for the pte at offset, which should be
+ * entry. Returns index of first pte that does not meet expectations, or nr if
+ * all are correct.
+ */
+static inline int check_ptes_none_or_entry(pte_t *pte, int nr,
+					pte_t entry, unsigned long offset)
+{
+	int ret;
+
+	ret = check_ptes_none(pte, offset);
+	if (ret == offset && pte_same(pte[offset], entry))
+		ret += 1 + check_ptes_none(pte + offset + 1, nr - offset - 1);
+
+	return ret;
+}
+
 static int calc_anon_folio_order_alloc(struct vm_fault *vmf, int order)
 {
 	/*
@@ -3141,6 +3158,7 @@ static int calc_anon_folio_order_alloc(struct vm_fault *vmf, int order)
 	pte_t *pte;
 	pte_t *first_set = NULL;
 	int ret;
+	unsigned long offset;

 	if (has_transparent_hugepage()) {
 		order = min(order, PMD_SHIFT - PAGE_SHIFT);
@@ -3148,7 +3166,8 @@ static int calc_anon_folio_order_alloc(struct vm_fault *vmf, int order)
 		for (; order > 1; order--) {
 			nr = 1 << order;
 			addr = ALIGN_DOWN(vmf->address, nr << PAGE_SHIFT);
-			pte = vmf->pte - ((vmf->address - addr) >> PAGE_SHIFT);
+			offset = ((vmf->address - addr) >> PAGE_SHIFT);
+			pte = vmf->pte - offset;

 			/* Check vma bounds. */
 			if (addr < vma->vm_start ||
@@ -3163,8 +3182,9 @@ static int calc_anon_folio_order_alloc(struct vm_fault *vmf, int order)
 			if (pte <= first_set)
 				continue;

-			/* Need to check if all the ptes are none. */
-			ret = check_ptes_none(pte, nr);
+			/* Need to check if all the ptes are none or entry. */
+			ret = check_ptes_none_or_entry(pte, nr,
+							vmf->orig_pte, offset);
 			if (ret == nr)
 				break;

@@ -3479,13 +3499,15 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	struct mmu_notifier_range range;
 	int ret;
 	pte_t orig_pte;
-	unsigned long addr = vmf->address;
-	int order = 0;
-	int pgcount = BIT(order);
-	unsigned long offset = 0;
+	unsigned long addr;
+	int order;
+	int pgcount;
+	unsigned long offset;
 	unsigned long pfn;
 	struct page *page;
 	int i;
+	bool zero;
+	bool anon;

 	delayacct_wpcopy_start();

@@ -3494,36 +3516,54 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;

+	/*
+	 * Set the upper bound of the folio allocation order. If we hit a zero
+	 * page, we allocate a folio with the same policy as allocation upon
+	 * write fault. If we are copying an anon folio, then limit ourself to
+	 * write fault. If we are copying an anon folio, then limit ourselves to
+	 * other cases (e.g. file-mapped) CoW a single page.
+	 */
 	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
-		new_folio = vma_alloc_movable_folio(vma, vmf->address, 0, true);
-		if (!new_folio)
-			goto oom;
-	} else {
-		if (old_folio && folio_test_anon(old_folio)) {
-			order = min_t(int, folio_order(old_folio),
+		zero = true;
+		anon = false;
+		order = max_anon_folio_order(vma);
+	} else if (old_folio && folio_test_anon(old_folio)) {
+		zero = false;
+		anon = true;
+		order = min_t(int, folio_order(old_folio),
 						max_anon_folio_order(vma));
+	} else {
+		zero = false;
+		anon = false;
+		order = 0;
+	}
+
 retry:
-			/*
-			 * Estimate the folio order to allocate. We are not
-			 * under the ptl here so this estimate needs to be
-			 * re-checked later once we have the lock.
-			 */
-			vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
-			order = calc_anon_folio_order_copy(vmf, old_folio, order);
-			pte_unmap(vmf->pte);
-		}
+	/*
+	 * Estimate the folio order to allocate. We are not under the ptl here
+	 * so this estimate needs to be re-checked later once we have the lock.
+	 */
+	if (zero || anon) {
+		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+		order = zero ? calc_anon_folio_order_alloc(vmf, order) :
+			calc_anon_folio_order_copy(vmf, old_folio, order);
+		pte_unmap(vmf->pte);
+	}

-		new_folio = try_vma_alloc_movable_folio(vma, vmf->address,
-							order, false);
-		if (!new_folio)
-			goto oom;
+	/* Allocate the new folio. */
+	new_folio = try_vma_alloc_movable_folio(vma, vmf->address, order, zero);
+	if (!new_folio)
+		goto oom;

-		/* We may have been granted less than we asked for. */
-		order = folio_order(new_folio);
-		pgcount = BIT(order);
-		addr = ALIGN_DOWN(vmf->address, pgcount << PAGE_SHIFT);
-		offset = ((vmf->address - addr) >> PAGE_SHIFT);
+	/* We may have been granted less than we asked for. */
+	order = folio_order(new_folio);
+	pgcount = BIT(order);
+	addr = ALIGN_DOWN(vmf->address, pgcount << PAGE_SHIFT);
+	offset = ((vmf->address - addr) >> PAGE_SHIFT);
+	pfn = pte_pfn(vmf->orig_pte) - offset;

+	/* Copy contents. */
+	if (!zero) {
 		if (likely(old_folio))
 			ret = __wp_page_copy_user_range(&new_folio->page,
 							vmf->page - offset,
@@ -3561,8 +3601,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	 * Re-check the pte(s) - we dropped the lock
 	 */
 	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
-	pfn = pte_pfn(vmf->orig_pte) - offset;
-	if (likely(check_ptes_contig_ro(vmf->pte, pgcount, pfn) == pgcount)) {
+
+	if (zero)
+		ret = check_ptes_none_or_entry(vmf->pte, pgcount,
+						vmf->orig_pte, offset);
+	else
+		ret = check_ptes_contig_ro(vmf->pte, pgcount, pfn);
+
+	if (likely(ret == pgcount)) {
 		if (old_folio) {
 			if (!folio_test_anon(old_folio)) {
 				VM_BUG_ON(order != 0);
@@ -3570,8 +3616,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 				inc_mm_counter(mm, MM_ANONPAGES);
 			}
 		} else {
-			VM_BUG_ON(order != 0);
-			inc_mm_counter(mm, MM_ANONPAGES);
+			add_mm_counter(mm, MM_ANONPAGES, pgcount);
 		}
 		flush_cache_range(vma, addr, addr + (pgcount << PAGE_SHIFT));

--
2.25.1



Thread overview: 44+ messages
2023-04-14 13:02 [RFC v2 PATCH 00/17] variable-order, large folios for anonymous memory Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 01/17] mm: Expose clear_huge_page() unconditionally Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 02/17] mm: pass gfp flags and order to vma_alloc_zeroed_movable_folio() Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 03/17] mm: Introduce try_vma_alloc_movable_folio() Ryan Roberts
2023-04-17  8:49   ` Yin, Fengwei
2023-04-17 10:11     ` Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 04/17] mm: Implement folio_add_new_anon_rmap_range() Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 05/17] mm: Routines to determine max anon folio allocation order Ryan Roberts
2023-04-14 14:09   ` Kirill A. Shutemov
2023-04-14 14:38     ` Ryan Roberts
2023-04-14 15:37       ` Kirill A. Shutemov
2023-04-14 16:06         ` Ryan Roberts
2023-04-14 16:18           ` Matthew Wilcox
2023-04-14 16:31             ` Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 06/17] mm: Allocate large folios for anonymous memory Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 07/17] mm: Allow deferred splitting of arbitrary large anon folios Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 08/17] mm: Implement folio_move_anon_rmap_range() Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 09/17] mm: Update wp_page_reuse() to operate on range of pages Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 10/17] mm: Reuse large folios for anonymous memory Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 11/17] mm: Split __wp_page_copy_user() into 2 variants Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 12/17] mm: ptep_clear_flush_range_notify() macro for batch operation Ryan Roberts
2023-04-14 13:02 ` [RFC v2 PATCH 13/17] mm: Implement folio_remove_rmap_range() Ryan Roberts
2023-04-14 13:03 ` [RFC v2 PATCH 14/17] mm: Copy large folios for anonymous memory Ryan Roberts
2023-04-14 13:03 ` Ryan Roberts [this message]
2023-04-14 13:03 ` [RFC v2 PATCH 16/17] mm: mmap: Align unhinted maps to highest anon folio order Ryan Roberts
2023-04-17  8:25   ` Yin, Fengwei
2023-04-17 10:13     ` Ryan Roberts
2023-04-14 13:03 ` [RFC v2 PATCH 17/17] mm: Batch-zap large anonymous folio PTE mappings Ryan Roberts
2023-04-17  8:04 ` [RFC v2 PATCH 00/17] variable-order, large folios for anonymous memory Yin, Fengwei
2023-04-17 10:19   ` Ryan Roberts
2023-04-17  8:19 ` Yin, Fengwei
2023-04-17 10:28   ` Ryan Roberts
2023-04-17 10:54 ` David Hildenbrand
2023-04-17 11:43   ` Ryan Roberts
2023-04-17 14:05     ` David Hildenbrand
2023-04-17 15:38       ` Ryan Roberts
2023-04-17 15:44         ` David Hildenbrand
2023-04-17 16:15           ` Ryan Roberts
2023-04-26 10:41           ` Ryan Roberts
2023-05-17 13:58             ` David Hildenbrand
2023-05-18 11:23               ` Ryan Roberts
2023-04-19 10:12       ` Ryan Roberts
2023-04-19 10:51         ` David Hildenbrand
2023-04-19 11:13           ` Ryan Roberts
