From: Yin Fengwei <fengwei.yin@intel.com>
To: willy@infradead.org, linux-mm@kvack.org
Cc: dave.hansen@intel.com, tim.c.chen@intel.com,
ying.huang@intel.com, fengwei.yin@intel.com
Subject: [RFC PATCH] mm: populate multiple PTEs if file page is large folio
Date: Sat, 14 Jan 2023 00:35:38 +0800
Message-ID: <20230113163538.23412-1-fengwei.yin@intel.com>
The number of page faults can be reduced by populating PTEs in batches.
The batch of PTEs being populated is not allowed to cross:
- page table boundaries
- vma range
- large folio size
- fault_around_bytes
fault_around_bytes allows the user to control the batch size if they
wish to do so; see the illustrative sketch below.
Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
* based on next-20230112
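
For illustration only (not part of the patch): the toy userspace program
below mimics how the four limits listed in the changelog are intersected
into a single population window. All addresses, sizes and helper names in
it are made up for this example; the real logic lives in do_set_multi_ptes()
in the diff. Note that fault_around_bytes can be tuned at runtime through
debugfs (/sys/kernel/debug/fault_around_bytes) on kernels built with
CONFIG_DEBUG_FS.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PMD_SIZE	(512 * PAGE_SIZE)	/* 2 MiB on x86-64 */

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical fault: a 64KiB folio mapped at the start of a VMA. */
	unsigned long addr = 0x7f0000035000UL;		/* faulting address */
	unsigned long vm_start = 0x7f0000030000UL;
	unsigned long vm_end = 0x7f0000200000UL;
	unsigned long folio_start = 0x7f0000030000UL;
	unsigned long folio_end = folio_start + 16 * PAGE_SIZE;
	unsigned long fault_around_bytes = 65536UL;

	unsigned long mask = ~(fault_around_bytes - 1) & PAGE_MASK;
	unsigned long start, end;

	/* 1. stay inside one page table (PMD) */
	start = addr & ~(PMD_SIZE - 1);
	end = start + PMD_SIZE;

	/* 2. clamp to the fault_around_bytes window, 3. clamp to the VMA */
	start = max_ul(max_ul(start, addr & mask), vm_start);
	end = min_ul(min_ul(end, (addr & mask) + fault_around_bytes), vm_end);

	/* 4. clamp to the folio */
	start = max_ul(start, folio_start);
	end = min_ul(end, folio_end);

	/* prints: populate 16 PTEs: [0x7f0000030000, 0x7f0000040000) */
	printf("populate %lu PTEs: [%#lx, %#lx)\n",
	       (end - start) / PAGE_SIZE, start, end);
	return 0;
}

With these example values the whole 64KiB folio is mapped by a single
fault instead of 16 separate ones, which is the intent of the patch.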
mm/memory.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 99 insertions(+), 3 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 56b571c83a0e..755e6e590481 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -104,6 +104,10 @@ EXPORT_SYMBOL(mem_map);
#endif
static vm_fault_t do_fault(struct vm_fault *vmf);
+static inline bool allowed_batched_set_ptes(struct vm_fault *vmf,
+		struct page *page);
+static void do_set_multi_ptes(struct vm_fault *vmf, struct page *page,
+		unsigned long addr);
/*
* A number of key systems in x86 including ioremap() rely on the assumption
@@ -4359,10 +4363,16 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
/* Re-check under ptl */
if (likely(!vmf_pte_changed(vmf))) {
-		do_set_pte(vmf, page, vmf->address);
+		if (allowed_batched_set_ptes(vmf, page))
+			do_set_multi_ptes(vmf, page, vmf->address);
+		else {
+			do_set_pte(vmf, page, vmf->address);
-		/* no need to invalidate: a not-present page won't be cached */
-		update_mmu_cache(vma, vmf->address, vmf->pte);
+			/* no need to invalidate: a not-present page
+			 * won't be cached
+			 */
+			update_mmu_cache(vma, vmf->address, vmf->pte);
+		}
ret = 0;
} else {
@@ -4476,6 +4486,92 @@ static inline bool should_fault_around(struct vm_fault *vmf)
return fault_around_bytes >> PAGE_SHIFT > 1;
}
+/* Return true if multiple PTEs can be populated for this file fault */
+static inline bool allowed_batched_set_ptes(struct vm_fault *vmf,
+		struct page *page)
+{
+	struct folio *folio = page_folio(page);
+
+	if (uffd_disable_fault_around(vmf->vma))
+		return false;
+
+	if (!folio_test_large(folio))
+		return false;
+
+	/* TODO: Will revise once anon mappings support large folios */
+	if ((vmf->flags & FAULT_FLAG_WRITE) &&
+			!(vmf->vma->vm_flags & VM_SHARED))
+		return false;
+
+	return fault_around_bytes >> PAGE_SHIFT > 1;
+}
+
+static void do_set_multi_ptes(struct vm_fault *vmf, struct page *pg,
+		unsigned long addr)
+{
+	struct folio *folio = page_folio(pg);
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long size, mask, start, end, folio_start, folio_end;
+	int dist, first_idx, i = 0;
+	pte_t *pte;
+
+	/* in page table range */
+	start = ALIGN_DOWN(addr, PMD_SIZE);
+	end = ALIGN(addr, PMD_SIZE);
+
+	/* in fault_around_bytes range */
+	size = READ_ONCE(fault_around_bytes);
+	mask = ~(size - 1) & PAGE_MASK;
+
+	/* in vma range */
+	start = max3(start, (addr & mask), vma->vm_start);
+	end = min3(end, (addr & mask) + size, vma->vm_end);
+
+	/* folio is locked and referenced. It will not be split or
+	 * removed from page cache in this function.
+	 */
+	folio_start = addr - (folio_page_idx(folio, pg) << PAGE_SHIFT);
+	folio_end = folio_start + (folio_nr_pages(folio) << PAGE_SHIFT);
+
+	/* in folio size range */
+	start = max(start, folio_start);
+	end = min(end, folio_end);
+
+	dist = (addr - start) >> PAGE_SHIFT;
+	first_idx = folio_page_idx(folio, pg) - dist;
+	pte = vmf->pte - dist;
+
+	do {
+		struct page *page = folio_page(folio, first_idx + i);
+		bool write = vmf->flags & FAULT_FLAG_WRITE;
+		bool prefault = page != pg;
+		pte_t entry;
+
+		if (!pte_none(*pte))
+			continue;
+
+		flush_icache_page(vma, page);
+		entry = mk_pte(page, vma->vm_page_prot);
+
+		if (prefault)
+			folio_get(folio);
+
+		if (prefault && arch_wants_old_prefaulted_pte())
+			entry = pte_mkold(entry);
+		else
+			entry = pte_sw_mkyoung(entry);
+
+		if (write)
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+
+		inc_mm_counter(vma->vm_mm, mm_counter_file(&folio->page));
+		page_add_file_rmap(page, vma, false);
+
+		set_pte_at(vma->vm_mm, start, pte, entry);
+		update_mmu_cache(vma, start, pte);
+	} while (pte++, start += PAGE_SIZE, i++, start < end);
+}
+
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
vm_fault_t ret = 0;
--
2.30.2
Thread overview: 9+ messages
2023-01-13 16:35 Yin Fengwei [this message]
2023-01-13 18:13 ` Matthew Wilcox
2023-01-14 0:58 ` Yin, Fengwei
2023-01-17 9:19 ` Yin, Fengwei
2023-01-17 10:37 ` David Hildenbrand
2023-01-17 14:46 ` Matthew Wilcox
2023-01-18 1:41 ` Yin, Fengwei
2023-01-18 14:05 ` Yin, Fengwei
2023-01-18 0:58 ` Yin, Fengwei