From: Yin Fengwei <fengwei.yin@intel.com>
To: willy@infradead.org, linux-mm@kvack.org
Cc: dave.hansen@intel.com, tim.c.chen@intel.com,
ying.huang@intel.com, fengwei.yin@intel.com
Subject: [RFC PATCH] mm: populate multiple PTEs if file page is large folio
Date: Sat, 14 Jan 2023 00:35:38 +0800
Message-ID: <20230113163538.23412-1-fengwei.yin@intel.com>
The number of page faults can be reduced by populating PTEs in batches.
The batch of PTEs being populated is not allowed to cross:
- page table boundaries
- vma range
- large folio size
- fault_around_bytes
fault_around_bytes allows the user to control the batch size if they
wish to do so; see the illustrative sketch below.
Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
* based on next-20230112
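
For illustration only (not part of the patch): the toy userspace program
below mimics how the four limits listed in the changelog are intersected
into a single population window. All addresses, sizes and helper names in
it are made up for this example; the real logic lives in do_set_multi_ptes()
in the diff. Note that fault_around_bytes can be tuned at runtime through
debugfs (/sys/kernel/debug/fault_around_bytes) on kernels built with
CONFIG_DEBUG_FS.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PMD_SIZE	(512 * PAGE_SIZE)	/* 2 MiB on x86-64 */

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical fault: a 64KiB folio mapped at the start of a VMA. */
	unsigned long addr = 0x7f0000035000UL;		/* faulting address */
	unsigned long vm_start = 0x7f0000030000UL;
	unsigned long vm_end = 0x7f0000200000UL;
	unsigned long folio_start = 0x7f0000030000UL;
	unsigned long folio_end = folio_start + 16 * PAGE_SIZE;
	unsigned long fault_around_bytes = 65536UL;

	unsigned long mask = ~(fault_around_bytes - 1) & PAGE_MASK;
	unsigned long start, end;

	/* 1. stay inside one page table (PMD) */
	start = addr & ~(PMD_SIZE - 1);
	end = start + PMD_SIZE;

	/* 2. clamp to the fault_around_bytes window, 3. clamp to the VMA */
	start = max_ul(max_ul(start, addr & mask), vm_start);
	end = min_ul(min_ul(end, (addr & mask) + fault_around_bytes), vm_end);

	/* 4. clamp to the folio */
	start = max_ul(start, folio_start);
	end = min_ul(end, folio_end);

	/* prints: populate 16 PTEs: [0x7f0000030000, 0x7f0000040000) */
	printf("populate %lu PTEs: [%#lx, %#lx)\n",
	       (end - start) / PAGE_SIZE, start, end);
	return 0;
}

With these example values the whole 64KiB folio is mapped by a single
fault instead of 16 separate ones, which is the intent of the patch.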
mm/memory.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 99 insertions(+), 3 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 56b571c83a0e..755e6e590481 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -104,6 +104,10 @@ EXPORT_SYMBOL(mem_map);
#endif
static vm_fault_t do_fault(struct vm_fault *vmf);
+static inline bool allowed_batched_set_ptes(struct vm_fault *vmf,
+		struct page *page);
+static void do_set_multi_ptes(struct vm_fault *vmf, struct page *page,
+		unsigned long addr);
/*
* A number of key systems in x86 including ioremap() rely on the assumption
@@ -4359,10 +4363,16 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
/* Re-check under ptl */
if (likely(!vmf_pte_changed(vmf))) {
-		do_set_pte(vmf, page, vmf->address);
+		if (allowed_batched_set_ptes(vmf, page))
+			do_set_multi_ptes(vmf, page, vmf->address);
+		else {
+			do_set_pte(vmf, page, vmf->address);
-		/* no need to invalidate: a not-present page won't be cached */
-		update_mmu_cache(vma, vmf->address, vmf->pte);
+			/* no need to invalidate: a not-present page
+			 * won't be cached
+			 */
+			update_mmu_cache(vma, vmf->address, vmf->pte);
+		}
ret = 0;
} else {
@@ -4476,6 +4486,92 @@ static inline bool should_fault_around(struct vm_fault *vmf)
return fault_around_bytes >> PAGE_SHIFT > 1;
}
+/* Return true if multiple PTEs can be populated for this file fault */
+static inline bool allowed_batched_set_ptes(struct vm_fault *vmf,
+		struct page *page)
+{
+	struct folio *folio = page_folio(page);
+
+	if (uffd_disable_fault_around(vmf->vma))
+		return false;
+
+	if (!folio_test_large(folio))
+		return false;
+
+	/* TODO: Will revise once anon mappings support large folios */
+	if ((vmf->flags & FAULT_FLAG_WRITE) &&
+			!(vmf->vma->vm_flags & VM_SHARED))
+		return false;
+
+	return fault_around_bytes >> PAGE_SHIFT > 1;
+}
+
+static void do_set_multi_ptes(struct vm_fault *vmf, struct page *pg,
+		unsigned long addr)
+{
+	struct folio *folio = page_folio(pg);
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long size, mask, start, end, folio_start, folio_end;
+	int dist, first_idx, i = 0;
+	pte_t *pte;
+
+	/* in page table range */
+	start = ALIGN_DOWN(addr, PMD_SIZE);
+	end = ALIGN(addr, PMD_SIZE);
+
+	/* in fault_around_bytes range */
+	size = READ_ONCE(fault_around_bytes);
+	mask = ~(size - 1) & PAGE_MASK;
+
+	/* in vma range */
+	start = max3(start, (addr & mask), vma->vm_start);
+	end = min3(end, (addr & mask) + size, vma->vm_end);
+
+	/* folio is locked and referenced. It will not be split or
+	 * removed from page cache in this function.
+	 */
+	folio_start = addr - (folio_page_idx(folio, pg) << PAGE_SHIFT);
+	folio_end = folio_start + (folio_nr_pages(folio) << PAGE_SHIFT);
+
+	/* in folio size range */
+	start = max(start, folio_start);
+	end = min(end, folio_end);
+
+	dist = (addr - start) >> PAGE_SHIFT;
+	first_idx = folio_page_idx(folio, pg) - dist;
+	pte = vmf->pte - dist;
+
+	do {
+		struct page *page = folio_page(folio, first_idx + i);
+		bool write = vmf->flags & FAULT_FLAG_WRITE;
+		bool prefault = page != pg;
+		pte_t entry;
+
+		if (!pte_none(*pte))
+			continue;
+
+		flush_icache_page(vma, page);
+		entry = mk_pte(page, vma->vm_page_prot);
+
+		if (prefault)
+			folio_get(folio);
+
+		if (prefault && arch_wants_old_prefaulted_pte())
+			entry = pte_mkold(entry);
+		else
+			entry = pte_sw_mkyoung(entry);
+
+		if (write)
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+
+		inc_mm_counter(vma->vm_mm, mm_counter_file(&folio->page));
+		page_add_file_rmap(page, vma, false);
+
+		set_pte_at(vma->vm_mm, start, pte, entry);
+		update_mmu_cache(vma, start, pte);
+	} while (pte++, start += PAGE_SIZE, i++, start < end);
+}
+
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
vm_fault_t ret = 0;
--
2.30.2
Thread overview: 9+ messages
2023-01-13 16:35 Yin Fengwei [this message]
2023-01-13 18:13 ` Matthew Wilcox
2023-01-14 0:58 ` Yin, Fengwei
2023-01-17 9:19 ` Yin, Fengwei
2023-01-17 10:37 ` David Hildenbrand
2023-01-17 14:46 ` Matthew Wilcox
2023-01-18 1:41 ` Yin, Fengwei
2023-01-18 14:05 ` Yin, Fengwei
2023-01-18 0:58 ` Yin, Fengwei