From: <alexlzhu@fb.com>
To: <linux-mm@kvack.org>, <kernel-team@fb.com>
Cc: <willy@infradead.org>, <hannes@cmpxchg.org>, <riel@surriel.com>,
<yuzhao@google.com>, <ningzhang@linux.alibaba.com>,
Alexander Zhu <alexlzhu@fb.com>
Subject: [PATCH v6 3/5] mm: do not remap clean subpages when splitting isolated thp
Date: Wed, 2 Nov 2022 23:01:45 -0700 [thread overview]
Message-ID: <ddc216d41ff8cf2953488a2e041856c6e8dbd51e.1667454613.git.alexlzhu@fb.com> (raw)
In-Reply-To: <cover.1667454613.git.alexlzhu@fb.com>
From: Alexander Zhu <alexlzhu@fb.com>
Changes to avoid remapping zero-filled subpages that are freed in split_huge_page().
Pages are not remapped except in the case of userfaultfd, where we
remap to the shared zero page, similar to what is done by KSM.
Signed-off-by: Alexander Zhu <alexlzhu@fb.com>
---
include/linux/rmap.h | 2 +-
include/linux/vm_event_item.h | 2 +
mm/huge_memory.c | 8 ++--
mm/migrate.c | 73 +++++++++++++++++++++++++++++++----
mm/migrate_device.c | 4 +-
mm/vmstat.c | 2 +
6 files changed, 77 insertions(+), 14 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bd3504d11b15..3f83bbcf1333 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -428,7 +428,7 @@ int folio_mkclean(struct folio *);
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
struct vm_area_struct *vma);
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
+void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean);
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index f733ffc5f6f3..3618b10ddec9 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -112,6 +112,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_SPLIT_PUD,
#endif
THP_SPLIT_FREE,
+ THP_SPLIT_UNMAP,
+ THP_SPLIT_REMAP_READONLY_ZERO_PAGE,
THP_ZERO_PAGE_ALLOC,
THP_ZERO_PAGE_ALLOC_FAILED,
THP_SWPOUT,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6a5c70080c07..cba0bbbb2a93 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2373,7 +2373,7 @@ static void unmap_folio(struct folio *folio)
try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
}
-static void remap_page(struct folio *folio, unsigned long nr)
+static void remap_page(struct folio *folio, unsigned long nr, bool unmap_clean)
{
int i = 0;
@@ -2381,7 +2381,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
if (!folio_test_anon(folio))
return;
for (;;) {
- remove_migration_ptes(folio, folio, true);
+ remove_migration_ptes(folio, folio, true, unmap_clean);
i += folio_nr_pages(folio);
if (i >= nr)
break;
@@ -2569,7 +2569,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
local_irq_enable();
- remap_page(folio, nr);
+ remap_page(folio, nr, PageAnon(head));
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
@@ -2798,7 +2798,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mapping)
xas_unlock(&xas);
local_irq_enable();
- remap_page(folio, folio_nr_pages(folio));
+ remap_page(folio, folio_nr_pages(folio), false);
ret = -EBUSY;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index dff333593a8a..2764b14d3383 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -30,6 +30,7 @@
#include <linux/writeback.h>
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
+#include <linux/vm_event_item.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/compaction.h>
@@ -168,13 +169,62 @@ void putback_movable_pages(struct list_head *l)
}
}
+static bool try_to_unmap_clean(struct page_vma_mapped_walk *pvmw, struct page *page)
+{
+ void *addr;
+ bool dirty;
+ pte_t newpte;
+
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+
+ if (PageMlocked(page) || (pvmw->vma->vm_flags & VM_LOCKED))
+ return false;
+
+ /*
+ * The pmd entry mapping the old thp was flushed and the pte mapping
+ * this subpage is no longer present. Therefore, this subpage is
+ * inaccessible. We don't need to remap it if it contains only zeros.
+ */
+ addr = kmap_local_page(page);
+ dirty = memchr_inv(addr, 0, PAGE_SIZE);
+ kunmap_local(addr);
+
+ if (dirty)
+ return false;
+
+ pte_clear_not_present_full(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, false);
+
+ if (userfaultfd_armed(pvmw->vma)) {
+ newpte = pte_mkspecial(pfn_pte(page_to_pfn(ZERO_PAGE(pvmw->address)),
+ pvmw->vma->vm_page_prot));
+ ptep_clear_flush(pvmw->vma, pvmw->address, pvmw->pte);
+ set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
+ dec_mm_counter(pvmw->vma->vm_mm, MM_ANONPAGES);
+ count_vm_event(THP_SPLIT_REMAP_READONLY_ZERO_PAGE);
+ return true;
+ }
+
+ dec_mm_counter(pvmw->vma->vm_mm, mm_counter(page));
+ count_vm_event(THP_SPLIT_UNMAP);
+ return true;
+}
+
+struct rmap_walk_arg {
+ struct folio *folio;
+ bool unmap_clean;
+};
+
/*
* Restore a potential migration pte to a working pte entry
*/
static bool remove_migration_pte(struct folio *folio,
- struct vm_area_struct *vma, unsigned long addr, void *old)
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
{
- DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+ struct rmap_walk_arg *rmap_walk_arg = arg;
+ DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
while (page_vma_mapped_walk(&pvmw)) {
rmap_t rmap_flags = RMAP_NONE;
@@ -197,6 +247,8 @@ static bool remove_migration_pte(struct folio *folio,
continue;
}
#endif
+ if (rmap_walk_arg->unmap_clean && try_to_unmap_clean(&pvmw, new))
+ continue;
folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
@@ -272,13 +324,20 @@ static bool remove_migration_pte(struct folio *folio,
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean)
{
+ struct rmap_walk_arg rmap_walk_arg = {
+ .folio = src,
+ .unmap_clean = unmap_clean,
+ };
+
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
- .arg = src,
+ .arg = &rmap_walk_arg,
};
+ VM_BUG_ON_FOLIO(unmap_clean && src != dst, src);
+
if (locked)
rmap_walk_locked(dst, &rwc);
else
@@ -872,7 +931,7 @@ static int writeout(struct address_space *mapping, struct folio *folio)
* At this point we know that the migration attempt cannot
* be successful.
*/
- remove_migration_ptes(folio, folio, false);
+ remove_migration_ptes(folio, folio, false, false);
rc = mapping->a_ops->writepage(&folio->page, &wbc);
@@ -1128,7 +1187,7 @@ static int __unmap_and_move(struct folio *src, struct folio *dst,
if (page_was_mapped)
remove_migration_ptes(src,
- rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+ rc == MIGRATEPAGE_SUCCESS ? dst : src, false, false);
out_unlock_both:
folio_unlock(dst);
@@ -1338,7 +1397,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (page_was_mapped)
remove_migration_ptes(src,
- rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+ rc == MIGRATEPAGE_SUCCESS ? dst : src, false, false);
unlock_put_anon:
folio_unlock(dst);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 6fa682eef7a0..6508a083d7fd 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -421,7 +421,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
continue;
folio = page_folio(page);
- remove_migration_ptes(folio, folio, false);
+ remove_migration_ptes(folio, folio, false, false);
src_pfns[i] = 0;
folio_unlock(folio);
@@ -847,7 +847,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
src = page_folio(page);
dst = page_folio(newpage);
- remove_migration_ptes(src, dst, false);
+ remove_migration_ptes(src, dst, false, false);
folio_unlock(src);
if (is_zone_device_page(page))
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a2ba5d7922f4..3d802eb6754d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1360,6 +1360,8 @@ const char * const vmstat_text[] = {
"thp_split_pud",
#endif
"thp_split_free",
+ "thp_split_unmap",
+ "thp_split_remap_readonly_zero_page",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
"thp_swpout",
--
2.30.2
next prev parent reply other threads:[~2022-11-03 6:02 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-11-03 6:01 [PATCH v6 0/5] THP Shrinker alexlzhu
2022-11-03 6:01 ` [PATCH v6 1/5] mm: add thp_utilization metrics to debugfs alexlzhu
2022-11-03 6:01 ` [PATCH v6 2/5] mm: changes to split_huge_page() to free zero filled tail pages alexlzhu
2022-11-03 6:01 ` alexlzhu [this message]
2022-11-03 12:48 ` [PATCH v6 3/5] mm: do not remap clean subpages when splitting isolated thp kernel test robot
2022-11-03 13:19 ` kernel test robot
2022-11-03 6:01 ` [PATCH v6 4/5] mm: add selftests to split_huge_page() to verify unmap/zap of zero pages alexlzhu
2022-11-03 6:01 ` [PATCH v6 5/5] mm: THP low utilization shrinker alexlzhu
2022-11-03 13:19 ` kernel test robot
2023-01-06 13:41 ` Tarun Sahu
2023-01-02 18:05 ` [PATCH v6 0/5] THP Shrinker David Hildenbrand
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=ddc216d41ff8cf2953488a2e041856c6e8dbd51e.1667454613.git.alexlzhu@fb.com \
--to=alexlzhu@fb.com \
--cc=hannes@cmpxchg.org \
--cc=kernel-team@fb.com \
--cc=linux-mm@kvack.org \
--cc=ningzhang@linux.alibaba.com \
--cc=riel@surriel.com \
--cc=willy@infradead.org \
--cc=yuzhao@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox