linux-mm.kvack.org archive mirror
From: "Zach O'Keefe" <zokeefe@google.com>
To: Alex Shi <alex.shi@linux.alibaba.com>,
	David Hildenbrand <david@redhat.com>,
	 David Rientjes <rientjes@google.com>,
	Michal Hocko <mhocko@suse.com>,
	 Pasha Tatashin <pasha.tatashin@soleen.com>,
	SeongJae Park <sj@kernel.org>,  Song Liu <songliubraving@fb.com>,
	Vlastimil Babka <vbabka@suse.cz>, Zi Yan <ziy@nvidia.com>,
	 linux-mm@kvack.org
Cc: Andrea Arcangeli <aarcange@redhat.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	 Arnd Bergmann <arnd@arndb.de>,
	Axel Rasmussen <axelrasmussen@google.com>,
	 Chris Kennelly <ckennelly@google.com>,
	Chris Zankel <chris@zankel.net>, Helge Deller <deller@gmx.de>,
	 Hugh Dickins <hughd@google.com>,
	Ivan Kokshaysky <ink@jurassic.park.msu.ru>,
	 "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>,
	Jens Axboe <axboe@kernel.dk>,
	 "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
	Matthew Wilcox <willy@infradead.org>,
	 Matt Turner <mattst88@gmail.com>,
	Max Filippov <jcmvbkbc@gmail.com>,
	 Miaohe Lin <linmiaohe@huawei.com>,
	Minchan Kim <minchan@kernel.org>,
	 Patrick Xia <patrickx@google.com>,
	Pavel Begunkov <asml.silence@gmail.com>,
	 Peter Xu <peterx@redhat.com>,
	Richard Henderson <rth@twiddle.net>,
	 Thomas Bogendoerfer <tsbogend@alpha.franken.de>,
	Yang Shi <shy828301@gmail.com>,
	 "Zach O'Keefe" <zokeefe@google.com>
Subject: [RFC PATCH 12/14] mm/madvise: introduce batched madvise(MADV_COLLAPSE) collapse
Date: Tue,  8 Mar 2022 13:34:15 -0800	[thread overview]
Message-ID: <20220308213417.1407042-13-zokeefe@google.com> (raw)
In-Reply-To: <20220308213417.1407042-1-zokeefe@google.com>

Introduce the main madvise collapse batched logic, including the overall
locking strategy.  The individual batched actions, such as scanning pmds
in batch, are stubbed out here and will be added later in the series.

Note that the main benefit of doing all this work in batches is that
__madvise_collapse_pmd_batch() (currently stubbed out) can collapse an
entire batch of pmds under a single mmap_lock write acquisition.
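
For reference, the per-batch lock choreography implemented by
_madvise_collapse() below is, in outline (the names refer to the stubs
introduced in this patch):

	mmap_read_lock(mm);
	__madvise_collapse_scan_pmd_batch();         /* pick collapsible pmds */
	mmap_read_unlock(mm);

	__madvise_collapse_prealloc_hpages_batch();  /* no mmap_lock held */

	mmap_read_lock(mm);
	__madvise_collapse_swapin_pmd_batch();       /* may drop/retake lock */
	mmap_read_unlock(mm);

	mmap_write_lock(mm);
	__madvise_collapse_pmd_batch();              /* the one write-locked step */
	mmap_write_unlock(mm);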

Per-batch data is stored in a struct madvise_collapse_data array, with an
entry for each pmd to collapse, and is shared between the various *_batch
actions.  This allows for partial success when collapsing a range of
pmds: collapse continues as long as some pmds in the batch can still be
successfully collapsed.

A "success" here is when all pmds in the range are (or already were)
collapsed.  On failure, the caller will need to check, via smaps or
otherwise, which pmds, if any, were successfully collapsed.
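
As a purely illustrative (hypothetical, not part of this patch) sketch of
the caller-side contract, a userspace caller might do something like the
following; the MADV_COLLAPSE value is assumed to come from the uapi header
added earlier in this series, and the numeric fallback is only a placeholder:

	#include <stdio.h>
	#include <sys/mman.h>

	#ifndef MADV_COLLAPSE
	#define MADV_COLLAPSE 25	/* placeholder; use the series' uapi value */
	#endif

	/* Hypothetical helper: returns 0 iff the whole range is backed by THPs. */
	int collapse_range(void *addr, size_t len)
	{
		if (madvise(addr, len, MADV_COLLAPSE) == 0)
			return 0;

		/*
		 * Failure may still be a partial success: some pmds may have
		 * been collapsed before the error.  Check AnonHugePages in
		 * /proc/self/smaps to see what actually happened.
		 */
		perror("madvise(MADV_COLLAPSE)");
		return -1;
	}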

Also note that, where possible, if collapse fails for a particular pmd
after a hugepage has already been allocated, said hugepage is kept on a
per-node free list for the purpose of backing subsequent pmd collapses.
All unused hugepages are returned before _madvise_collapse() returns.

Note that bisecting at this patch is safe: madvise(MADV_COLLAPSE) will
simply always return -1.

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 279 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 273 insertions(+), 6 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ca1e523086ed..ea53c706602e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -86,6 +86,9 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
 #define MAX_PTE_MAPPED_THP 8
 
 struct collapse_control {
+	/* Used by MADV_COLLAPSE batch collapse */
+	struct list_head free_hpages[MAX_NUMNODES];
+
 	/* Respect khugepaged_max_ptes_[none|swap|shared] */
 	bool enforce_pte_scan_limits;
 
@@ -99,8 +102,13 @@ struct collapse_control {
 static void collapse_control_init(struct collapse_control *cc,
 				  bool enforce_pte_scan_limits)
 {
+	int i;
+
 	cc->enforce_pte_scan_limits = enforce_pte_scan_limits;
 	cc->last_target_node = NUMA_NO_NODE;
+
+	for (i = 0; i < MAX_NUMNODES; ++i)
+		INIT_LIST_HEAD(cc->free_hpages + i);
 }
 
 /**
@@ -1033,7 +1041,7 @@ static pmd_t *find_pmd_or_thp_or_none(struct mm_struct *mm,
 	/* See comments in pmd_none_or_trans_huge_or_clear_bad() */
 	barrier();
 #endif
-	if (!pmd_present(pmde) || !pmd_none(pmde)) {
+	if (!pmd_present(pmde) || pmd_none(pmde)) {
 		*result = SCAN_PMD_NULL;
 		return NULL;
 	} else if (pmd_trans_huge(pmde)) {
@@ -1054,12 +1062,16 @@ static pmd_t *find_pmd_or_thp_or_none(struct mm_struct *mm,
 static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long haddr, pmd_t *pmd,
-					int referenced)
+					int referenced,
+					unsigned long vm_flags_ignored,
+					bool *mmap_lock_dropped)
 {
 	int swapped_in = 0;
 	vm_fault_t ret = 0;
 	unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
 
+	if (mmap_lock_dropped)
+		*mmap_lock_dropped = false;
 	for (address = haddr; address < end; address += PAGE_SIZE) {
 		struct vm_fault vmf = {
 			.vma = vma,
@@ -1080,8 +1092,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 
 		/* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
 		if (ret & VM_FAULT_RETRY) {
+			if (mmap_lock_dropped)
+				*mmap_lock_dropped = true;
 			mmap_read_lock(mm);
-			if (hugepage_vma_revalidate(mm, haddr, VM_NONE, &vma)) {
+			if (hugepage_vma_revalidate(mm, haddr, vm_flags_ignored, &vma)) {
 				/* vma is no longer available, don't continue to swapin */
 				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
 				return false;
@@ -1256,7 +1270,8 @@ static void khugepaged_collapse_huge_page(struct mm_struct *mm,
 	 * Continuing to collapse causes inconsistency.
 	 */
 	if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
-						     pmd, referenced)) {
+						     pmd, referenced, VM_NONE,
+						     NULL)) {
 		mmap_read_unlock(mm);
 		goto out_nolock;
 	}
@@ -2520,6 +2535,128 @@ void khugepaged_min_free_kbytes_update(void)
 	mutex_unlock(&khugepaged_mutex);
 }
 
+struct madvise_collapse_data {
+	struct page *hpage; /* Preallocated THP */
+	bool continue_collapse;  /* Should we attempt / continue collapse? */
+
+	struct scan_pmd_result scan_result;
+	pmd_t *pmd;
+};
+
+static int
+madvise_collapse_vma_revalidate_pmd_count(struct mm_struct *mm,
+					  unsigned long address, int nr,
+					  struct vm_area_struct **vmap)
+{
+	/* madvise_collapse() ignores MADV_NOHUGEPAGE */
+	return hugepage_vma_revalidate_pmd_count(mm, address, nr, VM_NOHUGEPAGE,
+			vmap);
+}
+
+/*
+ * Scan pmds to see which can be collapsed, and determine the node to allocate on.
+ *
+ * Must be called with mmap_lock in read, and returns with the lock held in
+ * read. Does not drop the lock.
+ *
+ * Set batch_data[i]->continue_collapse to false for any pmd that can't be
+ * collapsed.
+ *
+ * Return the number of existing THPs in batch.
+ */
+static int
+__madvise_collapse_scan_pmd_batch(struct mm_struct *mm,
+				  struct vm_area_struct *vma,
+				  unsigned long batch_start,
+				  struct madvise_collapse_data *batch_data,
+				  int batch_size,
+				  struct collapse_control *cc)
+{
+	/* Implemented in later patch */
+	return 0;
+}
+
+/*
+ * Preallocate and charge huge page for each pmd in the batch, store the
+ * new page in batch_data[i]->hpage.
+ *
+ * Return the number of huge pages allocated.
+ */
+static int
+__madvise_collapse_prealloc_hpages_batch(struct mm_struct *mm,
+					 gfp_t gfp,
+					 int node,
+					 struct madvise_collapse_data *batch_data,
+					 int batch_size,
+					 struct collapse_control *cc)
+{
+	/* Implemented in later patch */
+	return 0;
+}
+
+/*
+ * Do swapin for all ranges in batch, returns true iff successful.
+ *
+ * Called with mmap_lock held in read, and returns with it held in read.
+ * Might drop the lock.
+ *
+ * Set batch_data[i]->continue_collapse to false for any pmd that can't be
+ * collapsed. Else, set batch_data[i]->pmd to the found pmd.
+ */
+static bool
+__madvise_collapse_swapin_pmd_batch(struct mm_struct *mm,
+				    int node,
+				    unsigned long batch_start,
+				    struct madvise_collapse_data *batch_data,
+				    int batch_size,
+				    struct collapse_control *cc)
+
+{
+	/* Implemented in later patch */
+	return true;
+}
+
+/*
+ * Do the collapse operation. Return number of THPs collapsed successfully.
+ *
+ * Called with mmap_lock held in write, and returns with it held. Does not
+ * drop the lock.
+ */
+static int
+__madvise_collapse_pmd_batch(struct mm_struct *mm,
+			     unsigned long batch_start,
+			     int batch_size,
+			     struct madvise_collapse_data *batch_data,
+			     int node,
+			     struct collapse_control *cc)
+{
+	/* Implemented in later patch */
+	return 0;
+}
+
+static bool continue_collapse(struct madvise_collapse_data *batch_data,
+			      int batch_size)
+{
+	int i;
+
+	for (i = 0; i < batch_size; ++i)
+		if (batch_data[i].continue_collapse)
+			return true;
+	return false;
+}
+
+static bool madvise_transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+	if (vma_is_anonymous(vma))
+		/* madvise_collapse() ignores MADV_NOHUGEPAGE */
+		return __transparent_hugepage_enabled(vma, vma->vm_flags &
+						      ~VM_NOHUGEPAGE);
+	/* TODO: Support file-backed memory */
+	return false;
+}
+
+#define MADVISE_COLLAPSE_BATCH_SIZE 8
+
 /*
  * Returns 0 if successfully able to collapse range into THPs (or range already
  * backed by THPs). Due to implementation detail, THPs collapsed here may be
@@ -2532,8 +2669,138 @@ static int _madvise_collapse(struct mm_struct *mm,
 			     unsigned long end, gfp_t gfp,
 			     struct collapse_control *cc)
 {
-	/* Implemented in later patch */
-	return -ENOSYS;
+	unsigned long hstart, hend, batch_addr;
+	int ret = -EINVAL, collapsed = 0, nr_hpages = 0, i;
+	struct madvise_collapse_data batch_data[MADVISE_COLLAPSE_BATCH_SIZE];
+
+	mmap_assert_locked(mm);
+	BUG_ON(vma->vm_start > start);
+	BUG_ON(vma->vm_end < end);
+	VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
+
+	hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+	hend = end & HPAGE_PMD_MASK;
+	nr_hpages = (hend - hstart) >> HPAGE_PMD_SHIFT;
+	if (hstart >= hend)
+		goto out;
+
+	if (!madvise_transparent_hugepage_enabled(vma))
+		goto out;
+
+	/*
+	 * Request might cover multiple hugepages. Strategy is to batch
+	 * allocation and collapse operations so that we do more work while
+	 * mmap_lock is held exclusively.
+	 *
+	 * While processing batch, mmap_lock is locked/unlocked many times for
+	 * the supplied VMA. It's possible that the original VMA is split while
+	 * lock was dropped. If in the context of the (possibly new) VMA, THP
+	 * collapse is possible, we continue.
+	 */
+	for (batch_addr = hstart;
+	     batch_addr < hend;
+	     batch_addr += HPAGE_PMD_SIZE * MADVISE_COLLAPSE_BATCH_SIZE) {
+		int node, batch_size;
+		int thps; /* Number of existing THPs in range */
+
+		batch_size = (hend - batch_addr) >> HPAGE_PMD_SHIFT;
+		batch_size = min_t(int, batch_size,
+				   MADVISE_COLLAPSE_BATCH_SIZE);
+
+		BUG_ON(batch_size <= 0);
+		memset(batch_data, 0, sizeof(batch_data));
+		cond_resched();
+		VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
+
+		/*
+		 * If first batch, we still hold mmap_lock from madvise
+		 * call and haven't dropped it since checking the VMA. Else,
+		 * we've dropped the lock and we need to revalidate.
+		 */
+		if (batch_addr != hstart) {
+			mmap_read_lock(mm);
+			if (madvise_collapse_vma_revalidate_pmd_count(mm,
+								      batch_addr,
+								      batch_size,
+								      &vma))
+				goto loop_unlock_break;
+		}
+
+		mmap_assert_locked(mm);
+
+		thps = __madvise_collapse_scan_pmd_batch(mm, vma, batch_addr,
+							 batch_data, batch_size,
+							 cc);
+		mmap_read_unlock(mm);
+
+		/* Count existing THPs as-if we collapsed them */
+		collapsed += thps;
+		if (thps == batch_size || !continue_collapse(batch_data,
+							     batch_size))
+			continue;
+
+		node = find_target_node(cc);
+		if (!__madvise_collapse_prealloc_hpages_batch(mm, gfp, node,
+							      batch_data,
+							      batch_size, cc)) {
+			/* No more THPs available - so give up */
+			ret = -ENOMEM;
+			break;
+		}
+
+		mmap_read_lock(mm);
+		if (!__madvise_collapse_swapin_pmd_batch(mm, node, batch_addr,
+							 batch_data, batch_size,
+							 cc))
+			goto loop_unlock_break;
+		mmap_read_unlock(mm);
+		mmap_write_lock(mm);
+		collapsed += __madvise_collapse_pmd_batch(mm,
+				batch_addr, batch_size, batch_data,
+				node, cc);
+		mmap_write_unlock(mm);
+
+		for (i = 0; i < batch_size; ++i) {
+			struct page *page = batch_data[i].hpage;
+
+			if (page && !IS_ERR(page)) {
+				list_add_tail(&page->lru,
+					      &cc->free_hpages[node]);
+				batch_data[i].hpage = NULL;
+			}
+		}
+		/* mmap_lock is unlocked here */
+		continue;
+loop_unlock_break:
+		mmap_read_unlock(mm);
+		break;
+	}
+	/* mmap_lock is unlocked here */
+
+	for (i = 0; i < MADVISE_COLLAPSE_BATCH_SIZE; ++i) {
+		struct page *page = batch_data[i].hpage;
+
+		if (page && !IS_ERR(page)) {
+			mem_cgroup_uncharge(page_folio(page));
+			put_page(page);
+		}
+	}
+	for (i = 0; i < MAX_NUMNODES; ++i) {
+		struct page *page, *tmp;
+
+		list_for_each_entry_safe(page, tmp, cc->free_hpages + i, lru) {
+			list_del(&page->lru);
+			mem_cgroup_uncharge(page_folio(page));
+			put_page(page);
+		}
+	}
+	ret = collapsed == nr_hpages ? 0 : -1;
+	vma = NULL;		/* tell sys_madvise we dropped mmap_lock */
+	mmap_read_lock(mm);	/* sys_madvise expects us to have mmap_lock */
+out:
+	*prev = vma;		/* we didn't drop mmap_lock, so this holds */
+
+	return ret;
 }
 
 int madvise_collapse(struct vm_area_struct *vma,
-- 
2.35.1.616.g0bdcbb4464-goog



Thread overview: 57+ messages
2022-03-08 21:34 [RFC PATCH 00/14] mm: userspace hugepage collapse Zach O'Keefe
2022-03-08 21:34 ` [RFC PATCH 01/14] mm/rmap: add mm_find_pmd_raw helper Zach O'Keefe
2022-03-09 22:48   ` Yang Shi
2022-03-08 21:34 ` [RFC PATCH 02/14] mm/khugepaged: add struct collapse_control Zach O'Keefe
2022-03-09 22:53   ` Yang Shi
2022-03-08 21:34 ` [RFC PATCH 03/14] mm/khugepaged: add __do_collapse_huge_page() helper Zach O'Keefe
2022-03-08 21:34 ` [RFC PATCH 04/14] mm/khugepaged: separate khugepaged_scan_pmd() scan and collapse Zach O'Keefe
2022-03-08 21:34 ` [RFC PATCH 05/14] mm/khugepaged: add mmap_assert_locked() checks to scan_pmd() Zach O'Keefe
2022-03-08 21:34 ` [RFC PATCH 06/14] mm/khugepaged: add hugepage_vma_revalidate_pmd_count() Zach O'Keefe
2022-03-09 23:15   ` Yang Shi
2022-03-08 21:34 ` [RFC PATCH 07/14] mm/khugepaged: add vm_flags_ignore to hugepage_vma_revalidate_pmd_count() Zach O'Keefe
2022-03-09 23:17   ` Yang Shi
2022-03-10  0:00     ` Zach O'Keefe
2022-03-10  0:41       ` Yang Shi
2022-03-10  1:09         ` Zach O'Keefe
2022-03-10  2:16           ` Yang Shi
2022-03-10 15:50             ` Zach O'Keefe
2022-03-10 18:17               ` Yang Shi
2022-03-10 18:46                 ` David Rientjes
2022-03-10 18:58                   ` Zach O'Keefe
2022-03-10 19:54                   ` Yang Shi
2022-03-10 20:24                     ` Zach O'Keefe
2022-03-10 18:53                 ` Zach O'Keefe
2022-03-10 15:56   ` David Hildenbrand
2022-03-10 18:39     ` Zach O'Keefe
2022-03-10 18:54     ` David Rientjes
2022-03-21 14:27       ` Michal Hocko
2022-03-08 21:34 ` [RFC PATCH 08/14] mm/thp: add madv_thp_vm_flags to __transparent_hugepage_enabled() Zach O'Keefe
2022-03-08 21:34 ` [RFC PATCH 09/14] mm/khugepaged: record SCAN_PAGE_COMPOUND when scan_pmd() finds THP Zach O'Keefe
2022-03-09 23:40   ` Yang Shi
2022-03-10  0:46     ` Zach O'Keefe
2022-03-10  2:05       ` Yang Shi
2022-03-10  8:37         ` Zach O'Keefe
2022-03-08 21:34 ` [RFC PATCH 10/14] mm/khugepaged: rename khugepaged-specific/not functions Zach O'Keefe
2022-03-08 21:34 ` [RFC PATCH 11/14] mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse Zach O'Keefe
2022-03-09 23:43   ` Yang Shi
2022-03-10  1:11     ` Zach O'Keefe
2022-03-08 21:34 ` Zach O'Keefe [this message]
2022-03-10  0:06   ` [RFC PATCH 12/14] mm/madvise: introduce batched madvise(MADV_COLLAPSE) collapse Yang Shi
2022-03-10 19:26     ` David Rientjes
2022-03-10 20:16       ` Matthew Wilcox
2022-03-11  0:06         ` Zach O'Keefe
2022-03-25 16:51           ` Zach O'Keefe
2022-03-25 19:54             ` Yang Shi
2022-03-08 21:34 ` [RFC PATCH 13/14] mm/madvise: add __madvise_collapse_*_batch() actions Zach O'Keefe
2022-03-08 21:34 ` [RFC PATCH 14/14] mm/madvise: add process_madvise(MADV_COLLAPSE) Zach O'Keefe
2022-03-21 14:32 ` [RFC PATCH 00/14] mm: userspace hugepage collapse Zi Yan
2022-03-21 14:51   ` Zach O'Keefe
2022-03-21 14:37 ` Michal Hocko
2022-03-21 15:46   ` Zach O'Keefe
2022-03-22 12:11     ` Michal Hocko
2022-03-22 15:53       ` Zach O'Keefe
2022-03-29 12:24         ` Michal Hocko
2022-03-30  0:36           ` Zach O'Keefe
2022-03-22  6:40 ` Zach O'Keefe
2022-03-22 12:05   ` Michal Hocko
2022-03-23 13:30     ` Zach O'Keefe
