linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Yin Fengwei <fengwei.yin@intel.com>
To: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	akpm@linux-foundation.org, minchan@kernel.org, yuzhao@google.com,
	willy@infradead.org, david@redhat.com, ryan.roberts@arm.com,
	shy828301@gmail.com
Cc: fengwei.yin@intel.com
Subject: [RFC PATCH v2 4/4] madvise: avoid trying to split large folio always in cold_pageout
Date: Fri, 21 Jul 2023 17:40:43 +0800	[thread overview]
Message-ID: <20230721094043.2506691-5-fengwei.yin@intel.com> (raw)
In-Reply-To: <20230721094043.2506691-1-fengwei.yin@intel.com>

Current madvise_cold_or_pageout_pte_range() always tries to split
large folio.

Avoid trying to split large folio always by:
  - if large folio is in the request range, don't split it. Leave
    it to page reclaim to decide whether the large folio needs to
    be split.
  - if large folio crosses boundaries of request range, skip it if
    it's page cache. Try to split it if it's anonymous large folio.
    If failed to split it, just skip it.

Invoke folio_referenced() to clear the A bit for large folio. As it
will acquire pte lock, just do it after release pte lock.

Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
 mm/internal.h |  10 +++++
 mm/madvise.c  | 118 +++++++++++++++++++++++++++++++++++---------------
 2 files changed, 93 insertions(+), 35 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index c7dd15d8de3e..cd1ff348d690 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -586,6 +586,16 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma,
 extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
 			       unsigned long bytes);
 
+static inline unsigned int
+folio_op_size(struct folio *folio, pte_t pte,
+		unsigned long addr, unsigned long end)
+{
+	unsigned int nr;
+
+	nr = folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte);
+	return min_t(unsigned int, nr, (end - addr) >> PAGE_SHIFT);
+}
+
 static inline bool
 folio_in_range(struct folio *folio, struct vm_area_struct *vma,
 		unsigned long start, unsigned long end)
diff --git a/mm/madvise.c b/mm/madvise.c
index b236e201a738..71af370c3251 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -339,6 +339,23 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
 	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
+static inline bool skip_cur_entry(struct folio *folio, bool pageout_anon_only)
+{
+	if (!folio)
+		return true;
+
+	if (folio_is_zone_device(folio))
+		return true;
+
+	if (!folio_test_lru(folio))
+		return true;
+
+	if (pageout_anon_only && !folio_test_anon(folio))
+		return true;
+
+	return false;
+}
+
 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct mm_walk *walk)
@@ -352,7 +369,9 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	spinlock_t *ptl;
 	struct folio *folio = NULL;
 	LIST_HEAD(folio_list);
+	LIST_HEAD(reclaim_list);
 	bool pageout_anon_only_filter;
+	unsigned long start = addr;
 
 	if (fatal_signal_pending(current))
 		return -EINTR;
@@ -442,54 +461,90 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			continue;
 
 		folio = vm_normal_folio(vma, addr, ptent);
-		if (!folio || folio_is_zone_device(folio))
+		if (skip_cur_entry(folio, pageout_anon_only_filter))
 			continue;
 
 		/*
-		 * Creating a THP page is expensive so split it only if we
-		 * are sure it's worth. Split it if we are only owner.
+		 * Split large folio only if it's anonymous, cross the
+		 * boundaries of request range and we are likely the
+		 * only owner.
 		 */
 		if (folio_test_large(folio)) {
-			int err;
+			int err, step;
 
 			if (folio_estimated_sharers(folio) != 1)
-				break;
-			if (pageout_anon_only_filter && !folio_test_anon(folio))
-				break;
-			if (!folio_trylock(folio))
-				break;
+				continue;
+			if (folio_in_range(folio, vma, start, end))
+				goto pageout_cold_folio;
+			if (!folio_test_anon(folio) || !folio_trylock(folio))
+				continue;
+
 			folio_get(folio);
+			step = folio_op_size(folio, ptent, addr, end);
 			arch_leave_lazy_mmu_mode();
 			pte_unmap_unlock(start_pte, ptl);
 			start_pte = NULL;
 			err = split_folio(folio);
 			folio_unlock(folio);
 			folio_put(folio);
-			if (err)
-				break;
+
 			start_pte = pte =
 				pte_offset_map_lock(mm, pmd, addr, &ptl);
 			if (!start_pte)
 				break;
 			arch_enter_lazy_mmu_mode();
-			pte--;
-			addr -= PAGE_SIZE;
-			continue;
-		}
 
-		/*
-		 * Do not interfere with other mappings of this folio and
-		 * non-LRU folio.
-		 */
-		if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
+			/* split success. retry the same entry */
+			if (!err)
+				step = 0;
+
+			/*
+			 * Split fails, jump over the whole folio to avoid
+			 * grabbing same folio but fails to split it again
+			 * and again.
+			 */
+			pte += step - 1;
+			addr += (step - 1) << PAGE_SHIFT;
 			continue;
+		}
 
-		if (pageout_anon_only_filter && !folio_test_anon(folio))
+		/* Do not interfere with other mappings of this folio */
+		if (folio_mapcount(folio) != 1)
 			continue;
 
 		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
 		ptep_clear_flush_young_notify(vma, addr, pte);
+
+pageout_cold_folio:
+		if (folio_isolate_lru(folio)) {
+			if (folio_test_unevictable(folio))
+				folio_putback_lru(folio);
+			else
+				list_add(&folio->lru, &folio_list);
+		}
+	}
+
+	if (start_pte) {
+		arch_leave_lazy_mmu_mode();
+		pte_unmap_unlock(start_pte, ptl);
+	}
+
+	while (!list_empty(&folio_list)) {
+		folio = lru_to_folio(&folio_list);
+		list_del(&folio->lru);
+
+		if (folio_test_large(folio)) {
+			int refs;
+			unsigned long flags;
+			struct mem_cgroup *memcg = folio_memcg(folio);
+
+			refs = folio_referenced(folio, 0, memcg, &flags);
+			if ((flags & VM_LOCKED) || (refs == -1)) {
+				folio_putback_lru(folio);
+				continue;
+			}
+		}
+
 		/*
 		 * We are deactivating a folio for accelerating reclaiming.
 		 * VM couldn't reclaim the folio unless we clear PG_young.
@@ -501,22 +556,15 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		if (folio_test_active(folio))
 			folio_set_workingset(folio);
 		if (pageout) {
-			if (folio_isolate_lru(folio)) {
-				if (folio_test_unevictable(folio))
-					folio_putback_lru(folio);
-				else
-					list_add(&folio->lru, &folio_list);
-			}
-		} else
-			folio_deactivate(folio);
+			list_add(&folio->lru, &reclaim_list);
+		} else {
+			folio_clear_active(folio);
+			folio_putback_lru(folio);
+		}
 	}
 
-	if (start_pte) {
-		arch_leave_lazy_mmu_mode();
-		pte_unmap_unlock(start_pte, ptl);
-	}
 	if (pageout)
-		reclaim_pages(&folio_list);
+		reclaim_pages(&reclaim_list);
 	cond_resched();
 
 	return 0;
-- 
2.39.2



  parent reply	other threads:[~2023-07-21  9:42 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-21  9:40 [RFC PATCH v2 0/4] fix large folio for madvise_cold_or_pageout() Yin Fengwei
2023-07-21  9:40 ` [RFC PATCH v2 1/4] madvise: not use mapcount() against large folio for sharing check Yin Fengwei
2023-07-21 18:57   ` Yu Zhao
2023-07-23 12:26     ` Yin, Fengwei
2023-07-25  5:22       ` Yu Zhao
2023-07-21  9:40 ` [RFC PATCH v2 2/4] madvise: Use notify-able API to clear and flush page table entries Yin Fengwei
2023-07-25  5:55   ` Yu Zhao
2023-07-26  2:49     ` Yin Fengwei
2023-07-26  3:26       ` Yu Zhao
2023-07-26  4:44         ` Yin Fengwei
2023-07-26  5:40           ` Yu Zhao
2023-07-26  6:21             ` Yin Fengwei
2023-07-27  3:28               ` Yu Zhao
2023-07-28 16:14                 ` Yin, Fengwei
2023-07-21  9:40 ` [RFC PATCH v2 3/4] mm: add functions folio_in_range() and folio_within_vma() Yin Fengwei
2023-07-25  5:42   ` Yu Zhao
2023-07-21  9:40 ` Yin Fengwei [this message]
2023-07-25  5:26   ` [RFC PATCH v2 4/4] madvise: avoid trying to split large folio always in cold_pageout Yu Zhao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230721094043.2506691-5-fengwei.yin@intel.com \
    --to=fengwei.yin@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=david@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=minchan@kernel.org \
    --cc=ryan.roberts@arm.com \
    --cc=shy828301@gmail.com \
    --cc=willy@infradead.org \
    --cc=yuzhao@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox