From: Baolin Wang <baolin.wang@linux.alibaba.com>
To: akpm@linux-foundation.org, hughd@google.com
Cc: willy@infradead.org, david@redhat.com,
	wangkefeng.wang@huawei.com, 21cnbao@gmail.com,
	ryan.roberts@arm.com, ying.huang@intel.com, shy828301@gmail.com,
	ziy@nvidia.com, baolin.wang@linux.alibaba.com,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org
Subject: [RFC PATCH 4/5] mm: shmem: add mTHP support for anonymous share pages
Date: Mon, 22 Apr 2024 15:02:42 +0800
Message-ID: <8f2725a856dc02c7c89b4139cc3628686c9de524.1713755580.git.baolin.wang@linux.alibaba.com>
In-Reply-To: <cover.1713755580.git.baolin.wang@linux.alibaba.com>

Commit 19eaf44954df added multi-size THP (mTHP) support for anonymous
pages, allowing THP to be configured through the sysfs interface located
at '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled'.

However, anonymous shared pages ignore the anonymous mTHP rule configured
through the sysfs interface and can only use PMD-mapped THP, which is not
reasonable. Users expect the mTHP rule to apply to all anonymous pages,
including anonymous shared pages, in order to enjoy the benefits of mTHP:
for example, lower latency than PMD-mapped THP, smaller memory bloat than
PMD-mapped THP, and contiguous PTEs on the ARM architecture to reduce TLB
misses.

The primary strategy is as follows: the use of huge pages for anonymous
shared pages still follows the global control determined by the "huge="
mount option or the sysfs interface at
'/sys/kernel/mm/transparent_hugepage/shmem_enabled'. mTHP is allowed only
when the global 'huge' switch is enabled. The mTHP sysfs interface
(/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled) is then
checked to determine which mTHP sizes can be used for large folio
allocation for these anonymous shared pages.
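
Below is a minimal sketch of this two-level check (illustration only, not
part of the patch). It reuses the thp_vma_allowable_orders() and
thp_vma_suitable_orders() calls from this series, while
shmem_global_huge_enabled() is a hypothetical stand-in for the existing
global check driven by the "huge=" mount option or shmem_enabled:

/*
 * Illustration only: gate the per-size mTHP orders behind the global
 * shmem 'huge' control for an anonymous shmem VMA.
 * shmem_global_huge_enabled() is a hypothetical helper, not a real
 * kernel function.
 */
static unsigned long anon_shmem_orders_sketch(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long orders;

	/* Level 1: the global 'huge' switch must allow huge pages at all. */
	if (!shmem_global_huge_enabled(vma))
		return 0;

	/*
	 * Level 2: per-size sysfs knobs
	 * (/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled),
	 * restricted to orders that fit the faulting address in the VMA.
	 */
	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
					  BIT(PMD_ORDER + 1) - 1);
	return thp_vma_suitable_orders(vma, vmf->address, orders);
}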

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 include/linux/huge_mm.h |  2 +-
 mm/huge_memory.c        |  4 +-
 mm/shmem.c              | 92 +++++++++++++++++++++++++++++++----------
 3 files changed, 74 insertions(+), 24 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b67294d5814f..26b6fa98d8ac 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -246,7 +246,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 				       unsigned long orders)
 {
 	/* Optimization to check if required orders are enabled early. */
-	if (enforce_sysfs && vma_is_anonymous(vma)) {
+	if (enforce_sysfs && (vma_is_anonymous(vma) || vma_is_anon_shmem(vma))) {
 		unsigned long mask = READ_ONCE(huge_anon_orders_always);
 
 		if (vm_flags & VM_HUGEPAGE)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9a1b57ef9c60..9e52c0db7580 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -86,7 +86,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 					 unsigned long orders)
 {
 	/* Check the intersection of requested and supported orders. */
-	orders &= vma_is_anonymous(vma) ?
+	orders &= (vma_is_anonymous(vma) || vma_is_anon_shmem(vma)) ?
 			THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
 	if (!orders)
 		return 0;
@@ -152,7 +152,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 				     !enforce_sysfs, vma->vm_mm, vm_flags)
 			? orders : 0;
 
-	if (!vma_is_anonymous(vma)) {
+	if (!vma_is_anonymous(vma) && !vma_is_anon_shmem(vma)) {
 		/*
 		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
 		 * were already handled in thp_vma_allowable_orders().
diff --git a/mm/shmem.c b/mm/shmem.c
index b4afda71a3f0..8b009e7040b2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1603,6 +1603,39 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
 	return result;
 }
 
+static unsigned long anon_shmem_suitable_orders(struct vm_fault *vmf,
+					struct address_space *mapping, pgoff_t index)
+{
+	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+	unsigned long orders, pages;
+	int order;
+
+	/*
+	 * Get a list of all the (large) orders below PMD_ORDER + 1 that are enabled
+	 * for this vma. Then filter out the orders that can't be allocated over
+	 * the faulting address and still be fully contained in the vma.
+	 */
+	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
+					  BIT(PMD_ORDER + 1) - 1);
+	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+
+	if (!orders)
+		return orders;
+
+	/* Find the highest order that can add into the page cache */
+	order = highest_order(orders);
+	while (orders) {
+		pages = 1UL << order;
+		index = round_down(index, pages);
+		if (!xa_find(&mapping->i_pages, &index,
+			     index + pages - 1, XA_PRESENT))
+			break;
+		order = next_order(&orders, order);
+	}
+
+	return orders;
+}
+
 static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
 		struct shmem_inode_info *info, pgoff_t index, int order)
 {
@@ -1631,39 +1664,55 @@ static struct folio *shmem_alloc_folio(gfp_t gfp,
 	return (struct folio *)page;
 }
 
-static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
-		struct inode *inode, pgoff_t index,
+static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
+		gfp_t gfp, struct inode *inode, pgoff_t index,
 		struct mm_struct *fault_mm, bool huge)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
 	struct folio *folio;
 	long pages;
 	int error, order;
+	unsigned long orders;
 
 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 		huge = false;
 
 	if (huge) {
-		pages = HPAGE_PMD_NR;
-		order = HPAGE_PMD_ORDER;
-		index = round_down(index, HPAGE_PMD_NR);
+		if (vma && vma_is_anon_shmem(vma)) {
+			orders = anon_shmem_suitable_orders(vmf, mapping, index);
+			WARN_ON_ONCE(!orders);
+		} else {
+			pages = HPAGE_PMD_NR;
+			orders = BIT(HPAGE_PMD_ORDER);
+			index = round_down(index, HPAGE_PMD_NR);
 
-		/*
-		 * Check for conflict before waiting on a huge allocation.
-		 * Conflict might be that a huge page has just been allocated
-		 * and added to page cache by a racing thread, or that there
-		 * is already at least one small page in the huge extent.
-		 * Be careful to retry when appropriate, but not forever!
-		 * Elsewhere -EEXIST would be the right code, but not here.
-		 */
-		if (xa_find(&mapping->i_pages, &index,
+			/*
+			 * Check for conflict before waiting on a huge allocation.
+			 * Conflict might be that a huge page has just been allocated
+			 * and added to page cache by a racing thread, or that there
+			 * is already at least one small page in the huge extent.
+			 * Be careful to retry when appropriate, but not forever!
+			 * Elsewhere -EEXIST would be the right code, but not here.
+			 */
+			if (xa_find(&mapping->i_pages, &index,
 				index + HPAGE_PMD_NR - 1, XA_PRESENT))
-			return ERR_PTR(-E2BIG);
+				return ERR_PTR(-E2BIG);
+		}
 
-		folio = shmem_alloc_hugefolio(gfp, info, index, order);
-		if (!folio && pages == HPAGE_PMD_NR)
-			count_vm_event(THP_FILE_FALLBACK);
+		order = highest_order(orders);
+		while (orders) {
+			pages = 1 << order;
+			index = round_down(index, pages);
+			folio = shmem_alloc_hugefolio(gfp, info, index, order);
+			if (folio)
+				goto allocated;
+
+			if (pages == HPAGE_PMD_NR)
+				count_vm_event(THP_FILE_FALLBACK);
+			order = next_order(&orders, order);
+		}
 	} else {
 		pages = 1;
 		folio = shmem_alloc_folio(gfp, info, index);
@@ -1671,6 +1720,7 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
 	if (!folio)
 		return ERR_PTR(-ENOMEM);
 
+allocated:
 	__folio_set_locked(folio);
 	__folio_set_swapbacked(folio);
 
@@ -2043,7 +2093,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 
 		huge_gfp = vma_thp_gfp_mask(vma);
 		huge_gfp = limit_gfp_mask(huge_gfp, gfp);
-		folio = shmem_alloc_and_add_folio(huge_gfp,
+		folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
 				inode, index, fault_mm, true);
 		if (!IS_ERR(folio)) {
 			if (folio_test_pmd_mappable(folio))
@@ -2054,7 +2104,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 			goto repeat;
 	}
 
-	folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false);
+	folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, false);
 	if (IS_ERR(folio)) {
 		error = PTR_ERR(folio);
 		if (error == -EEXIST)
@@ -2065,7 +2115,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 
 alloced:
 	alloced = true;
-	if (folio_test_pmd_mappable(folio) &&
+	if (folio_test_large(folio) &&
 	    DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
 					folio_next_index(folio) - 1) {
 		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
-- 
2.39.3


