linux-mm.kvack.org archive mirror
* [PATCH v2 0/3] mTHP allocation stats for file-backed memory
@ 2024-07-16 13:59 Ryan Roberts
  2024-07-16 13:59 ` [PATCH v2 1/3] mm: Cleanup count_mthp_stat() definition Ryan Roberts
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Ryan Roberts @ 2024-07-16 13:59 UTC (permalink / raw)
  To: Andrew Morton, Hugh Dickins, Jonathan Corbet,
	Matthew Wilcox (Oracle),
	David Hildenbrand, Barry Song, Lance Yang, Baolin Wang,
	Gavin Shan
  Cc: Ryan Roberts, linux-kernel, linux-mm

Hi All,

With the mTHP shmem stat names cleaned up [2], we can now introduce real "file_"
stats for file-backed memory. This series does that. The stats are useful to
give visibility into how file-backed memory is being allocated. I'm planning to
build upon this with controls to restrict the folio sizes that can be allocated
for the pagecache (subject to test results that demonstrate the value).
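
As a quick illustration of how the new counters could be consumed (a minimal
userspace sketch, not part of the series; it assumes the sysfs paths added in
patch 3 and omits real error handling):

/* Dump every file_* counter under each hugepages-<size>kB directory. */
#include <glob.h>
#include <stdio.h>

int main(void)
{
	const char *pat = "/sys/kernel/mm/transparent_hugepage/"
			  "hugepages-*kB/stats/file_*";
	glob_t g;
	size_t i;

	if (glob(pat, 0, NULL, &g))
		return 1;

	for (i = 0; i < g.gl_pathc; i++) {
		FILE *f = fopen(g.gl_pathv[i], "r");
		unsigned long long val;

		if (!f)
			continue;
		if (fscanf(f, "%llu", &val) == 1)
			printf("%s: %llu\n", g.gl_pathv[i], val);
		fclose(f);
	}
	globfree(&g);
	return 0;
}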

---
This applies on top of today's mm-unstable (650b6752c8a3). All mm selftests have
been run; no regressions were observed.

Changes since v1 [1]
====================

  - Added patch 2 to tidy up shmem controls; now exposed for order-1, and not
    exposed for any unsupported high orders.
  - Simplified "stats" subdirectory management with sysfs_merge_group().
  - Added R-b/A-b to patch 1; thanks to David, Barry, Baolin, Lance

[1] https://lore.kernel.org/linux-mm/20240711072929.3590000-1-ryan.roberts@arm.com/
[2] https://lore.kernel.org/linux-mm/20240710095503.3193901-1-ryan.roberts@arm.com/

Thanks,
Ryan

Ryan Roberts (3):
  mm: Cleanup count_mthp_stat() definition
  mm: Tidy up shmem mTHP controls and stats
  mm: mTHP stats for pagecache folio allocations

 Documentation/admin-guide/mm/transhuge.rst |  13 +++
 include/linux/huge_mm.h                    |  73 +++++++------
 include/linux/pagemap.h                    |  16 ++-
 mm/filemap.c                               |   6 +-
 mm/huge_memory.c                           | 117 +++++++++++++++------
 mm/memory.c                                |   2 -
 mm/shmem.c                                 |   6 --
 7 files changed, 156 insertions(+), 77 deletions(-)

--
2.43.0



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v2 1/3] mm: Cleanup count_mthp_stat() definition
  2024-07-16 13:59 [PATCH v2 0/3] mTHP allocation stats for file-backed memory Ryan Roberts
@ 2024-07-16 13:59 ` Ryan Roberts
  2024-07-16 13:59 ` [PATCH v2 2/3] mm: Tidy up shmem mTHP controls and stats Ryan Roberts
  2024-07-16 13:59 ` [PATCH v2 3/3] mm: mTHP stats for pagecache folio allocations Ryan Roberts
  2 siblings, 0 replies; 12+ messages in thread
From: Ryan Roberts @ 2024-07-16 13:59 UTC (permalink / raw)
  To: Andrew Morton, Hugh Dickins, Jonathan Corbet,
	Matthew Wilcox (Oracle),
	David Hildenbrand, Barry Song, Lance Yang, Baolin Wang,
	Gavin Shan
  Cc: Ryan Roberts, linux-kernel, linux-mm

Let's move count_mthp_stat() so that it's always defined, even when THP
is disabled. Previously, uses of the function in files such as shmem.c,
which are compiled even when THP is disabled, required ugly THP
ifdeffery. With this cleanup, we can remove those ifdefs and the
function resolves to a nop when THP is disabled.

I shortly plan to call count_mthp_stat() from more THP-invariant source
files.
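
The shape of the change, with hypothetical names (illustration only, not the
kernel code itself):

#ifdef CONFIG_FOO
static inline void foo_count(int order)
{
	/* real accounting here */
}
#else
static inline void foo_count(int order)
{
	/* empty stub: the call compiles away when CONFIG_FOO=n */
}
#endif

/* Call sites built in either configuration need no #ifdef guard: */
static inline void some_path(int order)
{
	foo_count(order);
}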

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: Barry Song <baohua@kernel.org>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Lance Yang <ioworker0@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
---
 include/linux/huge_mm.h | 70 ++++++++++++++++++++---------------------
 mm/memory.c             |  2 --
 mm/shmem.c              |  6 ----
 3 files changed, 35 insertions(+), 43 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..b8c63c3e967f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -114,6 +114,41 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
 #define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
 #define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
 
+enum mthp_stat_item {
+	MTHP_STAT_ANON_FAULT_ALLOC,
+	MTHP_STAT_ANON_FAULT_FALLBACK,
+	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+	MTHP_STAT_SWPOUT,
+	MTHP_STAT_SWPOUT_FALLBACK,
+	MTHP_STAT_SHMEM_ALLOC,
+	MTHP_STAT_SHMEM_FALLBACK,
+	MTHP_STAT_SHMEM_FALLBACK_CHARGE,
+	MTHP_STAT_SPLIT,
+	MTHP_STAT_SPLIT_FAILED,
+	MTHP_STAT_SPLIT_DEFERRED,
+	__MTHP_STAT_COUNT
+};
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
+struct mthp_stat {
+	unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
+};
+
+DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
+
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+	if (order <= 0 || order > PMD_ORDER)
+		return;
+
+	this_cpu_inc(mthp_stats.stats[order][item]);
+}
+#else
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+}
+#endif
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
 extern unsigned long transparent_hugepage_flags;
@@ -269,41 +304,6 @@ struct thpsize {
 
 #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)
 
-enum mthp_stat_item {
-	MTHP_STAT_ANON_FAULT_ALLOC,
-	MTHP_STAT_ANON_FAULT_FALLBACK,
-	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
-	MTHP_STAT_SWPOUT,
-	MTHP_STAT_SWPOUT_FALLBACK,
-	MTHP_STAT_SHMEM_ALLOC,
-	MTHP_STAT_SHMEM_FALLBACK,
-	MTHP_STAT_SHMEM_FALLBACK_CHARGE,
-	MTHP_STAT_SPLIT,
-	MTHP_STAT_SPLIT_FAILED,
-	MTHP_STAT_SPLIT_DEFERRED,
-	__MTHP_STAT_COUNT
-};
-
-struct mthp_stat {
-	unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
-};
-
-#ifdef CONFIG_SYSFS
-DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
-
-static inline void count_mthp_stat(int order, enum mthp_stat_item item)
-{
-	if (order <= 0 || order > PMD_ORDER)
-		return;
-
-	this_cpu_inc(mthp_stats.stats[order][item]);
-}
-#else
-static inline void count_mthp_stat(int order, enum mthp_stat_item item)
-{
-}
-#endif
-
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
 	 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
diff --git a/mm/memory.c b/mm/memory.c
index 802d0d8a40f9..a50fdefb8f0b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4597,9 +4597,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
 	folio_ref_add(folio, nr_pages - 1);
 	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
-#endif
 	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
 	folio_add_lru_vma(folio, vma);
 setpte:
diff --git a/mm/shmem.c b/mm/shmem.c
index f24dfbd387ba..fce1343f44e6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1776,9 +1776,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
 
 			if (pages == HPAGE_PMD_NR)
 				count_vm_event(THP_FILE_FALLBACK);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 			count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK);
-#endif
 			order = next_order(&suitable_orders, order);
 		}
 	} else {
@@ -1803,10 +1801,8 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
 				count_vm_event(THP_FILE_FALLBACK);
 				count_vm_event(THP_FILE_FALLBACK_CHARGE);
 			}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK);
 			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE);
-#endif
 		}
 		goto unlock;
 	}
@@ -2180,9 +2176,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 		if (!IS_ERR(folio)) {
 			if (folio_test_pmd_mappable(folio))
 				count_vm_event(THP_FILE_ALLOC);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC);
-#endif
 			goto alloced;
 		}
 		if (PTR_ERR(folio) == -EEXIST)
-- 
2.43.0



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v2 2/3] mm: Tidy up shmem mTHP controls and stats
  2024-07-16 13:59 [PATCH v2 0/3] mTHP allocation stats for file-backed memory Ryan Roberts
  2024-07-16 13:59 ` [PATCH v2 1/3] mm: Cleanup count_mthp_stat() definition Ryan Roberts
@ 2024-07-16 13:59 ` Ryan Roberts
  2024-07-22  6:14   ` Baolin Wang
  2024-07-16 13:59 ` [PATCH v2 3/3] mm: mTHP stats for pagecache folio allocations Ryan Roberts
  2 siblings, 1 reply; 12+ messages in thread
From: Ryan Roberts @ 2024-07-16 13:59 UTC (permalink / raw)
  To: Andrew Morton, Hugh Dickins, Jonathan Corbet,
	Matthew Wilcox (Oracle),
	David Hildenbrand, Barry Song, Lance Yang, Baolin Wang,
	Gavin Shan
  Cc: Ryan Roberts, linux-kernel, linux-mm

Previously we had a situation where shmem mTHP controls and stats were
not exposed for some supported sizes and were exposed for some
unsupported sizes. So let's clean that up.

Anon mTHP can support all large orders [2, PMD_ORDER], but shmem can
support all large orders [1, MAX_PAGECACHE_ORDER]. However, per-size
shmem controls and stats were previously being exposed for all the anon
mTHP orders, meaning order-1 was not present, and for arm64 64K base
pages, orders 12 and 13 were exposed but were not supported internally.

Tidy this all up by defining ctrl and stats attribute groups for anon
and file separately. Anon ctrl and stats groups are populated for all
orders in THP_ORDERS_ALL_ANON and file ctrl and stats groups are
populated for all orders in THP_ORDERS_ALL_FILE_DEFAULT.

The side-effect of all this is that different hugepage-*kB directories
contain different sets of controls and stats, depending on which memory
types support that size. This approach is preferred over the
alternative, which is to populate dummy controls and stats for memory
types that do not support a given size.

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 mm/huge_memory.c | 110 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 80 insertions(+), 30 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f4be468e06a4..578ac212c172 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -463,8 +463,8 @@ static void thpsize_release(struct kobject *kobj);
 static DEFINE_SPINLOCK(huge_anon_orders_lock);
 static LIST_HEAD(thpsize_list);
 
-static ssize_t thpsize_enabled_show(struct kobject *kobj,
-				    struct kobj_attribute *attr, char *buf)
+static ssize_t anon_enabled_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
 {
 	int order = to_thpsize(kobj)->order;
 	const char *output;
@@ -481,9 +481,9 @@ static ssize_t thpsize_enabled_show(struct kobject *kobj,
 	return sysfs_emit(buf, "%s\n", output);
 }
 
-static ssize_t thpsize_enabled_store(struct kobject *kobj,
-				     struct kobj_attribute *attr,
-				     const char *buf, size_t count)
+static ssize_t anon_enabled_store(struct kobject *kobj,
+				  struct kobj_attribute *attr,
+				  const char *buf, size_t count)
 {
 	int order = to_thpsize(kobj)->order;
 	ssize_t ret = count;
@@ -525,19 +525,27 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
 	return ret;
 }
 
-static struct kobj_attribute thpsize_enabled_attr =
-	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
+static struct kobj_attribute anon_enabled_attr =
+	__ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);
 
-static struct attribute *thpsize_attrs[] = {
-	&thpsize_enabled_attr.attr,
+static struct attribute *anon_ctrl_attrs[] = {
+	&anon_enabled_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group anon_ctrl_attr_grp = {
+	.attrs = anon_ctrl_attrs,
+};
+
+static struct attribute *file_ctrl_attrs[] = {
 #ifdef CONFIG_SHMEM
 	&thpsize_shmem_enabled_attr.attr,
 #endif
 	NULL,
 };
 
-static const struct attribute_group thpsize_attr_group = {
-	.attrs = thpsize_attrs,
+static const struct attribute_group file_ctrl_attr_grp = {
+	.attrs = file_ctrl_attrs,
 };
 
 static const struct kobj_type thpsize_ktype = {
@@ -583,57 +591,99 @@ DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
 DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
 
-static struct attribute *stats_attrs[] = {
+static struct attribute *anon_stats_attrs[] = {
 	&anon_fault_alloc_attr.attr,
 	&anon_fault_fallback_attr.attr,
 	&anon_fault_fallback_charge_attr.attr,
 	&swpout_attr.attr,
 	&swpout_fallback_attr.attr,
-	&shmem_alloc_attr.attr,
-	&shmem_fallback_attr.attr,
-	&shmem_fallback_charge_attr.attr,
 	&split_attr.attr,
 	&split_failed_attr.attr,
 	&split_deferred_attr.attr,
 	NULL,
 };
 
-static struct attribute_group stats_attr_group = {
+static struct attribute_group anon_stats_attr_grp = {
+	.name = "stats",
+	.attrs = anon_stats_attrs,
+};
+
+static struct attribute *file_stats_attrs[] = {
+#ifdef CONFIG_SHMEM
+	&shmem_alloc_attr.attr,
+	&shmem_fallback_attr.attr,
+	&shmem_fallback_charge_attr.attr,
+#endif
+	NULL,
+};
+
+static struct attribute_group file_stats_attr_grp = {
 	.name = "stats",
-	.attrs = stats_attrs,
+	.attrs = file_stats_attrs,
 };
 
+static int sysfs_add_group(struct kobject *kobj,
+			   const struct attribute_group *grp)
+{
+	int ret = -ENOENT;
+
+	/*
+	 * If the group is named, try to merge first, assuming the subdirectory
+	 * was already created. This avoids the warning emitted by
+	 * sysfs_create_group() if the directory already exists.
+	 */
+	if (grp->name)
+		ret = sysfs_merge_group(kobj, grp);
+	if (ret)
+		ret = sysfs_create_group(kobj, grp);
+
+	return ret;
+}
+
 static struct thpsize *thpsize_create(int order, struct kobject *parent)
 {
 	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
 	struct thpsize *thpsize;
-	int ret;
+	int ret = -ENOMEM;
 
 	thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
 	if (!thpsize)
-		return ERR_PTR(-ENOMEM);
+		goto err;
+
+	thpsize->order = order;
 
 	ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
 				   "hugepages-%lukB", size);
 	if (ret) {
 		kfree(thpsize);
-		return ERR_PTR(ret);
+		goto err;
 	}
 
-	ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
-	if (ret) {
-		kobject_put(&thpsize->kobj);
-		return ERR_PTR(ret);
+	if (BIT(order) & THP_ORDERS_ALL_ANON) {
+		ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp);
+		if (ret)
+			goto err_put;
+
+		ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp);
+		if (ret)
+			goto err_put;
 	}
 
-	ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
-	if (ret) {
-		kobject_put(&thpsize->kobj);
-		return ERR_PTR(ret);
+	if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) {
+		ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp);
+		if (ret)
+			goto err_put;
+
+		ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp);
+		if (ret)
+			goto err_put;
 	}
 
-	thpsize->order = order;
 	return thpsize;
+err_put:
+	kobject_put(&thpsize->kobj);
+err:
+	return ERR_PTR(ret);
 }
 
 static void thpsize_release(struct kobject *kobj)
@@ -673,7 +723,7 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 		goto remove_hp_group;
 	}
 
-	orders = THP_ORDERS_ALL_ANON;
+	orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
 	order = highest_order(orders);
 	while (orders) {
 		thpsize = thpsize_create(order, *hugepage_kobj);
-- 
2.43.0



^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v2 3/3] mm: mTHP stats for pagecache folio allocations
  2024-07-16 13:59 [PATCH v2 0/3] mTHP allocation stats for file-backed memory Ryan Roberts
  2024-07-16 13:59 ` [PATCH v2 1/3] mm: Cleanup count_mthp_stat() definition Ryan Roberts
  2024-07-16 13:59 ` [PATCH v2 2/3] mm: Tidy up shmem mTHP controls and stats Ryan Roberts
@ 2024-07-16 13:59 ` Ryan Roberts
  2024-07-19  0:12   ` Barry Song
  2 siblings, 1 reply; 12+ messages in thread
From: Ryan Roberts @ 2024-07-16 13:59 UTC (permalink / raw)
  To: Andrew Morton, Hugh Dickins, Jonathan Corbet,
	Matthew Wilcox (Oracle),
	David Hildenbrand, Barry Song, Lance Yang, Baolin Wang,
	Gavin Shan
  Cc: Ryan Roberts, linux-kernel, linux-mm

Expose 3 new mTHP stats for file (pagecache) folio allocations:

  /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_alloc
  /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_fallback
  /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_fallback_charge

This will provide some insight into the sizes of large folios being
allocated for file-backed memory, and how often allocation is failing.

All non-order-0 (and most order-0) folio allocations are currently done
through filemap_alloc_folio(), and folios are charged in a subsequent
call to filemap_add_folio(). So count file_fallback when allocation
fails in filemap_alloc_folio() and count file_alloc or
file_fallback_charge in filemap_add_folio(), based on whether charging
succeeded or not. There are some users of filemap_add_folio() that
allocate their own order-0 folio by other means, so we would not count
an allocation failure in this case, but we also don't care about order-0
allocations. This approach feels like it should be good enough and
doesn't require any (impractically large) refactoring.

The existing mTHP stats interface is reused to provide consistency to
users. And because we are reusing the same interface, we can reuse the
same infrastructure on the kernel side.
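
For reference, a hypothetical caller showing where each counter fires with
this patch applied (sketch only; the helper and its error handling are made
up, the counting points are the ones described above):

/* Hypothetical helper: allocate and insert a pagecache folio of @order. */
static struct folio *grab_pagecache_folio(struct address_space *mapping,
					  pgoff_t index, gfp_t gfp,
					  unsigned int order)
{
	struct folio *folio;
	int err;

	/* counts MTHP_STAT_FILE_FALLBACK internally if allocation fails */
	folio = filemap_alloc_folio(gfp, order);
	if (!folio)
		return NULL;

	/*
	 * counts MTHP_STAT_FILE_ALLOC when the memcg charge succeeds, or
	 * MTHP_STAT_FILE_FALLBACK_CHARGE when the charge fails
	 */
	err = filemap_add_folio(mapping, folio, index, gfp);
	if (err) {
		folio_put(folio);
		return NULL;
	}
	return folio;
}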

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 13 +++++++++++++
 include/linux/huge_mm.h                    |  3 +++
 include/linux/pagemap.h                    | 16 ++++++++++++++--
 mm/filemap.c                               |  6 ++++--
 mm/huge_memory.c                           |  7 +++++++
 5 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 058485daf186..d4857e457add 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -512,6 +512,19 @@ shmem_fallback_charge
 	falls back to using small pages even though the allocation was
 	successful.
 
+file_alloc
+	is incremented every time a file huge page is successfully
+	allocated.
+
+file_fallback
+	is incremented if a file huge page is attempted to be allocated
+	but fails and instead falls back to using small pages.
+
+file_fallback_charge
+	is incremented if a file huge page cannot be charged and instead
+	falls back to using small pages even though the allocation was
+	successful.
+
 split
 	is incremented every time a huge page is successfully split into
 	smaller orders. This can happen for a variety of reasons but a
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b8c63c3e967f..4f9109fcdded 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -123,6 +123,9 @@ enum mthp_stat_item {
 	MTHP_STAT_SHMEM_ALLOC,
 	MTHP_STAT_SHMEM_FALLBACK,
 	MTHP_STAT_SHMEM_FALLBACK_CHARGE,
+	MTHP_STAT_FILE_ALLOC,
+	MTHP_STAT_FILE_FALLBACK,
+	MTHP_STAT_FILE_FALLBACK_CHARGE,
 	MTHP_STAT_SPLIT,
 	MTHP_STAT_SPLIT_FAILED,
 	MTHP_STAT_SPLIT_DEFERRED,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 6e2f72d03176..95a147b5d117 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -562,14 +562,26 @@ static inline void *detach_page_private(struct page *page)
 }
 
 #ifdef CONFIG_NUMA
-struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
+struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
 #else
-static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
+static inline struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
 {
 	return folio_alloc_noprof(gfp, order);
 }
 #endif
 
+static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
+{
+	struct folio *folio;
+
+	folio = __filemap_alloc_folio_noprof(gfp, order);
+
+	if (!folio)
+		count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK);
+
+	return folio;
+}
+
 #define filemap_alloc_folio(...)				\
 	alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__))
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 53d5d0410b51..131d514fca29 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -963,6 +963,8 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
 	int ret;
 
 	ret = mem_cgroup_charge(folio, NULL, gfp);
+	count_mthp_stat(folio_order(folio),
+		ret ? MTHP_STAT_FILE_FALLBACK_CHARGE : MTHP_STAT_FILE_ALLOC);
 	if (ret)
 		return ret;
 
@@ -990,7 +992,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
 EXPORT_SYMBOL_GPL(filemap_add_folio);
 
 #ifdef CONFIG_NUMA
-struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
+struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
 {
 	int n;
 	struct folio *folio;
@@ -1007,7 +1009,7 @@ struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
 	}
 	return folio_alloc_noprof(gfp, order);
 }
-EXPORT_SYMBOL(filemap_alloc_folio_noprof);
+EXPORT_SYMBOL(__filemap_alloc_folio_noprof);
 #endif
 
 /*
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 578ac212c172..26d558e3e80f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -608,7 +608,14 @@ static struct attribute_group anon_stats_attr_grp = {
 	.attrs = anon_stats_attrs,
 };
 
+DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC);
+DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK);
+DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE);
+
 static struct attribute *file_stats_attrs[] = {
+	&file_alloc_attr.attr,
+	&file_fallback_attr.attr,
+	&file_fallback_charge_attr.attr,
 #ifdef CONFIG_SHMEM
 	&shmem_alloc_attr.attr,
 	&shmem_fallback_attr.attr,
-- 
2.43.0



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 3/3] mm: mTHP stats for pagecache folio allocations
  2024-07-16 13:59 ` [PATCH v2 3/3] mm: mTHP stats for pagecache folio allocations Ryan Roberts
@ 2024-07-19  0:12   ` Barry Song
  2024-07-19  8:56     ` Ryan Roberts
  0 siblings, 1 reply; 12+ messages in thread
From: Barry Song @ 2024-07-19  0:12 UTC (permalink / raw)
  To: Ryan Roberts
  Cc: Andrew Morton, Hugh Dickins, Jonathan Corbet,
	Matthew Wilcox (Oracle),
	David Hildenbrand, Lance Yang, Baolin Wang, Gavin Shan,
	linux-kernel, linux-mm

On Wed, Jul 17, 2024 at 1:59 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> Expose 3 new mTHP stats for file (pagecache) folio allocations:
>
>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_alloc
>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_fallback
>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_fallback_charge
>
> This will provide some insight on the sizes of large folios being
> allocated for file-backed memory, and how often allocation is failing.
>
> All non-order-0 (and most order-0) folio allocations are currently done
> through filemap_alloc_folio(), and folios are charged in a subsequent
> call to filemap_add_folio(). So count file_fallback when allocation
> fails in filemap_alloc_folio() and count file_alloc or
> file_fallback_charge in filemap_add_folio(), based on whether charging
> succeeded or not. There are some users of filemap_add_folio() that
> allocate their own order-0 folio by other means, so we would not count
> an allocation failure in this case, but we also don't care about order-0
> allocations. This approach feels like it should be good enough and
> doesn't require any (impractically large) refactoring.
>
> The existing mTHP stats interface is reused to provide consistency to
> users. And because we are reusing the same interface, we can reuse the
> same infrastructure on the kernel side.
>
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
>  Documentation/admin-guide/mm/transhuge.rst | 13 +++++++++++++
>  include/linux/huge_mm.h                    |  3 +++
>  include/linux/pagemap.h                    | 16 ++++++++++++++--
>  mm/filemap.c                               |  6 ++++--
>  mm/huge_memory.c                           |  7 +++++++
>  5 files changed, 41 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 058485daf186..d4857e457add 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -512,6 +512,19 @@ shmem_fallback_charge
>         falls back to using small pages even though the allocation was
>         successful.
>
> +file_alloc
> +       is incremented every time a file huge page is successfully
> +       allocated.
> +
> +file_fallback
> +       is incremented if a file huge page is attempted to be allocated
> +       but fails and instead falls back to using small pages.
> +
> +file_fallback_charge
> +       is incremented if a file huge page cannot be charged and instead
> +       falls back to using small pages even though the allocation was
> +       successful.
> +

I realized that when we talk about fallback, it doesn't necessarily mean
small pages; it could also refer to smaller huge pages.

anon_fault_alloc
        is incremented every time a huge page is successfully
        allocated and charged to handle a page fault.

anon_fault_fallback
        is incremented if a page fault fails to allocate or charge
        a huge page and instead falls back to using huge pages with
        lower orders or small pages.

anon_fault_fallback_charge
        is incremented if a page fault fails to charge a huge page and
        instead falls back to using huge pages with lower orders or
        small pages even though the allocation was successful.

This also applies to files, right?

                do {
                        gfp_t alloc_gfp = gfp;

                        err = -ENOMEM;
                        if (order > 0)
                                alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
                        folio = filemap_alloc_folio(alloc_gfp, order);
                        if (!folio)
                                continue;

                        /* Init accessed so avoid atomic mark_page_accessed later */
                        if (fgp_flags & FGP_ACCESSED)
                                __folio_set_referenced(folio);

                        err = filemap_add_folio(mapping, folio, index, gfp);
                        if (!err)
                                break;
                        folio_put(folio);
                        folio = NULL;
                } while (order-- > 0);


>  split
>         is incremented every time a huge page is successfully split into
>         smaller orders. This can happen for a variety of reasons but a
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index b8c63c3e967f..4f9109fcdded 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -123,6 +123,9 @@ enum mthp_stat_item {
>         MTHP_STAT_SHMEM_ALLOC,
>         MTHP_STAT_SHMEM_FALLBACK,
>         MTHP_STAT_SHMEM_FALLBACK_CHARGE,
> +       MTHP_STAT_FILE_ALLOC,
> +       MTHP_STAT_FILE_FALLBACK,
> +       MTHP_STAT_FILE_FALLBACK_CHARGE,
>         MTHP_STAT_SPLIT,
>         MTHP_STAT_SPLIT_FAILED,
>         MTHP_STAT_SPLIT_DEFERRED,
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 6e2f72d03176..95a147b5d117 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -562,14 +562,26 @@ static inline void *detach_page_private(struct page *page)
>  }
>
>  #ifdef CONFIG_NUMA
> -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
> +struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
>  #else
> -static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> +static inline struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>  {
>         return folio_alloc_noprof(gfp, order);
>  }
>  #endif
>
> +static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> +{
> +       struct folio *folio;
> +
> +       folio = __filemap_alloc_folio_noprof(gfp, order);
> +
> +       if (!folio)
> +               count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK);
> +
> +       return folio;
> +}

Do we need to add and export __filemap_alloc_folio_noprof()? In any case,
we won't call count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK) and
will only allocate the folio instead?

> +
>  #define filemap_alloc_folio(...)                               \
>         alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__))
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 53d5d0410b51..131d514fca29 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -963,6 +963,8 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
>         int ret;
>
>         ret = mem_cgroup_charge(folio, NULL, gfp);
> +       count_mthp_stat(folio_order(folio),
> +               ret ? MTHP_STAT_FILE_FALLBACK_CHARGE : MTHP_STAT_FILE_ALLOC);
>         if (ret)
>                 return ret;

Would the following be better?

        ret = mem_cgroup_charge(folio, NULL, gfp);
        if (ret) {
                count_mthp_stat(folio_order(folio),
                                MTHP_STAT_FILE_FALLBACK_CHARGE);
                return ret;
        }
        count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_ALLOC);

Anyway, it's up to you. The code just feels a bit off to me :-)

>
> @@ -990,7 +992,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
>  EXPORT_SYMBOL_GPL(filemap_add_folio);
>
>  #ifdef CONFIG_NUMA
> -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> +struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>  {
>         int n;
>         struct folio *folio;
> @@ -1007,7 +1009,7 @@ struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>         }
>         return folio_alloc_noprof(gfp, order);
>  }
> -EXPORT_SYMBOL(filemap_alloc_folio_noprof);
> +EXPORT_SYMBOL(__filemap_alloc_folio_noprof);
>  #endif
>
>  /*
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 578ac212c172..26d558e3e80f 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -608,7 +608,14 @@ static struct attribute_group anon_stats_attr_grp = {
>         .attrs = anon_stats_attrs,
>  };
>
> +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC);
> +DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK);
> +DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE);
> +
>  static struct attribute *file_stats_attrs[] = {
> +       &file_alloc_attr.attr,
> +       &file_fallback_attr.attr,
> +       &file_fallback_charge_attr.attr,
>  #ifdef CONFIG_SHMEM
>         &shmem_alloc_attr.attr,
>         &shmem_fallback_attr.attr,
> --
> 2.43.0
>

Thanks
Barry


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 3/3] mm: mTHP stats for pagecache folio allocations
  2024-07-19  0:12   ` Barry Song
@ 2024-07-19  8:56     ` Ryan Roberts
  2024-07-23 22:07       ` Barry Song
  0 siblings, 1 reply; 12+ messages in thread
From: Ryan Roberts @ 2024-07-19  8:56 UTC (permalink / raw)
  To: Barry Song
  Cc: Andrew Morton, Hugh Dickins, Jonathan Corbet,
	Matthew Wilcox (Oracle),
	David Hildenbrand, Lance Yang, Baolin Wang, Gavin Shan,
	linux-kernel, linux-mm

On 19/07/2024 01:12, Barry Song wrote:
> On Wed, Jul 17, 2024 at 1:59 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> Expose 3 new mTHP stats for file (pagecache) folio allocations:
>>
>>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_alloc
>>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_fallback
>>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_fallback_charge
>>
>> This will provide some insight on the sizes of large folios being
>> allocated for file-backed memory, and how often allocation is failing.
>>
>> All non-order-0 (and most order-0) folio allocations are currently done
>> through filemap_alloc_folio(), and folios are charged in a subsequent
>> call to filemap_add_folio(). So count file_fallback when allocation
>> fails in filemap_alloc_folio() and count file_alloc or
>> file_fallback_charge in filemap_add_folio(), based on whether charging
>> succeeded or not. There are some users of filemap_add_folio() that
>> allocate their own order-0 folio by other means, so we would not count
>> an allocation failure in this case, but we also don't care about order-0
>> allocations. This approach feels like it should be good enough and
>> doesn't require any (impractically large) refactoring.
>>
>> The existing mTHP stats interface is reused to provide consistency to
>> users. And because we are reusing the same interface, we can reuse the
>> same infrastructure on the kernel side.
>>
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>>  Documentation/admin-guide/mm/transhuge.rst | 13 +++++++++++++
>>  include/linux/huge_mm.h                    |  3 +++
>>  include/linux/pagemap.h                    | 16 ++++++++++++++--
>>  mm/filemap.c                               |  6 ++++--
>>  mm/huge_memory.c                           |  7 +++++++
>>  5 files changed, 41 insertions(+), 4 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>> index 058485daf186..d4857e457add 100644
>> --- a/Documentation/admin-guide/mm/transhuge.rst
>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>> @@ -512,6 +512,19 @@ shmem_fallback_charge
>>         falls back to using small pages even though the allocation was
>>         successful.
>>
>> +file_alloc
>> +       is incremented every time a file huge page is successfully
>> +       allocated.
>> +
>> +file_fallback
>> +       is incremented if a file huge page is attempted to be allocated
>> +       but fails and instead falls back to using small pages.
>> +
>> +file_fallback_charge
>> +       is incremented if a file huge page cannot be charged and instead
>> +       falls back to using small pages even though the allocation was
>> +       successful.
>> +
> 
> I realized that when we talk about fallback, it doesn't necessarily mean
> small pages; it could also refer to smaller huge pages.

Yes, good point. I'll update the documentation to reflect that for all memory types.

> 
> anon_fault_alloc
>         is incremented every time a huge page is successfully
>         allocated and charged to handle a page fault.
> 
> anon_fault_fallback
>         is incremented if a page fault fails to allocate or charge
>         a huge page and instead falls back to using huge pages with
>         lower orders or small pages.
> 
> anon_fault_fallback_charge
>         is incremented if a page fault fails to charge a huge page and
>         instead falls back to using huge pages with lower orders or
>         small pages even though the allocation was successful.
> 
> This also applies to files, right?

It does in the place you highlight below, but page_cache_ra_order() just falls
back immediately to order-0. Regardless, I think we should word the
user-facing docs to allow for falling back to a lower large order. That gives
the implementation more flexibility.

> 
>                 do {
>                         gfp_t alloc_gfp = gfp;
> 
>                         err = -ENOMEM;
>                         if (order > 0)
>                                 alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
>                         folio = filemap_alloc_folio(alloc_gfp, order);
>                         if (!folio)
>                                 continue;
> 
>                         /* Init accessed so avoid atomic mark_page_accessed later */
>                         if (fgp_flags & FGP_ACCESSED)
>                                 __folio_set_referenced(folio);
> 
>                         err = filemap_add_folio(mapping, folio, index, gfp);
>                         if (!err)
>                                 break;
>                         folio_put(folio);
>                         folio = NULL;
>                 } while (order-- > 0);
> 
> 
>>  split
>>         is incremented every time a huge page is successfully split into
>>         smaller orders. This can happen for a variety of reasons but a
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index b8c63c3e967f..4f9109fcdded 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -123,6 +123,9 @@ enum mthp_stat_item {
>>         MTHP_STAT_SHMEM_ALLOC,
>>         MTHP_STAT_SHMEM_FALLBACK,
>>         MTHP_STAT_SHMEM_FALLBACK_CHARGE,
>> +       MTHP_STAT_FILE_ALLOC,
>> +       MTHP_STAT_FILE_FALLBACK,
>> +       MTHP_STAT_FILE_FALLBACK_CHARGE,
>>         MTHP_STAT_SPLIT,
>>         MTHP_STAT_SPLIT_FAILED,
>>         MTHP_STAT_SPLIT_DEFERRED,
>> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
>> index 6e2f72d03176..95a147b5d117 100644
>> --- a/include/linux/pagemap.h
>> +++ b/include/linux/pagemap.h
>> @@ -562,14 +562,26 @@ static inline void *detach_page_private(struct page *page)
>>  }
>>
>>  #ifdef CONFIG_NUMA
>> -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
>> +struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
>>  #else
>> -static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>> +static inline struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>>  {
>>         return folio_alloc_noprof(gfp, order);
>>  }
>>  #endif
>>
>> +static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>> +{
>> +       struct folio *folio;
>> +
>> +       folio = __filemap_alloc_folio_noprof(gfp, order);
>> +
>> +       if (!folio)
>> +               count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK);
>> +
>> +       return folio;
>> +}
> 
> Do we need to add and export __filemap_alloc_folio_noprof()? 

It is exported. See the below change in filemap.c. Previously
filemap_alloc_folio_noprof() was exported, but that is now an inline and
__filemap_alloc_folio_noprof() is exported in its place.

> In any case,
> we won't call count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK) and
> will only allocate the folio instead?

Sorry I don't understand what you mean by this?

> 
>> +
>>  #define filemap_alloc_folio(...)                               \
>>         alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__))
>>
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index 53d5d0410b51..131d514fca29 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -963,6 +963,8 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
>>         int ret;
>>
>>         ret = mem_cgroup_charge(folio, NULL, gfp);
>> +       count_mthp_stat(folio_order(folio),
>> +               ret ? MTHP_STAT_FILE_FALLBACK_CHARGE : MTHP_STAT_FILE_ALLOC);
>>         if (ret)
>>                 return ret;
> 
> Would the following be better?
> 
>         ret = mem_cgroup_charge(folio, NULL, gfp);
>         if (ret) {
>                 count_mthp_stat(folio_order(folio),
>                                 MTHP_STAT_FILE_FALLBACK_CHARGE);
>                 return ret;
>         }
>         count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_ALLOC);
> 
> Anyway, it's up to you. The code just feels a bit off to me :-)

Yes, I agree your version is better. I also noticed that anon and shmem
increment FALLBACK whenever FALLBACK_CHARGE is incremented, so I should apply
the same semantics here.
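
Something like this, perhaps (a sketch of the direction for the next version,
not the final change):

	ret = mem_cgroup_charge(folio, NULL, gfp);
	if (ret) {
		count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_FALLBACK);
		count_mthp_stat(folio_order(folio),
				MTHP_STAT_FILE_FALLBACK_CHARGE);
		return ret;
	}
	count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_ALLOC);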

Thanks,
Ryan


> 
>>
>> @@ -990,7 +992,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
>>  EXPORT_SYMBOL_GPL(filemap_add_folio);
>>
>>  #ifdef CONFIG_NUMA
>> -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>> +struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>>  {
>>         int n;
>>         struct folio *folio;
>> @@ -1007,7 +1009,7 @@ struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>>         }
>>         return folio_alloc_noprof(gfp, order);
>>  }
>> -EXPORT_SYMBOL(filemap_alloc_folio_noprof);
>> +EXPORT_SYMBOL(__filemap_alloc_folio_noprof);
>>  #endif
>>
>>  /*
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 578ac212c172..26d558e3e80f 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -608,7 +608,14 @@ static struct attribute_group anon_stats_attr_grp = {
>>         .attrs = anon_stats_attrs,
>>  };
>>
>> +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC);
>> +DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK);
>> +DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE);
>> +
>>  static struct attribute *file_stats_attrs[] = {
>> +       &file_alloc_attr.attr,
>> +       &file_fallback_attr.attr,
>> +       &file_fallback_charge_attr.attr,
>>  #ifdef CONFIG_SHMEM
>>         &shmem_alloc_attr.attr,
>>         &shmem_fallback_attr.attr,
>> --
>> 2.43.0
>>
> 
> Thanks
> Barry



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 2/3] mm: Tidy up shmem mTHP controls and stats
  2024-07-16 13:59 ` [PATCH v2 2/3] mm: Tidy up shmem mTHP controls and stats Ryan Roberts
@ 2024-07-22  6:14   ` Baolin Wang
  2024-07-22  7:33     ` Ryan Roberts
  0 siblings, 1 reply; 12+ messages in thread
From: Baolin Wang @ 2024-07-22  6:14 UTC (permalink / raw)
  To: Ryan Roberts, Andrew Morton, Hugh Dickins, Jonathan Corbet,
	Matthew Wilcox (Oracle),
	David Hildenbrand, Barry Song, Lance Yang, Gavin Shan
  Cc: linux-kernel, linux-mm



On 2024/7/16 21:59, Ryan Roberts wrote:
> Previously we had a situation where shmem mTHP controls and stats were
> not exposed for some supported sizes and were exposed for some
> unsupported sizes. So let's clean that up.
> 
> Anon mTHP can support all large orders (2, PMD_ORDER). But shmem can
> support all large orders (1, MAX_PAGECACHE_ORDER). However, per-size
> shmem controls and stats were previously being exposed for all the anon
> mTHP orders, meaning order-1 was not present, and for arm64 64K base
> pages, orders 12 and 13 were exposed but were not supported internally.
> 
> Tidy this all up by defining ctrl and stats attribute groups for anon
> and file separately. Anon ctrl and stats groups are populated for all
> orders in THP_ORDERS_ALL_ANON and file ctrl and stats groups are
> populated for all orders in THP_ORDERS_ALL_FILE_DEFAULT.

Make sense.

> 
> The side-effect of all this is that different hugepage-*kB directories
> contain different sets of controls and stats, depending on which memory
> types support that size. This approach is preferred over the
> alternative, which is to populate dummy controls and stats for memory
> types that do not support a given size.

OK.

> 
> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> ---
>   mm/huge_memory.c | 110 ++++++++++++++++++++++++++++++++++-------------
>   1 file changed, 80 insertions(+), 30 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f4be468e06a4..578ac212c172 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -463,8 +463,8 @@ static void thpsize_release(struct kobject *kobj);
>   static DEFINE_SPINLOCK(huge_anon_orders_lock);
>   static LIST_HEAD(thpsize_list);
>   
> -static ssize_t thpsize_enabled_show(struct kobject *kobj,
> -				    struct kobj_attribute *attr, char *buf)
> +static ssize_t anon_enabled_show(struct kobject *kobj,
> +				 struct kobj_attribute *attr, char *buf)
>   {
>   	int order = to_thpsize(kobj)->order;
>   	const char *output;
> @@ -481,9 +481,9 @@ static ssize_t thpsize_enabled_show(struct kobject *kobj,
>   	return sysfs_emit(buf, "%s\n", output);
>   }
>   
> -static ssize_t thpsize_enabled_store(struct kobject *kobj,
> -				     struct kobj_attribute *attr,
> -				     const char *buf, size_t count)
> +static ssize_t anon_enabled_store(struct kobject *kobj,
> +				  struct kobj_attribute *attr,
> +				  const char *buf, size_t count)
>   {
>   	int order = to_thpsize(kobj)->order;
>   	ssize_t ret = count;
> @@ -525,19 +525,27 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
>   	return ret;
>   }
>   
> -static struct kobj_attribute thpsize_enabled_attr =
> -	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
> +static struct kobj_attribute anon_enabled_attr =
> +	__ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);
>   
> -static struct attribute *thpsize_attrs[] = {
> -	&thpsize_enabled_attr.attr,
> +static struct attribute *anon_ctrl_attrs[] = {
> +	&anon_enabled_attr.attr,
> +	NULL,
> +};
> +
> +static const struct attribute_group anon_ctrl_attr_grp = {
> +	.attrs = anon_ctrl_attrs,
> +};
> +
> +static struct attribute *file_ctrl_attrs[] = {
>   #ifdef CONFIG_SHMEM
>   	&thpsize_shmem_enabled_attr.attr,
>   #endif
>   	NULL,
>   };
>   
> -static const struct attribute_group thpsize_attr_group = {
> -	.attrs = thpsize_attrs,
> +static const struct attribute_group file_ctrl_attr_grp = {
> +	.attrs = file_ctrl_attrs,
>   };
>   
>   static const struct kobj_type thpsize_ktype = {
> @@ -583,57 +591,99 @@ DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
>   DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>   DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>   
> -static struct attribute *stats_attrs[] = {
> +static struct attribute *anon_stats_attrs[] = {
>   	&anon_fault_alloc_attr.attr,
>   	&anon_fault_fallback_attr.attr,
>   	&anon_fault_fallback_charge_attr.attr,
>   	&swpout_attr.attr,
>   	&swpout_fallback_attr.attr,
> -	&shmem_alloc_attr.attr,
> -	&shmem_fallback_attr.attr,
> -	&shmem_fallback_charge_attr.attr,
>   	&split_attr.attr,
>   	&split_failed_attr.attr,
>   	&split_deferred_attr.attr,
>   	NULL,
>   };
>   
> -static struct attribute_group stats_attr_group = {
> +static struct attribute_group anon_stats_attr_grp = {
> +	.name = "stats",
> +	.attrs = anon_stats_attrs,
> +};
> +
> +static struct attribute *file_stats_attrs[] = {
> +#ifdef CONFIG_SHMEM
> +	&shmem_alloc_attr.attr,
> +	&shmem_fallback_attr.attr,
> +	&shmem_fallback_charge_attr.attr,

Shmem should also support swpout_* counters.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 2/3] mm: Tidy up shmem mTHP controls and stats
  2024-07-22  6:14   ` Baolin Wang
@ 2024-07-22  7:33     ` Ryan Roberts
  2024-07-22  8:04       ` Baolin Wang
  0 siblings, 1 reply; 12+ messages in thread
From: Ryan Roberts @ 2024-07-22  7:33 UTC (permalink / raw)
  To: Baolin Wang, Andrew Morton, Hugh Dickins, Jonathan Corbet,
	Matthew Wilcox (Oracle),
	David Hildenbrand, Barry Song, Lance Yang, Gavin Shan
  Cc: linux-kernel, linux-mm

On 22/07/2024 07:14, Baolin Wang wrote:
> 
> 
> On 2024/7/16 21:59, Ryan Roberts wrote:
>> Previously we had a situation where shmem mTHP controls and stats were
>> not exposed for some supported sizes and were exposed for some
>> unsupported sizes. So let's clean that up.
>>
>> Anon mTHP can support all large orders (2, PMD_ORDER). But shmem can
>> support all large orders (1, MAX_PAGECACHE_ORDER). However, per-size
>> shmem controls and stats were previously being exposed for all the anon
>> mTHP orders, meaning order-1 was not present, and for arm64 64K base
>> pages, orders 12 and 13 were exposed but were not supported internally.
>>
>> Tidy this all up by defining ctrl and stats attribute groups for anon
>> and file separately. Anon ctrl and stats groups are populated for all
>> orders in THP_ORDERS_ALL_ANON and file ctrl and stats groups are
>> populated for all orders in THP_ORDERS_ALL_FILE_DEFAULT.
> 
> Make sense.
> 
>>
>> The side-effect of all this is that different hugepage-*kB directories
>> contain different sets of controls and stats, depending on which memory
>> types support that size. This approach is preferred over the
>> alternative, which is to populate dummy controls and stats for memory
>> types that do not support a given size.
> 
> OK.
> 
>>
>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>> ---
>>   mm/huge_memory.c | 110 ++++++++++++++++++++++++++++++++++-------------
>>   1 file changed, 80 insertions(+), 30 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index f4be468e06a4..578ac212c172 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -463,8 +463,8 @@ static void thpsize_release(struct kobject *kobj);
>>   static DEFINE_SPINLOCK(huge_anon_orders_lock);
>>   static LIST_HEAD(thpsize_list);
>>   -static ssize_t thpsize_enabled_show(struct kobject *kobj,
>> -                    struct kobj_attribute *attr, char *buf)
>> +static ssize_t anon_enabled_show(struct kobject *kobj,
>> +                 struct kobj_attribute *attr, char *buf)
>>   {
>>       int order = to_thpsize(kobj)->order;
>>       const char *output;
>> @@ -481,9 +481,9 @@ static ssize_t thpsize_enabled_show(struct kobject *kobj,
>>       return sysfs_emit(buf, "%s\n", output);
>>   }
>>   -static ssize_t thpsize_enabled_store(struct kobject *kobj,
>> -                     struct kobj_attribute *attr,
>> -                     const char *buf, size_t count)
>> +static ssize_t anon_enabled_store(struct kobject *kobj,
>> +                  struct kobj_attribute *attr,
>> +                  const char *buf, size_t count)
>>   {
>>       int order = to_thpsize(kobj)->order;
>>       ssize_t ret = count;
>> @@ -525,19 +525,27 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
>>       return ret;
>>   }
>>   -static struct kobj_attribute thpsize_enabled_attr =
>> -    __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
>> +static struct kobj_attribute anon_enabled_attr =
>> +    __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);
>>   -static struct attribute *thpsize_attrs[] = {
>> -    &thpsize_enabled_attr.attr,
>> +static struct attribute *anon_ctrl_attrs[] = {
>> +    &anon_enabled_attr.attr,
>> +    NULL,
>> +};
>> +
>> +static const struct attribute_group anon_ctrl_attr_grp = {
>> +    .attrs = anon_ctrl_attrs,
>> +};
>> +
>> +static struct attribute *file_ctrl_attrs[] = {
>>   #ifdef CONFIG_SHMEM
>>       &thpsize_shmem_enabled_attr.attr,
>>   #endif
>>       NULL,
>>   };
>>   -static const struct attribute_group thpsize_attr_group = {
>> -    .attrs = thpsize_attrs,
>> +static const struct attribute_group file_ctrl_attr_grp = {
>> +    .attrs = file_ctrl_attrs,
>>   };
>>     static const struct kobj_type thpsize_ktype = {
>> @@ -583,57 +591,99 @@ DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
>>   DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>>   DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>>   -static struct attribute *stats_attrs[] = {
>> +static struct attribute *anon_stats_attrs[] = {
>>       &anon_fault_alloc_attr.attr,
>>       &anon_fault_fallback_attr.attr,
>>       &anon_fault_fallback_charge_attr.attr,
>>       &swpout_attr.attr,
>>       &swpout_fallback_attr.attr,
>> -    &shmem_alloc_attr.attr,
>> -    &shmem_fallback_attr.attr,
>> -    &shmem_fallback_charge_attr.attr,
>>       &split_attr.attr,
>>       &split_failed_attr.attr,
>>       &split_deferred_attr.attr,
>>       NULL,
>>   };
>>   -static struct attribute_group stats_attr_group = {
>> +static struct attribute_group anon_stats_attr_grp = {
>> +    .name = "stats",
>> +    .attrs = anon_stats_attrs,
>> +};
>> +
>> +static struct attribute *file_stats_attrs[] = {
>> +#ifdef CONFIG_SHMEM
>> +    &shmem_alloc_attr.attr,
>> +    &shmem_fallback_attr.attr,
>> +    &shmem_fallback_charge_attr.attr,
> 
> Shmem should also support swpout_* counters.

OK, so to put it another way: swpout_* stats are required for all orders in
(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT) if CONFIG_SHMEM is defined,
else for all orders in THP_ORDERS_ALL_ANON. Have I understood correctly?

If so, I'll fix that in the next version.
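
For example, the stats attributes might end up grouped something like this (a
sketch only; the "any_stats_attrs" name and the exact arrangement are
hypothetical and to be decided for the next version):

static struct attribute *anon_stats_attrs[] = {
	&anon_fault_alloc_attr.attr,
	&anon_fault_fallback_attr.attr,
	&anon_fault_fallback_charge_attr.attr,
#ifndef CONFIG_SHMEM
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
#endif
	&split_attr.attr,
	&split_failed_attr.attr,
	&split_deferred_attr.attr,
	NULL,
};

/* Registered for all anon and file orders when shmem can swap out folios. */
static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
#endif
	NULL,
};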



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 2/3] mm: Tidy up shmem mTHP controls and stats
  2024-07-22  7:33     ` Ryan Roberts
@ 2024-07-22  8:04       ` Baolin Wang
  0 siblings, 0 replies; 12+ messages in thread
From: Baolin Wang @ 2024-07-22  8:04 UTC (permalink / raw)
  To: Ryan Roberts, Andrew Morton, Hugh Dickins, Jonathan Corbet,
	Matthew Wilcox (Oracle),
	David Hildenbrand, Barry Song, Lance Yang, Gavin Shan
  Cc: linux-kernel, linux-mm



On 2024/7/22 15:33, Ryan Roberts wrote:
> On 22/07/2024 07:14, Baolin Wang wrote:
>>
>>
>> On 2024/7/16 21:59, Ryan Roberts wrote:
>>> Previously we had a situation where shmem mTHP controls and stats were
>>> not exposed for some supported sizes and were exposed for some
>>> unsupported sizes. So let's clean that up.
>>>
>>> Anon mTHP can support all large orders (2, PMD_ORDER). But shmem can
>>> support all large orders (1, MAX_PAGECACHE_ORDER). However, per-size
>>> shmem controls and stats were previously being exposed for all the anon
>>> mTHP orders, meaning order-1 was not present, and for arm64 64K base
>>> pages, orders 12 and 13 were exposed but were not supported internally.
>>>
>>> Tidy this all up by defining ctrl and stats attribute groups for anon
>>> and file separately. Anon ctrl and stats groups are populated for all
>>> orders in THP_ORDERS_ALL_ANON and file ctrl and stats groups are
>>> populated for all orders in THP_ORDERS_ALL_FILE_DEFAULT.
>>
>> Make sense.
>>
>>>
>>> The side-effect of all this is that different hugepage-*kB directories
>>> contain different sets of controls and stats, depending on which memory
>>> types support that size. This approach is preferred over the
>>> alternative, which is to populate dummy controls and stats for memory
>>> types that do not support a given size.
>>
>> OK.
>>
>>>
>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>> ---
>>>    mm/huge_memory.c | 110 ++++++++++++++++++++++++++++++++++-------------
>>>    1 file changed, 80 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index f4be468e06a4..578ac212c172 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -463,8 +463,8 @@ static void thpsize_release(struct kobject *kobj);
>>>    static DEFINE_SPINLOCK(huge_anon_orders_lock);
>>>    static LIST_HEAD(thpsize_list);
>>>    -static ssize_t thpsize_enabled_show(struct kobject *kobj,
>>> -                    struct kobj_attribute *attr, char *buf)
>>> +static ssize_t anon_enabled_show(struct kobject *kobj,
>>> +                 struct kobj_attribute *attr, char *buf)
>>>    {
>>>        int order = to_thpsize(kobj)->order;
>>>        const char *output;
>>> @@ -481,9 +481,9 @@ static ssize_t thpsize_enabled_show(struct kobject *kobj,
>>>        return sysfs_emit(buf, "%s\n", output);
>>>    }
>>>    -static ssize_t thpsize_enabled_store(struct kobject *kobj,
>>> -                     struct kobj_attribute *attr,
>>> -                     const char *buf, size_t count)
>>> +static ssize_t anon_enabled_store(struct kobject *kobj,
>>> +                  struct kobj_attribute *attr,
>>> +                  const char *buf, size_t count)
>>>    {
>>>        int order = to_thpsize(kobj)->order;
>>>        ssize_t ret = count;
>>> @@ -525,19 +525,27 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
>>>        return ret;
>>>    }
>>>    -static struct kobj_attribute thpsize_enabled_attr =
>>> -    __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
>>> +static struct kobj_attribute anon_enabled_attr =
>>> +    __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);
>>>    -static struct attribute *thpsize_attrs[] = {
>>> -    &thpsize_enabled_attr.attr,
>>> +static struct attribute *anon_ctrl_attrs[] = {
>>> +    &anon_enabled_attr.attr,
>>> +    NULL,
>>> +};
>>> +
>>> +static const struct attribute_group anon_ctrl_attr_grp = {
>>> +    .attrs = anon_ctrl_attrs,
>>> +};
>>> +
>>> +static struct attribute *file_ctrl_attrs[] = {
>>>    #ifdef CONFIG_SHMEM
>>>        &thpsize_shmem_enabled_attr.attr,
>>>    #endif
>>>        NULL,
>>>    };
>>>    -static const struct attribute_group thpsize_attr_group = {
>>> -    .attrs = thpsize_attrs,
>>> +static const struct attribute_group file_ctrl_attr_grp = {
>>> +    .attrs = file_ctrl_attrs,
>>>    };
>>>      static const struct kobj_type thpsize_ktype = {
>>> @@ -583,57 +591,99 @@ DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
>>>    DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>>>    DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>>>    -static struct attribute *stats_attrs[] = {
>>> +static struct attribute *anon_stats_attrs[] = {
>>>        &anon_fault_alloc_attr.attr,
>>>        &anon_fault_fallback_attr.attr,
>>>        &anon_fault_fallback_charge_attr.attr,
>>>        &swpout_attr.attr,
>>>        &swpout_fallback_attr.attr,
>>> -    &shmem_alloc_attr.attr,
>>> -    &shmem_fallback_attr.attr,
>>> -    &shmem_fallback_charge_attr.attr,
>>>        &split_attr.attr,
>>>        &split_failed_attr.attr,
>>>        &split_deferred_attr.attr,
>>>        NULL,
>>>    };
>>>    -static struct attribute_group stats_attr_group = {
>>> +static struct attribute_group anon_stats_attr_grp = {
>>> +    .name = "stats",
>>> +    .attrs = anon_stats_attrs,
>>> +};
>>> +
>>> +static struct attribute *file_stats_attrs[] = {
>>> +#ifdef CONFIG_SHMEM
>>> +    &shmem_alloc_attr.attr,
>>> +    &shmem_fallback_attr.attr,
>>> +    &shmem_fallback_charge_attr.attr,
>>
>> Shmem should also support swpout_* counters.
> 
> OK, so to put it another way, swpout_* stats are required for all orders in
> (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT) if CONFIG_SHMEM is defined,
> else all orders in THP_ORDERS_ALL_ANON. Have I understood correctly?
> 
> If so, I'll fix that in the next version.

Yes, I think so.
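
For reference, a rough sketch of what that could look like in mm/huge_memory.c,
reusing the existing attribute definitions. The "any" group name and its exact
placement are illustrative only, not taken from a posted version of the series:

/*
 * Sketch: pull the swpout_* counters out of anon_stats_attrs into their own
 * group (and drop them from anon_stats_attrs to avoid duplicates), so the
 * group can be merged into the per-size "stats" directory for every order in
 * THP_ORDERS_ALL_ANON, plus THP_ORDERS_ALL_FILE_DEFAULT when CONFIG_SHMEM is
 * enabled.
 */
static struct attribute *any_stats_attrs[] = {
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
	NULL,
};

static struct attribute_group any_stats_attr_grp = {
	.name = "stats",
	.attrs = any_stats_attrs,
};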


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 3/3] mm: mTHP stats for pagecache folio allocations
  2024-07-19  8:56     ` Ryan Roberts
@ 2024-07-23 22:07       ` Barry Song
  2024-07-24  8:12         ` Ryan Roberts
  0 siblings, 1 reply; 12+ messages in thread
From: Barry Song @ 2024-07-23 22:07 UTC (permalink / raw)
  To: Ryan Roberts
  Cc: Andrew Morton, Hugh Dickins, Jonathan Corbet,
	Matthew Wilcox (Oracle),
	David Hildenbrand, Lance Yang, Baolin Wang, Gavin Shan,
	linux-kernel, linux-mm

On Fri, Jul 19, 2024 at 8:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 19/07/2024 01:12, Barry Song wrote:
> > On Wed, Jul 17, 2024 at 1:59 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> Expose 3 new mTHP stats for file (pagecache) folio allocations:
> >>
> >>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_alloc
> >>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_fallback
> >>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_fallback_charge
> >>
> >> This will provide some insight on the sizes of large folios being
> >> allocated for file-backed memory, and how often allocation is failing.
> >>
> >> All non-order-0 (and most order-0) folio allocations are currently done
> >> through filemap_alloc_folio(), and folios are charged in a subsequent
> >> call to filemap_add_folio(). So count file_fallback when allocation
> >> fails in filemap_alloc_folio() and count file_alloc or
> >> file_fallback_charge in filemap_add_folio(), based on whether charging
> >> succeeded or not. There are some users of filemap_add_folio() that
> >> allocate their own order-0 folio by other means, so we would not count
> >> an allocation failure in this case, but we also don't care about order-0
> >> allocations. This approach feels like it should be good enough and
> >> doesn't require any (impractically large) refactoring.
> >>
> >> The existing mTHP stats interface is reused to provide consistency to
> >> users. And because we are reusing the same interface, we can reuse the
> >> same infrastructure on the kernel side.
> >>
> >> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> >> ---
> >>  Documentation/admin-guide/mm/transhuge.rst | 13 +++++++++++++
> >>  include/linux/huge_mm.h                    |  3 +++
> >>  include/linux/pagemap.h                    | 16 ++++++++++++++--
> >>  mm/filemap.c                               |  6 ++++--
> >>  mm/huge_memory.c                           |  7 +++++++
> >>  5 files changed, 41 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> >> index 058485daf186..d4857e457add 100644
> >> --- a/Documentation/admin-guide/mm/transhuge.rst
> >> +++ b/Documentation/admin-guide/mm/transhuge.rst
> >> @@ -512,6 +512,19 @@ shmem_fallback_charge
> >>         falls back to using small pages even though the allocation was
> >>         successful.
> >>
> >> +file_alloc
> >> +       is incremented every time a file huge page is successfully
> >> +       allocated.
> >> +
> >> +file_fallback
> >> +       is incremented if a file huge page is attempted to be allocated
> >> +       but fails and instead falls back to using small pages.
> >> +
> >> +file_fallback_charge
> >> +       is incremented if a file huge page cannot be charged and instead
> >> +       falls back to using small pages even though the allocation was
> >> +       successful.
> >> +
> >
> > I realized that when we talk about fallback, it doesn't necessarily mean
> > small pages; it could also refer to smaller huge pages.
>
> Yes good point, I'll update the documentation to reflect that for all memory types.
>
> >
> > anon_fault_alloc
> >         is incremented every time a huge page is successfully
> >         allocated and charged to handle a page fault.
> >
> > anon_fault_fallback
> >         is incremented if a page fault fails to allocate or charge
> >         a huge page and instead falls back to using huge pages with
> >         lower orders or small pages.
> >
> > anon_fault_fallback_charge
> >         is incremented if a page fault fails to charge a huge page and
> >         instead falls back to using huge pages with lower orders or
> >         small pages even though the allocation was successful.
> >
> > This also applies to files, right?
>
> It does in the place you highlight below, but page_cache_ra_order() just falls
> back immediately to order-0. Regardless, I think we should just word the
> user-facing docs to allow for falling back to a lower large order. That gives
> the implementation more flexibility.
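
For illustration, a rewording modelled on the anon_fault_fallback text above
could read as follows (suggested wording only, not from a posted version):

file_fallback
	is incremented if a file huge page is attempted to be allocated
	but fails and instead falls back to using huge pages with lower
	orders or small pages.

file_fallback_charge
	is incremented if a file huge page cannot be charged and instead
	falls back to using huge pages with lower orders or small pages
	even though the allocation was successful.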
>
> >
> >                 do {
> >                         gfp_t alloc_gfp = gfp;
> >
> >                         err = -ENOMEM;
> >                         if (order > 0)
> >                                 alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
> >                         folio = filemap_alloc_folio(alloc_gfp, order);
> >                         if (!folio)
> >                                 continue;
> >
> >                         /* Init accessed so avoid atomic
> > mark_page_accessed later */
> >                         if (fgp_flags & FGP_ACCESSED)
> >                                 __folio_set_referenced(folio);
> >
> >                         err = filemap_add_folio(mapping, folio, index, gfp);
> >                         if (!err)
> >                                 break;
> >                         folio_put(folio);
> >                         folio = NULL;
> >                 } while (order-- > 0);
> >
> >
> >>  split
> >>         is incremented every time a huge page is successfully split into
> >>         smaller orders. This can happen for a variety of reasons but a
> >> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >> index b8c63c3e967f..4f9109fcdded 100644
> >> --- a/include/linux/huge_mm.h
> >> +++ b/include/linux/huge_mm.h
> >> @@ -123,6 +123,9 @@ enum mthp_stat_item {
> >>         MTHP_STAT_SHMEM_ALLOC,
> >>         MTHP_STAT_SHMEM_FALLBACK,
> >>         MTHP_STAT_SHMEM_FALLBACK_CHARGE,
> >> +       MTHP_STAT_FILE_ALLOC,
> >> +       MTHP_STAT_FILE_FALLBACK,
> >> +       MTHP_STAT_FILE_FALLBACK_CHARGE,
> >>         MTHP_STAT_SPLIT,
> >>         MTHP_STAT_SPLIT_FAILED,
> >>         MTHP_STAT_SPLIT_DEFERRED,
> >> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> >> index 6e2f72d03176..95a147b5d117 100644
> >> --- a/include/linux/pagemap.h
> >> +++ b/include/linux/pagemap.h
> >> @@ -562,14 +562,26 @@ static inline void *detach_page_private(struct page *page)
> >>  }
> >>
> >>  #ifdef CONFIG_NUMA
> >> -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
> >> +struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
> >>  #else
> >> -static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> >> +static inline struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> >>  {
> >>         return folio_alloc_noprof(gfp, order);
> >>  }
> >>  #endif
> >>
> >> +static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> >> +{
> >> +       struct folio *folio;
> >> +
> >> +       folio = __filemap_alloc_folio_noprof(gfp, order);
> >> +
> >> +       if (!folio)
> >> +               count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK);
> >> +
> >> +       return folio;
> >> +}
> >
> > Do we need to add and export __filemap_alloc_folio_noprof()?
>
> It is exported. See the below change in filemap.c. Previously
> filemap_alloc_folio_noprof() was exported, but that is now an inline and
> __filemap_alloc_folio_noprof() is exported in its place.
>
> > In any case,
> > we won't call count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK) and
> > will only allocate the folio instead?
>
> Sorry I don't understand what you mean by this?

Ryan, my question is whether exporting __filemap_alloc_folio_noprof() might lead
to situations where people call __filemap_alloc_folio_noprof() and forget to
call count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK). Currently, it seems that
counting is always necessary. Another option is that we still call
count_mthp_stat() within filemap_alloc_folio_noprof() and make that function
noinline if __filemap_alloc_folio_noprof() is not inline?

>
> >
> >> +
> >>  #define filemap_alloc_folio(...)                               \
> >>         alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__))
> >>
> >> diff --git a/mm/filemap.c b/mm/filemap.c
> >> index 53d5d0410b51..131d514fca29 100644
> >> --- a/mm/filemap.c
> >> +++ b/mm/filemap.c
> >> @@ -963,6 +963,8 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
> >>         int ret;
> >>
> >>         ret = mem_cgroup_charge(folio, NULL, gfp);
> >> +       count_mthp_stat(folio_order(folio),
> >> +               ret ? MTHP_STAT_FILE_FALLBACK_CHARGE : MTHP_STAT_FILE_ALLOC);
> >>         if (ret)
> >>                 return ret;
> >
> > Would the following be better?
> >
> >         ret = mem_cgroup_charge(folio, NULL, gfp);
> >          if (ret) {
> >                  count_mthp_stat(folio_order(folio),
> > MTHP_STAT_FILE_FALLBACK_CHARGE);
> >                  return ret;
> >          }
> >        count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_ALLOC);
> >
> > Anyway, it's up to you. The code just feels a bit off to me :-)
>
> Yes, agree your version is better. I also noticed that anon and shmem increment
> FALLBACK whenever FALLBACK_CHARGE is incremented so I should apply those same
> semantics here.
>
> Thanks,
> Ryan
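
Putting those two points together, the charge-time accounting might end up
looking roughly like the below (sketch only, not code from a posted version;
the helper name is made up here):

/*
 * Hypothetical helper, called at the top of filemap_add_folio() in place of
 * the bare mem_cgroup_charge() call. FALLBACK is bumped together with
 * FALLBACK_CHARGE, matching the anon/shmem semantics mentioned above.
 */
static int filemap_charge_and_count(struct folio *folio, gfp_t gfp)
{
	int order = folio_order(folio);
	int ret = mem_cgroup_charge(folio, NULL, gfp);

	if (ret) {
		count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK_CHARGE);
		return ret;
	}

	count_mthp_stat(order, MTHP_STAT_FILE_ALLOC);
	return 0;
}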
>
>
> >
> >>
> >> @@ -990,7 +992,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
> >>  EXPORT_SYMBOL_GPL(filemap_add_folio);
> >>
> >>  #ifdef CONFIG_NUMA
> >> -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> >> +struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> >>  {
> >>         int n;
> >>         struct folio *folio;
> >> @@ -1007,7 +1009,7 @@ struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> >>         }
> >>         return folio_alloc_noprof(gfp, order);
> >>  }
> >> -EXPORT_SYMBOL(filemap_alloc_folio_noprof);
> >> +EXPORT_SYMBOL(__filemap_alloc_folio_noprof);
> >>  #endif
> >>
> >>  /*
> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index 578ac212c172..26d558e3e80f 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -608,7 +608,14 @@ static struct attribute_group anon_stats_attr_grp = {
> >>         .attrs = anon_stats_attrs,
> >>  };
> >>
> >> +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC);
> >> +DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK);
> >> +DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE);
> >> +
> >>  static struct attribute *file_stats_attrs[] = {
> >> +       &file_alloc_attr.attr,
> >> +       &file_fallback_attr.attr,
> >> +       &file_fallback_charge_attr.attr,
> >>  #ifdef CONFIG_SHMEM
> >>         &shmem_alloc_attr.attr,
> >>         &shmem_fallback_attr.attr,
> >> --
> >> 2.43.0
> >>
> >

Thanks
Barry


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 3/3] mm: mTHP stats for pagecache folio allocations
  2024-07-23 22:07       ` Barry Song
@ 2024-07-24  8:12         ` Ryan Roberts
  2024-07-24 10:23           ` Barry Song
  0 siblings, 1 reply; 12+ messages in thread
From: Ryan Roberts @ 2024-07-24  8:12 UTC (permalink / raw)
  To: Barry Song
  Cc: Andrew Morton, Hugh Dickins, Jonathan Corbet,
	Matthew Wilcox (Oracle),
	David Hildenbrand, Lance Yang, Baolin Wang, Gavin Shan,
	linux-kernel, linux-mm

On 23/07/2024 23:07, Barry Song wrote:
> On Fri, Jul 19, 2024 at 8:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>
>> On 19/07/2024 01:12, Barry Song wrote:
>>> On Wed, Jul 17, 2024 at 1:59 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
>>>>
>>>> Expose 3 new mTHP stats for file (pagecache) folio allocations:
>>>>
>>>>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_alloc
>>>>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_fallback
>>>>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_fallback_charge
>>>>
>>>> This will provide some insight on the sizes of large folios being
>>>> allocated for file-backed memory, and how often allocation is failing.
>>>>
>>>> All non-order-0 (and most order-0) folio allocations are currently done
>>>> through filemap_alloc_folio(), and folios are charged in a subsequent
>>>> call to filemap_add_folio(). So count file_fallback when allocation
>>>> fails in filemap_alloc_folio() and count file_alloc or
>>>> file_fallback_charge in filemap_add_folio(), based on whether charging
>>>> succeeded or not. There are some users of filemap_add_folio() that
>>>> allocate their own order-0 folio by other means, so we would not count
>>>> an allocation failure in this case, but we also don't care about order-0
>>>> allocations. This approach feels like it should be good enough and
>>>> doesn't require any (impractically large) refactoring.
>>>>
>>>> The existing mTHP stats interface is reused to provide consistency to
>>>> users. And because we are reusing the same interface, we can reuse the
>>>> same infrastructure on the kernel side.
>>>>
>>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
>>>> ---
>>>>  Documentation/admin-guide/mm/transhuge.rst | 13 +++++++++++++
>>>>  include/linux/huge_mm.h                    |  3 +++
>>>>  include/linux/pagemap.h                    | 16 ++++++++++++++--
>>>>  mm/filemap.c                               |  6 ++++--
>>>>  mm/huge_memory.c                           |  7 +++++++
>>>>  5 files changed, 41 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>>>> index 058485daf186..d4857e457add 100644
>>>> --- a/Documentation/admin-guide/mm/transhuge.rst
>>>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>>>> @@ -512,6 +512,19 @@ shmem_fallback_charge
>>>>         falls back to using small pages even though the allocation was
>>>>         successful.
>>>>
>>>> +file_alloc
>>>> +       is incremented every time a file huge page is successfully
>>>> +       allocated.
>>>> +
>>>> +file_fallback
>>>> +       is incremented if a file huge page is attempted to be allocated
>>>> +       but fails and instead falls back to using small pages.
>>>> +
>>>> +file_fallback_charge
>>>> +       is incremented if a file huge page cannot be charged and instead
>>>> +       falls back to using small pages even though the allocation was
>>>> +       successful.
>>>> +
>>>
>>> I realized that when we talk about fallback, it doesn't necessarily mean
>>> small pages; it could also refer to smaller huge pages.
>>
>> Yes good point, I'll update the documentation to reflect that for all memory types.
>>
>>>
>>> anon_fault_alloc
>>>         is incremented every time a huge page is successfully
>>>         allocated and charged to handle a page fault.
>>>
>>> anon_fault_fallback
>>>         is incremented if a page fault fails to allocate or charge
>>>         a huge page and instead falls back to using huge pages with
>>>         lower orders or small pages.
>>>
>>> anon_fault_fallback_charge
>>>         is incremented if a page fault fails to charge a huge page and
>>>         instead falls back to using huge pages with lower orders or
>>>         small pages even though the allocation was successful.
>>>
>>> This also applies to files, right?
>>
>> It does in the place you highlight below, but page_cache_ra_order() just falls
> >> back immediately to order-0. Regardless, I think we should just word the
> >> user-facing docs to allow for falling back to a lower large order. That gives
> >> the implementation more flexibility.
>>
>>>
>>>                 do {
>>>                         gfp_t alloc_gfp = gfp;
>>>
>>>                         err = -ENOMEM;
>>>                         if (order > 0)
>>>                                 alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
>>>                         folio = filemap_alloc_folio(alloc_gfp, order);
>>>                         if (!folio)
>>>                                 continue;
>>>
>>>                         /* Init accessed so avoid atomic
>>> mark_page_accessed later */
>>>                         if (fgp_flags & FGP_ACCESSED)
>>>                                 __folio_set_referenced(folio);
>>>
>>>                         err = filemap_add_folio(mapping, folio, index, gfp);
>>>                         if (!err)
>>>                                 break;
>>>                         folio_put(folio);
>>>                         folio = NULL;
>>>                 } while (order-- > 0);
>>>
>>>
>>>>  split
>>>>         is incremented every time a huge page is successfully split into
>>>>         smaller orders. This can happen for a variety of reasons but a
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index b8c63c3e967f..4f9109fcdded 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -123,6 +123,9 @@ enum mthp_stat_item {
>>>>         MTHP_STAT_SHMEM_ALLOC,
>>>>         MTHP_STAT_SHMEM_FALLBACK,
>>>>         MTHP_STAT_SHMEM_FALLBACK_CHARGE,
>>>> +       MTHP_STAT_FILE_ALLOC,
>>>> +       MTHP_STAT_FILE_FALLBACK,
>>>> +       MTHP_STAT_FILE_FALLBACK_CHARGE,
>>>>         MTHP_STAT_SPLIT,
>>>>         MTHP_STAT_SPLIT_FAILED,
>>>>         MTHP_STAT_SPLIT_DEFERRED,
>>>> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
>>>> index 6e2f72d03176..95a147b5d117 100644
>>>> --- a/include/linux/pagemap.h
>>>> +++ b/include/linux/pagemap.h
>>>> @@ -562,14 +562,26 @@ static inline void *detach_page_private(struct page *page)
>>>>  }
>>>>
>>>>  #ifdef CONFIG_NUMA
>>>> -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
>>>> +struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
>>>>  #else
>>>> -static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>>>> +static inline struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>>>>  {
>>>>         return folio_alloc_noprof(gfp, order);
>>>>  }
>>>>  #endif
>>>>
>>>> +static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>>>> +{
>>>> +       struct folio *folio;
>>>> +
>>>> +       folio = __filemap_alloc_folio_noprof(gfp, order);
>>>> +
>>>> +       if (!folio)
>>>> +               count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK);
>>>> +
>>>> +       return folio;
>>>> +}
>>>
>>> Do we need to add and export __filemap_alloc_folio_noprof()?
>>
>> It is exported. See the below change in filemap.c. Previously
>> filemap_alloc_folio_noprof() was exported, but that is now an inline and
>> __filemap_alloc_folio_noprof() is exported in its place.
>>
>>> In any case,
>>> we won't call count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK) and
>>> will only allocate the folio instead?
>>
>> Sorry I don't understand what you mean by this?
> 
> Ryan, my question is whether exporting __filemap_alloc_folio_noprof() might lead
> to situations where people call __filemap_alloc_folio_noprof() and
> forget to call
> count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK). Currently, it seems
> that counting is always necessary? 

OK, to make sure I've understood: I think you are saying that there is a risk
that people will call __filemap_alloc_folio_noprof() and bypass the stat
counting? But it's the same problem with all "_noprof" functions; if those are
called directly, it bypasses the memory allocation profiling infrastructure. So
this just needs to be a case of "don't do that" IMHO. filemap_alloc_folio() is
the public API.

> Another option is that we still call count_mthp_stat() within
> filemap_alloc_folio_noprof() and make that function noinline if
> __filemap_alloc_folio_noprof() is not inline?

I could certainly make filemap_alloc_folio_noprof() always out-of-line and then
handle the counting privately in the compilation unit. But before my change
filemap_alloc_folio_noprof() was inline for the !CONFIG_NUMA case and I was
trying not to change that. I think what you're suggesting would be tidier
though. I'll make the change. But I don't think it solves the wider problem of
the possibility that people can call private APIs. That's what review is for.
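
Roughly this shape, I think (untested sketch of the out-of-line version in
mm/filemap.c; __filemap_alloc_folio_noprof() would then probably not need to
be exported at all, since callers would funnel through this wrapper):

/* Out-of-line wrapper; the FILE_FALLBACK counting stays private to filemap.c. */
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
	struct folio *folio;

	folio = __filemap_alloc_folio_noprof(gfp, order);
	if (!folio)
		count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK);

	return folio;
}
EXPORT_SYMBOL(filemap_alloc_folio_noprof);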

Thanks,
Ryan


> 
>>
>>>
>>>> +
>>>>  #define filemap_alloc_folio(...)                               \
>>>>         alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__))
>>>>
>>>> diff --git a/mm/filemap.c b/mm/filemap.c
>>>> index 53d5d0410b51..131d514fca29 100644
>>>> --- a/mm/filemap.c
>>>> +++ b/mm/filemap.c
>>>> @@ -963,6 +963,8 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
>>>>         int ret;
>>>>
>>>>         ret = mem_cgroup_charge(folio, NULL, gfp);
>>>> +       count_mthp_stat(folio_order(folio),
>>>> +               ret ? MTHP_STAT_FILE_FALLBACK_CHARGE : MTHP_STAT_FILE_ALLOC);
>>>>         if (ret)
>>>>                 return ret;
>>>
>>> Would the following be better?
>>>
>>>         ret = mem_cgroup_charge(folio, NULL, gfp);
>>>          if (ret) {
>>>                  count_mthp_stat(folio_order(folio),
>>> MTHP_STAT_FILE_FALLBACK_CHARGE);
>>>                  return ret;
>>>          }
>>>        count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_ALLOC);
>>>
>>> Anyway, it's up to you. The code just feels a bit off to me :-)
>>
>> Yes, agree your version is better. I also noticed that anon and shmem increment
>> FALLBACK whenever FALLBACK_CHARGE is incremented so I should apply those same
>> semantics here.
>>
>> Thanks,
>> Ryan
>>
>>
>>>
>>>>
>>>> @@ -990,7 +992,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
>>>>  EXPORT_SYMBOL_GPL(filemap_add_folio);
>>>>
>>>>  #ifdef CONFIG_NUMA
>>>> -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>>>> +struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>>>>  {
>>>>         int n;
>>>>         struct folio *folio;
>>>> @@ -1007,7 +1009,7 @@ struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
>>>>         }
>>>>         return folio_alloc_noprof(gfp, order);
>>>>  }
>>>> -EXPORT_SYMBOL(filemap_alloc_folio_noprof);
>>>> +EXPORT_SYMBOL(__filemap_alloc_folio_noprof);
>>>>  #endif
>>>>
>>>>  /*
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 578ac212c172..26d558e3e80f 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -608,7 +608,14 @@ static struct attribute_group anon_stats_attr_grp = {
>>>>         .attrs = anon_stats_attrs,
>>>>  };
>>>>
>>>> +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC);
>>>> +DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK);
>>>> +DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE);
>>>> +
>>>>  static struct attribute *file_stats_attrs[] = {
>>>> +       &file_alloc_attr.attr,
>>>> +       &file_fallback_attr.attr,
>>>> +       &file_fallback_charge_attr.attr,
>>>>  #ifdef CONFIG_SHMEM
>>>>         &shmem_alloc_attr.attr,
>>>>         &shmem_fallback_attr.attr,
>>>> --
>>>> 2.43.0
>>>>
>>>
> 
> Thanks
> Barry



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 3/3] mm: mTHP stats for pagecache folio allocations
  2024-07-24  8:12         ` Ryan Roberts
@ 2024-07-24 10:23           ` Barry Song
  0 siblings, 0 replies; 12+ messages in thread
From: Barry Song @ 2024-07-24 10:23 UTC (permalink / raw)
  To: Ryan Roberts
  Cc: Andrew Morton, Hugh Dickins, Jonathan Corbet,
	Matthew Wilcox (Oracle),
	David Hildenbrand, Lance Yang, Baolin Wang, Gavin Shan,
	linux-kernel, linux-mm

On Wed, Jul 24, 2024 at 8:12 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 23/07/2024 23:07, Barry Song wrote:
> > On Fri, Jul 19, 2024 at 8:56 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>
> >> On 19/07/2024 01:12, Barry Song wrote:
> >>> On Wed, Jul 17, 2024 at 1:59 AM Ryan Roberts <ryan.roberts@arm.com> wrote:
> >>>>
> >>>> Expose 3 new mTHP stats for file (pagecache) folio allocations:
> >>>>
> >>>>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_alloc
> >>>>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_fallback
> >>>>   /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/file_fallback_charge
> >>>>
> >>>> This will provide some insight on the sizes of large folios being
> >>>> allocated for file-backed memory, and how often allocation is failing.
> >>>>
> >>>> All non-order-0 (and most order-0) folio allocations are currently done
> >>>> through filemap_alloc_folio(), and folios are charged in a subsequent
> >>>> call to filemap_add_folio(). So count file_fallback when allocation
> >>>> fails in filemap_alloc_folio() and count file_alloc or
> >>>> file_fallback_charge in filemap_add_folio(), based on whether charging
> >>>> succeeded or not. There are some users of filemap_add_folio() that
> >>>> allocate their own order-0 folio by other means, so we would not count
> >>>> an allocation failure in this case, but we also don't care about order-0
> >>>> allocations. This approach feels like it should be good enough and
> >>>> doesn't require any (impractically large) refactoring.
> >>>>
> >>>> The existing mTHP stats interface is reused to provide consistency to
> >>>> users. And because we are reusing the same interface, we can reuse the
> >>>> same infrastructure on the kernel side.
> >>>>
> >>>> Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
> >>>> ---
> >>>>  Documentation/admin-guide/mm/transhuge.rst | 13 +++++++++++++
> >>>>  include/linux/huge_mm.h                    |  3 +++
> >>>>  include/linux/pagemap.h                    | 16 ++++++++++++++--
> >>>>  mm/filemap.c                               |  6 ++++--
> >>>>  mm/huge_memory.c                           |  7 +++++++
> >>>>  5 files changed, 41 insertions(+), 4 deletions(-)
> >>>>
> >>>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> >>>> index 058485daf186..d4857e457add 100644
> >>>> --- a/Documentation/admin-guide/mm/transhuge.rst
> >>>> +++ b/Documentation/admin-guide/mm/transhuge.rst
> >>>> @@ -512,6 +512,19 @@ shmem_fallback_charge
> >>>>         falls back to using small pages even though the allocation was
> >>>>         successful.
> >>>>
> >>>> +file_alloc
> >>>> +       is incremented every time a file huge page is successfully
> >>>> +       allocated.
> >>>> +
> >>>> +file_fallback
> >>>> +       is incremented if a file huge page is attempted to be allocated
> >>>> +       but fails and instead falls back to using small pages.
> >>>> +
> >>>> +file_fallback_charge
> >>>> +       is incremented if a file huge page cannot be charged and instead
> >>>> +       falls back to using small pages even though the allocation was
> >>>> +       successful.
> >>>> +
> >>>
> >>> I realized that when we talk about fallback, it doesn't necessarily mean
> >>> small pages; it could also refer to smaller huge pages.
> >>
> >> Yes good point, I'll update the documentation to reflect that for all memory types.
> >>
> >>>
> >>> anon_fault_alloc
> >>>         is incremented every time a huge page is successfully
> >>>         allocated and charged to handle a page fault.
> >>>
> >>> anon_fault_fallback
> >>>         is incremented if a page fault fails to allocate or charge
> >>>         a huge page and instead falls back to using huge pages with
> >>>         lower orders or small pages.
> >>>
> >>> anon_fault_fallback_charge
> >>>         is incremented if a page fault fails to charge a huge page and
> >>>         instead falls back to using huge pages with lower orders or
> >>>         small pages even though the allocation was successful.
> >>>
> >>> This also applies to files, right?
> >>
> >> It does in the place you highlight below, but page_cache_ra_order() just falls
> >> back immediately to order-0. Regardless, I think we should just word the
> >> user-facing docs to allow for falling back to a lower large order. That gives
> >> the implementation more flexibility.
> >>
> >>>
> >>>                 do {
> >>>                         gfp_t alloc_gfp = gfp;
> >>>
> >>>                         err = -ENOMEM;
> >>>                         if (order > 0)
> >>>                                 alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
> >>>                         folio = filemap_alloc_folio(alloc_gfp, order);
> >>>                         if (!folio)
> >>>                                 continue;
> >>>
> >>>                         /* Init accessed so avoid atomic
> >>> mark_page_accessed later */
> >>>                         if (fgp_flags & FGP_ACCESSED)
> >>>                                 __folio_set_referenced(folio);
> >>>
> >>>                         err = filemap_add_folio(mapping, folio, index, gfp);
> >>>                         if (!err)
> >>>                                 break;
> >>>                         folio_put(folio);
> >>>                         folio = NULL;
> >>>                 } while (order-- > 0);
> >>>
> >>>
> >>>>  split
> >>>>         is incremented every time a huge page is successfully split into
> >>>>         smaller orders. This can happen for a variety of reasons but a
> >>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>> index b8c63c3e967f..4f9109fcdded 100644
> >>>> --- a/include/linux/huge_mm.h
> >>>> +++ b/include/linux/huge_mm.h
> >>>> @@ -123,6 +123,9 @@ enum mthp_stat_item {
> >>>>         MTHP_STAT_SHMEM_ALLOC,
> >>>>         MTHP_STAT_SHMEM_FALLBACK,
> >>>>         MTHP_STAT_SHMEM_FALLBACK_CHARGE,
> >>>> +       MTHP_STAT_FILE_ALLOC,
> >>>> +       MTHP_STAT_FILE_FALLBACK,
> >>>> +       MTHP_STAT_FILE_FALLBACK_CHARGE,
> >>>>         MTHP_STAT_SPLIT,
> >>>>         MTHP_STAT_SPLIT_FAILED,
> >>>>         MTHP_STAT_SPLIT_DEFERRED,
> >>>> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> >>>> index 6e2f72d03176..95a147b5d117 100644
> >>>> --- a/include/linux/pagemap.h
> >>>> +++ b/include/linux/pagemap.h
> >>>> @@ -562,14 +562,26 @@ static inline void *detach_page_private(struct page *page)
> >>>>  }
> >>>>
> >>>>  #ifdef CONFIG_NUMA
> >>>> -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
> >>>> +struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
> >>>>  #else
> >>>> -static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> >>>> +static inline struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> >>>>  {
> >>>>         return folio_alloc_noprof(gfp, order);
> >>>>  }
> >>>>  #endif
> >>>>
> >>>> +static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> >>>> +{
> >>>> +       struct folio *folio;
> >>>> +
> >>>> +       folio = __filemap_alloc_folio_noprof(gfp, order);
> >>>> +
> >>>> +       if (!folio)
> >>>> +               count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK);
> >>>> +
> >>>> +       return folio;
> >>>> +}
> >>>
> >>> Do we need to add and export __filemap_alloc_folio_noprof()?
> >>
> >> It is exported. See the below change in filemap.c. Previously
> >> filemap_alloc_folio_noprof() was exported, but that is now an inline and
> >> __filemap_alloc_folio_noprof() is exported in its place.
> >>
> >>> In any case,
> >>> we won't call count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK) and
> >>> will only allocate the folio instead?
> >>
> >> Sorry I don't understand what you mean by this?
> >
> > Ryan, my question is whether exporting __filemap_alloc_folio_noprof() might lead
> > to situations where people call __filemap_alloc_folio_noprof() and
> > forget to call
> > count_mthp_stat(order, MTHP_STAT_FILE_FALLBACK). Currently, it seems
> > that counting is always necessary?
>
> OK, to make sure I've understood: I think you are saying that there is a risk
> that people will call __filemap_alloc_folio_noprof() and bypass the stat
> counting? But it's the same problem with all "_noprof" functions; if those are
> called directly, it bypasses the memory allocation profiling infrastructure. So
> this just needs to be a case of "don't do that" IMHO. filemap_alloc_folio() is
> the public API.

Indeed. Maybe I'm overthinking it.

>
> > Another option is that we still call count_mthp_stat() within
> > filemap_alloc_folio_noprof() and make that function noinline if
> > __filemap_alloc_folio_noprof() is not inline?
>
> I could certainly make filemap_alloc_folio_noprof() always out-of-line and then
> handle the counting privately in the compilation unit. But before my change
> filemap_alloc_folio_noprof() was inline for the !CONFIG_NUMA case and I was
> trying not to change that. I think what you're suggesting would be tidier
> though. I'll make the change. But I don't think it solves the wider problem of
> the possibility that people can call private APIs. That's what review is for.

Agreed.

>
> Thanks,
> Ryan
>
>
> >
> >>
> >>>
> >>>> +
> >>>>  #define filemap_alloc_folio(...)                               \
> >>>>         alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__))
> >>>>
> >>>> diff --git a/mm/filemap.c b/mm/filemap.c
> >>>> index 53d5d0410b51..131d514fca29 100644
> >>>> --- a/mm/filemap.c
> >>>> +++ b/mm/filemap.c
> >>>> @@ -963,6 +963,8 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
> >>>>         int ret;
> >>>>
> >>>>         ret = mem_cgroup_charge(folio, NULL, gfp);
> >>>> +       count_mthp_stat(folio_order(folio),
> >>>> +               ret ? MTHP_STAT_FILE_FALLBACK_CHARGE : MTHP_STAT_FILE_ALLOC);
> >>>>         if (ret)
> >>>>                 return ret;
> >>>
> >>> Would the following be better?
> >>>
> >>>         ret = mem_cgroup_charge(folio, NULL, gfp);
> >>>          if (ret) {
> >>>                  count_mthp_stat(folio_order(folio),
> >>> MTHP_STAT_FILE_FALLBACK_CHARGE);
> >>>                  return ret;
> >>>          }
> >>>        count_mthp_stat(folio_order(folio), MTHP_STAT_FILE_ALLOC);
> >>>
> >>> Anyway, it's up to you. The code just feels a bit off to me :-)
> >>
> >> Yes, agree your version is better. I also noticed that anon and shmem increment
> >> FALLBACK whenever FALLBACK_CHARGE is incremented so I should apply those same
> >> semantics here.
> >>
> >> Thanks,
> >> Ryan
> >>
> >>
> >>>
> >>>>
> >>>> @@ -990,7 +992,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
> >>>>  EXPORT_SYMBOL_GPL(filemap_add_folio);
> >>>>
> >>>>  #ifdef CONFIG_NUMA
> >>>> -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> >>>> +struct folio *__filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> >>>>  {
> >>>>         int n;
> >>>>         struct folio *folio;
> >>>> @@ -1007,7 +1009,7 @@ struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
> >>>>         }
> >>>>         return folio_alloc_noprof(gfp, order);
> >>>>  }
> >>>> -EXPORT_SYMBOL(filemap_alloc_folio_noprof);
> >>>> +EXPORT_SYMBOL(__filemap_alloc_folio_noprof);
> >>>>  #endif
> >>>>
> >>>>  /*
> >>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>> index 578ac212c172..26d558e3e80f 100644
> >>>> --- a/mm/huge_memory.c
> >>>> +++ b/mm/huge_memory.c
> >>>> @@ -608,7 +608,14 @@ static struct attribute_group anon_stats_attr_grp = {
> >>>>         .attrs = anon_stats_attrs,
> >>>>  };
> >>>>
> >>>> +DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC);
> >>>> +DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK);
> >>>> +DEFINE_MTHP_STAT_ATTR(file_fallback_charge, MTHP_STAT_FILE_FALLBACK_CHARGE);
> >>>> +
> >>>>  static struct attribute *file_stats_attrs[] = {
> >>>> +       &file_alloc_attr.attr,
> >>>> +       &file_fallback_attr.attr,
> >>>> +       &file_fallback_charge_attr.attr,
> >>>>  #ifdef CONFIG_SHMEM
> >>>>         &shmem_alloc_attr.attr,
> >>>>         &shmem_fallback_attr.attr,
> >>>> --
> >>>> 2.43.0
> >>>>
> >>>
> >

Thanks
 Barry


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2024-07-24 10:23 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-07-16 13:59 [PATCH v2 0/3] mTHP allocation stats for file-backed memory Ryan Roberts
2024-07-16 13:59 ` [PATCH v2 1/3] mm: Cleanup count_mthp_stat() definition Ryan Roberts
2024-07-16 13:59 ` [PATCH v2 2/3] mm: Tidy up shmem mTHP controls and stats Ryan Roberts
2024-07-22  6:14   ` Baolin Wang
2024-07-22  7:33     ` Ryan Roberts
2024-07-22  8:04       ` Baolin Wang
2024-07-16 13:59 ` [PATCH v2 3/3] mm: mTHP stats for pagecache folio allocations Ryan Roberts
2024-07-19  0:12   ` Barry Song
2024-07-19  8:56     ` Ryan Roberts
2024-07-23 22:07       ` Barry Song
2024-07-24  8:12         ` Ryan Roberts
2024-07-24 10:23           ` Barry Song
