From: Yafang Shao <laoar.shao@gmail.com>
To: akpm@linux-foundation.org, david@redhat.com, ziy@nvidia.com,
baolin.wang@linux.alibaba.com, lorenzo.stoakes@oracle.com,
Liam.Howlett@oracle.com, npache@redhat.com, ryan.roberts@arm.com,
dev.jain@arm.com, hannes@cmpxchg.org, usamaarif642@gmail.com,
gutierrez.asier@huawei-partners.com, willy@infradead.org,
ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org
Cc: bpf@vger.kernel.org, linux-mm@kvack.org,
Yafang Shao <laoar.shao@gmail.com>
Subject: [RFC PATCH v2 1/5] mm: thp: Add a new mode "bpf"
Date: Tue, 20 May 2025 14:04:59 +0800 [thread overview]
Message-ID: <20250520060504.20251-2-laoar.shao@gmail.com> (raw)
In-Reply-To: <20250520060504.20251-1-laoar.shao@gmail.com>
Background
----------
Historically, our production environment has always configured THP to "never"
due to past incidents. This has made system administrators hesitant to
switch to "madvise".
New Motivation
--------------
We’ve now identified that AI workloads can achieve significant performance
gains with THP enabled. To balance safety and performance, we aim to allow
THP only for AI services while keeping the global system setting at "never".
Proposed Solution
-----------------
Johannes suggested introducing a dedicated mode for this use case [0]. This
approach elegantly solves our problem while avoiding the complexity of
managing BPF alongside other THP modes.
Link: https://lore.kernel.org/linux-mm/20250509164654.GA608090@cmpxchg.org/ [0]
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
include/linux/huge_mm.h | 2 ++
mm/huge_memory.c | 65 ++++++++++++++++++++++++++++++++++++-----
2 files changed, 59 insertions(+), 8 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e893d546a49f..3b5429f73e6e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -54,6 +54,7 @@ enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_REQ_BPF_FLAG, /* "bpf" mode */
};
struct kobject;
@@ -174,6 +175,7 @@ static inline void count_mthp_stat(int order, enum mthp_stat_item item)
extern unsigned long transparent_hugepage_flags;
extern unsigned long huge_anon_orders_always;
+extern unsigned long huge_anon_orders_bpf;
extern unsigned long huge_anon_orders_madvise;
extern unsigned long huge_anon_orders_inherit;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 47d76d03ce30..8af56ee8d979 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -79,6 +79,7 @@ static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
+unsigned long huge_anon_orders_bpf __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
static bool anon_orders_configured __initdata;
@@ -297,12 +298,15 @@ static ssize_t enabled_show(struct kobject *kobj,
const char *output;
if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
- output = "[always] madvise never";
+ output = "[always] bpf madvise never";
+ else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_BPF_FLAG,
+ &transparent_hugepage_flags))
+ output = "always [bpf] madvise never";
else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
&transparent_hugepage_flags))
- output = "always [madvise] never";
+ output = "always bpf [madvise] never";
else
- output = "always madvise [never]";
+ output = "always bpf madvise [never]";
return sysfs_emit(buf, "%s\n", output);
}
@@ -315,13 +319,20 @@ static ssize_t enabled_store(struct kobject *kobj,
if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_BPF_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ } else if (sysfs_streq(buf, "bpf")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_REQ_BPF_FLAG, &transparent_hugepage_flags);
} else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_BPF_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
} else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_BPF_FLAG, &transparent_hugepage_flags);
} else
ret = -EINVAL;
@@ -495,13 +506,15 @@ static ssize_t anon_enabled_show(struct kobject *kobj,
const char *output;
if (test_bit(order, &huge_anon_orders_always))
- output = "[always] inherit madvise never";
+ output = "[always] bpf inherit madvise never";
+ else if (test_bit(order, &huge_anon_orders_bpf))
+ output = "always [bpf] inherit madvise never";
else if (test_bit(order, &huge_anon_orders_inherit))
- output = "always [inherit] madvise never";
+ output = "always bpf [inherit] madvise never";
else if (test_bit(order, &huge_anon_orders_madvise))
- output = "always inherit [madvise] never";
+ output = "always bpf inherit [madvise] never";
else
- output = "always inherit madvise [never]";
+ output = "always bpf inherit madvise [never]";
return sysfs_emit(buf, "%s\n", output);
}
@@ -515,25 +528,36 @@ static ssize_t anon_enabled_store(struct kobject *kobj,
if (sysfs_streq(buf, "always")) {
spin_lock(&huge_anon_orders_lock);
+ clear_bit(order, &huge_anon_orders_bpf);
clear_bit(order, &huge_anon_orders_inherit);
clear_bit(order, &huge_anon_orders_madvise);
set_bit(order, &huge_anon_orders_always);
spin_unlock(&huge_anon_orders_lock);
+ } else if (sysfs_streq(buf, "bpf")) {
+ spin_lock(&huge_anon_orders_lock);
+ clear_bit(order, &huge_anon_orders_always);
+ clear_bit(order, &huge_anon_orders_inherit);
+ clear_bit(order, &huge_anon_orders_madvise);
+ set_bit(order, &huge_anon_orders_bpf);
+ spin_unlock(&huge_anon_orders_lock);
} else if (sysfs_streq(buf, "inherit")) {
spin_lock(&huge_anon_orders_lock);
clear_bit(order, &huge_anon_orders_always);
+ clear_bit(order, &huge_anon_orders_bpf);
clear_bit(order, &huge_anon_orders_madvise);
set_bit(order, &huge_anon_orders_inherit);
spin_unlock(&huge_anon_orders_lock);
} else if (sysfs_streq(buf, "madvise")) {
spin_lock(&huge_anon_orders_lock);
clear_bit(order, &huge_anon_orders_always);
+ clear_bit(order, &huge_anon_orders_bpf);
clear_bit(order, &huge_anon_orders_inherit);
set_bit(order, &huge_anon_orders_madvise);
spin_unlock(&huge_anon_orders_lock);
} else if (sysfs_streq(buf, "never")) {
spin_lock(&huge_anon_orders_lock);
clear_bit(order, &huge_anon_orders_always);
+ clear_bit(order, &huge_anon_orders_bpf);
clear_bit(order, &huge_anon_orders_inherit);
clear_bit(order, &huge_anon_orders_madvise);
spin_unlock(&huge_anon_orders_lock);
@@ -943,10 +967,22 @@ static int __init setup_transparent_hugepage(char *str)
&transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
&transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_BPF_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "bpf")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_REQ_BPF_FLAG,
+ &transparent_hugepage_flags);
ret = 1;
} else if (!strcmp(str, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
&transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_BPF_FLAG,
+ &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
&transparent_hugepage_flags);
ret = 1;
@@ -955,6 +991,8 @@ static int __init setup_transparent_hugepage(char *str)
&transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
&transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_BPF_FLAG,
+ &transparent_hugepage_flags);
ret = 1;
}
out:
@@ -967,8 +1005,8 @@ __setup("transparent_hugepage=", setup_transparent_hugepage);
static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_anon(char *str)
{
+ unsigned long always, bpf, inherit, madvise;
char *token, *range, *policy, *subtoken;
- unsigned long always, inherit, madvise;
char *start_size, *end_size;
int start, end, nr;
char *p;
@@ -978,6 +1016,7 @@ static int __init setup_thp_anon(char *str)
strscpy(str_dup, str);
always = huge_anon_orders_always;
+ bpf = huge_anon_orders_bpf;
madvise = huge_anon_orders_madvise;
inherit = huge_anon_orders_inherit;
p = str_dup;
@@ -1019,18 +1058,27 @@ static int __init setup_thp_anon(char *str)
bitmap_set(&always, start, nr);
bitmap_clear(&inherit, start, nr);
bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&bpf, start, nr);
+ } else if (!strcmp(policy, "bpf")) {
+ bitmap_set(&bpf, start, nr);
+ bitmap_clear(&inherit, start, nr);
+ bitmap_clear(&always, start, nr);
+ bitmap_clear(&madvise, start, nr);
} else if (!strcmp(policy, "madvise")) {
bitmap_set(&madvise, start, nr);
bitmap_clear(&inherit, start, nr);
bitmap_clear(&always, start, nr);
+ bitmap_clear(&bpf, start, nr);
} else if (!strcmp(policy, "inherit")) {
bitmap_set(&inherit, start, nr);
bitmap_clear(&madvise, start, nr);
bitmap_clear(&always, start, nr);
+ bitmap_clear(&bpf, start, nr);
} else if (!strcmp(policy, "never")) {
bitmap_clear(&inherit, start, nr);
bitmap_clear(&madvise, start, nr);
bitmap_clear(&always, start, nr);
+ bitmap_clear(&bpf, start, nr);
} else {
pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
goto err;
@@ -1041,6 +1089,7 @@ static int __init setup_thp_anon(char *str)
huge_anon_orders_always = always;
huge_anon_orders_madvise = madvise;
huge_anon_orders_inherit = inherit;
+ huge_anon_orders_bpf = bpf;
anon_orders_configured = true;
return 1;
--
2.43.5
next prev parent reply other threads:[~2025-05-20 6:06 UTC|newest]
Thread overview: 52+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-05-20 6:04 [RFC PATCH v2 0/5] mm, bpf: BPF based THP adjustment Yafang Shao
2025-05-20 6:04 ` Yafang Shao [this message]
2025-05-20 6:05 ` [RFC PATCH v2 2/5] mm: thp: Add hook for " Yafang Shao
2025-05-20 6:05 ` [RFC PATCH v2 3/5] mm: thp: add struct ops " Yafang Shao
2025-05-20 6:05 ` [RFC PATCH v2 4/5] bpf: Add get_current_comm to bpf_base_func_proto Yafang Shao
2025-05-20 23:32 ` Andrii Nakryiko
2025-05-20 6:05 ` [RFC PATCH v2 5/5] selftests/bpf: Add selftest for THP adjustment Yafang Shao
2025-05-20 6:52 ` [RFC PATCH v2 0/5] mm, bpf: BPF based " Nico Pache
2025-05-20 7:25 ` Yafang Shao
2025-05-20 13:10 ` Matthew Wilcox
2025-05-20 14:08 ` Yafang Shao
2025-05-20 14:22 ` Lorenzo Stoakes
2025-05-20 14:32 ` Usama Arif
2025-05-20 14:35 ` Lorenzo Stoakes
2025-05-20 14:42 ` Matthew Wilcox
2025-05-20 14:56 ` David Hildenbrand
2025-05-21 4:28 ` Yafang Shao
2025-05-20 14:46 ` Usama Arif
2025-05-20 15:00 ` David Hildenbrand
2025-05-20 9:43 ` David Hildenbrand
2025-05-20 9:49 ` Lorenzo Stoakes
2025-05-20 12:06 ` Yafang Shao
2025-05-20 13:45 ` Lorenzo Stoakes
2025-05-20 15:54 ` David Hildenbrand
2025-05-21 4:02 ` Yafang Shao
2025-05-21 3:52 ` Yafang Shao
2025-05-20 11:59 ` Yafang Shao
2025-05-25 3:01 ` Yafang Shao
2025-05-26 7:41 ` Gutierrez Asier
2025-05-26 9:37 ` Yafang Shao
2025-05-26 8:14 ` David Hildenbrand
2025-05-26 9:37 ` Yafang Shao
2025-05-26 10:49 ` David Hildenbrand
2025-05-26 14:53 ` Liam R. Howlett
2025-05-26 15:54 ` Liam R. Howlett
2025-05-26 16:51 ` David Hildenbrand
2025-05-26 17:07 ` Liam R. Howlett
2025-05-26 17:12 ` David Hildenbrand
2025-05-26 20:30 ` Gutierrez Asier
2025-05-26 20:37 ` David Hildenbrand
2025-05-27 5:46 ` Yafang Shao
2025-05-27 7:57 ` David Hildenbrand
2025-05-27 8:13 ` Yafang Shao
2025-05-27 8:30 ` David Hildenbrand
2025-05-27 8:40 ` Yafang Shao
2025-05-27 9:27 ` David Hildenbrand
2025-05-27 9:43 ` Yafang Shao
2025-05-27 12:19 ` David Hildenbrand
2025-05-28 2:04 ` Yafang Shao
2025-05-28 20:32 ` David Hildenbrand
2025-05-26 14:32 ` Zi Yan
2025-05-27 5:53 ` Yafang Shao
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250520060504.20251-2-laoar.shao@gmail.com \
--to=laoar.shao@gmail.com \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=david@redhat.com \
--cc=dev.jain@arm.com \
--cc=gutierrez.asier@huawei-partners.com \
--cc=hannes@cmpxchg.org \
--cc=linux-mm@kvack.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=npache@redhat.com \
--cc=ryan.roberts@arm.com \
--cc=usamaarif642@gmail.com \
--cc=willy@infradead.org \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox