From: Yafang Shao <laoar.shao@gmail.com>
To: akpm@linux-foundation.org, david@redhat.com, ziy@nvidia.com,
baolin.wang@linux.alibaba.com, lorenzo.stoakes@oracle.com,
Liam.Howlett@oracle.com, npache@redhat.com, ryan.roberts@arm.com,
dev.jain@arm.com, hannes@cmpxchg.org, usamaarif642@gmail.com,
gutierrez.asier@huawei-partners.com, willy@infradead.org,
ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org
Cc: bpf@vger.kernel.org, linux-mm@kvack.org,
Yafang Shao <laoar.shao@gmail.com>
Subject: [RFC PATCH v3 4/5] mm: thp: add bpf thp struct ops
Date: Sun, 8 Jun 2025 15:35:15 +0800 [thread overview]
Message-ID: <20250608073516.22415-5-laoar.shao@gmail.com> (raw)
In-Reply-To: <20250608073516.22415-1-laoar.shao@gmail.com>
A new bpf_thp struct ops is introduced to provide finer-grained control
over THP allocation policy. The struct ops includes two APIs for
determining the THP allocator and reclaimer behavior:
- THP allocator
int (*allocator)(unsigned long vm_flags, unsigned long tva_flags);
The BPF program returns either THP_ALLOC_CURRENT or THP_ALLOC_KHUGEPAGED,
indicating whether THP allocation should be performed synchronously
(current task) or asynchronously (khugepaged).
The decision is based on the current task context, VMA flags, and TVA
flags.
- THP reclaimer
int (*reclaimer)(bool vma_madvised);
The BPF program returns either RECLAIMER_CURRENT or RECLAIMER_KSWAPD,
determining whether memory reclamation is handled by the current task or
kswapd.
The decision depends on the current task and VMA flags.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
include/linux/huge_mm.h | 13 +--
mm/Makefile | 3 +
mm/bpf_thp.c | 184 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 190 insertions(+), 10 deletions(-)
create mode 100644 mm/bpf_thp.c
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 6a40ebf25f5c..0d02c9b56a85 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -54,6 +54,7 @@ enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_BPF_ATTACHED, /* BPF prog is attached */
};
struct kobject;
@@ -192,16 +193,8 @@ static inline bool hugepage_global_always(void)
#define THP_ALLOC_KHUGEPAGED (1 << 1)
#define THP_ALLOC_CURRENT (1 << 2)
-static inline int bpf_thp_allocator(unsigned long vm_flags,
- unsigned long tva_flags)
-{
- return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
-}
-
-static inline gfp_t bpf_thp_gfp_mask(bool vma_madvised)
-{
- return 0;
-}
+int bpf_thp_allocator(unsigned long vm_flags, unsigned long tva_flags);
+gfp_t bpf_thp_gfp_mask(bool vma_madvised);
static inline int highest_order(unsigned long orders)
{
diff --git a/mm/Makefile b/mm/Makefile
index 1a7a11d4933d..e5f41cf3fd61 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -99,6 +99,9 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
+ifdef CONFIG_BPF_SYSCALL
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += bpf_thp.o
+endif
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
diff --git a/mm/bpf_thp.c b/mm/bpf_thp.c
new file mode 100644
index 000000000000..894d6cb93107
--- /dev/null
+++ b/mm/bpf_thp.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/huge_mm.h>
+#include <linux/khugepaged.h>
+
+#define RECLAIMER_CURRENT (1 << 1)
+#define RECLAIMER_KSWAPD (1 << 2)
+#define RECLAIMER_BOTH (RECLAIMER_CURRENT | RECLAIMER_KSWAPD)
+
+struct bpf_thp_ops {
+ /**
+ * @allocator: Specifies whether the THP allocation is performed
+ * by the current task or by khugepaged.
+ * @vm_flags: Flags for the VMA in the current allocation context
+ * @tva_flags: Flags for the TVA in the current allocation context
+ *
+ * Rerurn:
+ * - THP_ALLOC_CURRENT: THP was allocated synchronously by the calling
+ * task's context.
+ * - THP_ALLOC_KHUGEPAGED: THP was allocated asynchronously by the
+ * khugepaged kernel thread.
+ * - 0: THP allocation is disallowed in the current context.
+ */
+ int (*allocator)(unsigned long vm_flags, unsigned long tva_flags);
+ /**
+ * @reclaimer: Specifies the entity performing page reclaim:
+ * - current task context
+ * - kswapd
+ * - none (no reclaim)
+ * @vma_madvised: MADV flags for this VMA (e.g., MADV_HUGEPAGE, MADV_NOHUGEPAGE)
+ *
+ * Return:
+ * - RECLAIMER_CURRENT: Direct reclaim by the current task if THP
+ * allocation fails.
+ * - RECLAIMER_KSWAPD: Wake kswapd to reclaim memory if THP allocation fails.
+ * - RECLAIMER_ALL: Both current and kswapd will perform the reclaim
+ * - 0: No reclaim will be attempted.
+ */
+ int (*reclaimer)(bool vma_madvised);
+};
+
+static struct bpf_thp_ops bpf_thp;
+
+int bpf_thp_allocator(unsigned long vm_flags, unsigned long tva_flags)
+{
+ int allocator;
+
+ /* No BPF program is attached */
+ if (!(transparent_hugepage_flags & (1<<TRANSPARENT_HUGEPAGE_BPF_ATTACHED)))
+ return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
+
+ if (current_is_khugepaged())
+ return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
+ if (!bpf_thp.allocator)
+ return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
+
+ allocator = bpf_thp.allocator(vm_flags, tva_flags);
+ if (!allocator)
+ return 0;
+ /* invalid return value */
+ if (allocator & ~(THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT))
+ return THP_ALLOC_KHUGEPAGED | THP_ALLOC_CURRENT;
+ return allocator;
+}
+
+gfp_t bpf_thp_gfp_mask(bool vma_madvised)
+{
+ int reclaimer;
+
+ if (!(transparent_hugepage_flags & (1<<TRANSPARENT_HUGEPAGE_BPF_ATTACHED)))
+ return 0;
+
+ if (!bpf_thp.reclaimer)
+ return 0;
+
+ reclaimer = bpf_thp.reclaimer(vma_madvised);
+ switch (reclaimer) {
+ case RECLAIMER_CURRENT:
+ return GFP_TRANSHUGE | __GFP_NORETRY;
+ case RECLAIMER_KSWAPD:
+ return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+ case RECLAIMER_BOTH:
+ return GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM | __GFP_NORETRY;
+ default:
+ return 0;
+ }
+}
+
+static bool bpf_thp_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static const struct bpf_func_proto *
+bpf_thp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id, prog);
+}
+
+static const struct bpf_verifier_ops thp_bpf_verifier_ops = {
+ .get_func_proto = bpf_thp_get_func_proto,
+ .is_valid_access = bpf_thp_ops_is_valid_access,
+};
+
+static int bpf_thp_reg(void *kdata, struct bpf_link *link)
+{
+ struct bpf_thp_ops *ops = kdata;
+
+ /* TODO: add support for multiple attaches */
+ if (test_and_set_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
+ &transparent_hugepage_flags))
+ return -EOPNOTSUPP;
+ bpf_thp.allocator = ops->allocator;
+ bpf_thp.reclaimer = ops->reclaimer;
+ return 0;
+}
+
+static void bpf_thp_unreg(void *kdata, struct bpf_link *link)
+{
+ clear_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, &transparent_hugepage_flags);
+ bpf_thp.allocator = NULL;
+ bpf_thp.reclaimer = NULL;
+}
+
+static int bpf_thp_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ return 0;
+}
+
+static int bpf_thp_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ return 0;
+}
+
+static int bpf_thp_init(struct btf *btf)
+{
+ return 0;
+}
+
+static int allocator(unsigned long vm_flags, unsigned long tva_flags)
+{
+ return 0;
+}
+
+static int reclaimer(bool vma_madvised)
+{
+ return 0;
+}
+
+static struct bpf_thp_ops __bpf_thp_ops = {
+ .allocator = allocator,
+ .reclaimer = reclaimer,
+};
+
+static struct bpf_struct_ops bpf_bpf_thp_ops = {
+ .verifier_ops = &thp_bpf_verifier_ops,
+ .init = bpf_thp_init,
+ .check_member = bpf_thp_check_member,
+ .init_member = bpf_thp_init_member,
+ .reg = bpf_thp_reg,
+ .unreg = bpf_thp_unreg,
+ .name = "bpf_thp_ops",
+ .cfi_stubs = &__bpf_thp_ops,
+ .owner = THIS_MODULE,
+};
+
+static int __init bpf_thp_ops_init(void)
+{
+ int err = register_bpf_struct_ops(&bpf_bpf_thp_ops, bpf_thp_ops);
+
+ if (err)
+ pr_err("bpf_thp: Failed to register struct_ops (%d)\n", err);
+ return err;
+}
+late_initcall(bpf_thp_ops_init);
--
2.43.5
next prev parent reply other threads:[~2025-06-08 7:36 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-06-08 7:35 [RFC PATCH v3 0/5] mm, bpf: BPF based THP adjustment Yafang Shao
2025-06-08 7:35 ` [RFC PATCH v3 1/5] mm, thp: use __thp_vma_allowable_orders() in khugepaged_enter_vma() Yafang Shao
2025-07-17 14:48 ` Usama Arif
2025-07-20 2:37 ` Yafang Shao
2025-06-08 7:35 ` [RFC PATCH v3 2/5] mm, thp: add bpf thp hook to determine thp allocator Yafang Shao
2025-07-17 15:30 ` Usama Arif
2025-07-20 3:00 ` Yafang Shao
2025-06-08 7:35 ` [RFC PATCH v3 3/5] mm, thp: add bpf thp hook to determine thp reclaimer Yafang Shao
2025-07-17 16:06 ` Usama Arif
2025-07-20 3:03 ` Yafang Shao
2025-06-08 7:35 ` Yafang Shao [this message]
2025-07-17 16:25 ` [RFC PATCH v3 4/5] mm: thp: add bpf thp struct ops Usama Arif
2025-07-17 18:21 ` Amery Hung
2025-07-20 3:07 ` Yafang Shao
2025-06-08 7:35 ` [RFC PATCH v3 5/5] selftests/bpf: Add selftest for THP adjustment Yafang Shao
2025-07-15 22:42 ` [RFC PATCH v3 0/5] mm, bpf: BPF based " David Hildenbrand
2025-07-17 3:09 ` Yafang Shao
2025-07-17 8:52 ` David Hildenbrand
2025-07-17 9:05 ` Lorenzo Stoakes
2025-07-20 2:32 ` Yafang Shao
2025-07-20 15:56 ` David Hildenbrand
2025-07-22 2:40 ` Yafang Shao
2025-07-22 7:28 ` David Hildenbrand
2025-07-22 10:09 ` Lorenzo Stoakes
2025-07-22 11:56 ` Yafang Shao
2025-07-22 12:04 ` Lorenzo Stoakes
2025-07-22 12:16 ` Yafang Shao
2025-07-22 11:46 ` Yafang Shao
2025-07-22 11:54 ` Lorenzo Stoakes
2025-07-22 12:02 ` Yafang Shao
2025-07-22 12:08 ` Lorenzo Stoakes
2025-07-17 16:35 ` Usama Arif
2025-07-20 2:54 ` Yafang Shao
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250608073516.22415-5-laoar.shao@gmail.com \
--to=laoar.shao@gmail.com \
--cc=Liam.Howlett@oracle.com \
--cc=akpm@linux-foundation.org \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=baolin.wang@linux.alibaba.com \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=david@redhat.com \
--cc=dev.jain@arm.com \
--cc=gutierrez.asier@huawei-partners.com \
--cc=hannes@cmpxchg.org \
--cc=linux-mm@kvack.org \
--cc=lorenzo.stoakes@oracle.com \
--cc=npache@redhat.com \
--cc=ryan.roberts@arm.com \
--cc=usamaarif642@gmail.com \
--cc=willy@infradead.org \
--cc=ziy@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox