* [PATCH v8 mm-new 01/12] mm: thp: remove disabled task from khugepaged_mm_slot
2025-09-26 9:33 [PATCH v8 mm-new 00/12] mm, bpf: BPF based THP order selection Yafang Shao
@ 2025-09-26 9:33 ` Yafang Shao
2025-09-26 14:11 ` Usama Arif
2025-09-26 9:33 ` [PATCH v8 mm-new 02/12] mm: thp: remove vm_flags parameter from khugepaged_enter_vma() Yafang Shao
` (10 subsequent siblings)
11 siblings, 1 reply; 24+ messages in thread
From: Yafang Shao @ 2025-09-26 9:33 UTC (permalink / raw)
To: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, usamaarif642,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yafang Shao
Since a task with MMF_DISABLE_THP_COMPLETELY cannot use THP, remove it from
the khugepaged_mm_slot to stop khugepaged from processing it.
After this change, the following semantic relationship always holds:
MMF_VM_HUGEPAGE is set == task is in khugepaged mm_slot
MMF_VM_HUGEPAGE is not set == task is not in khugepaged mm_slot
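For illustration (not part of this patch), the userspace sequence that
exercises this path is the existing PR_SET_THP_DISABLE prctl; a minimal
sketch:

  #include <sys/prctl.h>

  int main(void)
  {
          /* Disable THP completely for this mm; khugepaged will drop the
           * mm from its mm_slot list and clear MMF_VM_HUGEPAGE. */
          prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0);

          /* ... run with THP disabled ... */

          /* Clear the disable; with this patch khugepaged_enter_mm()
           * re-registers the mm (provided PMD-sized THP is enabled) so
           * khugepaged can consider it again. */
          prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0);
          return 0;
  }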
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Lance Yang <lance.yang@linux.dev>
---
include/linux/khugepaged.h | 4 ++++
kernel/sys.c | 7 ++++--
mm/khugepaged.c | 49 ++++++++++++++++++++------------------
3 files changed, 35 insertions(+), 25 deletions(-)
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index eb1946a70cff..f14680cd9854 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -15,6 +15,7 @@ extern void __khugepaged_enter(struct mm_struct *mm);
extern void __khugepaged_exit(struct mm_struct *mm);
extern void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags);
+extern void khugepaged_enter_mm(struct mm_struct *mm);
extern void khugepaged_min_free_kbytes_update(void);
extern bool current_is_khugepaged(void);
extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
@@ -42,6 +43,9 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
}
+static inline void khugepaged_enter_mm(struct mm_struct *mm)
+{
+}
static inline int collapse_pte_mapped_thp(struct mm_struct *mm,
unsigned long addr, bool install_pmd)
{
diff --git a/kernel/sys.c b/kernel/sys.c
index a46d9b75880b..2c445bf44ce3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,6 +8,7 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
+#include <linux/khugepaged.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/reboot.h>
@@ -2479,7 +2480,7 @@ static int prctl_set_thp_disable(bool thp_disable, unsigned long flags,
/* Flags are only allowed when disabling. */
if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED))
return -EINVAL;
- if (mmap_write_lock_killable(current->mm))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
if (thp_disable) {
if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) {
@@ -2493,7 +2494,9 @@ static int prctl_set_thp_disable(bool thp_disable, unsigned long flags,
mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm);
mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
}
- mmap_write_unlock(current->mm);
+
+ khugepaged_enter_mm(mm);
+ mmap_write_unlock(mm);
return 0;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 7ab2d1a42df3..f47ac8c19447 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -396,15 +396,10 @@ void __init khugepaged_destroy(void)
kmem_cache_destroy(mm_slot_cache);
}
-static inline int hpage_collapse_test_exit(struct mm_struct *mm)
-{
- return atomic_read(&mm->mm_users) == 0;
-}
-
static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
- return hpage_collapse_test_exit(mm) ||
- mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
+ return !atomic_read(&mm->mm_users) || /* exit */
+ mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); /* disable */
}
static bool hugepage_pmd_enabled(void)
@@ -437,7 +432,7 @@ void __khugepaged_enter(struct mm_struct *mm)
int wakeup;
/* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
+ VM_WARN_ON_ONCE(hpage_collapse_test_exit_or_disable(mm));
if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
return;
@@ -460,14 +455,25 @@ void __khugepaged_enter(struct mm_struct *mm)
wake_up_interruptible(&khugepaged_wait);
}
+void khugepaged_enter_mm(struct mm_struct *mm)
+{
+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
+ return;
+ if (mm_flags_test(MMF_VM_HUGEPAGE, mm))
+ return;
+ if (!hugepage_pmd_enabled())
+ return;
+
+ __khugepaged_enter(mm);
+}
+
void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
- hugepage_pmd_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
- __khugepaged_enter(vma->vm_mm);
- }
+ if (!thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
+ return;
+
+ khugepaged_enter_mm(vma->vm_mm);
}
void __khugepaged_exit(struct mm_struct *mm)
@@ -491,7 +497,7 @@ void __khugepaged_exit(struct mm_struct *mm)
} else if (slot) {
/*
* This is required to serialize against
- * hpage_collapse_test_exit() (which is guaranteed to run
+ * hpage_collapse_test_exit_or_disable() (which is guaranteed to run
* under mmap sem read mode). Stop here (after we return all
* pagetables will be destroyed) until khugepaged has finished
* working on the pagetables under the mmap_lock.
@@ -1429,16 +1435,13 @@ static void collect_mm_slot(struct mm_slot *slot)
lockdep_assert_held(&khugepaged_mm_lock);
- if (hpage_collapse_test_exit(mm)) {
+ if (hpage_collapse_test_exit_or_disable(mm)) {
/* free mm_slot */
hash_del(&slot->hash);
list_del(&slot->mm_node);
- /*
- * Not strictly needed because the mm exited already.
- *
- * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
- */
+ /* If the mm is disabled, this flag must be cleared. */
+ mm_flags_clear(MMF_VM_HUGEPAGE, mm);
/* khugepaged_mm_lock actually not necessary for the below */
mm_slot_free(mm_slot_cache, slot);
@@ -1749,7 +1752,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
continue;
- if (hpage_collapse_test_exit(mm))
+ if (hpage_collapse_test_exit_or_disable(mm))
continue;
/*
* When a vma is registered with uffd-wp, we cannot recycle
@@ -2500,9 +2503,9 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
VM_BUG_ON(khugepaged_scan.mm_slot != slot);
/*
* Release the current mm_slot if this mm is about to die, or
- * if we scanned all vmas of this mm.
+ * if we scanned all vmas of this mm, or if this mm is disabled.
*/
- if (hpage_collapse_test_exit(mm) || !vma) {
+ if (hpage_collapse_test_exit_or_disable(mm) || !vma) {
/*
* Make sure that if mm_users is reaching zero while
* khugepaged runs here, khugepaged_exit will find
--
2.47.3
* Re: [PATCH v8 mm-new 01/12] mm: thp: remove disabled task from khugepaged_mm_slot
2025-09-26 9:33 ` [PATCH v8 mm-new 01/12] mm: thp: remove disabled task from khugepaged_mm_slot Yafang Shao
@ 2025-09-26 14:11 ` Usama Arif
2025-09-28 2:21 ` Yafang Shao
0 siblings, 1 reply; 24+ messages in thread
From: Usama Arif @ 2025-09-26 14:11 UTC (permalink / raw)
To: Yafang Shao, akpm, david, ziy, baolin.wang, lorenzo.stoakes,
Liam.Howlett, npache, ryan.roberts, dev.jain, hannes,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel
On 26/09/2025 10:33, Yafang Shao wrote:
> Since a task with MMF_DISABLE_THP_COMPLETELY cannot use THP, remove it from
> the khugepaged_mm_slot to stop khugepaged from processing it.
>
> After this change, the following semantic relationship always holds:
>
> MMF_VM_HUGEPAGE is set == task is in khugepaged mm_slot
> MMF_VM_HUGEPAGE is not set == task is not in khugepaged mm_slot
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> Acked-by: Lance Yang <lance.yang@linux.dev>
> ---
> include/linux/khugepaged.h | 4 ++++
> kernel/sys.c | 7 ++++--
> mm/khugepaged.c | 49 ++++++++++++++++++++------------------
> 3 files changed, 35 insertions(+), 25 deletions(-)
>
Hi Yafang,
Thanks for the patch! Sorry I wasn't able to review the previous revisions.
I think it would be good to separate this patch out of the series?
It would make the review of this series shorter and this patch can be merged independently.
In the commit message, we also need to write explicitly that when prctl
PR_SET_THP_DISABLE is cleared, the mm is added back for khugepaged to consider.
Could you also mention in the commit message why the BUG was turned into WARN?
Thanks!
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index eb1946a70cff..f14680cd9854 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -15,6 +15,7 @@ extern void __khugepaged_enter(struct mm_struct *mm);
> extern void __khugepaged_exit(struct mm_struct *mm);
> extern void khugepaged_enter_vma(struct vm_area_struct *vma,
> vm_flags_t vm_flags);
> +extern void khugepaged_enter_mm(struct mm_struct *mm);
> extern void khugepaged_min_free_kbytes_update(void);
> extern bool current_is_khugepaged(void);
> extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
> @@ -42,6 +43,9 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
> vm_flags_t vm_flags)
> {
> }
> +static inline void khugepaged_enter_mm(struct mm_struct *mm)
> +{
> +}
> static inline int collapse_pte_mapped_thp(struct mm_struct *mm,
> unsigned long addr, bool install_pmd)
> {
> diff --git a/kernel/sys.c b/kernel/sys.c
> index a46d9b75880b..2c445bf44ce3 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -8,6 +8,7 @@
> #include <linux/export.h>
> #include <linux/mm.h>
> #include <linux/mm_inline.h>
> +#include <linux/khugepaged.h>
> #include <linux/utsname.h>
> #include <linux/mman.h>
> #include <linux/reboot.h>
> @@ -2479,7 +2480,7 @@ static int prctl_set_thp_disable(bool thp_disable, unsigned long flags,
> /* Flags are only allowed when disabling. */
> if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED))
> return -EINVAL;
> - if (mmap_write_lock_killable(current->mm))
> + if (mmap_write_lock_killable(mm))
> return -EINTR;
> if (thp_disable) {
> if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) {
> @@ -2493,7 +2494,9 @@ static int prctl_set_thp_disable(bool thp_disable, unsigned long flags,
> mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm);
> mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
> }
> - mmap_write_unlock(current->mm);
> +
> + khugepaged_enter_mm(mm);
> + mmap_write_unlock(mm);
> return 0;
> }
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 7ab2d1a42df3..f47ac8c19447 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -396,15 +396,10 @@ void __init khugepaged_destroy(void)
> kmem_cache_destroy(mm_slot_cache);
> }
>
> -static inline int hpage_collapse_test_exit(struct mm_struct *mm)
> -{
> - return atomic_read(&mm->mm_users) == 0;
> -}
> -
> static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
> {
> - return hpage_collapse_test_exit(mm) ||
> - mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
> + return !atomic_read(&mm->mm_users) || /* exit */
> + mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); /* disable */
> }
>
> static bool hugepage_pmd_enabled(void)
> @@ -437,7 +432,7 @@ void __khugepaged_enter(struct mm_struct *mm)
> int wakeup;
>
> /* __khugepaged_exit() must not run from under us */
> - VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
> + VM_WARN_ON_ONCE(hpage_collapse_test_exit_or_disable(mm));
> if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
> return;
>
> @@ -460,14 +455,25 @@ void __khugepaged_enter(struct mm_struct *mm)
> wake_up_interruptible(&khugepaged_wait);
> }
>
> +void khugepaged_enter_mm(struct mm_struct *mm)
> +{
> + if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
> + return;
> + if (mm_flags_test(MMF_VM_HUGEPAGE, mm))
> + return;
> + if (!hugepage_pmd_enabled())
> + return;
> +
> + __khugepaged_enter(mm);
> +}
> +
> void khugepaged_enter_vma(struct vm_area_struct *vma,
> vm_flags_t vm_flags)
> {
> - if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
> - hugepage_pmd_enabled()) {
> - if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
> - __khugepaged_enter(vma->vm_mm);
> - }
> + if (!thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
> + return;
> +
> + khugepaged_enter_mm(vma->vm_mm);
> }
>
> void __khugepaged_exit(struct mm_struct *mm)
> @@ -491,7 +497,7 @@ void __khugepaged_exit(struct mm_struct *mm)
> } else if (slot) {
> /*
> * This is required to serialize against
> - * hpage_collapse_test_exit() (which is guaranteed to run
> + * hpage_collapse_test_exit_or_disable() (which is guaranteed to run
> * under mmap sem read mode). Stop here (after we return all
> * pagetables will be destroyed) until khugepaged has finished
> * working on the pagetables under the mmap_lock.
> @@ -1429,16 +1435,13 @@ static void collect_mm_slot(struct mm_slot *slot)
>
> lockdep_assert_held(&khugepaged_mm_lock);
>
> - if (hpage_collapse_test_exit(mm)) {
> + if (hpage_collapse_test_exit_or_disable(mm)) {
> /* free mm_slot */
> hash_del(&slot->hash);
> list_del(&slot->mm_node);
>
> - /*
> - * Not strictly needed because the mm exited already.
> - *
> - * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
> - */
> + /* If the mm is disabled, this flag must be cleared. */
> + mm_flags_clear(MMF_VM_HUGEPAGE, mm);
>
> /* khugepaged_mm_lock actually not necessary for the below */
> mm_slot_free(mm_slot_cache, slot);
> @@ -1749,7 +1752,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
> continue;
>
> - if (hpage_collapse_test_exit(mm))
> + if (hpage_collapse_test_exit_or_disable(mm))
> continue;
> /*
> * When a vma is registered with uffd-wp, we cannot recycle
> @@ -2500,9 +2503,9 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> VM_BUG_ON(khugepaged_scan.mm_slot != slot);
> /*
> * Release the current mm_slot if this mm is about to die, or
> - * if we scanned all vmas of this mm.
> + * if we scanned all vmas of this mm, or if this mm is disabled.
> */
> - if (hpage_collapse_test_exit(mm) || !vma) {
> + if (hpage_collapse_test_exit_or_disable(mm) || !vma) {
> /*
> * Make sure that if mm_users is reaching zero while
> * khugepaged runs here, khugepaged_exit will find
* Re: [PATCH v8 mm-new 01/12] mm: thp: remove disabled task from khugepaged_mm_slot
2025-09-26 14:11 ` Usama Arif
@ 2025-09-28 2:21 ` Yafang Shao
0 siblings, 0 replies; 24+ messages in thread
From: Yafang Shao @ 2025-09-28 2:21 UTC (permalink / raw)
To: Usama Arif
Cc: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, gutierrez.asier, willy,
ast, daniel, andrii, ameryhung, rientjes, corbet, 21cnbao,
shakeel.butt, tj, lance.yang, bpf, linux-mm, linux-doc,
linux-kernel
On Fri, Sep 26, 2025 at 10:11 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 26/09/2025 10:33, Yafang Shao wrote:
> > Since a task with MMF_DISABLE_THP_COMPLETELY cannot use THP, remove it from
> > the khugepaged_mm_slot to stop khugepaged from processing it.
> >
> > After this change, the following semantic relationship always holds:
> >
> > MMF_VM_HUGEPAGE is set == task is in khugepaged mm_slot
> > MMF_VM_HUGEPAGE is not set == task is not in khugepaged mm_slot
> >
> > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > Acked-by: Lance Yang <lance.yang@linux.dev>
> > ---
> > include/linux/khugepaged.h | 4 ++++
> > kernel/sys.c | 7 ++++--
> > mm/khugepaged.c | 49 ++++++++++++++++++++------------------
> > 3 files changed, 35 insertions(+), 25 deletions(-)
> >
>
>
> Hi Yafang,
>
> Thanks for the patch! Sorry wasnt able to review the previous revisions.
>
> I think it would be good to separate this patch out of the series?
Right. This commit is independent of this series. I will send it separately.
> It would make the review of this series shorter and this patch can be merged independently.
>
> In the commit message, we also need to write explicitly that when prctl
> PR_SET_THP_DISABLE is cleared, the mm is added back for khugepaged to consider.
ack.
>
> Could you also mention in the commit message why the BUG was turned into WARN?
>
I believe we should replace VM_BUG_ON_MM() with VM_WARN_ON_MM() [0]. A
warning is sufficient for debugging in this context.
[0] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/deprecated.rst#n32
--
Regards
Yafang
* [PATCH v8 mm-new 02/12] mm: thp: remove vm_flags parameter from khugepaged_enter_vma()
2025-09-26 9:33 [PATCH v8 mm-new 00/12] mm, bpf: BPF based THP order selection Yafang Shao
2025-09-26 9:33 ` [PATCH v8 mm-new 01/12] mm: thp: remove disabled task from khugepaged_mm_slot Yafang Shao
@ 2025-09-26 9:33 ` Yafang Shao
2025-09-26 14:49 ` Usama Arif
2025-09-26 9:33 ` [PATCH v8 mm-new 03/12] mm: thp: remove vm_flags parameter from thp_vma_allowable_order() Yafang Shao
` (9 subsequent siblings)
11 siblings, 1 reply; 24+ messages in thread
From: Yafang Shao @ 2025-09-26 9:33 UTC (permalink / raw)
To: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, usamaarif642,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yafang Shao, Yang Shi
The khugepaged_enter_vma() function requires handling in two specific
scenarios:
1. New VMA creation
When a new VMA is created, if vma->vm_mm is not present in
khugepaged_mm_slot, it must be added. In this case,
khugepaged_enter_vma() is called after vma->vm_flags have been set,
allowing direct use of the VMA's flags.
2. VMA flag modification
When vma->vm_flags are modified (particularly when VM_HUGEPAGE is set),
the system must recheck whether to add vma->vm_mm to khugepaged_mm_slot.
Currently, khugepaged_enter_vma() is called before the flag update, so
the call must be relocated to occur after vma->vm_flags have been set.
Additionally, khugepaged_enter_vma() is invoked in other contexts, such as
during VMA merging. However, these calls are unnecessary because the
existing VMA already ensures that vma->vm_mm is registered in
khugepaged_mm_slot. While removing these redundant calls represents a
potential optimization, that change should be addressed separately.
Because VMA merging only occurs when the vm_flags of both VMAs are
identical (excluding special flags like VM_SOFTDIRTY), we can safely use
target->vm_flags instead.
After this change, we can further remove vm_flags parameter from
thp_vma_allowable_order(). That will be handled in a followup patch.
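For illustration (not part of this patch), scenario 2 above corresponds to
the plain madvise() path; a minimal sketch:

  #include <stddef.h>
  #include <sys/mman.h>

  /* Mark a region as a THP candidate. With this patch, the mm is
   * registered with khugepaged from madvise_vma_behavior() right after
   * vma->vm_flags gains VM_HUGEPAGE, rather than from hugepage_madvise()
   * before the VMA flags are updated. */
  static void hint_thp(void *addr, size_t len)
  {
          madvise(addr, len, MADV_HUGEPAGE);
  }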
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
---
include/linux/khugepaged.h | 6 ++----
mm/huge_memory.c | 2 +-
mm/khugepaged.c | 11 ++---------
mm/madvise.c | 7 +++++++
mm/vma.c | 6 +++---
5 files changed, 15 insertions(+), 17 deletions(-)
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index f14680cd9854..b30814d3d665 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -13,8 +13,7 @@ extern void khugepaged_destroy(void);
extern int start_stop_khugepaged(void);
extern void __khugepaged_enter(struct mm_struct *mm);
extern void __khugepaged_exit(struct mm_struct *mm);
-extern void khugepaged_enter_vma(struct vm_area_struct *vma,
- vm_flags_t vm_flags);
+extern void khugepaged_enter_vma(struct vm_area_struct *vma);
extern void khugepaged_enter_mm(struct mm_struct *mm);
extern void khugepaged_min_free_kbytes_update(void);
extern bool current_is_khugepaged(void);
@@ -39,8 +38,7 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm
static inline void khugepaged_exit(struct mm_struct *mm)
{
}
-static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
- vm_flags_t vm_flags)
+static inline void khugepaged_enter_vma(struct vm_area_struct *vma)
{
}
static inline void khugepaged_enter_mm(struct mm_struct *mm)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1b81680b4225..ac6601f30e65 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1346,7 +1346,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
ret = vmf_anon_prepare(vmf);
if (ret)
return ret;
- khugepaged_enter_vma(vma, vma->vm_flags);
+ khugepaged_enter_vma(vma);
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f47ac8c19447..04121ae7d18d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -353,12 +353,6 @@ int hugepage_madvise(struct vm_area_struct *vma,
#endif
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
- /*
- * If the vma become good for khugepaged to scan,
- * register it here without waiting a page fault that
- * may not happen any time soon.
- */
- khugepaged_enter_vma(vma, *vm_flags);
break;
case MADV_NOHUGEPAGE:
*vm_flags &= ~VM_HUGEPAGE;
@@ -467,10 +461,9 @@ void khugepaged_enter_mm(struct mm_struct *mm)
__khugepaged_enter(mm);
}
-void khugepaged_enter_vma(struct vm_area_struct *vma,
- vm_flags_t vm_flags)
+void khugepaged_enter_vma(struct vm_area_struct *vma)
{
- if (!thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
return;
khugepaged_enter_mm(vma->vm_mm);
diff --git a/mm/madvise.c b/mm/madvise.c
index 35ed4ab0d7c5..ab8b5d47badb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1425,6 +1425,13 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK);
error = madvise_update_vma(new_flags, madv_behavior);
+ /*
+ * If the vma become good for khugepaged to scan,
+ * register it here without waiting a page fault that
+ * may not happen any time soon.
+ */
+ if (!error && new_flags & VM_HUGEPAGE)
+ khugepaged_enter_mm(vma->vm_mm);
out:
/*
* madvise() returns EAGAIN if kernel resources, such as
diff --git a/mm/vma.c b/mm/vma.c
index a1ec405bda25..6a548b0d64cd 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -973,7 +973,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
if (err || commit_merge(vmg))
goto abort;
- khugepaged_enter_vma(vmg->target, vmg->vm_flags);
+ khugepaged_enter_vma(vmg->target);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
@@ -1093,7 +1093,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* following VMA if we have VMAs on both sides.
*/
if (vmg->target && !vma_expand(vmg)) {
- khugepaged_enter_vma(vmg->target, vmg->vm_flags);
+ khugepaged_enter_vma(vmg->target);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
}
@@ -2520,7 +2520,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
* call covers the non-merge case.
*/
if (!vma_is_anonymous(vma))
- khugepaged_enter_vma(vma, map->vm_flags);
+ khugepaged_enter_vma(vma);
*vmap = vma;
return 0;
--
2.47.3
* Re: [PATCH v8 mm-new 02/12] mm: thp: remove vm_flags parameter from khugepaged_enter_vma()
2025-09-26 9:33 ` [PATCH v8 mm-new 02/12] mm: thp: remove vm_flags parameter from khugepaged_enter_vma() Yafang Shao
@ 2025-09-26 14:49 ` Usama Arif
2025-09-28 2:35 ` Yafang Shao
0 siblings, 1 reply; 24+ messages in thread
From: Usama Arif @ 2025-09-26 14:49 UTC (permalink / raw)
To: Yafang Shao, akpm, david, ziy, baolin.wang, lorenzo.stoakes,
Liam.Howlett, npache, ryan.roberts, dev.jain, hannes,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yang Shi
On 26/09/2025 10:33, Yafang Shao wrote:
> The khugepaged_enter_vma() function requires handling in two specific
> scenarios:
> 1. New VMA creation
> When a new VMA is created, if vma->vm_mm is not present in
> khugepaged_mm_slot, it must be added. In this case,
> khugepaged_enter_vma() is called after vma->vm_flags have been set,
> allowing direct use of the VMA's flags.
> 2. VMA flag modification
> When vma->vm_flags are modified (particularly when VM_HUGEPAGE is set),
> the system must recheck whether to add vma->vm_mm to khugepaged_mm_slot.
> Currently, khugepaged_enter_vma() is called before the flag update, so
> the call must be relocated to occur after vma->vm_flags have been set.
>
> Additionally, khugepaged_enter_vma() is invoked in other contexts, such as
> during VMA merging. However, these calls are unnecessary because the
> existing VMA already ensures that vma->vm_mm is registered in
> khugepaged_mm_slot. While removing these redundant calls represents a
> potential optimization, that change should be addressed separately.
> Because VMA merging only occurs when the vm_flags of both VMAs are
> identical (excluding special flags like VM_SOFTDIRTY), we can safely use
> target->vm_flags instead.
>
The patch looks good to me, but if we are sure that khugepaged_enter_vma()
is not needed in the VMA merging case, we should remove it in this patch itself.
If the reason we are changing which flags are considered when calling
khugepaged_enter_vma() in the VMA merging case is that those calls are
unnecessary anyway, then we should just remove the calls rather than modify them
(if it's safe and functionally correct :))
> After this change, we can further remove vm_flags parameter from
> thp_vma_allowable_order(). That will be handled in a followup patch.
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> Cc: Yang Shi <shy828301@gmail.com>
> ---
> include/linux/khugepaged.h | 6 ++----
> mm/huge_memory.c | 2 +-
> mm/khugepaged.c | 11 ++---------
> mm/madvise.c | 7 +++++++
> mm/vma.c | 6 +++---
> 5 files changed, 15 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index f14680cd9854..b30814d3d665 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -13,8 +13,7 @@ extern void khugepaged_destroy(void);
> extern int start_stop_khugepaged(void);
> extern void __khugepaged_enter(struct mm_struct *mm);
> extern void __khugepaged_exit(struct mm_struct *mm);
> -extern void khugepaged_enter_vma(struct vm_area_struct *vma,
> - vm_flags_t vm_flags);
> +extern void khugepaged_enter_vma(struct vm_area_struct *vma);
> extern void khugepaged_enter_mm(struct mm_struct *mm);
> extern void khugepaged_min_free_kbytes_update(void);
> extern bool current_is_khugepaged(void);
> @@ -39,8 +38,7 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm
> static inline void khugepaged_exit(struct mm_struct *mm)
> {
> }
> -static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
> - vm_flags_t vm_flags)
> +static inline void khugepaged_enter_vma(struct vm_area_struct *vma)
> {
> }
> static inline void khugepaged_enter_mm(struct mm_struct *mm)
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 1b81680b4225..ac6601f30e65 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1346,7 +1346,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> ret = vmf_anon_prepare(vmf);
> if (ret)
> return ret;
> - khugepaged_enter_vma(vma, vma->vm_flags);
> + khugepaged_enter_vma(vma);
>
> if (!(vmf->flags & FAULT_FLAG_WRITE) &&
> !mm_forbids_zeropage(vma->vm_mm) &&
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index f47ac8c19447..04121ae7d18d 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -353,12 +353,6 @@ int hugepage_madvise(struct vm_area_struct *vma,
> #endif
> *vm_flags &= ~VM_NOHUGEPAGE;
> *vm_flags |= VM_HUGEPAGE;
> - /*
> - * If the vma become good for khugepaged to scan,
> - * register it here without waiting a page fault that
> - * may not happen any time soon.
> - */
> - khugepaged_enter_vma(vma, *vm_flags);
> break;
> case MADV_NOHUGEPAGE:
> *vm_flags &= ~VM_HUGEPAGE;
> @@ -467,10 +461,9 @@ void khugepaged_enter_mm(struct mm_struct *mm)
> __khugepaged_enter(mm);
> }
>
> -void khugepaged_enter_vma(struct vm_area_struct *vma,
> - vm_flags_t vm_flags)
> +void khugepaged_enter_vma(struct vm_area_struct *vma)
> {
> - if (!thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
> + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
> return;
>
> khugepaged_enter_mm(vma->vm_mm);
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 35ed4ab0d7c5..ab8b5d47badb 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -1425,6 +1425,13 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
> VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK);
>
> error = madvise_update_vma(new_flags, madv_behavior);
> + /*
> + * If the vma become good for khugepaged to scan,
> + * register it here without waiting a page fault that
> + * may not happen any time soon.
> + */
> + if (!error && new_flags & VM_HUGEPAGE)
> + khugepaged_enter_mm(vma->vm_mm);
> out:
> /*
> * madvise() returns EAGAIN if kernel resources, such as
> diff --git a/mm/vma.c b/mm/vma.c
> index a1ec405bda25..6a548b0d64cd 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -973,7 +973,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
> if (err || commit_merge(vmg))
> goto abort;
>
> - khugepaged_enter_vma(vmg->target, vmg->vm_flags);
> + khugepaged_enter_vma(vmg->target);
> vmg->state = VMA_MERGE_SUCCESS;
> return vmg->target;
>
> @@ -1093,7 +1093,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
> * following VMA if we have VMAs on both sides.
> */
> if (vmg->target && !vma_expand(vmg)) {
> - khugepaged_enter_vma(vmg->target, vmg->vm_flags);
> + khugepaged_enter_vma(vmg->target);
> vmg->state = VMA_MERGE_SUCCESS;
> return vmg->target;
> }
> @@ -2520,7 +2520,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
> * call covers the non-merge case.
> */
> if (!vma_is_anonymous(vma))
> - khugepaged_enter_vma(vma, map->vm_flags);
> + khugepaged_enter_vma(vma);
> *vmap = vma;
> return 0;
>
* Re: [PATCH v8 mm-new 02/12] mm: thp: remove vm_flags parameter from khugepaged_enter_vma()
2025-09-26 14:49 ` Usama Arif
@ 2025-09-28 2:35 ` Yafang Shao
0 siblings, 0 replies; 24+ messages in thread
From: Yafang Shao @ 2025-09-28 2:35 UTC (permalink / raw)
To: Usama Arif
Cc: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, gutierrez.asier, willy,
ast, daniel, andrii, ameryhung, rientjes, corbet, 21cnbao,
shakeel.butt, tj, lance.yang, bpf, linux-mm, linux-doc,
linux-kernel, Yang Shi
On Fri, Sep 26, 2025 at 10:50 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 26/09/2025 10:33, Yafang Shao wrote:
> > The khugepaged_enter_vma() function requires handling in two specific
> > scenarios:
> > 1. New VMA creation
> > When a new VMA is created, if vma->vm_mm is not present in
> > khugepaged_mm_slot, it must be added. In this case,
> > khugepaged_enter_vma() is called after vma->vm_flags have been set,
> > allowing direct use of the VMA's flags.
> > 2. VMA flag modification
> > When vma->vm_flags are modified (particularly when VM_HUGEPAGE is set),
> > the system must recheck whether to add vma->vm_mm to khugepaged_mm_slot.
> > Currently, khugepaged_enter_vma() is called before the flag update, so
> > the call must be relocated to occur after vma->vm_flags have been set.
> >
> > Additionally, khugepaged_enter_vma() is invoked in other contexts, such as
> > during VMA merging. However, these calls are unnecessary because the
> > existing VMA already ensures that vma->vm_mm is registered in
> > khugepaged_mm_slot. While removing these redundant calls represents a
> > potential optimization, that change should be addressed separately.
> > Because VMA merging only occurs when the vm_flags of both VMAs are
> > identical (excluding special flags like VM_SOFTDIRTY), we can safely use
> > target->vm_flags instead.
> >
>
> The patch looks good to me, but if we are sure that khugepaged_enter_vma
> is not needed in VMA merging case,
Calling khugepaged_enter_vma() is unnecessary during VMA merging
because it's already handled: for non-anonymous VMAs, it's called upon
creation, and for anonymous VMAs, it's handled at page fault.
> we should remove it in this patch itself.
I'd prefer to handle this cleanup separately. The goal is to keep the
THP changes minimal, even though I've already made significant
modifications ;-)
> If the reason we are removing what flags are being considered when calling
> khugepaged_enter_vma in VMA merging case is because the calls are unnecessary,
Actually, the rationale is that the flags can be removed because:
Because VMA merging only occurs when the vm_flags of both VMAs are
identical (excluding special flags like VM_SOFTDIRTY), we can safely use
target->vm_flags instead.
I will update the commit log to clarify this point.
> then we should just remove the calls and not modify them
> (if its safe and functionally correct :))
--
Regards
Yafang
* [PATCH v8 mm-new 03/12] mm: thp: remove vm_flags parameter from thp_vma_allowable_order()
2025-09-26 9:33 [PATCH v8 mm-new 00/12] mm, bpf: BPF based THP order selection Yafang Shao
2025-09-26 9:33 ` [PATCH v8 mm-new 01/12] mm: thp: remove disabled task from khugepaged_mm_slot Yafang Shao
2025-09-26 9:33 ` [PATCH v8 mm-new 02/12] mm: thp: remove vm_flags parameter from khugepaged_enter_vma() Yafang Shao
@ 2025-09-26 9:33 ` Yafang Shao
2025-09-26 14:54 ` Usama Arif
2025-09-26 9:33 ` [PATCH v8 mm-new 04/12] mm: thp: add support for BPF based THP order selection Yafang Shao
` (8 subsequent siblings)
11 siblings, 1 reply; 24+ messages in thread
From: Yafang Shao @ 2025-09-26 9:33 UTC (permalink / raw)
To: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, usamaarif642,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yafang Shao
Because all calls to thp_vma_allowable_order() pass vma->vm_flags as the
vma_flags argument, we can remove the parameter and have the function
access vma->vm_flags directly.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
fs/proc/task_mmu.c | 3 +--
include/linux/huge_mm.h | 16 ++++++++--------
mm/huge_memory.c | 4 ++--
mm/khugepaged.c | 10 +++++-----
mm/memory.c | 11 +++++------
mm/shmem.c | 2 +-
6 files changed, 22 insertions(+), 24 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fc35a0543f01..e713d1905750 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1369,8 +1369,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %8u\n",
- !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS,
- THP_ORDERS_ALL));
+ !!thp_vma_allowable_orders(vma, TVA_SMAPS, THP_ORDERS_ALL));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f327d62fc985..a635dcbb2b99 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -101,8 +101,8 @@ enum tva_type {
TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). */
};
-#define thp_vma_allowable_order(vma, vm_flags, type, order) \
- (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order)))
+#define thp_vma_allowable_order(vma, type, order) \
+ (!!thp_vma_allowable_orders(vma, type, BIT(order)))
#define split_folio(f) split_folio_to_list(f, NULL)
@@ -266,14 +266,12 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
}
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
- vm_flags_t vm_flags,
enum tva_type type,
unsigned long orders);
/**
* thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
* @vma: the vm area to check
- * @vm_flags: use these vm_flags instead of vma->vm_flags
* @type: TVA type
* @orders: bitfield of all orders to consider
*
@@ -287,10 +285,11 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
*/
static inline
unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
- vm_flags_t vm_flags,
enum tva_type type,
unsigned long orders)
{
+ vm_flags_t vm_flags = vma->vm_flags;
+
/*
* Optimization to check if required orders are enabled early. Only
* forced collapse ignores sysfs configs.
@@ -309,7 +308,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
return 0;
}
- return __thp_vma_allowable_orders(vma, vm_flags, type, orders);
+ return __thp_vma_allowable_orders(vma, type, orders);
}
struct thpsize {
@@ -329,8 +328,10 @@ struct thpsize {
* through madvise or prctl.
*/
static inline bool vma_thp_disabled(struct vm_area_struct *vma,
- vm_flags_t vm_flags, bool forced_collapse)
+ bool forced_collapse)
{
+ vm_flags_t vm_flags = vma->vm_flags;
+
/* Are THPs disabled for this VMA? */
if (vm_flags & VM_NOHUGEPAGE)
return true;
@@ -560,7 +561,6 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
}
static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
- vm_flags_t vm_flags,
enum tva_type type,
unsigned long orders)
{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ac6601f30e65..1ac476fe6dc5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -98,7 +98,6 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
}
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
- vm_flags_t vm_flags,
enum tva_type type,
unsigned long orders)
{
@@ -106,6 +105,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
const bool in_pf = type == TVA_PAGEFAULT;
const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
unsigned long supported_orders;
+ vm_flags_t vm_flags = vma->vm_flags;
/* Check the intersection of requested and supported orders. */
if (vma_is_anonymous(vma))
@@ -122,7 +122,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!vma->vm_mm) /* vdso */
return 0;
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, forced_collapse))
return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 04121ae7d18d..9eeb868adcd3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -463,7 +463,7 @@ void khugepaged_enter_mm(struct mm_struct *mm)
void khugepaged_enter_vma(struct vm_area_struct *vma)
{
- if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, TVA_KHUGEPAGED, PMD_ORDER))
return;
khugepaged_enter_mm(vma->vm_mm);
@@ -915,7 +915,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, type, PMD_ORDER))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
@@ -1526,7 +1526,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* and map it by a PMD, regardless of sysfs THP settings. As such, let's
* analogously elide sysfs THP settings here and force collapse.
*/
- if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, TVA_FORCED_COLLAPSE, PMD_ORDER))
return SCAN_VMA_CHECK;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2421,7 +2421,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
progress++;
break;
}
- if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
+ if (!thp_vma_allowable_order(vma, TVA_KHUGEPAGED, PMD_ORDER)) {
skip:
progress++;
continue;
@@ -2752,7 +2752,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, TVA_FORCED_COLLAPSE, PMD_ORDER))
return -EINVAL;
cc = kmalloc(sizeof(*cc), GFP_KERNEL);
diff --git a/mm/memory.c b/mm/memory.c
index 7e32eb79ba99..cd04e4894725 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4558,7 +4558,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
* Get a list of all the (large) orders below PMD_ORDER that are enabled
* and suitable for swapping THP.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+ orders = thp_vma_allowable_orders(vma, TVA_PAGEFAULT,
BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
orders = thp_swap_suitable_orders(swp_offset(entry),
@@ -5107,7 +5107,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
* for this vma. Then filter out the orders that can't be allocated over
* the faulting address and still be fully contained in the vma.
*/
- orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
+ orders = thp_vma_allowable_orders(vma, TVA_PAGEFAULT,
BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
@@ -5379,7 +5379,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa
* PMD mappings if THPs are disabled. As we already have a THP,
* behave as if we are forcing a collapse.
*/
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags,
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma,
/* forced_collapse=*/ true))
return ret;
@@ -6280,7 +6280,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
.gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
- vm_flags_t vm_flags = vma->vm_flags;
pgd_t *pgd;
p4d_t *p4d;
vm_fault_t ret;
@@ -6295,7 +6294,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return VM_FAULT_OOM;
retry_pud:
if (pud_none(*vmf.pud) &&
- thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) {
+ thp_vma_allowable_order(vma, TVA_PAGEFAULT, PUD_ORDER)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -6329,7 +6328,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
goto retry_pud;
if (pmd_none(*vmf.pmd) &&
- thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
+ thp_vma_allowable_order(vma, TVA_PAGEFAULT, PMD_ORDER)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index 4855eee22731..cc2c90656b66 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1780,7 +1780,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
unsigned int global_orders;
- if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force)))
+ if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, shmem_huge_force)))
return 0;
global_orders = shmem_huge_global_enabled(inode, index, write_end,
--
2.47.3
* Re: [PATCH v8 mm-new 03/12] mm: thp: remove vm_flags parameter from thp_vma_allowable_order()
2025-09-26 9:33 ` [PATCH v8 mm-new 03/12] mm: thp: remove vm_flags parameter from thp_vma_allowable_order() Yafang Shao
@ 2025-09-26 14:54 ` Usama Arif
0 siblings, 0 replies; 24+ messages in thread
From: Usama Arif @ 2025-09-26 14:54 UTC (permalink / raw)
To: Yafang Shao, akpm, david, ziy, baolin.wang, lorenzo.stoakes,
Liam.Howlett, npache, ryan.roberts, dev.jain, hannes,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel
On 26/09/2025 10:33, Yafang Shao wrote:
> Because all calls to thp_vma_allowable_order() pass vma->vm_flags as the
> vma_flags argument, we can remove the parameter and have the function
> access vma->vm_flags directly.
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
> fs/proc/task_mmu.c | 3 +--
> include/linux/huge_mm.h | 16 ++++++++--------
> mm/huge_memory.c | 4 ++--
> mm/khugepaged.c | 10 +++++-----
> mm/memory.c | 11 +++++------
> mm/shmem.c | 2 +-
> 6 files changed, 22 insertions(+), 24 deletions(-)
>
Acked-by: Usama Arif <usamaarif642@gmail.com>
* [PATCH v8 mm-new 04/12] mm: thp: add support for BPF based THP order selection
2025-09-26 9:33 [PATCH v8 mm-new 00/12] mm, bpf: BPF based THP order selection Yafang Shao
` (2 preceding siblings ...)
2025-09-26 9:33 ` [PATCH v8 mm-new 03/12] mm: thp: remove vm_flags parameter from thp_vma_allowable_order() Yafang Shao
@ 2025-09-26 9:33 ` Yafang Shao
2025-09-26 15:13 ` Usama Arif
2025-09-26 9:33 ` [PATCH v8 mm-new 05/12] mm: thp: decouple THP allocation between swap and page fault paths Yafang Shao
` (7 subsequent siblings)
11 siblings, 1 reply; 24+ messages in thread
From: Yafang Shao @ 2025-09-26 9:33 UTC (permalink / raw)
To: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, usamaarif642,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yafang Shao
This patch introduces a new BPF struct_ops called bpf_thp_ops for dynamic
THP tuning. It includes a hook bpf_hook_thp_get_order(), allowing BPF
programs to influence THP order selection based on factors such as:
- Workload identity
For example, workloads running in specific containers or cgroups.
- Allocation context
Whether the allocation occurs during a page fault, khugepaged, swap or
other paths.
- VMA's memory advice settings
MADV_HUGEPAGE or MADV_NOHUGEPAGE
- Memory pressure
PSI system data or associated cgroup PSI metrics
The kernel API of this new BPF hook is as follows,
/**
* thp_order_fn_t: Get the suggested THP order from a BPF program for allocation
* @vma: vm_area_struct associated with the THP allocation
* @type: TVA type for current @vma
* @orders: Bitmask of available THP orders for this allocation
*
* Return: The suggested THP order for allocation from the BPF program. Must be
* a valid, available order.
*/
typedef int thp_order_fn_t(struct vm_area_struct *vma,
enum tva_type type,
unsigned long orders);
Only a single BPF program can be attached at any given time, though it can
be dynamically updated to adjust the policy. The implementation supports
anonymous THP, shmem THP, and mTHP, with future extensions planned for
file-backed THP.
This functionality is only active when system-wide THP is configured to
madvise or always mode. It remains disabled in never mode. Additionally,
if THP is explicitly disabled for a specific task via prctl(), this BPF
functionality will also be unavailable for that task.
This BPF hook enables the implementation of flexible THP allocation
policies at the system, per-cgroup, or per-task level.
This feature requires CONFIG_BPF_THP_GET_ORDER_EXPERIMENTAL to be
enabled. Note that this capability is currently unstable and may undergo
significant changes—including potential removal—in future kernel versions.
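
For reference, a minimal sketch (not part of this patch) of what a policy
program could look like; it assumes a vmlinux.h generated from a kernel
with this series applied (for struct bpf_thp_ops and enum tva_type) and
assumes PMD_ORDER is 9 (x86-64 with 4K base pages):

  // SPDX-License-Identifier: GPL-2.0
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char _license[] SEC("license") = "GPL";

  #define PMD_ORDER 9 /* assumption: x86-64, 4K base pages */

  /* Prefer PMD-sized THP whenever it is among the available orders,
   * otherwise fall back to the largest order still allowed. */
  SEC("struct_ops/thp_get_order")
  int BPF_PROG(thp_get_order, struct vm_area_struct *vma,
               enum tva_type type, unsigned long orders)
  {
          if (orders & (1UL << PMD_ORDER))
                  return PMD_ORDER;
          return orders ? 63 - __builtin_clzl(orders) : -1;
  }

  SEC(".struct_ops.link")
  struct bpf_thp_ops thp_policy = {
          .thp_get_order = (void *)thp_get_order,
  };

Loading and attaching would go through the usual libbpf struct_ops flow
(bpf_map__attach_struct_ops() on the thp_policy map); detaching the link
restores the default order selection.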
Suggested-by: David Hildenbrand <david@redhat.com>
Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
MAINTAINERS | 1 +
include/linux/huge_mm.h | 23 +++++
mm/Kconfig | 12 +++
mm/Makefile | 1 +
mm/huge_memory_bpf.c | 204 ++++++++++++++++++++++++++++++++++++++++
5 files changed, 241 insertions(+)
create mode 100644 mm/huge_memory_bpf.c
diff --git a/MAINTAINERS b/MAINTAINERS
index ca8e3d18eedd..7be34b2a64fd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16257,6 +16257,7 @@ F: include/linux/huge_mm.h
F: include/linux/khugepaged.h
F: include/trace/events/huge_memory.h
F: mm/huge_memory.c
+F: mm/huge_memory_bpf.c
F: mm/khugepaged.c
F: mm/mm_slot.h
F: tools/testing/selftests/mm/khugepaged.c
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a635dcbb2b99..fea94c059bed 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -56,6 +56,7 @@ enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_BPF_ATTACHED, /* BPF prog is attached */
};
struct kobject;
@@ -269,6 +270,23 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
enum tva_type type,
unsigned long orders);
+#ifdef CONFIG_BPF_THP_GET_ORDER_EXPERIMENTAL
+
+unsigned long
+bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type,
+ unsigned long orders);
+
+#else
+
+static inline unsigned long
+bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type,
+ unsigned long orders)
+{
+ return orders;
+}
+
+#endif
+
/**
* thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
* @vma: the vm area to check
@@ -290,6 +308,11 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
{
vm_flags_t vm_flags = vma->vm_flags;
+ /* The BPF-specified order overrides which order is selected. */
+ orders &= bpf_hook_thp_get_orders(vma, type, orders);
+ if (!orders)
+ return 0;
+
/*
* Optimization to check if required orders are enabled early. Only
* forced collapse ignores sysfs configs.
diff --git a/mm/Kconfig b/mm/Kconfig
index bde9f842a4a8..fd7459eecb2d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -895,6 +895,18 @@ config NO_PAGE_MAPCOUNT
EXPERIMENTAL because the impact of some changes is still unclear.
+config BPF_THP_GET_ORDER_EXPERIMENTAL
+ bool "BPF-based THP order selection (EXPERIMENTAL)"
+ depends on TRANSPARENT_HUGEPAGE && BPF_SYSCALL
+
+ help
+ Enable dynamic THP order selection using BPF programs. This
+ experimental feature allows custom BPF logic to determine optimal
+ transparent hugepage allocation sizes at runtime.
+
+ WARNING: This feature is unstable and may change in future kernel
+ versions.
+
endif # TRANSPARENT_HUGEPAGE
# simple helper to make the code a bit easier to read
diff --git a/mm/Makefile b/mm/Makefile
index 21abb3353550..62ebfa23635a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
+obj-$(CONFIG_BPF_THP_GET_ORDER_EXPERIMENTAL) += huge_memory_bpf.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
diff --git a/mm/huge_memory_bpf.c b/mm/huge_memory_bpf.c
new file mode 100644
index 000000000000..b59a65d70a93
--- /dev/null
+++ b/mm/huge_memory_bpf.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF-based THP policy management
+ *
+ * Author: Yafang Shao <laoar.shao@gmail.com>
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/huge_mm.h>
+#include <linux/khugepaged.h>
+
+/**
+ * @thp_order_fn_t: Get the suggested THP order from a BPF program for allocation
+ * @vma: vm_area_struct associated with the THP allocation
+ * @type: TVA type for current @vma
+ * @orders: Bitmask of available THP orders for this allocation
+ *
+ * Return: The suggested THP order for allocation from the BPF program. Must be
+ * a valid, available order.
+ */
+typedef int thp_order_fn_t(struct vm_area_struct *vma,
+ enum tva_type type,
+ unsigned long orders);
+
+struct bpf_thp_ops {
+ thp_order_fn_t __rcu *thp_get_order;
+};
+
+static struct bpf_thp_ops bpf_thp;
+static DEFINE_SPINLOCK(thp_ops_lock);
+
+unsigned long bpf_hook_thp_get_orders(struct vm_area_struct *vma,
+ enum tva_type type,
+ unsigned long orders)
+{
+ thp_order_fn_t *bpf_hook_thp_get_order;
+ int bpf_order;
+
+ /* No BPF program is attached */
+ if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
+ &transparent_hugepage_flags))
+ return orders;
+
+ rcu_read_lock();
+ bpf_hook_thp_get_order = rcu_dereference(bpf_thp.thp_get_order);
+ if (!bpf_hook_thp_get_order)
+ goto out;
+
+ bpf_order = bpf_hook_thp_get_order(vma, type, orders);
+ orders &= BIT(bpf_order);
+
+out:
+ rcu_read_unlock();
+ return orders;
+}
+
+static bool bpf_thp_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static const struct bpf_func_proto *
+bpf_thp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return bpf_base_func_proto(func_id, prog);
+}
+
+static const struct bpf_verifier_ops thp_bpf_verifier_ops = {
+ .get_func_proto = bpf_thp_get_func_proto,
+ .is_valid_access = bpf_thp_ops_is_valid_access,
+};
+
+static int bpf_thp_init(struct btf *btf)
+{
+ return 0;
+}
+
+static int bpf_thp_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ /* The call site operates under RCU protection. */
+ if (prog->sleepable)
+ return -EINVAL;
+ return 0;
+}
+
+static int bpf_thp_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ return 0;
+}
+
+static int bpf_thp_reg(void *kdata, struct bpf_link *link)
+{
+ struct bpf_thp_ops *ops = kdata;
+
+ spin_lock(&thp_ops_lock);
+ if (test_and_set_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
+ &transparent_hugepage_flags)) {
+ spin_unlock(&thp_ops_lock);
+ return -EBUSY;
+ }
+ WARN_ON_ONCE(rcu_access_pointer(bpf_thp.thp_get_order));
+ rcu_assign_pointer(bpf_thp.thp_get_order, ops->thp_get_order);
+ spin_unlock(&thp_ops_lock);
+ return 0;
+}
+
+static void bpf_thp_unreg(void *kdata, struct bpf_link *link)
+{
+ thp_order_fn_t *old_fn;
+
+ spin_lock(&thp_ops_lock);
+ clear_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, &transparent_hugepage_flags);
+ old_fn = rcu_replace_pointer(bpf_thp.thp_get_order, NULL,
+ lockdep_is_held(&thp_ops_lock));
+ WARN_ON_ONCE(!old_fn);
+ spin_unlock(&thp_ops_lock);
+
+ synchronize_rcu();
+}
+
+static int bpf_thp_update(void *kdata, void *old_kdata, struct bpf_link *link)
+{
+ thp_order_fn_t *old_fn, *new_fn;
+ struct bpf_thp_ops *old = old_kdata;
+ struct bpf_thp_ops *ops = kdata;
+ int ret = 0;
+
+ if (!ops || !old)
+ return -EINVAL;
+
+ spin_lock(&thp_ops_lock);
+ /* The prog has already been removed. */
+ if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
+ &transparent_hugepage_flags)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ new_fn = rcu_dereference(ops->thp_get_order);
+ old_fn = rcu_replace_pointer(bpf_thp.thp_get_order, new_fn,
+ lockdep_is_held(&thp_ops_lock));
+ WARN_ON_ONCE(!old_fn || !new_fn);
+
+out:
+ spin_unlock(&thp_ops_lock);
+ if (!ret)
+ synchronize_rcu();
+ return ret;
+}
+
+static int bpf_thp_validate(void *kdata)
+{
+ struct bpf_thp_ops *ops = kdata;
+
+ if (!ops->thp_get_order) {
+ pr_err("bpf_thp: required ops isn't implemented\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int bpf_thp_get_order(struct vm_area_struct *vma,
+ enum tva_type type,
+ unsigned long orders)
+{
+ return -1;
+}
+
+static struct bpf_thp_ops __bpf_thp_ops = {
+ .thp_get_order = (thp_order_fn_t __rcu *)bpf_thp_get_order,
+};
+
+static struct bpf_struct_ops bpf_bpf_thp_ops = {
+ .verifier_ops = &thp_bpf_verifier_ops,
+ .init = bpf_thp_init,
+ .check_member = bpf_thp_check_member,
+ .init_member = bpf_thp_init_member,
+ .reg = bpf_thp_reg,
+ .unreg = bpf_thp_unreg,
+ .update = bpf_thp_update,
+ .validate = bpf_thp_validate,
+ .cfi_stubs = &__bpf_thp_ops,
+ .owner = THIS_MODULE,
+ .name = "bpf_thp_ops",
+};
+
+static int __init bpf_thp_ops_init(void)
+{
+ int err;
+
+ err = register_bpf_struct_ops(&bpf_bpf_thp_ops, bpf_thp_ops);
+ if (err)
+ pr_err("bpf_thp: Failed to register struct_ops (%d)\n", err);
+ return err;
+}
+late_initcall(bpf_thp_ops_init);
--
2.47.3
* Re: [PATCH v8 mm-new 04/12] mm: thp: add support for BPF based THP order selection
2025-09-26 9:33 ` [PATCH v8 mm-new 04/12] mm: thp: add support for BPF based THP order selection Yafang Shao
@ 2025-09-26 15:13 ` Usama Arif
2025-09-26 19:17 ` Randy Dunlap
2025-09-28 2:13 ` Yafang Shao
0 siblings, 2 replies; 24+ messages in thread
From: Usama Arif @ 2025-09-26 15:13 UTC (permalink / raw)
To: Yafang Shao, akpm, david, ziy, baolin.wang, lorenzo.stoakes,
Liam.Howlett, npache, ryan.roberts, dev.jain, hannes,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel
On 26/09/2025 10:33, Yafang Shao wrote:
> This patch introduces a new BPF struct_ops called bpf_thp_ops for dynamic
> THP tuning. It includes a hook bpf_hook_thp_get_order(), allowing BPF
> programs to influence THP order selection based on factors such as:
> - Workload identity
> For example, workloads running in specific containers or cgroups.
> - Allocation context
> Whether the allocation occurs during a page fault, khugepaged, swap or
> other paths.
> - VMA's memory advice settings
> MADV_HUGEPAGE or MADV_NOHUGEPAGE
> - Memory pressure
> PSI system data or associated cgroup PSI metrics
>
> The kernel API of this new BPF hook is as follows,
>
> /**
> * thp_order_fn_t: Get the suggested THP order from a BPF program for allocation
> * @vma: vm_area_struct associated with the THP allocation
> * @type: TVA type for current @vma
> * @orders: Bitmask of available THP orders for this allocation
> *
> * Return: The suggested THP order for allocation from the BPF program. Must be
> * a valid, available order.
> */
> typedef int thp_order_fn_t(struct vm_area_struct *vma,
> enum tva_type type,
> unsigned long orders);
>
> Only a single BPF program can be attached at any given time, though it can
> be dynamically updated to adjust the policy. The implementation supports
> anonymous THP, shmem THP, and mTHP, with future extensions planned for
> file-backed THP.
>
> This functionality is only active when system-wide THP is configured to
> madvise or always mode. It remains disabled in never mode. Additionally,
> if THP is explicitly disabled for a specific task via prctl(), this BPF
> functionality will also be unavailable for that task.
>
> This BPF hook enables the implementation of flexible THP allocation
> policies at the system, per-cgroup, or per-task level.
>
> This feature requires CONFIG_BPF_THP_GET_ORDER_EXPERIMENTAL to be
> enabled. Note that this capability is currently unstable and may undergo
> significant changes—including potential removal—in future kernel versions.
>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
> MAINTAINERS | 1 +
> include/linux/huge_mm.h | 23 +++++
> mm/Kconfig | 12 +++
> mm/Makefile | 1 +
> mm/huge_memory_bpf.c | 204 ++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 241 insertions(+)
> create mode 100644 mm/huge_memory_bpf.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index ca8e3d18eedd..7be34b2a64fd 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -16257,6 +16257,7 @@ F: include/linux/huge_mm.h
> F: include/linux/khugepaged.h
> F: include/trace/events/huge_memory.h
> F: mm/huge_memory.c
> +F: mm/huge_memory_bpf.c
> F: mm/khugepaged.c
> F: mm/mm_slot.h
> F: tools/testing/selftests/mm/khugepaged.c
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index a635dcbb2b99..fea94c059bed 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -56,6 +56,7 @@ enum transparent_hugepage_flag {
> TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
> TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
> TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
> + TRANSPARENT_HUGEPAGE_BPF_ATTACHED, /* BPF prog is attached */
> };
>
> struct kobject;
> @@ -269,6 +270,23 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
> enum tva_type type,
> unsigned long orders);
>
> +#ifdef CONFIG_BPF_THP_GET_ORDER_EXPERIMENTAL
> +
> +unsigned long
> +bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type,
> + unsigned long orders);
> +
> +#else
> +
> +static inline unsigned long
> +bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type,
> + unsigned long orders)
> +{
> + return orders;
> +}
> +
> +#endif
> +
> /**
> * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
> * @vma: the vm area to check
> @@ -290,6 +308,11 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> {
> vm_flags_t vm_flags = vma->vm_flags;
>
> + /* The BPF-specified order overrides which order is selected. */
> + orders &= bpf_hook_thp_get_orders(vma, type, orders);
> + if (!orders)
> + return 0;
> +
> /*
> * Optimization to check if required orders are enabled early. Only
> * forced collapse ignores sysfs configs.
> diff --git a/mm/Kconfig b/mm/Kconfig
> index bde9f842a4a8..fd7459eecb2d 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -895,6 +895,18 @@ config NO_PAGE_MAPCOUNT
>
> EXPERIMENTAL because the impact of some changes is still unclear.
>
> +config BPF_THP_GET_ORDER_EXPERIMENTAL
> + bool "BPF-based THP order selection (EXPERIMENTAL)"
> + depends on TRANSPARENT_HUGEPAGE && BPF_SYSCALL
> +
> + help
> + Enable dynamic THP order selection using BPF programs. This
> + experimental feature allows custom BPF logic to determine optimal
> + transparent hugepage allocation sizes at runtime.
> +
> + WARNING: This feature is unstable and may change in future kernel
> + versions.
> +
I am assuming this series opens up the possibility of additional hooks being added in
the future. Instead of naming this BPF_THP_GET_ORDER_EXPERIMENTAL, should we
name it BPF_THP? Otherwise we will end up with 1 Kconfig option per hook, which
is quite bad.
Also it would be really nice if we don't put "EXPERIMENTAL" in the name of the defconfig.
If it's decided that it's not experimental anymore without any change to the code needed,
renaming the defconfig will break it for everyone.
> endif # TRANSPARENT_HUGEPAGE
>
> # simple helper to make the code a bit easier to read
> diff --git a/mm/Makefile b/mm/Makefile
> index 21abb3353550..62ebfa23635a 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -99,6 +99,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
> obj-$(CONFIG_NUMA) += memory-tiers.o
> obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
> obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
> +obj-$(CONFIG_BPF_THP_GET_ORDER_EXPERIMENTAL) += huge_memory_bpf.o
> obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
> obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
> obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
> diff --git a/mm/huge_memory_bpf.c b/mm/huge_memory_bpf.c
> new file mode 100644
> index 000000000000..b59a65d70a93
> --- /dev/null
> +++ b/mm/huge_memory_bpf.c
> @@ -0,0 +1,204 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * BPF-based THP policy management
> + *
> + * Author: Yafang Shao <laoar.shao@gmail.com>
> + */
> +
> +#include <linux/bpf.h>
> +#include <linux/btf.h>
> +#include <linux/huge_mm.h>
> +#include <linux/khugepaged.h>
> +
> +/**
> + * @thp_order_fn_t: Get the suggested THP order from a BPF program for allocation
> + * @vma: vm_area_struct associated with the THP allocation
> + * @type: TVA type for current @vma
> + * @orders: Bitmask of available THP orders for this allocation
> + *
> + * Return: The suggested THP order for allocation from the BPF program. Must be
> + * a valid, available order.
> + */
> +typedef int thp_order_fn_t(struct vm_area_struct *vma,
> + enum tva_type type,
> + unsigned long orders);
> +
> +struct bpf_thp_ops {
> + thp_order_fn_t __rcu *thp_get_order;
> +};
> +
> +static struct bpf_thp_ops bpf_thp;
> +static DEFINE_SPINLOCK(thp_ops_lock);
> +
> +unsigned long bpf_hook_thp_get_orders(struct vm_area_struct *vma,
> + enum tva_type type,
> + unsigned long orders)
> +{
> + thp_order_fn_t *bpf_hook_thp_get_order;
> + int bpf_order;
> +
> + /* No BPF program is attached */
> + if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
> + &transparent_hugepage_flags))
> + return orders;
> +
> + rcu_read_lock();
> + bpf_hook_thp_get_order = rcu_dereference(bpf_thp.thp_get_order);
> + if (!bpf_hook_thp_get_order)
Should we warn over here if we are going to out? TRANSPARENT_HUGEPAGE_BPF_ATTACHED
being set + !bpf_hook_thp_get_order shouldn't be possible, right?
> + goto out;
> +
> + bpf_order = bpf_hook_thp_get_order(vma, type, orders);
> + orders &= BIT(bpf_order);
> +
> +out:
> + rcu_read_unlock();
> + return orders;
> +}
> +
> +static bool bpf_thp_ops_is_valid_access(int off, int size,
> + enum bpf_access_type type,
> + const struct bpf_prog *prog,
> + struct bpf_insn_access_aux *info)
> +{
> + return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
> +}
> +
> +static const struct bpf_func_proto *
> +bpf_thp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +{
> + return bpf_base_func_proto(func_id, prog);
> +}
> +
> +static const struct bpf_verifier_ops thp_bpf_verifier_ops = {
> + .get_func_proto = bpf_thp_get_func_proto,
> + .is_valid_access = bpf_thp_ops_is_valid_access,
> +};
> +
> +static int bpf_thp_init(struct btf *btf)
> +{
> + return 0;
> +}
> +
> +static int bpf_thp_check_member(const struct btf_type *t,
> + const struct btf_member *member,
> + const struct bpf_prog *prog)
> +{
> + /* The call site operates under RCU protection. */
> + if (prog->sleepable)
> + return -EINVAL;
> + return 0;
> +}
> +
> +static int bpf_thp_init_member(const struct btf_type *t,
> + const struct btf_member *member,
> + void *kdata, const void *udata)
> +{
> + return 0;
> +}
> +
> +static int bpf_thp_reg(void *kdata, struct bpf_link *link)
> +{
> + struct bpf_thp_ops *ops = kdata;
> +
> + spin_lock(&thp_ops_lock);
> + if (test_and_set_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
> + &transparent_hugepage_flags)) {
> + spin_unlock(&thp_ops_lock);
> + return -EBUSY;
> + }
> + WARN_ON_ONCE(rcu_access_pointer(bpf_thp.thp_get_order));
> + rcu_assign_pointer(bpf_thp.thp_get_order, ops->thp_get_order);
> + spin_unlock(&thp_ops_lock);
> + return 0;
> +}
> +
> +static void bpf_thp_unreg(void *kdata, struct bpf_link *link)
> +{
> + thp_order_fn_t *old_fn;
> +
> + spin_lock(&thp_ops_lock);
> + clear_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED, &transparent_hugepage_flags);
> + old_fn = rcu_replace_pointer(bpf_thp.thp_get_order, NULL,
> + lockdep_is_held(&thp_ops_lock));
> + WARN_ON_ONCE(!old_fn);
> + spin_unlock(&thp_ops_lock);
> +
> + synchronize_rcu();
> +}
> +
> +static int bpf_thp_update(void *kdata, void *old_kdata, struct bpf_link *link)
> +{
> + thp_order_fn_t *old_fn, *new_fn;
> + struct bpf_thp_ops *old = old_kdata;
> + struct bpf_thp_ops *ops = kdata;
> + int ret = 0;
> +
> + if (!ops || !old)
> + return -EINVAL;
> +
> + spin_lock(&thp_ops_lock);
> + /* The prog has already been removed. */
> + if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
> + &transparent_hugepage_flags)) {
> + ret = -ENOENT;
> + goto out;
> + }
> +
> + new_fn = rcu_dereference(ops->thp_get_order);
> + old_fn = rcu_replace_pointer(bpf_thp.thp_get_order, new_fn,
> + lockdep_is_held(&thp_ops_lock));
> + WARN_ON_ONCE(!old_fn || !new_fn);
> +
> +out:
> + spin_unlock(&thp_ops_lock);
> + if (!ret)
> + synchronize_rcu();
> + return ret;
> +}
> +
> +static int bpf_thp_validate(void *kdata)
> +{
> + struct bpf_thp_ops *ops = kdata;
> +
> + if (!ops->thp_get_order) {
> + pr_err("bpf_thp: required ops isn't implemented\n");
> + return -EINVAL;
> + }
> + return 0;
> +}
> +
> +static int bpf_thp_get_order(struct vm_area_struct *vma,
> + enum tva_type type,
> + unsigned long orders)
> +{
> + return -1;
> +}
> +
> +static struct bpf_thp_ops __bpf_thp_ops = {
> + .thp_get_order = (thp_order_fn_t __rcu *)bpf_thp_get_order,
> +};
> +
> +static struct bpf_struct_ops bpf_bpf_thp_ops = {
> + .verifier_ops = &thp_bpf_verifier_ops,
> + .init = bpf_thp_init,
> + .check_member = bpf_thp_check_member,
> + .init_member = bpf_thp_init_member,
> + .reg = bpf_thp_reg,
> + .unreg = bpf_thp_unreg,
> + .update = bpf_thp_update,
> + .validate = bpf_thp_validate,
> + .cfi_stubs = &__bpf_thp_ops,
> + .owner = THIS_MODULE,
> + .name = "bpf_thp_ops",
> +};
> +
> +static int __init bpf_thp_ops_init(void)
> +{
> + int err;
> +
> + err = register_bpf_struct_ops(&bpf_bpf_thp_ops, bpf_thp_ops);
> + if (err)
> + pr_err("bpf_thp: Failed to register struct_ops (%d)\n", err);
> + return err;
> +}
> +late_initcall(bpf_thp_ops_init);
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v8 mm-new 04/12] mm: thp: add support for BPF based THP order selection
2025-09-26 15:13 ` Usama Arif
@ 2025-09-26 19:17 ` Randy Dunlap
2025-09-28 2:13 ` Yafang Shao
1 sibling, 0 replies; 24+ messages in thread
From: Randy Dunlap @ 2025-09-26 19:17 UTC (permalink / raw)
To: Usama Arif, Yafang Shao, akpm, david, ziy, baolin.wang,
lorenzo.stoakes, Liam.Howlett, npache, ryan.roberts, dev.jain,
hannes, gutierrez.asier, willy, ast, daniel, andrii, ameryhung,
rientjes, corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel
On 9/26/25 8:13 AM, Usama Arif wrote:
>> +config BPF_THP_GET_ORDER_EXPERIMENTAL
>> + bool "BPF-based THP order selection (EXPERIMENTAL)"
>> + depends on TRANSPARENT_HUGEPAGE && BPF_SYSCALL
>> +
>> + help
>> + Enable dynamic THP order selection using BPF programs. This
>> + experimental feature allows custom BPF logic to determine optimal
>> + transparent hugepage allocation sizes at runtime.
>> +
>> + WARNING: This feature is unstable and may change in future kernel
>> + versions.
>> +
> I am assuming this series opens up the possibility of additional hooks being added in
> the future. Instead of naming this BPF_THP_GET_ORDER_EXPERIMENTAL, should we
> name it BPF_THP? Otherwise we will end up with 1 Kconfig option per hook, which
> is quite bad.
>
> Also it would be really nice if we don't put "EXPERIMENTAL" in the name of the defconfig.
> If it's decided that it's not experimental anymore without any change to the code needed,
> renaming the defconfig will break it for everyone.
s/defconfig/Kconfig symbol/
Otherwise agreed.
Thanks.
--
~Randy
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v8 mm-new 04/12] mm: thp: add support for BPF based THP order selection
2025-09-26 15:13 ` Usama Arif
2025-09-26 19:17 ` Randy Dunlap
@ 2025-09-28 2:13 ` Yafang Shao
1 sibling, 0 replies; 24+ messages in thread
From: Yafang Shao @ 2025-09-28 2:13 UTC (permalink / raw)
To: Usama Arif
Cc: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, gutierrez.asier, willy,
ast, daniel, andrii, ameryhung, rientjes, corbet, 21cnbao,
shakeel.butt, tj, lance.yang, bpf, linux-mm, linux-doc,
linux-kernel
On Fri, Sep 26, 2025 at 11:13 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 26/09/2025 10:33, Yafang Shao wrote:
> > This patch introduces a new BPF struct_ops called bpf_thp_ops for dynamic
> > THP tuning. It includes a hook bpf_hook_thp_get_order(), allowing BPF
> > programs to influence THP order selection based on factors such as:
> > - Workload identity
> > For example, workloads running in specific containers or cgroups.
> > - Allocation context
> > Whether the allocation occurs during a page fault, khugepaged, swap or
> > other paths.
> > - VMA's memory advice settings
> > MADV_HUGEPAGE or MADV_NOHUGEPAGE
> > - Memory pressure
> > PSI system data or associated cgroup PSI metrics
> >
> > The kernel API of this new BPF hook is as follows,
> >
> > /**
> > * thp_order_fn_t: Get the suggested THP order from a BPF program for allocation
> > * @vma: vm_area_struct associated with the THP allocation
> > * @type: TVA type for current @vma
> > * @orders: Bitmask of available THP orders for this allocation
> > *
> > * Return: The suggested THP order for allocation from the BPF program. Must be
> > * a valid, available order.
> > */
> > typedef int thp_order_fn_t(struct vm_area_struct *vma,
> > enum tva_type type,
> > unsigned long orders);
> >
> > Only a single BPF program can be attached at any given time, though it can
> > be dynamically updated to adjust the policy. The implementation supports
> > anonymous THP, shmem THP, and mTHP, with future extensions planned for
> > file-backed THP.
> >
> > This functionality is only active when system-wide THP is configured to
> > madvise or always mode. It remains disabled in never mode. Additionally,
> > if THP is explicitly disabled for a specific task via prctl(), this BPF
> > functionality will also be unavailable for that task.
> >
> > This BPF hook enables the implementation of flexible THP allocation
> > policies at the system, per-cgroup, or per-task level.
> >
> > This feature requires CONFIG_BPF_THP_GET_ORDER_EXPERIMENTAL to be
> > enabled. Note that this capability is currently unstable and may undergo
> > significant changes—including potential removal—in future kernel versions.
> >
> > Suggested-by: David Hildenbrand <david@redhat.com>
> > Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > ---
> > MAINTAINERS | 1 +
> > include/linux/huge_mm.h | 23 +++++
> > mm/Kconfig | 12 +++
> > mm/Makefile | 1 +
> > mm/huge_memory_bpf.c | 204 ++++++++++++++++++++++++++++++++++++++++
> > 5 files changed, 241 insertions(+)
> > create mode 100644 mm/huge_memory_bpf.c
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index ca8e3d18eedd..7be34b2a64fd 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -16257,6 +16257,7 @@ F: include/linux/huge_mm.h
> > F: include/linux/khugepaged.h
> > F: include/trace/events/huge_memory.h
> > F: mm/huge_memory.c
> > +F: mm/huge_memory_bpf.c
> > F: mm/khugepaged.c
> > F: mm/mm_slot.h
> > F: tools/testing/selftests/mm/khugepaged.c
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index a635dcbb2b99..fea94c059bed 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -56,6 +56,7 @@ enum transparent_hugepage_flag {
> > TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
> > TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
> > TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
> > + TRANSPARENT_HUGEPAGE_BPF_ATTACHED, /* BPF prog is attached */
> > };
> >
> > struct kobject;
> > @@ -269,6 +270,23 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
> > enum tva_type type,
> > unsigned long orders);
> >
> > +#ifdef CONFIG_BPF_THP_GET_ORDER_EXPERIMENTAL
> > +
> > +unsigned long
> > +bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type,
> > + unsigned long orders);
> > +
> > +#else
> > +
> > +static inline unsigned long
> > +bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type,
> > + unsigned long orders)
> > +{
> > + return orders;
> > +}
> > +
> > +#endif
> > +
> > /**
> > * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
> > * @vma: the vm area to check
> > @@ -290,6 +308,11 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> > {
> > vm_flags_t vm_flags = vma->vm_flags;
> >
> > + /* The BPF-specified order overrides which order is selected. */
> > + orders &= bpf_hook_thp_get_orders(vma, type, orders);
> > + if (!orders)
> > + return 0;
> > +
> > /*
> > * Optimization to check if required orders are enabled early. Only
> > * forced collapse ignores sysfs configs.
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index bde9f842a4a8..fd7459eecb2d 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -895,6 +895,18 @@ config NO_PAGE_MAPCOUNT
> >
> > EXPERIMENTAL because the impact of some changes is still unclear.
> >
> > +config BPF_THP_GET_ORDER_EXPERIMENTAL
> > + bool "BPF-based THP order selection (EXPERIMENTAL)"
> > + depends on TRANSPARENT_HUGEPAGE && BPF_SYSCALL
> > +
> > + help
> > + Enable dynamic THP order selection using BPF programs. This
> > + experimental feature allows custom BPF logic to determine optimal
> > + transparent hugepage allocation sizes at runtime.
> > +
> > + WARNING: This feature is unstable and may change in future kernel
> > + versions.
> > +
>
> I am assuming this series opens up the possibility of additional hooks being added in
> the future. Instead of naming this BPF_THP_GET_ORDER_EXPERIMENTAL, should we
> name it BPF_THP? Otherwise we will end up with 1 Kconfig option per hook, which
> is quite bad.
makes sense.
>
> Also it would be really nice if we don't put "EXPERIMENTAL" in the name of the defconfig.
> If it's decided that it's not experimental anymore without any change to the code needed,
> renaming the defconfig will break it for everyone.
makes sense to me.
Lorenzo, what do you think?
>
>
> > endif # TRANSPARENT_HUGEPAGE
> >
> > # simple helper to make the code a bit easier to read
> > diff --git a/mm/Makefile b/mm/Makefile
> > index 21abb3353550..62ebfa23635a 100644
> > --- a/mm/Makefile
> > +++ b/mm/Makefile
> > @@ -99,6 +99,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
> > obj-$(CONFIG_NUMA) += memory-tiers.o
> > obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
> > obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
> > +obj-$(CONFIG_BPF_THP_GET_ORDER_EXPERIMENTAL) += huge_memory_bpf.o
> > obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
> > obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
> > obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
> > diff --git a/mm/huge_memory_bpf.c b/mm/huge_memory_bpf.c
> > new file mode 100644
> > index 000000000000..b59a65d70a93
> > --- /dev/null
> > +++ b/mm/huge_memory_bpf.c
> > @@ -0,0 +1,204 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * BPF-based THP policy management
> > + *
> > + * Author: Yafang Shao <laoar.shao@gmail.com>
> > + */
> > +
> > +#include <linux/bpf.h>
> > +#include <linux/btf.h>
> > +#include <linux/huge_mm.h>
> > +#include <linux/khugepaged.h>
> > +
> > +/**
> > + * @thp_order_fn_t: Get the suggested THP order from a BPF program for allocation
> > + * @vma: vm_area_struct associated with the THP allocation
> > + * @type: TVA type for current @vma
> > + * @orders: Bitmask of available THP orders for this allocation
> > + *
> > + * Return: The suggested THP order for allocation from the BPF program. Must be
> > + * a valid, available order.
> > + */
> > +typedef int thp_order_fn_t(struct vm_area_struct *vma,
> > + enum tva_type type,
> > + unsigned long orders);
> > +
> > +struct bpf_thp_ops {
> > + thp_order_fn_t __rcu *thp_get_order;
> > +};
> > +
> > +static struct bpf_thp_ops bpf_thp;
> > +static DEFINE_SPINLOCK(thp_ops_lock);
> > +
> > +unsigned long bpf_hook_thp_get_orders(struct vm_area_struct *vma,
> > + enum tva_type type,
> > + unsigned long orders)
> > +{
> > + thp_order_fn_t *bpf_hook_thp_get_order;
> > + int bpf_order;
> > +
> > + /* No BPF program is attached */
> > + if (!test_bit(TRANSPARENT_HUGEPAGE_BPF_ATTACHED,
> > + &transparent_hugepage_flags))
> > + return orders;
> > +
> > + rcu_read_lock();
> > + bpf_hook_thp_get_order = rcu_dereference(bpf_thp.thp_get_order);
> > + if (!bpf_hook_thp_get_order)
>
> Should we warn over here if we are going to out? TRANSPARENT_HUGEPAGE_BPF_ATTACHED
> being set + !bpf_hook_thp_get_order shouldn't be possible, right?
will add a warning in the next version.
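Something along these lines (untested sketch on top of this patch):

	rcu_read_lock();
	bpf_hook_thp_get_order = rcu_dereference(bpf_thp.thp_get_order);
	/* The attached bit is set, so a registered hook is expected here. */
	if (WARN_ON_ONCE(!bpf_hook_thp_get_order))
		goto out;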
--
Regards
Yafang
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v8 mm-new 05/12] mm: thp: decouple THP allocation between swap and page fault paths
2025-09-26 9:33 [PATCH v8 mm-new 00/12] mm, bpf: BPF based THP order selection Yafang Shao
` (3 preceding siblings ...)
2025-09-26 9:33 ` [PATCH v8 mm-new 04/12] mm: thp: add support for BPF based THP order selection Yafang Shao
@ 2025-09-26 9:33 ` Yafang Shao
2025-09-26 15:19 ` Usama Arif
2025-09-26 9:33 ` [PATCH v8 mm-new 06/12] mm: thp: enable THP allocation exclusively through khugepaged Yafang Shao
` (6 subsequent siblings)
11 siblings, 1 reply; 24+ messages in thread
From: Yafang Shao @ 2025-09-26 9:33 UTC (permalink / raw)
To: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, usamaarif642,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yafang Shao
The new BPF capability enables finer-grained THP policy decisions by
introducing separate handling for swap faults versus normal page faults.
As highlighted by Barry:
We’ve observed that swapping in large folios can lead to more
swap thrashing for some workloads- e.g. kernel build. Consequently,
some workloads might prefer swapping in smaller folios than those
allocated by alloc_anon_folio().
While prctl() could potentially be extended to leverage this new policy,
doing so would require modifications to the uAPI.
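As an illustration, a policy that keeps swap-in at order 0 while leaving every
other path alone could look roughly like the sketch below. It mirrors the
swap_ops selftest program added later in this series; the program and map
names here are illustrative only:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("struct_ops/thp_get_order")
int BPF_PROG(no_large_swapin, struct vm_area_struct *vma, enum tva_type type,
	     unsigned long orders)
{
	if (type == TVA_SWAP_PAGEFAULT)
		return 0;	/* restrict swap-in to order 0 */
	return -1;
}

SEC(".struct_ops.link")
struct bpf_thp_ops swapin_policy_ops = {
	.thp_get_order = (void *)no_large_swapin,
};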
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Barry Song <21cnbao@gmail.com>
---
include/linux/huge_mm.h | 3 ++-
mm/huge_memory.c | 2 +-
mm/memory.c | 2 +-
3 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fea94c059bed..bd30694f6a9c 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -97,9 +97,10 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
enum tva_type {
TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */
- TVA_PAGEFAULT, /* Serving a page fault. */
+ TVA_PAGEFAULT, /* Serving a non-swap page fault. */
TVA_KHUGEPAGED, /* Khugepaged collapse. */
TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). */
+ TVA_SWAP_PAGEFAULT, /* Serving a swap page fault. */
};
#define thp_vma_allowable_order(vma, type, order) \
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1ac476fe6dc5..08372dfcb41a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -102,7 +102,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
unsigned long orders)
{
const bool smaps = type == TVA_SMAPS;
- const bool in_pf = type == TVA_PAGEFAULT;
+ const bool in_pf = (type == TVA_PAGEFAULT || type == TVA_SWAP_PAGEFAULT);
const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
unsigned long supported_orders;
vm_flags_t vm_flags = vma->vm_flags;
diff --git a/mm/memory.c b/mm/memory.c
index cd04e4894725..58ea0f93f79e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4558,7 +4558,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
* Get a list of all the (large) orders below PMD_ORDER that are enabled
* and suitable for swapping THP.
*/
- orders = thp_vma_allowable_orders(vma, TVA_PAGEFAULT,
+ orders = thp_vma_allowable_orders(vma, TVA_SWAP_PAGEFAULT,
BIT(PMD_ORDER) - 1);
orders = thp_vma_suitable_orders(vma, vmf->address, orders);
orders = thp_swap_suitable_orders(swp_offset(entry),
--
2.47.3
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v8 mm-new 05/12] mm: thp: decouple THP allocation between swap and page fault paths
2025-09-26 9:33 ` [PATCH v8 mm-new 05/12] mm: thp: decouple THP allocation between swap and page fault paths Yafang Shao
@ 2025-09-26 15:19 ` Usama Arif
0 siblings, 0 replies; 24+ messages in thread
From: Usama Arif @ 2025-09-26 15:19 UTC (permalink / raw)
To: Yafang Shao, akpm, david, ziy, baolin.wang, lorenzo.stoakes,
Liam.Howlett, npache, ryan.roberts, dev.jain, hannes,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel
On 26/09/2025 10:33, Yafang Shao wrote:
> The new BPF capability enables finer-grained THP policy decisions by
> introducing separate handling for swap faults versus normal page faults.
>
> As highlighted by Barry:
>
> We’ve observed that swapping in large folios can lead to more
> swap thrashing for some workloads- e.g. kernel build. Consequently,
> some workloads might prefer swapping in smaller folios than those
> allocated by alloc_anon_folio().
>
> While prctl() could potentially be extended to leverage this new policy,
> doing so would require modifications to the uAPI.
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Cc: Barry Song <21cnbao@gmail.com>
> ---
> include/linux/huge_mm.h | 3 ++-
> mm/huge_memory.c | 2 +-
> mm/memory.c | 2 +-
> 3 files changed, 4 insertions(+), 3 deletions(-)
>
Acked-by: Usama Arif <usamaarif642@gmail.com>
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v8 mm-new 06/12] mm: thp: enable THP allocation exclusively through khugepaged
2025-09-26 9:33 [PATCH v8 mm-new 00/12] mm, bpf: BPF based THP order selection Yafang Shao
` (4 preceding siblings ...)
2025-09-26 9:33 ` [PATCH v8 mm-new 05/12] mm: thp: decouple THP allocation between swap and page fault paths Yafang Shao
@ 2025-09-26 9:33 ` Yafang Shao
2025-09-26 15:27 ` Usama Arif
2025-09-26 9:33 ` [PATCH v8 mm-new 07/12] bpf: mark mm->owner as __safe_rcu_or_null Yafang Shao
` (5 subsequent siblings)
11 siblings, 1 reply; 24+ messages in thread
From: Yafang Shao @ 2025-09-26 9:33 UTC (permalink / raw)
To: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, usamaarif642,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yafang Shao
khugepaged_enter_vma() ultimately invokes any attached BPF function with
the TVA_KHUGEPAGED flag set when determining whether or not to enable
khugepaged THP for a freshly faulted in VMA.
Currently, on fault, we invoke this in do_huge_pmd_anonymous_page(), as
invoked by create_huge_pmd() and only when we have already checked to
see if an allowable TVA_PAGEFAULT order is specified.
Since we might want to disallow THP on fault-in but allow it via
khugepaged, we move things around so we always attempt to enter
khugepaged upon fault.
This change is safe because:
- the checks for thp_vma_allowable_order(TVA_KHUGEPAGED) and
thp_vma_allowable_order(TVA_PAGEFAULT) are functionally equivalent
- khugepaged operates at the MM level rather than per-VMA. The THP
allocation might fail during page faults due to transient conditions
(e.g., memory pressure), it is safe to add this MM to khugepaged for
subsequent defragmentation.
While we could also extend prctl() to utilize this new policy, such a
change would require a uAPI modification to PR_SET_THP_DISABLE.
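For illustration, a BPF policy that defers THP allocation entirely to
khugepaged could then look roughly like the following sketch (same
boilerplate as the selftests in this series; pmd_order is assumed to be
filled in from user space):

int pmd_order;

SEC("struct_ops/thp_get_order")
int BPF_PROG(khugepaged_only, struct vm_area_struct *vma, enum tva_type type,
	     unsigned long orders)
{
	if (type == TVA_KHUGEPAGED)
		return pmd_order;	/* allow PMD-sized THP via khugepaged */
	return 0;			/* everything else: order 0 only */
}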
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Lance Yang <lance.yang@linux.dev>
---
mm/huge_memory.c | 1 -
mm/memory.c | 13 ++++++++-----
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 08372dfcb41a..2b155a734c78 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1346,7 +1346,6 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
ret = vmf_anon_prepare(vmf);
if (ret)
return ret;
- khugepaged_enter_vma(vma);
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
diff --git a/mm/memory.c b/mm/memory.c
index 58ea0f93f79e..64f91191ffff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6327,11 +6327,14 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
if (pud_trans_unstable(vmf.pud))
goto retry_pud;
- if (pmd_none(*vmf.pmd) &&
- thp_vma_allowable_order(vma, TVA_PAGEFAULT, PMD_ORDER)) {
- ret = create_huge_pmd(&vmf);
- if (!(ret & VM_FAULT_FALLBACK))
- return ret;
+ if (pmd_none(*vmf.pmd)) {
+ if (vma_is_anonymous(vma))
+ khugepaged_enter_vma(vma);
+ if (thp_vma_allowable_order(vma, TVA_PAGEFAULT, PMD_ORDER)) {
+ ret = create_huge_pmd(&vmf);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ }
} else {
vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
--
2.47.3
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v8 mm-new 06/12] mm: thp: enable THP allocation exclusively through khugepaged
2025-09-26 9:33 ` [PATCH v8 mm-new 06/12] mm: thp: enable THP allocation exclusively through khugepaged Yafang Shao
@ 2025-09-26 15:27 ` Usama Arif
2025-09-28 2:58 ` Yafang Shao
0 siblings, 1 reply; 24+ messages in thread
From: Usama Arif @ 2025-09-26 15:27 UTC (permalink / raw)
To: Yafang Shao, akpm, david, ziy, baolin.wang, lorenzo.stoakes,
Liam.Howlett, npache, ryan.roberts, dev.jain, hannes,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel
On 26/09/2025 10:33, Yafang Shao wrote:
> khugepaged_enter_vma() ultimately invokes any attached BPF function with
> the TVA_KHUGEPAGED flag set when determining whether or not to enable
> khugepaged THP for a freshly faulted in VMA.
>
> Currently, on fault, we invoke this in do_huge_pmd_anonymous_page(), as
> invoked by create_huge_pmd() and only when we have already checked to
> see if an allowable TVA_PAGEFAULT order is specified.
>
> Since we might want to disallow THP on fault-in but allow it via
> khugepaged, we move things around so we always attempt to enter
> khugepaged upon fault.
>
> This change is safe because:
> - the checks for thp_vma_allowable_order(TVA_KHUGEPAGED) and
> thp_vma_allowable_order(TVA_PAGEFAULT) are functionally equivalent
hmm I dont think this is the case. __thp_vma_allowable_orders
deals with TVA_PAGEFAULT (in_pf) differently from TVA_KHUGEPAGED.
> - khugepaged operates at the MM level rather than per-VMA. The THP
> allocation might fail during page faults due to transient conditions
> (e.g., memory pressure), so it is safe to add this MM to khugepaged for
> subsequent defragmentation.
>
> While we could also extend prctl() to utilize this new policy, such a
> change would require a uAPI modification to PR_SET_THP_DISABLE.
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> Acked-by: Lance Yang <lance.yang@linux.dev>
> ---
> mm/huge_memory.c | 1 -
> mm/memory.c | 13 ++++++++-----
> 2 files changed, 8 insertions(+), 6 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 08372dfcb41a..2b155a734c78 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1346,7 +1346,6 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> ret = vmf_anon_prepare(vmf);
> if (ret)
> return ret;
> - khugepaged_enter_vma(vma);
>
> if (!(vmf->flags & FAULT_FLAG_WRITE) &&
> !mm_forbids_zeropage(vma->vm_mm) &&
> diff --git a/mm/memory.c b/mm/memory.c
> index 58ea0f93f79e..64f91191ffff 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -6327,11 +6327,14 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
> if (pud_trans_unstable(vmf.pud))
> goto retry_pud;
>
> - if (pmd_none(*vmf.pmd) &&
> - thp_vma_allowable_order(vma, TVA_PAGEFAULT, PMD_ORDER)) {
> - ret = create_huge_pmd(&vmf);
> - if (!(ret & VM_FAULT_FALLBACK))
> - return ret;
> + if (pmd_none(*vmf.pmd)) {
> + if (vma_is_anonymous(vma))
> + khugepaged_enter_vma(vma);
> + if (thp_vma_allowable_order(vma, TVA_PAGEFAULT, PMD_ORDER)) {
> + ret = create_huge_pmd(&vmf);
> + if (!(ret & VM_FAULT_FALLBACK))
> + return ret;
> + }
> } else {
> vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
>
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v8 mm-new 06/12] mm: thp: enable THP allocation exclusively through khugepaged
2025-09-26 15:27 ` Usama Arif
@ 2025-09-28 2:58 ` Yafang Shao
0 siblings, 0 replies; 24+ messages in thread
From: Yafang Shao @ 2025-09-28 2:58 UTC (permalink / raw)
To: Usama Arif
Cc: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, gutierrez.asier, willy,
ast, daniel, andrii, ameryhung, rientjes, corbet, 21cnbao,
shakeel.butt, tj, lance.yang, bpf, linux-mm, linux-doc,
linux-kernel
On Fri, Sep 26, 2025 at 11:27 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 26/09/2025 10:33, Yafang Shao wrote:
> > khugepaged_enter_vma() ultimately invokes any attached BPF function with
> > the TVA_KHUGEPAGED flag set when determining whether or not to enable
> > khugepaged THP for a freshly faulted in VMA.
> >
> > Currently, on fault, we invoke this in do_huge_pmd_anonymous_page(), as
> > invoked by create_huge_pmd() and only when we have already checked to
> > see if an allowable TVA_PAGEFAULT order is specified.
> >
> > Since we might want to disallow THP on fault-in but allow it via
> > khugepaged, we move things around so we always attempt to enter
> > khugepaged upon fault.
> >
> > This change is safe because:
> > - the checks for thp_vma_allowable_order(TVA_KHUGEPAGED) and
> > thp_vma_allowable_order(TVA_PAGEFAULT) are functionally equivalent
>
> hmm I dont think this is the case. __thp_vma_allowable_orders
> deals with TVA_PAGEFAULT (in_pf) differently from TVA_KHUGEPAGED.
Since this change only applies when vma_is_anonymous(vma) is true, we
can safely focus the logic in __thp_vma_allowable_orders() on
anonymous VMAs. For such VMAs, the TVA_KHUGEPAGED check is strictly
more restrictive than the TVA_PAGEFAULT check. Specifically:
- If __thp_vma_allowable_orders(TVA_PAGEFAULT) returns 0 (disallowed),
then __thp_vma_allowable_orders(TVA_KHUGEPAGED) will also return 0.
- Even if the page fault check returns a set of orders, the khugepaged
check may still return 0.
Thus, this change is safe. I'll clarify this in the commit log. Please
correct me if I'm missing something.
--
Regards
Yafang
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v8 mm-new 07/12] bpf: mark mm->owner as __safe_rcu_or_null
2025-09-26 9:33 [PATCH v8 mm-new 00/12] mm, bpf: BPF based THP order selection Yafang Shao
` (5 preceding siblings ...)
2025-09-26 9:33 ` [PATCH v8 mm-new 06/12] mm: thp: enable THP allocation exclusively through khugepaged Yafang Shao
@ 2025-09-26 9:33 ` Yafang Shao
2025-09-26 9:33 ` [PATCH v8 mm-new 08/12] bpf: mark vma->vm_mm as __safe_trusted_or_null Yafang Shao
` (4 subsequent siblings)
11 siblings, 0 replies; 24+ messages in thread
From: Yafang Shao @ 2025-09-26 9:33 UTC (permalink / raw)
To: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, usamaarif642,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yafang Shao
When CONFIG_MEMCG is enabled, we can access mm->owner under RCU. The
owner can be NULL. With this change, BPF helpers can safely access
mm->owner to retrieve the associated task from the mm. We can then make
policy decision based on the task attribute.
The typical use case is as follows,
bpf_rcu_read_lock(); // rcu lock must be held for rcu trusted field
@owner = @mm->owner; // mm_struct::owner is rcu trusted or null
if (!@owner)
goto out;
/* Do something based on the task attribute */
out:
bpf_rcu_read_unlock();
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
kernel/bpf/verifier.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c4f69a9e9af6..d400e18ee31e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7123,6 +7123,9 @@ BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) {
/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
struct file __rcu *exe_file;
+#ifdef CONFIG_MEMCG
+ struct task_struct __rcu *owner;
+#endif
};
/* skb->sk, req->sk are not RCU protected, but we mark them as such
--
2.47.3
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v8 mm-new 08/12] bpf: mark vma->vm_mm as __safe_trusted_or_null
2025-09-26 9:33 [PATCH v8 mm-new 00/12] mm, bpf: BPF based THP order selection Yafang Shao
` (6 preceding siblings ...)
2025-09-26 9:33 ` [PATCH v8 mm-new 07/12] bpf: mark mm->owner as __safe_rcu_or_null Yafang Shao
@ 2025-09-26 9:33 ` Yafang Shao
2025-09-26 9:33 ` [PATCH v8 mm-new 09/12] selftests/bpf: add a simple BPF based THP policy Yafang Shao
` (3 subsequent siblings)
11 siblings, 0 replies; 24+ messages in thread
From: Yafang Shao @ 2025-09-26 9:33 UTC (permalink / raw)
To: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, usamaarif642,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yafang Shao
The vma->vm_mm might be NULL and it can be accessed outside of RCU. Thus,
we can mark it as trusted_or_null. With this change, BPF helpers can safely
access vma->vm_mm to retrieve the associated mm_struct from the VMA.
Then we can make policy decision from the VMA.
The "trusted" annotation enables direct access to vma->vm_mm within kfuncs
marked with KF_TRUSTED_ARGS or KF_RCU, such as bpf_task_get_cgroup1() and
bpf_task_under_cgroup(). Conversely, "null" enforcement requires all
callsites using vma->vm_mm to perform NULL checks.
The lsm selftest must be modified because it directly accesses vma->vm_mm
without a NULL pointer check; otherwise it will break due to this
change.
For the VMA based THP policy, the use case is as follows,
@mm = @vma->vm_mm; // vm_area_struct::vm_mm is trusted or null
if (!@mm)
return;
bpf_rcu_read_lock(); // rcu lock must be held to dereference the owner
@owner = @mm->owner; // mm_struct::owner is rcu trusted or null
if (!@owner)
goto out;
@cgroup1 = bpf_task_get_cgroup1(@owner, MEMCG_HIERARCHY_ID);
/* make the decision based on the @cgroup1 attribute */
bpf_cgroup_release(@cgroup1); // release the associated cgroup
out:
bpf_rcu_read_unlock();
PSI memory information can be obtained from the associated cgroup to inform
policy decisions. Since upstream PSI support is currently limited to cgroup
v2, the following example demonstrates cgroup v2 implementation:
@owner = @mm->owner;
if (@owner) {
// @ancestor_cgid is user-configured
@ancestor = bpf_cgroup_from_id(@ancestor_cgid);
if (bpf_task_under_cgroup(@owner, @ancestor)) {
@psi_group = @ancestor->psi;
/* Extract PSI metrics from @psi_group and
* implement policy logic based on the values
*/
}
}
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
---
kernel/bpf/verifier.c | 5 +++++
tools/testing/selftests/bpf/progs/lsm.c | 8 +++++---
2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d400e18ee31e..b708b98f796c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7165,6 +7165,10 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket) {
struct sock *sk;
};
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) {
+ struct mm_struct *vm_mm;
+};
+
static bool type_is_rcu(struct bpf_verifier_env *env,
struct bpf_reg_state *reg,
const char *field_name, u32 btf_id)
@@ -7206,6 +7210,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
{
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct));
return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
"__safe_trusted_or_null");
diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c
index 0c13b7409947..7de173daf27b 100644
--- a/tools/testing/selftests/bpf/progs/lsm.c
+++ b/tools/testing/selftests/bpf/progs/lsm.c
@@ -89,14 +89,16 @@ SEC("lsm/file_mprotect")
int BPF_PROG(test_int_hook, struct vm_area_struct *vma,
unsigned long reqprot, unsigned long prot, int ret)
{
- if (ret != 0)
+ struct mm_struct *mm = vma->vm_mm;
+
+ if (ret != 0 || !mm)
return ret;
__s32 pid = bpf_get_current_pid_tgid() >> 32;
int is_stack = 0;
- is_stack = (vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack);
+ is_stack = (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack);
if (is_stack && monitored_pid == pid) {
mprotect_count++;
--
2.47.3
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v8 mm-new 09/12] selftests/bpf: add a simple BPF based THP policy
2025-09-26 9:33 [PATCH v8 mm-new 00/12] mm, bpf: BPF based THP order selection Yafang Shao
` (7 preceding siblings ...)
2025-09-26 9:33 ` [PATCH v8 mm-new 08/12] bpf: mark vma->vm_mm as __safe_trusted_or_null Yafang Shao
@ 2025-09-26 9:33 ` Yafang Shao
2025-09-26 9:33 ` [PATCH v8 mm-new 10/12] selftests/bpf: add test case to update " Yafang Shao
` (2 subsequent siblings)
11 siblings, 0 replies; 24+ messages in thread
From: Yafang Shao @ 2025-09-26 9:33 UTC (permalink / raw)
To: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, usamaarif642,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yafang Shao
This test case implements a basic THP policy that sets THPeligible to 1 for
a specific task and to 0 for all others. I selected THPeligible for
verification because its straightforward nature makes it ideal for
validating the BPF THP policy functionality.
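For reference, the test is driven by the standard BPF selftest runner,
e.g. ./test_progs -t thp_adjust.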
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
MAINTAINERS | 2 +
tools/testing/selftests/bpf/config | 3 +
.../selftests/bpf/prog_tests/thp_adjust.c | 258 ++++++++++++++++++
.../selftests/bpf/progs/test_thp_adjust.c | 41 +++
4 files changed, 304 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/thp_adjust.c
create mode 100644 tools/testing/selftests/bpf/progs/test_thp_adjust.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 7be34b2a64fd..c1219bcd27c1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16260,6 +16260,8 @@ F: mm/huge_memory.c
F: mm/huge_memory_bpf.c
F: mm/khugepaged.c
F: mm/mm_slot.h
+F: tools/testing/selftests/bpf/prog_tests/thp_adjust.c
+F: tools/testing/selftests/bpf/progs/test_thp_adjust*
F: tools/testing/selftests/mm/khugepaged.c
F: tools/testing/selftests/mm/split_huge_page_test.c
F: tools/testing/selftests/mm/transhuge-stress.c
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 8916ab814a3e..7ccb9809e276 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -26,6 +26,7 @@ CONFIG_DMABUF_HEAPS=y
CONFIG_DMABUF_HEAPS_SYSTEM=y
CONFIG_DUMMY=y
CONFIG_DYNAMIC_FTRACE=y
+CONFIG_BPF_THP_GET_ORDER_EXPERIMENTAL=y
CONFIG_FPROBE=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_FUNCTION_ERROR_INJECTION=y
@@ -51,6 +52,7 @@ CONFIG_IPV6_TUNNEL=y
CONFIG_KEYS=y
CONFIG_LIRC=y
CONFIG_LWTUNNEL=y
+CONFIG_MEMCG=y
CONFIG_MODULE_SIG=y
CONFIG_MODULE_SRCVERSION_ALL=y
CONFIG_MODULE_UNLOAD=y
@@ -114,6 +116,7 @@ CONFIG_SECURITY=y
CONFIG_SECURITYFS=y
CONFIG_SYN_COOKIES=y
CONFIG_TEST_BPF=m
+CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_UDMABUF=y
CONFIG_USERFAULTFD=y
CONFIG_VSOCKETS=y
diff --git a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
new file mode 100644
index 000000000000..b14f57040654
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <math.h>
+#include <sys/mman.h>
+#include <test_progs.h>
+#include "test_thp_adjust.skel.h"
+
+#define LEN (16 * 1024 * 1024) /* 16MB */
+#define THP_ENABLED_FILE "/sys/kernel/mm/transparent_hugepage/enabled"
+#define PMD_SIZE_FILE "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
+
+static struct test_thp_adjust *skel;
+static char old_mode[32];
+static long pagesize;
+
+static int thp_mode_save(void)
+{
+ const char *start, *end;
+ char buf[128];
+ int fd, err;
+ size_t len;
+
+ fd = open(THP_ENABLED_FILE, O_RDONLY);
+ if (fd == -1)
+ return -1;
+
+ err = read(fd, buf, sizeof(buf) - 1);
+ if (err == -1)
+ goto close;
+
+ start = strchr(buf, '[');
+ end = start ? strchr(start, ']') : NULL;
+ if (!start || !end || end <= start) {
+ err = -1;
+ goto close;
+ }
+
+ len = end - start - 1;
+ if (len >= sizeof(old_mode))
+ len = sizeof(old_mode) - 1;
+ strncpy(old_mode, start + 1, len);
+ old_mode[len] = '\0';
+
+close:
+ close(fd);
+ return err;
+}
+
+static int thp_mode_set(const char *desired_mode)
+{
+ int fd, err;
+
+ fd = open(THP_ENABLED_FILE, O_RDWR);
+ if (fd == -1)
+ return -1;
+
+ err = write(fd, desired_mode, strlen(desired_mode));
+ close(fd);
+ return err;
+}
+
+static int thp_mode_reset(void)
+{
+ int fd, err;
+
+ fd = open(THP_ENABLED_FILE, O_WRONLY);
+ if (fd == -1)
+ return -1;
+
+ err = write(fd, old_mode, strlen(old_mode));
+ close(fd);
+ return err;
+}
+
+static char *thp_alloc(void)
+{
+ char *addr;
+ int err, i;
+
+ addr = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (addr == MAP_FAILED)
+ return NULL;
+
+ err = madvise(addr, LEN, MADV_HUGEPAGE);
+ if (err == -1)
+ goto unmap;
+
+ /* Accessing a single byte within a page is sufficient to trigger a page fault. */
+ for (i = 0; i < LEN; i += pagesize)
+ addr[i] = 1;
+ return addr;
+
+unmap:
+ munmap(addr, LEN);
+ return NULL;
+}
+
+static void thp_free(char *ptr)
+{
+ munmap(ptr, LEN);
+}
+
+static int get_pmd_order(void)
+{
+ ssize_t bytes_read, size;
+ int fd, order, ret = -1;
+ char buf[64], *endptr;
+
+ fd = open(PMD_SIZE_FILE, O_RDONLY);
+ if (fd < 0)
+ return -1;
+
+ bytes_read = read(fd, buf, sizeof(buf) - 1);
+ if (bytes_read <= 0)
+ goto close_fd;
+
+ /* Remove potential newline character */
+ if (buf[bytes_read - 1] == '\n')
+ buf[bytes_read - 1] = '\0';
+
+ size = strtoul(buf, &endptr, 10);
+ if (endptr == buf || *endptr != '\0')
+ goto close_fd;
+ if (size % pagesize != 0)
+ goto close_fd;
+ ret = size / pagesize;
+ if ((ret & (ret - 1)) == 0) {
+ order = 0;
+ while (ret > 1) {
+ ret >>= 1;
+ order++;
+ }
+ ret = order;
+ }
+
+close_fd:
+ close(fd);
+ return ret;
+}
+
+static int get_thp_eligible(pid_t pid, unsigned long addr)
+{
+ int this_vma = 0, eligible = -1;
+ unsigned long start, end;
+ char smaps_path[64];
+ FILE *smaps_file;
+ char line[4096];
+
+ snprintf(smaps_path, sizeof(smaps_path), "/proc/%d/smaps", pid);
+ smaps_file = fopen(smaps_path, "r");
+ if (!smaps_file)
+ return -1;
+
+ while (fgets(line, sizeof(line), smaps_file)) {
+ if (sscanf(line, "%lx-%lx", &start, &end) == 2) {
+ /* addr is monotonic */
+ if (addr < start)
+ break;
+ this_vma = (addr >= start && addr < end) ? 1 : 0;
+ continue;
+ }
+
+ if (!this_vma)
+ continue;
+
+ if (strstr(line, "THPeligible:")) {
+ sscanf(line, "THPeligible: %d", &eligible);
+ break;
+ }
+ }
+
+ fclose(smaps_file);
+ return eligible;
+}
+
+static void subtest_thp_eligible(void)
+{
+ struct bpf_link *ops_link;
+ int eligible;
+ pid_t pid;
+ char *ptr;
+
+ ops_link = bpf_map__attach_struct_ops(skel->maps.thp_eligible_ops);
+ if (!ASSERT_OK_PTR(ops_link, "attach struct_ops"))
+ return;
+
+ pid = getpid();
+ ptr = thp_alloc();
+ if (!ASSERT_OK_PTR(ptr, "THP alloc"))
+ goto detach;
+
+ skel->bss->pid_eligible = pid;
+ eligible = get_thp_eligible(pid, (unsigned long)ptr);
+ ASSERT_EQ(eligible, 1, "THPeligible");
+
+ skel->bss->pid_eligible = 0;
+ skel->bss->pid_not_eligible = pid;
+ eligible = get_thp_eligible(pid, (unsigned long)ptr);
+ ASSERT_EQ(eligible, 0, "THP not eligible");
+
+ skel->bss->pid_eligible = 0;
+ skel->bss->pid_not_eligible = 0;
+ eligible = get_thp_eligible(pid, (unsigned long)ptr);
+ ASSERT_EQ(eligible, 0, "THP not eligible");
+
+ thp_free(ptr);
+detach:
+ bpf_link__destroy(ops_link);
+}
+
+static int thp_adjust_setup(void)
+{
+ int err = -1, pmd_order;
+
+ pagesize = sysconf(_SC_PAGESIZE);
+ pmd_order = get_pmd_order();
+ if (!ASSERT_NEQ(pmd_order, -1, "get_pmd_order"))
+ return -1;
+
+ if (!ASSERT_NEQ(thp_mode_save(), -1, "THP mode save"))
+ return -1;
+ if (!ASSERT_GE(thp_mode_set("madvise"), 0, "THP mode set"))
+ return -1;
+
+ skel = test_thp_adjust__open();
+ if (!ASSERT_OK_PTR(skel, "open"))
+ goto thp_reset;
+
+ skel->bss->pmd_order = pmd_order;
+
+ err = test_thp_adjust__load(skel);
+ if (!ASSERT_OK(err, "load"))
+ goto destroy;
+ return 0;
+
+destroy:
+ test_thp_adjust__destroy(skel);
+thp_reset:
+ ASSERT_GE(thp_mode_reset(), 0, "THP mode reset");
+ return err;
+}
+
+static void thp_adjust_destroy(void)
+{
+ test_thp_adjust__destroy(skel);
+ ASSERT_GE(thp_mode_reset(), 0, "THP mode reset");
+}
+
+void test_thp_adjust(void)
+{
+ if (thp_adjust_setup() == -1)
+ return;
+
+ if (test__start_subtest("thp_eligible"))
+ subtest_thp_eligible();
+
+ thp_adjust_destroy();
+}
diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust.c b/tools/testing/selftests/bpf/progs/test_thp_adjust.c
new file mode 100644
index 000000000000..ed8c510693a0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_thp_adjust.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int pid_not_eligible, pid_eligible;
+int pmd_order;
+
+SEC("struct_ops/thp_get_order")
+int BPF_PROG(thp_eligible, struct vm_area_struct *vma, enum tva_type tva_type,
+ unsigned long orders)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int suggested_order = 0;
+ struct task_struct *p;
+
+ if (tva_type != TVA_SMAPS)
+ return 0;
+
+ if (!mm)
+ return 0;
+
+ /* This BPF hook is already under RCU */
+ p = mm->owner;
+ if (!p || (p->pid != pid_eligible && p->pid != pid_not_eligible))
+ return 0;
+
+ if (p->pid == pid_eligible)
+ suggested_order = pmd_order;
+ else
+ suggested_order = 30; /* invalid order */
+ return suggested_order;
+}
+
+SEC(".struct_ops.link")
+struct bpf_thp_ops thp_eligible_ops = {
+ .thp_get_order = (void *)thp_eligible,
+};
--
2.47.3
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v8 mm-new 10/12] selftests/bpf: add test case to update THP policy
2025-09-26 9:33 [PATCH v8 mm-new 00/12] mm, bpf: BPF based THP order selection Yafang Shao
` (8 preceding siblings ...)
2025-09-26 9:33 ` [PATCH v8 mm-new 09/12] selftests/bpf: add a simple BPF based THP policy Yafang Shao
@ 2025-09-26 9:33 ` Yafang Shao
2025-09-26 9:33 ` [PATCH v8 mm-new 11/12] selftests/bpf: add test cases for invalid thp_adjust usage Yafang Shao
2025-09-26 9:33 ` [PATCH v8 mm-new 12/12] Documentation: add BPF-based THP policy management Yafang Shao
11 siblings, 0 replies; 24+ messages in thread
From: Yafang Shao @ 2025-09-26 9:33 UTC (permalink / raw)
To: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, usamaarif642,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yafang Shao
This test case exercises the BPF THP update mechanism by modifying an
existing policy. The behavior confirms that:
- EBUSY error occurs when attempting to install a new BPF program while
another is active
- Updates to currently running programs are successfully processed
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
.../selftests/bpf/prog_tests/thp_adjust.c | 23 +++++++++++++++++++
.../selftests/bpf/progs/test_thp_adjust.c | 14 +++++++++++
2 files changed, 37 insertions(+)
diff --git a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
index b14f57040654..72b2ec31025a 100644
--- a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
+++ b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
@@ -208,6 +208,27 @@ static void subtest_thp_eligible(void)
bpf_link__destroy(ops_link);
}
+static void subtest_thp_policy_update(void)
+{
+ struct bpf_link *old_link, *new_link;
+ int err;
+
+ old_link = bpf_map__attach_struct_ops(skel->maps.swap_ops);
+ if (!ASSERT_OK_PTR(old_link, "attach_old_link"))
+ return;
+
+ new_link = bpf_map__attach_struct_ops(skel->maps.thp_eligible_ops);
+ if (!ASSERT_NULL(new_link, "attach_new_link"))
+ goto destroy_old;
+ ASSERT_EQ(errno, EBUSY, "attach_new_link");
+
+ err = bpf_link__update_map(old_link, skel->maps.thp_eligible_ops);
+ ASSERT_EQ(err, 0, "update_old_link");
+
+destroy_old:
+ bpf_link__destroy(old_link);
+}
+
static int thp_adjust_setup(void)
{
int err = -1, pmd_order;
@@ -253,6 +274,8 @@ void test_thp_adjust(void)
if (test__start_subtest("thp_eligible"))
subtest_thp_eligible();
+ if (test__start_subtest("policy_update"))
+ subtest_thp_policy_update();
thp_adjust_destroy();
}
diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust.c b/tools/testing/selftests/bpf/progs/test_thp_adjust.c
index ed8c510693a0..8f3bc4768edc 100644
--- a/tools/testing/selftests/bpf/progs/test_thp_adjust.c
+++ b/tools/testing/selftests/bpf/progs/test_thp_adjust.c
@@ -39,3 +39,17 @@ SEC(".struct_ops.link")
struct bpf_thp_ops thp_eligible_ops = {
.thp_get_order = (void *)thp_eligible,
};
+
+SEC("struct_ops/thp_get_order")
+int BPF_PROG(alloc_not_in_swap, struct vm_area_struct *vma, enum tva_type tva_type,
+ unsigned long orders)
+{
+ if (tva_type == TVA_SWAP_PAGEFAULT)
+ return 0;
+ return -1;
+}
+
+SEC(".struct_ops.link")
+struct bpf_thp_ops swap_ops = {
+ .thp_get_order = (void *)alloc_not_in_swap,
+};
--
2.47.3
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v8 mm-new 11/12] selftests/bpf: add test cases for invalid thp_adjust usage
2025-09-26 9:33 [PATCH v8 mm-new 00/12] mm, bpf: BPF based THP order selection Yafang Shao
` (9 preceding siblings ...)
2025-09-26 9:33 ` [PATCH v8 mm-new 10/12] selftests/bpf: add test case to update " Yafang Shao
@ 2025-09-26 9:33 ` Yafang Shao
2025-09-26 9:33 ` [PATCH v8 mm-new 12/12] Documentation: add BPF-based THP policy management Yafang Shao
11 siblings, 0 replies; 24+ messages in thread
From: Yafang Shao @ 2025-09-26 9:33 UTC (permalink / raw)
To: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, usamaarif642,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yafang Shao
Add test cases verifying that the verifier rejects the following invalid
usage:
1. The trusted vma->vm_mm pointer can be NULL and must be checked before
   being dereferenced.
2. The trusted mm->owner pointer can be NULL and must be checked before
   being dereferenced.
3. Sleepable programs are prohibited because the call site runs under RCU
   protection.
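For contrast with the failure cases above, the sketch below (illustrative
only, not part of this series; the program and map names are made up) shows
the pattern the verifier accepts: both vma->vm_mm and mm->owner are
NULL-checked before being dereferenced.

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char _license[] SEC("license") = "GPL";

  SEC("struct_ops/thp_get_order")
  int BPF_PROG(thp_null_checked, struct vm_area_struct *vma, enum tva_type tva_type,
               unsigned long orders)
  {
          struct mm_struct *mm = vma->vm_mm;
          struct task_struct *p;

          if (!mm)
                  return 0;

          p = mm->owner;
          if (!p)
                  return 0;

          /* Both pointers are known non-NULL here and may be dereferenced,
           * e.g. to look at p->comm. */
          return -1;
  }

  SEC(".struct_ops.link")
  struct bpf_thp_ops null_checked_ops = {
          .thp_get_order = (void *)thp_null_checked,
  };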
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
.../selftests/bpf/prog_tests/thp_adjust.c | 7 +++++
.../bpf/progs/test_thp_adjust_sleepable.c | 22 ++++++++++++++
.../bpf/progs/test_thp_adjust_trusted_owner.c | 30 +++++++++++++++++++
.../bpf/progs/test_thp_adjust_trusted_vma.c | 27 +++++++++++++++++
4 files changed, 86 insertions(+)
create mode 100644 tools/testing/selftests/bpf/progs/test_thp_adjust_sleepable.c
create mode 100644 tools/testing/selftests/bpf/progs/test_thp_adjust_trusted_owner.c
create mode 100644 tools/testing/selftests/bpf/progs/test_thp_adjust_trusted_vma.c
diff --git a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
index 72b2ec31025a..2e9864732c11 100644
--- a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
+++ b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
@@ -4,6 +4,9 @@
#include <sys/mman.h>
#include <test_progs.h>
#include "test_thp_adjust.skel.h"
+#include "test_thp_adjust_sleepable.skel.h"
+#include "test_thp_adjust_trusted_vma.skel.h"
+#include "test_thp_adjust_trusted_owner.skel.h"
#define LEN (16 * 1024 * 1024) /* 16MB */
#define THP_ENABLED_FILE "/sys/kernel/mm/transparent_hugepage/enabled"
@@ -278,4 +281,8 @@ void test_thp_adjust(void)
subtest_thp_policy_update();
thp_adjust_destroy();
+
+ RUN_TESTS(test_thp_adjust_trusted_vma);
+ RUN_TESTS(test_thp_adjust_trusted_owner);
+ RUN_TESTS(test_thp_adjust_sleepable);
}
diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust_sleepable.c b/tools/testing/selftests/bpf/progs/test_thp_adjust_sleepable.c
new file mode 100644
index 000000000000..4db78f2f0b2d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_thp_adjust_sleepable.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("struct_ops.s/thp_get_order")
+__failure __msg("attach to unsupported member thp_get_order of struct bpf_thp_ops")
+int BPF_PROG(thp_sleepable, struct vm_area_struct *vma, enum tva_type tva_type,
+ unsigned long orders)
+{
+ return -1;
+}
+
+SEC(".struct_ops.link")
+struct bpf_thp_ops vma_ops = {
+ .thp_get_order = (void *)thp_sleepable,
+};
diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust_trusted_owner.c b/tools/testing/selftests/bpf/progs/test_thp_adjust_trusted_owner.c
new file mode 100644
index 000000000000..88bb09cb7cc2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_thp_adjust_trusted_owner.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("struct_ops/thp_get_order")
+__failure __msg("R3 pointer arithmetic on rcu_ptr_or_null_ prohibited, null-check it first")
+int BPF_PROG(thp_trusted_owner, struct vm_area_struct *vma, enum tva_type tva_type,
+ unsigned long orders)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct task_struct *p;
+
+ if (!mm)
+ return 0;
+
+ p = mm->owner;
+ bpf_printk("The task name is %s\n", p->comm);
+ return -1;
+}
+
+SEC(".struct_ops.link")
+struct bpf_thp_ops vma_ops = {
+ .thp_get_order = (void *)thp_trusted_owner,
+};
diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust_trusted_vma.c b/tools/testing/selftests/bpf/progs/test_thp_adjust_trusted_vma.c
new file mode 100644
index 000000000000..df7b0c160153
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_thp_adjust_trusted_vma.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+SEC("struct_ops/thp_get_order")
+__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'")
+int BPF_PROG(thp_trusted_vma, struct vm_area_struct *vma, enum tva_type tva_type,
+ unsigned long orders)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct task_struct *p = mm->owner;
+
+ if (!p)
+ return 0;
+ return -1;
+}
+
+SEC(".struct_ops.link")
+struct bpf_thp_ops vma_ops = {
+ .thp_get_order = (void *)thp_trusted_vma,
+};
--
2.47.3
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v8 mm-new 12/12] Documentation: add BPF-based THP policy management
2025-09-26 9:33 [PATCH v8 mm-new 00/12] mm, bpf: BPF based THP order selection Yafang Shao
` (10 preceding siblings ...)
2025-09-26 9:33 ` [PATCH v8 mm-new 11/12] selftests/bpf: add test cases for invalid thp_adjust usage Yafang Shao
@ 2025-09-26 9:33 ` Yafang Shao
11 siblings, 0 replies; 24+ messages in thread
From: Yafang Shao @ 2025-09-26 9:33 UTC (permalink / raw)
To: akpm, david, ziy, baolin.wang, lorenzo.stoakes, Liam.Howlett,
npache, ryan.roberts, dev.jain, hannes, usamaarif642,
gutierrez.asier, willy, ast, daniel, andrii, ameryhung, rientjes,
corbet, 21cnbao, shakeel.butt, tj, lance.yang
Cc: bpf, linux-mm, linux-doc, linux-kernel, Yafang Shao
Add admin-guide documentation for BPF-based THP policy management.
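As a quick orientation for readers of the new document, the sketch below
shows a minimal program implementing the documented thp_get_order interface.
It is illustrative only: the program and map names are made up, and a real
policy would typically also inspect @vma and @type rather than suggesting
an order unconditionally.

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char _license[] SEC("license") = "GPL";

  SEC("struct_ops/thp_get_order")
  int BPF_PROG(highest_order, struct vm_area_struct *vma, enum tva_type type,
               unsigned long orders)
  {
          int order;

          /* Suggest the highest order offered in the @orders bitmask. */
          for (order = 63; order >= 0; order--)
                  if (orders & (1UL << order))
                          return order;
          return 0;
  }

  SEC(".struct_ops.link")
  struct bpf_thp_ops highest_order_ops = {
          .thp_get_order = (void *)highest_order,
  };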
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
Documentation/admin-guide/mm/transhuge.rst | 39 ++++++++++++++++++++++
1 file changed, 39 insertions(+)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 1654211cc6cf..fa03bcdb8854 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -738,3 +738,42 @@ support enabled just fine as always. No difference can be noted in
hugetlbfs other than there will be less overall fragmentation. All
usual features belonging to hugetlbfs are preserved and
unaffected. libhugetlbfs will also work fine as usual.
+
+BPF-based THP adjustment
+========================
+
+Overview
+--------
+
+When the system is configured with "always" or "madvise" THP mode, a BPF program
+can be used to adjust THP allocation policies dynamically. This enables
+fine-grained control over THP decisions based on various factors including
+workload identity, allocation context, and system memory pressure.
+
+Program Interface
+-----------------
+
+This feature is implemented as a struct_ops BPF program with the following interface::
+
+ int thp_get_order(struct vm_area_struct *vma,
+ enum tva_type type,
+ unsigned long orders);
+
+Parameters::
+
+ @vma: vm_area_struct associated with the THP allocation
+ @type: TVA type for current @vma
+ @orders: Bitmask of available THP orders for this allocation
+
+Return value::
+
+ The THP order suggested by the BPF program for this allocation. It must
+ be a valid, available order.
+
+Implementation Notes
+--------------------
+
+This is currently an experimental feature.
+CONFIG_BPF_THP_GET_ORDER_EXPERIMENTAL must be enabled to use it.
+Only one BPF program can be attached at a time, but the program can be updated
+dynamically to adjust policies without requiring affected tasks to be restarted.
--
2.47.3
^ permalink raw reply [flat|nested] 24+ messages in thread