From: Yafang Shao <laoar.shao@gmail.com>
To: roman.gushchin@linux.dev, inwardvessel@gmail.com,
shakeel.butt@linux.dev, akpm@linux-foundation.org,
ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org,
mkoutny@suse.com, yu.c.chen@intel.com, zhao1.liu@intel.com
Cc: bpf@vger.kernel.org, linux-mm@kvack.org,
Yafang Shao <laoar.shao@gmail.com>
Subject: [RFC PATCH bpf-next 3/3] mm: set numa balancing hot threshold with bpf
Date: Tue, 13 Jan 2026 20:12:38 +0800 [thread overview]
Message-ID: <20260113121238.11300-4-laoar.shao@gmail.com> (raw)
In-Reply-To: <20260113121238.11300-1-laoar.shao@gmail.com>
Our experimentation with NUMA balancing across our server fleet
revealed that different workloads require distinct hot thresholds.
Tuning the hot threshold per workload allows migrating the maximum number
of cross-NUMA pages while avoiding significant latency impact on sensitive
workloads.
Other per-workload NUMA balancing parameters, such as scan_size_mb in
/sys/kernel/debug/sched/numa_balancing/, could also be made configurable
via BPF. That work can be done later if the core approach proves acceptable.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
include/linux/sched/numa_balancing.h | 9 +++++++++
kernel/sched/fair.c | 2 +-
kernel/sched/sched.h | 1 -
mm/bpf_numa_balancing.c | 28 ++++++++++++++++++++++++++++
4 files changed, 38 insertions(+), 2 deletions(-)
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index c58d32ab39a7..bbf5b884aa47 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -36,7 +36,9 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
extern struct static_key_false sched_numa_balancing;
extern struct static_key_false bpf_numab_enabled_key;
+extern unsigned int sysctl_numa_balancing_hot_threshold;
int bpf_numab_hook(struct task_struct *p);
+unsigned int bpf_numab_hot_thresh(struct task_struct *p);
static inline bool task_numab_enabled(struct task_struct *p)
{
if (static_branch_unlikely(&sched_numa_balancing))
@@ -63,6 +65,13 @@ static inline bool task_numab_mode_tiering(void)
return true;
return false;
}
+
+static inline unsigned int task_numab_hot_thresh(struct task_struct *p)
+{
+ if (!static_branch_unlikely(&bpf_numab_enabled_key))
+ return sysctl_numa_balancing_hot_threshold;
+ return bpf_numab_hot_thresh(p);
+}
#else
static inline void task_numa_fault(int last_node, int node, int pages,
int flags)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4f6583ef83b2..d51ddd46f4be 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1917,7 +1917,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
return true;
}
- def_th = sysctl_numa_balancing_hot_threshold;
+ def_th = task_numab_hot_thresh(p);
rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit);
numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1247e4b0c2b0..d72eaa472d7d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2961,7 +2961,6 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
-extern unsigned int sysctl_numa_balancing_hot_threshold;
#ifdef CONFIG_SCHED_HRTICK
diff --git a/mm/bpf_numa_balancing.c b/mm/bpf_numa_balancing.c
index aac4eec7c6ba..26e80434f337 100644
--- a/mm/bpf_numa_balancing.c
+++ b/mm/bpf_numa_balancing.c
@@ -9,6 +9,7 @@ typedef int numab_fn_t(struct task_struct *p);
struct bpf_numab_ops {
numab_fn_t *numab_hook;
+ unsigned int hot_thresh;
/* TODO:
* The cgroup_id embedded in this struct is set at compile time
@@ -52,6 +53,30 @@ int bpf_numab_hook(struct task_struct *p)
return ret;
}
+unsigned int bpf_numab_hot_thresh(struct task_struct *p)
+{
+ unsigned int ret = sysctl_numa_balancing_hot_threshold;
+ struct bpf_numab_ops *bpf_numab;
+ struct mem_cgroup *task_memcg;
+
+ if (unlikely(!p->mm))
+ return ret;
+
+ rcu_read_lock();
+ task_memcg = mem_cgroup_from_task(rcu_dereference(p->mm->owner));
+ if (!task_memcg)
+ goto out;
+
+ bpf_numab = rcu_dereference(task_memcg->bpf_numab);
+ if (!bpf_numab || !bpf_numab->hot_thresh)
+ goto out;
+
+ ret = bpf_numab->hot_thresh;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
static const struct bpf_func_proto *
bpf_numab_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -105,6 +130,9 @@ static int bpf_numab_init_member(const struct btf_type *t,
*/
kbpf_numab->cgroup_id = ubpf_numab->cgroup_id;
return 1;
+ case offsetof(struct bpf_numab_ops, hot_thresh):
+ kbpf_numab->hot_thresh = ubpf_numab->hot_thresh;
+ return 1;
}
return 0;
}
--
2.43.5
prev parent reply other threads:[~2026-01-13 12:13 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-13 12:12 [RFC PATCH bpf-next 0/3] BPF-based NUMA balancing Yafang Shao
2026-01-13 12:12 ` [RFC PATCH bpf-next 1/3] sched: add helpers for numa balancing Yafang Shao
2026-01-13 12:42 ` bot+bpf-ci
2026-01-13 12:48 ` Yafang Shao
2026-01-13 12:12 ` [RFC PATCH bpf-next 2/3] mm: add support for bpf based " Yafang Shao
2026-01-13 12:29 ` bot+bpf-ci
2026-01-13 12:46 ` Yafang Shao
2026-01-13 12:12 ` Yafang Shao [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260113121238.11300-4-laoar.shao@gmail.com \
--to=laoar.shao@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=inwardvessel@gmail.com \
--cc=linux-mm@kvack.org \
--cc=mkoutny@suse.com \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=yu.c.chen@intel.com \
--cc=zhao1.liu@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox