From: Yafang Shao <laoar.shao@gmail.com>
To: roman.gushchin@linux.dev, inwardvessel@gmail.com,
	shakeel.butt@linux.dev, akpm@linux-foundation.org,
	ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org,
	mkoutny@suse.com, yu.c.chen@intel.com, zhao1.liu@intel.com
Cc: bpf@vger.kernel.org, linux-mm@kvack.org,
	Yafang Shao <laoar.shao@gmail.com>
Subject: [RFC PATCH bpf-next 3/3] mm: set numa balancing hot threshold with bpf
Date: Tue, 13 Jan 2026 20:12:38 +0800
Message-ID: <20260113121238.11300-4-laoar.shao@gmail.com>
In-Reply-To: <20260113121238.11300-1-laoar.shao@gmail.com>

Our experimentation with NUMA balancing across our server fleet shows
that different workloads require distinct hot thresholds: tuning the
threshold per workload lets us migrate as many cross-NUMA pages as
possible while avoiding significant latency impact on sensitive
workloads. Allow a BPF struct_ops program to override the global hot
threshold on a per-memcg basis.
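
For illustration, a minimal BPF-side sketch of such a policy (the
struct and field names come from this series; the SEC() and BPF_PROG()
conventions are standard libbpf struct_ops, and the cgroup id and
threshold value below are made-up examples):

  /* numab_policy.bpf.c -- hypothetical usage example */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char _license[] SEC("license") = "GPL";

  SEC("struct_ops/numab_hook")
  int BPF_PROG(numab_hook, struct task_struct *p)
  {
  	return 1;	/* assumed: nonzero keeps NUMA balancing enabled */
  }

  SEC(".struct_ops.link")
  struct bpf_numab_ops numab_policy = {
  	.numab_hook = (void *)numab_hook,
  	.cgroup_id  = 128,	/* made-up id of the target memcg */
  	.hot_thresh = 500,	/* assumed ms, like hot_threshold_ms */
  };

The map can then be attached with bpf_map__attach_struct_ops() or
"bpftool struct_ops register" as usual.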

Other per-workload NUMA balancing parameters, such as scan_size_mb
under /sys/kernel/debug/sched/numa_balancing/, can be exposed through
BPF in the same way (a hypothetical sketch follows below). That can be
implemented later if the core approach proves acceptable.
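
As a rough sketch of that extension (hypothetical code, not part of
this series: it mirrors bpf_numab_hot_thresh() in this patch and
assumes a new scan_size_mb field in struct bpf_numab_ops):

  unsigned int bpf_numab_scan_size(struct task_struct *p)
  {
  	/* Fall back to the global debugfs value by default. */
  	unsigned int ret = sysctl_numa_balancing_scan_size;
  	struct bpf_numab_ops *bpf_numab;
  	struct mem_cgroup *task_memcg;

  	if (unlikely(!p->mm))
  		return ret;

  	rcu_read_lock();
  	task_memcg = mem_cgroup_from_task(rcu_dereference(p->mm->owner));
  	if (task_memcg) {
  		bpf_numab = rcu_dereference(task_memcg->bpf_numab);
  		/* 0 means "not set by BPF"; keep the global value. */
  		if (bpf_numab && bpf_numab->scan_size_mb)
  			ret = bpf_numab->scan_size_mb;
  	}
  	rcu_read_unlock();
  	return ret;
  }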

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/linux/sched/numa_balancing.h |  9 +++++++++
 kernel/sched/fair.c                  |  2 +-
 kernel/sched/sched.h                 |  1 -
 mm/bpf_numa_balancing.c              | 28 ++++++++++++++++++++++++++++
 4 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index c58d32ab39a7..bbf5b884aa47 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -36,7 +36,9 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
 
 extern struct static_key_false sched_numa_balancing;
 extern struct static_key_false bpf_numab_enabled_key;
+extern unsigned int sysctl_numa_balancing_hot_threshold;
 int bpf_numab_hook(struct task_struct *p);
+unsigned int bpf_numab_hot_thresh(struct task_struct *p);
 static inline bool task_numab_enabled(struct task_struct *p)
 {
 	if (static_branch_unlikely(&sched_numa_balancing))
@@ -63,6 +65,13 @@ static inline bool task_numab_mode_tiering(void)
 		return true;
 	return false;
 }
+
+static inline unsigned int task_numab_hot_thresh(struct task_struct *p)
+{
+	if (!static_branch_unlikely(&bpf_numab_enabled_key))
+		return sysctl_numa_balancing_hot_threshold;
+	return bpf_numab_hot_thresh(p);
+}
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4f6583ef83b2..d51ddd46f4be 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1917,7 +1917,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
 			return true;
 		}
 
-		def_th = sysctl_numa_balancing_hot_threshold;
+		def_th = task_numab_hot_thresh(p);
 		rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit);
 		numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1247e4b0c2b0..d72eaa472d7d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2961,7 +2961,6 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
 extern unsigned int sysctl_numa_balancing_scan_period_min;
 extern unsigned int sysctl_numa_balancing_scan_period_max;
 extern unsigned int sysctl_numa_balancing_scan_size;
-extern unsigned int sysctl_numa_balancing_hot_threshold;
 
 #ifdef CONFIG_SCHED_HRTICK
 
diff --git a/mm/bpf_numa_balancing.c b/mm/bpf_numa_balancing.c
index aac4eec7c6ba..26e80434f337 100644
--- a/mm/bpf_numa_balancing.c
+++ b/mm/bpf_numa_balancing.c
@@ -9,6 +9,7 @@ typedef int numab_fn_t(struct task_struct *p);
 
 struct bpf_numab_ops {
 	numab_fn_t *numab_hook;
+	unsigned int hot_thresh;
 
 	/* TODO:
 	 * The cgroup_id embedded in this struct is set at compile time
@@ -52,6 +53,30 @@ int bpf_numab_hook(struct task_struct *p)
 	return ret;
 }
 
+unsigned int bpf_numab_hot_thresh(struct task_struct *p)
+{
+	unsigned int ret = sysctl_numa_balancing_hot_threshold;
+	struct bpf_numab_ops *bpf_numab;
+	struct mem_cgroup *task_memcg;
+
+	if (unlikely(!p->mm))
+		return ret;
+
+	rcu_read_lock();
+	task_memcg = mem_cgroup_from_task(rcu_dereference(p->mm->owner));
+	if (!task_memcg)
+		goto out;
+
+	bpf_numab = rcu_dereference(task_memcg->bpf_numab);
+	if (!bpf_numab || !bpf_numab->hot_thresh)
+		goto out;
+
+	ret = bpf_numab->hot_thresh;
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
 static const struct bpf_func_proto *
 bpf_numab_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -105,6 +130,9 @@ static int bpf_numab_init_member(const struct btf_type *t,
 		 */
 		kbpf_numab->cgroup_id = ubpf_numab->cgroup_id;
 		return 1;
+	case offsetof(struct bpf_numab_ops, hot_thresh):
+		kbpf_numab->hot_thresh = ubpf_numab->hot_thresh;
+		return 1;
 	}
 	return 0;
 }
-- 
2.43.5



Thread overview: 8+ messages
2026-01-13 12:12 [RFC PATCH bpf-next 0/3] BPF-based NUMA balancing Yafang Shao
2026-01-13 12:12 ` [RFC PATCH bpf-next 1/3] sched: add helpers for numa balancing Yafang Shao
2026-01-13 12:42   ` bot+bpf-ci
2026-01-13 12:48     ` Yafang Shao
2026-01-13 12:12 ` [RFC PATCH bpf-next 2/3] mm: add support for bpf based " Yafang Shao
2026-01-13 12:29   ` bot+bpf-ci
2026-01-13 12:46     ` Yafang Shao
2026-01-13 12:12 ` Yafang Shao [this message]
