From: Yafang Shao <laoar.shao@gmail.com>
To: roman.gushchin@linux.dev, inwardvessel@gmail.com,
	shakeel.butt@linux.dev, akpm@linux-foundation.org,
	ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org,
	mkoutny@suse.com, yu.c.chen@intel.com, zhao1.liu@intel.com
Cc: bpf@vger.kernel.org, linux-mm@kvack.org,
	Yafang Shao <laoar.shao@gmail.com>
Subject: [RFC PATCH bpf-next 2/3] mm: add support for bpf based numa balancing
Date: Tue, 13 Jan 2026 20:12:37 +0800
Message-ID: <20260113121238.11300-3-laoar.shao@gmail.com>
In-Reply-To: <20260113121238.11300-1-laoar.shao@gmail.com>

bpf_numab_ops enables NUMA balancing for tasks within a specific memcg,
even when global NUMA balancing is disabled. This allows selective NUMA
optimization for workloads that benefit from it, while avoiding potential
latency spikes for other workloads.
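
For illustration, the BPF side could look roughly like the sketch below. This
is a minimal, untested example: the file name, program name, and the
hard-coded cgroup_id value are placeholders and not part of this series.
Since bpf_numab_reg() only supports link mode, the map is placed in the
".struct_ops.link" section and would be attached with
bpf_map__attach_struct_ops() (or an equivalent bpftool invocation):

  /* numab_policy.bpf.c - hypothetical example */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char _license[] SEC("license") = "GPL";

  /* Return nonzero to enable NUMA balancing for @p, 0 to leave it off. */
  SEC("struct_ops/numab_hook")
  int BPF_PROG(numab_enable, struct task_struct *p)
  {
          return 1;
  }

  SEC(".struct_ops.link")
  struct bpf_numab_ops numab_ops = {
          .numab_hook = (void *)numab_enable,
          /* Inode number of the target leaf memcg (placeholder value). */
          .cgroup_id = 12345,
  };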

The policy must be attached to a leaf memory cgroup, so the kernel never has
to walk the cgroup hierarchy or propagate policies during registration. If
the per-task lookup ever becomes a performance bottleneck, memcg::bpf_numab
can be cached in the mm_struct of the tasks in that memcg.

The cgroup ID is currently embedded in bpf_numab_ops as a compile-time
constant, so a given struct_ops instance is tied to one cgroup and cannot be
attached to multiple cgroups. Roman is working on attach-time configuration
of struct_ops members; once that lands we can migrate to the new approach
and drop this limitation.
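
The cgroup ID is resolved in the kernel via mem_cgroup_get_from_ino(), i.e.
it is expected to be the inode number of the target cgroup directory. A
hypothetical userspace snippet (not part of this series) to look up the value
to embed could be:

  /* memcg_ino.c - print the inode number of a cgroup directory */
  #include <stdio.h>
  #include <sys/stat.h>

  int main(int argc, char **argv)
  {
          struct stat st;

          /* e.g. /sys/fs/cgroup/<path/to/leaf/memcg> */
          if (argc < 2 || stat(argv[1], &st))
                  return 1;
          printf("%llu\n", (unsigned long long)st.st_ino);
          return 0;
  }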

Currently only the NUMA_BALANCING_NORMAL mode is supported.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 MAINTAINERS                          |   1 +
 include/linux/memcontrol.h           |   6 +
 include/linux/sched/numa_balancing.h |  10 +-
 mm/Makefile                          |   5 +
 mm/bpf_numa_balancing.c              | 224 +++++++++++++++++++++++++++
 5 files changed, 245 insertions(+), 1 deletion(-)
 create mode 100644 mm/bpf_numa_balancing.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 70c2b73b3941..0d2c083557e0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4807,6 +4807,7 @@ L:	bpf@vger.kernel.org
 L:	linux-mm@kvack.org
 S:	Maintained
 F:	mm/bpf_memcontrol.c
+F:	mm/bpf_numa_balancing.c
 
 BPF [MISC]
 L:	bpf@vger.kernel.org
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 229ac9835adb..b02e8f380275 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -29,6 +29,7 @@ struct obj_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
+struct bpf_numab_ops;
 
 /* Cgroup-specific page state, on top of universal node page state */
 enum memcg_stat_item {
@@ -284,6 +285,11 @@ struct mem_cgroup {
 	struct lru_gen_mm_list mm_list;
 #endif
 
+#ifdef CONFIG_BPF
+	/* per cgroup NUMA balancing control */
+	struct bpf_numab_ops __rcu *bpf_numab;
+#endif
+
 #ifdef CONFIG_MEMCG_V1
 	/* Legacy consumer-oriented counters */
 	struct page_counter kmem;		/* v1 only */
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 792b6665f476..c58d32ab39a7 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -35,17 +35,25 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
 				int src_nid, int dst_cpu);
 
 extern struct static_key_false sched_numa_balancing;
+extern struct static_key_false bpf_numab_enabled_key;
+int bpf_numab_hook(struct task_struct *p);
 static inline bool task_numab_enabled(struct task_struct *p)
 {
 	if (static_branch_unlikely(&sched_numa_balancing))
 		return true;
-	return false;
+	if (!static_branch_unlikely(&bpf_numab_enabled_key))
+		return false;
+
+	/* A BPF prog is attached. */
+	return bpf_numab_hook(p);
 }
 
 static inline bool task_numab_mode_normal(void)
 {
 	if (sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL)
 		return true;
+	if (static_branch_unlikely(&bpf_numab_enabled_key))
+		return true;
 	return false;
 }
 
diff --git a/mm/Makefile b/mm/Makefile
index bf46fe31dc14..c2b887491f09 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -107,8 +107,13 @@ ifdef CONFIG_SWAP
 obj-$(CONFIG_MEMCG) += swap_cgroup.o
 endif
 ifdef CONFIG_BPF_SYSCALL
 obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
 endif
+ifdef CONFIG_BPF_SYSCALL
+ifdef CONFIG_NUMA_BALANCING
+obj-$(CONFIG_MEMCG) += bpf_numa_balancing.o
+endif
+endif
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_GUP_TEST) += gup_test.o
 obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o
diff --git a/mm/bpf_numa_balancing.c b/mm/bpf_numa_balancing.c
new file mode 100644
index 000000000000..aac4eec7c6ba
--- /dev/null
+++ b/mm/bpf_numa_balancing.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/memcontrol.h>
+#include <linux/sched/numa_balancing.h>
+
+typedef int numab_fn_t(struct task_struct *p);
+
+struct bpf_numab_ops {
+	numab_fn_t *numab_hook;
+
+	/* TODO:
+	 * The cgroup_id embedded in this struct is fixed at compile time
+	 * and cannot currently be changed when the BPF program is attached.
+	 * Supporting attach-time configuration requires libbpf support,
+	 * which Roman is currently working on.
+	 */
+	int cgroup_id;
+};
+
+static DEFINE_SPINLOCK(numab_ops_lock);
+DEFINE_STATIC_KEY_FALSE(bpf_numab_enabled_key);
+
+int bpf_numab_hook(struct task_struct *p)
+{
+	struct bpf_numab_ops *bpf_numab;
+	struct mem_cgroup *task_memcg;
+	int ret = 0;
+
+	if (!p->mm)
+		return 0;
+
+	/* We can cache memcg::bpf_numab in mm::bpf_numab if this lookup becomes a bottleneck. */
+	rcu_read_lock();
+	task_memcg = mem_cgroup_from_task(rcu_dereference(p->mm->owner));
+	if (!task_memcg)
+		goto out;
+
+	/* Users can install BPF NUMA policies on leaf memory cgroups.
+	 * This eliminates the need to traverse the cgroup hierarchy or
+	 * propagate policies during registration, simplifying the kernel design.
+	 */
+	bpf_numab = rcu_dereference(task_memcg->bpf_numab);
+	if (!bpf_numab || !bpf_numab->numab_hook)
+		goto out;
+
+	ret = bpf_numab->numab_hook(p);
+
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+static const struct bpf_func_proto *
+bpf_numab_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return bpf_base_func_proto(func_id, prog);
+}
+
+static bool bpf_numab_ops_is_valid_access(int off, int size,
+					  enum bpf_access_type type,
+					  const struct bpf_prog *prog,
+					  struct bpf_insn_access_aux *info)
+{
+	return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static const struct bpf_verifier_ops bpf_numab_verifier_ops = {
+	.get_func_proto = bpf_numab_get_func_proto,
+	.is_valid_access = bpf_numab_ops_is_valid_access,
+};
+
+static int bpf_numab_init(struct btf *btf)
+{
+	return 0;
+}
+
+static int bpf_numab_check_member(const struct btf_type *t,
+				  const struct btf_member *member,
+				  const struct bpf_prog *prog)
+{
+	/* The call site operates under RCU protection. */
+	if (prog->sleepable)
+		return -EINVAL;
+	return 0;
+}
+
+static int bpf_numab_init_member(const struct btf_type *t,
+			       const struct btf_member *member,
+			       void *kdata, const void *udata)
+{
+	const struct bpf_numab_ops *ubpf_numab;
+	struct bpf_numab_ops *kbpf_numab;
+	u32 moff;
+
+	ubpf_numab = (const struct bpf_numab_ops *)udata;
+	kbpf_numab = (struct bpf_numab_ops *)kdata;
+
+	moff = __btf_member_bit_offset(t, member) / 8;
+	switch (moff) {
+	case offsetof(struct bpf_numab_ops, cgroup_id):
+		/* bpf_struct_ops only handles func ptrs and zero-ed members.
+		 * Return 1 to bypass the default handler.
+		 */
+		kbpf_numab->cgroup_id = ubpf_numab->cgroup_id;
+		return 1;
+	}
+	return 0;
+}
+
+static int bpf_numab_reg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_numab_ops *ops = kdata;
+	struct mem_cgroup *memcg;
+	int err = 0;
+
+	/* Only the link mode is supported. */
+	if (!link)
+		return -EOPNOTSUPP;
+
+	/* mem_cgroup_get_from_ino() is only available with CONFIG_SHRINKER_DEBUG. */
+	memcg = mem_cgroup_get_from_ino(ops->cgroup_id);
+	if (!memcg || IS_ERR(memcg))
+		return -ENOENT;
+
+	spin_lock(&numab_ops_lock);
+	/* Each memory cgroup can have at most one attached BPF program to ensure
+	 * exclusive control and avoid interference between different BPF policies.
+	 */
+	if (rcu_access_pointer(memcg->bpf_numab))
+		err = -EBUSY;
+	else
+		rcu_assign_pointer(memcg->bpf_numab, ops);
+	spin_unlock(&numab_ops_lock);
+	/* static_branch_inc() may sleep, so call it after dropping the spinlock. */
+	if (!err)
+		static_branch_inc(&bpf_numab_enabled_key);
+
+	mem_cgroup_put(memcg);
+	return err;
+}
+
+static void bpf_numab_unreg(void *kdata, struct bpf_link *link)
+{
+	struct bpf_numab_ops *ops = kdata;
+	struct mem_cgroup *memcg;
+
+	memcg = mem_cgroup_get_from_ino(ops->cgroup_id);
+	if (!memcg || IS_ERR(memcg))
+		return;
+
+	spin_lock(&numab_ops_lock);
+	if (rcu_replace_pointer(memcg->bpf_numab, NULL, lockdep_is_held(&numab_ops_lock))) {
+		spin_unlock(&numab_ops_lock);
+		static_branch_dec(&bpf_numab_enabled_key);
+		synchronize_rcu();
+	} else {
+		spin_unlock(&numab_ops_lock);
+	}
+	mem_cgroup_put(memcg);
+}
+
+static int bpf_numab_update(void *kdata, void *old_kdata, struct bpf_link *link)
+{
+	struct bpf_numab_ops *ops = kdata;
+	struct mem_cgroup *memcg;
+
+	memcg = mem_cgroup_get_from_ino(ops->cgroup_id);
+	if (!memcg || IS_ERR(memcg))
+		return -EINVAL;
+
+	spin_lock(&numab_ops_lock);
+	/* The update may proceed whether or not memcg->bpf_numab was previously set. */
+	rcu_replace_pointer(memcg->bpf_numab, ops, lockdep_is_held(&numab_ops_lock));
+	spin_unlock(&numab_ops_lock);
+	synchronize_rcu();
+	mem_cgroup_put(memcg);
+	return 0;
+}
+
+static int bpf_numab_validate(void *kdata)
+{
+	struct bpf_numab_ops *ops = kdata;
+
+	if (!ops->numab_hook) {
+		pr_err("bpf_numab: required op numab_hook isn't implemented\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int bpf_numa_balancing(struct task_struct *p)
+{
+	return 1;
+}
+
+static struct bpf_numab_ops __bpf_numab_ops = {
+	.numab_hook = (numab_fn_t *)bpf_numa_balancing,
+};
+
+static struct bpf_struct_ops bpf_bpf_numab_ops = {
+	.verifier_ops = &bpf_numab_verifier_ops,
+	.init = bpf_numab_init,
+	.check_member = bpf_numab_check_member,
+	.init_member = bpf_numab_init_member,
+	.reg = bpf_numab_reg,
+	.unreg = bpf_numab_unreg,
+	.update = bpf_numab_update,
+	.validate = bpf_numab_validate,
+	.cfi_stubs = &__bpf_numab_ops,
+	.owner = THIS_MODULE,
+	.name = "bpf_numab_ops",
+};
+
+static int __init bpf_numab_ops_init(void)
+{
+	int err;
+
+	err = register_bpf_struct_ops(&bpf_bpf_numab_ops, bpf_numab_ops);
+	if (err)
+		pr_err("bpf_numab: Failed to register struct_ops (%d)\n", err);
+	return err;
+}
+late_initcall(bpf_numab_ops_init);
-- 
2.43.5


