From: Roman Gushchin <roman.gushchin@linux.dev>
To: bpf@vger.kernel.org
Cc: Michal Hocko <mhocko@suse.com>,
Alexei Starovoitov <ast@kernel.org>,
Matt Bobrowski <mattbobrowski@google.com>,
Shakeel Butt <shakeel.butt@linux.dev>,
JP Kobryn <inwardvessel@gmail.com>,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
Suren Baghdasaryan <surenb@google.com>,
Johannes Weiner <hannes@cmpxchg.org>,
Andrew Morton <akpm@linux-foundation.org>,
Roman Gushchin <roman.gushchin@linux.dev>
Subject: [PATCH bpf-next v3 12/17] bpf: selftests: BPF OOM struct ops test
Date: Mon, 26 Jan 2026 18:44:15 -0800
Message-ID: <20260127024421.494929-13-roman.gushchin@linux.dev>
In-Reply-To: <20260127024421.494929-1-roman.gushchin@linux.dev>

Implement a kselftest for the OOM handling functionality.

The OOM handling policy implemented in BPF kills all tasks
belonging to the biggest leaf cgroup that doesn't contain
unkillable tasks (tasks with oom_score_adj set to -1000).
Page cache size is excluded from the accounting.

The test creates a hierarchy of memory cgroups, triggers an
OOM at the top level, checks that the expected process is
killed and verifies the memcg's OOM statistics.

The same BPF OOM policy is attached both to a memory cgroup
and system-wide. In the first case the program does nothing
and returns 0 (OOM not handled), so it's executed a second
time at the system level, where it properly handles the OOM.
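
To run the test locally (a sketch; assumes the BPF selftests are
built as usual against a kernel with this series applied):

  $ cd tools/testing/selftests/bpf
  $ make
  $ sudo ./test_progs -t oom
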
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
---
.../selftests/bpf/prog_tests/test_oom.c | 256 ++++++++++++++++++
tools/testing/selftests/bpf/progs/test_oom.c | 111 ++++++++
2 files changed, 367 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/test_oom.c
create mode 100644 tools/testing/selftests/bpf/progs/test_oom.c
diff --git a/tools/testing/selftests/bpf/prog_tests/test_oom.c b/tools/testing/selftests/bpf/prog_tests/test_oom.c
new file mode 100644
index 000000000000..a1eadbe1ae83
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_oom.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <test_progs.h>
+#include <bpf/btf.h>
+#include <bpf/bpf.h>
+
+#include "cgroup_helpers.h"
+#include "test_oom.skel.h"
+
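+/*
+ * One entry per cgroup in the test hierarchy: the cgroup path, the
+ * amount of memory the task in it allocates (target), the memory.max
+ * limit, the task's oom_score_adj and whether the task is the
+ * expected OOM victim.
+ */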
+struct cgroup_desc {
+ const char *path;
+ int fd;
+ unsigned long long id;
+ int pid;
+ size_t target;
+ size_t max;
+ int oom_score_adj;
+ bool victim;
+};
+
+#define MB (1024 * 1024)
+#define OOM_SCORE_ADJ_MIN (-1000)
+#define OOM_SCORE_ADJ_MAX 1000
+
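+/*
+ * cg2 has the largest memory footprint (40M), but hosts an unkillable
+ * task (oom_score_adj == -1000), so the policy is expected to choose
+ * cg4 (30M), the biggest killable leaf cgroup, as the victim.
+ */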
+static struct cgroup_desc cgroups[] = {
+ { .path = "/oom_test", .max = 80 * MB},
+ { .path = "/oom_test/cg1", .target = 10 * MB,
+ .oom_score_adj = OOM_SCORE_ADJ_MAX },
+ { .path = "/oom_test/cg2", .target = 40 * MB,
+ .oom_score_adj = OOM_SCORE_ADJ_MIN },
+ { .path = "/oom_test/cg3" },
+ { .path = "/oom_test/cg3/cg4", .target = 30 * MB,
+ .victim = true },
+ { .path = "/oom_test/cg3/cg5", .target = 20 * MB },
+};
+
+static int spawn_task(struct cgroup_desc *desc)
+{
+ char *ptr;
+ int pid;
+
+ pid = fork();
+ if (pid < 0)
+ return pid;
+
+ if (pid > 0) {
+ /* parent */
+ desc->pid = pid;
+ return 0;
+ }
+
+ /* child */
+ if (desc->oom_score_adj) {
+ char buf[64];
+ int fd = open("/proc/self/oom_score_adj", O_WRONLY);
+
+ if (fd < 0)
+ exit(1);
+
+ snprintf(buf, sizeof(buf), "%d", desc->oom_score_adj);
+ write(fd, buf, strlen(buf));
+ close(fd);
+ }
+
+ ptr = (char *)malloc(desc->target);
+ if (!ptr)
+ exit(1);
+
+ memset(ptr, 'a', desc->target);
+
+ while (1)
+ sleep(1000);
+
+ return 0;
+}
+
+static void setup_environment(void)
+{
+ int i, err;
+
+ err = setup_cgroup_environment();
+ if (!ASSERT_OK(err, "setup_cgroup_environment"))
+ goto cleanup;
+
+ for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
+ cgroups[i].fd = create_and_get_cgroup(cgroups[i].path);
+ if (!ASSERT_GE(cgroups[i].fd, 0, "create_and_get_cgroup"))
+ goto cleanup;
+
+ cgroups[i].id = get_cgroup_id(cgroups[i].path);
+ if (!ASSERT_GT(cgroups[i].id, 0, "get_cgroup_id"))
+ goto cleanup;
+
+ /* Freeze the top-level cgroup */
+ if (i == 0) {
+ err = write_cgroup_file(cgroups[i].path, "cgroup.freeze", "1");
+ if (!ASSERT_OK(err, "freeze cgroup"))
+ goto cleanup;
+ }
+
+ /* Enable the memory controller for the children of inner nodes */
+ if (!cgroups[i].target) {
+ err = write_cgroup_file(cgroups[i].path, "cgroup.subtree_control",
+ "+memory");
+ if (!ASSERT_OK(err, "enable memory controller"))
+ goto cleanup;
+ }
+
+ /* Set memory.max */
+ if (cgroups[i].max) {
+ char buf[256];
+
+ snprintf(buf, sizeof(buf), "%zu", cgroups[i].max);
+ err = write_cgroup_file(cgroups[i].path, "memory.max", buf);
+ if (!ASSERT_OK(err, "set memory.max"))
+ goto cleanup;
+
+ write_cgroup_file(cgroups[i].path, "memory.swap.max", "0");
+ }
+
+ /* Spawn tasks creating memory pressure */
+ if (cgroups[i].target) {
+ char buf[256];
+
+ err = spawn_task(&cgroups[i]);
+ if (!ASSERT_OK(err, "spawn task"))
+ goto cleanup;
+
+ snprintf(buf, sizeof(buf), "%d", cgroups[i].pid);
+ err = write_cgroup_file(cgroups[i].path, "cgroup.procs", buf);
+ if (!ASSERT_OK(err, "put child into a cgroup"))
+ goto cleanup;
+ }
+ }
+
+ return;
+
+cleanup:
+ cleanup_cgroup_environment();
+
+ /* TODO: return an error to the caller */
+}
+
+static int run_and_wait_for_oom(void)
+{
+ int ret = -1;
+ bool first = true;
+ char buf[4096] = {};
+ size_t size;
+
+ /* Unfreeze the top-level cgroup */
+ ret = write_cgroup_file(cgroups[0].path, "cgroup.freeze", "0");
+ if (!ASSERT_OK(ret, "unfreeze cgroup"))
+ return -1;
+
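+ /*
+ * Reap children as they exit; the first process reaped must be
+ * the victim selected by the BPF OOM policy.
+ */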
+ for (;;) {
+ int i, status;
+ pid_t pid = wait(&status);
+
+ if (pid == -1) {
+ if (errno == EINTR)
+ continue;
+ /* ECHILD */
+ break;
+ }
+
+ if (!first)
+ continue;
+
+ first = false;
+
+ /* Check which process was terminated first */
+ for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
+ if (!ASSERT_OK(cgroups[i].victim !=
+ (pid == cgroups[i].pid),
+ "correct process was killed")) {
+ ret = -1;
+ break;
+ }
+
+ if (!cgroups[i].victim)
+ continue;
+
+ /* Check the memcg oom counter */
+ size = read_cgroup_file(cgroups[i].path,
+ "memory.events",
+ buf, sizeof(buf));
+ if (!ASSERT_OK(size <= 0, "read memory.events")) {
+ ret = -1;
+ break;
+ }
+
+ if (!ASSERT_OK(strstr(buf, "oom_kill 1") == NULL,
+ "oom_kill count check")) {
+ ret = -1;
+ break;
+ }
+ }
+
+ /* Kill all remaining tasks */
+ for (i = 0; i < ARRAY_SIZE(cgroups); i++)
+ if (cgroups[i].pid && cgroups[i].pid != pid)
+ kill(cgroups[i].pid, SIGKILL);
+ }
+
+ return ret;
+}
+
+void test_oom(void)
+{
+ DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts);
+ struct bpf_link *link1 = NULL, *link2 = NULL;
+ struct test_oom *skel;
+ int err = 0;
+
+ setup_environment();
+
+ skel = test_oom__open_and_load();
+ if (!skel) {
+ err = -errno;
+ CHECK_FAIL(err);
+ goto cleanup;
+ }
+
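+ /* Attach the policy to the test cgroup first, then system-wide */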
+ opts.flags = BPF_F_CGROUP_FD;
+ opts.target_fd = cgroups[0].fd;
+ link1 = bpf_map__attach_struct_ops_opts(skel->maps.test_bpf_oom, &opts);
+ if (!link1) {
+ err = -errno;
+ CHECK_FAIL(err);
+ goto cleanup;
+ }
+
+ opts.target_fd = get_root_cgroup();
+ link2 = bpf_map__attach_struct_ops_opts(skel->maps.test_bpf_oom, &opts);
+ if (!link2) {
+ err = -errno;
+ CHECK_FAIL(err);
+ goto cleanup;
+ }
+
+ /* Unfreeze the child tasks and let them create memory pressure */
+ err = run_and_wait_for_oom();
+ CHECK_FAIL(err);
+
+cleanup:
+ bpf_link__destroy(link1);
+ bpf_link__destroy(link2);
+ write_cgroup_file(cgroups[0].path, "cgroup.kill", "1");
+ write_cgroup_file(cgroups[0].path, "cgroup.freeze", "0");
+ cleanup_cgroup_environment();
+ test_oom__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_oom.c b/tools/testing/selftests/bpf/progs/test_oom.c
new file mode 100644
index 000000000000..7ff354e416bc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_oom.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define OOM_SCORE_ADJ_MIN (-1000)
+
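+/* A memcg is killable if none of its tasks has oom_score_adj == -1000 */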
+static bool mem_cgroup_killable(struct mem_cgroup *memcg)
+{
+ struct task_struct *task;
+
+ bpf_for_each(css_task, task, &memcg->css, CSS_TASK_ITER_PROCS)
+ if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ return false;
+
+ return true;
+}
+
+/*
+ * Find the largest leaf cgroup (by memory usage, ignoring page cache)
+ * without unkillable tasks and kill all tasks belonging to it.
+ */
+SEC("struct_ops.s/handle_out_of_memory")
+int BPF_PROG(test_out_of_memory, struct oom_control *oc, struct bpf_struct_ops_link *link)
+{
+ struct task_struct *task;
+ struct mem_cgroup *root_memcg = oc->memcg;
+ struct mem_cgroup *memcg, *victim = NULL;
+ struct cgroup_subsys_state *css_pos, *css;
+ unsigned long usage, max_usage = 0;
+ unsigned long pagecache = 0;
+ int ret = 0;
+
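+ /* oc->memcg is set for memcg OOMs; it's NULL for global OOMs */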
+ if (root_memcg)
+ root_memcg = bpf_get_mem_cgroup(&root_memcg->css);
+ else
+ root_memcg = bpf_get_root_mem_cgroup();
+
+ if (!root_memcg)
+ return 0;
+
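+ /*
+ * Do nothing when attached to the cgroup where the OOM occurred:
+ * returning 0 lets the system-wide instance handle the OOM.
+ */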
+ css = &root_memcg->css;
+ if (css && css->cgroup == link->cgroup)
+ goto exit;
+
+ bpf_rcu_read_lock();
+ bpf_for_each(css, css_pos, &root_memcg->css, BPF_CGROUP_ITER_DESCENDANTS_POST) {
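+ /* Consider only leaf cgroups: no live or dying descendants */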
+ if (css_pos->cgroup->nr_descendants + css_pos->cgroup->nr_dying_descendants)
+ continue;
+
+ memcg = bpf_get_mem_cgroup(css_pos);
+ if (!memcg)
+ continue;
+
+ usage = bpf_mem_cgroup_usage(memcg);
+ pagecache = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES);
+
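+ /* Exclude page cache pages from the usage comparison */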
+ if (usage > pagecache)
+ usage -= pagecache;
+ else
+ usage = 0;
+
+ if ((usage > max_usage) && mem_cgroup_killable(memcg)) {
+ max_usage = usage;
+ if (victim)
+ bpf_put_mem_cgroup(victim);
+ victim = bpf_get_mem_cgroup(&memcg->css);
+ }
+
+ bpf_put_mem_cgroup(memcg);
+ }
+ bpf_rcu_read_unlock();
+
+ if (!victim)
+ goto exit;
+
+ bpf_for_each(css_task, task, &victim->css, CSS_TASK_ITER_PROCS) {
+ struct task_struct *t = bpf_task_acquire(task);
+
+ if (t) {
+ /*
+ * If the task is already an OOM victim, it will
+ * quit soon and release some memory.
+ */
+ if (bpf_task_is_oom_victim(task)) {
+ bpf_task_release(t);
+ ret = 1;
+ break;
+ }
+
+ bpf_oom_kill_process(oc, task, "bpf oom test");
+ bpf_task_release(t);
+ ret = 1;
+ }
+ }
+
+ bpf_put_mem_cgroup(victim);
+exit:
+ bpf_put_mem_cgroup(root_memcg);
+
+ return ret;
+}
+
+SEC(".struct_ops.link")
+struct bpf_oom_ops test_bpf_oom = {
+ .name = "bpf_test_policy",
+ .handle_out_of_memory = (void *)test_out_of_memory,
+};
--
2.52.0