Re: [RFC PATCH v4 4/4] selftest/bpf: add selftest for BPF based THP order seletection

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Zi Yan <ziy@nvidia.com>
To: Yafang Shao <laoar.shao@gmail.com>
Cc: akpm@linux-foundation.org, david@redhat.com,
	baolin.wang@linux.alibaba.com, lorenzo.stoakes@oracle.com,
	Liam.Howlett@oracle.com, npache@redhat.com, ryan.roberts@arm.com,
	dev.jain@arm.com, hannes@cmpxchg.org, usamaarif642@gmail.com,
	gutierrez.asier@huawei-partners.com, willy@infradead.org,
	ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org,
	ameryhung@gmail.com, bpf@vger.kernel.org, linux-mm@kvack.org
Subject: Re: [RFC PATCH v4 4/4] selftest/bpf: add selftest for BPF based THP order seletection
Date: Tue, 29 Jul 2025 11:36:23 -0400	[thread overview]
Message-ID: <BADFCED9-4C30-4ED6-88F3-D8CB7054CC56@nvidia.com> (raw)
In-Reply-To: <20250729091807.84310-5-laoar.shao@gmail.com>

On 29 Jul 2025, at 5:18, Yafang Shao wrote:

> This self-test verifies that PMD-mapped THP allocation is restricted in
> page faults for tasks within a specific cgroup, while still permitting
> THP allocation via khugepaged.
>
> Since THP allocation depends on various factors (e.g., system memory
> pressure), using the actual allocated THP size for validation is
> unreliable. Instead, we check the return value of get_suggested_order(),
> which indicates whether the system intends to allocate a THP, regardless of
> whether the allocation ultimately succeeds.
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
>  tools/testing/selftests/bpf/config            |   2 +
>  .../selftests/bpf/prog_tests/thp_adjust.c     | 183 ++++++++++++++++++
>  .../selftests/bpf/progs/test_thp_adjust.c     |  69 +++++++
>  .../bpf/progs/test_thp_adjust_failure.c       |  24 +++
>  4 files changed, 278 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/thp_adjust.c
>  create mode 100644 tools/testing/selftests/bpf/progs/test_thp_adjust.c
>  create mode 100644 tools/testing/selftests/bpf/progs/test_thp_adjust_failure.c
>

The program below will only work on architectures with a 4KB base page
size and a PMD order of 9. It would be better to read the base page size
and the PMD page size from the system.



> diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
> index f74e1ea0ad3b..0364f945347d 100644
> --- a/tools/testing/selftests/bpf/config
> +++ b/tools/testing/selftests/bpf/config
> @@ -118,3 +118,5 @@ CONFIG_XDP_SOCKETS=y
>  CONFIG_XFRM_INTERFACE=y
>  CONFIG_TCP_CONG_DCTCP=y
>  CONFIG_TCP_CONG_BBR=y
> +CONFIG_TRANSPARENT_HUGEPAGE=y
> +CONFIG_MEMCG=y
> diff --git a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
> new file mode 100644
> index 000000000000..31d03383cbb8
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c
> @@ -0,0 +1,183 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <sys/mman.h>
> +#include <test_progs.h>
> +#include "cgroup_helpers.h"
> +#include "test_thp_adjust.skel.h"
> +#include "test_thp_adjust_failure.skel.h"
> +
> +#define LEN (16 * 1024 * 1024) /* 16MB */
> +#define THP_ENABLED_PATH "/sys/kernel/mm/transparent_hugepage/enabled"
> +
> +static char *thp_addr;
> +static char old_mode[32];
> +
> +int thp_mode_save(void)
> +{
> +	const char *start, *end;
> +	char buf[128];
> +	int fd, err;
> +	size_t len;
> +
> +	fd = open(THP_ENABLED_PATH, O_RDONLY);
> +	if (fd == -1)
> +		return -1;
> +
> +	err = read(fd, buf, sizeof(buf) - 1);
> +	if (err == -1)
> +		goto close;
> +
> +	start = strchr(buf, '[');
> +	end = start ? strchr(start, ']') : NULL;
> +	if (!start || !end || end <= start) {
> +		err = -1;
> +		goto close;
> +	}
> +
> +	len = end - start - 1;
> +	if (len >= sizeof(old_mode))
> +		len = sizeof(old_mode) - 1;
> +	strncpy(old_mode, start + 1, len);
> +	old_mode[len] = '\0';
> +
> +close:
> +	close(fd);
> +	return err;
> +}
> +
> +int thp_set(const char *desired_mode)
> +{
> +	int fd, err;
> +
> +	fd = open(THP_ENABLED_PATH, O_RDWR);
> +	if (fd == -1)
> +		return -1;
> +
> +	err = write(fd, desired_mode, strlen(desired_mode));
> +	close(fd);
> +	return err;
> +}
> +
> +int thp_reset(void)
> +{
> +	int fd, err;
> +
> +	fd = open(THP_ENABLED_PATH, O_WRONLY);
> +	if (fd == -1)
> +		return -1;
> +
> +	err = write(fd, old_mode, strlen(old_mode));
> +	close(fd);
> +	return err;
> +}
> +
> +int thp_alloc(void)
> +{
> +	int err, i;
> +
> +	thp_addr = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
> +	if (thp_addr == MAP_FAILED)
> +		return -1;
> +
> +	err = madvise(thp_addr, LEN, MADV_HUGEPAGE);
> +	if (err == -1)
> +		goto unmap;
> +
> +	for (i = 0; i < LEN; i += 4096)
> +		thp_addr[i] = 1;
> +	return 0;
> +
> +unmap:
> +	munmap(thp_addr, LEN);
> +	return -1;
> +}
> +
> +void thp_free(void)
> +{
> +	if (!thp_addr)
> +		return;
> +	munmap(thp_addr, LEN);
> +}
> +
> +void subtest_thp_adjust(void)
> +{
> +	struct bpf_link *fentry_link, *ops_link;
> +	struct test_thp_adjust *skel;
> +	int err, cgrp_fd, cgrp_id;
> +
> +	err = setup_cgroup_environment();
> +	if (!ASSERT_OK(err, "cgrp_env_setup"))
> +		return;
> +
> +	cgrp_fd = create_and_get_cgroup("thp_adjust");
> +	if (!ASSERT_GE(cgrp_fd, 0, "create_and_get_cgroup"))
> +		goto cleanup;
> +
> +	err = join_cgroup("thp_adjust");
> +	if (!ASSERT_OK(err, "join_cgroup"))
> +		goto close_fd;
> +
> +	cgrp_id = get_cgroup_id("thp_adjust");
> +	if (!ASSERT_GE(cgrp_id, 0, "create_and_get_cgroup"))
> +		goto join_root;
> +
> +	if (!ASSERT_NEQ(thp_mode_save(), -1, "THP mode save"))
> +		goto join_root;
> +	if (!ASSERT_GE(thp_set("madvise"), 0, "THP mode set"))
> +		goto join_root;
> +
> +	skel = test_thp_adjust__open();
> +	if (!ASSERT_OK_PTR(skel, "open"))
> +		goto thp_reset;
> +
> +	skel->bss->cgrp_id = cgrp_id;
> +	skel->bss->target_pid = getpid();
> +
> +	err = test_thp_adjust__load(skel);
> +	if (!ASSERT_OK(err, "load"))
> +		goto destroy;
> +
> +	fentry_link = bpf_program__attach_trace(skel->progs.thp_run);
> +	if (!ASSERT_OK_PTR(fentry_link, "attach fentry"))
> +		goto destroy;
> +
> +	ops_link = bpf_map__attach_struct_ops(skel->maps.thp);
> +	if (!ASSERT_OK_PTR(ops_link, "attach struct_ops"))
> +		goto destroy;
> +
> +	if (!ASSERT_NEQ(thp_alloc(), -1, "THP alloc"))
> +		goto destroy;
> +
> +	/* After attaching struct_ops, THP will be allocated only in khugepaged . */
> +	if (!ASSERT_EQ(skel->bss->pf_alloc, 0, "alloc_in_pf"))
> +		goto thp_free;
> +	if (!ASSERT_GT(skel->bss->pf_disallow, 0, "alloc_in_pf"))
> +		goto thp_free;
> +
> +	if (!ASSERT_GT(skel->bss->khugepaged_alloc, 0, "alloc_in_khugepaged"))
> +		goto thp_free;
> +	ASSERT_EQ(skel->bss->khugepaged_disallow, 0, "alloc_in_pf");
> +
> +thp_free:
> +	thp_free();
> +destroy:
> +	test_thp_adjust__destroy(skel);
> +thp_reset:
> +	ASSERT_GE(thp_reset(), 0, "THP mode reset");
> +join_root:
> +	/* We must join the root cgroup before removing the created cgroup. */
> +	err = join_root_cgroup();
> +	ASSERT_OK(err, "join_cgroup to root");
> +close_fd:
> +	close(cgrp_fd);
> +	remove_cgroup("thp_adjust");
> +cleanup:
> +	cleanup_cgroup_environment();
> +}
> +
> +void test_thp_adjust(void)
> +{
> +	if (test__start_subtest("thp_adjust"))
> +		subtest_thp_adjust();
> +	RUN_TESTS(test_thp_adjust_failure);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust.c b/tools/testing/selftests/bpf/progs/test_thp_adjust.c
> new file mode 100644
> index 000000000000..bb4aad50c7a8
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_thp_adjust.c
> @@ -0,0 +1,69 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include "vmlinux.h"
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +char _license[] SEC("license") = "GPL";
> +
> +#define TVA_IN_PF (1 << 1)
> +
> +int pf_alloc, pf_disallow, khugepaged_alloc, khugepaged_disallow;
> +int cgrp_id, target_pid;
> +
> +/* Detecting whether a task can successfully allocate THP is unreliable because
> + * it may be influenced by system memory pressure. Instead of making the result
> + * dependent on unpredictable factors, we should simply check
> + * get_suggested_order()'s return value, which is deterministic.
> + */
> +SEC("fexit/get_suggested_order")
> +int BPF_PROG(thp_run, struct mm_struct *mm, unsigned long tva_flags, int order, int retval)
> +{
> +	struct task_struct *current = bpf_get_current_task_btf();
> +
> +	if (current->pid != target_pid || order != 9)
> +		return 0;
> +
> +	if (tva_flags & TVA_IN_PF) {
> +		if (retval == 9)
> +			pf_alloc++;
> +		else if (!retval)
> +			pf_disallow++;
> +	} else {
> +		if (retval == 9)
> +			khugepaged_alloc++;
> +		else if (!retval)
> +			khugepaged_disallow++;
> +	}
> +	return 0;
> +}
> +
> +SEC("struct_ops/get_suggested_order")
> +int BPF_PROG(bpf_suggested_order, struct mm_struct *mm, unsigned long tva_flags, int order)
> +{
> +	struct mem_cgroup *memcg = bpf_mm_get_mem_cgroup(mm);
> +	int suggested_order = order;
> +
> +	/* Only works when CONFIG_MEMCG is enabled. */
> +	if (!memcg)
> +		return suggested_order;
> +
> +	if (memcg->css.cgroup->kn->id == cgrp_id) {
> +		/* BPF THP allocation policy:
> +		 * - Disallow PMD allocation in page fault context
> +		 */
> +		if (tva_flags & TVA_IN_PF && order == 9) {
> +			suggested_order = 0;
> +			goto out;
> +		}
> +	}
> +
> +out:
> +	bpf_put_mem_cgroup(memcg);
> +	return suggested_order;
> +}
> +
> +SEC(".struct_ops.link")
> +struct bpf_thp_ops thp = {
> +	.get_suggested_order = (void *)bpf_suggested_order,
> +};
> diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust_failure.c b/tools/testing/selftests/bpf/progs/test_thp_adjust_failure.c
> new file mode 100644
> index 000000000000..b080aead9b87
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_thp_adjust_failure.c
> @@ -0,0 +1,24 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include "vmlinux.h"
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +#include "bpf_misc.h"
> +
> +char _license[] SEC("license") = "GPL";
> +
> +SEC("struct_ops/get_suggested_order")
> +__failure __msg("Unreleased reference")
> +int BPF_PROG(unreleased_task, struct mm_struct *mm, bool vma_madvised)
> +{
> +	struct task_struct *p = bpf_mm_get_task(mm);
> +
> +	/* The task should be released with bpf_task_release() */
> +	return p ? 9 : 0;
> +}
> +
> +SEC(".struct_ops.link")
> +struct bpf_thp_ops thp = {
> +	.get_suggested_order = (void *)unreleased_task,
> +};
> -- 
> 2.43.5


Best Regards,
Yan, Zi

next prev parent reply	other threads:[~2025-07-29 15:36 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-07-29  9:18 [RFC PATCH v4 0/4] mm, bpf: BPF based THP order selection Yafang Shao
2025-07-29  9:18 ` [RFC PATCH v4 1/4] mm: thp: add support for " Yafang Shao
2025-07-29 15:32   ` Zi Yan
2025-07-30  2:36     ` Yafang Shao
2025-07-29  9:18 ` [RFC PATCH v4 2/4] mm: thp: add a new kfunc bpf_mm_get_mem_cgroup() Yafang Shao
2025-07-29  9:18 ` [RFC PATCH v4 3/4] mm: thp: add a new kfunc bpf_mm_get_task() Yafang Shao
2025-07-29  9:18 ` [RFC PATCH v4 4/4] selftest/bpf: add selftest for BPF based THP order seletection Yafang Shao
2025-07-29 15:36   ` Zi Yan [this message]
2025-07-30  2:38     ` Yafang Shao
2025-07-29 15:07 ` [RFC PATCH v4 0/4] mm, bpf: BPF based THP order selection Zi Yan
2025-07-30  2:31   ` Yafang Shao
2025-07-30  9:58     ` David Hildenbrand
2025-07-31  2:07       ` Yafang Shao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=BADFCED9-4C30-4ED6-88F3-D8CB7054CC56@nvidia.com \
    --to=ziy@nvidia.com \
    --cc=Liam.Howlett@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=ameryhung@gmail.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=david@redhat.com \
    --cc=dev.jain@arm.com \
    --cc=gutierrez.asier@huawei-partners.com \
    --cc=hannes@cmpxchg.org \
    --cc=laoar.shao@gmail.com \
    --cc=linux-mm@kvack.org \
    --cc=lorenzo.stoakes@oracle.com \
    --cc=npache@redhat.com \
    --cc=ryan.roberts@arm.com \
    --cc=usamaarif642@gmail.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox