* [PATCH bpf-next v5 1/6] vmalloc: introduce execmem_alloc, execmem_free, and execmem_fill
2022-11-28 19:02 [PATCH bpf-next v5 0/6] execmem_alloc for BPF programs Song Liu
@ 2022-11-28 19:02 ` Song Liu
2022-11-28 19:02 ` [PATCH bpf-next v5 2/6] x86/alternative: support execmem_alloc() and execmem_free() Song Liu
` (4 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Song Liu @ 2022-11-28 19:02 UTC (permalink / raw)
To: bpf, linux-mm
Cc: akpm, x86, peterz, hch, rick.p.edgecombe, rppt, mcgrof, Song Liu
execmem_alloc is used to allocate memory to host dynamic kernel text
(modules, BPF programs, etc.) with huge pages. This is similar to the
proposal by Peter in [1].
A new tree of vmap_areas, free_text_area_*, is introduced in addition to
free_vmap_area_* and vmap_area_*. execmem_alloc allocates memory from
free_text_area_*. When there isn't enough space left in free_text_area_*,
new PMD_SIZE page(s) are allocated from free_vmap_area_* and added to
free_text_area_*. To be more precise, the vmap_area is first added to the
vmap_area_* tree and then moved to free_text_area_*. This extra move
simplifies the logic of execmem_alloc.
vmap_areas in the free_text_area_* tree are backed with memory, but the
tree operations need subtree_max_size, which shares storage with the vm
pointer. Therefore, the vm_struct for these vmap_areas is stored in a
separate list, all_text_vm.
The new tree allows separate handling of allocations smaller than
PAGE_SIZE, as the current vmalloc code mostly assumes PAGE_SIZE aligned
allocations. This version of execmem_alloc can handle BPF programs, which
use 64-byte aligned allocations, and modules, which use PAGE_SIZE aligned
allocations.
Memory allocated by execmem_alloc() is set to RO+X before being returned
to the caller. Therefore, the caller cannot write directly to the memory.
Instead, the caller is required to use execmem_fill() to update the memory.
For the safety and security of X memory, execmem_fill() checks that the
data being updated stays within the memory allocated by a single
execmem_alloc() call. execmem_fill() uses a text_poke-like mechanism and
requires arch support. Specifically, the arch needs to implement
arch_fill_execmem().
In execmem_free(), the memory is first erased with arch_invalidate_execmem().
Then, the memory is added to free_text_area_*. If this free creates a big
enough contiguous free space (> PMD_SIZE), execmem_free() will try to free
the backing vm_struct.
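For reference, a typical caller of the new API would look roughly like the
sketch below. This is illustrative only; insns and len are placeholder
names and error handling is trimmed:

    /* RO+X memory, 64-byte aligned (e.g. for a BPF program) */
    void *text = execmem_alloc(len, 64);

    if (!text)
            return -ENOMEM;

    /* the memory is RO+X; it can only be written via execmem_fill() */
    if (IS_ERR(execmem_fill(text, insns, len))) {
            execmem_free(text);
            return -EINVAL;
    }

    /* ... use the text, then release it when done ... */
    execmem_free(text);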
Hopefully, this will be the first step towards a unified memory allocator
for memory with special permissions.
[1] https://lore.kernel.org/bpf/Ys6cWUMHO8XwyYgr@hirez.programming.kicks-ass.net/
Signed-off-by: Song Liu <song@kernel.org>
---
include/linux/vmalloc.h | 5 +
mm/nommu.c | 12 ++
mm/vmalloc.c | 321 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 338 insertions(+)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 096d48aa3437..30aa8c187d40 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -154,6 +154,11 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
int node, const void *caller) __alloc_size(1);
void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+void *execmem_alloc(unsigned long size, unsigned long align) __alloc_size(1);
+void *execmem_fill(void *dst, void *src, size_t len);
+void execmem_free(void *addr);
+void *arch_fill_execmem(void *dst, void *src, size_t len);
+int arch_invalidate_execmem(void *ptr, size_t len);
extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/mm/nommu.c b/mm/nommu.c
index 214c70e1d059..e3039fd4f65b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -371,6 +371,18 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
}
EXPORT_SYMBOL(vm_map_pages_zero);
+void *execmem_alloc(unsigned long size, unsigned long align)
+{
+ return NULL;
+}
+
+void *execmem_fill(void *dst, void *src, size_t len)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+void execmem_free(void *addr) { }
+
/*
* sys_brk() for the most part doesn't need the global kernel
* lock, except when an application is doing something nasty
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ccaa461998f3..1f7f2134f9bd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -72,6 +72,11 @@ early_param("nohugevmalloc", set_nohugevmalloc);
static const bool vmap_allow_huge = false;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+#ifndef PMD_ALIGN
+#define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE)
+#endif
+#define PMD_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PMD_SIZE)
+
bool is_vmalloc_addr(const void *x)
{
unsigned long addr = (unsigned long)kasan_reset_tag(x);
@@ -769,6 +774,38 @@ static LIST_HEAD(free_vmap_area_list);
*/
static struct rb_root free_vmap_area_root = RB_ROOT;
+/*
+ * free_text_area for execmem_alloc()
+ */
+static DEFINE_SPINLOCK(free_text_area_lock);
+/*
+ * This linked list is used in pair with free_text_area_root.
+ * It gives O(1) access to prev/next to perform fast coalescing.
+ */
+static LIST_HEAD(free_text_area_list);
+
+/*
+ * This augmented red-black tree represents the free text space.
+ * All vmap_area objects in this tree are sorted by va->va_start
+ * address. It is used for allocation and merging when a vmap
+ * object is released.
+ *
+ * Each vmap_area node contains a maximum available free block
+ * of its sub-tree, right or left. Therefore it is possible to
+ * find a lowest match of free area.
+ *
+ * vmap_areas in this tree are backed by RO+X memory, but they do
+ * not have a valid vm pointer (the field is used for subtree_max_size).
+ * The vm_struct for these vmap_areas is stored in all_text_vm.
+ */
+static struct rb_root free_text_area_root = RB_ROOT;
+
+/*
+ * List of vm_struct for free_text_area_root. This list is rarely
+ * accessed, so the O(N) complexity is not likely a real issue.
+ */
+struct vm_struct *all_text_vm;
+
/*
* Preload a CPU with one object for "no edge" split case. The
* aim is to get rid of allocations from the atomic context, thus
@@ -3313,6 +3350,290 @@ void *vmalloc(unsigned long size)
}
EXPORT_SYMBOL(vmalloc);
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+#define EXEC_MEM_START MODULES_VADDR
+#define EXEC_MEM_END MODULES_END
+#else
+#define EXEC_MEM_START VMALLOC_START
+#define EXEC_MEM_END VMALLOC_END
+#endif
+
+static void move_vmap_to_free_text_tree(void *addr)
+{
+ struct vmap_area *va;
+
+ /* remove from vmap_area_root */
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area((unsigned long)addr, &vmap_area_root);
+ if (WARN_ON_ONCE(!va)) {
+ spin_unlock(&vmap_area_lock);
+ return;
+ }
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
+ /* make the memory RO+X */
+ memset(addr, 0, va->va_end - va->va_start);
+ set_memory_ro(va->va_start, (va->va_end - va->va_start) >> PAGE_SHIFT);
+ set_memory_x(va->va_start, (va->va_end - va->va_start) >> PAGE_SHIFT);
+
+ spin_lock(&free_text_area_lock);
+ /* add to all_text_vm */
+ va->vm->next = all_text_vm;
+ all_text_vm = va->vm;
+
+ /* add to free_text_area_root */
+ merge_or_add_vmap_area_augment(va, &free_text_area_root, &free_text_area_list);
+ spin_unlock(&free_text_area_lock);
+}
+
+/**
+ * execmem_alloc - allocate virtually contiguous RO+X memory
+ * @size: allocation size
+ * @align: desired alignment
+ *
+ * This is used to allocate dynamic kernel text, such as module text, BPF
+ * programs, etc. Users need to use execmem_fill() to update the memory
+ * allocated by execmem_alloc().
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *execmem_alloc(unsigned long size, unsigned long align)
+{
+ struct vmap_area *va, *tmp;
+ unsigned long addr;
+ enum fit_type type;
+ int ret;
+
+ va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
+ if (unlikely(!va))
+ return NULL;
+
+again:
+ preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
+ tmp = find_vmap_lowest_match(&free_text_area_root, size, align, 1, false);
+
+ if (!tmp) {
+ unsigned long alloc_size;
+ void *ptr;
+
+ spin_unlock(&free_text_area_lock);
+
+ /*
+ * Not enough contiguous space in free_text_area_root, try
+ * to allocate more memory. The memory is first added to
+ * vmap_area_root, and then moved to free_text_area_root.
+ */
+ alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
+ ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, EXEC_MEM_START,
+ EXEC_MEM_END, GFP_KERNEL, PAGE_KERNEL,
+ VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
+ NUMA_NO_NODE, __builtin_return_address(0));
+ if (unlikely(!ptr))
+ goto err_out;
+
+ move_vmap_to_free_text_tree(ptr);
+ goto again;
+ }
+
+ addr = roundup(tmp->va_start, align);
+ type = classify_va_fit_type(tmp, addr, size);
+
+ ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
+ tmp, addr, size);
+ spin_unlock(&free_text_area_lock);
+ if (ret)
+ goto err_out;
+
+ va->va_start = addr;
+ va->va_end = addr + size;
+ va->vm = NULL;
+
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+
+ return (void *)addr;
+
+err_out:
+ kmem_cache_free(vmap_area_cachep, va);
+ return NULL;
+}
+
+void __weak *arch_fill_execmem(void *dst, void *src, size_t len)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+int __weak arch_invalidate_execmem(void *ptr, size_t len)
+{
+ return -EOPNOTSUPP;
+}
+
+/**
+ * execmem_fill - Copy text to RO+X memory allocated by execmem_alloc()
+ * @dst: pointer to memory allocated by execmem_alloc()
+ * @src: pointer to data being copied from
+ * @len: number of bytes to be copied
+ *
+ * execmem_fill() will only update memory allocated by a single execmem_alloc()
+ * call. If @dst + @len goes beyond the boundary of one allocation,
+ * execmem_fill() is aborted.
+ *
+ * Returns an ERR_PTR() if @dst is not in memory returned by execmem_alloc().
+ */
+void *execmem_fill(void *dst, void *src, size_t len)
+{
+ struct vmap_area *va;
+
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area((unsigned long)dst, &vmap_area_root);
+
+ /*
+ * If no va, or va has a vm attached, this memory is not allocated
+ * by execmem_alloc().
+ */
+ if (WARN_ON_ONCE(!va))
+ goto err_out;
+ if (WARN_ON_ONCE(va->vm))
+ goto err_out;
+
+ /* Disallow filling across execmem_alloc boundaries */
+ if (WARN_ON_ONCE((unsigned long)dst + len > va->va_end))
+ goto err_out;
+
+ spin_unlock(&vmap_area_lock);
+
+ return arch_fill_execmem(dst, src, len);
+
+err_out:
+ spin_unlock(&vmap_area_lock);
+ return ERR_PTR(-EINVAL);
+}
+
+static struct vm_struct *find_and_unlink_text_vm(unsigned long start, unsigned long end)
+{
+ struct vm_struct *vm, *prev_vm;
+
+ lockdep_assert_held(&free_text_area_lock);
+
+ vm = all_text_vm;
+ while (vm) {
+ unsigned long vm_addr = (unsigned long)vm->addr;
+
+ /* vm is within this free space, we can free it */
+ if ((vm_addr >= start) && ((vm_addr + vm->size) <= end))
+ goto unlink_vm;
+ vm = vm->next;
+ }
+ return NULL;
+
+unlink_vm:
+ if (all_text_vm == vm) {
+ all_text_vm = vm->next;
+ } else {
+ prev_vm = all_text_vm;
+ while (prev_vm->next != vm)
+ prev_vm = prev_vm->next;
+ prev_vm->next = vm->next;
+ }
+ return vm;
+}
+
+/**
+ * execmem_free - Release memory allocated by execmem_alloc()
+ * @addr: Memory base address
+ *
+ * If @addr is NULL, no operation is performed.
+ */
+void execmem_free(void *addr)
+{
+ unsigned long free_start, free_end, free_addr;
+ struct vm_struct *vm;
+ struct vmap_area *va;
+
+ might_sleep();
+
+ if (!addr)
+ return;
+
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area((unsigned long)addr, &vmap_area_root);
+ if (WARN_ON_ONCE(!va)) {
+ spin_unlock(&vmap_area_lock);
+ return;
+ }
+ WARN_ON_ONCE(va->vm);
+
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
+ /* Invalidate text in the region */
+ arch_invalidate_execmem(addr, va->va_end - va->va_start);
+
+ spin_lock(&free_text_area_lock);
+ va = merge_or_add_vmap_area_augment(va,
+ &free_text_area_root, &free_text_area_list);
+
+ if (WARN_ON_ONCE(!va))
+ goto out;
+
+ free_start = PMD_ALIGN(va->va_start);
+ free_end = PMD_ALIGN_DOWN(va->va_end);
+
+ /*
+ * Only try to free vm when there is at least one PMD_SIZE free
+ * continuous memory.
+ */
+ if (free_start >= free_end)
+ goto out;
+
+ /*
+ * TODO: It is possible that multiple vm are ready to be freed
+ * after one execmem_free(). But we free at most one vm for now.
+ */
+ vm = find_and_unlink_text_vm(free_start, free_end);
+ if (!vm)
+ goto out;
+
+ va = kmem_cache_alloc_node(vmap_area_cachep, GFP_ATOMIC, NUMA_NO_NODE);
+ if (unlikely(!va))
+ goto out_save_vm;
+
+ free_addr = __alloc_vmap_area(&free_text_area_root, &free_text_area_list,
+ vm->size, 1, (unsigned long)vm->addr,
+ (unsigned long)vm->addr + vm->size);
+
+ if (WARN_ON_ONCE(free_addr != (unsigned long)vm->addr))
+ goto out_save_vm;
+
+ va->va_start = (unsigned long)vm->addr;
+ va->va_end = va->va_start + vm->size;
+ va->vm = vm;
+ spin_unlock(&free_text_area_lock);
+
+ set_memory_nx(va->va_start, vm->size >> PAGE_SHIFT);
+ set_memory_rw(va->va_start, vm->size >> PAGE_SHIFT);
+
+ /* put the va to vmap_area_root, and then free it with vfree */
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+
+ vfree(vm->addr);
+ return;
+
+out_save_vm:
+ /*
+ * vm is removed from all_text_vm, but not freed. Add it back,
+ * so that we can use or free it later.
+ */
+ vm->next = all_text_vm;
+ all_text_vm = vm;
+out:
+ spin_unlock(&free_text_area_lock);
+}
+
/**
* vmalloc_huge - allocate virtually contiguous memory, allow huge pages
* @size: allocation size
--
2.30.2
* [PATCH bpf-next v5 2/6] x86/alternative: support execmem_alloc() and execmem_free()
2022-11-28 19:02 [PATCH bpf-next v5 0/6] execmem_alloc for BPF programs Song Liu
2022-11-28 19:02 ` [PATCH bpf-next v5 1/6] vmalloc: introduce execmem_alloc, execmem_free, and execmem_fill Song Liu
@ 2022-11-28 19:02 ` Song Liu
2022-11-28 19:02 ` [PATCH bpf-next v5 3/6] selftests/vm: extend test_vmalloc to test execmem_* APIs Song Liu
` (3 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Song Liu @ 2022-11-28 19:02 UTC (permalink / raw)
To: bpf, linux-mm
Cc: akpm, x86, peterz, hch, rick.p.edgecombe, rppt, mcgrof, Song Liu
Implement arch_fill_execmem() and arch_invalidate_execmem() on x86 to
support execmem_alloc().
arch_fill_execmem() copies dynamic kernel text (such as BPF programs) into
the RO+X memory region allocated by execmem_alloc().
arch_invalidate_execmem() fills the memory with 0xcc (int3) after it is
released by execmem_free().
Signed-off-by: Song Liu <song@kernel.org>
---
arch/x86/kernel/alternative.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 5cadcea035e0..e38829f19a81 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1270,6 +1270,18 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len)
return addr;
}
+void *arch_fill_execmem(void *dst, void *src, size_t len)
+{
+ if (text_poke_copy(dst, src, len) == NULL)
+ return ERR_PTR(-EINVAL);
+ return dst;
+}
+
+int arch_invalidate_execmem(void *ptr, size_t len)
+{
+ return IS_ERR_OR_NULL(text_poke_set(ptr, 0xcc, len));
+}
+
/**
* text_poke_set - memset into (an unused part of) RX memory
* @addr: address to modify
--
2.30.2
* [PATCH bpf-next v5 3/6] selftests/vm: extend test_vmalloc to test execmem_* APIs
2022-11-28 19:02 [PATCH bpf-next v5 0/6] execmem_alloc for BPF programs Song Liu
2022-11-28 19:02 ` [PATCH bpf-next v5 1/6] vmalloc: introduce execmem_alloc, execmem_free, and execmem_fill Song Liu
2022-11-28 19:02 ` [PATCH bpf-next v5 2/6] x86/alternative: support execmem_alloc() and execmem_free() Song Liu
@ 2022-11-28 19:02 ` Song Liu
2022-11-29 8:35 ` Christoph Hellwig
2022-11-28 19:02 ` [PATCH bpf-next v5 4/6] bpf: use execmem_alloc for bpf program and bpf dispatcher Song Liu
` (2 subsequent siblings)
5 siblings, 1 reply; 10+ messages in thread
From: Song Liu @ 2022-11-28 19:02 UTC (permalink / raw)
To: bpf, linux-mm
Cc: akpm, x86, peterz, hch, rick.p.edgecombe, rppt, mcgrof, Song Liu
Add logic to test execmem_[alloc|fill|free] in test_vmalloc.c.
No need to change tools/testing/selftests/vm/test_vmalloc.sh.
Gate the export of execmem_* with DEBUG_TEST_VMALLOC_EXEMEM_ALLOC so that
these symbols are only exported while developers are running the tests.
Signed-off-by: Song Liu <song@kernel.org>
---
lib/test_vmalloc.c | 35 +++++++++++++++++++++++++++++++++++
mm/vmalloc.c | 12 ++++++++++++
2 files changed, 47 insertions(+)
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index cf7780572f5b..9c78f0693f59 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -50,6 +50,7 @@ __param(int, run_test_mask, INT_MAX,
"\t\tid: 128, name: pcpu_alloc_test\n"
"\t\tid: 256, name: kvfree_rcu_1_arg_vmalloc_test\n"
"\t\tid: 512, name: kvfree_rcu_2_arg_vmalloc_test\n"
+ "\t\tid: 1024, name: execmem_alloc_test\n"
/* Add a new test case description here. */
);
@@ -352,6 +353,39 @@ kvfree_rcu_2_arg_vmalloc_test(void)
return 0;
}
+/* This should match the define in vmalloc.c */
+#define DEBUG_TEST_VMALLOC_EXEMEM_ALLOC 0
+
+static int
+execmem_alloc_test(void)
+{
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+#if DEBUG_TEST_VMALLOC_EXEMEM_ALLOC
+ /* allocate variable size, up to 64kB */
+ size_t size = (i % 1024 + 1) * 64;
+ void *p, *tmp;
+
+ p = execmem_alloc(size, 64);
+ if (!p)
+ return -1;
+
+ tmp = execmem_fill(p, "a", 1);
+ if (tmp != p)
+ return -1;
+
+ tmp = execmem_fill(p + size - 1, "b", 1);
+ if (tmp != p + size - 1)
+ return -1;
+
+ execmem_free(p);
+#endif
+ }
+
+ return 0;
+}
+
struct test_case_desc {
const char *test_name;
int (*test_func)(void);
@@ -368,6 +402,7 @@ static struct test_case_desc test_case_array[] = {
{ "pcpu_alloc_test", pcpu_alloc_test },
{ "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test },
{ "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test },
+ { "execmem_alloc_test", execmem_alloc_test },
/* Add a new test case here. */
};
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1f7f2134f9bd..95ee8282b66c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3387,6 +3387,9 @@ static void move_vmap_to_free_text_tree(void *addr)
spin_unlock(&free_text_area_lock);
}
+/* This should match the define in test_vmalloc.c */
+#define DEBUG_TEST_VMALLOC_EXEMEM_ALLOC 0
+
/**
* execmem_alloc - allocate virtually contiguous RO+X memory
* @size: allocation size
@@ -3459,6 +3462,9 @@ void *execmem_alloc(unsigned long size, unsigned long align)
kmem_cache_free(vmap_area_cachep, va);
return NULL;
}
+#if DEBUG_TEST_VMALLOC_EXEMEM_ALLOC
+EXPORT_SYMBOL_GPL(execmem_alloc);
+#endif
void __weak *arch_fill_execmem(void *dst, void *src, size_t len)
{
@@ -3510,6 +3516,9 @@ void *execmem_fill(void *dst, void *src, size_t len)
spin_unlock(&vmap_area_lock);
return ERR_PTR(-EINVAL);
}
+#if DEBUG_TEST_VMALLOC_EXEMEM_ALLOC
+EXPORT_SYMBOL_GPL(execmem_fill);
+#endif
static struct vm_struct *find_and_unlink_text_vm(unsigned long start, unsigned long end)
{
@@ -3633,6 +3642,9 @@ void execmem_free(void *addr)
out:
spin_unlock(&free_text_area_lock);
}
+#if DEBUG_TEST_VMALLOC_EXEMEM_ALLOC
+EXPORT_SYMBOL_GPL(execmem_free);
+#endif
/**
* vmalloc_huge - allocate virtually contiguous memory, allow huge pages
--
2.30.2
* Re: [PATCH bpf-next v5 3/6] selftests/vm: extend test_vmalloc to test execmem_* APIs
2022-11-28 19:02 ` [PATCH bpf-next v5 3/6] selftests/vm: extend test_vmalloc to test execmem_* APIs Song Liu
@ 2022-11-29 8:35 ` Christoph Hellwig
2022-11-29 17:31 ` Song Liu
0 siblings, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2022-11-29 8:35 UTC (permalink / raw)
To: Song Liu
Cc: bpf, linux-mm, akpm, x86, peterz, hch, rick.p.edgecombe, rppt, mcgrof
> +#if DEBUG_TEST_VMALLOC_EXEMEM_ALLOC
> +EXPORT_SYMBOL_GPL(execmem_alloc);
> +#endif
> +#if DEBUG_TEST_VMALLOC_EXEMEM_ALLOC
> +EXPORT_SYMBOL_GPL(execmem_fill);
> +#endif
> +#if DEBUG_TEST_VMALLOC_EXEMEM_ALLOC
> +EXPORT_SYMBOL_GPL(execmem_free);
> +#endif
Still NAK. These symbols never have any business being exported
ever. Just force the test to be built-in if you want to test this
functionality.
* Re: [PATCH bpf-next v5 3/6] selftests/vm: extend test_vmalloc to test execmem_* APIs
2022-11-29 8:35 ` Christoph Hellwig
@ 2022-11-29 17:31 ` Song Liu
2022-11-29 20:22 ` Christoph Hellwig
0 siblings, 1 reply; 10+ messages in thread
From: Song Liu @ 2022-11-29 17:31 UTC (permalink / raw)
To: Christoph Hellwig
Cc: bpf, linux-mm, akpm, x86, peterz, rick.p.edgecombe, rppt, mcgrof
On Tue, Nov 29, 2022 at 12:35 AM Christoph Hellwig <hch@lst.de> wrote:
>
> > +#if DEBUG_TEST_VMALLOC_EXEMEM_ALLOC
> > +EXPORT_SYMBOL_GPL(execmem_alloc);
> > +#endif
>
> > +#if DEBUG_TEST_VMALLOC_EXEMEM_ALLOC
> > +EXPORT_SYMBOL_GPL(execmem_fill);
> > +#endif
>
> > +#if DEBUG_TEST_VMALLOC_EXEMEM_ALLOC
> > +EXPORT_SYMBOL_GPL(execmem_free);
> > +#endif
>
> Still NAK. These symbols never have any business being exported
> ever. Just force the test to be built-in if you want to test this
> functionality.
OK. I knew this was really ugly. I just want to know your thoughts on it.
I guess we can just drop 3/6 of the set, and everything else would just work.
Thanks,
Song
* Re: [PATCH bpf-next v5 3/6] selftests/vm: extend test_vmalloc to test execmem_* APIs
2022-11-29 17:31 ` Song Liu
@ 2022-11-29 20:22 ` Christoph Hellwig
0 siblings, 0 replies; 10+ messages in thread
From: Christoph Hellwig @ 2022-11-29 20:22 UTC (permalink / raw)
To: Song Liu
Cc: Christoph Hellwig, bpf, linux-mm, akpm, x86, peterz,
rick.p.edgecombe, rppt, mcgrof
On Tue, Nov 29, 2022 at 09:31:24AM -0800, Song Liu wrote:
> I guess we can just drop 3/6 of the set, and everything else would just work.
Yes.
* [PATCH bpf-next v5 4/6] bpf: use execmem_alloc for bpf program and bpf dispatcher
2022-11-28 19:02 [PATCH bpf-next v5 0/6] execmem_alloc for BPF programs Song Liu
` (2 preceding siblings ...)
2022-11-28 19:02 ` [PATCH bpf-next v5 3/6] selftests/vm: extend test_vmalloc to test execmem_* APIs Song Liu
@ 2022-11-28 19:02 ` Song Liu
2022-11-28 19:02 ` [PATCH bpf-next v5 5/6] vmalloc: introduce register_text_tail_vm() Song Liu
2022-11-28 19:02 ` [PATCH bpf-next v5 6/6] x86: use register_text_tail_vm Song Liu
5 siblings, 0 replies; 10+ messages in thread
From: Song Liu @ 2022-11-28 19:02 UTC (permalink / raw)
To: bpf, linux-mm
Cc: akpm, x86, peterz, hch, rick.p.edgecombe, rppt, mcgrof, Song Liu
Use execmem_alloc, execmem_free, and execmem_fill instead of
bpf_prog_pack_alloc, bpf_prog_pack_free, and bpf_arch_text_copy.
execmem_free doesn't require extra size information. Therefore, the free
and error handling path can be simplified.
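As a rough sketch of that simplification (see the diff below for the real
change), the JIT error path no longer needs to poke the size into the RO
header before freeing it:

    /* before: the size had to be written into the RO header first */
    bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
    bpf_prog_pack_free(ro_header);

    /* after: execmem_free() finds the size from its own bookkeeping */
    execmem_free(ro_header);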
The following benchmark shows the benefit of execmem_alloc.
Run 100 instances of the following benchmark from bpf selftests:
tools/testing/selftests/bpf/bench -w2 -d100 -a trig-kprobe
which loads 7 BPF programs, and triggers one of them.
Then use perf to monitor TLB related counters:
perf stat -e iTLB-load-misses,itlb_misses.walk_completed_4k, \
itlb_misses.walk_completed_2m_4m -a
The following results are from a qemu VM with 32 cores.
Before bpf_prog_pack:
iTLB-load-misses: 350k/s
itlb_misses.walk_completed_4k: 90k/s
itlb_misses.walk_completed_2m_4m: 0.1/s
With bpf_prog_pack (current upstream):
iTLB-load-misses: 220k/s
itlb_misses.walk_completed_4k: 68k/s
itlb_misses.walk_completed_2m_4m: 0.2/s
With execmem_alloc (with this set):
iTLB-load-misses: 185k/s
itlb_misses.walk_completed_4k: 58k/s
itlb_misses.walk_completed_2m_4m: 1/s
Signed-off-by: Song Liu <song@kernel.org>
---
arch/x86/net/bpf_jit_comp.c | 23 +----
include/linux/bpf.h | 3 -
include/linux/filter.h | 5 -
kernel/bpf/core.c | 180 +++---------------------------------
kernel/bpf/dispatcher.c | 11 +--
5 files changed, 21 insertions(+), 201 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index cec5195602bc..43b93570d8f9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -229,11 +229,6 @@ static void jit_fill_hole(void *area, unsigned int size)
memset(area, 0xcc, size);
}
-int bpf_arch_text_invalidate(void *dst, size_t len)
-{
- return IS_ERR_OR_NULL(text_poke_set(dst, 0xcc, len));
-}
-
struct jit_context {
int cleanup_addr; /* Epilogue code offset */
@@ -2509,11 +2504,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
if (proglen <= 0) {
out_image:
image = NULL;
- if (header) {
- bpf_arch_text_copy(&header->size, &rw_header->size,
- sizeof(rw_header->size));
+ if (header)
bpf_jit_binary_pack_free(header, rw_header);
- }
+
/* Fall back to interpreter mode */
prog = orig_prog;
if (extra_pass) {
@@ -2563,8 +2556,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
if (!prog->is_func || extra_pass) {
/*
* bpf_jit_binary_pack_finalize fails in two scenarios:
- * 1) header is not pointing to proper module memory;
- * 2) the arch doesn't support bpf_arch_text_copy().
+ * 1) header is not pointing to memory allocated by
+ * execmem_alloc;
+ * 2) the arch doesn't support execmem_fill().
*
* Both cases are serious bugs and justify WARN_ON.
*/
@@ -2610,13 +2604,6 @@ bool bpf_jit_supports_kfunc_call(void)
return true;
}
-void *bpf_arch_text_copy(void *dst, void *src, size_t len)
-{
- if (text_poke_copy(dst, src, len) == NULL)
- return ERR_PTR(-EINVAL);
- return dst;
-}
-
/* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */
bool bpf_jit_supports_subprog_tailcalls(void)
{
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c6aa6912ea16..e04c63b621da 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2775,9 +2775,6 @@ enum bpf_text_poke_type {
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *addr1, void *addr2);
-void *bpf_arch_text_copy(void *dst, void *src, size_t len);
-int bpf_arch_text_invalidate(void *dst, size_t len);
-
struct btf_id_set;
bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index bf701976056e..4373ff388759 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1023,8 +1023,6 @@ extern long bpf_jit_limit_max;
typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
-void bpf_jit_fill_hole_with_zero(void *area, unsigned int size);
-
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
unsigned int alignment,
@@ -1037,9 +1035,6 @@ void bpf_jit_free(struct bpf_prog *fp);
struct bpf_binary_header *
bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);
-void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
-void bpf_prog_pack_free(struct bpf_binary_header *hdr);
-
static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
{
return list_empty(&fp->aux->ksym.lnode) ||
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2e57fc839a5c..56397556c622 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -810,149 +810,6 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return slot;
}
-/*
- * BPF program pack allocator.
- *
- * Most BPF programs are pretty small. Allocating a hole page for each
- * program is sometime a waste. Many small bpf program also adds pressure
- * to instruction TLB. To solve this issue, we introduce a BPF program pack
- * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
- * to host BPF programs.
- */
-#define BPF_PROG_CHUNK_SHIFT 6
-#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
-#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
-
-struct bpf_prog_pack {
- struct list_head list;
- void *ptr;
- unsigned long bitmap[];
-};
-
-void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
-{
- memset(area, 0, size);
-}
-
-#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
-
-static DEFINE_MUTEX(pack_mutex);
-static LIST_HEAD(pack_list);
-
-/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
- * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
- */
-#ifdef PMD_SIZE
-#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes())
-#else
-#define BPF_PROG_PACK_SIZE PAGE_SIZE
-#endif
-
-#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
-
-static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
-{
- struct bpf_prog_pack *pack;
-
- pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
- GFP_KERNEL);
- if (!pack)
- return NULL;
- pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
- if (!pack->ptr) {
- kfree(pack);
- return NULL;
- }
- bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
- bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
- list_add_tail(&pack->list, &pack_list);
-
- set_vm_flush_reset_perms(pack->ptr);
- set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
- set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
- return pack;
-}
-
-void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
-{
- unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
- struct bpf_prog_pack *pack;
- unsigned long pos;
- void *ptr = NULL;
-
- mutex_lock(&pack_mutex);
- if (size > BPF_PROG_PACK_SIZE) {
- size = round_up(size, PAGE_SIZE);
- ptr = module_alloc(size);
- if (ptr) {
- bpf_fill_ill_insns(ptr, size);
- set_vm_flush_reset_perms(ptr);
- set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
- set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
- }
- goto out;
- }
- list_for_each_entry(pack, &pack_list, list) {
- pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
- nbits, 0);
- if (pos < BPF_PROG_CHUNK_COUNT)
- goto found_free_area;
- }
-
- pack = alloc_new_pack(bpf_fill_ill_insns);
- if (!pack)
- goto out;
-
- pos = 0;
-
-found_free_area:
- bitmap_set(pack->bitmap, pos, nbits);
- ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
-
-out:
- mutex_unlock(&pack_mutex);
- return ptr;
-}
-
-void bpf_prog_pack_free(struct bpf_binary_header *hdr)
-{
- struct bpf_prog_pack *pack = NULL, *tmp;
- unsigned int nbits;
- unsigned long pos;
-
- mutex_lock(&pack_mutex);
- if (hdr->size > BPF_PROG_PACK_SIZE) {
- module_memfree(hdr);
- goto out;
- }
-
- list_for_each_entry(tmp, &pack_list, list) {
- if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) {
- pack = tmp;
- break;
- }
- }
-
- if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
- goto out;
-
- nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
- pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
-
- WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
- "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
-
- bitmap_clear(pack->bitmap, pos, nbits);
- if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
- BPF_PROG_CHUNK_COUNT, 0) == 0) {
- list_del(&pack->list);
- module_memfree(pack->ptr);
- kfree(pack);
- }
-out:
- mutex_unlock(&pack_mutex);
-}
-
static atomic_long_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -1052,6 +909,9 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
bpf_jit_uncharge_modmem(size);
}
+#define BPF_PROG_EXEC_ALIGN 64
+#define BPF_PROG_EXEC_MASK (~(BPF_PROG_EXEC_ALIGN - 1))
+
/* Allocate jit binary from bpf_prog_pack allocator.
* Since the allocated memory is RO+X, the JIT engine cannot write directly
* to the memory. To solve this problem, a RW buffer is also allocated at
@@ -1074,11 +934,11 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
alignment > BPF_IMAGE_ALIGNMENT);
/* add 16 bytes for a random section of illegal instructions */
- size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_EXEC_ALIGN);
if (bpf_jit_charge_modmem(size))
return NULL;
- ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
+ ro_header = execmem_alloc(size, BPF_PROG_EXEC_ALIGN);
if (!ro_header) {
bpf_jit_uncharge_modmem(size);
return NULL;
@@ -1086,8 +946,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
*rw_header = kvmalloc(size, GFP_KERNEL);
if (!*rw_header) {
- bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
- bpf_prog_pack_free(ro_header);
+ execmem_free(ro_header);
bpf_jit_uncharge_modmem(size);
return NULL;
}
@@ -1097,7 +956,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
(*rw_header)->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
- BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+ BPF_PROG_EXEC_ALIGN - sizeof(*ro_header));
start = prandom_u32_max(hole) & ~(alignment - 1);
*image_ptr = &ro_header->image[start];
@@ -1113,12 +972,12 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
{
void *ptr;
- ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+ ptr = execmem_fill(ro_header, rw_header, rw_header->size);
kvfree(rw_header);
if (IS_ERR(ptr)) {
- bpf_prog_pack_free(ro_header);
+ execmem_free(ro_header);
return PTR_ERR(ptr);
}
return 0;
@@ -1128,18 +987,13 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
* 1) when the program is freed after;
* 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
* For case 2), we need to free both the RO memory and the RW buffer.
- *
- * bpf_jit_binary_pack_free requires proper ro_header->size. However,
- * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
- * must be set with either bpf_jit_binary_pack_finalize (normal path) or
- * bpf_arch_text_copy (when jit fails).
*/
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
struct bpf_binary_header *rw_header)
{
- u32 size = ro_header->size;
+ u32 size = rw_header ? rw_header->size : ro_header->size;
- bpf_prog_pack_free(ro_header);
+ execmem_free(ro_header);
kvfree(rw_header);
bpf_jit_uncharge_modmem(size);
}
@@ -1150,7 +1004,7 @@ bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
unsigned long real_start = (unsigned long)fp->bpf_func;
unsigned long addr;
- addr = real_start & BPF_PROG_CHUNK_MASK;
+ addr = real_start & BPF_PROG_EXEC_MASK;
return (void *)addr;
}
@@ -2740,16 +2594,6 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
return -ENOTSUPP;
}
-void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
-{
- return ERR_PTR(-ENOTSUPP);
-}
-
-int __weak bpf_arch_text_invalidate(void *dst, size_t len)
-{
- return -ENOTSUPP;
-}
-
#ifdef CONFIG_BPF_SYSCALL
static int __init bpf_global_ma_init(void)
{
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 04f0a045dcaa..c41dff3379db 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -126,11 +126,11 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
tmp = d->num_progs ? d->rw_image + noff : NULL;
if (new) {
/* Prepare the dispatcher in d->rw_image. Then use
- * bpf_arch_text_copy to update d->image, which is RO+X.
+ * execmem_fill to update d->image, which is RO+X.
*/
if (bpf_dispatcher_prepare(d, new, tmp))
return;
- if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2)))
+ if (IS_ERR(execmem_fill(new, tmp, PAGE_SIZE / 2)))
return;
}
@@ -152,15 +152,12 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
mutex_lock(&d->mutex);
if (!d->image) {
- d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero);
+ d->image = execmem_alloc(PAGE_SIZE, PAGE_SIZE /* align */);
if (!d->image)
goto out;
d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!d->rw_image) {
- u32 size = PAGE_SIZE;
-
- bpf_arch_text_copy(d->image, &size, sizeof(size));
- bpf_prog_pack_free((struct bpf_binary_header *)d->image);
+ execmem_free((struct bpf_binary_header *)d->image);
d->image = NULL;
goto out;
}
--
2.30.2
* [PATCH bpf-next v5 5/6] vmalloc: introduce register_text_tail_vm()
2022-11-28 19:02 [PATCH bpf-next v5 0/6] execmem_alloc for BPF programs Song Liu
` (3 preceding siblings ...)
2022-11-28 19:02 ` [PATCH bpf-next v5 4/6] bpf: use execmem_alloc for bpf program and bpf dispatcher Song Liu
@ 2022-11-28 19:02 ` Song Liu
2022-11-28 19:02 ` [PATCH bpf-next v5 6/6] x86: use register_text_tail_vm Song Liu
5 siblings, 0 replies; 10+ messages in thread
From: Song Liu @ 2022-11-28 19:02 UTC (permalink / raw)
To: bpf, linux-mm
Cc: akpm, x86, peterz, hch, rick.p.edgecombe, rppt, mcgrof, Song Liu
Allow arch code to register some memory to be used by execmem_alloc().
One possible use case is to allocate PMD pages for kernel text up to
PMD_ALIGN(_etext), and use (_etext, PMD_ALIGN(_etext)) for
execmem_alloc. Currently, only one such region is supported.
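A minimal sketch of the intended arch-side call (patch 6 of this set does
exactly this for x86):

    /*
     * Hand the [_etext, PMD_ALIGN(_etext)) tail of the kernel text
     * mapping to execmem_alloc().
     */
    register_text_tail_vm(PFN_ALIGN((unsigned long)_etext),
                          PMD_ALIGN((unsigned long)_etext));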
Signed-off-by: Song Liu <song@kernel.org>
---
include/linux/vmalloc.h | 4 ++++
mm/vmalloc.c | 34 ++++++++++++++++++++++++++++++++++
2 files changed, 38 insertions(+)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 30aa8c187d40..2babbe3031a5 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -132,11 +132,15 @@ extern void vm_unmap_aliases(void);
#ifdef CONFIG_MMU
extern void __init vmalloc_init(void);
extern unsigned long vmalloc_nr_pages(void);
+void register_text_tail_vm(unsigned long start, unsigned long end);
#else
static inline void vmalloc_init(void)
{
}
static inline unsigned long vmalloc_nr_pages(void) { return 0; }
+static inline void register_text_tail_vm(unsigned long start, unsigned long end)
+{
+}
#endif
extern void *vmalloc(unsigned long size) __alloc_size(1);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 95ee8282b66c..fe5c8e5a5ec7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -77,6 +77,9 @@ static const bool vmap_allow_huge = false;
#endif
#define PMD_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PMD_SIZE)
+static struct vm_struct text_tail_vm;
+static struct vmap_area text_tail_va;
+
bool is_vmalloc_addr(const void *x)
{
unsigned long addr = (unsigned long)kasan_reset_tag(x);
@@ -655,6 +658,8 @@ int is_vmalloc_or_module_addr(const void *x)
unsigned long addr = (unsigned long)kasan_reset_tag(x);
if (addr >= MODULES_VADDR && addr < MODULES_END)
return 1;
+ if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
+ return 1;
#endif
return is_vmalloc_addr(x);
}
@@ -2439,6 +2444,35 @@ static void vmap_init_free_space(void)
}
}
+/*
+ * register_text_tail_vm() allows arch code to register memory regions
+ * for execmem_alloc(). Unlike regular memory used by execmem_alloc(),
+ * such a region is never freed by execmem_free().
+ *
+ * One possible use case is to allocate PMD pages for kernel text up to
+ * PMD_ALIGN(_etext), and use (_etext, PMD_ALIGN(_etext)) for
+ * execmem_alloc.
+ */
+void register_text_tail_vm(unsigned long start, unsigned long end)
+{
+ struct vmap_area *va;
+
+ /* only support one region */
+ if (WARN_ON_ONCE(text_tail_vm.addr))
+ return;
+
+ va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (WARN_ON_ONCE(!va))
+ return;
+ text_tail_vm.addr = (void *)start;
+ text_tail_vm.size = end - start;
+ text_tail_va.va_start = start;
+ text_tail_va.va_end = end;
+ text_tail_va.vm = &text_tail_vm;
+ memcpy(va, &text_tail_va, sizeof(*va));
+ insert_vmap_area_augment(va, NULL, &free_text_area_root, &free_text_area_list);
+}
+
void __init vmalloc_init(void)
{
struct vmap_area *va;
--
2.30.2
* [PATCH bpf-next v5 6/6] x86: use register_text_tail_vm
2022-11-28 19:02 [PATCH bpf-next v5 0/6] execmem_alloc for BPF programs Song Liu
` (4 preceding siblings ...)
2022-11-28 19:02 ` [PATCH bpf-next v5 5/6] vmalloc: introduce register_text_tail_vm() Song Liu
@ 2022-11-28 19:02 ` Song Liu
5 siblings, 0 replies; 10+ messages in thread
From: Song Liu @ 2022-11-28 19:02 UTC (permalink / raw)
To: bpf, linux-mm
Cc: akpm, x86, peterz, hch, rick.p.edgecombe, rppt, mcgrof, Song Liu
Allocate 2MB pages up to round_up(_etext, 2MB), and register memory
[round_up(_etext, 4KB), round_up(_etext, 2MB)] with register_text_tail_vm()
so that we can use this part of memory for dynamic kernel text (BPF
programs, etc.).
Here is an example:
[root@eth50-1 ~]# grep _etext /proc/kallsyms
ffffffff82202a08 T _etext
[root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms | tail -n 3
ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup [bpf]
ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new [bpf]
ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch [bpf]
[root@eth50-1 ~]# grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
0xffffffff82200000-0xffffffff82400000 2M ro PSE x pmd
ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, and
bpf programs.
Also update Documentation/x86/x86_64/mm.rst to show execmem can be mapped
to kernel text addresses.
Signed-off-by: Song Liu <song@kernel.org>
---
Documentation/x86/x86_64/mm.rst | 4 ++--
arch/x86/include/asm/pgtable_64_types.h | 1 +
arch/x86/mm/init_64.c | 4 +++-
3 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/Documentation/x86/x86_64/mm.rst b/Documentation/x86/x86_64/mm.rst
index 9798676bb0bf..6ee95e5fa7e9 100644
--- a/Documentation/x86/x86_64/mm.rst
+++ b/Documentation/x86/x86_64/mm.rst
@@ -62,7 +62,7 @@ Complete virtual memory map with 4-level page tables
ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
- ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
+ ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping and execmem, mapped to physical address 0
ffffffff80000000 |-2048 MB | | |
ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
ffffffffff000000 | -16 MB | | |
@@ -121,7 +121,7 @@ Complete virtual memory map with 5-level page tables
ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole
ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space
ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole
- ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping, mapped to physical address 0
+ ffffffff80000000 | -2 GB | ffffffff9fffffff | 512 MB | kernel text mapping and execmem, mapped to physical address 0
ffffffff80000000 |-2048 MB | | |
ffffffffa0000000 |-1536 MB | fffffffffeffffff | 1520 MB | module mapping space
ffffffffff000000 | -16 MB | | |
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 04f36063ad54..c0f9cceb109a 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -101,6 +101,7 @@ extern unsigned int ptrs_per_p4d;
#define PUD_MASK (~(PUD_SIZE - 1))
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+#define PMD_ALIGN(x) (((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
/*
* See Documentation/x86/x86_64/mm.rst for a description of the memory map.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3f040c6e5d13..5b42fc0c6099 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1373,7 +1373,7 @@ void mark_rodata_ro(void)
unsigned long start = PFN_ALIGN(_text);
unsigned long rodata_start = PFN_ALIGN(__start_rodata);
unsigned long end = (unsigned long)__end_rodata_hpage_align;
- unsigned long text_end = PFN_ALIGN(_etext);
+ unsigned long text_end = PMD_ALIGN(_etext);
unsigned long rodata_end = PFN_ALIGN(__end_rodata);
unsigned long all_end;
@@ -1414,6 +1414,8 @@ void mark_rodata_ro(void)
(void *)rodata_end, (void *)_sdata);
debug_checkwx();
+ register_text_tail_vm(PFN_ALIGN((unsigned long)_etext),
+ PMD_ALIGN((unsigned long)_etext));
}
int kern_addr_valid(unsigned long addr)
--
2.30.2