* [PATCH v22 2/5] ring-buffer: Introducing ring-buffer mapping functions
From: Vincent Donnefort @ 2024-04-30 11:13 UTC (permalink / raw)
To: rostedt, mhiramat, linux-kernel, linux-trace-kernel
Cc: mathieu.desnoyers, kernel-team, rdunlap, rppt, david,
Vincent Donnefort, linux-mm
In preparation for allowing the user-space to map a ring-buffer, add
a set of mapping functions:
ring_buffer_{map,unmap}()
And controls on the ring-buffer:
ring_buffer_map_get_reader() /* swap reader and head */
Mapping the ring-buffer also involves:
A unique ID for each subbuf of the ring-buffer, as subbufs are
currently only identified through their in-kernel VA.
A meta-page, where the ring-buffer statistics and a description of
the current reader are stored.
The linear mapping exposes the meta-page, followed by each subbuf of
the ring-buffer, ordered by their unique ID, which is assigned during the
first mapping.
Once mapped, no subbuf can get in or out of the ring-buffer: the buffer
size will remain unmodified and the splice-enabling functions will in
reality simply memcpy the data instead of swapping subbufs.
CC: <linux-mm@kvack.org>
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
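For illustration only, a minimal sketch of how a file implementation might
consume this API; the my_* names and the info structure are hypothetical, and
the real consumer is the trace_pipe_raw file wired up in patch 3/5:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/ring_buffer.h>

struct my_buffer_info {
	struct trace_buffer	*buffer;
	int			cpu;
};

static void my_buffer_mmap_close(struct vm_area_struct *vma)
{
	struct my_buffer_info *info = vma->vm_file->private_data;

	/* Drop the mapping installed by ring_buffer_map() */
	ring_buffer_unmap(info->buffer, info->cpu);
}

static const struct vm_operations_struct my_buffer_vmops = {
	.close	= my_buffer_mmap_close,
};

static int my_buffer_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct my_buffer_info *info = filp->private_data;
	int ret;

	/* Expose the meta-page and every subbuf, read-only, to user-space */
	ret = ring_buffer_map(info->buffer, info->cpu, vma);
	if (ret)
		return ret;

	vma->vm_ops = &my_buffer_vmops;

	return 0;
}

static long my_buffer_ioctl(struct file *filp, unsigned int cmd,
			    unsigned long arg)
{
	struct my_buffer_info *info = filp->private_data;

	/* A hypothetical "get reader" command: swap the reader and head */
	return ring_buffer_map_get_reader(info->buffer, info->cpu);
}

Patch 3/5 follows this shape, with the extra snapshot bookkeeping tracing needs.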
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index dc5ae4e96aee..96d2140b471e 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -6,6 +6,8 @@
#include <linux/seq_file.h>
#include <linux/poll.h>
+#include <uapi/linux/trace_mmap.h>
+
struct trace_buffer;
struct ring_buffer_iter;
@@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node);
#define trace_rb_cpu_prepare NULL
#endif
+int ring_buffer_map(struct trace_buffer *buffer, int cpu,
+ struct vm_area_struct *vma);
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu);
+int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu);
#endif /* _LINUX_RING_BUFFER_H */
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
new file mode 100644
index 000000000000..b682e9925539
--- /dev/null
+++ b/include/uapi/linux/trace_mmap.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _TRACE_MMAP_H_
+#define _TRACE_MMAP_H_
+
+#include <linux/types.h>
+
+/**
+ * struct trace_buffer_meta - Ring-buffer Meta-page description
+ * @meta_page_size: Size of this meta-page.
+ * @meta_struct_len: Size of this structure.
+ * @subbuf_size: Size of each sub-buffer.
+ * @nr_subbufs: Number of subbufs in the ring-buffer, including the reader.
+ * @reader.lost_events: Number of events lost at the time of the reader swap.
+ * @reader.id: subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1]
+ * @reader.read: Number of bytes read on the reader subbuf.
+ * @flags: Placeholder for now, 0 until new features are supported.
+ * @entries: Number of entries in the ring-buffer.
+ * @overrun: Number of entries lost in the ring-buffer.
+ * @read: Number of entries that have been read.
+ * @Reserved1: Internal use only.
+ * @Reserved2: Internal use only.
+ */
+struct trace_buffer_meta {
+ __u32 meta_page_size;
+ __u32 meta_struct_len;
+
+ __u32 subbuf_size;
+ __u32 nr_subbufs;
+
+ struct {
+ __u64 lost_events;
+ __u32 id;
+ __u32 read;
+ } reader;
+
+ __u64 flags;
+
+ __u64 entries;
+ __u64 overrun;
+ __u64 read;
+
+ __u64 Reserved1;
+ __u64 Reserved2;
+};
+
+#endif /* _TRACE_MMAP_H_ */
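As a side note, a user-space reader might want to validate @meta_struct_len
before touching fields, presumably so the structure can grow in later kernels
without breaking older readers. A hedged sketch, assuming "meta" points at the
mapped meta-page:

#include <stddef.h>
#include <linux/trace_mmap.h>

static inline int meta_has_field(const struct trace_buffer_meta *meta,
				 size_t field_end)
{
	/* The kernel fills meta_struct_len with the size it knows about */
	return meta->meta_struct_len >= field_end;
}

static __u64 meta_flags(const struct trace_buffer_meta *meta)
{
	/* Only read 'flags' if the running kernel actually exports it */
	if (!meta_has_field(meta, offsetof(struct trace_buffer_meta, flags) +
				  sizeof(meta->flags)))
		return 0;

	return meta->flags;
}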
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cc9ebe593571..fc66d01ff472 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -9,6 +9,7 @@
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
#include <linux/sched/clock.h>
+#include <linux/cacheflush.h>
#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/irq_work.h>
@@ -26,6 +27,7 @@
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <linux/mm.h>
#include <asm/local64.h>
#include <asm/local.h>
@@ -338,6 +340,7 @@ struct buffer_page {
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
unsigned order; /* order of the page */
+ u32 id; /* ID for external mapping */
struct buffer_data_page *page; /* Actual data page */
};
@@ -484,6 +487,12 @@ struct ring_buffer_per_cpu {
u64 read_stamp;
/* pages removed since last reset */
unsigned long pages_removed;
+
+ unsigned int mapped;
+ struct mutex mapping_lock;
+ unsigned long *subbuf_ids; /* ID to subbuf VA */
+ struct trace_buffer_meta *meta_page;
+
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&cpu_buffer->irq_work.waiters);
init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
+ mutex_init(&cpu_buffer->mapping_lock);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
@@ -1789,8 +1799,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer)
return buffer->time_stamp_abs;
}
-static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
-
static inline unsigned long rb_page_entries(struct buffer_page *bpage)
{
return local_read(&bpage->entries) & RB_WRITE_MASK;
@@ -5211,6 +5219,22 @@ static void rb_clear_buffer_page(struct buffer_page *page)
page->read = 0;
}
+static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+
+ meta->reader.read = cpu_buffer->reader_page->read;
+ meta->reader.id = cpu_buffer->reader_page->id;
+ meta->reader.lost_events = cpu_buffer->lost_events;
+
+ meta->entries = local_read(&cpu_buffer->entries);
+ meta->overrun = local_read(&cpu_buffer->overrun);
+ meta->read = cpu_buffer->read;
+
+ /* Some archs do not have data cache coherency between kernel and user-space */
+ flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page));
+}
+
static void
rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -5255,6 +5279,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
+ if (cpu_buffer->mapped)
+ rb_update_meta_page(cpu_buffer);
+
rb_head_page_activate(cpu_buffer);
cpu_buffer->pages_removed = 0;
}
@@ -5469,6 +5496,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
+ /* It's up to the callers to not try to swap mapped buffers */
+ if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* At least make sure the two buffers are somewhat the same */
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
@@ -5733,7 +5766,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
* Otherwise, we can simply swap the page with the one passed in.
*/
if (read || (len < (commit - read)) ||
- cpu_buffer->reader_page == cpu_buffer->commit_page) {
+ cpu_buffer->reader_page == cpu_buffer->commit_page ||
+ cpu_buffer->mapped) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
@@ -5956,6 +5990,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
cpu_buffer = buffer->buffers[cpu];
+ if (cpu_buffer->mapped) {
+ err = -EBUSY;
+ goto error;
+ }
+
/* Update the number of pages to match the new size */
nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
@@ -6057,6 +6096,370 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
}
EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct page *page;
+
+ if (cpu_buffer->meta_page)
+ return 0;
+
+ page = alloc_page(GFP_USER | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ cpu_buffer->meta_page = page_to_virt(page);
+
+ return 0;
+}
+
+static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long addr = (unsigned long)cpu_buffer->meta_page;
+
+ free_page(addr);
+ cpu_buffer->meta_page = NULL;
+}
+
+static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long *subbuf_ids)
+{
+ struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+ unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
+ struct buffer_page *first_subbuf, *subbuf;
+ int id = 0;
+
+ subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
+ cpu_buffer->reader_page->id = id++;
+
+ first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
+ do {
+ if (WARN_ON(id >= nr_subbufs))
+ break;
+
+ subbuf_ids[id] = (unsigned long)subbuf->page;
+ subbuf->id = id;
+
+ rb_inc_page(&subbuf);
+ id++;
+ } while (subbuf != first_subbuf);
+
+ /* install subbuf ID to kern VA translation */
+ cpu_buffer->subbuf_ids = subbuf_ids;
+
+ meta->meta_page_size = PAGE_SIZE;
+ meta->meta_struct_len = sizeof(*meta);
+ meta->nr_subbufs = nr_subbufs;
+ meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+
+ rb_update_meta_page(cpu_buffer);
+}
+
+static struct ring_buffer_per_cpu *
+rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return ERR_PTR(-EINVAL);
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (!cpu_buffer->mapped) {
+ mutex_unlock(&cpu_buffer->mapping_lock);
+ return ERR_PTR(-ENODEV);
+ }
+
+ return cpu_buffer;
+}
+
+static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ mutex_unlock(&cpu_buffer->mapping_lock);
+}
+
+/*
+ * Fast-path for ring_buffer_(un)map(). Called whenever the meta-page doesn't
+ * need to be set up or torn down.
+ */
+static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer,
+ bool inc)
+{
+ unsigned long flags;
+
+ lockdep_assert_held(&cpu_buffer->mapping_lock);
+
+ if (inc && cpu_buffer->mapped == UINT_MAX)
+ return -EBUSY;
+
+ if (WARN_ON(!inc && cpu_buffer->mapped == 0))
+ return -EINVAL;
+
+ mutex_lock(&cpu_buffer->buffer->mutex);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ if (inc)
+ cpu_buffer->mapped++;
+ else
+ cpu_buffer->mapped--;
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ mutex_unlock(&cpu_buffer->buffer->mutex);
+
+ return 0;
+}
+
+/*
+ * +--------------+ pgoff == 0
+ * | meta page |
+ * +--------------+ pgoff == 1
+ * | subbuffer 0 |
+ * | |
+ * +--------------+ pgoff == (1 + (1 << subbuf_order))
+ * | subbuffer 1 |
+ * | |
+ * ...
+ */
+#ifdef CONFIG_MMU
+static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
+ struct vm_area_struct *vma)
+{
+ unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
+ unsigned int subbuf_pages, subbuf_order;
+ struct page **pages;
+ int p = 0, s = 0;
+ int err;
+
+ /* Refuse MAP_PRIVATE or writable mappings */
+ if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC ||
+ !(vma->vm_flags & VM_MAYSHARE))
+ return -EPERM;
+
+ /*
+ * Make sure the mapping cannot become writable later. Also tell the VM
+ * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). Finally,
+ * prevent migration, GUP and dump (VM_IO).
+ */
+ vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_IO, VM_MAYWRITE);
+
+ lockdep_assert_held(&cpu_buffer->mapping_lock);
+
+ subbuf_order = cpu_buffer->buffer->subbuf_order;
+ subbuf_pages = 1 << subbuf_order;
+
+ nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
+ nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */
+
+ vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ if (!vma_pages || vma_pages > nr_pages)
+ return -EINVAL;
+
+ nr_pages = vma_pages;
+
+ pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ if (!pgoff) {
+ pages[p++] = virt_to_page(cpu_buffer->meta_page);
+
+ /*
+ * TODO: Align sub-buffers on their size, once
+ * vm_insert_pages() supports the zero-page.
+ */
+ } else {
+ /* Skip the meta-page */
+ pgoff--;
+
+ if (pgoff % subbuf_pages) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ s += pgoff / subbuf_pages;
+ }
+
+ while (s < nr_subbufs && p < nr_pages) {
+ struct page *page = virt_to_page(cpu_buffer->subbuf_ids[s]);
+ int off = 0;
+
+ for (; off < (1 << (subbuf_order)); off++, page++) {
+ if (p >= nr_pages)
+ break;
+
+ pages[p++] = page;
+ }
+ s++;
+ }
+
+ err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+
+out:
+ kfree(pages);
+
+ return err;
+}
+#else
+static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
+ struct vm_area_struct *vma)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+int ring_buffer_map(struct trace_buffer *buffer, int cpu,
+ struct vm_area_struct *vma)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags, *subbuf_ids;
+ int err = 0;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (cpu_buffer->mapped) {
+ err = __rb_map_vma(cpu_buffer, vma);
+ if (!err)
+ err = __rb_inc_dec_mapped(cpu_buffer, true);
+ mutex_unlock(&cpu_buffer->mapping_lock);
+ return err;
+ }
+
+ /* prevent another thread from changing buffer/sub-buffer sizes */
+ mutex_lock(&buffer->mutex);
+
+ err = rb_alloc_meta_page(cpu_buffer);
+ if (err)
+ goto unlock;
+
+ /* subbuf_ids include the reader while nr_pages does not */
+ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
+ if (!subbuf_ids) {
+ rb_free_meta_page(cpu_buffer);
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ atomic_inc(&cpu_buffer->resize_disabled);
+
+ /*
+ * Lock all readers to block any subbuf swap until the subbuf IDs are
+ * assigned.
+ */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ err = __rb_map_vma(cpu_buffer, vma);
+ if (!err) {
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ cpu_buffer->mapped = 1;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ } else {
+ kfree(cpu_buffer->subbuf_ids);
+ cpu_buffer->subbuf_ids = NULL;
+ rb_free_meta_page(cpu_buffer);
+ }
+
+unlock:
+ mutex_unlock(&buffer->mutex);
+ mutex_unlock(&cpu_buffer->mapping_lock);
+
+ return err;
+}
+
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int err = 0;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (!cpu_buffer->mapped) {
+ err = -ENODEV;
+ goto out;
+ } else if (cpu_buffer->mapped > 1) {
+ __rb_inc_dec_mapped(cpu_buffer, false);
+ goto out;
+ }
+
+ mutex_lock(&buffer->mutex);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ cpu_buffer->mapped = 0;
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ kfree(cpu_buffer->subbuf_ids);
+ cpu_buffer->subbuf_ids = NULL;
+ rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
+
+ mutex_unlock(&buffer->mutex);
+
+out:
+ mutex_unlock(&cpu_buffer->mapping_lock);
+
+ return err;
+}
+
+int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long reader_size;
+ unsigned long flags;
+
+ cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
+ if (IS_ERR(cpu_buffer))
+ return (int)PTR_ERR(cpu_buffer);
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+consume:
+ if (rb_per_cpu_empty(cpu_buffer))
+ goto out;
+
+ reader_size = rb_page_size(cpu_buffer->reader_page);
+
+ /*
+ * There is data to be read on the current reader page. We can return to the
+ * caller, but before that, we assume the caller will read everything.
+ * Let's update the kernel reader accordingly.
+ */
+ if (cpu_buffer->reader_page->read < reader_size) {
+ while (cpu_buffer->reader_page->read < reader_size)
+ rb_advance_reader(cpu_buffer);
+ goto out;
+ }
+
+ if (WARN_ON(!rb_get_reader_page(cpu_buffer)))
+ goto out;
+
+ goto consume;
+
+out:
+ /* Some archs do not have data cache coherency between kernel and user-space */
+ flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page));
+
+ rb_update_meta_page(cpu_buffer);
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ rb_put_mapped_buffer(cpu_buffer);
+
+ return 0;
+}
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
--
2.44.0.769.g3c40516874-goog
* [PATCH v22 3/5] tracing: Allow user-space mapping of the ring-buffer
From: Vincent Donnefort @ 2024-04-30 11:13 UTC (permalink / raw)
To: rostedt, mhiramat, linux-kernel, linux-trace-kernel
Cc: mathieu.desnoyers, kernel-team, rdunlap, rppt, david,
Vincent Donnefort, linux-mm
Currently, user-space extracts data from the ring-buffer via splice,
which is handy for storage or network sharing. However, due to splice
limitations, it is impossible to do real-time analysis without a copy.
A solution for that problem is to let the user-space map the ring-buffer
directly.
The mapping is exposed via the per-CPU file trace_pipe_raw. The first
element of the mapping is the meta-page. It is followed by each
subbuffer constituting the ring-buffer, ordered by their unique page ID:
* Meta-page -- include/uapi/linux/trace_mmap.h for a description
* Subbuf ID 0
* Subbuf ID 1
...
It is therefore easy to translate a subbuf ID into an offset in the
mapping:
reader_id = meta->reader.id;
reader_offset = meta->meta_page_size + reader_id * meta->subbuf_size;
When new data is available, the mapper must call the newly introduced ioctl
TRACE_MMAP_IOCTL_GET_READER. This will update the meta-page reader ID to
point to the next sub-buffer containing unread data.
Mapping will prevent snapshot and buffer size modifications.
CC: <linux-mm@kvack.org>
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
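A minimal user-space sketch of that flow; the tracefs path and the absence of
error handling are assumptions, not part of this patch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/trace_mmap.h>

int main(void)
{
	struct trace_buffer_meta *meta;
	unsigned long reader_offset;
	size_t map_len;
	void *map;
	int fd;

	fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw", O_RDONLY);
	if (fd < 0)
		return 1;

	/* The meta-page is at pgoff 0; map it alone to learn the full length */
	meta = mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED, fd, 0);
	if (meta == MAP_FAILED)
		return 1;
	map_len = meta->meta_page_size +
		  (size_t)meta->nr_subbufs * meta->subbuf_size;
	munmap(meta, getpagesize());

	/* Meta-page followed by every subbuf, ordered by their unique ID */
	map = mmap(NULL, map_len, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;
	meta = map;

	/* Swap the reader subbuf in; this waits for data unless O_NONBLOCK */
	if (ioctl(fd, TRACE_MMAP_IOCTL_GET_READER) == 0) {
		reader_offset = meta->meta_page_size +
				(unsigned long)meta->reader.id * meta->subbuf_size;
		printf("reader subbuf %u at offset %lu, %u bytes already read\n",
		       meta->reader.id, reader_offset, meta->reader.read);
		/* Events would be decoded from (char *)map + reader_offset */
	}

	munmap(map, map_len);
	close(fd);

	return 0;
}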
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
index b682e9925539..bd1066754220 100644
--- a/include/uapi/linux/trace_mmap.h
+++ b/include/uapi/linux/trace_mmap.h
@@ -43,4 +43,6 @@ struct trace_buffer_meta {
__u64 Reserved2;
};
+#define TRACE_MMAP_IOCTL_GET_READER _IO('T', 0x1)
+
#endif /* _TRACE_MMAP_H_ */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 233d1af39fff..a35e7f598233 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1191,6 +1191,12 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr,
return;
}
+ if (tr->mapped) {
+ trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
local_irq_save(flags);
update_max_tr(tr, current, smp_processor_id(), cond_data);
local_irq_restore(flags);
@@ -1323,7 +1329,7 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr)
lockdep_assert_held(&trace_types_lock);
spin_lock(&tr->snapshot_trigger_lock);
- if (tr->snapshot == UINT_MAX) {
+ if (tr->snapshot == UINT_MAX || tr->mapped) {
spin_unlock(&tr->snapshot_trigger_lock);
return -EBUSY;
}
@@ -6068,7 +6074,7 @@ static void tracing_set_nop(struct trace_array *tr)
{
if (tr->current_trace == &nop_trace)
return;
-
+
tr->current_trace->enabled--;
if (tr->current_trace->reset)
@@ -8194,15 +8200,32 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return ret;
}
-/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
+ int err;
+
+ if (cmd == TRACE_MMAP_IOCTL_GET_READER) {
+ if (!(file->f_flags & O_NONBLOCK)) {
+ err = ring_buffer_wait(iter->array_buffer->buffer,
+ iter->cpu_file,
+ iter->tr->buffer_percent,
+ NULL, NULL);
+ if (err)
+ return err;
+ }
- if (cmd)
- return -ENOIOCTLCMD;
+ return ring_buffer_map_get_reader(iter->array_buffer->buffer,
+ iter->cpu_file);
+ } else if (cmd) {
+ return -ENOTTY;
+ }
+ /*
+ * An ioctl call with cmd 0 to the ring buffer file will wake up all
+ * waiters
+ */
mutex_lock(&trace_types_lock);
/* Make sure the waiters see the new wait_index */
@@ -8214,6 +8237,76 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
return 0;
}
+#ifdef CONFIG_TRACER_MAX_TRACE
+static int get_snapshot_map(struct trace_array *tr)
+{
+ int err = 0;
+
+ /*
+ * Called with mmap_lock held. lockdep would be unhappy if we would now
+ * take trace_types_lock. Instead use the specific
+ * snapshot_trigger_lock.
+ */
+ spin_lock(&tr->snapshot_trigger_lock);
+
+ if (tr->snapshot || tr->mapped == UINT_MAX)
+ err = -EBUSY;
+ else
+ tr->mapped++;
+
+ spin_unlock(&tr->snapshot_trigger_lock);
+
+ /* Wait for update_max_tr() to observe iter->tr->mapped */
+ if (tr->mapped == 1)
+ synchronize_rcu();
+
+ return err;
+
+}
+static void put_snapshot_map(struct trace_array *tr)
+{
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (!WARN_ON(!tr->mapped))
+ tr->mapped--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+}
+#else
+static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
+static inline void put_snapshot_map(struct trace_array *tr) { }
+#endif
+
+static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
+{
+ struct ftrace_buffer_info *info = vma->vm_file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ WARN_ON(ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file));
+ put_snapshot_map(iter->tr);
+}
+
+static const struct vm_operations_struct tracing_buffers_vmops = {
+ .close = tracing_buffers_mmap_close,
+};
+
+static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct ftrace_buffer_info *info = filp->private_data;
+ struct trace_iterator *iter = &info->iter;
+ int ret = 0;
+
+ ret = get_snapshot_map(iter->tr);
+ if (ret)
+ return ret;
+
+ ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma);
+ if (ret)
+ put_snapshot_map(iter->tr);
+
+ vma->vm_ops = &tracing_buffers_vmops;
+
+ return ret;
+}
+
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
@@ -8223,6 +8316,7 @@ static const struct file_operations tracing_buffers_fops = {
.splice_read = tracing_buffers_splice_read,
.unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek,
+ .mmap = tracing_buffers_mmap,
};
static ssize_t
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 64450615ca0c..749a182dab48 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -336,6 +336,7 @@ struct trace_array {
bool allocated_snapshot;
spinlock_t snapshot_trigger_lock;
unsigned int snapshot;
+ unsigned int mapped;
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
--
2.44.0.769.g3c40516874-goog
* Re: [PATCH v22 2/5] ring-buffer: Introducing ring-buffer mapping functions
From: David Hildenbrand @ 2024-05-02 13:30 UTC (permalink / raw)
To: Vincent Donnefort, rostedt, mhiramat, linux-kernel, linux-trace-kernel
Cc: mathieu.desnoyers, kernel-team, rdunlap, rppt, linux-mm
On 30.04.24 13:13, Vincent Donnefort wrote:
> In preparation for allowing the user-space to map a ring-buffer, add
> a set of mapping functions:
>
> ring_buffer_{map,unmap}()
>
> And controls on the ring-buffer:
>
> ring_buffer_map_get_reader() /* swap reader and head */
>
> Mapping the ring-buffer also involves:
>
> A unique ID for each subbuf of the ring-buffer, currently they are
> only identified through their in-kernel VA.
>
> A meta-page, where are stored ring-buffer statistics and a
> description for the current reader
>
> The linear mapping exposes the meta-page, and each subbuf of the
> ring-buffer, ordered following their unique ID, assigned during the
> first mapping.
>
> Once mapped, no subbuf can get in or out of the ring-buffer: the buffer
> size will remain unmodified and the splice enabling functions will in
> reality simply memcpy the data instead of swapping subbufs.
>
> CC: <linux-mm@kvack.org>
> Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
>
> diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
> index dc5ae4e96aee..96d2140b471e 100644
> --- a/include/linux/ring_buffer.h
> +++ b/include/linux/ring_buffer.h
[...]
> +/*
> + * +--------------+ pgoff == 0
> + * | meta page |
> + * +--------------+ pgoff == 1
> + * | subbuffer 0 |
> + * | |
> + * +--------------+ pgoff == (1 + (1 << subbuf_order))
> + * | subbuffer 1 |
> + * | |
> + * ...
> + */
> +#ifdef CONFIG_MMU
> +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
> + struct vm_area_struct *vma)
> +{
> + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
> + unsigned int subbuf_pages, subbuf_order;
> + struct page **pages;
> + int p = 0, s = 0;
> + int err;
> +
> + /* Refuse MP_PRIVATE or writable mappings */
> + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC ||
> + !(vma->vm_flags & VM_MAYSHARE))
> + return -EPERM;
> +
> + /*
> + * Make sure the mapping cannot become writable later. Also tell the VM
> + * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). Finally,
> + * prevent migration, GUP and dump (VM_IO).
> + */
> + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_IO, VM_MAYWRITE);
> +
> + lockdep_assert_held(&cpu_buffer->mapping_lock);
> +
> + subbuf_order = cpu_buffer->buffer->subbuf_order;
> + subbuf_pages = 1 << subbuf_order;
> +
> + nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
> + nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */
> +
> + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> + if (!vma_pages || vma_pages > nr_pages)
> + return -EINVAL;
> +
> + nr_pages = vma_pages;
> +
> + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
> + if (!pages)
> + return -ENOMEM;
> +
> + if (!pgoff) {
> + pages[p++] = virt_to_page(cpu_buffer->meta_page);
> +
> + /*
> + * TODO: Align sub-buffers on their size, once
> + * vm_insert_pages() supports the zero-page.
> + */
> + } else {
> + /* Skip the meta-page */
> + pgoff--;
> +
> + if (pgoff % subbuf_pages) {
> + err = -EINVAL;
> + goto out;
> + }
> +
> + s += pgoff / subbuf_pages;
> + }
> +
> + while (s < nr_subbufs && p < nr_pages) {
> + struct page *page = virt_to_page(cpu_buffer->subbuf_ids[s]);
> + int off = 0;
> +
> + for (; off < (1 << (subbuf_order)); off++, page++) {
> + if (p >= nr_pages)
> + break;
> +
> + pages[p++] = page;
> + }
> + s++;
> + }
> +
> + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
Nit: I did not immediately understand if we could end here with p <
nr_pages (IOW, pages[] not completely filled).
One source of confusion is the "s < nr_subbufs" check in the while loop:
why is "p < nr_pages" insufficient?
For the MM bits:
Acked-by: David Hildenbrand <david@redhat.com>
--
Cheers,
David / dhildenb
* Re: [PATCH v22 2/5] ring-buffer: Introducing ring-buffer mapping functions
From: Vincent Donnefort @ 2024-05-02 13:38 UTC (permalink / raw)
To: David Hildenbrand
Cc: rostedt, mhiramat, linux-kernel, linux-trace-kernel,
mathieu.desnoyers, kernel-team, rdunlap, rppt, linux-mm
On Thu, May 02, 2024 at 03:30:32PM +0200, David Hildenbrand wrote:
> On 30.04.24 13:13, Vincent Donnefort wrote:
> > In preparation for allowing the user-space to map a ring-buffer, add
> > a set of mapping functions:
> >
> > ring_buffer_{map,unmap}()
> >
> > And controls on the ring-buffer:
> >
> > ring_buffer_map_get_reader() /* swap reader and head */
> >
> > Mapping the ring-buffer also involves:
> >
> > A unique ID for each subbuf of the ring-buffer, currently they are
> > only identified through their in-kernel VA.
> >
> > A meta-page, where are stored ring-buffer statistics and a
> > description for the current reader
> >
> > The linear mapping exposes the meta-page, and each subbuf of the
> > ring-buffer, ordered following their unique ID, assigned during the
> > first mapping.
> >
> > Once mapped, no subbuf can get in or out of the ring-buffer: the buffer
> > size will remain unmodified and the splice enabling functions will in
> > reality simply memcpy the data instead of swapping subbufs.
> >
> > CC: <linux-mm@kvack.org>
> > Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
> >
> > diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
> > index dc5ae4e96aee..96d2140b471e 100644
> > --- a/include/linux/ring_buffer.h
> > +++ b/include/linux/ring_buffer.h
>
> [...]
>
> > +/*
> > + * +--------------+ pgoff == 0
> > + * | meta page |
> > + * +--------------+ pgoff == 1
> > + * | subbuffer 0 |
> > + * | |
> > + * +--------------+ pgoff == (1 + (1 << subbuf_order))
> > + * | subbuffer 1 |
> > + * | |
> > + * ...
> > + */
> > +#ifdef CONFIG_MMU
> > +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
> > + struct vm_area_struct *vma)
> > +{
> > + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
> > + unsigned int subbuf_pages, subbuf_order;
> > + struct page **pages;
> > + int p = 0, s = 0;
> > + int err;
> > +
> > + /* Refuse MP_PRIVATE or writable mappings */
> > + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC ||
> > + !(vma->vm_flags & VM_MAYSHARE))
> > + return -EPERM;
> > +
> > + /*
> > + * Make sure the mapping cannot become writable later. Also tell the VM
> > + * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). Finally,
> > + * prevent migration, GUP and dump (VM_IO).
> > + */
> > + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_IO, VM_MAYWRITE);
> > +
> > + lockdep_assert_held(&cpu_buffer->mapping_lock);
> > +
> > + subbuf_order = cpu_buffer->buffer->subbuf_order;
> > + subbuf_pages = 1 << subbuf_order;
> > +
> > + nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
> > + nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */
> > +
> > + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> > + if (!vma_pages || vma_pages > nr_pages)
> > + return -EINVAL;
> > +
> > + nr_pages = vma_pages;
> > +
> > + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
> > + if (!pages)
> > + return -ENOMEM;
> > +
> > + if (!pgoff) {
> > + pages[p++] = virt_to_page(cpu_buffer->meta_page);
> > +
> > + /*
> > + * TODO: Align sub-buffers on their size, once
> > + * vm_insert_pages() supports the zero-page.
> > + */
> > + } else {
> > + /* Skip the meta-page */
> > + pgoff--;
> > +
> > + if (pgoff % subbuf_pages) {
> > + err = -EINVAL;
> > + goto out;
> > + }
> > +
> > + s += pgoff / subbuf_pages;
> > + }
> > +
> > + while (s < nr_subbufs && p < nr_pages) {
> > + struct page *page = virt_to_page(cpu_buffer->subbuf_ids[s]);
> > + int off = 0;
> > +
> > + for (; off < (1 << (subbuf_order)); off++, page++) {
> > + if (p >= nr_pages)
> > + break;
> > +
> > + pages[p++] = page;
> > + }
> > + s++;
> > + }
> > +
> > + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
>
> Nit: I did not immediately understand if we could end here with p < nr_pages
> (IOW, pages[] not completely filled).
>
> One source of confusion is the "s < nr_subbufs" check in the while loop: why
> is "p < nr_pages" insufficient?
Hum, indeed, the "s < nr_subbufs" check is superfluous: nr_pages is already
capped by the number of subbufs, so there's no way we can overflow subbuf_ids[].
>
>
> For the MM bits:
>
> Acked-by: David Hildenbrand <david@redhat.com>
Thanks a lot for having a look at the series, very much appreciated!
>
>
>
> --
> Cheers,
>
> David / dhildenb
>
* Re: [PATCH v22 2/5] ring-buffer: Introducing ring-buffer mapping functions
From: Steven Rostedt @ 2024-05-02 13:46 UTC (permalink / raw)
To: Vincent Donnefort
Cc: David Hildenbrand, mhiramat, linux-kernel, linux-trace-kernel,
mathieu.desnoyers, kernel-team, rdunlap, rppt, linux-mm
On Thu, 2 May 2024 14:38:32 +0100
Vincent Donnefort <vdonnefort@google.com> wrote:
> > > + while (s < nr_subbufs && p < nr_pages) {
> > > + struct page *page = virt_to_page(cpu_buffer->subbuf_ids[s]);
> > > + int off = 0;
> > > +
> > > + for (; off < (1 << (subbuf_order)); off++, page++) {
> > > + if (p >= nr_pages)
> > > + break;
> > > +
> > > + pages[p++] = page;
> > > + }
> > > + s++;
> > > + }
> > > +
> > > + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
> >
> > Nit: I did not immediately understand if we could end here with p < nr_pages
> > (IOW, pages[] not completely filled).
> >
> > One source of confusion is the "s < nr_subbufs" check in the while loop: why
> > is "p < nr_pages" insufficient?
>
> Hum, indeed, the "s < nr_subbufs" check is superfluous, nr_pages, is already
> capped by the number of subbufs, there's no way we can overflow subbuf_ids[].
We can keep it as is, or perhaps change it to:
while (p < nr_pages) {
struct page *page;
int off = 0;
if (WARN_ON_ONCE(s >= nr_subbufs))
break;
page = virt_to_page(cpu_buffer->subbuf_ids[s]);
for (; off < (1 << (subbuf_order)); off++, page++) {
if (p >= nr_pages)
break;
pages[p++] = page;
}
s++;
}
I don't like having an unchecked dependency between s and p.
-- Steve
* Re: [PATCH v22 2/5] ring-buffer: Introducing ring-buffer mapping functions
From: Steven Rostedt @ 2024-05-08 2:34 UTC (permalink / raw)
To: Vincent Donnefort
Cc: mhiramat, linux-kernel, linux-trace-kernel, mathieu.desnoyers,
kernel-team, rdunlap, rppt, david, linux-mm
On Tue, 30 Apr 2024 12:13:51 +0100
Vincent Donnefort <vdonnefort@google.com> wrote:
> +#ifdef CONFIG_MMU
> +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
> + struct vm_area_struct *vma)
> +{
> + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
> + unsigned int subbuf_pages, subbuf_order;
> + struct page **pages;
> + int p = 0, s = 0;
> + int err;
> +
> + /* Refuse MP_PRIVATE or writable mappings */
> + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC ||
> + !(vma->vm_flags & VM_MAYSHARE))
> + return -EPERM;
> +
> + /*
> + * Make sure the mapping cannot become writable later. Also tell the VM
> + * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). Finally,
> + * prevent migration, GUP and dump (VM_IO).
> + */
> + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_IO, VM_MAYWRITE);
Do we really need the VM_IO?
When testing this in gdb, I would get:
(gdb) p tmap->map->subbuf_size
Cannot access memory at address 0x7ffff7fc2008
It appears that you can't ptrace IO memory. When I removed that flag,
gdb has no problem reading that memory.
I think we should drop that flag.
Can you send a v23 with that removed, Shuah's update, and also the
change below:
> +
> + lockdep_assert_held(&cpu_buffer->mapping_lock);
> +
> + subbuf_order = cpu_buffer->buffer->subbuf_order;
> + subbuf_pages = 1 << subbuf_order;
> +
> + nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
> + nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */
> +
> + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> + if (!vma_pages || vma_pages > nr_pages)
> + return -EINVAL;
> +
> + nr_pages = vma_pages;
> +
> + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
> + if (!pages)
> + return -ENOMEM;
> +
> + if (!pgoff) {
> + pages[p++] = virt_to_page(cpu_buffer->meta_page);
> +
> + /*
> + * TODO: Align sub-buffers on their size, once
> + * vm_insert_pages() supports the zero-page.
> + */
> + } else {
> + /* Skip the meta-page */
> + pgoff--;
> +
> + if (pgoff % subbuf_pages) {
> + err = -EINVAL;
> + goto out;
> + }
> +
> + s += pgoff / subbuf_pages;
> + }
> +
> + while (s < nr_subbufs && p < nr_pages) {
> + struct page *page = virt_to_page(cpu_buffer->subbuf_ids[s]);
> + int off = 0;
> +
> + for (; off < (1 << (subbuf_order)); off++, page++) {
> + if (p >= nr_pages)
> + break;
> +
> + pages[p++] = page;
> + }
> + s++;
> + }
The above can be made to:
while (p < nr_pages) {
struct page *page;
int off = 0;
if (WARN_ON_ONCE(s >= nr_subbufs))
break;
page = virt_to_page(cpu_buffer->subbuf_ids[s]);
for (; off < (1 << (subbuf_order)); off++, page++) {
if (p >= nr_pages)
break;
pages[p++] = page;
}
s++;
}
Thanks.
-- Steve
> +
> + err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
> +
> +out:
> + kfree(pages);
> +
> + return err;
> +}
> +#else
> +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
> + struct vm_area_struct *vma)
> +{
> + return -EOPNOTSUPP;
> +}
> +#endif
* Re: [PATCH v22 2/5] ring-buffer: Introducing ring-buffer mapping functions
From: Vincent Donnefort @ 2024-05-09 11:05 UTC (permalink / raw)
To: Steven Rostedt
Cc: mhiramat, linux-kernel, linux-trace-kernel, mathieu.desnoyers,
kernel-team, rdunlap, rppt, david, linux-mm
On Tue, May 07, 2024 at 10:34:02PM -0400, Steven Rostedt wrote:
> On Tue, 30 Apr 2024 12:13:51 +0100
> Vincent Donnefort <vdonnefort@google.com> wrote:
>
> > +#ifdef CONFIG_MMU
> > +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
> > + struct vm_area_struct *vma)
> > +{
> > + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
> > + unsigned int subbuf_pages, subbuf_order;
> > + struct page **pages;
> > + int p = 0, s = 0;
> > + int err;
> > +
> > + /* Refuse MP_PRIVATE or writable mappings */
> > + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC ||
> > + !(vma->vm_flags & VM_MAYSHARE))
> > + return -EPERM;
> > +
> > + /*
> > + * Make sure the mapping cannot become writable later. Also tell the VM
> > + * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). Finally,
> > + * prevent migration, GUP and dump (VM_IO).
> > + */
> > + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_IO, VM_MAYWRITE);
>
> Do we really need the VM_IO?
>
> When testing this in gdb, I would get:
>
> (gdb) p tmap->map->subbuf_size
> Cannot access memory at address 0x7ffff7fc2008
>
> It appears that you can't ptrace IO memory. When I removed that flag,
> gdb has no problem reading that memory.
Yeah, VM_IO indeed implies DONTDUMP. VM_IO was part of Linus' recommendations.
But perhaps VM_DONTEXPAND and MIXEDMAP (implicitly set by vm_insert_pages) are
enough protection?
I don't see how anything could use GUP there and, as David pointed out on the
previous version, it doesn't even prevent the GUP-fast path.
>
> I think we should drop that flag.
>
> Can you send a v23 with that removed, Shuah's update, and also the
> change below:
Ack.
[...]
* Re: [PATCH v22 2/5] ring-buffer: Introducing ring-buffer mapping functions
From: David Hildenbrand @ 2024-05-10 9:15 UTC (permalink / raw)
To: Vincent Donnefort, Steven Rostedt
Cc: mhiramat, linux-kernel, linux-trace-kernel, mathieu.desnoyers,
kernel-team, rdunlap, rppt, linux-mm
On 09.05.24 13:05, Vincent Donnefort wrote:
> On Tue, May 07, 2024 at 10:34:02PM -0400, Steven Rostedt wrote:
>> On Tue, 30 Apr 2024 12:13:51 +0100
>> Vincent Donnefort <vdonnefort@google.com> wrote:
>>
>>> +#ifdef CONFIG_MMU
>>> +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
>>> + struct vm_area_struct *vma)
>>> +{
>>> + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
>>> + unsigned int subbuf_pages, subbuf_order;
>>> + struct page **pages;
>>> + int p = 0, s = 0;
>>> + int err;
>>> +
>>> + /* Refuse MP_PRIVATE or writable mappings */
>>> + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC ||
>>> + !(vma->vm_flags & VM_MAYSHARE))
>>> + return -EPERM;
>>> +
>>> + /*
>>> + * Make sure the mapping cannot become writable later. Also tell the VM
>>> + * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). Finally,
>>> + * prevent migration, GUP and dump (VM_IO).
>>> + */
>>> + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_IO, VM_MAYWRITE);
>>
>> Do we really need the VM_IO?
>>
>> When testing this in gdb, I would get:
>>
>> (gdb) p tmap->map->subbuf_size
>> Cannot access memory at address 0x7ffff7fc2008
>>
>> It appears that you can't ptrace IO memory. When I removed that flag,
>> gdb has no problem reading that memory.
>
> Yeah, VM_IO indeed implies DONTDUMP. VM_IO was part of Linus recommendations.
Yes, the VM should recognize that memory to some degree as being special
already due to VM_MIXEDMAP and VM_DONTEXPAND.
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
So any of these flags achieves that (e.g., mlock_fixup() checks
VM_SPECIAL). KSM similarly skips VM_DONTEXPAND and VM_MIXEDMAP (likely
we should be using VM_SPECIAL in vma_ksm_compatible()). Not sure about
page migration; likely it's fine.
Thinking about MADV_DONTNEED, I can only spot in
madvise_dontneed_free_valid_vma() that we primarily disallow VM_PFNMAP.
... I assume if user space MADV_DONTNEED's some pages we'll simply get a
page fault later on access that will SIGBUS, handling that gracefully
(we should double-check!).
> But perhaps, VM_DONTEXPAND and MIXEDMAP (implicitely set by vm_insert_pages) are
> enough protection?
Do we want to dump these pages? VM_DONTDUMP might be reasonable then.
>
> I don't see how anything could use GUP there and as David pointed-out on the
> previous version, it doesn't event prevent the GUP-fast path.
Yes, GUP-fast would still have worked under some conditions.
--
Cheers,
David / dhildenb
* Re: [PATCH v22 2/5] ring-buffer: Introducing ring-buffer mapping functions
From: David Hildenbrand @ 2024-05-10 9:19 UTC (permalink / raw)
To: Steven Rostedt, Vincent Donnefort
Cc: mhiramat, linux-kernel, linux-trace-kernel, mathieu.desnoyers,
kernel-team, rdunlap, rppt, linux-mm
On 08.05.24 04:34, Steven Rostedt wrote:
> On Tue, 30 Apr 2024 12:13:51 +0100
> Vincent Donnefort <vdonnefort@google.com> wrote:
>
>> +#ifdef CONFIG_MMU
>> +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
>> + struct vm_area_struct *vma)
>> +{
>> + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
>> + unsigned int subbuf_pages, subbuf_order;
>> + struct page **pages;
>> + int p = 0, s = 0;
>> + int err;
>> +
>> + /* Refuse MP_PRIVATE or writable mappings */
>> + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC ||
>> + !(vma->vm_flags & VM_MAYSHARE))
>> + return -EPERM;
>> +
>> + /*
>> + * Make sure the mapping cannot become writable later. Also tell the VM
>> + * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). Finally,
>> + * prevent migration, GUP and dump (VM_IO).
>> + */
>> + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_IO, VM_MAYWRITE);
>
> Do we really need the VM_IO?
>
> When testing this in gdb, I would get:
>
> (gdb) p tmap->map->subbuf_size
> Cannot access memory at address 0x7ffff7fc2008
>
> It appears that you can't ptrace IO memory. When I removed that flag,
> gdb has no problem reading that memory.
>
> I think we should drop that flag.
>
> Can you send a v23 with that removed, Shuah's update, and also the
> change below:
>
>> +
>> + lockdep_assert_held(&cpu_buffer->mapping_lock);
>> +
>> + subbuf_order = cpu_buffer->buffer->subbuf_order;
>> + subbuf_pages = 1 << subbuf_order;
>> +
>> + nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
>> + nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */
>> +
>> + vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
>> + if (!vma_pages || vma_pages > nr_pages)
>> + return -EINVAL;
>> +
>> + nr_pages = vma_pages;
>> +
>> + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
>> + if (!pages)
>> + return -ENOMEM;
>> +
>> + if (!pgoff) {
>> + pages[p++] = virt_to_page(cpu_buffer->meta_page);
>> +
>> + /*
>> + * TODO: Align sub-buffers on their size, once
>> + * vm_insert_pages() supports the zero-page.
>> + */
>> + } else {
>> + /* Skip the meta-page */
>> + pgoff--;
>> +
>> + if (pgoff % subbuf_pages) {
>> + err = -EINVAL;
>> + goto out;
>> + }
>> +
>> + s += pgoff / subbuf_pages;
>> + }
>> +
>> + while (s < nr_subbufs && p < nr_pages) {
>> + struct page *page = virt_to_page(cpu_buffer->subbuf_ids[s]);
>> + int off = 0;
>> +
>> + for (; off < (1 << (subbuf_order)); off++, page++) {
>> + if (p >= nr_pages)
>> + break;
>> +
>> + pages[p++] = page;
>> + }
>> + s++;
>> + }
>
> The above can be made to:
>
> while (p < nr_pages) {
> struct page *page;
> int off = 0;
>
> if (WARN_ON_ONCE(s >= nr_subbufs))
> break;
I'm not particularly happy about us calling vm_insert_pages with NULL
pointers stored in pages.
Should we instead do
if (WARN_ON_ONCE(s >= nr_subbufs)) {
err = -EINVAL;
goto out;
}
?
--
Cheers,
David / dhildenb
* Re: [PATCH v22 2/5] ring-buffer: Introducing ring-buffer mapping functions
From: Vincent Donnefort @ 2024-05-10 10:57 UTC (permalink / raw)
To: David Hildenbrand
Cc: Steven Rostedt, mhiramat, linux-kernel, linux-trace-kernel,
mathieu.desnoyers, kernel-team, rdunlap, rppt, linux-mm
On Fri, May 10, 2024 at 11:15:59AM +0200, David Hildenbrand wrote:
> On 09.05.24 13:05, Vincent Donnefort wrote:
> > On Tue, May 07, 2024 at 10:34:02PM -0400, Steven Rostedt wrote:
> > > On Tue, 30 Apr 2024 12:13:51 +0100
> > > Vincent Donnefort <vdonnefort@google.com> wrote:
> > >
> > > > +#ifdef CONFIG_MMU
> > > > +static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
> > > > + struct vm_area_struct *vma)
> > > > +{
> > > > + unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
> > > > + unsigned int subbuf_pages, subbuf_order;
> > > > + struct page **pages;
> > > > + int p = 0, s = 0;
> > > > + int err;
> > > > +
> > > > + /* Refuse MP_PRIVATE or writable mappings */
> > > > + if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC ||
> > > > + !(vma->vm_flags & VM_MAYSHARE))
> > > > + return -EPERM;
> > > > +
> > > > + /*
> > > > + * Make sure the mapping cannot become writable later. Also tell the VM
> > > > + * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND). Finally,
> > > > + * prevent migration, GUP and dump (VM_IO).
> > > > + */
> > > > + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_IO, VM_MAYWRITE);
> > >
> > > Do we really need the VM_IO?
> > >
> > > When testing this in gdb, I would get:
> > >
> > > (gdb) p tmap->map->subbuf_size
> > > Cannot access memory at address 0x7ffff7fc2008
> > >
> > > It appears that you can't ptrace IO memory. When I removed that flag,
> > > gdb has no problem reading that memory.
> >
> > Yeah, VM_IO indeed implies DONTDUMP. VM_IO was part of Linus recommendations.
>
> Yes, the VM should recognize that memory to some degree as being special
> already due to VM_MIXEDMAP and VM_DONTEXPAND.
>
> #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
>
> So any of these flag achieve that (e.g., mlock_fixup() checks VM_SPECIAL).
> KSM similarly skips VM_DONTEXPAND and VM_MIXEDMAP (likely we should be using
> VM_SPECIAL in vma_ksm_compatible()). Not sure about page migration, likely
> its fine.
>
> Thinking about MADV_DONTNEED, I can spot in
> madvise_dontneed_free_valid_vma() only that we disallow primarily VM_PFNMAP.
>
> ... I assume if user space MADV_DONTNEED's some pages we'll simply get a
> page fault later on access that will SIGBUS, handling that gracefully (we
> should double-check!).
I've just tested and indeed, I get a SIGBUS! All good there.
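For reference, a hedged sketch of that check from user-space; the tracefs path
and the bare-bones signal handling are assumptions:

#include <fcntl.h>
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static sigjmp_buf env;

static void on_sigbus(int sig)
{
	siglongjmp(env, 1);
}

int main(void)
{
	size_t len = getpagesize();
	volatile char *map;
	int fd;

	fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw", O_RDONLY);
	map = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);

	signal(SIGBUS, on_sigbus);

	/* Discard the mapped meta-page, then touch it again */
	madvise((void *)map, len, MADV_DONTNEED);

	if (sigsetjmp(env, 1) == 0) {
		(void)map[0];
		puts("no SIGBUS: the page was silently refaulted");
	} else {
		puts("SIGBUS after MADV_DONTNEED, handled gracefully");
	}

	munmap((void *)map, len);
	close(fd);

	return 0;
}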
>
>
> > But perhaps, VM_DONTEXPAND and MIXEDMAP (implicitely set by vm_insert_pages) are
> > enough protection?
>
> Do we want to dump these pages? VM_DONTDUMP might be reasonabe then.
Somehow I thought this would prevent ptrace as well, but I've just tested it and
this is not the case either. So let's keep DONTDUMP.
Thanks!
>
> >
> > I don't see how anything could use GUP there and as David pointed-out on the
> > previous version, it doesn't event prevent the GUP-fast path.
>
> Yes, GUP-fast would still have worked under some conditions.
>
> --
> Cheers,
>
> David / dhildenb
>
* Re: [PATCH v22 2/5] ring-buffer: Introducing ring-buffer mapping functions
From: Vincent Donnefort @ 2024-05-10 11:03 UTC (permalink / raw)
To: David Hildenbrand
Cc: Steven Rostedt, mhiramat, linux-kernel, linux-trace-kernel,
mathieu.desnoyers, kernel-team, rdunlap, rppt, linux-mm
[...]
> > > +
> > > + while (s < nr_subbufs && p < nr_pages) {
> > > + struct page *page = virt_to_page(cpu_buffer->subbuf_ids[s]);
> > > + int off = 0;
> > > +
> > > + for (; off < (1 << (subbuf_order)); off++, page++) {
> > > + if (p >= nr_pages)
> > > + break;
> > > +
> > > + pages[p++] = page;
> > > + }
> > > + s++;
> > > + }
> >
> > The above can be made to:
> >
> > while (p < nr_pages) {
> > struct page *page;
> > int off = 0;
> >
> > if (WARN_ON_ONCE(s >= nr_subbufs))
> > break;
>
> I'm not particularly happy about us calling vm_insert_pages with NULL
> pointers stored in pages.
>
> Should we instead do
>
> if (WARN_ON_ONCE(s >= nr_subbufs)) {
> err = -EINVAL;
> goto out;
> }
>
> ?
I could also set nr_pages = p in the event of s >= nr_subbufs... but that
really shouldn't happen, so let's return an error.
>
> --
> Cheers,
>
> David / dhildenb
>
* Re: [PATCH v22 2/5] ring-buffer: Introducing ring-buffer mapping functions
From: Steven Rostedt @ 2024-05-10 18:42 UTC (permalink / raw)
To: Vincent Donnefort
Cc: David Hildenbrand, mhiramat, linux-kernel, linux-trace-kernel,
mathieu.desnoyers, kernel-team, rdunlap, rppt, linux-mm
On Fri, 10 May 2024 12:03:12 +0100
Vincent Donnefort <vdonnefort@google.com> wrote:
> > I'm not particularly happy about us calling vm_insert_pages with NULL
> > pointers stored in pages.
> >
> > Should we instead do
> >
> > if (WARN_ON_ONCE(s >= nr_subbufs)) {
> > err = -EINVAL;
> > goto out;
> > }
> >
> > ?
>
> I could also nr_pages = p in the event of s >= nr_subbufs... but that
> really that shouldn't happen so let's return an error.
I'm good with this. It should never happen anyway.
-- Steve