linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] mm: rearrange madvise code to allow for reuse
@ 2013-07-12  2:34 Colin Cross
  2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
  0 siblings, 1 reply; 33+ messages in thread
From: Colin Cross @ 2013-07-12  2:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Kyungmin Park, Christoph Hellwig, John Stultz, Eric W. Biederman,
	Pekka Enberg, Dave Hansen, Colin Cross, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	linux-mm

This patch refactors the madvise syscall to allow for parts of it
to be reused by a prctl syscall that affects vmas.

Move the code that walks vmas in a virtual address range into a
function that takes a function pointer as a parameter.  The only
caller for now is sys_madvise, which uses it to call
madvise_vma_behavior on each vma, but the next patch will add
an additional caller.

Move handling all vma behaviors inside madvise_behavior, and
rename it to madvise_vma_behavior.

Move the code that updates the flags on a vma, including splitting
or merging the vma as necessary, into a new function called
madvise_update_vma.  The next patch will add support for updating
a new anon_name field as well.

Signed-off-by: Colin Cross <ccross@android.com>
---
 mm/madvise.c | 272 +++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 151 insertions(+), 121 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 7055883..b8820fd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -39,65 +39,20 @@ static int madvise_need_mmap_write(int behavior)
 }
 
 /*
- * We can potentially split a vm area into separate
- * areas, each area with its own behavior.
+ * Update the vm_flags on a region of a vma, splitting it or merging it as
+ * necessary.  Must be called with mmap_sem held for writing.
  */
-static long madvise_behavior(struct vm_area_struct * vma,
-		     struct vm_area_struct **prev,
-		     unsigned long start, unsigned long end, int behavior)
+static int madvise_update_vma(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev, unsigned long start,
+		     unsigned long end, unsigned long new_flags)
 {
 	struct mm_struct * mm = vma->vm_mm;
-	int error = 0;
 	pgoff_t pgoff;
-	unsigned long new_flags = vma->vm_flags;
-
-	switch (behavior) {
-	case MADV_NORMAL:
-		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
-		break;
-	case MADV_SEQUENTIAL:
-		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
-		break;
-	case MADV_RANDOM:
-		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
-		break;
-	case MADV_DONTFORK:
-		new_flags |= VM_DONTCOPY;
-		break;
-	case MADV_DOFORK:
-		if (vma->vm_flags & VM_IO) {
-			error = -EINVAL;
-			goto out;
-		}
-		new_flags &= ~VM_DONTCOPY;
-		break;
-	case MADV_DONTDUMP:
-		new_flags |= VM_DONTDUMP;
-		break;
-	case MADV_DODUMP:
-		if (new_flags & VM_SPECIAL) {
-			error = -EINVAL;
-			goto out;
-		}
-		new_flags &= ~VM_DONTDUMP;
-		break;
-	case MADV_MERGEABLE:
-	case MADV_UNMERGEABLE:
-		error = ksm_madvise(vma, start, end, behavior, &new_flags);
-		if (error)
-			goto out;
-		break;
-	case MADV_HUGEPAGE:
-	case MADV_NOHUGEPAGE:
-		error = hugepage_madvise(vma, &new_flags, behavior);
-		if (error)
-			goto out;
-		break;
-	}
+	int error;
 
 	if (new_flags == vma->vm_flags) {
 		*prev = vma;
-		goto out;
+		return 0;
 	}
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -113,13 +68,13 @@ static long madvise_behavior(struct vm_area_struct * vma,
 	if (start != vma->vm_start) {
 		error = split_vma(mm, vma, start, 1);
 		if (error)
-			goto out;
+			return error;
 	}
 
 	if (end != vma->vm_end) {
 		error = split_vma(mm, vma, end, 0);
 		if (error)
-			goto out;
+			return error;
 	}
 
 success:
@@ -128,10 +83,7 @@ success:
 	 */
 	vma->vm_flags = new_flags;
 
-out:
-	if (error == -ENOMEM)
-		error = -EAGAIN;
-	return error;
+	return 0;
 }
 
 #ifdef CONFIG_SWAP
@@ -337,6 +289,77 @@ static long madvise_remove(struct vm_area_struct *vma,
 	return error;
 }
 
+/*
+ * Apply an madvise behavior to a region of a vma.  madvise_update_vma
+ * will handle splitting a vm area into separate areas, each area with its own
+ * behavior.
+ */
+static int madvise_vma_behavior(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev,
+		     unsigned long start, unsigned long end,
+		     unsigned long behavior)
+{
+	int error = 0;
+	unsigned long new_flags = vma->vm_flags;
+
+	switch (behavior) {
+	case MADV_REMOVE:
+		return madvise_remove(vma, prev, start, end);
+	case MADV_WILLNEED:
+		return madvise_willneed(vma, prev, start, end);
+	case MADV_DONTNEED:
+		return madvise_dontneed(vma, prev, start, end);
+	case MADV_NORMAL:
+		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+		break;
+	case MADV_SEQUENTIAL:
+		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
+		break;
+	case MADV_RANDOM:
+		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
+		break;
+	case MADV_DONTFORK:
+		new_flags |= VM_DONTCOPY;
+		break;
+	case MADV_DOFORK:
+		if (vma->vm_flags & VM_IO) {
+			error = -EINVAL;
+			goto out;
+		}
+		new_flags &= ~VM_DONTCOPY;
+		break;
+	case MADV_DONTDUMP:
+		new_flags |= VM_DONTDUMP;
+		break;
+	case MADV_DODUMP:
+		if (new_flags & VM_SPECIAL) {
+			error = -EINVAL;
+			goto out;
+		}
+		new_flags &= ~VM_DONTDUMP;
+		break;
+	case MADV_MERGEABLE:
+	case MADV_UNMERGEABLE:
+		error = ksm_madvise(vma, start, end, behavior, &new_flags);
+		if (error)
+			goto out;
+		break;
+	case MADV_HUGEPAGE:
+	case MADV_NOHUGEPAGE:
+		error = hugepage_madvise(vma, &new_flags, behavior);
+		if (error)
+			goto out;
+		break;
+	}
+
+	error = madvise_update_vma(vma, prev, start, end, new_flags);
+
+out:
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+	return error;
+}
+
 #ifdef CONFIG_MEMORY_FAILURE
 /*
  * Error injection support for memory error handling.
@@ -369,22 +392,6 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 }
 #endif
 
-static long
-madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
-		unsigned long start, unsigned long end, int behavior)
-{
-	switch (behavior) {
-	case MADV_REMOVE:
-		return madvise_remove(vma, prev, start, end);
-	case MADV_WILLNEED:
-		return madvise_willneed(vma, prev, start, end);
-	case MADV_DONTNEED:
-		return madvise_dontneed(vma, prev, start, end);
-	default:
-		return madvise_behavior(vma, prev, start, end, behavior);
-	}
-}
-
 static int
 madvise_behavior_valid(int behavior)
 {
@@ -415,6 +422,73 @@ madvise_behavior_valid(int behavior)
 }
 
 /*
+ * Walk the vmas in range [start,end), and call the visit function on each one.
+ * The visit function will get start and end parameters that cover the overlap
+ * between the current vma and the original range.  Any unmapped regions in the
+ * original range will result in this function returning -ENOMEM while still
+ * calling the visit function on all of the existing vmas in the range.
+ * Must be called with the mmap_sem held for reading or writing.
+ */
+static
+int madvise_walk_vmas(unsigned long start, unsigned long end,
+		unsigned long arg,
+		int (*visit)(struct vm_area_struct *vma,
+			struct vm_area_struct **prev, unsigned long start,
+			unsigned long end, unsigned long arg))
+{
+	struct vm_area_struct *vma;
+	struct vm_area_struct *prev;
+	unsigned long tmp;
+	int unmapped_error = 0;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 * - different from the way of handling in mlock etc.
+	 */
+	vma = find_vma_prev(current->mm, start, &prev);
+	if (vma && start > vma->vm_start)
+		prev = vma;
+
+	for (;;) {
+		int error;
+
+		/* Still start < end. */
+		if (!vma)
+			return -ENOMEM;
+
+		/* Here start < (end|vma->vm_end). */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+			if (start >= end)
+				break;
+		}
+
+		/* Here vma->vm_start <= start < (end|vma->vm_end) */
+		tmp = vma->vm_end;
+		if (end < tmp)
+			tmp = end;
+
+		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+		error = visit(vma, &prev, start, tmp, arg);
+		if (error)
+			return error;
+		start = tmp;
+		if (prev && start < prev->vm_end)
+			start = prev->vm_end;
+		if (start >= end)
+			break;
+		if (prev)
+			vma = prev->vm_next;
+		else	/* madvise_remove dropped mmap_sem */
+			vma = find_vma(current->mm, start);
+	}
+
+	return unmapped_error;
+}
+
+/*
  * The madvise(2) system call.
  *
  * Applications can use madvise() to advise the kernel how it should
@@ -458,9 +532,7 @@ madvise_behavior_valid(int behavior)
  */
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
-	unsigned long end, tmp;
-	struct vm_area_struct * vma, *prev;
-	int unmapped_error = 0;
+	unsigned long end;
 	int error = -EINVAL;
 	int write;
 	size_t len;
@@ -495,52 +567,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	else
 		down_read(&current->mm->mmap_sem);
 
-	/*
-	 * If the interval [start,end) covers some unmapped address
-	 * ranges, just ignore them, but return -ENOMEM at the end.
-	 * - different from the way of handling in mlock etc.
-	 */
-	vma = find_vma_prev(current->mm, start, &prev);
-	if (vma && start > vma->vm_start)
-		prev = vma;
-
 	blk_start_plug(&plug);
-	for (;;) {
-		/* Still start < end. */
-		error = -ENOMEM;
-		if (!vma)
-			goto out;
-
-		/* Here start < (end|vma->vm_end). */
-		if (start < vma->vm_start) {
-			unmapped_error = -ENOMEM;
-			start = vma->vm_start;
-			if (start >= end)
-				goto out;
-		}
-
-		/* Here vma->vm_start <= start < (end|vma->vm_end) */
-		tmp = vma->vm_end;
-		if (end < tmp)
-			tmp = end;
-
-		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
-		error = madvise_vma(vma, &prev, start, tmp, behavior);
-		if (error)
-			goto out;
-		start = tmp;
-		if (prev && start < prev->vm_end)
-			start = prev->vm_end;
-		error = unmapped_error;
-		if (start >= end)
-			goto out;
-		if (prev)
-			vma = prev->vm_next;
-		else	/* madvise_remove dropped mmap_sem */
-			vma = find_vma(current->mm, start);
-	}
-out:
+	error = madvise_walk_vmas(start, end, behavior, madvise_vma_behavior);
 	blk_finish_plug(&plug);
+
 	if (write)
 		up_write(&current->mm->mmap_sem);
 	else
-- 
1.8.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  2:34 [PATCH 1/2] mm: rearrange madvise code to allow for reuse Colin Cross
@ 2013-07-12  2:34 ` Colin Cross
  2013-07-12  5:39   ` Pekka Enberg
                     ` (4 more replies)
  0 siblings, 5 replies; 33+ messages in thread
From: Colin Cross @ 2013-07-12  2:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Kyungmin Park, Christoph Hellwig, John Stultz, Eric W. Biederman,
	Pekka Enberg, Dave Hansen, Colin Cross, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	linux-mm

Userspace processes often have multiple allocators that each do
anonymous mmaps to get memory.  When examining memory usage of
individual processes or systems as a whole, it is useful to be
able to break down the various heaps that were allocated by
each layer and examine their size, RSS, and physical memory
usage.

This patch adds a user pointer to the shared union in
vm_area_struct that points to a null terminated string inside
the user process containing a name for the vma.  vmas that
point to the same address will be merged, but vmas that
point to equivalent strings at different addresses will
not be merged.

Userspace can set the name for a region of memory by calling
prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
Setting the name to NULL clears it.

The names of named anonymous vmas are shown in /proc/pid/maps
as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
that is only present for named vmas.  If the userspace pointer
is no longer valid all or part of the name will be replaced
with "<fault>".

The idea to store a userspace pointer to reduce the complexity
within mm (at the expense of the complexity of reading
/proc/pid/mem) came from Dave Hansen.  This results in no
runtime overhead in the mm subsystem other than comparing
the anon_name pointers when considering vma merging.  The pointer
is stored in a union with fields that are only used on file-backed
mappings, so it does not increase memory usage.

Signed-off-by: Colin Cross <ccross@android.com>
---
 Documentation/filesystems/proc.txt |  6 ++++
 fs/proc/task_mmu.c                 | 62 ++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h                 |  5 ++-
 include/linux/mm_types.h           | 15 +++++++++
 include/uapi/linux/prctl.h         |  3 ++
 kernel/sys.c                       | 24 +++++++++++++++
 mm/madvise.c                       | 56 +++++++++++++++++++++++++++++++---
 mm/mempolicy.c                     |  2 +-
 mm/mlock.c                         |  3 +-
 mm/mmap.c                          | 44 ++++++++++++++++-----------
 mm/mprotect.c                      |  3 +-
 11 files changed, 197 insertions(+), 26 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fd8d0d5..e0eb9d2 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -369,6 +369,8 @@ is not associated with a file:
  [stack:1001]             = the stack of the thread with tid 1001
  [vdso]                   = the "virtual dynamic shared object",
                             the kernel system call handler
+ [anon:<name>]            = an anonymous mapping that has been
+                            named by userspace
 
  or if empty, the mapping is anonymous.
 
@@ -419,6 +421,7 @@ KernelPageSize:        4 kB
 MMUPageSize:           4 kB
 Locked:              374 kB
 VmFlags: rd ex mr mw me de
+Name:           name from userspace
 
 the first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
@@ -469,6 +472,9 @@ Note that there is no guarantee that every flag and associated mnemonic will
 be present in all further kernel releases. Things get changed, the flags may
 be vanished or the reverse -- new added.
 
+The "Name" field will only be present on a mapping that has been named by
+userspace, and will show the name passed in by userspace.
+
 This file is only present if the CONFIG_MMU kernel configuration option is
 enabled.
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d8..de76be4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -90,6 +90,56 @@ static void pad_len_spaces(struct seq_file *m, int len)
 	seq_printf(m, "%*c", len, ' ');
 }
 
+static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
+{
+	const char __user *name = vma_get_anon_name(vma);
+	struct mm_struct *mm = vma->vm_mm;
+
+	unsigned long page_start_vaddr;
+	unsigned long page_offset;
+	unsigned long num_pages;
+	unsigned long max_len = NAME_MAX;
+	int i;
+
+	page_start_vaddr = (unsigned long)name & PAGE_MASK;
+	page_offset = (unsigned long)name - page_start_vaddr;
+	num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);
+
+	seq_puts(m, "[anon:");
+
+	for (i = 0; i < num_pages; i++) {
+		int len;
+		int write_len;
+		const char *kaddr;
+		long pages_pinned;
+		struct page *page;
+
+		pages_pinned = get_user_pages(current, mm, page_start_vaddr,
+				1, 0, 0, &page, NULL);
+		if (pages_pinned < 1) {
+			seq_puts(m, "<fault>]");
+			return;
+		}
+
+		kaddr = (const char *)kmap(page);
+		len = min(max_len, PAGE_SIZE - page_offset);
+		write_len = strnlen(kaddr + page_offset, len);
+		seq_write(m, kaddr + page_offset, write_len);
+		kunmap(page);
+		put_page(page);
+
+		/* if strnlen hit a null terminator then we're done */
+		if (write_len != len)
+			break;
+
+		max_len -= len;
+		page_offset = 0;
+		page_start_vaddr += PAGE_SIZE;
+	}
+
+	seq_putc(m, ']');
+}
+
 #ifdef CONFIG_NUMA
 /*
  * These functions are for numa_maps but called in generic **maps seq_file
@@ -335,6 +385,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 				pad_len_spaces(m, len);
 				seq_printf(m, "[stack:%d]", tid);
 			}
+			goto done;
+		}
+
+		if (vma_get_anon_name(vma)) {
+			pad_len_spaces(m, len);
+			seq_print_vma_name(m, vma);
 		}
 	}
 
@@ -634,6 +690,12 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 
 	show_smap_vma_flags(m, vma);
 
+	if (vma_get_anon_name(vma)) {
+		seq_puts(m, "Name:           ");
+		seq_print_vma_name(m, vma);
+		seq_putc(m, '\n');
+	}
+
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task->mm))
 			? vma->vm_start : 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bd5679d..60038ea 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1486,7 +1486,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 extern struct vm_area_struct *vma_merge(struct mm_struct *,
 	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
 	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-	struct mempolicy *);
+	struct mempolicy *, const char __user *);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int split_vma(struct mm_struct *,
 	struct vm_area_struct *, unsigned long addr, int new_below);
@@ -1829,5 +1829,8 @@ void __init setup_nr_node_ids(void);
 static inline void setup_nr_node_ids(void) {}
 #endif
 
+int madvise_set_anon_name(unsigned long start, unsigned long len_in,
+				unsigned long name_addr);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ace9a5f..875ba48 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -255,6 +255,10 @@ struct vm_area_struct {
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree, or
 	 * linkage of vma in the address_space->i_mmap_nonlinear list.
+	 *
+	 * For private anonymous mappings, a pointer to a null terminated string
+	 * in the user process containing the name given to the vma, or NULL
+	 * if unnamed.
 	 */
 	union {
 		struct {
@@ -262,6 +266,7 @@ struct vm_area_struct {
 			unsigned long rb_subtree_last;
 		} linear;
 		struct list_head nonlinear;
+		const char __user *anon_name;
 	} shared;
 
 	/*
@@ -456,4 +461,14 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 	return mm->cpu_vm_mask_var;
 }
 
+
+/* Return the name for an anonymous mapping or NULL for a file-backed mapping */
+static inline const char __user *vma_get_anon_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_file)
+		return NULL;
+
+	return vma->shared.anon_name;
+}
+
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 289760f..063bf75 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -149,4 +149,7 @@
 
 #define PR_GET_TID_ADDRESS	40
 
+#define PR_SET_VMA		41
+# define PR_SET_VMA_ANON_NAME		0
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 2bbd9a7..401852f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2099,6 +2099,27 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
 }
 #endif
 
+static int prctl_set_vma(unsigned long opt, unsigned long addr,
+		unsigned long len, unsigned long arg)
+{
+	struct mm_struct *mm = current->mm;
+	int error;
+
+	down_write(&mm->mmap_sem);
+
+	switch (opt) {
+	case PR_SET_VMA_ANON_NAME:
+		error = madvise_set_anon_name(addr, len, arg);
+		break;
+	default:
+		error = -EINVAL;
+	}
+
+	up_write(&mm->mmap_sem);
+
+	return error;
+}
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -2262,6 +2283,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		if (arg2 || arg3 || arg4 || arg5)
 			return -EINVAL;
 		return current->no_new_privs ? 1 : 0;
+	case PR_SET_VMA:
+		error = prctl_set_vma(arg2, arg3, arg4, arg5);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/mm/madvise.c b/mm/madvise.c
index b8820fd..b2f8738 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -44,20 +44,23 @@ static int madvise_need_mmap_write(int behavior)
  */
 static int madvise_update_vma(struct vm_area_struct *vma,
 		     struct vm_area_struct **prev, unsigned long start,
-		     unsigned long end, unsigned long new_flags)
+		     unsigned long end, unsigned long new_flags,
+		     const char __user *new_anon_name)
 {
 	struct mm_struct * mm = vma->vm_mm;
 	pgoff_t pgoff;
 	int error;
 
-	if (new_flags == vma->vm_flags) {
+	if (new_flags == vma->vm_flags &&
+			new_anon_name == vma_get_anon_name(vma)) {
 		*prev = vma;
 		return 0;
 	}
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
-				vma->vm_file, pgoff, vma_policy(vma));
+				vma->vm_file, pgoff, vma_policy(vma),
+				new_anon_name);
 	if (*prev) {
 		vma = *prev;
 		goto success;
@@ -82,10 +85,30 @@ success:
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 */
 	vma->vm_flags = new_flags;
+	if (!vma->vm_file)
+		vma->shared.anon_name = new_anon_name;
 
 	return 0;
 }
 
+static int madvise_vma_anon_name(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev,
+		     unsigned long start, unsigned long end,
+		     unsigned long name_addr)
+{
+	int error;
+
+	/* Only anonymous mappings can be named */
+	if (vma->vm_file)
+		return -EINVAL;
+
+	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
+			(const char __user *)name_addr);
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+	return error;
+}
+
 #ifdef CONFIG_SWAP
 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	unsigned long end, struct mm_walk *walk)
@@ -352,7 +375,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 		break;
 	}
 
-	error = madvise_update_vma(vma, prev, start, end, new_flags);
+	error = madvise_update_vma(vma, prev, start, end, new_flags,
+				vma_get_anon_name(vma));
 
 out:
 	if (error == -ENOMEM)
@@ -488,6 +512,30 @@ int madvise_walk_vmas(unsigned long start, unsigned long end,
 	return unmapped_error;
 }
 
+int madvise_set_anon_name(unsigned long start, unsigned long len_in,
+		unsigned long name_addr)
+{
+	unsigned long end;
+	unsigned long len;
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+	/* Check to see whether len was rounded up from small -ve to zero */
+	if (len_in && !len)
+		return -EINVAL;
+
+	end = start + len;
+	if (end < start)
+		return -EINVAL;
+
+	if (end == start)
+		return 0;
+
+	return madvise_walk_vmas(start, end, name_addr, madvise_vma_anon_name);
+}
+
 /*
  * The madvise(2) system call.
  *
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7431001..11db490 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -728,7 +728,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 				  vma->anon_vma, vma->vm_file, pgoff,
-				  new_pol);
+				  new_pol, vma->vm_name);
 		if (prev) {
 			vma = prev;
 			next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index 79b7cf7..33861c7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -287,7 +287,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
-			  vma->vm_file, pgoff, vma_policy(vma));
+			  vma->vm_file, pgoff, vma_policy(vma),
+			  vma_get_anon_name(vma));
 	if (*prev) {
 		vma = *prev;
 		goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e18..25abb88 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -893,7 +893,8 @@ again:			remove_next = 1 + (end > next->vm_end);
  * per-vma resources, so we don't attempt to merge those.
  */
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
-			struct file *file, unsigned long vm_flags)
+			struct file *file, unsigned long vm_flags,
+			const char __user *anon_name)
 {
 	if (vma->vm_flags ^ vm_flags)
 		return 0;
@@ -901,6 +902,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
 		return 0;
 	if (vma->vm_ops && vma->vm_ops->close)
 		return 0;
+	if (vma_get_anon_name(vma) != anon_name)
+		return 0;
 	return 1;
 }
 
@@ -931,9 +934,10 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  */
 static int
 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
-	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+	const char __user *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags) &&
+	if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		if (vma->vm_pgoff == vm_pgoff)
 			return 1;
@@ -950,9 +954,10 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
  */
 static int
 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
-	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+	const char __user *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags) &&
+	if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		pgoff_t vm_pglen;
 		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
@@ -963,9 +968,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 }
 
 /*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor.  Or both (it neatly fills a hole).
  *
  * In most cases - when called for mmap, brk or mremap - [addr,end) is
  * certain not to be mapped by the time vma_merge is called; but when
@@ -995,7 +1000,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			struct vm_area_struct *prev, unsigned long addr,
 			unsigned long end, unsigned long vm_flags,
 		     	struct anon_vma *anon_vma, struct file *file,
-			pgoff_t pgoff, struct mempolicy *policy)
+			pgoff_t pgoff, struct mempolicy *policy,
+			const char __user *anon_name)
 {
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
 	struct vm_area_struct *area, *next;
@@ -1021,15 +1027,15 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	 */
 	if (prev && prev->vm_end == addr &&
   			mpol_equal(vma_policy(prev), policy) &&
-			can_vma_merge_after(prev, vm_flags,
-						anon_vma, file, pgoff)) {
+			can_vma_merge_after(prev, vm_flags, anon_vma,
+						file, pgoff, anon_name)) {
 		/*
 		 * OK, it can.  Can we now merge in the successor as well?
 		 */
 		if (next && end == next->vm_start &&
 				mpol_equal(policy, vma_policy(next)) &&
-				can_vma_merge_before(next, vm_flags,
-					anon_vma, file, pgoff+pglen) &&
+				can_vma_merge_before(next, vm_flags, anon_vma,
+						file, pgoff+pglen, anon_name) &&
 				is_mergeable_anon_vma(prev->anon_vma,
 						      next->anon_vma, NULL)) {
 							/* cases 1, 6 */
@@ -1049,8 +1055,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	 */
 	if (next && end == next->vm_start &&
  			mpol_equal(policy, vma_policy(next)) &&
-			can_vma_merge_before(next, vm_flags,
-					anon_vma, file, pgoff+pglen)) {
+			can_vma_merge_before(next, vm_flags, anon_vma,
+					file, pgoff+pglen, anon_name)) {
 		if (prev && addr < prev->vm_end)	/* case 4 */
 			err = vma_adjust(prev, prev->vm_start,
 				addr, prev->vm_pgoff, NULL);
@@ -1519,7 +1525,8 @@ munmap_back:
 	/*
 	 * Can we just expand an old mapping?
 	 */
-	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
+			NULL, NULL);
 	if (vma)
 		goto out;
 
@@ -2663,7 +2670,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 
 	/* Can we just expand an old private anonymous mapping? */
 	vma = vma_merge(mm, prev, addr, addr + len, flags,
-					NULL, NULL, pgoff, NULL);
+					NULL, NULL, pgoff, NULL, NULL);
 	if (vma)
 		goto out;
 
@@ -2821,7 +2828,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
 		return NULL;	/* should never get here */
 	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
-			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+			vma_get_anon_name(vma));
 	if (new_vma) {
 		/*
 		 * Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4..94d50b7 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -271,7 +271,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	 */
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*pprev = vma_merge(mm, *pprev, start, end, newflags,
-			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+			vma_get_anon_name(vma));
 	if (*pprev) {
 		vma = *pprev;
 		goto success;
-- 
1.8.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
@ 2013-07-12  5:39   ` Pekka Enberg
  2013-07-12  8:13     ` Peter Zijlstra
  2013-07-12  5:43   ` Pekka Enberg
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 33+ messages in thread
From: Pekka Enberg @ 2013-07-12  5:39 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	linux-mm

On 07/12/2013 05:34 AM, Colin Cross wrote:
> Userspace processes often have multiple allocators that each do
> anonymous mmaps to get memory.  When examining memory usage of
> individual processes or systems as a whole, it is useful to be
> able to break down the various heaps that were allocated by
> each layer and examine their size, RSS, and physical memory
> usage.
>
> This patch adds a user pointer to the shared union in
> vm_area_struct that points to a null terminated string inside
> the user process containing a name for the vma.  vmas that
> point to the same address will be merged, but vmas that
> point to equivalent strings at different addresses will
> not be merged.
>
> Userspace can set the name for a region of memory by calling
> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> Setting the name to NULL clears it.
>
> The names of named anonymous vmas are shown in /proc/pid/maps
> as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
> that is only present for named vmas.  If the userspace pointer
> is no longer valid all or part of the name will be replaced
> with "<fault>".
>
> The idea to store a userspace pointer to reduce the complexity
> within mm (at the expense of the complexity of reading
> /proc/pid/mem) came from Dave Hansen.  This results in no
> runtime overhead in the mm subsystem other than comparing
> the anon_name pointers when considering vma merging.  The pointer
> is stored in a union with fields that are only used on file-backed
> mappings, so it does not increase memory usage.
>
> Signed-off-by: Colin Cross <ccross@android.com>

Ingo, PeterZ, is this something worthwhile for replacing our
current JIT symbol hack with perf?

			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
  2013-07-12  5:39   ` Pekka Enberg
@ 2013-07-12  5:43   ` Pekka Enberg
  2013-07-12  6:18     ` Colin Cross
  2013-07-12  6:36   ` Dave Hansen
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 33+ messages in thread
From: Pekka Enberg @ 2013-07-12  5:43 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	linux-mm

On 07/12/2013 05:34 AM, Colin Cross wrote:
> Userspace processes often have multiple allocators that each do
> anonymous mmaps to get memory.  When examining memory usage of
> individual processes or systems as a whole, it is useful to be
> able to break down the various heaps that were allocated by
> each layer and examine their size, RSS, and physical memory
> usage.
>
> This patch adds a user pointer to the shared union in
> vm_area_struct that points to a null terminated string inside
> the user process containing a name for the vma.  vmas that
> point to the same address will be merged, but vmas that
> point to equivalent strings at different addresses will
> not be merged.
>
> Userspace can set the name for a region of memory by calling
> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> Setting the name to NULL clears it.
>
> The names of named anonymous vmas are shown in /proc/pid/maps
> as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
> that is only present for named vmas.  If the userspace pointer
> is no longer valid all or part of the name will be replaced
> with "<fault>".
>
> The idea to store a userspace pointer to reduce the complexity
> within mm (at the expense of the complexity of reading
> /proc/pid/mem) came from Dave Hansen.  This results in no
> runtime overhead in the mm subsystem other than comparing
> the anon_name pointers when considering vma merging.  The pointer
> is stored in a union with fields that are only used on file-backed
> mappings, so it does not increase memory usage.
>
> Signed-off-by: Colin Cross <ccross@android.com>

So how does this perform if I do prctl(PR_SET_VMA_ANON_NAME)
for thousands of relatively small (max 1 KB) JIT generated
functions? Will we run into MM problems because the VMAs are
not mergeable?

			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  5:43   ` Pekka Enberg
@ 2013-07-12  6:18     ` Colin Cross
  2013-07-12  7:03       ` Pekka Enberg
  0 siblings, 1 reply; 33+ messages in thread
From: Colin Cross @ 2013-07-12  6:18 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: lkml, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	Linux-MM

On Thu, Jul 11, 2013 at 10:43 PM, Pekka Enberg <penberg@kernel.org> wrote:
> On 07/12/2013 05:34 AM, Colin Cross wrote:
>>
>> Userspace processes often have multiple allocators that each do
>> anonymous mmaps to get memory.  When examining memory usage of
>> individual processes or systems as a whole, it is useful to be
>> able to break down the various heaps that were allocated by
>> each layer and examine their size, RSS, and physical memory
>> usage.
>>
>> This patch adds a user pointer to the shared union in
>> vm_area_struct that points to a null terminated string inside
>> the user process containing a name for the vma.  vmas that
>> point to the same address will be merged, but vmas that
>> point to equivalent strings at different addresses will
>> not be merged.
>>
>> Userspace can set the name for a region of memory by calling
>> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
>> Setting the name to NULL clears it.
>>
>> The names of named anonymous vmas are shown in /proc/pid/maps
>> as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
>> that is only present for named vmas.  If the userspace pointer
>> is no longer valid all or part of the name will be replaced
>> with "<fault>".
>>
>> The idea to store a userspace pointer to reduce the complexity
>> within mm (at the expense of the complexity of reading
>> /proc/pid/mem) came from Dave Hansen.  This results in no
>> runtime overhead in the mm subsystem other than comparing
>> the anon_name pointers when considering vma merging.  The pointer
>> is stored in a union with fields that are only used on file-backed
>> mappings, so it does not increase memory usage.
>>
>> Signed-off-by: Colin Cross <ccross@android.com>
>
>
> So how does this perform if I do prctl(PR_SET_VMA_ANON_NAME)
> for thousands of relatively small (max 1 KB) JIT generated
> functions? Will we run into MM problems because the VMAs are
> not mergeable?

This operates on vmas, so it can only handle naming page aligned
regions.  It would work fine to identify the regions that contain JIT
code, but not to identify individual functions.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
  2013-07-12  5:39   ` Pekka Enberg
  2013-07-12  5:43   ` Pekka Enberg
@ 2013-07-12  6:36   ` Dave Hansen
  2013-07-12  6:42     ` Colin Cross
  2013-07-14 14:11   ` Oleg Nesterov
  2013-07-14 14:17   ` Oleg Nesterov
  4 siblings, 1 reply; 33+ messages in thread
From: Dave Hansen @ 2013-07-12  6:36 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Pekka Enberg, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	linux-mm

On 07/11/2013 07:34 PM, Colin Cross wrote:
> +		pages_pinned = get_user_pages(current, mm, page_start_vaddr,
> +				1, 0, 0, &page, NULL);
> +		if (pages_pinned < 1) {
> +			seq_puts(m, "<fault>]");
> +			return;
> +		}
> +
> +		kaddr = (const char *)kmap(page);
> +		len = min(max_len, PAGE_SIZE - page_offset);
> +		write_len = strnlen(kaddr + page_offset, len);
> +		seq_write(m, kaddr + page_offset, write_len);
> +		kunmap(page);
> +		put_page(page);

This looks a bit like access_process_vm()?  Can you perhaps use it here?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  6:36   ` Dave Hansen
@ 2013-07-12  6:42     ` Colin Cross
  0 siblings, 0 replies; 33+ messages in thread
From: Colin Cross @ 2013-07-12  6:42 UTC (permalink / raw)
  To: Dave Hansen
  Cc: lkml, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Pekka Enberg, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	Linux-MM

On Thu, Jul 11, 2013 at 11:36 PM, Dave Hansen <dave.hansen@intel.com> wrote:
> On 07/11/2013 07:34 PM, Colin Cross wrote:
>> +             pages_pinned = get_user_pages(current, mm, page_start_vaddr,
>> +                             1, 0, 0, &page, NULL);
>> +             if (pages_pinned < 1) {
>> +                     seq_puts(m, "<fault>]");
>> +                     return;
>> +             }
>> +
>> +             kaddr = (const char *)kmap(page);
>> +             len = min(max_len, PAGE_SIZE - page_offset);
>> +             write_len = strnlen(kaddr + page_offset, len);
>> +             seq_write(m, kaddr + page_offset, write_len);
>> +             kunmap(page);
>> +             put_page(page);
>
> This looks a bit like access_process_vm()?  Can you perhaps use it here?

It's a lot like __access_remote_vm, and this pattern is repeated in
many other places in the kernel.  I didn't try to reuse any of them
because I wanted to stop reading at a null byte and __access_remote_vm
would read the full NAME_MAX every time.  I was also avoiding having
to allocate a NAME_MAX sized buffer to copy into, instead passing the
mapped user page directly to seq_write.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  6:18     ` Colin Cross
@ 2013-07-12  7:03       ` Pekka Enberg
  0 siblings, 0 replies; 33+ messages in thread
From: Pekka Enberg @ 2013-07-12  7:03 UTC (permalink / raw)
  To: Colin Cross
  Cc: lkml, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	Linux-MM

On 07/12/2013 09:18 AM, Colin Cross wrote:
> This operates on vmas, so it can only handle naming page aligned
> regions.  It would work fine to identify the regions that contain JIT
> code, but not to identify individual functions.

Right. The obvious question is: does this need to be attached to
VMAs or could it be a separate data structure that can be used for
both?

			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  5:39   ` Pekka Enberg
@ 2013-07-12  8:13     ` Peter Zijlstra
  2013-07-12  8:17       ` Peter Zijlstra
  2013-07-12  8:21       ` Pekka Enberg
  0 siblings, 2 replies; 33+ messages in thread
From: Peter Zijlstra @ 2013-07-12  8:13 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Colin Cross, linux-kernel, Kyungmin Park, Christoph Hellwig,
	John Stultz, Eric W. Biederman, Dave Hansen, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Paul E. McKenney,
	David Howells, Arnd Bergmann, Dave Jones, Rafael J. Wysocki,
	Oleg Nesterov, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, linux-doc, linux-mm

On Fri, Jul 12, 2013 at 08:39:14AM +0300, Pekka Enberg wrote:
> On 07/12/2013 05:34 AM, Colin Cross wrote:
> >Userspace processes often have multiple allocators that each do
> >anonymous mmaps to get memory.  When examining memory usage of
> >individual processes or systems as a whole, it is useful to be
> >able to break down the various heaps that were allocated by
> >each layer and examine their size, RSS, and physical memory
> >usage.
> >
> >This patch adds a user pointer to the shared union in
> >vm_area_struct that points to a null terminated string inside
> >the user process containing a name for the vma.  vmas that
> >point to the same address will be merged, but vmas that
> >point to equivalent strings at different addresses will
> >not be merged.
> >
> >Userspace can set the name for a region of memory by calling
> >prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> >Setting the name to NULL clears it.
> >
> >The names of named anonymous vmas are shown in /proc/pid/maps
> >as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
> >that is only present for named vmas.  If the userspace pointer
> >is no longer valid all or part of the name will be replaced
> >with "<fault>".
> >
> >The idea to store a userspace pointer to reduce the complexity
> >within mm (at the expense of the complexity of reading
> >/proc/pid/mem) came from Dave Hansen.  This results in no
> >runtime overhead in the mm subsystem other than comparing
> >the anon_name pointers when considering vma merging.  The pointer
> >is stored in a union with fields that are only used on file-backed
> >mappings, so it does not increase memory usage.
> >
> >Signed-off-by: Colin Cross <ccross@android.com>
> 
> Ingo, PeterZ, is this something worthwhile for replacing our
> current JIT symbol hack with perf?

I really don't see the point of this stuff; in fact I intensely dislike it as I
don't think this is something the kernel needs to do at all.

Why can't these allocators Colin talks about use file maps and/or write their
own meta-data to file? He is after all only interested in Android and they have
complete control over the entire userspace stack.

I also don't see it helping with the JIT stuff; you still need to write out a
file with symbol information, we still need to find the file. A less hacky
solution for the entire JIT thing is you writing a proper ELF-DSO and
mmap()'ing that :-)

Storing a JIT specific userspace pointer in the VMA doesn't help with any of
that.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:13     ` Peter Zijlstra
@ 2013-07-12  8:17       ` Peter Zijlstra
  2013-07-12  8:44         ` Ingo Molnar
  2013-07-12  8:21       ` Pekka Enberg
  1 sibling, 1 reply; 33+ messages in thread
From: Peter Zijlstra @ 2013-07-12  8:17 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Colin Cross, linux-kernel, Kyungmin Park, Christoph Hellwig,
	John Stultz, Eric W. Biederman, Dave Hansen, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Paul E. McKenney,
	David Howells, Arnd Bergmann, Dave Jones, Rafael J. Wysocki,
	Oleg Nesterov, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, linux-doc, linux-mm

On Fri, Jul 12, 2013 at 10:13:48AM +0200, Peter Zijlstra wrote:
> On Fri, Jul 12, 2013 at 08:39:14AM +0300, Pekka Enberg wrote:
> > On 07/12/2013 05:34 AM, Colin Cross wrote:
> > >Userspace processes often have multiple allocators that each do
> > >anonymous mmaps to get memory.  When examining memory usage of
> > >individual processes or systems as a whole, it is useful to be
> > >able to break down the various heaps that were allocated by
> > >each layer and examine their size, RSS, and physical memory
> > >usage.
> > >
> > >This patch adds a user pointer to the shared union in
> > >vm_area_struct that points to a null terminated string inside
> > >the user process containing a name for the vma.  vmas that
> > >point to the same address will be merged, but vmas that
> > >point to equivalent strings at different addresses will
> > >not be merged.
> > >
> > >Userspace can set the name for a region of memory by calling
> > >prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> > >Setting the name to NULL clears it.
> > >
> > >The names of named anonymous vmas are shown in /proc/pid/maps
> > >as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
> > >that is only present for named vmas.  If the userspace pointer
> > >is no longer valid all or part of the name will be replaced
> > >with "<fault>".
> > >
> > >The idea to store a userspace pointer to reduce the complexity
> > >within mm (at the expense of the complexity of reading
> > >/proc/pid/mem) came from Dave Hansen.  This results in no
> > >runtime overhead in the mm subsystem other than comparing
> > >the anon_name pointers when considering vma merging.  The pointer
> > >is stored in a union with fields that are only used on file-backed
> > >mappings, so it does not increase memory usage.
> > >
> > >Signed-off-by: Colin Cross <ccross@android.com>
> > 
> > Ingo, PeterZ, is this something worthwhile for replacing our
> > current JIT symbol hack with perf?
> 
> I really don't see the point of this stuff; in fact I intensely dislike it as I
> don't think this is something the kernel needs to do at all.
> 
> Why can't these allocators Colin talks about use file maps and/or write their
> own meta-data to file? He is after all only interested in Android and they have
> complete control over the entire userspace stack.

In fact, nowhere in his entire Changelog does he explain why this needs to be in
the kernel; _why_ can't userspace do this?

He needs to go change his allocators to use the new madv syscall anyway, he
might as well change them to write the stuff to a local file and be done with
it.

what gives?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:13     ` Peter Zijlstra
  2013-07-12  8:17       ` Peter Zijlstra
@ 2013-07-12  8:21       ` Pekka Enberg
  2013-07-12  8:55         ` Peter Zijlstra
  1 sibling, 1 reply; 33+ messages in thread
From: Pekka Enberg @ 2013-07-12  8:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Colin Cross, LKML, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Paul E. McKenney,
	David Howells, Arnd Bergmann, Dave Jones, Rafael J. Wysocki,
	Oleg Nesterov, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar,
	list@ebiederm.org:DOCUMENTATION <linux-doc@vger.kernel.org>,
	list@ebiederm.org:MEMORY MANAGEMENT <linux-mm@kvack.org>,

On Fri, Jul 12, 2013 at 11:13 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> I also don't see it helping with the JIT stuff; you still need to write out a
> file with symbol information, we still need to find the file. A less hacky
> solution for the entire JIT thing is you writing a proper ELF-DSO and
> mmap()'ing that :-)
>
> Storing a JIT specific userspace pointer in the VMA doesn't help with any of
> that.

I'm thinking about corner cases like 'perf top' here. I don't see how we can
write out an ELF-DSO because the JIT compiler can generate new symbols
at any given time.

That's what made me think it'd be best for the _kernel_ to know about the
symbols so that perf could take advantage of that as well.

                                    Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:17       ` Peter Zijlstra
@ 2013-07-12  8:44         ` Ingo Molnar
  2013-07-12  8:55           ` Pekka Enberg
  2013-07-12  9:00           ` Peter Zijlstra
  0 siblings, 2 replies; 33+ messages in thread
From: Ingo Molnar @ 2013-07-12  8:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Jul 12, 2013 at 10:13:48AM +0200, Peter Zijlstra wrote:
> > On Fri, Jul 12, 2013 at 08:39:14AM +0300, Pekka Enberg wrote:
> > > On 07/12/2013 05:34 AM, Colin Cross wrote:
> > > >Userspace processes often have multiple allocators that each do
> > > >anonymous mmaps to get memory.  When examining memory usage of
> > > >individual processes or systems as a whole, it is useful to be
> > > >able to break down the various heaps that were allocated by
> > > >each layer and examine their size, RSS, and physical memory
> > > >usage.
> > > >
> > > >This patch adds a user pointer to the shared union in
> > > >vm_area_struct that points to a null terminated string inside
> > > >the user process containing a name for the vma.  vmas that
> > > >point to the same address will be merged, but vmas that
> > > >point to equivalent strings at different addresses will
> > > >not be merged.
> > > >
> > > >Userspace can set the name for a region of memory by calling
> > > >prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> > > >Setting the name to NULL clears it.
> > > >
> > > >The names of named anonymous vmas are shown in /proc/pid/maps
> > > >as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
> > > >that is only present for named vmas.  If the userspace pointer
> > > >is no longer valid all or part of the name will be replaced
> > > >with "<fault>".
> > > >
> > > >The idea to store a userspace pointer to reduce the complexity
> > > >within mm (at the expense of the complexity of reading
> > > >/proc/pid/mem) came from Dave Hansen.  This results in no
> > > >runtime overhead in the mm subsystem other than comparing
> > > >the anon_name pointers when considering vma merging.  The pointer
> > > >is stored in a union with fields that are only used on file-backed
> > > >mappings, so it does not increase memory usage.
> > > >
> > > >Signed-off-by: Colin Cross <ccross@android.com>
> > > 
> > > Ingo, PeterZ, is this something worthwhile for replacing our
> > > current JIT symbol hack with perf?
> > 
> > I really don't see the point of this stuff; in fact I intensely 
> > dislike it as I don't think this is something the kernel needs to do 
> > at all.
> > 
> > Why can't these allocators Colin talks about use file maps and/or 
> > write their own meta-data to file? He is after all only interested in 
> > Android and they have complete control over the entire userspace 
> > stack.
> 
> In fact, nowhere in his entire Changelog does he explain why this needs to 
> be in the kernel; _why_ can't userspace do this?
> 
> He needs to go change his allocators to use the new madv syscall anyway, 
> he might as well change them to write the stuff to a local file and be 
> done with it.
> 
> what gives?

It makes tons of sense.

Just like we have a task's cmd-name it makes a lot of sense to name 
objects in a human readable fashion, to help debugging, instrumentation, 
performance analysis, etc.

Yes, in theory user-space could do all that. That's not the point: the 
point is to make it fast, easy enough and to have a central version (the 
kernel).

Doing it via temporary files has various disadvantages:

 - many tools really like to be filesystem invariant (not touch any files 
   even in tmpfs, be able to run in a readonly environment, etc.)

 - the overhead of opening, writing to and closing a file is an order of
   magnitude larger than a single prctl() call. [I'd even argue for such
   user-space tags to be attached to do_mmap(), unfortunately the mmap
   system call argument space is already pretty full. ]

 - stray files hang around (even in tmpfs). Point of instrumentation is to 
   be non-intrusive and as fool-proof as possible. When we are
   debugging problems the last thing we want are extra problems
   and unreliable instrumentation introduced by a fragile temporary file
   solution...

 - user space also tends to get the security model of temporary files
   wrong. static linking makes the user-space version iteration of such
   facilities harder. etc. etc. - there are other disadvantages as well.

So using temporary files is an instrumentation and debugging nightmare 
really. A simple self-contained prctl() variant, with the info stored by 
the kernel is as convenient as it gets.

I guess the real question is not whether it's useful, I think it clearly 
is. The question should be: are there real downsides? Does the addition to 
the anon mmap field blow up the size of vma_struct by a pointer, or is 
there still space?

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:21       ` Pekka Enberg
@ 2013-07-12  8:55         ` Peter Zijlstra
  2013-07-12  9:04           ` Pekka Enberg
  0 siblings, 1 reply; 33+ messages in thread
From: Peter Zijlstra @ 2013-07-12  8:55 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Colin Cross, LKML, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Paul E. McKenney,
	David Howells, Arnd Bergmann, Dave Jones, Rafael J. Wysocki,
	Oleg Nesterov, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar,
	list@ebiederm.org:DOCUMENTATION <linux-doc@vger.kernel.org>,
	list@ebiederm.org:MEMORY MANAGEMENT <linux-mm@kvack.org>,

On Fri, Jul 12, 2013 at 11:21:55AM +0300, Pekka Enberg wrote:
> On Fri, Jul 12, 2013 at 11:13 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > I also don't see it helping with the JIT stuff; you still need to write out a
> > file with symbol information, we still need to find the file. A less hacky
> > solution for the entire JIT thing is you writing a proper ELF-DSO and
> > mmap()'ing that :-)
> >
> > Storing a JIT specific userspace pointer in the VMA doesn't help with any of
> > that.
> 
> I'm thinking about corner cases like 'perf top' here. I don't see how we can
> write out an ELF-DSO because the JIT compiler can generate new symbols
> at any given time.

Mmap the file PROT_READ|PROT_WRITE|PROT_EXEC, map the _entire_ file, not just
the text section; make the symbol table larger than you expect. Then write the
symbol name after you've jit'ed the text but before you use it.

IIRC you once told me you never overwrite text but always append new symbols.
So you can basically fill the DSO with text/symbols using mmap memory writes.

Once the DSO is full -- equal to your previous anon-exec region being full,
you simply mmap a new DSO.

Wouldn't that work?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:44         ` Ingo Molnar
@ 2013-07-12  8:55           ` Pekka Enberg
  2013-07-12  9:00           ` Peter Zijlstra
  1 sibling, 0 replies; 33+ messages in thread
From: Pekka Enberg @ 2013-07-12  8:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Colin Cross, LKML, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner,
	list@ebiederm.org:DOCUMENTATION <linux-doc@vger.kernel.org>,
	list@ebiederm.org:MEMORY MANAGEMENT <linux-mm@kvack.org>,

On Fri, Jul 12, 2013 at 11:44 AM, Ingo Molnar <mingo@kernel.org> wrote:
> I guess the real question is not whether it's useful, I think it clearly
> is. The question should be: are there real downsides? Does the addition to
> the anon mmap field blow up the size of vma_struct by a pointer, or is
> there still space?

No, it's part of a union in 'struct vma_struct' in the current implementation
so the size doesn't change.

I'd still like to see something that's not restricted to page aligned memory
areas, though.

                                Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:44         ` Ingo Molnar
  2013-07-12  8:55           ` Pekka Enberg
@ 2013-07-12  9:00           ` Peter Zijlstra
  2013-07-12  9:15             ` Ingo Molnar
  1 sibling, 1 reply; 33+ messages in thread
From: Peter Zijlstra @ 2013-07-12  9:00 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds

On Fri, Jul 12, 2013 at 10:44:06AM +0200, Ingo Molnar wrote:
> It makes tons of sense.
> 
> Just like we have a task's cmd-name it makes a lot of sense to name 
> objects in a human readable fashion, to help debugging, instrumentation, 
> performance analysis, etc.
> 
> Yes, in theory user-space could do all that. That's not the point: the 
> point is to make it fast, easy enough and to have a central version (the 
> kernel).
> 
> Doing it via temporary files has various disadvantages:

We need those files anyway.. The current proposal is that the entire VMA has a
single userspace pointer in it. Or rather a 64bit value.

> I guess the real question is not whether it's useful, I think it clearly 
> is. The question should be: are there real downsides? Does the addition to 
> the anon mmap field blow up the size of vma_struct by a pointer, or is 
> there still space?

I don't see how the single u64 is useful at all for perf; you can have at most
one u64 per page; that's not nearly enough to put symbol information in.
Therefore we still require external files.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:55         ` Peter Zijlstra
@ 2013-07-12  9:04           ` Pekka Enberg
  2013-07-12  9:14             ` Peter Zijlstra
  2013-07-12  9:26             ` Ingo Molnar
  0 siblings, 2 replies; 33+ messages in thread
From: Pekka Enberg @ 2013-07-12  9:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Colin Cross, LKML, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Paul E. McKenney,
	David Howells, Arnd Bergmann, Dave Jones, Rafael J. Wysocki,
	Oleg Nesterov, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT

On 07/12/2013 11:55 AM, Peter Zijlstra wrote:
> Mmap the file PROT_READ|PROT_WRITE|PROT_EXEC, map the _entire_ file, not just
> the text section; make the symbol table larger than you expect. Then write the
> symbol name after you've jit'ed the text but before you use it.
>
> IIRC you once told me you never overwrite text but always append new symbols.
> So you can basically fill the DSO with text/symbols use mmap memory writes.

I don't, but I think Hotspot, for example, does recompile methods. Dunno
if it's a problem really, we could easily come up with a versioning
scheme for the methods and teach perf to treat the different memory
regions as the same method.

On 07/12/2013 11:55 AM, Peter Zijlstra wrote:
> Once the DSO is full -- equal to your previous anon-exec region being full,
> you simply mmap a new DSO.
>
> Wouldn't that work?

Okay and then whenever 'perf top' sees a non-mapped IP it reloads the
DSO (if it has changed)?

Yeah, I could see that working. It doesn't solve the problems Ingo 
mentioned which are also important, though.

			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:04           ` Pekka Enberg
@ 2013-07-12  9:14             ` Peter Zijlstra
  2013-07-12  9:28               ` Ingo Molnar
  2013-07-12  9:26             ` Ingo Molnar
  1 sibling, 1 reply; 33+ messages in thread
From: Peter Zijlstra @ 2013-07-12  9:14 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Colin Cross, LKML, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Paul E. McKenney,
	David Howells, Arnd Bergmann, Dave Jones, Rafael J. Wysocki,
	Oleg Nesterov, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT

On Fri, Jul 12, 2013 at 12:04:46PM +0300, Pekka Enberg wrote:
> On 07/12/2013 11:55 AM, Peter Zijlstra wrote:
> >Mmap the file PROT_READ|PROT_WRITE|PROT_EXEC, map the _entire_ file, not just
> >the text section; make the symbol table larger than you expect. Then write the
> >symbol name after you've jit'ed the text but before you use it.
> >
> >IIRC you once told me you never overwrite text but always append new symbols.
> >So you can basically fill the DSO with text/symbols use mmap memory writes.
> 
> I don't but I think Hotspot, for example, does recompile method. Dunno
> if it's a problem really, we could easily come up with a versioning
> scheme for the methods and teach perf to treat the different memory
> regions as the same method.

Anything that overwrites symbols is going to have issues with profiling;
there's really nothing we can do about that.

> On 07/12/2013 11:55 AM, Peter Zijlstra wrote:
> >Once the DSO is full -- equal to your previous anon-exec region being full,
> >you simply mmap a new DSO.
> >
> >Wouldn't that work?
> 
> Okay and then whenever 'perf top' sees a non-mapped IP it reloads the
> DSO (if it has changed)?

I suppose, yeah. There might be a few issues with determining if a mmap()
written file has changed though :/

> Yeah, I could see that working. It doesn't solve the problems Ingo mentioned
> which are also important, though.

Nothing I've yet seen would do that. It's intrinsic to the fact that we want
'anonymous' text tied to a process instance but require part of that text
(symbol information at the very least) to be available after the process
instance.

Those are two contradictory requirements. You cannot preserve and not preserve
at the same time.

And pushing the symbol info into the kernel isn't going to fix that either.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:00           ` Peter Zijlstra
@ 2013-07-12  9:15             ` Ingo Molnar
  2013-07-12  9:27               ` Peter Zijlstra
  0 siblings, 1 reply; 33+ messages in thread
From: Ingo Molnar @ 2013-07-12  9:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Jul 12, 2013 at 10:44:06AM +0200, Ingo Molnar wrote:
> > It makes tons of sense.
> > 
> > Just like we have a task's cmd-name it makes a lot of sense to name 
> > objects in a human readable fashion, to help debugging, instrumentation, 
> > performance analysis, etc.
> > 
> > Yes, in theory user-space could do all that. That's not the point: the 
> > point is to make it fast, easy enough and to have a central version (the 
> > kernel).
> > 
> > Doing it via temporary files has various disadvantages:
> 
> We need those files anyway.. The current proposal is that the entire VMA 
> has a single userspace pointer in it. Or rather a 64bit value.

Yes but accessible via /proc/<PID>/mem or so?

> > I guess the real question is not whether it's useful, I think it 
> > clearly is. The question should be: are there real downsides? Does the 
> > addition to the anon mmap field blow up the size of vma_struct by a 
> > pointer, or is there still space?
> 
> I don't see how the single u64 is useful at all for perf; you can have 
> at most one u64 per page; that's not nearly enough to put symbol 
> information in. Therefore we still require external files.

I was thinking about it in the context of its original purpose: naming 
heap areas, which are pretty anonymous right now - /proc/*/maps is full
of mystery ranges today.

It's indeed not good enough for finer grained structure.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:04           ` Pekka Enberg
  2013-07-12  9:14             ` Peter Zijlstra
@ 2013-07-12  9:26             ` Ingo Molnar
  2013-07-12  9:38               ` Pekka Enberg
  1 sibling, 1 reply; 33+ messages in thread
From: Ingo Molnar @ 2013-07-12  9:26 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Peter Zijlstra, Colin Cross, LKML, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner,
	list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT


* Pekka Enberg <penberg@kernel.org> wrote:

> > Once the DSO is full -- equal to your previous anon-exec region being 
> > full, you simply mmap a new DSO.
> >
> > Wouldn't that work?
> 
> Okay and then whenever 'perf top' sees a non-mapped IP it reloads the 
> DSO (if it has changed)?
> 
> Yeah, I could see that working. It doesn't solve the problems Ingo 
> mentioned which are also important, though.

Well, the JIT profiling case is really special - there we are constructing 
code and a symbol table on the fly. Talking to perf via a temporary file 
sounds unavoidable (and thus proper), because symbol information on that 
level is not something the kernel knows (or should know) about.

I was arguing primarily in the context of the original patch: naming 
allocator heaps. Today the kernel makes a few educated guesses about what 
each memory area is about, in /proc/*/maps:

 34511ac000-34511b0000 r--p 001ac000 08:03 1706770                        /usr/lib64/libc-2.15.so
 34511b0000-34511b2000 rw-p 001b0000 08:03 1706770                        /usr/lib64/libc-2.15.so
 34511b2000-34511b7000 rw-p 00000000 00:00 0 
 7f5bdff94000-7f5be63c1000 r--p 00000000 08:03 1710237                    /usr/lib/locale/locale-archive
 7f5be63c1000-7f5be63c4000 rw-p 00000000 00:00 0 
 7f5be63d6000-7f5be63d7000 rw-p 00000000 00:00 0 
 7fff7677f000-7fff767a0000 rw-p 00000000 00:00 0                          [stack]
 7fff767dd000-7fff767df000 r-xp 00000000 00:00 0                          [vdso]
 ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]

... but on any larger app there's lots of anon mmap areas that are ... 
anonymous! ;-) User-space could help out a bit by naming them. It's not 
like there's many heaps, so the performance overhead aspect is minimal.

In the JIT case we have something different, a 'file IO' abstraction 
really: the JIT is generating (writing) new code and associated symbol 
records. So using temporary files there is natural and proper and most of 
the disadvantages I list don't apply because the sheer volume of new code 
generated dilutes the overhead of open()/close(), plus we do need some 
space for those symbols so a JIT cannot really expect to be able to run in 
a pure readonly environment.

In the allocator/heap case we have a _memory_ abstraction; it's just that 
we also want to name the heap minimally.

For any finer than vma granularity user-space attributes the kernel cannot 
help much, it does not know (and probably should not know) about all 
user-space data structures.

Right now I don't see any good way to merge the two. (might be due to lack 
of imagination)

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:15             ` Ingo Molnar
@ 2013-07-12  9:27               ` Peter Zijlstra
  2013-07-12  9:40                 ` Ingo Molnar
  0 siblings, 1 reply; 33+ messages in thread
From: Peter Zijlstra @ 2013-07-12  9:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds

On Fri, Jul 12, 2013 at 11:15:06AM +0200, Ingo Molnar wrote:
> 
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > We need those files anyway.. The current proposal is that the entire VMA 
> > has a single userspace pointer in it. Or rather a 64bit value.
> 
> Yes but accessible via /proc/<PID>/mem or so?

*shudder*.. yes. But you're again opening two files. The only advantage of this
over userspace writing its own files is that the kernel cleans things up for
you.

However from what I understood android runs apps as individual users, and I
think we can do per user tmpfs mounts. So app dies, user exits, mount goes
*poof*.

> I was thinking about it in the context of its original purpose: naming 
> heap areas, which are pretty anonymous right now - /proc/*/maps is full
> of mystery ranges today.

It is.. although I've myself never had trouble with that. Most every memory
debugging that I've used/written over the past two decades was adequately able
to identify memory regions.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:14             ` Peter Zijlstra
@ 2013-07-12  9:28               ` Ingo Molnar
  0 siblings, 0 replies; 33+ messages in thread
From: Ingo Molnar @ 2013-07-12  9:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pekka Enberg, Colin Cross, LKML, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner,
	list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT


* Peter Zijlstra <peterz@infradead.org> wrote:

> > Yeah, I could see that working. It doesn't solve the problems Ingo 
> > mentioned which are also important, though.
> 
> Nothing I've yet seen would do that. Its intrinsic to the fact that we 
> want 'anonymous' text tied to a process instance but require part of 
> that text (symbol information at the very least) to be available after 
> the process instance.
> 
> That are two contradictory requirements. You cannot preserve and not 
> preserve at the same time.
> 
> And pushing the symbol info into the kernel isn't going to fix that 
> either.

I fully agree with you in the JIT case.

I was arguing the utility of the original, somewhat limited usecase: 
minimally naming allocator areas/heaps, on a high level.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:26             ` Ingo Molnar
@ 2013-07-12  9:38               ` Pekka Enberg
  2013-07-12  9:45                 ` Ingo Molnar
  0 siblings, 1 reply; 33+ messages in thread
From: Pekka Enberg @ 2013-07-12  9:38 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Colin Cross, LKML, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner,
	list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT

On Fri, Jul 12, 2013 at 12:26 PM, Ingo Molnar <mingo@kernel.org> wrote:
> Well, the JIT profiling case is really special - there we are constructing
> code and a symbol table on the fly. Talking to perf via a temporary file
> sounds unavoidable (and thus proper), because symbol information on that
> level is not something the kernel knows (or should know) about.
>
> I was arguing primarily in the context of the original patch: naming
> allocator heaps. Today the kernel makes a few educated guesses about what
> each memory area is about, in /proc/*/maps:
>
>  34511ac000-34511b0000 r--p 001ac000 08:03 1706770                        /usr/lib64/libc-2.15.so
>  34511b0000-34511b2000 rw-p 001b0000 08:03 1706770                        /usr/lib64/libc-2.15.so
>  34511b2000-34511b7000 rw-p 00000000 00:00 0
>  7f5bdff94000-7f5be63c1000 r--p 00000000 08:03 1710237                    /usr/lib/locale/locale-archive
>  7f5be63c1000-7f5be63c4000 rw-p 00000000 00:00 0
>  7f5be63d6000-7f5be63d7000 rw-p 00000000 00:00 0
>  7fff7677f000-7fff767a0000 rw-p 00000000 00:00 0                          [stack]
>  7fff767dd000-7fff767df000 r-xp 00000000 00:00 0                          [vdso]
>  ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]
>
> ... but on any larger app there's lots of anon mmap areas that are ...
> anonymous! ;-) User-space could help out a bit by naming them. It's not
> like there's many heaps, so the performance overhead aspect is minimal.
>
> In the JIT case we have something different, a 'file IO' abstraction
> really: the JIT is generating (writing) new code and associated symbol
> records. So using temporary files there is natural and proper and most of
> the disadvantages I list don't apply because the sheer volume of new code
> generated dillutes the overhead of open()/close(), plus we do need some
> space for those symbols so a JIT cannot really expect to be able to run in
> a pure readonly environment.
>
> In the allocator/heap case we have a _memory_ abstraction it's just that
> we also want to name the heap minimally.
>
> For any finer than vma granularity user-space attributes the kernel cannot
> help much, it does not know (and probably should not know) about all
> user-space data structures.
>
> Right now I don't see any good way to merge the two. (might be due to lack
> of imagination)

I have no trouble with the imagination part but you make a strong point about
the kernel not helping at finer granularity than vma anyway.

The current functionality is already quite helpful for VMs as well. We could
annotate the different GC and JIT regions and make perf more human-friendly
by default.

                                Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:27               ` Peter Zijlstra
@ 2013-07-12  9:40                 ` Ingo Molnar
  2013-07-12  9:49                   ` Peter Zijlstra
  0 siblings, 1 reply; 33+ messages in thread
From: Ingo Molnar @ 2013-07-12  9:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Jul 12, 2013 at 11:15:06AM +0200, Ingo Molnar wrote:
> > 
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > We need those files anyway.. The current proposal is that the entire VMA 
> > > has a single userspace pointer in it. Or rather a 64bit value.
> > 
> > Yes but accessible via /proc/<PID>/mem or so?
> 
> *shudder*.. yes. But you're again opening two files. The only advantage 
> of this over userspace writing its own files is that the kernel cleans 
> things up for you.

Opening of the files only occurs in the instrumentation case, which is 
rare. But temporary files would be forced upon the regular usecase when no 
instrumentation goes on.

> However from what I understood android runs apps as individual users, 
> and I think we can do per user tmpfs mounts. So app dies, user exits, 
> mount goes *poof*.

Yes, user-space could be smarter about temporary files.

Just like big banks could be less risk happy.

Yet the reality is that if left alone both apps and banks mess up, I don't 
think libertarianism works for policy: we are better off offering a 
framework that is simple, robust, self-contained, low risk and hard to 
mess up?

> > I was thinking about it in the context of its original purpose: naming 
> > heap areas, which are pretty anonymous right now - /proc/*/maps is 
> > full of mystery ranges today.
> 
> It is.. although I've myself never had trouble with that. Most every 
> memory debugging that I've used/written over the past two decades was 
> adequately able to identify memory regions.

So, these 400+ memory ranges are from Firefox's /proc/*/maps file:

7fbf59eff000-7fbf59f00000 ---p 00000000 00:00 0 
7fbf59f00000-7fbf5a800000 rw-p 00000000 00:00 0 
7fbf5a900000-7fbf5aa00000 rw-p 00000000 00:00 0 
7fbf5ad00000-7fbf5ae00000 rw-p 00000000 00:00 0 
7fbf5af00000-7fbf5b000000 rw-p 00000000 00:00 0 
7fbf5b100000-7fbf5b200000 rw-p 00000000 00:00 0 
7fbf5b500000-7fbf5b600000 rw-p 00000000 00:00 0 
7fbf5b800000-7fbf5b900000 rw-p 00000000 00:00 0 
7fbf5bb00000-7fbf5bc00000 rw-p 00000000 00:00 0 
7fbf5c000000-7fbf5c100000 rw-p 00000000 00:00 0 
7fbf5c200000-7fbf5c300000 rw-p 00000000 00:00 0 
7fbf5c800000-7fbf5c900000 rw-p 00000000 00:00 0 
7fbf5ca00000-7fbf5cb00000 rw-p 00000000 00:00 0 
7fbf5ce00000-7fbf5cf00000 rw-p 00000000 00:00 0 
7fbf5d200000-7fbf5d300000 rw-p 00000000 00:00 0 
7fbf5d400000-7fbf5d500000 rw-p 00000000 00:00 0 
7fbf5db00000-7fbf5dc00000 rw-p 00000000 00:00 0 
7fbf5dd00000-7fbf5de00000 rw-p 00000000 00:00 0 
7fbf5df00000-7fbf5e000000 rw-p 00000000 00:00 0 
7fbf5e300000-7fbf5e400000 rw-p 00000000 00:00 0 
7fbf5e500000-7fbf5e600000 rw-p 00000000 00:00 0 
7fbf5e900000-7fbf5ea00000 rw-p 00000000 00:00 0 
7fbf5eb00000-7fbf5ec00000 rw-p 00000000 00:00 0 
7fbf5ed00000-7fbf5ef00000 rw-p 00000000 00:00 0 
7fbf5f2ff000-7fbf5f300000 ---p 00000000 00:00 0 
7fbf5f300000-7fbf5fc00000 rw-p 00000000 00:00 0 
7fbf5fd00000-7fbf5fe00000 rw-p 00000000 00:00 0 
7fbf5ff00000-7fbf60000000 rw-p 00000000 00:00 0 
7fbf60200000-7fbf60300000 rw-p 00000000 00:00 0 
7fbf60600000-7fbf60700000 rw-p 00000000 00:00 0 
7fbf60800000-7fbf60900000 rw-p 00000000 00:00 0 
7fbf60a00000-7fbf60b00000 rw-p 00000000 00:00 0 
7fbf60e00000-7fbf60f00000 rw-p 00000000 00:00 0 
7fbf61000000-7fbf61100000 rw-p 00000000 00:00 0 
7fbf61a00000-7fbf61b00000 rw-p 00000000 00:00 0 
7fbf61c00000-7fbf61d00000 rw-p 00000000 00:00 0 
7fbf62000000-7fbf62100000 rw-p 00000000 00:00 0 
7fbf62200000-7fbf62300000 rw-p 00000000 00:00 0 
7fbf62400000-7fbf62500000 rw-p 00000000 00:00 0 
7fbf62600000-7fbf62700000 rw-p 00000000 00:00 0 
7fbf62800000-7fbf62a00000 rw-p 00000000 00:00 0 
7fbf62e00000-7fbf63000000 rw-p 00000000 00:00 0 
7fbf63100000-7fbf63200000 rw-p 00000000 00:00 0 
7fbf63300000-7fbf63400000 rw-p 00000000 00:00 0 
7fbf63600000-7fbf63700000 rw-p 00000000 00:00 0 
7fbf63900000-7fbf63a00000 rw-p 00000000 00:00 0 
7fbf63b00000-7fbf63c00000 rw-p 00000000 00:00 0 
7fbf63d00000-7fbf63e00000 rw-p 00000000 00:00 0 
7fbf63f00000-7fbf64000000 rw-p 00000000 00:00 0 
7fbf64100000-7fbf64200000 rw-p 00000000 00:00 0 
7fbf64300000-7fbf64400000 rw-p 00000000 00:00 0 
7fbf64500000-7fbf64600000 rw-p 00000000 00:00 0 
7fbf64700000-7fbf64800000 rw-p 00000000 00:00 0 
7fbf64a00000-7fbf64b00000 rw-p 00000000 00:00 0 
7fbf64c00000-7fbf64d00000 rw-p 00000000 00:00 0 
7fbf64e00000-7fbf64f00000 rw-p 00000000 00:00 0 
7fbf65400000-7fbf65500000 rw-p 00000000 00:00 0 
7fbf65600000-7fbf65700000 rw-p 00000000 00:00 0 
7fbf65800000-7fbf65900000 rw-p 00000000 00:00 0 
7fbf65a00000-7fbf65b00000 rw-p 00000000 00:00 0 
7fbf65c00000-7fbf65d00000 rw-p 00000000 00:00 0 
7fbf65e00000-7fbf65f00000 rw-p 00000000 00:00 0 
7fbf66000000-7fbf66100000 rw-p 00000000 00:00 0 
7fbf66200000-7fbf66300000 rw-p 00000000 00:00 0 
7fbf663ff000-7fbf66400000 ---p 00000000 00:00 0 
7fbf66400000-7fbf66d00000 rw-p 00000000 00:00 0 
7fbf66e00000-7fbf66f00000 rw-p 00000000 00:00 0 
7fbf67000000-7fbf67100000 rw-p 00000000 00:00 0 
7fbf67200000-7fbf67300000 rw-p 00000000 00:00 0 
7fbf67400000-7fbf67500000 rw-p 00000000 00:00 0 
7fbf67600000-7fbf67700000 rw-p 00000000 00:00 0 
7fbf67800000-7fbf67900000 rw-p 00000000 00:00 0 
7fbf67a00000-7fbf67b00000 rw-p 00000000 00:00 0 
7fbf67c00000-7fbf67d00000 rw-p 00000000 00:00 0 
7fbf67e00000-7fbf67f00000 rw-p 00000000 00:00 0 
7fbf68000000-7fbf68100000 rw-p 00000000 00:00 0 
7fbf68200000-7fbf68300000 rw-p 00000000 00:00 0 
7fbf68400000-7fbf68500000 rw-p 00000000 00:00 0 
7fbf68600000-7fbf68700000 rw-p 00000000 00:00 0 
7fbf68800000-7fbf68900000 rw-p 00000000 00:00 0 
7fbf68a00000-7fbf68b00000 rw-p 00000000 00:00 0 
7fbf68c00000-7fbf68d00000 rw-p 00000000 00:00 0 
7fbf68e00000-7fbf68f00000 rw-p 00000000 00:00 0 
7fbf69000000-7fbf69100000 rw-p 00000000 00:00 0 
7fbf692ff000-7fbf69300000 ---p 00000000 00:00 0 
7fbf69e00000-7fbf69f00000 rw-p 00000000 00:00 0 
7fbf6a000000-7fbf6a100000 rw-p 00000000 00:00 0 
7fbf6a200000-7fbf6a300000 rw-p 00000000 00:00 0 
7fbf6a400000-7fbf6a500000 rw-p 00000000 00:00 0 
7fbf6a600000-7fbf6a700000 rw-p 00000000 00:00 0 
7fbf6a800000-7fbf6a900000 rw-p 00000000 00:00 0 
7fbf6aa00000-7fbf6ad00000 rw-p 00000000 00:00 0 
7fbf6ae00000-7fbf6b000000 rw-p 00000000 00:00 0 
7fbf6b100000-7fbf6b200000 rw-p 00000000 00:00 0 
7fbf6b300000-7fbf6b400000 rw-p 00000000 00:00 0 
7fbf6b700000-7fbf6b800000 rw-p 00000000 00:00 0 
7fbf6b900000-7fbf6ba00000 rw-p 00000000 00:00 0 
7fbf6bb00000-7fbf6bd00000 rw-p 00000000 00:00 0 
7fbf6be00000-7fbf6bf00000 rw-p 00000000 00:00 0 
7fbf6c000000-7fbf6c100000 rw-p 00000000 00:00 0 
7fbf6c200000-7fbf6c300000 rw-p 00000000 00:00 0 
7fbf6c400000-7fbf6c500000 rw-p 00000000 00:00 0 
7fbf6c600000-7fbf6c700000 rw-p 00000000 00:00 0 
7fbf6c800000-7fbf6c900000 rw-p 00000000 00:00 0 
7fbf6ca00000-7fbf6cb00000 rw-p 00000000 00:00 0 
7fbf6cc00000-7fbf6cd00000 rw-p 00000000 00:00 0 
7fbf6ce00000-7fbf6d000000 rw-p 00000000 00:00 0 
7fbf6d100000-7fbf6d200000 rw-p 00000000 00:00 0 
7fbf6d300000-7fbf6d400000 rw-p 00000000 00:00 0 
7fbf6d500000-7fbf6da00000 rw-p 00000000 00:00 0 
7fbf6db00000-7fbf6dc00000 rw-p 00000000 00:00 0 
7fbf6e100000-7fbf6e200000 rw-p 00000000 00:00 0 
7fbf6e300000-7fbf6e400000 rw-p 00000000 00:00 0 
7fbf6e500000-7fbf6e600000 rw-p 00000000 00:00 0 
7fbf6e700000-7fbf6ed00000 rw-p 00000000 00:00 0 
7fbf6ee00000-7fbf6ef00000 rw-p 00000000 00:00 0 
7fbf6f000000-7fbf6f200000 rw-p 00000000 00:00 0 
7fbf6f300000-7fbf6f400000 rw-p 00000000 00:00 0 
7fbf6f500000-7fbf6f800000 rw-p 00000000 00:00 0 
7fbf6f900000-7fbf6fa00000 rw-p 00000000 00:00 0 
7fbf6fb00000-7fbf6fc00000 rw-p 00000000 00:00 0 
7fbf6fd00000-7fbf6fe00000 rw-p 00000000 00:00 0 
7fbf6ff00000-7fbf70000000 rw-p 00000000 00:00 0 
7fbf70100000-7fbf70200000 rw-p 00000000 00:00 0 
7fbf70300000-7fbf70400000 rw-p 00000000 00:00 0 
7fbf70500000-7fbf70600000 rw-p 00000000 00:00 0 
7fbf70700000-7fbf70800000 rw-p 00000000 00:00 0 
7fbf70900000-7fbf70b00000 rw-p 00000000 00:00 0 
7fbf70c00000-7fbf70d00000 rw-p 00000000 00:00 0 
7fbf70e00000-7fbf71300000 rw-p 00000000 00:00 0 
7fbf71400000-7fbf71500000 rw-p 00000000 00:00 0 
7fbf71600000-7fbf71700000 rw-p 00000000 00:00 0 
7fbf71800000-7fbf71900000 rw-p 00000000 00:00 0 
7fbf71a00000-7fbf71c00000 rw-p 00000000 00:00 0 
7fbf71d00000-7fbf71e00000 rw-p 00000000 00:00 0 
7fbf71f00000-7fbf72000000 rw-p 00000000 00:00 0 
7fbf72100000-7fbf72200000 rw-p 00000000 00:00 0 
7fbf72300000-7fbf72400000 rw-p 00000000 00:00 0 
7fbf72500000-7fbf72600000 rw-p 00000000 00:00 0 
7fbf72700000-7fbf72800000 rw-p 00000000 00:00 0 
7fbf72900000-7fbf72a00000 rw-p 00000000 00:00 0 
7fbf72b00000-7fbf72c00000 rw-p 00000000 00:00 0 
7fbf72d00000-7fbf72e00000 rw-p 00000000 00:00 0 
7fbf72f00000-7fbf73000000 rw-p 00000000 00:00 0 
7fbf73100000-7fbf73200000 rw-p 00000000 00:00 0 
7fbf73300000-7fbf73400000 rw-p 00000000 00:00 0 
7fbf734fb000-7fbf734fc000 ---p 00000000 00:00 0 
7fbf73d00000-7fbf73e00000 rw-p 00000000 00:00 0 
7fbf73f00000-7fbf74000000 rw-p 00000000 00:00 0 
7fbf741ff000-7fbf74200000 ---p 00000000 00:00 0 
7fbf74200000-7fbf74d00000 rw-p 00000000 00:00 0 
7fbf74e00000-7fbf75000000 rw-p 00000000 00:00 0 
7fbf75100000-7fbf75400000 rw-p 00000000 00:00 0 
7fbf754ff000-7fbf75500000 ---p 00000000 00:00 0 
7fbf75eff000-7fbf75f00000 ---p 00000000 00:00 0 
7fbf76900000-7fbf76b00000 rw-p 00000000 00:00 0 
7fbf76c00000-7fbf76d00000 rw-p 00000000 00:00 0 
7fbf76e00000-7fbf76f00000 rw-p 00000000 00:00 0 
7fbf77000000-7fbf77100000 rw-p 00000000 00:00 0 
7fbf77200000-7fbf77300000 rw-p 00000000 00:00 0 
7fbf77400000-7fbf77500000 rw-p 00000000 00:00 0 
7fbf77600000-7fbf77700000 rw-p 00000000 00:00 0 
7fbf77800000-7fbf77900000 rw-p 00000000 00:00 0 
7fbf77a00000-7fbf77b00000 rw-p 00000000 00:00 0 
7fbf77c00000-7fbf77e00000 rw-p 00000000 00:00 0 
7fbf77f00000-7fbf78000000 rw-p 00000000 00:00 0 
7fbf78100000-7fbf78200000 rw-p 00000000 00:00 0 
7fbf78300000-7fbf78400000 rw-p 00000000 00:00 0 
7fbf78500000-7fbf78700000 rw-p 00000000 00:00 0 
7fbf78800000-7fbf78900000 rw-p 00000000 00:00 0 
7fbf78a00000-7fbf78b00000 rw-p 00000000 00:00 0 
7fbf78c00000-7fbf78d00000 rw-p 00000000 00:00 0 
7fbf78e00000-7fbf78f00000 rw-p 00000000 00:00 0 
7fbf79000000-7fbf79100000 rw-p 00000000 00:00 0 
7fbf79200000-7fbf79300000 rw-p 00000000 00:00 0 
7fbf79400000-7fbf79600000 rw-p 00000000 00:00 0 
7fbf79700000-7fbf79900000 rw-p 00000000 00:00 0 
7fbf79a00000-7fbf79b00000 rw-p 00000000 00:00 0 
7fbf79c00000-7fbf79e00000 rw-p 00000000 00:00 0 
7fbf79f00000-7fbf7a000000 rw-p 00000000 00:00 0 
7fbf7a100000-7fbf7a200000 rw-p 00000000 00:00 0 
7fbf7a300000-7fbf7a600000 rw-p 00000000 00:00 0 
7fbf7a700000-7fbf7a800000 rw-p 00000000 00:00 0 
7fbf7ab00000-7fbf7ac00000 rw-p 00000000 00:00 0 
7fbf7ad00000-7fbf7ae00000 rw-p 00000000 00:00 0 
7fbf7af00000-7fbf7b000000 rw-p 00000000 00:00 0 
7fbf84100000-7fbf84200000 rw-p 00000000 00:00 0 
7fbf84600000-7fbf84f00000 rw-p 00000000 00:00 0 
7fbf85000000-7fbf85100000 rw-p 00000000 00:00 0 
7fbf85200000-7fbf85400000 rw-p 00000000 00:00 0 
7fbf85500000-7fbf85600000 rw-p 00000000 00:00 0 
7fbf85700000-7fbf85800000 rw-p 00000000 00:00 0 
7fbf85900000-7fbf85a00000 rw-p 00000000 00:00 0 
7fbf85b00000-7fbf85d00000 rw-p 00000000 00:00 0 
7fbf85e00000-7fbf86000000 rw-p 00000000 00:00 0 
7fbf86100000-7fbf86200000 rw-p 00000000 00:00 0 
7fbf86300000-7fbf86400000 rw-p 00000000 00:00 0 
7fbf86500000-7fbf86700000 rw-p 00000000 00:00 0 
7fbf86800000-7fbf86e00000 rw-p 00000000 00:00 0 
7fbf86f00000-7fbf87000000 rw-p 00000000 00:00 0 
7fbf87100000-7fbf87200000 rw-p 00000000 00:00 0 
7fbf87300000-7fbf87400000 rw-p 00000000 00:00 0 
7fbf87500000-7fbf87700000 rw-p 00000000 00:00 0 
7fbf87800000-7fbf87900000 rw-p 00000000 00:00 0 
7fbf87a00000-7fbf87b00000 rw-p 00000000 00:00 0 
7fbf87c00000-7fbf87d00000 rw-p 00000000 00:00 0 
7fbf87e00000-7fbf87f00000 rw-p 00000000 00:00 0 
7fbf88000000-7fbf88100000 rw-p 00000000 00:00 0 
7fbf88100000-7fbf88200000 rw-p 00000000 00:00 0 
7fbf88300000-7fbf88600000 rw-p 00000000 00:00 0 
7fbf887fe000-7fbf887ff000 ---p 00000000 00:00 0 
7fbf89100000-7fbf89200000 rw-p 00000000 00:00 0 
7fbf89300000-7fbf89400000 rw-p 00000000 00:00 0 
7fbf89500000-7fbf89600000 rw-p 00000000 00:00 0 
7fbf89700000-7fbf89900000 rw-p 00000000 00:00 0 
7fbf899f9000-7fbf899fa000 ---p 00000000 00:00 0 
7fbf8a200000-7fbf8a300000 rw-p 00000000 00:00 0 
7fbf8a400000-7fbf8a500000 rw-p 00000000 00:00 0 
7fbf8a600000-7fbf8a700000 rw-p 00000000 00:00 0 
7fbf8a800000-7fbf8a900000 rw-p 00000000 00:00 0 
7fbf8aa00000-7fbf8ab00000 rw-p 00000000 00:00 0 
7fbf8ab00000-7fbf8ad00000 rw-p 00000000 00:00 0 
7fbf8ae00000-7fbf8af00000 rw-p 00000000 00:00 0 
7fbf8b000000-7fbf8b100000 rw-p 00000000 00:00 0 
7fbf8b200000-7fbf8b300000 rw-p 00000000 00:00 0 
7fbf8b600000-7fbf8b700000 rw-p 00000000 00:00 0 
7fbf8b800000-7fbf8b900000 rw-p 00000000 00:00 0 
7fbf8ba00000-7fbf8bc00000 rw-p 00000000 00:00 0 
7fbf8bd00000-7fbf8be00000 rw-p 00000000 00:00 0 
7fbf8bf00000-7fbf8c000000 rw-p 00000000 00:00 0 
7fbf8c100000-7fbf8c200000 rw-p 00000000 00:00 0 
7fbf8c300000-7fbf8c400000 rw-p 00000000 00:00 0 
7fbf8c500000-7fbf8c600000 rw-p 00000000 00:00 0 
7fbf8c700000-7fbf8c800000 rw-p 00000000 00:00 0 
7fbf8c900000-7fbf8ca00000 rw-p 00000000 00:00 0 
7fbf8cb00000-7fbf8cc00000 rw-p 00000000 00:00 0 
7fbf8cd00000-7fbf8d300000 rw-p 00000000 00:00 0 
7fbf8d400000-7fbf8d600000 rw-p 00000000 00:00 0 
7fbf8d700000-7fbf8d800000 rw-p 00000000 00:00 0 
7fbf8d900000-7fbf8da00000 rw-p 00000000 00:00 0 
7fbf8dafc000-7fbf8dafd000 ---p 00000000 00:00 0 
7fbf8e2fd000-7fbf8e2fe000 ---p 00000000 00:00 0 
7fbf8eafe000-7fbf8eaff000 ---p 00000000 00:00 0 
7fbf8f2ff000-7fbf8f300000 ---p 00000000 00:00 0 
7fbf8fe00000-7fbf8ff00000 rw-p 00000000 00:00 0 
7fbf90000000-7fbf90100000 rw-p 00000000 00:00 0 
7fbf90200000-7fbf90300000 rw-p 00000000 00:00 0 
7fbf90400000-7fbf90500000 rw-p 00000000 00:00 0 
7fbf90600000-7fbf90700000 rw-p 00000000 00:00 0 
7fbf907ff000-7fbf90800000 ---p 00000000 00:00 0 
7fbf91200000-7fbf91300000 rw-p 00000000 00:00 0 
7fbf913ff000-7fbf91400000 ---p 00000000 00:00 0 
7fbf91800000-7fbf91900000 rw-p 00000000 00:00 0 
7fbf919fe000-7fbf919ff000 ---p 00000000 00:00 0 
7fbf921ff000-7fbf92200000 ---p 00000000 00:00 0 
7fbf92c00000-7fbf92d00000 rw-p 00000000 00:00 0 
7fbf92e00000-7fbf92f00000 rw-p 00000000 00:00 0 
7fbf93000000-7fbf93100000 rw-p 00000000 00:00 0 
7fbf93200000-7fbf93300000 rw-p 00000000 00:00 0 
7fbf93400000-7fbf93500000 rw-p 00000000 00:00 0 
7fbf93600000-7fbf93700000 rw-p 00000000 00:00 0 
7fbf937ff000-7fbf93800000 ---p 00000000 00:00 0 
7fbf94200000-7fbf94300000 rw-p 00000000 00:00 0 
7fbf94400000-7fbf94500000 rw-p 00000000 00:00 0 
7fbf94600000-7fbf94700000 rw-p 00000000 00:00 0 
7fbf94800000-7fbf94900000 rw-p 00000000 00:00 0 
7fbf94a00000-7fbf94b00000 rw-p 00000000 00:00 0 
7fbf94c00000-7fbf94d00000 rw-p 00000000 00:00 0 
7fbf94e00000-7fbf94f00000 rw-p 00000000 00:00 0 
7fbf95000000-7fbf95100000 rw-p 00000000 00:00 0 
7fbf95200000-7fbf95300000 rw-p 00000000 00:00 0 
7fbf95400000-7fbf95500000 rw-p 00000000 00:00 0 
7fbf95600000-7fbf95700000 rw-p 00000000 00:00 0 
7fbf95800000-7fbf95900000 rw-p 00000000 00:00 0 
7fbf95a00000-7fbf95b00000 rw-p 00000000 00:00 0 
7fbf95c00000-7fbf95d00000 rw-p 00000000 00:00 0 
7fbf95e00000-7fbf95f00000 rw-p 00000000 00:00 0 
7fbf96000000-7fbf96100000 rw-p 00000000 00:00 0 
7fbf96200000-7fbf96300000 rw-p 00000000 00:00 0 
7fbf96400000-7fbf96500000 rw-p 00000000 00:00 0 
7fbf96600000-7fbf96700000 rw-p 00000000 00:00 0 
7fbf96800000-7fbf96a00000 rw-p 00000000 00:00 0 
7fbf96b00000-7fbf96c00000 rw-p 00000000 00:00 0 
7fbf96d00000-7fbf96e00000 rw-p 00000000 00:00 0 
7fbf96f00000-7fbf97000000 rw-p 00000000 00:00 0 
7fbf97100000-7fbf97200000 rw-p 00000000 00:00 0 
7fbf97300000-7fbf97400000 rw-p 00000000 00:00 0 
7fbf97500000-7fbf97600000 rw-p 00000000 00:00 0 
7fbf97700000-7fbf97800000 rw-p 00000000 00:00 0 
7fbf97900000-7fbf97a00000 rw-p 00000000 00:00 0 
7fbf97b00000-7fbf97c00000 rw-p 00000000 00:00 0 
7fbf97d00000-7fbf97e00000 rw-p 00000000 00:00 0 
7fbf97f00000-7fbf98000000 rw-p 00000000 00:00 0 
7fbf98100000-7fbf98200000 rw-p 00000000 00:00 0 
7fbf98300000-7fbf98400000 rw-p 00000000 00:00 0 
7fbf98500000-7fbf98600000 rw-p 00000000 00:00 0 
7fbf98700000-7fbf98800000 rw-p 00000000 00:00 0 
7fbf98900000-7fbf98a00000 rw-p 00000000 00:00 0 
7fbf98b00000-7fbf98c00000 rw-p 00000000 00:00 0 
7fbf98d00000-7fbf98e00000 rw-p 00000000 00:00 0 
7fbf98f00000-7fbf99000000 rw-p 00000000 00:00 0 
7fbf99100000-7fbf99200000 rw-p 00000000 00:00 0 
7fbf99300000-7fbf99400000 rw-p 00000000 00:00 0 
7fbf99500000-7fbf99600000 rw-p 00000000 00:00 0 
7fbf99700000-7fbf99800000 rw-p 00000000 00:00 0 
7fbf99900000-7fbf99a00000 rw-p 00000000 00:00 0 
7fbf99b00000-7fbf99c00000 rw-p 00000000 00:00 0 
7fbf99d00000-7fbf99e00000 rw-p 00000000 00:00 0 
7fbf99f00000-7fbf9a000000 rw-p 00000000 00:00 0 
7fbf9a100000-7fbf9a200000 rw-p 00000000 00:00 0 
7fbf9a300000-7fbf9a400000 rw-p 00000000 00:00 0 
7fbf9a500000-7fbf9a600000 rw-p 00000000 00:00 0 
7fbf9a700000-7fbf9a800000 rw-p 00000000 00:00 0 
7fbf9a900000-7fbf9aa00000 rw-p 00000000 00:00 0 
7fbf9ab00000-7fbf9ac00000 rw-p 00000000 00:00 0 
7fbf9ad00000-7fbf9ae00000 rw-p 00000000 00:00 0 
7fbf9af00000-7fbf9b000000 rw-p 00000000 00:00 0 
7fbf9b100000-7fbf9b200000 rw-p 00000000 00:00 0 
7fbf9b300000-7fbf9b400000 rw-p 00000000 00:00 0 
7fbf9b500000-7fbf9b600000 rw-p 00000000 00:00 0 
7fbf9b700000-7fbf9b900000 rw-p 00000000 00:00 0 
7fbf9ba00000-7fbf9bb00000 rw-p 00000000 00:00 0 
7fbf9bc00000-7fbf9bd00000 rw-p 00000000 00:00 0 
7fbf9be00000-7fbf9bf00000 rw-p 00000000 00:00 0 
7fbf9c000000-7fbf9c100000 rw-p 00000000 00:00 0 
7fbf9c200000-7fbf9c700000 rw-p 00000000 00:00 0 
7fbf9c800000-7fbf9c900000 rw-p 00000000 00:00 0 
7fbf9ca00000-7fbf9cd00000 rw-p 00000000 00:00 0 
7fbf9ce00000-7fbf9cf00000 rw-p 00000000 00:00 0 
7fbf9d000000-7fbf9d100000 rw-p 00000000 00:00 0 
7fbf9d200000-7fbf9d300000 rw-p 00000000 00:00 0 
7fbf9d400000-7fbf9d500000 rw-p 00000000 00:00 0 
7fbf9d600000-7fbf9d700000 rw-p 00000000 00:00 0 
7fbf9d800000-7fbf9d900000 rw-p 00000000 00:00 0 
7fbf9da00000-7fbf9db00000 rw-p 00000000 00:00 0 
7fbf9dc00000-7fbf9dd00000 rw-p 00000000 00:00 0 
7fbf9de00000-7fbf9df00000 rw-p 00000000 00:00 0 
7fbf9e000000-7fbf9e100000 rw-p 00000000 00:00 0 
7fbf9e200000-7fbf9e300000 rw-p 00000000 00:00 0 
7fbf9e400000-7fbf9e500000 rw-p 00000000 00:00 0 
7fbf9e600000-7fbf9e700000 rw-p 00000000 00:00 0 
7fbf9e800000-7fbf9e900000 rw-p 00000000 00:00 0 
7fbf9ea00000-7fbf9eb00000 rw-p 00000000 00:00 0 
7fbf9ec00000-7fbf9ed00000 rw-p 00000000 00:00 0 
7fbf9f200000-7fbf9f300000 rw-p 00000000 00:00 0 
7fbf9f600000-7fbf9f700000 rw-p 00000000 00:00 0 
7fbf9fc00000-7fbf9fd00000 rw-p 00000000 00:00 0 
7fbf9fe00000-7fbf9ff00000 rw-p 00000000 00:00 0 
7fbfa0400000-7fbfa0c00000 rw-p 00000000 00:00 0 
7fbfa0d00000-7fbfa0e00000 rw-p 00000000 00:00 0 
7fbfa0f00000-7fbfa1000000 rw-p 00000000 00:00 0 
7fbfa1100000-7fbfa1300000 rw-p 00000000 00:00 0 
7fbfa1400000-7fbfa1700000 rw-p 00000000 00:00 0 
7fbfa1a00000-7fbfa4a00000 rw-p 00000000 00:00 0 
7fbfa4e00000-7fbfa7400000 rw-p 00000000 00:00 0 
7fbfa74fa000-7fbfa74fb000 ---p 00000000 00:00 0 
7fbfa7cfb000-7fbfa7cfc000 ---p 00000000 00:00 0 
7fbfa84fc000-7fbfa84fd000 ---p 00000000 00:00 0 
7fbfa908d000-7fbfa9091000 rw-p 00000000 00:00 0 
7fbfa94fe000-7fbfa94ff000 ---p 00000000 00:00 0 
7fbfa9cff000-7fbfa9d00000 ---p 00000000 00:00 0 
7fbfab2ff000-7fbfab300000 ---p 00000000 00:00 0 
7fbfac6ff000-7fbfac700000 ---p 00000000 00:00 0 
7fbfaec2c000-7fbfaec2d000 ---p 00000000 00:00 0 
7fbfaef00000-7fbfaf300000 rw-p 00000000 00:00 0 
7fbfafa00000-7fbfafb00000 rw-p 00000000 00:00 0 
7fbfafc00000-7fbfafd00000 rw-p 00000000 00:00 0 
7fbfb087c000-7fbfb087d000 rw-p 00000000 00:00 0 
7fbfb0f00000-7fbfb1000000 rw-p 00000000 00:00 0 
7fbfb1700000-7fbfb1d00000 rw-p 00000000 00:00 0 
7fbfb1dff000-7fbfb1e00000 ---p 00000000 00:00 0 
7fbfb283e000-7fbfb284e000 rwxp 00000000 00:00 0 
7fbfb2b00000-7fbfb3200000 rw-p 00000000 00:00 0 
7fbfb3200000-7fbfb3300000 rw-p 00000000 00:00 0 
7fbfb3500000-7fbfb3c00000 rw-p 00000000 00:00 0 
7fbfb3c1b000-7fbfb3c3b000 rwxp 00000000 00:00 0 
7fbfb3cfb000-7fbfb3cfc000 ---p 00000000 00:00 0 
7fbfb4700000-7fbfb4b00000 rw-p 00000000 00:00 0 
7fbfb4b00000-7fbfb5200000 rw-p 00000000 00:00 0 
7fbfb56fd000-7fbfb56fe000 ---p 00000000 00:00 0 
7fbfb5efe000-7fbfb5eff000 ---p 00000000 00:00 0 
7fbfb66ff000-7fbfb6700000 ---p 00000000 00:00 0 
7fbfb70be000-7fbfb70bf000 ---p 00000000 00:00 0 
7fbfb70ff000-7fbfb7100000 ---p 00000000 00:00 0 
7fbfb85c2000-7fbfb85f0000 rw-p 00000000 00:00 0 
7fbfb889b000-7fbfb889c000 rw-p 00000000 00:00 0 
7fbfb9362000-7fbfb9363000 rw-p 00000000 00:00 0 
7fbfb987a000-7fbfb987b000 rw-p 00000000 00:00 0 
7fbfb9ec2000-7fbfb9ec3000 rw-p 00000000 00:00 0 
7fbfba712000-7fbfba713000 rw-p 00000000 00:00 0 
7fbfbac2e000-7fbfbac30000 rw-p 00000000 00:00 0 
7fbfbd7f1000-7fbfbdb00000 rw-p 00000000 00:00 0 
7fbfbdbfd000-7fbfbdbfe000 ---p 00000000 00:00 0 
7fbfbe3fe000-7fbfbe3ff000 ---p 00000000 00:00 0 
7fbfbebff000-7fbfbec00000 ---p 00000000 00:00 0 
7fbfbf9ff000-7fbfbfa00000 ---p 00000000 00:00 0 
7fbfc7e00000-7fbfc8500000 rw-p 00000000 00:00 0 
7fbfc858a000-7fbfc859a000 rwxp 00000000 00:00 0 
7fbfc859a000-7fbfc859b000 ---p 00000000 00:00 0 
7fbfc929b000-7fbfc929f000 rw-p 00000000 00:00 0 
7fbfc9700000-7fbfc9900000 rw-p 00000000 00:00 0 
7fbfc990b000-7fbfc993b000 rwxp 00000000 00:00 0 
7fbfc9986000-7fbfc9996000 rwxp 00000000 00:00 0 
7fbfc99ef000-7fbfc99ff000 rwxp 00000000 00:00 0 
7fbfc99ff000-7fbfc9a00000 ---p 00000000 00:00 0 
7fbfcae02000-7fbfcae32000 rwxp 00000000 00:00 0 
7fbfcae32000-7fbfcae33000 ---p 00000000 00:00 0 
7fbfcb633000-7fbfcb634000 ---p 00000000 00:00 0 
7fbfcc234000-7fbfcc235000 ---p 00000000 00:00 0 
7fbfcca35000-7fbfcca36000 ---p 00000000 00:00 0 
7fbfcdaf8000-7fbfcdaf9000 rw-p 00000000 00:00 0 
7fbfcdd00000-7fbfcde00000 rw-p 00000000 00:00 0 
7fbfcde04000-7fbfcde24000 rwxp 00000000 00:00 0 
7fbfcde2c000-7fbfcde3c000 rwxp 00000000 00:00 0 
7fbfcde50000-7fbfcde70000 rwxp 00000000 00:00 0 
7fbfce0e7000-7fbfce0e8000 ---p 00000000 00:00 0 
7fbfcef00000-7fbfcf000000 rw-p 00000000 00:00 0 
7fbfcf005000-7fbfcf015000 rwxp 00000000 00:00 0 
7fbfcf0ff000-7fbfcf100000 ---p 00000000 00:00 0 
7fbfcfa0a000-7fbfcfa1a000 rwxp 00000000 00:00 0 
7fbfd030a000-7fbfd030b000 ---p 00000000 00:00 0 
7fbfd113c000-7fbfd113e000 rw-p 00000000 00:00 0 
7fbfd2af6000-7fbfd2af7000 rw-p 00000000 00:00 0 
7fbfd326d000-7fbfd326e000 rw-p 00000000 00:00 0 
7fbfd401f000-7fbfd4020000 rw-p 00000000 00:00 0 
7fbfd48f1000-7fbfd48f2000 rw-p 00000000 00:00 0 
7fbfd4b8c000-7fbfd4b8d000 rw-p 00000000 00:00 0 
7fbfd5c00000-7fbfd5d00000 rw-p 00000000 00:00 0 
7fbfd5d0e000-7fbfd5d3e000 rwxp 00000000 00:00 0 
7fbfd7f2a000-7fbfd7f2e000 rw-p 00000000 00:00 0 
7fbfd8b8e000-7fbfd8b92000 rw-p 00000000 00:00 0 
7fbfd9be0000-7fbfd9be2000 rw-p 00000000 00:00 0 
7fbfd9e00000-7fbfd9e01000 rw-p 00000000 00:00 0 
7fbfdbe81000-7fbfdbe82000 rw-p 00000000 00:00 0 
7fbfdc6ac000-7fbfdc6ae000 rw-p 00000000 00:00 0 
7fbfdcf07000-7fbfdcf09000 rw-p 00000000 00:00 0 
7fbfdd20b000-7fbfdd20d000 rw-p 00000000 00:00 0 
7fbfdd89e000-7fbfdd89f000 rw-p 00000000 00:00 0 
7fbfde597000-7fbfde598000 rw-p 00000000 00:00 0 
7fbfe1e0d000-7fbfe1f9c000 rw-p 00000000 00:00 0 
7fbfe2427000-7fbfe2428000 rw-p 00000000 00:00 0 
7fbfe2b87000-7fbfe2b89000 rw-p 00000000 00:00 0 
7fbfe35fd000-7fbfe3700000 rw-p 00000000 00:00 0 
7fbfe3701000-7fbfe3711000 rwxp 00000000 00:00 0 
7fbfe3d64000-7fbfe3d69000 rw-p 00000000 00:00 0 
7fbfe3f82000-7fbfe3f86000 rw-p 00000000 00:00 0 
7fbfe4579000-7fbfe458e000 rw-p 00000000 00:00 0 
7fbfe47b7000-7fbfe47b8000 rw-p 00000000 00:00 0 
7fbfe47b8000-7fbfe47b9000 rwxp 00000000 00:00 0 
7fbfe47d7000-7fbfe47e7000 rwxp 00000000 00:00 0 
7fbfe4883000-7fbfe4989000 rw-p 00000000 00:00 0 
7fbfe49b2000-7fbfe49b4000 rw-p 00000000 00:00 0 

It's about 35% out of 1300+ mappings that Firefox uses.

It is likely that the ---p mappings (about 40 of them) are guard pages.

How do I tell what the remaining anonymous areas are about?

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:38               ` Pekka Enberg
@ 2013-07-12  9:45                 ` Ingo Molnar
  2013-07-12 10:09                   ` Peter Zijlstra
  0 siblings, 1 reply; 33+ messages in thread
From: Ingo Molnar @ 2013-07-12  9:45 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Peter Zijlstra, Colin Cross, LKML, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner,
	list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT


* Pekka Enberg <penberg@kernel.org> wrote:

> On Fri, Jul 12, 2013 at 12:26 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> > Well, the JIT profiling case is really special - there we are 
> > constructing code and a symbol table on the fly. Talking to perf via a 
> > temporary file sounds unavoidable (and thus proper), because symbol 
> > information on that level is not something the kernel knows (or should 
> > know) about.
> >
> > I was arguing primarily in the context of the original patch: naming 
> > allocator heaps. Today the kernel makes a few educated guesses about 
> > what each memory area is about, in /proc/*/maps:
> >
> >  34511ac000-34511b0000 r--p 001ac000 08:03 1706770                        /usr/lib64/libc-2.15.so
> >  34511b0000-34511b2000 rw-p 001b0000 08:03 1706770                        /usr/lib64/libc-2.15.so
> >  34511b2000-34511b7000 rw-p 00000000 00:00 0
> >  7f5bdff94000-7f5be63c1000 r--p 00000000 08:03 1710237                    /usr/lib/locale/locale-archive
> >  7f5be63c1000-7f5be63c4000 rw-p 00000000 00:00 0
> >  7f5be63d6000-7f5be63d7000 rw-p 00000000 00:00 0
> >  7fff7677f000-7fff767a0000 rw-p 00000000 00:00 0                          [stack]
> >  7fff767dd000-7fff767df000 r-xp 00000000 00:00 0                          [vdso]
> >  ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]
> >
> > ... but on any larger app there's lots of anon mmap areas that are ... 
> > anonymous! ;-) User-space could help out a bit by naming them. It's 
> > not like there's many heaps, so the performance overhead aspect is 
> > minimal.
> >
> > In the JIT case we have something different, a 'file IO' abstraction 
> > really: the JIT is generating (writing) new code and associated symbol 
> > records. So using temporary files there is natural and proper and most 
> > of the disadvantages I list don't apply because the sheer volume of 
> > new code generated dilutes the overhead of open()/close(), plus we do 
> > need some space for those symbols so a JIT cannot really expect to be 
> > able to run in a pure readonly environment.
> >
> > In the allocator/heap case we have a _memory_ abstraction it's just 
> > that we also want to name the heap minimally.
> >
> > For any finer than vma granularity user-space attributes the kernel 
> > cannot help much, it does not know (and probably should not know) 
> > about all user-space data structures.
> >
> > Right now I don't see any good way to merge the two. (might be due to 
> > lack of imagination)
> 
> I have no trouble with the imagination part but you make a strong point 
> about the kernel not helping at finer granularity than vma anyway.
> 
> The current functionality is already quite helpful for VMs as well. We 
> could annotate the different GC and JIT regions and make perf more 
> human-friendly by default.

One thing where we could help JITs is to offer a direct channel to any 
perf profiling process: a prctl(SYS_TRACE) which would send a free-form 
string to any profiling task interested in it.

This would be a glorified anonymous write() in essence, without using a 
temporary file.

The advantage would be that the string could be captured as-is and copied 
to the ring-buffer of the profiling task - instead of having to recover it 
later on.

This is a model that I'd generally advocate: a single channel [per 
CPU-ified] for instrumentation/tracing.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:40                 ` Ingo Molnar
@ 2013-07-12  9:49                   ` Peter Zijlstra
  2013-07-12 10:01                     ` Ingo Molnar
  2013-07-12 20:51                     ` Colin Cross
  0 siblings, 2 replies; 33+ messages in thread
From: Peter Zijlstra @ 2013-07-12  9:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds

On Fri, Jul 12, 2013 at 11:40:44AM +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, Jul 12, 2013 at 11:15:06AM +0200, Ingo Molnar wrote:
> > > 
> > > * Peter Zijlstra <peterz@infradead.org> wrote:
> > > 
> > > > We need those files anyway.. The current proposal is that the entire VMA 
> > > > has a single userspace pointer in it. Or rather a 64bit value.
> > > 
> > > Yes but accessible via /proc/<PID>/mem or so?
> > 
> > *shudder*.. yes. But you're again opening two files. The only advantage 
> > of this over userspace writing its own files is that the kernel cleans 
> > things up for you.
> 
> Opening of the files only occurs in the instrumentation case, which is 
> rare. But temporary files would be forced upon the regular usecase when no 
> instrumentation goes on.

Well, Colin didn't describe the intended use, but I can imagine a case where
it's not all that rare. System health monitors might frequently want to update
this.

> > However from what I understood android runs apps as individual users, 
> > and I think we can do per user tmpfs mounts. So app dies, user exits, 
> > mount goes *poof*.
> 
> Yes, user-space could be smarter about temporary files.
> 
> Just like big banks could be less risk happy.
> 
> Yet the reality is that if left alone both apps and banks mess up, I don't 
> think libertarianism works for policy: we are better off offering a 
> framework that is simple, robust, self-contained, low risk and hard to 
> mess up?

Fair enough; but I still want Colin to tell me why he can't do this in
userspace. And what all he wants to go do with this information etc.

He's basically not told us much at all.

> So, these 400+ memory ranges are from Firefox's /proc/*/maps file:
> 
<snip>
> 
> It's about 35% out of 1300+ mappings that Firefox uses.
> 
> It is likely that the ---p mappings (about 40 of them) are guard pages.
> 
> How do I tell what the remaining anonymous areas are about?

Well, if you'd run it within a memory allocator debug framework, that would have
kept track of this. Typically memory debuggers can keep allocation time stacks
etc.

If I'm not actively debugging firefox I don't give a damn.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:49                   ` Peter Zijlstra
@ 2013-07-12 10:01                     ` Ingo Molnar
  2013-07-12 20:51                     ` Colin Cross
  1 sibling, 0 replies; 33+ messages in thread
From: Ingo Molnar @ 2013-07-12 10:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Jul 12, 2013 at 11:40:44AM +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Fri, Jul 12, 2013 at 11:15:06AM +0200, Ingo Molnar wrote:
> > > > 
> > > > * Peter Zijlstra <peterz@infradead.org> wrote:
> > > > 
> > > > > We need those files anyway.. The current proposal is that the entire VMA 
> > > > > has a single userspace pointer in it. Or rather a 64bit value.
> > > > 
> > > > Yes but accessible via /proc/<PID>/mem or so?
> > > 
> > > *shudder*.. yes. But you're again opening two files. The only advantage 
> > > of this over userspace writing its own files is that the kernel cleans 
> > > things up for you.
> > 
> > Opening of the files only occurs in the instrumentation case, which is 
> > rare. But temporary files would be forced upon the regular usecase 
> > when no instrumentation goes on.
> 
> Well, Colin didn't describe the intended use, but I can imagine a case 
> where its not all that rare. System health monitors might frequently 
> want to update this.

That's true.

So maybe it would be better to offer a tracepoint that allows apps to emit 
such information - to any system monitor around to listen.

Making it a vsyscall that does not enter the kernel when the process is not 
being monitored would make it very low overhead.

> > So, these 400+ memory ranges are from Firefox's /proc/*/maps file:
> > 
> <snip>
> > 
> > It's about 35% out of 1300+ mappings that Firefox uses.
> > 
> > It is likely that the ---p mappings (about 40 of them) are guard pages.
> > 
> > How do I tell what the remaining anonymous areas are about?
> 
> Well, if you'd ran it within a memory allocator debug framework that 
> would have kept track of this. Typically memory debuggers can keep 
> allocation time stacks etc.
> 
> If I'm not actively debugging firefox I don't give a damn.

Yet people are nosy and find it rather useful to have such 
'heap/stack/vdso/vsyscall' annotations:

 0237c000-0239d000 rw-p 00000000 00:00 0                                  [heap]
 ...
 7fff622af000-7fff622d0000 rw-p 00000000 00:00 0                          [stack]
 7fff623fe000-7fff62400000 r-xp 00000000 00:00 0                          [vdso]
 ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]

and named vmas have names as well:

 7fa5b02eb000-7fa5b6718000 r--p 00000000 08:03 1710237                    /usr/lib/locale/locale-archive

so why not allow some simple mechanism to descriptively name anonymous 
vmas as well?

Maybe the 8 bytes shouldn't be a pointer to user-space memory, but a short 
string, a bit like task_struct:comm[16]?

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:45                 ` Ingo Molnar
@ 2013-07-12 10:09                   ` Peter Zijlstra
  0 siblings, 0 replies; 33+ messages in thread
From: Peter Zijlstra @ 2013-07-12 10:09 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Colin Cross, LKML, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner,
	list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT

On Fri, Jul 12, 2013 at 11:45:17AM +0200, Ingo Molnar wrote:
> One thing where we could help JITs is to offer a direct channel to any 
> perf profiling process: a prctl(SYS_TRACE) which would send a free-form 
> string to any profiling task interested in it.
> 
> This would be a glorified anonymous write() in essence, without using a 
> temporary file.
> 
> The advantage would be that the string could be captured as-is and copied 
> to the ring-buffer of the profiling task - instead of having to recover it 
> later on.
> 
> This is a model that I'd generally advocate: a single channel [per 
> CPU-ified] for instrumentation/tracing.

'free format text string' is long and cumbersome and requires parsing.

And size is the primary component in speed.

But yes, we could allow injection of something like 

struct PERF_RECORD_SYMBOL {
	struct perf_event_header	header;
	u32				pid, tid;
	u64				addr;
	u64				len;
	char				symbol[];
};

I still like the idea of actually writing valid ELF DSOs in that that would
also get us the TEXT and allow assembly inspection etc. It might also allow a
JIT to re-map those DSOs and decrease warm-up time -- provided the actual
program didn't change meanwhile.

How to do injection is another thing though; I don't much like prctl(). Then
again, offering a special file like /sys/bus/event_source/sink isn't
particularly pretty either.

Then there is the issue of attaching to an already running JIT; we'd need means
to 'catch' up. The DSOs trivially allow this; the injection not so much.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:49                   ` Peter Zijlstra
  2013-07-12 10:01                     ` Ingo Molnar
@ 2013-07-12 20:51                     ` Colin Cross
  2013-09-26  1:24                       ` Colin Cross
  1 sibling, 1 reply; 33+ messages in thread
From: Colin Cross @ 2013-07-12 20:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Pekka Enberg, lkml, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, Linux-MM,
	Linus Torvalds

On Fri, Jul 12, 2013 at 2:49 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Jul 12, 2013 at 11:40:44AM +0200, Ingo Molnar wrote:
>> * Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> > On Fri, Jul 12, 2013 at 11:15:06AM +0200, Ingo Molnar wrote:
>> > >
>> > > * Peter Zijlstra <peterz@infradead.org> wrote:
>> > >
>> > > > We need those files anyway.. The current proposal is that the entire VMA
>> > > > has a single userspace pointer in it. Or rather a 64bit value.
>> > >
>> > > Yes but accessible via /proc/<PID>/mem or so?
>> >
>> > *shudder*.. yes. But you're again opening two files. The only advantage
>> > of this over userspace writing its own files is that the kernel cleans
>> > things up for you.
>>
>> Opening of the files only occurs in the instrumentation case, which is
>> rare. But temporary files would be forced upon the regular usecase when no
>> instrumentation goes on.
>
> Well, Colin didn't describe the intended use, but I can imagine a case where
> its not all that rare. System health monitors might frequently want to update
> this.
>
>> > However from what I understood android runs apps as individual users,
>> > and I think we can do per user tmpfs mounts. So app dies, user exits,
>> > mount goes *poof*.
>>
>> Yes, user-space could be smarter about temporary files.
>>
>> Just like big banks could be less risk happy.
>>
>> Yet the reality is that if left alone both apps and banks mess up, I don't
>> think libertarianism works for policy: we are better off offering a
>> framework that is simple, robust, self-contained, low risk and hard to
>> mess up?
>
> Fair enough; but I still want Colin to tell me why he can't do this in
> userspace. And what all he wants to go do with this information etc.
>
> He's basically not told us much at all.

I covered it a little in the thread on the previous version of the
patch, but I'll try to give more detail (and include it in a patch
stack description if I post another version).

In many userspace applications, and especially in VM based
applications like Android uses heavily, there are multiple different
allocators in use.  At a minimum there is libc malloc and the stack,
and in many cases there are libc malloc, the stack, direct syscalls to
mmap anonymous memory, and multiple VM heaps (one for small objects,
one for big objects, etc.).  Each of these layers usually has its own
tools to inspect its usage; malloc by compiling a debug version, the
VM through heap inspection tools, and for direct syscalls there is
usually no way to track them.

On Android we heavily use a set of tools that use an extended version
of the logic covered in Documentation/vm/pagemap.txt to walk all pages
mapped in userspace and slice their usage by process, shared (COW) vs.
unique mappings, backing, etc.  This can account for real physical
memory usage even in cases like fork without exec (which Android uses
heavily to share as many private COW pages as possible between
processes), Kernel SamePage Merging, and clean zero pages.  It
produces a measurement of the pages that only exist in that process
(USS, for unique), and a measurement of the physical memory usage of
that process with the cost of shared pages being evenly split between
processes that share them (PSS).  We need the feature to be efficient
enough to be left on at all times because app developers and end users
can use similar tools exposed through system reports and bugreports to
determine the memory usage of apps.

If all anonymous memory is indistinguishable then figuring out the
real physical memory usage of each heap requires either a pagemap
walking tool that can understand the heap debugging of every layer, or
for every layer's heap debugging tools to implement the pagemap
walking logic, in which case it is hard to get a consistent view of
memory across the whole system.

Tracking the information in userspace leads to all sorts of problems.
It either needs to be stored inside the process, which means every
process has to have an API to export its current heap information upon
request, or it has to be stored externally in a filesystem that
somebody needs to clean up on crashes.  It needs to be readable while
the process is still running, so it has to have some sort of
synchronization with every layer of userspace.  Efficiently tracking
the ranges requires reimplementing something like the kernel vma
trees, and linking to it from every layer of userspace.  It requires
more memory, more syscalls, more runtime cost, and more complexity to
separately track regions that the kernel is already tracking.

This feature is considered critical enough that Dalvik (Android's VM)
uses ashmem, which is effectively deleted tmpfs files, solely to name
their heaps.   I'd like to get rid of as much ashmem use within
Android as possible, with an eye towards deprecating it.  ashmem heaps
work reasonably well for a VM, which is likely to want a single
contiguous region of address space that it will manage on its own, but
falls apart for malloc, which often wants small kernel-allocated
address space regions that may or may not merge with adjacent regions.
 Blindly using ashmem/deleted tmpfs files instead of anonymous mmaps
in malloc doubled the number of vmas in our main system process and
was worse for the GLBenchmark process.

As a concrete example of its usefulness (which should not be
considered the extent of its usefulness, it's just what I happened to
be looking at), I was recently tracking down why we were seeing many
dirty private pages that were all zeroes being merged by KSM.  Using a
mixture of ashmem naming and an early version of this patch, I could
slice the number of KSM merged pages per process and per heap,
which then told me which heap debugging tools I should use to find who
was dirtying large regions of zeroes.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
                     ` (2 preceding siblings ...)
  2013-07-12  6:36   ` Dave Hansen
@ 2013-07-14 14:11   ` Oleg Nesterov
  2013-07-14 19:27     ` Colin Cross
  2013-07-14 14:17   ` Oleg Nesterov
  4 siblings, 1 reply; 33+ messages in thread
From: Oleg Nesterov @ 2013-07-14 14:11 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Pekka Enberg, Dave Hansen, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, linux-doc, linux-mm

Sorry if this was already discussed... I am still trying to think if
we can make a simpler patch.

So, iiuc, the main problem is that if you want to track a vma you need
to prevent the merging with other vma's.

Question: is it important that vma's with the same vma_name should be
_merged_ automatically?

If not, can't we make "do not merge" a separate feature and then add
vma_name?

IOW, please forget about vma_name for the moment. Can't we start with
the trivial patch below? It simply adds the new vm flag which blocks
the merging, and MADV_ to set/clear it.

Yes, this is more limited. Once you set VM_TAINTED this vma is always
isolated. If you unmap a page in this vma, you create 2 isolated vma's.
If, for example, you do MADV_DONTFORK + MADV_DOFORK inside the tainted
vma, you will have 2 adjacent VM_TAINTED vma's with the same flags after
that. But you can do MADV_UNTAINT + MADV_TAINT again if you want to
merge them back. And perhaps this feature is useful even without the
naming. And perhaps we can also add MAP_TAINTED.

Now about vma_name. In this case PR_SET_VMA or MADV_NAME should simply
set/overwrite vma_name and nothing else, no need to do merge/split vma.

And if we add MAP_TAINTED, MAP_ANONYMOUS can reuse pgoff as vma_name
(we only need simple changes in do_mmap_pgoff and mmap_region). But
this is minor.

Or this is too simple/ugly? Probably yes, this means that an allocator
which simply does a lot of MAP_ANONYMOUS + MADV_TAINT will create more
vma's than it needs. So I won't insist but I'd like to ask anyway.

Oleg.

--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -90,6 +90,8 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
 
+#define VM_TAINTED	0x00001000
+
 #define VM_LOCKED	0x00002000
 #define VM_IO           0x00004000	/* Memory mapped I/O or similar */
 
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 4164529..888af10 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -52,6 +52,9 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
 
+#define MADV_TAINT	18
+#define MADV_UNTAINT	19
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 7055883..0ddc76f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -81,6 +81,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
 		}
 		new_flags &= ~VM_DONTDUMP;
 		break;
+	case MADV_TAINT:
+		new_flags |= VM_TAINTED;
+		break;
+	case MADV_UNTAINT:
+		new_flags &= ~VM_TAINTED;
+		break;
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
 		error = ksm_madvise(vma, start, end, behavior, &new_flags);
@@ -407,6 +413,8 @@ madvise_behavior_valid(int behavior)
 #endif
 	case MADV_DONTDUMP:
 	case MADV_DODUMP:
+	case MADV_TAINT:
+	case MADV_UNTAINT:
 		return 1;
 
 	default:
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e18..00323b7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1003,9 +1003,9 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 
 	/*
 	 * We later require that vma->vm_flags == vm_flags,
-	 * so this tests vma->vm_flags & VM_SPECIAL, too.
+	 * so this tests vma->vm_flags & VM_XXX, too.
 	 */
-	if (vm_flags & VM_SPECIAL)
+	if (vm_flags & (VM_SPECIAL | VM_TAINTED))
 		return NULL;
 
 	if (prev)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
                     ` (3 preceding siblings ...)
  2013-07-14 14:11   ` Oleg Nesterov
@ 2013-07-14 14:17   ` Oleg Nesterov
  2013-07-14 19:34     ` Colin Cross
  4 siblings, 1 reply; 33+ messages in thread
From: Oleg Nesterov @ 2013-07-14 14:17 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Pekka Enberg, Dave Hansen, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, linux-doc, linux-mm

On 07/11, Colin Cross wrote:
>
> +static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
> +{
> +	const char __user *name = vma_get_anon_name(vma);
> +	struct mm_struct *mm = vma->vm_mm;
> +
> +	unsigned long page_start_vaddr;
> +	unsigned long page_offset;
> +	unsigned long num_pages;
> +	unsigned long max_len = NAME_MAX;
> +	int i;
> +
> +	page_start_vaddr = (unsigned long)name & PAGE_MASK;
> +	page_offset = (unsigned long)name - page_start_vaddr;
> +	num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);
> +
> +	seq_puts(m, "[anon:");
> +
> +	for (i = 0; i < num_pages; i++) {
> +		int len;
> +		int write_len;
> +		const char *kaddr;
> +		long pages_pinned;
> +		struct page *page;
> +
> +		pages_pinned = get_user_pages(current, mm, page_start_vaddr,
> +				1, 0, 0, &page, NULL);
> +		if (pages_pinned < 1) {
> +			seq_puts(m, "<fault>]");
> +			return;
> +		}
> +
> +		kaddr = (const char *)kmap(page);
> +		len = min(max_len, PAGE_SIZE - page_offset);
> +		write_len = strnlen(kaddr + page_offset, len);
> +		seq_write(m, kaddr + page_offset, write_len);
> +		kunmap(page);
> +		put_page(page);
> +
> +		/* if strnlen hit a null terminator then we're done */
> +		if (write_len != len)
> +			break;
> +
> +		max_len -= len;
> +		page_offset = 0;
> +		page_start_vaddr += PAGE_SIZE;
> +	}
> +
> +	seq_putc(m, ']');
> +}

Again, sorry if this was already discussed...

But for what? This moves the policy into the kernel and afaics buys nothing.
Can't it simply print the number?

If an application reads its own /proc/pid/maps, surely it knows how it should
interpret the numeric values.

If another process reads this file, and if it assumes that this number is a
pointer into that task's memory, it can do sys_process_vm_readv() ?

Oleg.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-14 14:11   ` Oleg Nesterov
@ 2013-07-14 19:27     ` Colin Cross
  0 siblings, 0 replies; 33+ messages in thread
From: Colin Cross @ 2013-07-14 19:27 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Pekka Enberg, Dave Hansen, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, linux-doc, Linux-MM

On Sun, Jul 14, 2013 at 7:11 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> Sorry if this was already discussed... I am still trying to think if
> we can make a simpler patch.
>
> So, iiuc, the main problem is that if you want to track a vma you need
> to prevent the merging with other vma's.
>
> Question: is it important that vma's with the same vma_name should be
> _merged_ automatically?
>
> If not, can't we make "do not merge" a separate feature and then add
> vma_name?
>
> IOW, please forget about vma_name for the moment. Can't we start with
> the trivial patch below? It simply adds the new vm flag which blocks
> the merging, and MADV_ to set/clear it.
>
> Yes, this is more limited. Once you set VM_TAINTED this vma is always
> isolated. If you unmap a page in this vma, you create 2 isolated vma's.
> If, for example, you do MADV_DONTFORK + MADV_DOFORK inside the tainted
> vma, you will have 2 adjacent VM_TAINTED vma's with the same flags after
> that. But you can do MADV_UNTAINT + MADV_TAINT again if you want to
> merge them back. And perhaps this feature is useful even without the
> naming. And perhaps we can also add MAP_TAINTED.
>
> Now about vma_name. In this case PR_SET_VMA or MADV_NAME should simply
> set/overwrite vma_name and nothing else, no need to do merge/split vma.
>
> And if we add MAP_TAINTED, MAP_ANONYMOUS can reuse pgoff as vma_name
> (we only need a simple changes in do_mmap_pgoff and mmap_region). But
> this is minor.
>
> Or this is too simple/ugly? Probably yes, this means that an allocator
> which simply does a lot of MAP_ANONYMOUS + MADV_TAINT will create more
> vma's than it needs. So I won't insist but I'd like to ask anyway.

This is no different than using a new tmpfs file for every mmap
(although it saves the struct file and the inode), it results in a
huge increase in the number of vmas.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-14 14:17   ` Oleg Nesterov
@ 2013-07-14 19:34     ` Colin Cross
  0 siblings, 0 replies; 33+ messages in thread
From: Colin Cross @ 2013-07-14 19:34 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Pekka Enberg, Dave Hansen, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, linux-doc, Linux-MM

On Sun, Jul 14, 2013 at 7:17 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> On 07/11, Colin Cross wrote:
>>
>> +static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
>> +{
>> +     const char __user *name = vma_get_anon_name(vma);
>> +     struct mm_struct *mm = vma->vm_mm;
>> +
>> +     unsigned long page_start_vaddr;
>> +     unsigned long page_offset;
>> +     unsigned long num_pages;
>> +     unsigned long max_len = NAME_MAX;
>> +     int i;
>> +
>> +     page_start_vaddr = (unsigned long)name & PAGE_MASK;
>> +     page_offset = (unsigned long)name - page_start_vaddr;
>> +     num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);
>> +
>> +     seq_puts(m, "[anon:");
>> +
>> +     for (i = 0; i < num_pages; i++) {
>> +             int len;
>> +             int write_len;
>> +             const char *kaddr;
>> +             long pages_pinned;
>> +             struct page *page;
>> +
>> +             pages_pinned = get_user_pages(current, mm, page_start_vaddr,
>> +                             1, 0, 0, &page, NULL);
>> +             if (pages_pinned < 1) {
>> +                     seq_puts(m, "<fault>]");
>> +                     return;
>> +             }
>> +
>> +             kaddr = (const char *)kmap(page);
>> +             len = min(max_len, PAGE_SIZE - page_offset);
>> +             write_len = strnlen(kaddr + page_offset, len);
>> +             seq_write(m, kaddr + page_offset, write_len);
>> +             kunmap(page);
>> +             put_page(page);
>> +
>> +             /* if strnlen hit a null terminator then we're done */
>> +             if (write_len != len)
>> +                     break;
>> +
>> +             max_len -= len;
>> +             page_offset = 0;
>> +             page_start_vaddr += PAGE_SIZE;
>> +     }
>> +
>> +     seq_putc(m, ']');
>> +}
>
> Again, sorry if this was already discussed...
>
> But for what? This moves the policy into the kernel and afaics buys nothing.
> Can't it simply print the number?
>
> If an application reads its own /proc/pid/maps, surely it knows how it should
> interpret the numeric values.
>
> If another process reads this file, and if it assumes that this number is a
> pointer into that task's memory, it can do sys_process_vm_readv() ?

I think there is value in keeping /proc/pid/maps human readable.  A
userspace tool could certainly put together the same information, but
there would be no easy way to do it from the command line.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12 20:51                     ` Colin Cross
@ 2013-09-26  1:24                       ` Colin Cross
  0 siblings, 0 replies; 33+ messages in thread
From: Colin Cross @ 2013-09-26  1:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Pekka Enberg, lkml, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, Linux-MM,
	Linus Torvalds

On Fri, Jul 12, 2013 at 1:51 PM, Colin Cross <ccross@android.com> wrote:
> On Fri, Jul 12, 2013 at 2:49 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> On Fri, Jul 12, 2013 at 11:40:44AM +0200, Ingo Molnar wrote:
>>> * Peter Zijlstra <peterz@infradead.org> wrote:
>>>
>>> > On Fri, Jul 12, 2013 at 11:15:06AM +0200, Ingo Molnar wrote:
>>> > >
>>> > > * Peter Zijlstra <peterz@infradead.org> wrote:
>>> > >
>>> > > > We need those files anyway.. The current proposal is that the entire VMA
>>> > > > has a single userspace pointer in it. Or rather a 64bit value.
>>> > >
>>> > > Yes but accessible via /proc/<PID>/mem or so?
>>> >
>>> > *shudder*.. yes. But you're again opening two files. The only advantage
>>> > of this over userspace writing its own files is that the kernel cleans
>>> > things up for you.
>>>
>>> Opening of the files only occurs in the instrumentation case, which is
>>> rare. But temporary files would be forced upon the regular usecase when no
>>> instrumentation goes on.
>>
>> Well, Colin didn't describe the intended use, but I can imagine a case where
>> its not all that rare. System health monitors might frequently want to update
>> this.
>>
>>> > However from what I understood android runs apps as individual users,
>>> > and I think we can do per user tmpfs mounts. So app dies, user exits,
>>> > mount goes *poof*.
>>>
>>> Yes, user-space could be smarter about temporary files.
>>>
>>> Just like big banks could be less risk happy.
>>>
>>> Yet the reality is that if left alone both apps and banks mess up, I don't
>>> think libertarianism works for policy: we are better off offering a
>>> framework that is simple, robust, self-contained, low risk and hard to
>>> mess up?
>>
>> Fair enough; but I still want Colin to tell me why he can't do this in
>> userspace. And what all he wants to go do with this information etc.
>>
>> He's basically not told us much at all.
>
> I covered it a little in the thread on the previous version of the
> patch, but I'll try to give more detail (and include it in a patch
> stack description if I post another version).
>
> In many userspace applications, and especially in VM based
> applications like Android uses heavily, there are multiple different
> allocators in use.  At a minimum there is libc malloc and the stack,
> and in many cases there are libc malloc, the stack, direct syscalls to
> mmap anonymous memory, and multiple VM heaps (one for small objects,
> one for big objects, etc.).  Each of these layers usually has its own
> tools to inspect its usage; malloc by compiling a debug version, the
> VM through heap inspection tools, and for direct syscalls there is
> usually no way to track them.
>
> On Android we heavily use a set of tools that use an extended version
> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
> mapped in userspace and slice their usage by process, shared (COW) vs.
> unique mappings, backing, etc.  This can account for real physical
> memory usage even in cases like fork without exec (which Android uses
> heavily to share as many private COW pages as possible between
> processes), Kernel SamePage Merging, and clean zero pages.  It
> produces a measurement of the pages that only exist in that process
> (USS, for unique), and a measurement of the physical memory usage of
> that process with the cost of shared pages being evenly split between
> processes that share them (PSS).  We need the feature to be efficient
> enough to be left on at all times because app developers and end users
> can use similar tools exposed through system reports and bugreports to
> determine the memory usage of apps
>
> If all anonymous memory is indistinguishable then figuring out the
> real physical memory usage of each heap requires either a pagemap
> walking tool that can understand the heap debugging of every layer, or
> for every layer's heap debugging tools to implement the pagemap
> walking logic, in which case it is hard to get a consistent view of
> memory across the whole system.
>
> Tracking the information in userspace leads to all sorts of problems.
> It either needs to be stored inside the process, which means every
> process has to have an API to export its current heap information upon
> request, or it has to be stored externally in a filesystem that
> somebody needs to clean up on crashes.  It needs to be readable while
> the process is still running, so it has to have some sort of
> synchronization with every layer of userspace.  Efficiently tracking
> the ranges requires reimplementing something like the kernel vma
> trees, and linking to it from every layer of userspace.  It requires
> more memory, more syscalls, more runtime cost, and more complexity to
> separately track regions that the kernel is already tracking.
>
> This feature is considered critical enough that Dalvik (Android's VM)
> uses ashmem, which is effectively deleted tmpfs files, solely to name
> their heaps.   I'd like to get rid of as much ashmem use within
> Android as possible, with an eye towards deprecating it.  ashmem heaps
> work reasonably well for a VM, which is likely to want a single
> contiguous region of address space that it will manage on its own, but
> falls apart for malloc, which often wants small kernel-allocated
> address space regions that may or may not merge with adjacent regions.
>  Blindly using ashmem/deleted tmpfs files instead of anonymous mmaps
> in malloc doubled the number of vmas in our main system process and
> was worse for the GLBenchmark process.
>
> As a concrete example of its usefulness (which should not be
> considered the extent of its usefulness, it's just what I happened to
> be looking at), I was recently tracking down why we were seeing many
> dirty private pages that were all zeroes being merged by KSM.  Using a
> mixture of ashmem naming and an early version of this patch, I could
> slice the the number of KSM merged pages per process and per heap,
> which then told me which heap debugging tools I should use to find who
> was dirtying large regions of zeroes.

Peter, any thoughts on this?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2013-09-26  1:24 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-07-12  2:34 [PATCH 1/2] mm: rearrange madvise code to allow for reuse Colin Cross
2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
2013-07-12  5:39   ` Pekka Enberg
2013-07-12  8:13     ` Peter Zijlstra
2013-07-12  8:17       ` Peter Zijlstra
2013-07-12  8:44         ` Ingo Molnar
2013-07-12  8:55           ` Pekka Enberg
2013-07-12  9:00           ` Peter Zijlstra
2013-07-12  9:15             ` Ingo Molnar
2013-07-12  9:27               ` Peter Zijlstra
2013-07-12  9:40                 ` Ingo Molnar
2013-07-12  9:49                   ` Peter Zijlstra
2013-07-12 10:01                     ` Ingo Molnar
2013-07-12 20:51                     ` Colin Cross
2013-09-26  1:24                       ` Colin Cross
2013-07-12  8:21       ` Pekka Enberg
2013-07-12  8:55         ` Peter Zijlstra
2013-07-12  9:04           ` Pekka Enberg
2013-07-12  9:14             ` Peter Zijlstra
2013-07-12  9:28               ` Ingo Molnar
2013-07-12  9:26             ` Ingo Molnar
2013-07-12  9:38               ` Pekka Enberg
2013-07-12  9:45                 ` Ingo Molnar
2013-07-12 10:09                   ` Peter Zijlstra
2013-07-12  5:43   ` Pekka Enberg
2013-07-12  6:18     ` Colin Cross
2013-07-12  7:03       ` Pekka Enberg
2013-07-12  6:36   ` Dave Hansen
2013-07-12  6:42     ` Colin Cross
2013-07-14 14:11   ` Oleg Nesterov
2013-07-14 19:27     ` Colin Cross
2013-07-14 14:17   ` Oleg Nesterov
2013-07-14 19:34     ` Colin Cross

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox