[PATCH] per-node page breakout for /proc/<pid>/maps
From: Dave Hansen @ 2003-04-12 1:55 UTC
To: Martin J. Bligh; Cc: linux-mm
[-- Attachment #1: Type: text/plain, Size: 485 bytes --]
We're quite interested to see how effective our NUMA allocation
strategies are, and how fragmented things get. The following patch
modifies /proc/<pid>/maps to display the number of pages each map has
allocated on each node of the system.
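With the patch applied, a line for a file-backed mapping on a 4-node
box ends up looking something like this (addresses and counts made up):

  08048000-0804c000 r-xp 00000000 03:01 12345      /bin/cat # 3 0 1 0

The numbers after the '#' are the pages the map currently has on
nodes 0 through 3.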
This should have little effect on non-NUMA machines. It's aimed at
Martin's tree for now, but I figured I'd cc linux-mm just in case
anyone else was interested.
Tested on 4-node 16-way NUMA-Q and 4-way SMP.
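If anyone wants to poke at the output, below is a quick throwaway
userspace sketch (not part of the patch) that just dumps the per-node
counts for the current process:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/self/maps", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* the per-node counts follow the '#' marker, if present */
		char *counts = strchr(line, '#');
		if (counts)
			fputs(counts, stdout);
	}
	fclose(f);
	return 0;
}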
--
Dave Hansen
haveblue@us.ibm.com
[-- Attachment #2: pidmaps_nodepages-2.5.67-mjb1-0.patch --]
[-- Type: text/plain, Size: 5069 bytes --]
diff -ur linux-2.5.67-mjb1-clean/fs/proc/task_mmu.c linux-2.5.67-mjb1-pidmaps-nodepages/fs/proc/task_mmu.c
--- linux-2.5.67-mjb1-clean/fs/proc/task_mmu.c Thu Apr 10 21:51:16 2003
+++ linux-2.5.67-mjb1-pidmaps-nodepages/fs/proc/task_mmu.c Fri Apr 11 18:50:25 2003
@@ -2,6 +2,7 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <asm/uaccess.h>
+#include <asm/mmzone.h>
char *task_mem(struct mm_struct *mm, char *buffer)
{
@@ -111,7 +112,70 @@
#define MAPS_LINE_FORMAT (sizeof(void*) == 4 ? MAPS_LINE_FORMAT4 : MAPS_LINE_FORMAT8)
#define MAPS_LINE_MAX (sizeof(void*) == 4 ? MAPS_LINE_MAX4 : MAPS_LINE_MAX8)
-static int proc_pid_maps_get_line (char *buf, struct vm_area_struct *map)
+static int print_vma_nodepages(char *buf, struct mm_struct *mm, struct vm_area_struct *map)
+{
+	int retval = 0;
+	unsigned long vaddr = map->vm_start;
+	unsigned long vm_end = map->vm_end;
+	int pages_per_node[MAX_NR_NODES];
+	int i;
+
+	if (numnodes <= 1)
+		goto out;
+
+	for (i = 0; i < numnodes; i++)
+		pages_per_node[i] = 0;
+
+	for (; vaddr < vm_end; vaddr += PAGE_SIZE) {
+		pgd_t *pgd;
+		pmd_t *pmd;
+		pte_t *ptep;
+		pte_t pte = __pte(0);
+		unsigned long pfn = 0;
+
+		spin_lock(&mm->page_table_lock);
+		pgd = pgd_offset(mm, vaddr);
+		if (pgd_none(*pgd) || pgd_bad(*pgd))
+			goto next;
+
+		pmd = pmd_offset(pgd, vaddr);
+		if (pmd_none(*pmd))
+			goto next;
+		if (pmd_huge(*pmd)) {
+			/*
+			 * there have to be 86 gigillion ways to
+			 * state hugetlb page size, or the area mapped
+			 * by a pmd entry, or ... so just charge one
+			 * base page per loop pass to the huge page's node
+			 */
+			pages_per_node[pfn_to_nid(page_to_pfn(
+					pmd_page(*pmd)))]++;
+			goto next;
+		}
+		if (pmd_bad(*pmd))
+			goto next;
+
+		ptep = pte_offset_map(pmd, vaddr);
+		if (!ptep)
+			goto next;
+
+		pte = *ptep;
+		pte_unmap(ptep);
+ next:
+		spin_unlock(&mm->page_table_lock);
+		if (pte_present(pte))
+			pfn = pte_pfn(pte);
+		if (pfn) /* don't count the zero page */
+			pages_per_node[pfn_to_nid(pfn)]++;
+	}
+	retval += sprintf(&buf[retval], "#");
+	for (i = 0; i < numnodes; i++)
+		retval += sprintf(&buf[retval], " %d",
+				  pages_per_node[i]);
+
+out:
+	return retval;
+}
+
+static int proc_pid_maps_get_line (char *buf, struct mm_struct *mm,
+				   struct vm_area_struct *map)
{
	/* produce the next line */
	char *line;
@@ -133,12 +197,56 @@
	ino = 0;
	if (map->vm_file != NULL) {
		struct inode *inode = map->vm_file->f_dentry->d_inode;
+		int nplen, buf_left;
		dev = inode->i_sb->s_dev;
		ino = inode->i_ino;
+		/*
+		 * this is relatively disgusting. these functions are all
+		 * meant to print at the _end_ of the buffer that they're given.
+		 * I think this is to make the size calculation easier.
+		 *
+		 * if a print-into-buffer function is given a buffer and
+		 * then just returns a pointer into that buffer, it may take
+		 * an extra run through the buffer to figure out how much
+		 * was actually printed. This way, you can figure it out
+		 * by doing (buf_arg+buf_len)-returned_buf, instead of running
+		 * through it.
+		 *
+		 * why we don't just print into the beginning of the buffer
+		 * and return the number of bytes written (like sprintf) I
+		 * don't know.
+		 *
+		 * it doesn't look like these need to be null-terminated
+		 *
+		 * Dave Hansen <haveblue@us.ibm.com> 4-11-2003
+		 */
+
+		/*
+		 * since most of print_vma_nodepages()'s output is in decimal,
+		 * and the number of nodes isn't known at compile time, it is
+		 * hard to predetermine the length, which makes it extra
+		 * hard to print into the end of a buffer.
+		 *
+		 * here, we print to the beginning of the buffer, then move
+		 * it to the end
+		 */
+		nplen = print_vma_nodepages(buf, mm, map);
+		BUG_ON(nplen > (PAGE_SIZE/2));
+		/* leave space for the \n */
+		buf_left = PAGE_SIZE - nplen - 1;
+		memmove(buf+buf_left, buf, nplen);
+		memset(buf, 0, nplen);
+		buf[PAGE_SIZE-1] = '\n';
+
+		/*
+		 * d_path is already designed to fill from the back of the buffer
+		 * to the front
+		 */
		line = d_path(map->vm_file->f_dentry,
			      map->vm_file->f_vfsmnt,
-			      buf, PAGE_SIZE);
-		buf[PAGE_SIZE-1] = '\n';
+			      buf, buf_left);
+		/* replace d_path's terminating NULL with a space */
+		buf[buf_left-1] = ' ';
		line -= MAPS_LINE_MAX;
		if(line < buf)
			line = buf;
@@ -207,7 +315,7 @@
			off -= PAGE_SIZE;
			goto next;
		}
-		len = proc_pid_maps_get_line(tmp, map);
+		len = proc_pid_maps_get_line(tmp, mm, map);
		len -= off;
		if (len > 0) {
			if (retval+len > count) {
diff -ur linux-2.5.67-mjb1-clean/include/asm-i386/mmzone.h linux-2.5.67-mjb1-pidmaps-nodepages/include/asm-i386/mmzone.h
--- linux-2.5.67-mjb1-clean/include/asm-i386/mmzone.h Thu Apr 10 21:51:23 2003
+++ linux-2.5.67-mjb1-pidmaps-nodepages/include/asm-i386/mmzone.h Fri Apr 11 00:15:11 2003
@@ -8,7 +8,11 @@
#include <asm/smp.h>
-#ifdef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_DISCONTIGMEM
+
+#define pfn_to_nid(pfn) (0)
+
+#else
extern struct pglist_data *node_data[];