Subject: [PATCH] export NUMA allocation fragmentation
From: Dave Hansen
Date: 2003-01-31  3:39 UTC
To: Martin J. Bligh
Cc: linux-mm

[-- Attachment #1: Type: text/plain, Size: 1317 bytes --]

The NUMA memory allocation support attempts to allocate pages close to
the CPU a process is currently running on.  We have a hard time
determining how effective these strategies are, or how fragmented a
process's allocations become if it is bounced around between nodes.
This patch adds a new /proc/<pid> entry: nodepages.

It walks all of the process's vm_area_structs, then examines the ptes
for each virtual address range to determine the node on which each
page physically resides.

I'm a little worried about just taking the pte from __follow_page() and
dumping it into pte_pfn().  Is there something I should be testing for
before I feed it along?
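
Something along these lines is what I have in mind -- an untested
sketch, and I'm guessing that pte_present() and pfn_valid() are the
right checks:

	pte = __follow_page(mm, vaddr);
	if (pte_present(pte)) {		/* skip swapped-out/unmapped entries */
		unsigned long pfn = pte_pfn(pte);
		if (pfn_valid(pfn))	/* don't index nids[] with a bogus pfn */
			nids[pfn_to_nid(pfn)]++;
	}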

I've tested it on both NUMA and non-NUMA systems (see the pfn_to_nid()
changes).  The output below is from a 4-quad, 16-processor NUMAQ.

This is a process that allocates, then faults in, a 256MB chunk of
memory while bound to CPU 4 (node 1):
curly:~# cat /proc/378/nodepages
Node 0 pages: 369
Node 1 pages: 65571
Node 2 pages: 0
Node 3 pages: 0
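
The test program is roughly the following -- a reconstructed sketch,
not the exact source, and it uses the current glibc cpu_set_t affinity
interface rather than the raw syscall:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		cpu_set_t mask;
		size_t size = 256UL << 20;	/* 256MB */
		char *buf;

		/* bind to CPU 4, which sits on node 1 on this box */
		CPU_ZERO(&mask);
		CPU_SET(4, &mask);
		if (sched_setaffinity(0, sizeof(mask), &mask) < 0) {
			perror("sched_setaffinity");
			return 1;
		}

		buf = malloc(size);
		if (!buf)
			return 1;
		memset(buf, 1, size);	/* touch every page to fault it in */

		printf("pid %d ready\n", getpid());
		pause();	/* park so /proc/<pid>/nodepages can be read */
		return 0;
	}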

Here is the same thing, bound to CPU 12 (node 3).  The process was
probably forked on node 1, before it was bound:
Node 0 pages: 369
Node 1 pages: 2
Node 2 pages: 0
Node 3 pages: 65569

I would imagine that the pages on node 0 are from libc, which was
originally mapped on node 0; the other processes inherit those mappings.
-- 
Dave Hansen
haveblue@us.ibm.com

[-- Attachment #2: proc-pid-nodepages-2.5.59-mjb2-1.patch --]
[-- Type: text/plain, Size: 4759 bytes --]

diff -ru linux-2.5.59-mjb2-clean/fs/proc/base.c linux-2.5.59-mjb2-vma-stat/fs/proc/base.c
--- linux-2.5.59-mjb2-clean/fs/proc/base.c	Wed Jan 29 19:02:49 2003
+++ linux-2.5.59-mjb2-vma-stat/fs/proc/base.c	Thu Jan 30 17:57:51 2003
@@ -45,6 +45,7 @@
 enum pid_directory_inos {
 	PROC_PID_INO = 2,
 	PROC_PID_STATUS,
+	PROC_PID_NODE_PAGES,
 	PROC_PID_MEM,
 	PROC_PID_CWD,
 	PROC_PID_ROOT,
@@ -72,6 +73,7 @@
   E(PROC_PID_FD,	"fd",		S_IFDIR|S_IRUSR|S_IXUSR),
   E(PROC_PID_ENVIRON,	"environ",	S_IFREG|S_IRUSR),
   E(PROC_PID_STATUS,	"status",	S_IFREG|S_IRUGO),
+  E(PROC_PID_NODE_PAGES,"nodepages",	S_IFREG|S_IRUGO),
   E(PROC_PID_CMDLINE,	"cmdline",	S_IFREG|S_IRUGO),
   E(PROC_PID_STAT,	"stat",		S_IFREG|S_IRUGO),
   E(PROC_PID_STATM,	"statm",	S_IFREG|S_IRUGO),
@@ -102,6 +104,7 @@
 int proc_pid_status(struct task_struct*,char*);
 int proc_pid_statm(struct task_struct*,char*);
 int proc_pid_cpu(struct task_struct*,char*);
+int proc_pid_nodepages(struct task_struct*,char*);
 
 static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
@@ -1012,6 +1015,10 @@
 		case PROC_PID_STATUS:
 			inode->i_fop = &proc_info_file_operations;
 			ei->op.proc_read = proc_pid_status;
+			break;
+		case PROC_PID_NODE_PAGES:
+			inode->i_fop = &proc_info_file_operations;
+			ei->op.proc_read = proc_pid_nodepages;
 			break;
 		case PROC_PID_STAT:
 			inode->i_fop = &proc_info_file_operations;
diff -ru linux-2.5.59-mjb2-clean/fs/proc/task_mmu.c linux-2.5.59-mjb2-vma-stat/fs/proc/task_mmu.c
--- linux-2.5.59-mjb2-clean/fs/proc/task_mmu.c	Wed Jan 29 19:02:49 2003
+++ linux-2.5.59-mjb2-vma-stat/fs/proc/task_mmu.c	Thu Jan 30 19:25:54 2003
@@ -2,6 +2,7 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <asm/uaccess.h>
+#include <asm/mmzone.h>
 
 char *task_mem(struct mm_struct *mm, char *buffer)
 {
@@ -243,5 +244,56 @@
 out_free1:
 	free_page((unsigned long)kbuf);
 out:
+	return retval;
+}
+
+extern pte_t
+__follow_page(struct mm_struct *mm, unsigned long address);
+
+int proc_pid_nodepages(struct task_struct *task, char *buf)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *map;
+	long retval;
+	int nids[MAX_NR_NODES];
+	int i;
+
+	for (i = 0; i < numnodes; i++)
+		nids[i] = 0;
+
+	/*
+	 * get_task_mm() can sleep, so take the mm reference up front.
+	 */
+	mm = get_task_mm(task);
+
+	if (!mm) {
+		printk("%s(): !mm !!\n", __FUNCTION__);
+		return 0;
+	}
+
+	retval = 0;
+
+	down_read(&mm->mmap_sem);
+	map = mm->mmap;
+	while (map) {
+		unsigned long vaddr = map->vm_start;
+		unsigned long vm_end = map->vm_end;
+		pte_t pte;
+		unsigned long pfn;
+
+		for (; vaddr < vm_end; vaddr += PAGE_SIZE) {
+			pte = __follow_page(mm, vaddr);
+			pfn = pte_pfn(pte);
+			nids[pfn_to_nid(pfn)]++;
+		}
+		map = map->vm_next;
+	}
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+
+	for (i = 0; i < numnodes; i++) {
+		retval += sprintf(&buf[retval], "Node %d pages: %d\n",
+				i, nids[i]);
+	}
 	return retval;
 }
diff -ru linux-2.5.59-mjb2-clean/include/asm-i386/mmzone.h linux-2.5.59-mjb2-vma-stat/include/asm-i386/mmzone.h
--- linux-2.5.59-mjb2-clean/include/asm-i386/mmzone.h	Wed Jan 29 19:02:38 2003
+++ linux-2.5.59-mjb2-vma-stat/include/asm-i386/mmzone.h	Thu Jan 30 19:25:54 2003
@@ -8,14 +8,17 @@
 
 #include <asm/smp.h>
 
-#ifdef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_DISCONTIGMEM
+
+#define pfn_to_nid(pfn)		(0)
+
+#else
 
 #ifdef CONFIG_X86_NUMAQ
 #include <asm/numaq.h>
 #elif CONFIG_X86_SUMMIT
 #include <asm/srat.h>
 #else
-#define pfn_to_nid(pfn)		(0)
 #endif /* CONFIG_X86_NUMAQ */
 
 extern struct pglist_data *node_data[];
diff -ru linux-2.5.59-mjb2-clean/mm/memory.c linux-2.5.59-mjb2-vma-stat/mm/memory.c
--- linux-2.5.59-mjb2-clean/mm/memory.c	Wed Jan 29 19:02:54 2003
+++ linux-2.5.59-mjb2-vma-stat/mm/memory.c	Thu Jan 30 16:45:55 2003
@@ -612,13 +612,12 @@
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
  */
-struct page *
-follow_page(struct mm_struct *mm, unsigned long address, int write) 
+pte_t
+__follow_page(struct mm_struct *mm, unsigned long address)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
-	pte_t *ptep, pte;
-	unsigned long pfn;
+	pte_t *ptep, pte = __pte(0);	/* early exits return a clean "no page" pte */
 
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || pgd_bad(*pgd))
@@ -629,11 +628,23 @@
 		goto out;
 
 	ptep = pte_offset_map(pmd, address);
 	if (!ptep)
 		goto out;
 
 	pte = *ptep;
 	pte_unmap(ptep);
+
+out:
+	return pte;
+}
+
+struct page *
+follow_page(struct mm_struct *mm, unsigned long address, int write)
+{
+	pte_t pte;
+	unsigned long pfn;
+
+	pte = __follow_page(mm, address);
 	if (pte_present(pte)) {
 		if (!write || (pte_write(pte) && pte_dirty(pte))) {
 			pfn = pte_pfn(pte);
@@ -642,7 +653,6 @@
 		}
 	}
 
-out:
 	return NULL;
 }
 
