linux-mm.kvack.org archive mirror
* Re: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
@ 2005-05-09 13:30 Wolfgang Wander
  2005-05-09 21:26 ` Andrew Morton
  0 siblings, 1 reply; 19+ messages in thread
From: Wolfgang Wander @ 2005-05-09 13:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ingo Molnar, Arjan van de Ven, linux-mm, wwc

Hi,

   I'm resending this message since I haven't got any response other
than from another affected user (who was helped by the patch).

   Can you please take a quick look and tell me what I need to
do to get something like it accepted?

            Wolfgang

Andrew Morton writes:
 > 
 > Guys, Wolfgang has found what appears to be a serious mmap fragmentation
 > problem with the mm_struct.free_area_cache.
 > 

Andrew asked me to send the appended patch also to the list for
comments:

  ------

   In addition to the free_area_cache I've added another member
called cached_hole_size which contains the largest hole we have found
up to the position of free_area_cache.  Thus if we come in with a new
request, we know that we had better start from scratch if the requested
length is less than or equal to the cached_hole_size.
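
   The core of the change, as a condensed sketch of the bottom-up case
only (the per-architecture code in the patch below differs in alignment
handling, and the -ENOMEM/full_search restart is omitted here):

        if (len <= mm->cached_hole_size) {
                /* a big enough hole is known to exist below the
                 * cache, so rescan from the base */
                addr = TASK_UNMAPPED_BASE;
                mm->cached_hole_size = 0;
        } else
                addr = mm->free_area_cache;

        for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
                if (!vma || addr + len <= vma->vm_start) {
                        /* found a fit; cache the end of it */
                        mm->free_area_cache = addr + len;
                        return addr;
                }
                /* remember the largest hole we stepped over */
                if (addr + mm->cached_hole_size < vma->vm_start)
                        mm->cached_hole_size = vma->vm_start - addr;
                addr = vma->vm_end;
        }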

   I've tried to patch all available architectures but of course have
not even tried to compile them all.  So far only i386 and x86_64 have
been tested.

   It avoids fragmentation (as 2.4 did) and should still be faster
than the uncached version I hacked earlier.  And yes - check how
I implemented the largest unsigned long (~0UL); I'm not sure whether
that is OK by your standards...

                Wolfgang


diff -ru linux-2.6.11.7/arch/arm/mm/mmap.c linux-2.6.11.7.wwc/arch/arm/mm/mmap.c
--- linux-2.6.11.7/arch/arm/mm/mmap.c	2005-03-02 02:38:10.000000000 -0500
+++ linux-2.6.11.7.wwc/arch/arm/mm/mmap.c	2005-04-27 09:19:19.000000000 -0400
@@ -73,8 +73,13 @@
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
-
+	if( len > mm->cached_hole_size )
+	        start_addr = addr = mm->free_area_cache;
+	else {
+	        start_addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
+	
 full_search:
 	if (do_align)
 		addr = COLOUR_ALIGN(addr, pgoff);
@@ -90,6 +95,7 @@
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -101,6 +107,8 @@
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 		if (do_align)
 			addr = COLOUR_ALIGN(addr, pgoff);
diff -ru linux-2.6.11.7/arch/i386/mm/hugetlbpage.c linux-2.6.11.7.wwc/arch/i386/mm/hugetlbpage.c
--- linux-2.6.11.7/arch/i386/mm/hugetlbpage.c	2005-03-02 02:38:26.000000000 -0500
+++ linux-2.6.11.7.wwc/arch/i386/mm/hugetlbpage.c	2005-04-27 12:41:42.000000000 -0400
@@ -298,7 +298,12 @@
 	struct vm_area_struct *vma;
 	unsigned long start_addr;
 
-	start_addr = mm->free_area_cache;
+	if( len > mm->cached_hole_size ) 
+	        start_addr = mm->free_area_cache;
+	else {
+	        start_addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	addr = ALIGN(start_addr, HPAGE_SIZE);
@@ -312,6 +317,7 @@
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -320,6 +326,8 @@
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
 	}
 }
@@ -331,12 +339,17 @@
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev_vma;
 	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
 	int first_time = 1;
 
 	/* don't allow allocations above current base */
 	if (mm->free_area_cache > base)
 		mm->free_area_cache = base;
 
+	if( len <= largest_hole ) {
+	        largest_hole = 0;
+		mm->free_area_cache  = base;
+	}
 try_again:
 	/* make sure it can fit in the remaining address space */
 	if (mm->free_area_cache < len)
@@ -357,13 +370,20 @@
 		 * vma->vm_start, use it:
 		 */
 		if (addr + len <= vma->vm_start &&
-				(!prev_vma || (addr >= prev_vma->vm_end)))
+		        (!prev_vma || (addr >= prev_vma->vm_end))) {
 			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr);
-		else
+		        mm->cached_hole_size = largest_hole;
+		        return (mm->free_area_cache = addr);
+		} else
 			/* pull free_area_cache down to the first hole */
-			if (mm->free_area_cache == vma->vm_end)
+		        if (mm->free_area_cache == vma->vm_end) {
 				mm->free_area_cache = vma->vm_start;
+				mm->cached_hole_size = largest_hole;
+			}
+				
+		/* remember the largest hole we saw so far */
+		if( addr + largest_hole < vma->vm_start )
+		        largest_hole = vma->vm_start - addr;
 
 		/* try just below the current vma->vm_start */
 		addr = (vma->vm_start - len) & HPAGE_MASK;
@@ -376,6 +396,7 @@
 	 */
 	if (first_time) {
 		mm->free_area_cache = base;
+		largest_hole = 0;
 		first_time = 0;
 		goto try_again;
 	}
@@ -386,6 +407,7 @@
 	 * allocations.
 	 */
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 	addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
 			len, pgoff, flags);
 
@@ -393,7 +415,8 @@
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = base;
-
+	mm->cached_hole_size = ~0UL;
+	
 	return addr;
 }
 
diff -ru linux-2.6.11.7/arch/ia64/kernel/sys_ia64.c linux-2.6.11.7.wwc/arch/ia64/kernel/sys_ia64.c
--- linux-2.6.11.7/arch/ia64/kernel/sys_ia64.c	2005-03-02 02:38:10.000000000 -0500
+++ linux-2.6.11.7.wwc/arch/ia64/kernel/sys_ia64.c	2005-04-27 09:19:19.000000000 -0400
@@ -38,9 +38,15 @@
 	if (REGION_NUMBER(addr) == REGION_HPAGE)
 		addr = 0;
 #endif
-	if (!addr)
-		addr = mm->free_area_cache;
-
+	if (!addr) {
+	        if( len > mm->cached_hole_size )
+		        addr = mm->free_area_cache;
+		else {
+		        addr = TASK_UNMAPPED_BASE;
+			mm->cached_hole_size = 0;
+		}
+	}
+			
 	if (map_shared && (TASK_SIZE > 0xfffffffful))
 		/*
 		 * For 64-bit tasks, align shared segments to 1MB to avoid potential
@@ -59,6 +65,7 @@
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				/* Start a new search --- just in case we missed some holes.  */
 				addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -68,6 +75,8 @@
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = (vma->vm_end + align_mask) & ~align_mask;
 	}
 }
diff -ru linux-2.6.11.7/arch/ppc64/mm/hugetlbpage.c linux-2.6.11.7.wwc/arch/ppc64/mm/hugetlbpage.c
--- linux-2.6.11.7/arch/ppc64/mm/hugetlbpage.c	2005-03-02 02:38:09.000000000 -0500
+++ linux-2.6.11.7.wwc/arch/ppc64/mm/hugetlbpage.c	2005-04-27 12:43:52.000000000 -0400
@@ -515,7 +515,12 @@
 		    && !is_hugepage_only_range(addr,len))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if( len > mm->cached_hole_size ) 
+	        start_addr = addr = mm->free_area_cache;
+	else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	vma = find_vma(mm, addr);
@@ -539,6 +544,8 @@
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 		vma = vma->vm_next;
 	}
@@ -546,6 +553,7 @@
 	/* Make sure we didn't miss any holes */
 	if (start_addr != TASK_UNMAPPED_BASE) {
 		start_addr = addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
 		goto full_search;
 	}
 	return -ENOMEM;
@@ -567,6 +575,7 @@
 	struct vm_area_struct *vma, *prev_vma;
 	struct mm_struct *mm = current->mm;
 	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
 	int first_time = 1;
 
 	/* requested length too big for entire address space */
@@ -587,6 +596,10 @@
 			return addr;
 	}
 
+	if( len <= largest_hole ) {
+	        largest_hole = 0;
+		mm->free_area_cache  = base;
+	}
 try_again:
 	/* make sure it can fit in the remaining address space */
 	if (mm->free_area_cache < len)
@@ -615,13 +628,21 @@
 		 * vma->vm_start, use it:
 		 */
 		if (addr+len <= vma->vm_start &&
-				(!prev_vma || (addr >= prev_vma->vm_end)))
+		          (!prev_vma || (addr >= prev_vma->vm_end))) {
 			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr);
+		        mm->cached_hole_size = largest_hole;
+		        return (mm->free_area_cache = addr);
+		}
 		else
 			/* pull free_area_cache down to the first hole */
-			if (mm->free_area_cache == vma->vm_end)
+		        if (mm->free_area_cache == vma->vm_end) {
 				mm->free_area_cache = vma->vm_start;
+				mm->cached_hole_size = largest_hole;
+			}
+				
+		/* remember the largest hole we saw so far */
+		if( addr + largest_hole < vma->vm_start )
+		        largest_hole = vma->vm_start - addr;
 
 		/* try just below the current vma->vm_start */
 		addr = vma->vm_start-len;
@@ -634,6 +655,7 @@
 	 */
 	if (first_time) {
 		mm->free_area_cache = base;
+		largest_hole = 0;
 		first_time = 0;
 		goto try_again;
 	}
@@ -644,12 +666,14 @@
 	 * allocations.
 	 */
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 	/*
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = base;
-
+	mm->cached_hole_size = ~0UL;
+	
 	return addr;
 }
 
diff -ru linux-2.6.11.7/arch/sh/kernel/sys_sh.c linux-2.6.11.7.wwc/arch/sh/kernel/sys_sh.c
--- linux-2.6.11.7/arch/sh/kernel/sys_sh.c	2005-03-02 02:38:34.000000000 -0500
+++ linux-2.6.11.7.wwc/arch/sh/kernel/sys_sh.c	2005-04-27 09:19:19.000000000 -0400
@@ -79,6 +79,10 @@
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
+	if( len <= mm->cached_hole_size ) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = TASK_UNMAPPED_BASE;
+	}
 	if (flags & MAP_PRIVATE)
 		addr = PAGE_ALIGN(mm->free_area_cache);
 	else
@@ -95,6 +99,7 @@
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -106,6 +111,9 @@
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
+		
 		addr = vma->vm_end;
 		if (!(flags & MAP_PRIVATE))
 			addr = COLOUR_ALIGN(addr);
diff -ru linux-2.6.11.7/arch/sparc64/kernel/sys_sparc.c linux-2.6.11.7.wwc/arch/sparc64/kernel/sys_sparc.c
--- linux-2.6.11.7/arch/sparc64/kernel/sys_sparc.c	2005-03-02 02:38:10.000000000 -0500
+++ linux-2.6.11.7.wwc/arch/sparc64/kernel/sys_sparc.c	2005-04-27 09:19:19.000000000 -0400
@@ -84,6 +84,10 @@
 			return addr;
 	}
 
+	if( len <= mm->cached_hole_size ) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = TASK_UNMAPPED_BASE;
+	}
 	start_addr = addr = mm->free_area_cache;
 
 	task_size -= len;
@@ -103,6 +107,7 @@
 		if (task_size < addr) {
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -114,6 +119,9 @@
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
+		
 		addr = vma->vm_end;
 		if (do_color_align)
 			addr = COLOUR_ALIGN(addr, pgoff);
diff -ru linux-2.6.11.7/arch/x86_64/ia32/ia32_aout.c linux-2.6.11.7.wwc/arch/x86_64/ia32/ia32_aout.c
--- linux-2.6.11.7/arch/x86_64/ia32/ia32_aout.c	2005-03-02 02:38:33.000000000 -0500
+++ linux-2.6.11.7.wwc/arch/x86_64/ia32/ia32_aout.c	2005-04-27 09:19:19.000000000 -0400
@@ -312,6 +312,7 @@
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
+	current->mm->cached_hole_size = 0;
 
 	current->mm->rss = 0;
 	current->mm->mmap = NULL;
diff -ru linux-2.6.11.7/arch/x86_64/kernel/sys_x86_64.c linux-2.6.11.7.wwc/arch/x86_64/kernel/sys_x86_64.c
--- linux-2.6.11.7/arch/x86_64/kernel/sys_x86_64.c	2005-03-02 02:38:13.000000000 -0500
+++ linux-2.6.11.7.wwc/arch/x86_64/kernel/sys_x86_64.c	2005-04-27 09:19:19.000000000 -0400
@@ -112,6 +112,10 @@
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
+	if( len <= mm->cached_hole_size ) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = begin;
+	}
 	addr = mm->free_area_cache;
 	if (addr < begin) 
 		addr = begin; 
@@ -127,6 +131,7 @@
 			 */
 			if (start_addr != begin) {
 				start_addr = addr = begin;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -138,6 +143,9 @@
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
+		
 		addr = vma->vm_end;
 	}
 }
diff -ru linux-2.6.11.7/fs/binfmt_aout.c linux-2.6.11.7.wwc/fs/binfmt_aout.c
--- linux-2.6.11.7/fs/binfmt_aout.c	2005-03-02 02:38:37.000000000 -0500
+++ linux-2.6.11.7.wwc/fs/binfmt_aout.c	2005-04-27 09:19:19.000000000 -0400
@@ -316,6 +316,7 @@
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = 0;
 
 	current->mm->rss = 0;
 	current->mm->mmap = NULL;
diff -ru linux-2.6.11.7/fs/binfmt_elf.c linux-2.6.11.7.wwc/fs/binfmt_elf.c
--- linux-2.6.11.7/fs/binfmt_elf.c	2005-04-27 13:15:09.000000000 -0400
+++ linux-2.6.11.7.wwc/fs/binfmt_elf.c	2005-04-27 09:19:19.000000000 -0400
@@ -766,6 +766,8 @@
 	   change some of these later */
 	current->mm->rss = 0;
 	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = 0;
+	
 	retval = setup_arg_pages(bprm, STACK_TOP, executable_stack);
 	if (retval < 0) {
 		send_sig(SIGKILL, current, 0);
diff -ru linux-2.6.11.7/fs/hugetlbfs/inode.c linux-2.6.11.7.wwc/fs/hugetlbfs/inode.c
--- linux-2.6.11.7/fs/hugetlbfs/inode.c	2005-03-02 02:38:25.000000000 -0500
+++ linux-2.6.11.7.wwc/fs/hugetlbfs/inode.c	2005-04-27 12:39:19.000000000 -0400
@@ -122,6 +122,11 @@
 
 	start_addr = mm->free_area_cache;
 
+	if(len <= mm->cached_hole_size ) 
+		start_addr = TASK_UNMAPPED_BASE;
+
+
+
 full_search:
 	addr = ALIGN(start_addr, HPAGE_SIZE);
 
diff -ru linux-2.6.11.7/include/linux/sched.h linux-2.6.11.7.wwc/include/linux/sched.h
--- linux-2.6.11.7/include/linux/sched.h	2005-03-02 02:37:48.000000000 -0500
+++ linux-2.6.11.7.wwc/include/linux/sched.h	2005-04-27 09:19:19.000000000 -0400
@@ -212,8 +212,9 @@
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
 	void (*unmap_area) (struct vm_area_struct *area);
-	unsigned long mmap_base;		/* base of mmap area */
-	unsigned long free_area_cache;		/* first hole */
+        unsigned long mmap_base;		/* base of mmap area */
+        unsigned long cached_hole_size;         /* if non-zero, the largest hole below free_area_cache */
+	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
 	pgd_t * pgd;
 	atomic_t mm_users;			/* How many users with user space? */
 	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
diff -ru linux-2.6.11.7/kernel/fork.c linux-2.6.11.7.wwc/kernel/fork.c
--- linux-2.6.11.7/kernel/fork.c	2005-03-02 02:37:48.000000000 -0500
+++ linux-2.6.11.7.wwc/kernel/fork.c	2005-04-27 12:44:24.000000000 -0400
@@ -173,6 +173,7 @@
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
 	mm->free_area_cache = oldmm->mmap_base;
+	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
 	mm->rss = 0;
 	mm->anon_rss = 0;
@@ -301,7 +302,8 @@
 	mm->ioctx_list = NULL;
 	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
-
+	mm->cached_hole_size = ~0UL;
+	
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
 		return mm;
diff -ru linux-2.6.11.7/mm/mmap.c linux-2.6.11.7.wwc/mm/mmap.c
--- linux-2.6.11.7/mm/mmap.c	2005-03-02 02:38:12.000000000 -0500
+++ linux-2.6.11.7.wwc/mm/mmap.c	2005-04-27 12:57:00.000000000 -0400
@@ -1173,7 +1173,12 @@
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if( len > mm->cached_hole_size ) 
+	        start_addr = addr = mm->free_area_cache;
+	else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
@@ -1184,7 +1189,8 @@
 			 * some holes.
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
-				start_addr = addr = TASK_UNMAPPED_BASE;
+			        start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -1196,6 +1202,8 @@
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 	}
 }
@@ -1207,8 +1215,13 @@
 	 * Is this a new hole at the lowest possible address?
 	 */
 	if (area->vm_start >= TASK_UNMAPPED_BASE &&
-			area->vm_start < area->vm_mm->free_area_cache)
-		area->vm_mm->free_area_cache = area->vm_start;
+	    area->vm_start < area->vm_mm->free_area_cache) {
+	        unsigned long area_size = area->vm_end - area->vm_start;
+		if( area->vm_mm->cached_hole_size < area_size ) 
+		        area->vm_mm->cached_hole_size = area_size;
+		else
+		        area->vm_mm->cached_hole_size = ~0UL;
+	}
 }
 
 /*
@@ -1224,6 +1237,7 @@
 	struct vm_area_struct *vma, *prev_vma;
 	struct mm_struct *mm = current->mm;
 	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
 	int first_time = 1;
 
 	/* requested length too big for entire address space */
@@ -1243,6 +1257,10 @@
 			return addr;
 	}
 
+	if( len <= mm->cached_hole_size ) {
+	        largest_hole = 0;
+		mm->free_area_cache  = base;
+	}
 try_again:
 	/* make sure it can fit in the remaining address space */
 	if (mm->free_area_cache < len)
@@ -1263,13 +1281,20 @@
 		 * vma->vm_start, use it:
 		 */
 		if (addr+len <= vma->vm_start &&
-				(!prev_vma || (addr >= prev_vma->vm_end)))
+		        (!prev_vma || (addr >= prev_vma->vm_end))) {
 			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr);
-		else
+		        mm->cached_hole_size = largest_hole;
+		        return (mm->free_area_cache = addr);
+		} else
 			/* pull free_area_cache down to the first hole */
-			if (mm->free_area_cache == vma->vm_end)
+		        if (mm->free_area_cache == vma->vm_end) {
 				mm->free_area_cache = vma->vm_start;
+				mm->cached_hole_size = largest_hole;
+			}
+				
+		/* remember the largest hole we saw so far */
+		if( addr + largest_hole < vma->vm_start )
+		        largest_hole = vma->vm_start - addr;
 
 		/* try just below the current vma->vm_start */
 		addr = vma->vm_start-len;
@@ -1282,6 +1307,7 @@
 	 */
 	if (first_time) {
 		mm->free_area_cache = base;
+		largest_hole = 0;
 		first_time = 0;
 		goto try_again;
 	}
@@ -1292,12 +1318,14 @@
 	 * allocations.
 	 */
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 	/*
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = base;
-
+	mm->cached_hole_size = ~0UL;
+	
 	return addr;
 }
 #endif
@@ -1307,8 +1335,13 @@
 	/*
 	 * Is this a new hole at the highest possible address?
 	 */
-	if (area->vm_end > area->vm_mm->free_area_cache)
-		area->vm_mm->free_area_cache = area->vm_end;
+        if (area->vm_end > area->vm_mm->free_area_cache) {
+	        unsigned long area_size = area->vm_end - area->vm_start;
+		if( area->vm_mm->cached_hole_size < area_size ) 
+		        area->vm_mm->cached_hole_size = area_size;
+		else
+		        area->vm_mm->cached_hole_size = ~0UL;
+	}
 }
 
 unsigned long

* Re: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
  2005-05-09 13:30 Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to Wolfgang Wander
@ 2005-05-09 21:26 ` Andrew Morton
  2005-05-09 21:30   ` Arjan van de Ven
  2005-05-10 15:35   ` Chen, Kenneth W
  0 siblings, 2 replies; 19+ messages in thread
From: Andrew Morton @ 2005-05-09 21:26 UTC (permalink / raw)
  To: Wolfgang Wander; +Cc: mingo, arjanv, linux-mm

Wolfgang Wander <wwc@rentec.com> wrote:
>
>    I'm resending this message since I haven't got any response other
> than from another affected user (who was helped by the patch).

It hasn't been forgotten.  I'd prefer to not release 2.6.12 until we've
resolved this.

>    Can you please take a short glimpse and tell me what I need to 
> do to get something like it accepted?

Keep sending emails ;)

Possibly for the 2.6.12 release the safest approach would be to just
disable the free area cache while we think about it.


* Re: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
  2005-05-09 21:26 ` Andrew Morton
@ 2005-05-09 21:30   ` Arjan van de Ven
  2005-05-10 13:23     ` Wolfgang Wander
  2005-05-10 15:35   ` Chen, Kenneth W
  1 sibling, 1 reply; 19+ messages in thread
From: Arjan van de Ven @ 2005-05-09 21:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Wolfgang Wander, mingo, linux-mm

On Mon, May 09, 2005 at 02:26:51PM -0700, Andrew Morton wrote:

> Possibly for the 2.6.12 release the safest approach would be to just
> disable the free area cache while we think about it.

the free area cache is historically tricky, to be fair; it has the
thankless job of either keeping to the "earliest" small hole (and thus being
useless if most allocs are bigger than that hole) or leaving an occasional
small hole alone and thus fragmenting memory more, like you've shown.
I like neither, to be honest; the price, however, is a higher lookup cost (well
mitigated if vma merging is really effective)

* Re: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
  2005-05-09 21:30   ` Arjan van de Ven
@ 2005-05-10 13:23     ` Wolfgang Wander
  0 siblings, 0 replies; 19+ messages in thread
From: Wolfgang Wander @ 2005-05-10 13:23 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: Andrew Morton, Wolfgang Wander, mingo, linux-mm

Arjan van de Ven writes:
 > On Mon, May 09, 2005 at 02:26:51PM -0700, Andrew Morton wrote:
 > 
 > > Possibly for the 2.6.12 release the safest approach would be to just
 > > disable the free area cache while we think about it.
 > 
 > the free area cache is historically tricky, to be fair; it has the
 > thankless job of either keeping to the "earliest" small hole (and thus being
 > useless if most allocs are bigger than that hole) or leaving an occasional
 > small hole alone and thus fragmenting memory more, like you've shown.
 > I like neither, to be honest; the price, however, is a higher lookup cost (well
 > mitigated if vma merging is really effective)

My approach tries to find a compromise. It changes the behaviour so
that it keeps the size of the largest hole it has found below the cache,
so that it knows for which request sizes to disregard the cache and
start from scratch.  By doing so it also avoids setting the cache
to the latest munmap-ed region (and adjusts the largest_hole below
the cache pointer instead).  Setting the cache pointer to the latest
munmap was also a major contributor to the fragmentation:

Imagine 5 consecutive maps of 1K, then free maps 2, 4, 3 in that order.
The next mmap will get its space from the original area 3, leaving gaps
on either side.
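
A small userland demo along those lines (illustrative only: mappings
are page-granular, so it uses page-sized regions instead of literal 1K
ones, and whether the five mappings really end up adjacent depends on
the kernel's allocator - which is exactly what is being demonstrated):

        #define _GNU_SOURCE
        #include <stdio.h>
        #include <stdlib.h>
        #include <unistd.h>
        #include <sys/mman.h>

        int main(void)
        {
                long pg = sysconf(_SC_PAGESIZE);
                void *r[5];
                char cmd[64];
                int i;

                /* five small anonymous mappings, created back to back */
                for (i = 0; i < 5; i++)
                        r[i] = mmap(NULL, pg, PROT_READ | PROT_WRITE,
                                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

                /* free maps 2, 4, 3 (1-based), in that order */
                munmap(r[1], pg);
                munmap(r[3], pg);
                munmap(r[2], pg);

                /* with free_area_cache left at the last munmap, the next
                 * request lands where map 3 was, leaving two separate
                 * one-page holes instead of one contiguous hole */
                mmap(NULL, pg, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

                /* inspect the resulting layout */
                snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", (int)getpid());
                return system(cmd);
        }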

But I'm of course also very happy if the original 2.4 behaviour
without caching is restored...

                    Wolfgang

* RE: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
  2005-05-09 21:26 ` Andrew Morton
  2005-05-09 21:30   ` Arjan van de Ven
@ 2005-05-10 15:35   ` Chen, Kenneth W
  2005-05-10 18:58     ` Andrew Morton
  1 sibling, 1 reply; 19+ messages in thread
From: Chen, Kenneth W @ 2005-05-10 15:35 UTC (permalink / raw)
  To: 'Andrew Morton', Wolfgang Wander; +Cc: mingo, arjanv, linux-mm

Andrew Morton wrote on Monday, May 09, 2005 2:27 PM
> Possibly for the 2.6.12 release the safest approach would be to just
> disable the free area cache while we think about it.

I hope people are not thinking of permanently killing the free area
cache algorithm.  It is known to give a large performance gain on the
specweb SSL benchmark; I think the free area cache algorithm gives a
4-5% gain.


* Re: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
  2005-05-10 15:35   ` Chen, Kenneth W
@ 2005-05-10 18:58     ` Andrew Morton
  2005-05-10 19:07       ` Arjan van de Ven
  2005-05-10 19:34       ` Chen, Kenneth W
  0 siblings, 2 replies; 19+ messages in thread
From: Andrew Morton @ 2005-05-10 18:58 UTC (permalink / raw)
  To: Chen, Kenneth W; +Cc: wwc, mingo, arjanv, linux-mm

"Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
>
> Andrew Morton wrote on Monday, May 09, 2005 2:27 PM
> > Possibly for the 2.6.12 release the safest approach would be to just
> > disable the free area cache while we think about it.
> 
> I hope people are not thinking of permanently killing the free area
> cache algorithm.  It is known to give a large performance gain on the
> specweb SSL benchmark; I think the free area cache algorithm gives a
> 4-5% gain.

It also makes previously-working workloads completely *fail*.

* Re: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
  2005-05-10 18:58     ` Andrew Morton
@ 2005-05-10 19:07       ` Arjan van de Ven
  2005-05-10 19:34       ` Chen, Kenneth W
  1 sibling, 0 replies; 19+ messages in thread
From: Arjan van de Ven @ 2005-05-10 19:07 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Chen, Kenneth W, wwc, mingo, linux-mm

On Tue, May 10, 2005 at 11:58:18AM -0700, Andrew Morton wrote:
> "Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
> >
> > Andrew Morton wrote on Monday, May 09, 2005 2:27 PM
> > > Possibly for the 2.6.12 release the safest approach would be to just
> > > disable the free area cache while we think about it.
> > 
> > I hope people are not thinking of permanently killing the free area
> > cache algorithm.  It is known to give a large performance gain on the
> > specweb SSL benchmark; I think the free area cache algorithm gives a
> > 4-5% gain.
> 
> It also makes previously-working workloads completely *fail*.

the balance between correctness and performance ;)

the patch to keep track of, basically, the below-the-cache gap size will
fix the correctness side I suppose, however I'm not sure I'm thrilled by
the inherent complexity that is being added.  More to track means more
complexity and fragility.

* RE: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
  2005-05-10 18:58     ` Andrew Morton
  2005-05-10 19:07       ` Arjan van de Ven
@ 2005-05-10 19:34       ` Chen, Kenneth W
  2005-05-10 19:43         ` Andrew Morton
  1 sibling, 1 reply; 19+ messages in thread
From: Chen, Kenneth W @ 2005-05-10 19:34 UTC (permalink / raw)
  To: 'Andrew Morton'; +Cc: wwc, mingo, arjanv, linux-mm

Andrew Morton wrote Tuesday, May 10, 2005 11:58 AM
> "Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
> > Andrew Morton wrote on Monday, May 09, 2005 2:27 PM
> > > Possibly for the 2.6.12 release the safest approach would be to just
> > > disable the free area cache while we think about it.
> > 
> > I hope people are not thinking of permanently killing the free area
> > cache algorithm.  It is known to give a large performance gain on the
> > specweb SSL benchmark; I think the free area cache algorithm gives a
> > 4-5% gain.
> 
> It also makes previously-working workloads completely *fail*.

I agree that functionality overrules most everything else.  Though, I
do want to bring to your attention how much performance regression we
will see if the free area cache is completely disabled.  I'd rather make
noise now instead of a couple of months down the road :-)



* Re: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
  2005-05-10 19:34       ` Chen, Kenneth W
@ 2005-05-10 19:43         ` Andrew Morton
  2005-05-10 19:50           ` Wolfgang Wander
  0 siblings, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2005-05-10 19:43 UTC (permalink / raw)
  To: Chen, Kenneth W; +Cc: wwc, mingo, arjanv, linux-mm

"Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
>
> Andrew Morton wrote Tuesday, May 10, 2005 11:58 AM
> > "Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
> > > Andrew Morton wrote on Monday, May 09, 2005 2:27 PM
> > > > Possibly for the 2.6.12 release the safest approach would be to just
> > > > disable the free area cache while we think about it.
> > > 
> > > I hope people are not thinking of permanently killing the free area
> > > cache algorithm.  It is known to give a large performance gain on the
> > > specweb SSL benchmark; I think the free area cache algorithm gives a
> > > 4-5% gain.
> > 
> > It also makes previously-working workloads completely *fail*.
> 
> I agree that functionality overrules most everything else.  Though, I
> do want to bring to your attention how much performance regression we
> will see if the free area cache is completely disabled.  I'd rather make
> noise now instead of a couple of months down the road :-)

Well we allegedly have a patch from Wolfgang which fixes things up, but our
talk-to-testing ratio seems to be infinite.

This is pretty serious, guys.  Could someone please find the time to work
on it?

* Re: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
  2005-05-10 19:43         ` Andrew Morton
@ 2005-05-10 19:50           ` Wolfgang Wander
  2005-05-10 19:57             ` Andrew Morton
  2005-05-10 19:58             ` Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to Ingo Molnar
  0 siblings, 2 replies; 19+ messages in thread
From: Wolfgang Wander @ 2005-05-10 19:50 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Chen, Kenneth W, wwc, mingo, arjanv, linux-mm

Andrew Morton writes:
 > "Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
 > >
 > > Andrew Morton wrote Tuesday, May 10, 2005 11:58 AM
 > > > "Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
 > > > > Andrew Morton wrote on Monday, May 09, 2005 2:27 PM
 > > > > > Possibly for the 2.6.12 release the safest approach would be to just
 > > > > > disable the free area cache while we think about it.
 > > > > 
 > > > > I hope people are not thinking of permanently killing the free area
 > > > > cache algorithm.  It is known to give a large performance gain on the
 > > > > specweb SSL benchmark; I think the free area cache algorithm gives a
 > > > > 4-5% gain.
 > > > 
 > > > It also makes previously-working workloads completely *fail*.
 > > 
 > > I agree that functionality overrules most everything else.  Though, I
 > > do want to bring to your attention how much performance regression we
 > > will see if the free area cache is completely disabled.  I'd rather make
 > > noise now instead of a couple of months down the road :-)
 > 
 > Well we allegedly have a patch from Wolfgang which fixes things up, but our
 > talk-to-testing ratio seems to be infinite.
 > 
 > This is pretty serious, guys.  Could someone please find the time to work
 > on it?

I volunteer to do the testing - it's just that the test I got from Ingo
did not show any timing difference for any of the three solutions:

a) use free_cache
b) disable free_cache
c) use my maybe improved and maybe much too complex free_cache

The test_str02.c I got only ran up to 1300 threads on my machine (8GB
dual x86_64) and Ingo expected it to go up to 20000.

If there is any other test case I'm very willing to do the timing
tests...

             Wolfgang

* Re: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
  2005-05-10 19:50           ` Wolfgang Wander
@ 2005-05-10 19:57             ` Andrew Morton
  2005-05-11 14:36               ` [PATCH] Avoiding mmap fragmentation (against 2.6.12-rc4) to Wolfgang Wander
  2005-05-10 19:58             ` Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to Ingo Molnar
  1 sibling, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2005-05-10 19:57 UTC (permalink / raw)
  To: Wolfgang Wander; +Cc: kenneth.w.chen, mingo, arjanv, linux-mm

Wolfgang Wander <wwc@rentec.com> wrote:
>
> Andrew Morton writes:
>  > "Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
>  > >
>  > > Andrew Morton wrote Tuesday, May 10, 2005 11:58 AM
>  > > > "Chen, Kenneth W" <kenneth.w.chen@intel.com> wrote:
>  > > > > Andrew Morton wrote on Monday, May 09, 2005 2:27 PM
>  > > > > > Possibly for the 2.6.12 release the safest approach would be to just
>  > > > > > disable the free area cache while we think about it.
>  > > > > 
>  > > > > I hope people are not thinking of permanently killing the free area
>  > > > > cache algorithm.  It is known to give a large performance gain on the
>  > > > > specweb SSL benchmark; I think the free area cache algorithm gives a
>  > > > > 4-5% gain.
>  > > > 
>  > > > It also makes previously-working workloads completely *fail*.
>  > > 
>  > > I agree that functionality overrules most everything else.  Though, I
>  > > do want to bring to your attention how much performance regression we
>  > > will see if the free area cache is completely disabled.  I'd rather make
>  > > noise now instead of a couple of months down the road :-)
>  > 
>  > Well we allegedly have a patch from Wolfgang which fixes things up, but our
>  > talk-to-testing ratio seems to be infinite.
>  > 
>  > This is pretty serious, guys.  Could someone please find the time to work
>  > on it?
> 
> I volunteer to do the testing - it's just that the test I got from Ingo
> did not show any timing difference for any of the three solutions:
> 
> a) use free_cache
> b) disable free_cache
> c) use my maybe improved and maybe much too complex free_cache
> 
> The test_str02.c I got only ran up to 1300 threads on my machine (8GB
> dual x86_64) and Ingo expected it to go up to 20000.
> 
> If there is any other test case I'm very willing to do the timing
> tests...
> 

Oh well, I might as well get it into -mm as long as we're (not) thinking
about it.

Could you please send along the latest version, against 2.6.12-rc4, with
description, signed-off-by, etc?


* Re: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
  2005-05-10 19:50           ` Wolfgang Wander
  2005-05-10 19:57             ` Andrew Morton
@ 2005-05-10 19:58             ` Ingo Molnar
  2005-05-10 20:04               ` Wolfgang Wander
  1 sibling, 1 reply; 19+ messages in thread
From: Ingo Molnar @ 2005-05-10 19:58 UTC (permalink / raw)
  To: Wolfgang Wander; +Cc: Andrew Morton, Chen, Kenneth W, arjanv, linux-mm

* Wolfgang Wander <wwc@rentec.com> wrote:

> I volunteer to do the testing - it's just that the test I got from Ingo
> did not show any timing difference for any of the three solutions:
> 
> a) use free_cache
> b) disable free_cache
> c) use my maybe improved and maybe much too complex free_cache
> 
> The test_str02.c I got only ran up to 1300 threads on my machine (8GB 
> dual x86_64) and Ingo expected it to go up to 20000.

do something like 'ulimit -s 128k' to reduce the thread stack sizes, to 
be able to run more threads. You are running an x86 (not x64) kernel to 
test, right?
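
if test_str02 itself still won't go higher, something along these lines
works as a crude stand-in (just a sketch, not the real test: it creates
NTHREADS small-stack threads, holds them all alive, then joins them;
time it with 'time ./a.out'):

        #define _GNU_SOURCE
        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        #define NTHREADS 20000

        static pthread_barrier_t barrier;

        static void *worker(void *arg)
        {
                /* keep every stack mapped until all threads exist */
                pthread_barrier_wait(&barrier);
                return arg;
        }

        int main(void)
        {
                static pthread_t tid[NTHREADS];
                pthread_attr_t attr;
                int i;

                pthread_barrier_init(&barrier, NULL, NTHREADS + 1);
                pthread_attr_init(&attr);
                /* 128k stacks, so 20000 of them fit the address space */
                pthread_attr_setstacksize(&attr, 128 * 1024);

                for (i = 0; i < NTHREADS; i++)
                        if (pthread_create(&tid[i], &attr, worker, NULL)) {
                                fprintf(stderr, "failed at thread %d\n", i);
                                exit(1);
                        }

                pthread_barrier_wait(&barrier); /* release them all */
                for (i = 0; i < NTHREADS; i++)
                        pthread_join(tid[i], NULL);
                return 0;
        }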

	Ingo

* Re: Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to
  2005-05-10 19:58             ` Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to Ingo Molnar
@ 2005-05-10 20:04               ` Wolfgang Wander
  0 siblings, 0 replies; 19+ messages in thread
From: Wolfgang Wander @ 2005-05-10 20:04 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Wolfgang Wander, Andrew Morton, Chen, Kenneth W, arjanv, linux-mm

Ingo Molnar writes:
 > 
 > * Wolfgang Wander <wwc@rentec.com> wrote:
 > 
 > > I volunteer to do the testing - it's just that the test I got from Ingo
 > > did not show any timing difference for any of the three solutions:
 > > 
 > > a) use free_cache
 > > b) disable free_cache
 > > c) use my maybe improved and maybe much too complex free_cache
 > > 
 > > The test_str02.c I got only ran up to 1300 threads on my machine (8GB 
 > > dual x86_64) and Ingo expected it to go up to 20000.
 > 
 > do something like 'ulimit -s 128k' to reduce the thread stack sizes, to 
 > be able to run more threads. You are running an x86 (not x64) kernel to 
 > test, right?

I can run either one - thanks.  I'll try it out...

        Wolfgang
 

* [PATCH] Avoiding mmap fragmentation  (against 2.6.12-rc4) to
  2005-05-10 19:57             ` Andrew Morton
@ 2005-05-11 14:36               ` Wolfgang Wander
  2005-05-11 15:05                 ` Chen, Kenneth W
                                   ` (3 more replies)
  0 siblings, 4 replies; 19+ messages in thread
From: Wolfgang Wander @ 2005-05-11 14:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Wolfgang Wander, kenneth.w.chen, mingo, arjanv, linux-mm


The patch below is against linux-2.6.12-rc4.

Ingo recently introduced a great speedup for allocating new
mmaps using the free_area_cache pointer, which boosts the specweb
SSL benchmark by 4-5% and causes huge performance increases in
thread creation.

The downside of this patch is that it does lead to fragmentation
in the mmap-ed areas (visible via /proc/self/maps), such that
some applications that work fine under 2.4 kernels quickly run
out of memory on any 2.6 kernel.

The problem is twofold:

  1) the free_area_cache is used to continue a search for
     memory where the last search ended.  Before the change,
     new areas were always searched for from the base address on.

     So now new small areas clutter holes of all sizes
     throughout the whole mmap-able region, whereas before small
     allocations tended to fill holes near the base, leaving holes
     far from the base large and available for larger requests.

  2) the free_area_cache is also set to the location of the last
     munmap-ed area, so in scenarios where we allocate e.g.
     five regions of 1K each, then free regions 4, 2, 3 in this
     order, the next request for 1K will be placed in the position
     of the old region 3, whereas before we appended it to the
     still-active region 1, placing it at the location of the old
     region 2.  Before we had one free region of 2K; now we only
     get two free regions of 1K -> fragmentation.

The patch addresses these issues by introducing yet another cache
descriptor, cached_hole_size, that contains the largest known hole
size below the current free_area_cache.  If a new request comes
in, its size is compared against the cached_hole_size, and if the
request can be filled with a hole below free_area_cache, the
search is started from the base instead.
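
In the topdown case the same information is tracked in a local
largest_hole while walking downwards; condensed from the patch below
(alignment and error paths omitted):

        unsigned long largest_hole = mm->cached_hole_size;

        if (len <= largest_hole) {
                /* a big enough hole is known below the cache:
                 * restart the topdown walk from the base */
                largest_hole = 0;
                mm->free_area_cache = base;
        }
        ...
        /* in the walk, before trying just below this vma: */
        if (addr + largest_hole < vma->vm_start)
                largest_hole = vma->vm_start - addr;
        ...
        /* on success, publish the largest hole we walked past */
        mm->cached_hole_size = largest_hole;
        return (mm->free_area_cache = addr);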

The results look promising: 2.6.12-rc4 fragments quickly (my
earlier-posted leakme.c test program terminates after 50000+
iterations with 96 distinct and fragmented maps in /proc/self/maps)
but, as expected, performs nicely with thread creation: Ingo's
test_str02 with 20000 threads requires 0.7s of system time.

Taking out Ingo's patch (un-patch available on request) by basically
deleting all mentions of free_area_cache from the kernel and always
starting the search for new memory at the respective bases, we
observe: leakme terminates successfully with 11 distinct, hardly
fragmented areas in /proc/self/maps, but thread creation is
grindingly slow: 30+s(!) of system time for Ingo's test_str02 with
20000 threads.

Now - drumroll ;-) - the appended patch works fine with leakme: it
ends with only 7 distinct areas in /proc/self/maps, and thread
creation also seems sufficiently fast at 0.71s for 20000 threads.

           Wolfgang


      ----------------------------------------

Signed-off-by: Wolfgang Wander <wwc@rentec.com>

      ----------------------------------------


diff -rpu linux-2.6.12-rc4-vanilla/arch/arm/mm/mmap.c linux-2.6.12-rc4-wwc/arch/arm/mm/mmap.c
--- linux-2.6.12-rc4-vanilla/arch/arm/mm/mmap.c	2005-03-02 02:38:10.000000000 -0500
+++ linux-2.6.12-rc4-wwc/arch/arm/mm/mmap.c	2005-05-10 16:33:34.363204724 -0400
@@ -73,8 +73,13 @@ arch_get_unmapped_area(struct file *filp
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
-
+	if( len > mm->cached_hole_size )
+	        start_addr = addr = mm->free_area_cache;
+	else {
+	        start_addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
+	
 full_search:
 	if (do_align)
 		addr = COLOUR_ALIGN(addr, pgoff);
@@ -90,6 +95,7 @@ full_search:
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -101,6 +107,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 		if (do_align)
 			addr = COLOUR_ALIGN(addr, pgoff);
diff -rpu linux-2.6.12-rc4-vanilla/arch/i386/mm/hugetlbpage.c linux-2.6.12-rc4-wwc/arch/i386/mm/hugetlbpage.c
--- linux-2.6.12-rc4-vanilla/arch/i386/mm/hugetlbpage.c	2005-05-10 18:28:55.902605331 -0400
+++ linux-2.6.12-rc4-wwc/arch/i386/mm/hugetlbpage.c	2005-05-10 16:33:34.364204677 -0400
@@ -294,7 +294,12 @@ static unsigned long hugetlb_get_unmappe
 	struct vm_area_struct *vma;
 	unsigned long start_addr;
 
-	start_addr = mm->free_area_cache;
+	if( len > mm->cached_hole_size ) 
+	        start_addr = mm->free_area_cache;
+	else {
+	        start_addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	addr = ALIGN(start_addr, HPAGE_SIZE);
@@ -308,6 +313,7 @@ full_search:
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -316,6 +322,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
 	}
 }
@@ -327,12 +335,17 @@ static unsigned long hugetlb_get_unmappe
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev_vma;
 	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
 	int first_time = 1;
 
 	/* don't allow allocations above current base */
 	if (mm->free_area_cache > base)
 		mm->free_area_cache = base;
 
+	if( len <= largest_hole ) {
+	        largest_hole = 0;
+		mm->free_area_cache  = base;
+	}
 try_again:
 	/* make sure it can fit in the remaining address space */
 	if (mm->free_area_cache < len)
@@ -353,13 +366,20 @@ try_again:
 		 * vma->vm_start, use it:
 		 */
 		if (addr + len <= vma->vm_start &&
-				(!prev_vma || (addr >= prev_vma->vm_end)))
+		        (!prev_vma || (addr >= prev_vma->vm_end))) {
 			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr);
-		else
+		        mm->cached_hole_size = largest_hole;
+		        return (mm->free_area_cache = addr);
+		} else
 			/* pull free_area_cache down to the first hole */
-			if (mm->free_area_cache == vma->vm_end)
+		        if (mm->free_area_cache == vma->vm_end) {
 				mm->free_area_cache = vma->vm_start;
+				mm->cached_hole_size = largest_hole;
+			}
+				
+		/* remember the largest hole we saw so far */
+		if( addr + largest_hole < vma->vm_start )
+		        largest_hole = vma->vm_start - addr;
 
 		/* try just below the current vma->vm_start */
 		addr = (vma->vm_start - len) & HPAGE_MASK;
@@ -372,6 +392,7 @@ fail:
 	 */
 	if (first_time) {
 		mm->free_area_cache = base;
+		largest_hole = 0;
 		first_time = 0;
 		goto try_again;
 	}
@@ -382,6 +403,7 @@ fail:
 	 * allocations.
 	 */
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 	addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
 			len, pgoff, flags);
 
@@ -389,7 +411,8 @@ fail:
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = base;
-
+	mm->cached_hole_size = ~0UL;
+	
 	return addr;
 }
 
diff -rpu linux-2.6.12-rc4-vanilla/arch/ia64/kernel/sys_ia64.c linux-2.6.12-rc4-wwc/arch/ia64/kernel/sys_ia64.c
--- linux-2.6.12-rc4-vanilla/arch/ia64/kernel/sys_ia64.c	2005-05-10 18:28:55.929604069 -0400
+++ linux-2.6.12-rc4-wwc/arch/ia64/kernel/sys_ia64.c	2005-05-10 16:33:34.365204630 -0400
@@ -38,9 +38,15 @@ arch_get_unmapped_area (struct file *fil
 	if (REGION_NUMBER(addr) == REGION_HPAGE)
 		addr = 0;
 #endif
-	if (!addr)
-		addr = mm->free_area_cache;
-
+	if (!addr) {
+	        if( len > mm->cached_hole_size )
+		        addr = mm->free_area_cache;
+		else {
+		        addr = TASK_UNMAPPED_BASE;
+			mm->cached_hole_size = 0;
+		}
+	}
+			
 	if (map_shared && (TASK_SIZE > 0xfffffffful))
 		/*
 		 * For 64-bit tasks, align shared segments to 1MB to avoid potential
@@ -59,6 +65,7 @@ arch_get_unmapped_area (struct file *fil
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				/* Start a new search --- just in case we missed some holes.  */
 				addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -68,6 +75,8 @@ arch_get_unmapped_area (struct file *fil
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = (vma->vm_end + align_mask) & ~align_mask;
 	}
 }
diff -rpu linux-2.6.12-rc4-vanilla/arch/ppc64/mm/hugetlbpage.c linux-2.6.12-rc4-wwc/arch/ppc64/mm/hugetlbpage.c
--- linux-2.6.12-rc4-vanilla/arch/ppc64/mm/hugetlbpage.c	2005-05-10 18:28:56.186592052 -0400
+++ linux-2.6.12-rc4-wwc/arch/ppc64/mm/hugetlbpage.c	2005-05-10 16:33:34.366204583 -0400
@@ -468,7 +468,12 @@ unsigned long arch_get_unmapped_area(str
 		    && !is_hugepage_only_range(mm, addr,len))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if( len > mm->cached_hole_size ) 
+	        start_addr = addr = mm->free_area_cache;
+	else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
 
 full_search:
 	vma = find_vma(mm, addr);
@@ -492,6 +497,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 		vma = vma->vm_next;
 	}
@@ -499,6 +506,7 @@ full_search:
 	/* Make sure we didn't miss any holes */
 	if (start_addr != TASK_UNMAPPED_BASE) {
 		start_addr = addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
 		goto full_search;
 	}
 	return -ENOMEM;
@@ -520,6 +528,7 @@ arch_get_unmapped_area_topdown(struct fi
 	struct vm_area_struct *vma, *prev_vma;
 	struct mm_struct *mm = current->mm;
 	unsigned long base = mm->mmap_base, addr = addr0;
+	unsigned long largest_hole = mm->cached_hole_size;
 	int first_time = 1;
 
 	/* requested length too big for entire address space */
@@ -540,6 +549,10 @@ arch_get_unmapped_area_topdown(struct fi
 			return addr;
 	}
 
+	if( len <= largest_hole ) {
+	        largest_hole = 0;
+		mm->free_area_cache  = base;
+	}
 try_again:
 	/* make sure it can fit in the remaining address space */
 	if (mm->free_area_cache < len)
@@ -568,13 +581,21 @@ hugepage_recheck:
 		 * vma->vm_start, use it:
 		 */
 		if (addr+len <= vma->vm_start &&
-				(!prev_vma || (addr >= prev_vma->vm_end)))
+		          (!prev_vma || (addr >= prev_vma->vm_end))) {
 			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr);
+		        mm->cached_hole_size = largest_hole;
+		        return (mm->free_area_cache = addr);
+		}
 		else
 			/* pull free_area_cache down to the first hole */
-			if (mm->free_area_cache == vma->vm_end)
+		        if (mm->free_area_cache == vma->vm_end) {
 				mm->free_area_cache = vma->vm_start;
+				mm->cached_hole_size = largest_hole;
+			}
+				
+		/* remember the largest hole we saw so far */
+		if( addr + largest_hole < vma->vm_start )
+		        largest_hole = vma->vm_start - addr;
 
 		/* try just below the current vma->vm_start */
 		addr = vma->vm_start-len;
@@ -587,6 +608,7 @@ fail:
 	 */
 	if (first_time) {
 		mm->free_area_cache = base;
+		largest_hole = 0;
 		first_time = 0;
 		goto try_again;
 	}
@@ -597,12 +619,14 @@ fail:
 	 * allocations.
 	 */
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
 	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 	/*
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = base;
-
+	mm->cached_hole_size = ~0UL;
+	
 	return addr;
 }
 
diff -rpu linux-2.6.12-rc4-vanilla/arch/sh/kernel/sys_sh.c linux-2.6.12-rc4-wwc/arch/sh/kernel/sys_sh.c
--- linux-2.6.12-rc4-vanilla/arch/sh/kernel/sys_sh.c	2005-03-02 02:38:34.000000000 -0500
+++ linux-2.6.12-rc4-wwc/arch/sh/kernel/sys_sh.c	2005-05-10 16:33:34.366204583 -0400
@@ -79,6 +79,10 @@ unsigned long arch_get_unmapped_area(str
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
+	if( len <= mm->cached_hole_size ) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = TASK_UNMAPPED_BASE;
+	}
 	if (flags & MAP_PRIVATE)
 		addr = PAGE_ALIGN(mm->free_area_cache);
 	else
@@ -95,6 +99,7 @@ full_search:
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -106,6 +111,9 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
+		
 		addr = vma->vm_end;
 		if (!(flags & MAP_PRIVATE))
 			addr = COLOUR_ALIGN(addr);
diff -rpu linux-2.6.12-rc4-vanilla/arch/sparc64/kernel/sys_sparc.c linux-2.6.12-rc4-wwc/arch/sparc64/kernel/sys_sparc.c
--- linux-2.6.12-rc4-vanilla/arch/sparc64/kernel/sys_sparc.c	2005-03-02 02:38:10.000000000 -0500
+++ linux-2.6.12-rc4-wwc/arch/sparc64/kernel/sys_sparc.c	2005-05-10 16:33:34.367204536 -0400
@@ -84,6 +84,10 @@ unsigned long arch_get_unmapped_area(str
 			return addr;
 	}
 
+	if( len <= mm->cached_hole_size ) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = TASK_UNMAPPED_BASE;
+	}
 	start_addr = addr = mm->free_area_cache;
 
 	task_size -= len;
@@ -103,6 +107,7 @@ full_search:
 		if (task_size < addr) {
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -114,6 +119,9 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
+		
 		addr = vma->vm_end;
 		if (do_color_align)
 			addr = COLOUR_ALIGN(addr, pgoff);
diff -rpu linux-2.6.12-rc4-vanilla/arch/x86_64/ia32/ia32_aout.c linux-2.6.12-rc4-wwc/arch/x86_64/ia32/ia32_aout.c
--- linux-2.6.12-rc4-vanilla/arch/x86_64/ia32/ia32_aout.c	2005-05-10 18:28:56.386582700 -0400
+++ linux-2.6.12-rc4-wwc/arch/x86_64/ia32/ia32_aout.c	2005-05-10 16:33:34.367204536 -0400
@@ -312,6 +312,7 @@ static int load_aout_binary(struct linux
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
+	current->mm->cached_hole_size = 0;
 
 	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
diff -rpu linux-2.6.12-rc4-vanilla/arch/x86_64/kernel/sys_x86_64.c linux-2.6.12-rc4-wwc/arch/x86_64/kernel/sys_x86_64.c
--- linux-2.6.12-rc4-vanilla/arch/x86_64/kernel/sys_x86_64.c	2005-05-10 18:28:56.406581765 -0400
+++ linux-2.6.12-rc4-wwc/arch/x86_64/kernel/sys_x86_64.c	2005-05-10 16:33:34.368204490 -0400
@@ -111,6 +111,10 @@ arch_get_unmapped_area(struct file *filp
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
+	if( len <= mm->cached_hole_size ) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = begin;
+	}
 	addr = mm->free_area_cache;
 	if (addr < begin) 
 		addr = begin; 
@@ -126,6 +130,7 @@ full_search:
 			 */
 			if (start_addr != begin) {
 				start_addr = addr = begin;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -137,6 +142,9 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if( addr + mm->cached_hole_size < vma->vm_start )
+		        mm->cached_hole_size = vma->vm_start - addr;
+		
 		addr = vma->vm_end;
 	}
 }
diff -rpu linux-2.6.12-rc4-vanilla/fs/binfmt_aout.c linux-2.6.12-rc4-wwc/fs/binfmt_aout.c
--- linux-2.6.12-rc4-vanilla/fs/binfmt_aout.c	2005-05-10 18:28:59.957415723 -0400
+++ linux-2.6.12-rc4-wwc/fs/binfmt_aout.c	2005-05-10 16:33:34.368204490 -0400
@@ -316,6 +316,7 @@ static int load_aout_binary(struct linux
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = current->mm->cached_hole_size;
 
 	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
diff -rpu linux-2.6.12-rc4-vanilla/fs/binfmt_elf.c linux-2.6.12-rc4-wwc/fs/binfmt_elf.c
--- linux-2.6.12-rc4-vanilla/fs/binfmt_elf.c	2005-05-10 18:28:59.958415676 -0400
+++ linux-2.6.12-rc4-wwc/fs/binfmt_elf.c	2005-05-10 16:34:23.696894470 -0400
@@ -775,6 +775,7 @@ static int load_elf_binary(struct linux_
 	   change some of these later */
 	set_mm_counter(current->mm, rss, 0);
 	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = current->mm->cached_hole_size;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
 				 executable_stack);
 	if (retval < 0) {
diff -rpu linux-2.6.12-rc4-vanilla/fs/hugetlbfs/inode.c linux-2.6.12-rc4-wwc/fs/hugetlbfs/inode.c
--- linux-2.6.12-rc4-vanilla/fs/hugetlbfs/inode.c	2005-05-10 18:29:00.032412216 -0400
+++ linux-2.6.12-rc4-wwc/fs/hugetlbfs/inode.c	2005-05-10 16:33:34.370204396 -0400
@@ -122,6 +122,11 @@ hugetlb_get_unmapped_area(struct file *f
 
 	start_addr = mm->free_area_cache;
 
+	if (len <= mm->cached_hole_size) {
+		start_addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
+	}
+
 full_search:
 	addr = ALIGN(start_addr, HPAGE_SIZE);
 
diff -rpu linux-2.6.12-rc4-vanilla/include/linux/sched.h linux-2.6.12-rc4-wwc/include/linux/sched.h
--- linux-2.6.12-rc4-vanilla/include/linux/sched.h	2005-05-10 18:29:02.918277269 -0400
+++ linux-2.6.12-rc4-wwc/include/linux/sched.h	2005-05-10 16:33:34.371204349 -0400
@@ -219,8 +219,9 @@ struct mm_struct {
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
 	void (*unmap_area) (struct vm_area_struct *area);
 	unsigned long mmap_base;		/* base of mmap area */
-	unsigned long free_area_cache;		/* first hole */
+	unsigned long cached_hole_size;		/* if non-zero, the largest hole below free_area_cache */
+	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
 	pgd_t * pgd;
 	atomic_t mm_users;			/* How many users with user space? */
 	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
diff -rpu linux-2.6.12-rc4-vanilla/kernel/fork.c linux-2.6.12-rc4-wwc/kernel/fork.c
--- linux-2.6.12-rc4-vanilla/kernel/fork.c	2005-05-10 18:29:02.994273715 -0400
+++ linux-2.6.12-rc4-wwc/kernel/fork.c	2005-05-10 16:33:34.372204302 -0400
@@ -194,6 +194,7 @@ static inline int dup_mmap(struct mm_str
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
 	mm->free_area_cache = oldmm->mmap_base;
+	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
 	set_mm_counter(mm, rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
@@ -322,7 +323,8 @@ static struct mm_struct * mm_init(struct
 	mm->ioctx_list = NULL;
 	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
-
+	mm->cached_hole_size = ~0UL;
+
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
 		return mm;
diff -rpu linux-2.6.12-rc4-vanilla/mm/mmap.c linux-2.6.12-rc4-wwc/mm/mmap.c
--- linux-2.6.12-rc4-vanilla/mm/mmap.c	2005-05-10 18:29:03.031271985 -0400
+++ linux-2.6.12-rc4-wwc/mm/mmap.c	2005-05-10 17:57:17.869390186 -0400
@@ -1175,7 +1175,12 @@ arch_get_unmapped_area(struct file *filp
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
 	}
-	start_addr = addr = mm->free_area_cache;
+	if (len > mm->cached_hole_size) {
+		start_addr = addr = mm->free_area_cache;
+	} else {
+		start_addr = addr = TASK_UNMAPPED_BASE;
+		mm->cached_hole_size = 0;
+	}
 
 full_search:
 	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
@@ -1186,7 +1191,8 @@ full_search:
 			 * some holes.
 			 */
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
 				goto full_search;
 			}
 			return -ENOMEM;
@@ -1198,6 +1204,8 @@ full_search:
 			mm->free_area_cache = addr + len;
 			return addr;
 		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+			mm->cached_hole_size = vma->vm_start - addr;
 		addr = vma->vm_end;
 	}
 }
@@ -1209,8 +1217,13 @@ void arch_unmap_area(struct vm_area_stru
 	 * Is this a new hole at the lowest possible address?
 	 */
 	if (area->vm_start >= TASK_UNMAPPED_BASE &&
-			area->vm_start < area->vm_mm->free_area_cache)
-		area->vm_mm->free_area_cache = area->vm_start;
+	    area->vm_start < area->vm_mm->free_area_cache) {
+		unsigned long area_size = area->vm_end - area->vm_start;
+		if (area->vm_mm->cached_hole_size < area_size)
+			area->vm_mm->cached_hole_size = area_size;
+		else
+			area->vm_mm->cached_hole_size = ~0UL;
+	}
 }
 
 /*
@@ -1240,13 +1253,19 @@ arch_get_unmapped_area_topdown(struct fi
 			return addr;
 	}
 
+	/* check if free_area_cache is useful for us */
+	if (len <= mm->cached_hole_size) {
+		mm->cached_hole_size = 0;
+		mm->free_area_cache = mm->mmap_base;
+	}
+
 	/* either no address requested or can't fit in requested address hole */
 	addr = mm->free_area_cache;
 
 	/* make sure it can fit in the remaining address space */
 	if (addr >= len) {
 		vma = find_vma(mm, addr-len);
 		if (!vma || addr <= vma->vm_start)
 			/* remember the address as a hint for next time */
 			return (mm->free_area_cache = addr-len);
 	}
@@ -1264,6 +1283,10 @@ arch_get_unmapped_area_topdown(struct fi
 			/* remember the address as a hint for next time */
 			return (mm->free_area_cache = addr);
 
+		/* remember the largest hole we saw so far */
+		if (addr + mm->cached_hole_size < vma->vm_start)
+			mm->cached_hole_size = vma->vm_start - addr;
+
 		/* try just below the current vma->vm_start */
 		addr = vma->vm_start-len;
 	} while (len <= vma->vm_start);
@@ -1274,13 +1297,15 @@ arch_get_unmapped_area_topdown(struct fi
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
-	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
 	/*
 	 * Restore the topdown base:
 	 */
 	mm->free_area_cache = mm->mmap_base;
-
+	mm->cached_hole_size = ~0UL;
+
 	return addr;
 }
 #endif

* RE: [PATCH] Avoiding mmap fragmentation  (against 2.6.12-rc4) to
  2005-05-11 14:36               ` [PATCH] Avoiding mmap fragmentation (against 2.6.12-rc4) to Wolfgang Wander
@ 2005-05-11 15:05                 ` Chen, Kenneth W
  2005-05-12  0:59                 ` Andrew Morton
                                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 19+ messages in thread
From: Chen, Kenneth W @ 2005-05-11 15:05 UTC (permalink / raw)
  To: 'Wolfgang Wander', Andrew Morton; +Cc: mingo, arjanv, linux-mm

Wolfgang Wander wrote on Wednesday, May 11, 2005 7:36 AM
> The patch addresses these issues by introducing yet another cache
> descriptor, cached_hole_size, that contains the largest known hole
> size below the current free_area_cache.  If a new request comes
> in, its size is compared against cached_hole_size, and if the
> request can be filled with a hole below free_area_cache, the
> search is started from the base instead.
> 
> arch/ia64/kernel/sys_ia64.c     |   15 ++++++++++++---
> arch/ppc64/mm/hugetlbpage.c     |   34 +++++++++++++++++++++++++++++-----
> arch/sparc64/kernel/sys_sparc.c |    8 ++++++++
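
Condensed, the search in the patched mm/mmap.c then reads roughly as
follows (a sketch only: the function name is invented, and alignment
handling and the per-architecture copies are omitted):

	unsigned long get_unmapped_area_sketch(struct mm_struct *mm,
					       unsigned long len)
	{
		struct vm_area_struct *vma;
		unsigned long addr, start_addr;

		if (len > mm->cached_hole_size) {
			/* no known hole below free_area_cache fits: resume there */
			start_addr = addr = mm->free_area_cache;
		} else {
			/* a big enough hole sits below the cache: rescan from base */
			start_addr = addr = TASK_UNMAPPED_BASE;
			mm->cached_hole_size = 0;
		}
	full_search:
		for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
			if (TASK_SIZE - len < addr) {
				/* hit the top: retry once from the base, then give up */
				if (start_addr != TASK_UNMAPPED_BASE) {
					start_addr = addr = TASK_UNMAPPED_BASE;
					mm->cached_hole_size = 0;
					goto full_search;
				}
				return -ENOMEM;
			}
			if (!vma || addr + len <= vma->vm_start) {
				/* found a hole; cache its end as the next start */
				mm->free_area_cache = addr + len;
				return addr;
			}
			/* remember the largest hole we stepped over */
			if (addr + mm->cached_hole_size < vma->vm_start)
				mm->cached_hole_size = vma->vm_start - addr;
			addr = vma->vm_end;
		}
	}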


To me, the original issue is specifically a problem of 32-bit address
space fragmentation.  At first glance at the patch, the changes for the
64-bit arches are questionable.  I will work on this.

- Ken


* Re: [PATCH] Avoiding mmap fragmentation  (against 2.6.12-rc4) to
  2005-05-11 14:36               ` [PATCH] Avoiding mmap fragmentation (against 2.6.12-rc4) to Wolfgang Wander
  2005-05-11 15:05                 ` Chen, Kenneth W
@ 2005-05-12  0:59                 ` Andrew Morton
  2005-05-12  1:33                   ` Wolfgang Wander
  2005-05-12  7:07                 ` Ingo Molnar
  2005-05-12  7:14                 ` Ingo Molnar
  3 siblings, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2005-05-12  0:59 UTC (permalink / raw)
  To: Wolfgang Wander; +Cc: kenneth.w.chen, mingo, arjanv, linux-mm

Wolfgang Wander <wwc@rentec.com> wrote:
>
> diff -rpu linux-2.6.12-rc4-vanilla/fs/binfmt_elf.c linux-2.6.12-rc4-wwc/fs/binfmt_elf.c
>  --- linux-2.6.12-rc4-vanilla/fs/binfmt_elf.c	2005-05-10 18:28:59.958415676 -0400
>  +++ linux-2.6.12-rc4-wwc/fs/binfmt_elf.c	2005-05-10 16:34:23.696894470 -0400
>  @@ -775,6 +775,7 @@ static int load_elf_binary(struct linux_
>   	   change some of these later */
>   	set_mm_counter(current->mm, rss, 0);
>   	current->mm->free_area_cache = current->mm->mmap_base;
>  +	current->mm->cached_hole_size = current->mm->cached_hole_size;

eh?

* Re: [PATCH] Avoiding mmap fragmentation  (against 2.6.12-rc4) to
  2005-05-12  0:59                 ` Andrew Morton
@ 2005-05-12  1:33                   ` Wolfgang Wander
  0 siblings, 0 replies; 19+ messages in thread
From: Wolfgang Wander @ 2005-05-12  1:33 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Wolfgang Wander, kenneth.w.chen, mingo, arjanv, linux-mm

Andrew Morton writes:
 > Wolfgang Wander <wwc@rentec.com> wrote:
 > >
 > > diff -rpu linux-2.6.12-rc4-vanilla/fs/binfmt_elf.c linux-2.6.12-rc4-wwc/fs/binfmt_elf.c
 > >  --- linux-2.6.12-rc4-vanilla/fs/binfmt_elf.c	2005-05-10 18:28:59.958415676 -0400
 > >  +++ linux-2.6.12-rc4-wwc/fs/binfmt_elf.c	2005-05-10 16:34:23.696894470 -0400
 > >  @@ -775,6 +775,7 @@ static int load_elf_binary(struct linux_
 > >   	   change some of these later */
 > >   	set_mm_counter(current->mm, rss, 0);
 > >   	current->mm->free_area_cache = current->mm->mmap_base;
 > >  +	current->mm->cached_hole_size = current->mm->cached_hole_size;
 > 
 > eh?

Ouch!  Good catch, thanks Andrew!

Ken, when you post your revised version, can you please replace this
line with

+ current->mm->cached_hole_size = ~0UL;

Same problem in binfmt_aout.c!

Since we set free_area_cache to mmap_base right above, the bug can
hardly cause any real harm (the freshly initialized mm already holds
~0UL there, so the self-assignment happens to preserve the intended
value), but it looks more than silly.

    Wolfgang

* Re: [PATCH] Avoiding mmap fragmentation  (against 2.6.12-rc4) to
  2005-05-11 14:36               ` [PATCH] Avoiding mmap fragmentation (against 2.6.12-rc4) to Wolfgang Wander
  2005-05-11 15:05                 ` Chen, Kenneth W
  2005-05-12  0:59                 ` Andrew Morton
@ 2005-05-12  7:07                 ` Ingo Molnar
  2005-05-12  7:14                 ` Ingo Molnar
  3 siblings, 0 replies; 19+ messages in thread
From: Ingo Molnar @ 2005-05-12  7:07 UTC (permalink / raw)
  To: Wolfgang Wander; +Cc: Andrew Morton, kenneth.w.chen, arjanv, linux-mm

* Wolfgang Wander <wwc@rentec.com> wrote:

> The patch below is against linux-2.6.12-rc4.
> 
> Ingo recently introduced a great speedup for allocating new mmaps
> using the free_area_cache pointer, which boosts the specweb SSL
> benchmark by 4-5% and yields huge performance increases in thread
> creation.

Small correction: 'recently' was more than 2.5 years ago (!), so this
issue is something that only hits certain rare workloads.  Note that the
mmap speedup was also backported to 2.4, so it is quite widely deployed.
This is the first time anyone has complained.

	Ingo

* Re: [PATCH] Avoiding mmap fragmentation  (against 2.6.12-rc4) to
  2005-05-11 14:36               ` [PATCH] Avoiding mmap fragmentation (against 2.6.12-rc4) to Wolfgang Wander
                                   ` (2 preceding siblings ...)
  2005-05-12  7:07                 ` Ingo Molnar
@ 2005-05-12  7:14                 ` Ingo Molnar
  3 siblings, 0 replies; 19+ messages in thread
From: Ingo Molnar @ 2005-05-12  7:14 UTC (permalink / raw)
  To: Wolfgang Wander; +Cc: Andrew Morton, kenneth.w.chen, arjanv, linux-mm

* Wolfgang Wander <wwc@rentec.com> wrote:

> Now - drumroll ;-) the appended patch works fine with leakme: it ends
> with only 7 distinct areas in /proc/self/maps, and thread creation
> also seems sufficiently fast at 0.71s for 20000 threads.
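
(leakme itself was posted earlier in the thread and is not reproduced
here.  A hypothetical stand-in, with invented sizes and iteration
count, might look like the sketch below: each pass opens a BIG hole
underneath a pinned page and then plugs only the front of that hole,
which on an unpatched kernel strands the remainder below
free_area_cache:)

	#include <stdio.h>
	#include <sys/mman.h>

	#define BIG	(512 * 1024)
	#define SMALL	(4 * 1024)

	int main(void)
	{
		char line[256];
		int i, n = 0;
		FILE *f;

		for (i = 0; i < 1000; i++) {
			void *big = mmap(NULL, BIG, PROT_READ | PROT_WRITE,
					 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
			void *pin = mmap(NULL, SMALL, PROT_READ | PROT_WRITE,
					 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
			if (big == MAP_FAILED || pin == MAP_FAILED)
				return 1;
			munmap(big, BIG);	/* open a BIG hole below 'pin' */
			/* plug only the front of the hole; the remainder
			 * stays stranded below free_area_cache */
			if (mmap(NULL, SMALL, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0) == MAP_FAILED)
				return 1;
		}

		f = fopen("/proc/self/maps", "r");
		while (f && fgets(line, sizeof(line), f))
			n++;
		printf("%d lines in /proc/self/maps\n", n);
		return 0;
	}

(On an unpatched kernel the line count should grow with the iteration
count; with the patch the small fills are placed back near the base,
where they can merge with their neighbours, so the count should stay
small.)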

Great! Looks good to me. The whole allocator is a bit of a patchwork,
but we knew that: the optimizations are heuristics, so there will always
be workloads where the linear search can trigger. (If someone replaced
the whole thing with some smart size- and address-indexed tree structure
it might work better, but i'm not holding my breath.)
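
To make that speculation slightly more concrete -- purely an
illustrative sketch with invented names, not something the patch or
the kernel does: if every node of an address-ordered vma tree cached
the largest gap anywhere in its subtree, the lowest hole of a given
size could be found in one O(log n) descent instead of a linear walk:

	struct vma_node {
		unsigned long start, end;	/* this mapping */
		unsigned long gap_before;	/* hole between predecessor and start */
		unsigned long max_gap;		/* largest gap_before in this subtree */
		struct vma_node *left, *right;
	};

	static unsigned long lowest_fit(struct vma_node *n, unsigned long len)
	{
		while (n) {
			if (n->left && n->left->max_gap >= len)
				n = n->left;	/* a fitting hole exists lower down */
			else if (n->gap_before >= len)
				return n->start - n->gap_before;	/* lowest fit */
			else
				n = n->right;	/* only chance is above this node */
		}
		return 0;			/* no hole large enough */
	}

Keeping max_gap correct through insertions, removals and rebalancing
rotations is the fiddly part, which this sketch conveniently omits.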

This needs tons of testing though.

	Ingo

Thread overview: 19+ messages
2005-05-09 13:30 Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to Wolfgang Wander
2005-05-09 21:26 ` Andrew Morton
2005-05-09 21:30   ` Arjan van de Ven
2005-05-10 13:23     ` Wolfgang Wander
2005-05-10 15:35   ` Chen, Kenneth W
2005-05-10 18:58     ` Andrew Morton
2005-05-10 19:07       ` Arjan van de Ven
2005-05-10 19:34       ` Chen, Kenneth W
2005-05-10 19:43         ` Andrew Morton
2005-05-10 19:50           ` Wolfgang Wander
2005-05-10 19:57             ` Andrew Morton
2005-05-11 14:36               ` [PATCH] Avoiding mmap fragmentation (against 2.6.12-rc4) to Wolfgang Wander
2005-05-11 15:05                 ` Chen, Kenneth W
2005-05-12  0:59                 ` Andrew Morton
2005-05-12  1:33                   ` Wolfgang Wander
2005-05-12  7:07                 ` Ingo Molnar
2005-05-12  7:14                 ` Ingo Molnar
2005-05-10 19:58             ` Fw: [Bug 4520] New: /proc/*/maps fragments too quickly compared to Ingo Molnar
2005-05-10 20:04               ` Wolfgang Wander
