From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: linux-mm@kvack.org
Cc: Christoph Lameter <clameter@engr.sgi.com>,
	"Paul E. McKenney" <paulmck@us.ibm.com>,
	Nick Piggin <npiggin@suse.de>, Ingo Molnar <mingo@elte.hu>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [RFC][PATCH 3/5] mm: RCUify vma lookup
Date: Tue, 06 Mar 2007 02:38:18 +0100
Message-ID: <20070306014211.293824000@taijtu.programming.kicks-ass.net>
In-Reply-To: <20070306013815.951032000@taijtu.programming.kicks-ass.net>

[-- Attachment #1: mm-rcu.patch --]
[-- Type: text/plain, Size: 15098 bytes --]

Do a mostly lockless vma lookup using the new B+tree, and pin the
looked-up vma with an atomic refcount so it stays valid while it is used.
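
A lockless reader (e.g. the fault handlers in patches 4 and 5 of this
series) would use the new interface roughly as follows. This is an
illustrative sketch only; 'mm', 'address' and the 'bad_area' label are
assumed caller context, not part of this patch:

	struct vm_area_struct *vma;

	vma = find_get_vma(mm, address);  /* RCU fast path, mmap_sem slow path */
	if (!vma || address < vma->vm_start) {
		put_vma(vma);		/* put_vma(NULL) is a no-op */
		goto bad_area;
	}
	/* ... handle the fault against vma, no mmap_sem held ... */
	put_vma(vma);	/* drop the pin; wakes a writer waiting in lock_vma() */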

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
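(Note for reviewers, not part of the patch: vm_count doubles as a per-vma
lock. A count of 1 means the vma is visible to lockless lookups and has no
extra references; a writer takes it to 0 with lock_vma(), which makes the
atomic_inc_not_zero() in find_get_vma() fail and pushes readers onto the
mmap_sem slow path. A sketch of the writer side, as vma_adjust() does below:

	/* mmap_sem held for writing */
	lock_vma(vma);		/* wait until vm_count can go 1 -> 0 */
	/* ... update the B+tree and the vma fields ... */
	unlock_vma(vma);	/* 0 -> 1, lockless lookups may pin it again */

put_vma() wakes mm->mm_wq when the count drops back to 1, so a writer
blocked in lock_vma() gets to retry.)
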
 include/linux/init_task.h |    3 
 include/linux/mm.h        |    7 +
 include/linux/sched.h     |    2 
 kernel/fork.c             |    4 
 mm/mmap.c                 |  212 ++++++++++++++++++++++++++++++++++++++++------
 5 files changed, 199 insertions(+), 29 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -103,12 +103,14 @@ struct vm_area_struct {
 	void * vm_private_data;		/* was vm_pte (shared mem) */
 	unsigned long vm_truncate_count;/* truncate_count or restart_addr */
 
+	atomic_t vm_count;
 #ifndef CONFIG_MMU
 	atomic_t vm_usage;		/* refcount (VMAs shared if !MMU) */
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
+	struct rcu_head vm_rcu_head;
 };
 
 static inline struct vm_area_struct *
@@ -1047,6 +1049,8 @@ static inline void vma_nonlinear_insert(
 }
 
 /* mmap.c */
+extern void btree_rcu_flush(struct btree_freevec *);
+extern void free_vma(struct vm_area_struct *vma);
 extern int __vm_enough_memory(long pages, int cap_sys_admin);
 extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
@@ -1132,6 +1136,9 @@ extern struct vm_area_struct * find_vma(
 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
 					     struct vm_area_struct **pprev);
 
+extern struct vm_area_struct * find_get_vma(struct mm_struct *mm, unsigned long addr);
+extern void put_vma(struct vm_area_struct *vma);
+
 /* Look up the first VMA which intersects the interval start_addr..end_addr-1,
    NULL if none.  Assume start_addr < end_addr. */
 static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -54,6 +54,7 @@ struct sched_param {
 #include <linux/cpumask.h>
 #include <linux/errno.h>
 #include <linux/nodemask.h>
+#include <linux/rcupdate.h>
 
 #include <asm/system.h>
 #include <asm/semaphore.h>
@@ -311,6 +312,7 @@ struct mm_struct {
 	struct list_head mm_vmas;
 	struct btree_root mm_btree;
 	spinlock_t mm_btree_lock;
+	wait_queue_head_t mm_wq;
 	struct vm_area_struct * mmap_cache;	/* last find_vma result */
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -39,6 +39,18 @@ static void unmap_region(struct mm_struc
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
 
+static void __btree_rcu_flush(struct rcu_head *head)
+{
+	struct btree_freevec *freevec =
+		container_of(head, struct btree_freevec, rcu_head);
+	btree_freevec_flush(freevec);
+}
+
+void btree_rcu_flush(struct btree_freevec *freevec)
+{
+	call_rcu(&freevec->rcu_head, __btree_rcu_flush);
+}
+
 /*
  * WARNING: the debugging will use recursive algorithms so never enable this
  * unless you know what you are doing.
@@ -217,6 +229,18 @@ void unlink_file_vma(struct vm_area_stru
 	}
 }
 
+static void __free_vma(struct rcu_head *head)
+{
+	struct vm_area_struct *vma =
+		container_of(head, struct vm_area_struct, vm_rcu_head);
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
+void free_vma(struct vm_area_struct *vma)
+{
+	call_rcu(&vma->vm_rcu_head, __free_vma);
+}
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -229,7 +253,7 @@ static void remove_vma(struct vm_area_st
 		fput(vma->vm_file);
 	mpol_free(vma_policy(vma));
 	list_del(&vma->vm_list);
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(vma);
 }
 
 asmlinkage unsigned long sys_brk(unsigned long brk)
@@ -312,6 +336,7 @@ __vma_link_list(struct mm_struct *mm, st
 void __vma_link_btree(struct mm_struct *mm, struct vm_area_struct *vma)
 {
 	int err;
+	atomic_set(&vma->vm_count, 1);
 	spin_lock(&mm->mm_btree_lock);
 	err = btree_insert(&mm->mm_btree, vma->vm_start, vma);
 	spin_unlock(&mm->mm_btree_lock);
@@ -388,6 +413,17 @@ __insert_vm_struct(struct mm_struct * mm
 	mm->map_count++;
 }
 
+static void lock_vma(struct vm_area_struct *vma)
+{
+	wait_event(vma->vm_mm->mm_wq, (atomic_cmpxchg(&vma->vm_count, 1, 0) == 1));
+}
+
+static void unlock_vma(struct vm_area_struct *vma)
+{
+	BUG_ON(atomic_read(&vma->vm_count));
+	atomic_set(&vma->vm_count, 1);
+}
+
 static inline void
 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct vm_area_struct *prev)
@@ -395,11 +431,12 @@ __vma_unlink(struct mm_struct *mm, struc
 	struct vm_area_struct *vma_tmp;
 	list_del(&vma->vm_list);
 	spin_lock(&mm->mm_btree_lock);
+	BUG_ON(atomic_read(&vma->vm_count));
 	vma_tmp = btree_remove(&mm->mm_btree, vma->vm_start);
 	spin_unlock(&mm->mm_btree_lock);
 	BUG_ON(vma_tmp != vma);
-	if (mm->mmap_cache == vma)
-		mm->mmap_cache = prev;
+	if (rcu_dereference(mm->mmap_cache) == vma)
+		rcu_assign_pointer(mm->mmap_cache, prev);
 }
 
 /*
@@ -415,6 +452,7 @@ void vma_adjust(struct vm_area_struct *v
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *next = vma_next(vma);
 	struct vm_area_struct *importer = NULL;
+	struct vm_area_struct *locked = NULL;
 	struct address_space *mapping = NULL;
 	struct prio_tree_root *root = NULL;
 	struct file *file = vma->vm_file;
@@ -425,6 +463,14 @@ void vma_adjust(struct vm_area_struct *v
 	if (next && !insert) {
 		if (end >= next->vm_end) {
 			/*
+			 * We need to lock the vma to force the lockless
+			 * lookup into the slow path, because there is a
+			 * window between removing the next vma and
+			 * updating the current one.
+			 */
+			lock_vma(vma);
+			locked = vma;
+			/*
 			 * vma expands, overlapping all the next, and
 			 * perhaps the one after too (mprotect case 6).
 			 */
@@ -452,6 +498,25 @@ again:			remove_next = 1 + (end > next->
 		}
 	}
 
+	if (insert) {
+		/*
+		 * In order to make the adjust + insert look atomic wrt. the
+		 * lockless lookups we need to force those into the slow path.
+		 */
+		if (insert->vm_start < start) {
+			/*
+			 * If the new vma is to be placed in front of the
+			 * current one, we must lock the previous.
+			 */
+			locked = vma_prev(vma);
+			if (!locked)
+				locked = vma;
+		} else
+			locked = vma;
+
+		lock_vma(locked);
+	}
+
 	btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL);
 
 	if (file) {
@@ -498,6 +563,23 @@ again:			remove_next = 1 + (end > next->
 		}
 	}
 
+	/*
+	 * Remove the next vma before updating the address of the current,
+	 * because it might end up having the address of next.
+	 */
+	if (remove_next) {
+		/*
+		 * vma_merge has merged next into vma, and needs
+		 * us to remove next before dropping the locks.
+		 */
+		lock_vma(next);
+		__vma_unlink(mm, next, vma);
+		if (file)
+			__remove_shared_vm_struct(next, file, mapping);
+		if (next->anon_vma)
+			__anon_vma_merge(vma, next);
+	}
+
 	if (root) {
 		flush_dcache_mmap_lock(mapping);
 		vma_prio_tree_remove(vma, root);
@@ -510,16 +592,14 @@ again:			remove_next = 1 + (end > next->
 	vma->vm_start = start;
 	vma->vm_end = end;
 	vma->vm_pgoff = pgoff;
-	spin_unlock(&mm->mm_btree_lock);
 
 	if (adjust_next) {
-		spin_lock(&mm->mm_btree_lock);
 		btree_update(&mm->mm_btree, next->vm_start,
 				next->vm_start + (adjust_next << PAGE_SHIFT));
 		next->vm_start += adjust_next << PAGE_SHIFT;
 		next->vm_pgoff += adjust_next;
-		spin_unlock(&mm->mm_btree_lock);
 	}
+	spin_unlock(&mm->mm_btree_lock);
 
 	if (root) {
 		if (adjust_next)
@@ -528,17 +608,11 @@ again:			remove_next = 1 + (end > next->
 		flush_dcache_mmap_unlock(mapping);
 	}
 
-	if (remove_next) {
-		/*
-		 * vma_merge has merged next into vma, and needs
-		 * us to remove next before dropping the locks.
-		 */
-		__vma_unlink(mm, next, vma);
-		if (file)
-			__remove_shared_vm_struct(next, file, mapping);
-		if (next->anon_vma)
-			__anon_vma_merge(vma, next);
-	} else if (insert) {
+	/*
+	 * Insert after updating the address of the current vma, because it
+	 * might end up having the previous address.
+	 */
+	if (insert) {
 		/*
 		 * split_vma has split insert from vma, and needs
 		 * us to insert it before dropping the locks
@@ -557,7 +631,7 @@ again:			remove_next = 1 + (end > next->
 			fput(file);
 		mm->map_count--;
 		mpol_free(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		free_vma(next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -569,6 +643,13 @@ again:			remove_next = 1 + (end > next->
 		}
 	}
 
+	if (locked) {
+		/*
+		 * unlock the vma, enabling lockless lookups.
+		 */
+		unlock_vma(locked);
+	}
+
 	validate_mm(mm);
 }
 
@@ -688,7 +769,7 @@ struct vm_area_struct *vma_merge(struct 
 	next = __vma_next(&mm->mm_vmas, prev);
 	area = next;
 	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
-		next = __vma_next(&mm->mm_vmas, next);
+		next = vma_next(next);
 
 	/*
 	 * Can it merge with the predecessor?
@@ -1071,7 +1152,7 @@ munmap_back:
 			fput(file);
 		}
 		mpol_free(vma_policy(vma));
-		kmem_cache_free(vm_area_cachep, vma);
+		free_vma(vma);
 	}
 out:	
 	mm->total_vm += len >> PAGE_SHIFT;
@@ -1098,7 +1179,7 @@ unmap_and_free_vma:
 	unmap_region(mm, &mm->mm_vmas, vma, prev, vma->vm_start, vma->vm_end);
 	charged = 0;
 free_vma:
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(vma);
 unacct_error:
 	if (charged)
 		vm_unacct_memory(charged);
@@ -1145,7 +1226,7 @@ arch_get_unmapped_area(struct file *filp
 	}
 
 full_search:
-	for (vma = find_vma(mm, addr); ; vma = __vma_next(&mm->mm_vmas, vma)) {
+	for (vma = find_vma(mm, addr); ; vma = vma_next(vma)) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
 		if (TASK_SIZE - len < addr) {
 			/*
@@ -1329,22 +1410,21 @@ get_unmapped_area(struct file *file, uns
 EXPORT_SYMBOL(get_unmapped_area);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
-struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+struct vm_area_struct *find_vma(struct mm_struct * mm, unsigned long addr)
 {
 	struct vm_area_struct *vma = NULL;
 
 	if (mm) {
 		/* Check the cache first. */
 		/* (Cache hit rate is typically around 35%.) */
-		vma = mm->mmap_cache;
+		vma = rcu_dereference(mm->mmap_cache);
 		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
 			vma = btree_stab(&mm->mm_btree, addr);
-			/* addr < vm_end */
 			if (!vma || addr >= vma->vm_end)
 				vma = __vma_next(&mm->mm_vmas, vma);
 
 			if (vma)
-				mm->mmap_cache = vma;
+				rcu_assign_pointer(mm->mmap_cache, vma);
 		}
 	}
 	return vma;
@@ -1352,6 +1432,82 @@ struct vm_area_struct * find_vma(struct 
 
 EXPORT_SYMBOL(find_vma);
 
+/*
+ * Differs only from the above in that it uses the slightly more expensive
+ * btree_stab_next() in order to avoid using the mm->mm_vmas list without
+ * locks.
+ */
+static
+struct vm_area_struct *find_vma_rcu(struct mm_struct * mm, unsigned long addr)
+{
+	struct vm_area_struct *vma = NULL, *next;
+
+	if (mm) {
+		/* Check the cache first. */
+		/* (Cache hit rate is typically around 35%.) */
+		vma = rcu_dereference(mm->mmap_cache);
+		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+			vma = btree_stab_next(&mm->mm_btree, addr, (void **)&next);
+			if (!vma || addr >= vma->vm_end)
+				vma = next;
+
+			if (vma)
+				rcu_assign_pointer(mm->mmap_cache, vma);
+		}
+	}
+	return vma;
+}
+
+/*
+ * Lockless lookup and pinning of vmas:
+ *
+ * In order for vma modifications to appear atomic to the lockless
+ * lookups, we must sometimes still take mmap_sem for reading. We do
+ * this when we fail to get a reference on the vma.
+ *
+ */
+struct vm_area_struct *find_get_vma(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma;
+
+	if (!mm)
+		return NULL;
+
+	rcu_read_lock();
+	vma = find_vma_rcu(mm, addr);
+	if (!vma || !atomic_inc_not_zero(&vma->vm_count))
+		goto slow;
+	rcu_read_unlock();
+	return vma;
+
+slow:
+	rcu_read_unlock();
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, addr);
+	if (vma && !atomic_inc_not_zero(&vma->vm_count))
+		BUG();
+	up_read(&mm->mmap_sem);
+	return vma;
+}
+
+void put_vma(struct vm_area_struct *vma)
+{
+	if (!vma)
+		return;
+
+	switch (atomic_dec_return(&vma->vm_count)) {
+		default:
+			break;
+
+		case 1:
+			wake_up_all(&vma->vm_mm->mm_wq);
+			break;
+
+		case 0:
+			BUG();
+	}
+}
+
 /* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
 struct vm_area_struct *
 find_vma_prev(struct mm_struct *mm, unsigned long addr,
@@ -1621,10 +1777,13 @@ detach_vmas_to_be_unmapped(struct mm_str
 {
 	unsigned long addr;
 
+	rcu_assign_pointer(mm->mmap_cache, NULL);	/* Kill the cache. */
 	do {
 		struct vm_area_struct *vma_tmp;
 		btree_preload(&mm->mm_btree, GFP_KERNEL|__GFP_NOFAIL);
+		lock_vma(vma);
 		spin_lock(&mm->mm_btree_lock);
+		BUG_ON(atomic_read(&vma->vm_count));
 		vma_tmp = btree_remove(&mm->mm_btree, vma->vm_start);
 		spin_unlock(&mm->mm_btree_lock);
 		if (vma_tmp != vma) {
@@ -1643,7 +1802,6 @@ detach_vmas_to_be_unmapped(struct mm_str
 	else
 		addr = vma ?  vma->vm_end : mm->mmap_base;
 	mm->unmap_area(mm, addr);
-	mm->mmap_cache = NULL;		/* Kill the cache. */
 }
 
 /*
Index: linux-2.6/include/linux/init_task.h
===================================================================
--- linux-2.6.orig/include/linux/init_task.h
+++ linux-2.6/include/linux/init_task.h
@@ -47,8 +47,9 @@
 #define INIT_MM(name) \
 {			 					\
 	.mm_vmas	= LIST_HEAD_INIT(name.mm_vmas),		\
-	.mm_btree	= BTREE_INIT(GFP_ATOMIC|__GFP_NOFAIL),	\
+	.mm_btree	= BTREE_INIT_FLUSH(GFP_ATOMIC|__GFP_NOFAIL, btree_rcu_flush), \
 	.mm_btree_lock	= __SPIN_LOCK_UNLOCKED(name.mm_btree_lock), \
+	.mm_wq		= __WAIT_QUEUE_HEAD_INITIALIZER(name.mm_wq), \
 	.pgd		= swapper_pg_dir, 			\
 	.mm_users	= ATOMIC_INIT(2), 			\
 	.mm_count	= ATOMIC_INIT(1), 			\
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -323,8 +323,10 @@ static void free_mm(struct mm_struct *mm
 static struct mm_struct * mm_init(struct mm_struct * mm)
 {
 	INIT_LIST_HEAD(&mm->mm_vmas);
-	mm->mm_btree = BTREE_INIT(GFP_ATOMIC|__GFP_NOFAIL);
+	mm->mm_btree =
+		BTREE_INIT_FLUSH(GFP_ATOMIC|__GFP_NOFAIL, btree_rcu_flush);
 	spin_lock_init(&mm->mm_btree_lock);
+	init_waitqueue_head(&mm->mm_wq);
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);



Thread overview: 11+ messages
2007-03-06  1:38 [RFC][PATCH 0/5] Lockless vma lookups Peter Zijlstra
2007-03-06  1:38 ` [RFC][PATCH 1/5] RCU friendly B+tree Peter Zijlstra
2007-03-06  1:38 ` [RFC][PATCH 2/5] mm: use B+tree for vmas Peter Zijlstra
2007-03-06  1:38 ` Peter Zijlstra [this message]
2007-03-06  2:23   ` [RFC][PATCH 3/5] mm: RCUify vma lookup Nick Piggin
2007-03-06  8:36     ` Nick Piggin
2007-03-08 17:31       ` Ingo Molnar
2007-03-06 12:31     ` Peter Zijlstra
2007-03-06 12:35       ` Peter Zijlstra
2007-03-06  1:38 ` [RFC][PATCH 4/5] i386: lockless fault handler Peter Zijlstra
2007-03-06  1:38 ` [RFC][PATCH 5/5] x86_64: " Peter Zijlstra
