From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>,
Andi Kleen <andi@firstfloor.org>,
"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
"linux-mm@kvack.org" <linux-mm@kvack.org>,
cl@linux-foundation.org,
"akpm@linux-foundation.org" <akpm@linux-foundation.org>,
"mingo@elte.hu" <mingo@elte.hu>
Subject: [RFC 3/4] lockless vma caching
Date: Fri, 18 Dec 2009 09:45:13 +0900 [thread overview]
Message-ID: <20091218094513.490f27b4.kamezawa.hiroyu@jp.fujitsu.com> (raw)
In-Reply-To: <20091218093849.8ba69ad9.kamezawa.hiroyu@jp.fujitsu.com>

To access a vma in a lockless style, the vma lookup path needs some
modification: today an rb-tree is used, and it does not allow lookups
concurrent with modification. This patch is a trial at caching the vma
instead of diving into the rb-tree. The vma which handled the last page
fault is cached in the pmd page's page->cache field, and a reference
count and a wait queue are added to the vma.

The accessor side will have to do something like this:

	vma = lookup_vma_cache(mm, address);
	if (vma) {
		if (mm_check_version(mm) && /* no write lock at this point? */
		    (vma->vm_start <= address) && (vma->vm_end > address))
			goto found_vma;	/* start the speculative job */
		else
			vma_release(vma);
		vma = NULL;
	}
	vma = find_vma(mm, address);
found_vma:
	.... do some jobs ....
	vma_release(vma);

Some more consideration of where the cache must be invalidated is
probably still necessary.
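
For reference, here is a minimal sketch (not part of this patch) of the
write-side protocol that do_munmap()/vma_merge() follow below. It only uses
the helpers added by this patch plus mm_write_lock() from the mm accessor
series; the wrapper name drain_speculative_users() is made up for
illustration:

	/* caller holds mm_write_lock(mm) */
	static void drain_speculative_users(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
	{
		/*
		 * Clear page->cache on every pmd page in [start, end) so that
		 * new lookup_vma_cache() calls miss and fall back to find_vma().
		 */
		invalidate_vma_cache_range(mm, start, end);
		/*
		 * Wait until every reader that already took a reference via
		 * lookup_vma_cache()/vma_hold() drops it with vma_release().
		 */
		wait_vmas_cache_access(vma, end);
		/* now the vmas in [start, end) can be split, merged or unmapped */
	}
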
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
include/linux/mm.h | 20 +++++++++
include/linux/mm_types.h | 5 ++
mm/memory.c | 14 ++++++
mm/mmap.c | 102 +++++++++++++++++++++++++++++++++++++++++++++--
mm/page_alloc.c | 1
5 files changed, 138 insertions(+), 4 deletions(-)
Index: mmotm-mm-accessor/include/linux/mm.h
===================================================================
--- mmotm-mm-accessor.orig/include/linux/mm.h
+++ mmotm-mm-accessor/include/linux/mm.h
@@ -763,6 +763,26 @@ unsigned long unmap_vmas(struct mmu_gath
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
+struct vm_area_struct *lookup_vma_cache(struct mm_struct *mm,
+				unsigned long address);
+void invalidate_vma_cache_range(struct mm_struct *mm,
+				unsigned long start, unsigned long end);
+void wait_vmas_cache_access(struct vm_area_struct *vma, unsigned long end);
+
+static inline void vma_hold(struct vm_area_struct *vma)
+{
+	atomic_inc(&vma->cache_access);
+}
+
+void __vma_release(struct vm_area_struct *vma);
+static inline void vma_release(struct vm_area_struct *vma)
+{
+	if (atomic_dec_and_test(&vma->cache_access)) {
+		if (waitqueue_active(&vma->cache_wait))
+			__vma_release(vma);
+	}
+}
+
/**
* mm_walk - callbacks for walk_page_range
* @pgd_entry: if set, called for each non-empty PGD (top-level) entry
Index: mmotm-mm-accessor/include/linux/mm_types.h
===================================================================
--- mmotm-mm-accessor.orig/include/linux/mm_types.h
+++ mmotm-mm-accessor/include/linux/mm_types.h
@@ -12,6 +12,7 @@
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/page-debug-flags.h>
+#include <linux/wait.h>
#include <asm/page.h>
#include <asm/mmu.h>
@@ -77,6 +78,7 @@ struct page {
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* SLUB: freelist req. slab lock */
+		void *cache;	/* lockless vma cache: last vma faulting via this pmd page */
};
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
@@ -180,6 +182,9 @@ struct vm_area_struct {
void * vm_private_data; /* was vm_pte (shared mem) */
unsigned long vm_truncate_count;/* truncate_count or restart_addr */
+	atomic_t cache_access;		/* speculative references from lookup_vma_cache() */
+	wait_queue_head_t cache_wait;	/* writers wait here for cache_access to reach 0 */
+
#ifndef CONFIG_MMU
struct vm_region *vm_region; /* NOMMU mapping region */
#endif
Index: mmotm-mm-accessor/mm/memory.c
===================================================================
--- mmotm-mm-accessor.orig/mm/memory.c
+++ mmotm-mm-accessor/mm/memory.c
@@ -145,6 +145,14 @@ void pmd_clear_bad(pmd_t *pmd)
pmd_clear(pmd);
}
+static void update_vma_cache(pmd_t *pmd, struct vm_area_struct *vma)
+{
+	struct page *page;
+	/* the pte lock is held, so this cannot race with invalidation */
+	page = pmd_page(*pmd);
+	page->cache = vma;
+}
+
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -2118,6 +2126,7 @@ reuse:
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, entry);
ret |= VM_FAULT_WRITE;
+ update_vma_cache(pmd, vma);
goto unlock;
}
@@ -2186,6 +2195,7 @@ gotten:
*/
set_pte_at_notify(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
+ update_vma_cache(pmd, vma);
if (old_page) {
/*
* Only after switching the pte to the new page may
@@ -2626,6 +2636,7 @@ static int do_swap_page(struct mm_struct
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
+ update_vma_cache(pmd, vma);
unlock:
pte_unmap_unlock(page_table, ptl);
out:
@@ -2691,6 +2702,7 @@ setpte:
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, entry);
+ update_vma_cache(pmd, vma);
unlock:
pte_unmap_unlock(page_table, ptl);
return 0;
@@ -2852,6 +2864,7 @@ static int __do_fault(struct mm_struct *
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, entry);
+ update_vma_cache(pmd, vma);
} else {
if (charged)
mem_cgroup_uncharge_page(page);
@@ -2989,6 +3002,7 @@ static inline int handle_pte_fault(struc
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, entry);
+ update_vma_cache(pmd, vma);
} else {
/*
* This is needed only for protection faults but the arch code
Index: mmotm-mm-accessor/mm/mmap.c
===================================================================
--- mmotm-mm-accessor.orig/mm/mmap.c
+++ mmotm-mm-accessor/mm/mmap.c
@@ -187,6 +187,94 @@ error:
return -ENOMEM;
}
+struct vm_area_struct *
+lookup_vma_cache(struct mm_struct *mm, unsigned long address)
+{
+	struct page *page;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	struct vm_area_struct *ret = NULL;
+
+	if (!mm)
+		return NULL;
+
+	preempt_disable();
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out;
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out;
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		goto out;
+	page = pmd_page(*pmd);
+	if (PageReserved(page))
+		goto out;
+	ret = (struct vm_area_struct *)page->cache;
+	if (ret)
+		vma_hold(ret);
+out:
+	preempt_enable();
+	return ret;
+}
+
+void invalidate_vma_cache_range(struct mm_struct *mm,
+				unsigned long start, unsigned long end)
+{
+	struct page *page;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	unsigned long address = start;
+	spinlock_t *lock;
+
+	if (!mm)
+		return;
+
+	while (address < end) {
+		pgd = pgd_offset(mm, address);
+		if (!pgd_present(*pgd)) {
+			address = pgd_addr_end(address, end);
+			continue;
+		}
+		pud = pud_offset(pgd, address);
+		if (!pud_present(*pud)) {
+			address = pud_addr_end(address, end);
+			continue;
+		}
+		pmd = pmd_offset(pud, address);
+		if (pmd_present(*pmd)) {
+			page = pmd_page(*pmd);
+			/*
+			 * Taking the pte lock guarantees that this cannot
+			 * race with a speculative page fault.
+			 */
+			lock = pte_lockptr(mm, pmd);
+			spin_lock(lock);
+			page->cache = NULL;
+			spin_unlock(lock);
+		}
+		address = pmd_addr_end(address, end);
+	}
+}
+
+/* called under mm_write_lock() */
+void wait_vmas_cache_access(struct vm_area_struct *vma, unsigned long end)
+{
+	while (vma && (vma->vm_start < end)) {
+		wait_event_interruptible(vma->cache_wait,
+				atomic_read(&vma->cache_access) == 0);
+		vma = vma->vm_next;
+	}
+}
+
+void __vma_release(struct vm_area_struct *vma)
+{
+	wake_up(&vma->cache_wait);
+}
+
/*
* Requires inode->i_mapping->i_mmap_lock
*/
@@ -406,6 +494,8 @@ void __vma_link_rb(struct mm_struct *mm,
{
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+	atomic_set(&vma->cache_access, 0);
+	init_waitqueue_head(&vma->cache_wait);
}
static void __vma_link_file(struct vm_area_struct *vma)
@@ -774,7 +864,8 @@ struct vm_area_struct *vma_merge(struct
area = next;
if (next && next->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
-
+	invalidate_vma_cache_range(mm, addr, end);
+	wait_vmas_cache_access(next, end);
/*
* Can it merge with the predecessor?
*/
@@ -1162,7 +1253,6 @@ munmap_back:
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
-
/*
* Can we just expand an old mapping?
*/
@@ -1930,7 +2020,9 @@ int do_munmap(struct mm_struct *mm, unsi
end = start + len;
if (vma->vm_start >= end)
return 0;
-
+	/* Before going further, clear vma cache */
+	invalidate_vma_cache_range(mm, start, end);
+	wait_vmas_cache_access(vma, end);
/*
* If we need to split any vma, do it now to save pain later.
*
@@ -1940,7 +2032,6 @@ int do_munmap(struct mm_struct *mm, unsi
*/
if (start > vma->vm_start) {
int error;
-
/*
* Make sure that map_count on return from munmap() will
* not exceed its limit; but let map_count go just above
@@ -2050,6 +2141,7 @@ unsigned long do_brk(unsigned long addr,
if (error)
return error;
+
/*
* mlock MCL_FUTURE?
*/
@@ -2069,6 +2161,7 @@ unsigned long do_brk(unsigned long addr,
*/
verify_mm_writelocked(mm);
+	invalidate_vma_cache_range(mm, addr, addr + len);
/*
* Clear old maps. this also does some error checking for us
*/
@@ -2245,6 +2338,7 @@ struct vm_area_struct *copy_vma(struct v
kmem_cache_free(vm_area_cachep, new_vma);
return NULL;
}
+	atomic_set(&new_vma->cache_access, 0);
vma_set_policy(new_vma, pol);
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
Index: mmotm-mm-accessor/mm/page_alloc.c
===================================================================
--- mmotm-mm-accessor.orig/mm/page_alloc.c
+++ mmotm-mm-accessor/mm/page_alloc.c
@@ -698,6 +698,7 @@ static int prep_new_page(struct page *pa
set_page_private(page, 0);
set_page_refcounted(page);
+	page->cache = NULL;
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
--