From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>,
Andi Kleen <andi@firstfloor.org>,
"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
"linux-mm@kvack.org" <linux-mm@kvack.org>,
cl@linux-foundation.org,
"akpm@linux-foundation.org" <akpm@linux-foundation.org>,
"mingo@elte.hu" <mingo@elte.hu>
Subject: [RFC 3/4] lockless vma caching
Date: Fri, 18 Dec 2009 09:45:13 +0900 [thread overview]
Message-ID: <20091218094513.490f27b4.kamezawa.hiroyu@jp.fujitsu.com> (raw)
In-Reply-To: <20091218093849.8ba69ad9.kamezawa.hiroyu@jp.fujitsu.com>

To access a vma in a lockless style, the vma lookup path needs some
modification: today an rb-tree is used, and it does not allow lookups
concurrent with modification. This patch is a trial at caching the vma
instead of diving into the rb-tree. The vma which handled the last page
fault is cached in the pmd page's page->cache field, and a reference
count and a wait queue are added to the vma.

The accessor side will have to do something like this:

	vma = lookup_vma_cache(mm, address);
	if (vma) {
		if (mm_check_version(mm) && /* no write lock at this point? */
		    (vma->vm_start <= address) && (vma->vm_end > address))
			goto found_vma;	/* start the speculative job */
		else
			vma_release(vma);
		vma = NULL;
	}
	vma = find_vma(mm, address);
found_vma:
	.... do some jobs ....
	vma_release(vma);

Some more consideration of where the cache must be invalidated is
probably still necessary.
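
For reference, here is a minimal sketch (not part of this patch) of the
write-side protocol that do_munmap()/vma_merge() follow below. It only uses
the helpers added by this patch plus mm_write_lock() from the mm accessor
series; the wrapper name drain_speculative_users() is made up for
illustration:

	/* caller holds mm_write_lock(mm) */
	static void drain_speculative_users(struct mm_struct *mm,
					struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
	{
		/*
		 * Clear page->cache on every pmd page in [start, end) so that
		 * new lookup_vma_cache() calls miss and fall back to find_vma().
		 */
		invalidate_vma_cache_range(mm, start, end);
		/*
		 * Wait until every reader that already took a reference via
		 * lookup_vma_cache()/vma_hold() drops it with vma_release().
		 */
		wait_vmas_cache_access(vma, end);
		/* now the vmas in [start, end) can be split, merged or unmapped */
	}
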
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
include/linux/mm.h | 20 +++++++++
include/linux/mm_types.h | 5 ++
mm/memory.c | 14 ++++++
mm/mmap.c | 102 +++++++++++++++++++++++++++++++++++++++++++++--
mm/page_alloc.c | 1
5 files changed, 138 insertions(+), 4 deletions(-)
Index: mmotm-mm-accessor/include/linux/mm.h
===================================================================
--- mmotm-mm-accessor.orig/include/linux/mm.h
+++ mmotm-mm-accessor/include/linux/mm.h
@@ -763,6 +763,26 @@ unsigned long unmap_vmas(struct mmu_gath
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
+struct vm_area_struct *lookup_vma_cache(struct mm_struct *mm,
+				unsigned long address);
+void invalidate_vma_cache_range(struct mm_struct *mm,
+				unsigned long start, unsigned long end);
+void wait_vmas_cache_access(struct vm_area_struct *vma, unsigned long end);
+
+static inline void vma_hold(struct vm_area_struct *vma)
+{
+	atomic_inc(&vma->cache_access);
+}
+
+void __vma_release(struct vm_area_struct *vma);
+static inline void vma_release(struct vm_area_struct *vma)
+{
+	if (atomic_dec_and_test(&vma->cache_access)) {
+		if (waitqueue_active(&vma->cache_wait))
+			__vma_release(vma);
+	}
+}
+
/**
* mm_walk - callbacks for walk_page_range
* @pgd_entry: if set, called for each non-empty PGD (top-level) entry
Index: mmotm-mm-accessor/include/linux/mm_types.h
===================================================================
--- mmotm-mm-accessor.orig/include/linux/mm_types.h
+++ mmotm-mm-accessor/include/linux/mm_types.h
@@ -12,6 +12,7 @@
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/page-debug-flags.h>
+#include <linux/wait.h>
#include <asm/page.h>
#include <asm/mmu.h>
@@ -77,6 +78,7 @@ struct page {
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* SLUB: freelist req. slab lock */
+		void *cache;	/* lockless vma cache: last vma faulting via this pmd page */
};
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
@@ -180,6 +182,9 @@ struct vm_area_struct {
void * vm_private_data; /* was vm_pte (shared mem) */
unsigned long vm_truncate_count;/* truncate_count or restart_addr */
+	atomic_t cache_access;		/* speculative references from lookup_vma_cache() */
+	wait_queue_head_t cache_wait;	/* writers wait here for cache_access to reach 0 */
+
#ifndef CONFIG_MMU
struct vm_region *vm_region; /* NOMMU mapping region */
#endif
Index: mmotm-mm-accessor/mm/memory.c
===================================================================
--- mmotm-mm-accessor.orig/mm/memory.c
+++ mmotm-mm-accessor/mm/memory.c
@@ -145,6 +145,14 @@ void pmd_clear_bad(pmd_t *pmd)
pmd_clear(pmd);
}
+static void update_vma_cache(pmd_t *pmd, struct vm_area_struct *vma)
+{
+	struct page *page;
+	/* the pte lock is held, so this cannot race with invalidation */
+	page = pmd_page(*pmd);
+	page->cache = vma;
+}
+
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -2118,6 +2126,7 @@ reuse:
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, entry);
ret |= VM_FAULT_WRITE;
+ update_vma_cache(pmd, vma);
goto unlock;
}
@@ -2186,6 +2195,7 @@ gotten:
*/
set_pte_at_notify(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
+ update_vma_cache(pmd, vma);
if (old_page) {
/*
* Only after switching the pte to the new page may
@@ -2626,6 +2636,7 @@ static int do_swap_page(struct mm_struct
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
+ update_vma_cache(pmd, vma);
unlock:
pte_unmap_unlock(page_table, ptl);
out:
@@ -2691,6 +2702,7 @@ setpte:
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, entry);
+ update_vma_cache(pmd, vma);
unlock:
pte_unmap_unlock(page_table, ptl);
return 0;
@@ -2852,6 +2864,7 @@ static int __do_fault(struct mm_struct *
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, entry);
+ update_vma_cache(pmd, vma);
} else {
if (charged)
mem_cgroup_uncharge_page(page);
@@ -2989,6 +3002,7 @@ static inline int handle_pte_fault(struc
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, entry);
+ update_vma_cache(pmd, vma);
} else {
/*
* This is needed only for protection faults but the arch code
Index: mmotm-mm-accessor/mm/mmap.c
===================================================================
--- mmotm-mm-accessor.orig/mm/mmap.c
+++ mmotm-mm-accessor/mm/mmap.c
@@ -187,6 +187,94 @@ error:
return -ENOMEM;
}
+struct vm_area_struct *
+lookup_vma_cache(struct mm_struct *mm, unsigned long address)
+{
+	struct page *page;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	struct vm_area_struct *ret = NULL;
+
+	if (!mm)
+		return NULL;
+
+	preempt_disable();
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out;
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out;
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		goto out;
+	page = pmd_page(*pmd);
+	if (PageReserved(page))
+		goto out;
+	ret = (struct vm_area_struct *)page->cache;
+	if (ret)
+		vma_hold(ret);
+out:
+	preempt_enable();
+	return ret;
+}
+
+void invalidate_vma_cache_range(struct mm_struct *mm,
+				unsigned long start, unsigned long end)
+{
+	struct page *page;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	unsigned long address = start;
+	spinlock_t *lock;
+
+	if (!mm)
+		return;
+
+	while (address < end) {
+		pgd = pgd_offset(mm, address);
+		if (!pgd_present(*pgd)) {
+			address = pgd_addr_end(address, end);
+			continue;
+		}
+		pud = pud_offset(pgd, address);
+		if (!pud_present(*pud)) {
+			address = pud_addr_end(address, end);
+			continue;
+		}
+		pmd = pmd_offset(pud, address);
+		if (pmd_present(*pmd)) {
+			page = pmd_page(*pmd);
+			/*
+			 * Taking the pte lock guarantees that this cannot
+			 * race with a speculative page fault.
+			 */
+			lock = pte_lockptr(mm, pmd);
+			spin_lock(lock);
+			page->cache = NULL;
+			spin_unlock(lock);
+		}
+		address = pmd_addr_end(address, end);
+	}
+}
+
+/* called under mm_write_lock() */
+void wait_vmas_cache_access(struct vm_area_struct *vma, unsigned long end)
+{
+	while (vma && (vma->vm_start < end)) {
+		wait_event_interruptible(vma->cache_wait,
+				atomic_read(&vma->cache_access) == 0);
+		vma = vma->vm_next;
+	}
+}
+
+void __vma_release(struct vm_area_struct *vma)
+{
+	wake_up(&vma->cache_wait);
+}
+
/*
* Requires inode->i_mapping->i_mmap_lock
*/
@@ -406,6 +494,8 @@ void __vma_link_rb(struct mm_struct *mm,
{
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+	atomic_set(&vma->cache_access, 0);
+	init_waitqueue_head(&vma->cache_wait);
}
static void __vma_link_file(struct vm_area_struct *vma)
@@ -774,7 +864,8 @@ struct vm_area_struct *vma_merge(struct
area = next;
if (next && next->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
-
+	invalidate_vma_cache_range(mm, addr, end);
+	wait_vmas_cache_access(next, end);
/*
* Can it merge with the predecessor?
*/
@@ -1162,7 +1253,6 @@ munmap_back:
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
-
/*
* Can we just expand an old mapping?
*/
@@ -1930,7 +2020,9 @@ int do_munmap(struct mm_struct *mm, unsi
end = start + len;
if (vma->vm_start >= end)
return 0;
-
+	/* Before going further, clear vma cache */
+	invalidate_vma_cache_range(mm, start, end);
+	wait_vmas_cache_access(vma, end);
/*
* If we need to split any vma, do it now to save pain later.
*
@@ -1940,7 +2032,6 @@ int do_munmap(struct mm_struct *mm, unsi
*/
if (start > vma->vm_start) {
int error;
-
/*
* Make sure that map_count on return from munmap() will
* not exceed its limit; but let map_count go just above
@@ -2050,6 +2141,7 @@ unsigned long do_brk(unsigned long addr,
if (error)
return error;
+
/*
* mlock MCL_FUTURE?
*/
@@ -2069,6 +2161,7 @@ unsigned long do_brk(unsigned long addr,
*/
verify_mm_writelocked(mm);
+	invalidate_vma_cache_range(mm, addr, addr + len);
/*
* Clear old maps. this also does some error checking for us
*/
@@ -2245,6 +2338,7 @@ struct vm_area_struct *copy_vma(struct v
kmem_cache_free(vm_area_cachep, new_vma);
return NULL;
}
+	atomic_set(&new_vma->cache_access, 0);
vma_set_policy(new_vma, pol);
new_vma->vm_start = addr;
new_vma->vm_end = addr + len;
Index: mmotm-mm-accessor/mm/page_alloc.c
===================================================================
--- mmotm-mm-accessor.orig/mm/page_alloc.c
+++ mmotm-mm-accessor/mm/page_alloc.c
@@ -698,6 +698,7 @@ static int prep_new_page(struct page *pa
set_page_private(page, 0);
set_page_refcounted(page);
+	page->cache = NULL;
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
--