From: Nick Piggin <nickpiggin@yahoo.com.au>
To: Linux Memory Management <linux-mm@kvack.org>
Subject: [PATCH 4/7] abstract pagetable locking and pte updates
Date: Fri, 29 Oct 2004 17:21:58 +1000
Message-ID: <4181EF96.2030602@yahoo.com.au>
In-Reply-To: <4181EF80.3030709@yahoo.com.au>
[-- Attachment #1: Type: text/plain, Size: 4 bytes --]
4/7
[-- Attachment #2: vm-abstract-pgtable-locking.patch --]
[-- Type: text/x-patch, Size: 95884 bytes --]
Abstract out page table locking and pte updating, and move over to a
transactional-style API for doing pte updates. See asm-generic/pgtable.h
for more details.
* VMAs pin page tables. You must hold the mmap_sem or the anon_vma lock
  to pin the VMAs before doing any page table operations.
* mm_lock_page_table(mm) must also be held when doing page table
  operations.
* To modify a pte, do the following (a sketch combining this with the
  locking rules above follows the example):
	{
		struct pte_modify pmod; /* can store the old pte value for cmpxchg */
		pte_t pte;

		pte = ptep_begin_modify(&pmod, mm, ptep);

		/* confirm the pte is what we want */
		if (wrong_pte(pte)) {
			ptep_abort(&pmod, mm, ptep);
			goto out;
		}

		... /* modify pte (not *ptep) */

		if (ptep_commit(&pmod, mm, ptep, pte)) {
			/* commit failed - usually cleanup & retry, or cleanup & fail */
		} else {
			/*
			 * *ptep was updated.
			 * The old *ptep value is guaranteed not to have changed
			 * between ptep_begin_modify and ptep_commit, _except_ that
			 * some implementations may allow hardware bits to change,
			 * so a range of ptep_commit_xxx functions is provided to
			 * cope with those situations.
			 */
		}
	}
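
For illustration only (this is not part of the patch), here is a minimal
sketch of how the locking rules and the pte transaction are intended to
nest when clearing a single user pte. example_clear_pte() is a made-up
helper; it assumes the caller already holds mmap_sem (or the anon_vma
lock) so the vma is pinned, and it only uses the interfaces introduced
in asm-generic/pgtable.h below:

	#include <linux/mm.h>
	#include <asm/pgtable.h>
	#include <asm/tlbflush.h>

	/* Illustrative sketch only - not added by this patch. */
	static void example_clear_pte(struct vm_area_struct *vma,
					unsigned long addr)
	{
		struct mm_struct *mm = vma->vm_mm;
		struct pte_modify pmod;
		pgd_t *pgd;
		pmd_t *pmd;
		pte_t *ptep;
		pte_t old, new;

		mm_lock_page_table(mm);		/* walk pgd -> pmd -> pte */
		pgd = pgd_offset(mm, addr);
		if (pgd_none(*pgd) || pgd_bad(*pgd))
			goto out_unlock;
		pmd = pmd_offset(pgd, addr);
		if (pmd_none(*pmd) || pmd_bad(*pmd))
			goto out_unlock;
		ptep = pte_offset_map(pmd, addr);

		mm_pin_pages(mm);	/* keep mapped pages from being freed */
	again:
		new = ptep_begin_modify(&pmod, mm, ptep);
		if (pte_none(new)) {
			ptep_abort(&pmod, mm, ptep);	/* nothing mapped here */
			goto out_unmap;
		}
		pte_clear(&new);	/* the value we want installed */
		if (ptep_commit_clear_flush(&pmod, mm, vma, addr, ptep, new, old))
			goto again;	/* raced with a concurrent update */
		/*
		 * "old" now holds the pte value that was actually replaced;
		 * a real caller would transfer its dirty/young bits to the
		 * backing page here (as zap_pte_range does).
		 */
	out_unmap:
		mm_unpin_pages(mm);
		pte_unmap(ptep);
	out_unlock:
		mm_unlock_page_table(mm);
	}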
---
linux-2.6-npiggin/arch/i386/kernel/vm86.c | 19
linux-2.6-npiggin/arch/i386/mm/hugetlbpage.c | 11
linux-2.6-npiggin/arch/i386/mm/ioremap.c | 23
linux-2.6-npiggin/fs/exec.c | 22
linux-2.6-npiggin/include/asm-generic/pgtable.h | 298 +++++++++
linux-2.6-npiggin/include/asm-generic/tlb.h | 9
linux-2.6-npiggin/include/linux/mm.h | 1
linux-2.6-npiggin/kernel/fork.c | 10
linux-2.6-npiggin/kernel/futex.c | 7
linux-2.6-npiggin/mm/fremap.c | 44 -
linux-2.6-npiggin/mm/hugetlb.c | 4
linux-2.6-npiggin/mm/memory.c | 780 ++++++++++++++----------
linux-2.6-npiggin/mm/mmap.c | 4
linux-2.6-npiggin/mm/mprotect.c | 30
linux-2.6-npiggin/mm/mremap.c | 25
linux-2.6-npiggin/mm/msync.c | 52 +
linux-2.6-npiggin/mm/rmap.c | 175 +++--
linux-2.6-npiggin/mm/swap_state.c | 2
linux-2.6-npiggin/mm/swapfile.c | 63 -
linux-2.6-npiggin/mm/vmalloc.c | 24
20 files changed, 1104 insertions(+), 499 deletions(-)
diff -puN mm/memory.c~vm-abstract-pgtable-locking mm/memory.c
--- linux-2.6/mm/memory.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/memory.c 2004-10-29 16:28:08.000000000 +1000
@@ -145,11 +145,14 @@ static inline void free_one_pgd(struct m
*/
void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr)
{
- pgd_t * page_dir = tlb->mm->pgd;
+ struct mm_struct *mm = tlb->mm;
+ pgd_t * page_dir = mm->pgd;
page_dir += first;
do {
+ mm_lock_page_table(mm);
free_one_pgd(tlb, page_dir);
+ mm_unlock_page_table(mm);
page_dir++;
} while (--nr);
}
@@ -159,35 +162,50 @@ pte_t fastcall * pte_alloc_map(struct mm
if (!pmd_present(*pmd)) {
struct page *new;
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
new = pte_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
if (!new)
return NULL;
/*
* Because we dropped the lock, we should re-check the
* entry, as somebody else could have populated it..
*/
- if (pmd_present(*pmd)) {
+ if (pmd_test_and_populate(mm, pmd, new)) {
pte_free(new);
goto out;
}
mm->nr_ptes++;
inc_page_state(nr_page_table_pages);
- pmd_populate(mm, pmd, new);
}
out:
return pte_offset_map(pmd, address);
}
+static inline pte_t * __pte_alloc_map_unlocked(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+{
+ if (!pmd_present(*pmd)) {
+ struct page *new;
+
+ new = pte_alloc_one(mm, address);
+ if (!new)
+ return NULL;
+
+ pmd_populate(mm, pmd, new);
+ mm->nr_ptes++;
+ inc_page_state(nr_page_table_pages);
+ }
+ return pte_offset_map(pmd, address);
+}
+
pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
if (!pmd_present(*pmd)) {
pte_t *new;
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
new = pte_alloc_one_kernel(mm, address);
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
if (!new)
return NULL;
@@ -195,13 +213,9 @@ pte_t fastcall * pte_alloc_kernel(struct
* Because we dropped the lock, we should re-check the
* entry, as somebody else could have populated it..
*/
- if (pmd_present(*pmd)) {
+ if (pmd_test_and_populate_kernel(mm, pmd, new))
pte_free_kernel(new);
- goto out;
- }
- pmd_populate_kernel(mm, pmd, new);
}
-out:
return pte_offset_kernel(pmd, address);
}
#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t))
@@ -214,9 +228,6 @@ out:
*
* 08Jan98 Merged into one routine from several inline routines to reduce
* variable count and make things faster. -jj
- *
- * dst->page_table_lock is held on entry and exit,
- * but may be dropped within pmd_alloc() and pte_alloc_map().
*/
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
@@ -237,9 +248,9 @@ int copy_page_range(struct mm_struct *ds
pmd_t * src_pmd, * dst_pmd;
src_pgd++; dst_pgd++;
-
+
/* copy_pmd_range */
-
+
if (pgd_none(*src_pgd))
goto skip_copy_pmd_range;
if (unlikely(pgd_bad(*src_pgd))) {
@@ -251,6 +262,7 @@ skip_copy_pmd_range: address = (address
continue;
}
+ /* XXX: Don't we worry about the lock for pgd? */
src_pmd = pmd_offset(src_pgd, address);
dst_pmd = pmd_alloc(dst, dst_pgd, address);
if (!dst_pmd)
@@ -258,9 +270,9 @@ skip_copy_pmd_range: address = (address
do {
pte_t * src_pte, * dst_pte;
-
+
/* copy_pte_range */
-
+
if (pmd_none(*src_pmd))
goto skip_copy_pte_range;
if (unlikely(pmd_bad(*src_pmd))) {
@@ -273,24 +285,43 @@ skip_copy_pte_range:
goto cont_copy_pmd_range;
}
- dst_pte = pte_alloc_map(dst, dst_pmd, address);
+ dst_pte = __pte_alloc_map_unlocked(dst, dst_pmd, address);
if (!dst_pte)
goto nomem;
- spin_lock(&src->page_table_lock);
+ mm_lock_page_table(src);
+ mm_pin_pages(src);
src_pte = pte_offset_map_nested(src_pmd, address);
do {
- pte_t pte = *src_pte;
+ struct pte_modify pmod;
+ pte_t new;
struct page *page;
unsigned long pfn;
+again:
/* copy_one_pte */
- if (pte_none(pte))
+ /*
+ * We use this transaction to check that the
+ * src hasn't changed from under us. Even if
+ * we don't actually change it.
+ */
+ new = ptep_begin_modify(&pmod, src, src_pte);
+ if (pte_none(new)) {
+ ptep_abort(&pmod, src, src_pte);
goto cont_copy_pte_range_noset;
+ }
/* pte contains position in swap, so copy. */
- if (!pte_present(pte)) {
- if (!pte_file(pte)) {
- swap_duplicate(pte_to_swp_entry(pte));
+ if (!pte_present(new)) {
+ if (!pte_file(new))
+ swap_duplicate(pte_to_swp_entry(new));
+ set_pte(dst_pte, new);
+ if (ptep_verify_finish(&pmod, src, src_pte)) {
+ pte_clear(dst_pte);
+ if (!pte_file(new))
+ free_swap_and_cache(pte_to_swp_entry(new));
+ goto again;
+ }
+ if (!pte_file(new)) {
if (list_empty(&dst->mmlist)) {
spin_lock(&mmlist_lock);
list_add(&dst->mmlist,
@@ -298,10 +329,9 @@ skip_copy_pte_range:
spin_unlock(&mmlist_lock);
}
}
- set_pte(dst_pte, pte);
goto cont_copy_pte_range_noset;
}
- pfn = pte_pfn(pte);
+ pfn = pte_pfn(new);
/* the pte points outside of valid memory, the
* mapping is assumed to be good, meaningful
* and not mapped via rmap - duplicate the
@@ -312,7 +342,11 @@ skip_copy_pte_range:
page = pfn_to_page(pfn);
if (!page || PageReserved(page)) {
- set_pte(dst_pte, pte);
+ set_pte(dst_pte, new);
+ if (ptep_verify_finish(&pmod, src, src_pte)) {
+ pte_clear(dst_pte);
+ goto again;
+ }
goto cont_copy_pte_range_noset;
}
@@ -320,22 +354,26 @@ skip_copy_pte_range:
* If it's a COW mapping, write protect it both
* in the parent and the child
*/
- if (cow) {
- ptep_set_wrprotect(src_pte);
- pte = *src_pte;
- }
+ if (cow)
+ new = pte_wrprotect(new);
/*
* If it's a shared mapping, mark it clean in
* the child
*/
if (vma->vm_flags & VM_SHARED)
- pte = pte_mkclean(pte);
- pte = pte_mkold(pte);
+ new = pte_mkclean(new);
+ new = pte_mkold(new);
get_page(page);
- dst->rss++;
- set_pte(dst_pte, pte);
page_dup_rmap(page);
+ set_pte(dst_pte, new);
+ if (ptep_commit(&pmod, src, src_pte, new)) {
+ pte_clear(dst_pte);
+ page_remove_rmap(page);
+ put_page(page);
+ goto again;
+ }
+ dst->rss++;
cont_copy_pte_range_noset:
address += PAGE_SIZE;
if (address >= end) {
@@ -348,22 +386,23 @@ cont_copy_pte_range_noset:
} while ((unsigned long)src_pte & PTE_TABLE_MASK);
pte_unmap_nested(src_pte-1);
pte_unmap(dst_pte-1);
- spin_unlock(&src->page_table_lock);
- cond_resched_lock(&dst->page_table_lock);
+ mm_unpin_pages(src);
+ mm_unlock_page_table(src);
cont_copy_pmd_range:
src_pmd++;
dst_pmd++;
} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
}
out_unlock:
- spin_unlock(&src->page_table_lock);
+ mm_unpin_pages(src);
+ mm_unlock_page_table(src);
out:
return 0;
nomem:
return -ENOMEM;
}
-static void zap_pte_range(struct mmu_gather *tlb,
+static void zap_pte_range(struct mmu_gather *tlb, struct mm_struct *mm,
pmd_t *pmd, unsigned long address,
unsigned long size, struct zap_details *details)
{
@@ -384,13 +423,17 @@ static void zap_pte_range(struct mmu_gat
size &= PAGE_MASK;
if (details && !details->check_mapping && !details->nonlinear_vma)
details = NULL;
+ mm_pin_pages(mm);
for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
- pte_t pte = *ptep;
- if (pte_none(pte))
- continue;
- if (pte_present(pte)) {
+ struct pte_modify pmod;
+ pte_t old, new;
+again:
+ new = ptep_begin_modify(&pmod, mm, ptep);
+ if (pte_none(new))
+ goto trns_abort;
+ if (pte_present(new)) {
struct page *page = NULL;
- unsigned long pfn = pte_pfn(pte);
+ unsigned long pfn = pte_pfn(new);
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
if (PageReserved(page))
@@ -404,7 +447,7 @@ static void zap_pte_range(struct mmu_gat
*/
if (details->check_mapping &&
details->check_mapping != page->mapping)
- continue;
+ goto trns_abort;
/*
* Each page->index must be checked when
* invalidating or truncating nonlinear.
@@ -412,23 +455,27 @@ static void zap_pte_range(struct mmu_gat
if (details->nonlinear_vma &&
(page->index < details->first_index ||
page->index > details->last_index))
- continue;
+ goto trns_abort;
}
- pte = ptep_get_and_clear(ptep);
+ pte_clear(&new);
+ if (likely(page)) {
+ if (unlikely(details) && details->nonlinear_vma
+ && linear_page_index(details->nonlinear_vma,
+ address+offset) != page->index)
+ new = pgoff_to_pte(page->index);
+ }
+ if (ptep_commit_clear(&pmod, mm, ptep, new, old))
+ goto again;
tlb_remove_tlb_entry(tlb, ptep, address+offset);
- if (unlikely(!page))
- continue;
- if (unlikely(details) && details->nonlinear_vma
- && linear_page_index(details->nonlinear_vma,
- address+offset) != page->index)
- set_pte(ptep, pgoff_to_pte(page->index));
- if (pte_dirty(pte))
- set_page_dirty(page);
- if (pte_young(pte) && !PageAnon(page))
- mark_page_accessed(page);
- tlb->freed++;
- page_remove_rmap(page);
- tlb_remove_page(tlb, page);
+ if (likely(page)) {
+ if (pte_dirty(old))
+ set_page_dirty(page);
+ if (pte_young(old) && !PageAnon(page))
+ mark_page_accessed(page);
+ tlb->freed++;
+ page_remove_rmap(page);
+ tlb_remove_page(tlb, page);
+ }
continue;
}
/*
@@ -436,15 +483,22 @@ static void zap_pte_range(struct mmu_gat
* if details->nonlinear_vma, we leave file entries.
*/
if (unlikely(details))
- continue;
- if (!pte_file(pte))
- free_swap_and_cache(pte_to_swp_entry(pte));
- pte_clear(ptep);
+ goto trns_abort;
+ pte_clear(&new);
+ if (ptep_commit_clear(&pmod, mm, ptep, new, old))
+ goto again;
+ if (!pte_file(old))
+ free_swap_and_cache(pte_to_swp_entry(old));
+
+ continue;
+trns_abort:
+ ptep_abort(&pmod, mm, ptep);
}
+ mm_unpin_pages(mm);
pte_unmap(ptep-1);
}
-static void zap_pmd_range(struct mmu_gather *tlb,
+static void zap_pmd_range(struct mmu_gather *tlb, struct mm_struct *mm,
pgd_t * dir, unsigned long address,
unsigned long size, struct zap_details *details)
{
@@ -463,27 +517,29 @@ static void zap_pmd_range(struct mmu_gat
if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
end = ((address + PGDIR_SIZE) & PGDIR_MASK);
do {
- zap_pte_range(tlb, pmd, address, end - address, details);
+ zap_pte_range(tlb, mm, pmd, address, end - address, details);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address && (address < end));
}
-static void unmap_page_range(struct mmu_gather *tlb,
+static void unmap_page_range(struct mmu_gather *tlb, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
unsigned long end, struct zap_details *details)
{
pgd_t * dir;
BUG_ON(address >= end);
- dir = pgd_offset(vma->vm_mm, address);
+ mm_lock_page_table(mm);
+ dir = pgd_offset(mm, address);
tlb_start_vma(tlb, vma);
do {
- zap_pmd_range(tlb, dir, address, end - address, details);
+ zap_pmd_range(tlb, mm, dir, address, end - address, details);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
} while (address && (address < end));
tlb_end_vma(tlb, vma);
+ mm_unlock_page_table(mm);
}
/* Dispose of an entire struct mmu_gather per rescheduling point */
@@ -513,11 +569,7 @@ static void unmap_page_range(struct mmu_
*
* Returns the number of vma's which were covered by the unmapping.
*
- * Unmap all pages in the vma list. Called under page_table_lock.
- *
- * We aim to not hold page_table_lock for too long (for scheduling latency
- * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
- * return the ending mmu_gather to the caller.
+ * Unmap all pages in the vma list.
*
* Only addresses between `start' and `end' will be unmapped.
*
@@ -533,7 +585,7 @@ static int __unmap_vmas(struct mmu_gathe
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
- unsigned long zap_bytes = ZAP_BLOCK_SIZE;
+ unsigned long zap_bytes;
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
int ret = 0;
@@ -556,6 +608,7 @@ static int __unmap_vmas(struct mmu_gathe
ret++;
while (start != end) {
unsigned long block;
+ zap_bytes = ZAP_BLOCK_SIZE;
if (!tlb_start_valid) {
tlb_start = start;
@@ -567,7 +620,7 @@ static int __unmap_vmas(struct mmu_gathe
unmap_hugepage_range(vma, start, end);
} else {
block = min(zap_bytes, end - start);
- unmap_page_range(*tlbp, vma, start,
+ unmap_page_range(*tlbp, mm, vma, start,
start + block, details);
}
@@ -578,7 +631,7 @@ static int __unmap_vmas(struct mmu_gathe
if (!atomic && need_resched()) {
int fullmm = tlb_is_full_mm(*tlbp);
tlb_finish_mmu(*tlbp, tlb_start, start);
- cond_resched_lock(&mm->page_table_lock);
+ cond_resched();
*tlbp = tlb_gather_mmu(mm, fullmm);
tlb_start_valid = 0;
}
@@ -594,12 +647,10 @@ void unmap_vmas(struct mm_struct *mm, st
{
struct mmu_gather *tlb;
lru_add_drain();
- spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
__unmap_vmas(&tlb, mm, vma,
start_addr, end_addr, nr_accounted, details);
tlb_finish_mmu(tlb, start_addr, end_addr);
- spin_unlock(&mm->page_table_lock);
}
int unmap_all_vmas(struct mm_struct *mm, unsigned long *nr_accounted)
@@ -607,13 +658,11 @@ int unmap_all_vmas(struct mm_struct *mm,
struct mmu_gather *tlb;
int ret;
lru_add_drain();
- spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 1);
flush_cache_mm(mm);
/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
ret = __unmap_vmas(&tlb, mm, mm->mmap, 0, ~0UL, nr_accounted, NULL);
tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
- spin_unlock(&mm->page_table_lock);
return ret;
}
@@ -640,9 +689,14 @@ void zap_page_range(struct vm_area_struc
unmap_vmas(mm, vma, address, end, &nr_accounted, details);
}
+void follow_page_finish(struct mm_struct *mm, unsigned long address)
+{
+ mm_unpin_pages(mm);
+ mm_unlock_page_table(mm);
+}
+
/*
* Do a quick page-table lookup for a single page.
- * mm->page_table_lock must be held.
*/
struct page *
follow_page(struct mm_struct *mm, unsigned long address, int write)
@@ -653,7 +707,8 @@ follow_page(struct mm_struct *mm, unsign
unsigned long pfn;
struct page *page;
- page = follow_huge_addr(mm, address, write);
+ mm_lock_page_table(mm);
+ page = follow_huge_addr(mm, address, write); /* XXX: hugepages are broken */
if (! IS_ERR(page))
return page;
@@ -673,11 +728,16 @@ follow_page(struct mm_struct *mm, unsign
if (!ptep)
goto out;
- pte = *ptep;
+ /* XXX: should be able to drop the mm_pin_pages lock after pinning the
+ * page with get_page?
+ */
+ mm_pin_pages(mm);
+ pte = ptep_atomic_read(ptep);
pte_unmap(ptep);
+
if (pte_present(pte)) {
if (write && !pte_write(pte))
- goto out;
+ goto out_unpin;
pfn = pte_pfn(pte);
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
@@ -688,7 +748,10 @@ follow_page(struct mm_struct *mm, unsign
}
}
+out_unpin:
+ mm_unpin_pages(mm);
out:
+ mm_unlock_page_table(mm);
return NULL;
}
@@ -698,23 +761,29 @@ untouched_anonymous_page(struct mm_struc
{
pgd_t *pgd;
pmd_t *pmd;
+ int ret = 1;
/* Check if the vma is for an anonymous mapping. */
if (vma->vm_ops && vma->vm_ops->nopage)
return 0;
+ mm_lock_page_table(mm);
+
/* Check if page directory entry exists. */
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- return 1;
+ goto out;
/* Check if page middle directory entry exists. */
pmd = pmd_offset(pgd, address);
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- return 1;
+ goto out;
/* There is a pte slot for 'address' in 'mm'. */
- return 0;
+ ret = 0;
+out:
+ mm_unlock_page_table(mm);
+ return ret;
}
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -753,6 +822,7 @@ int get_user_pages(struct task_struct *t
pte = pte_offset_map(pmd, pg);
if (!pte)
return i ? : -EFAULT;
+ /* XXX: don't need atomic read for *pte? (guess not) */
if (!pte_present(*pte)) {
pte_unmap(pte);
return i ? : -EFAULT;
@@ -779,7 +849,6 @@ int get_user_pages(struct task_struct *t
&start, &len, i);
continue;
}
- spin_lock(&mm->page_table_lock);
do {
struct page *page;
int lookup_write = write;
@@ -793,10 +862,10 @@ int get_user_pages(struct task_struct *t
*/
if (!lookup_write &&
untouched_anonymous_page(mm,vma,start)) {
- page = ZERO_PAGE(start);
- break;
+ if (pages)
+ pages[i] = ZERO_PAGE(start);
+ goto set_vmas;
}
- spin_unlock(&mm->page_table_lock);
switch (handle_mm_fault(mm,vma,start,write)) {
case VM_FAULT_MINOR:
tsk->min_flt++;
@@ -819,7 +888,6 @@ int get_user_pages(struct task_struct *t
* we are forcing write access.
*/
lookup_write = write && !force;
- spin_lock(&mm->page_table_lock);
}
if (pages) {
pages[i] = page;
@@ -827,21 +895,23 @@ int get_user_pages(struct task_struct *t
if (!PageReserved(page))
page_cache_get(page);
}
+ if (page)
+ follow_page_finish(mm, start);
+set_vmas:
if (vmas)
vmas[i] = vma;
i++;
start += PAGE_SIZE;
len--;
} while(len && start < vma->vm_end);
- spin_unlock(&mm->page_table_lock);
} while(len);
return i;
}
EXPORT_SYMBOL(get_user_pages);
-static void zeromap_pte_range(pte_t * pte, unsigned long address,
- unsigned long size, pgprot_t prot)
+static void zeromap_pte_range(struct mm_struct *mm, pte_t * pte,
+ unsigned long address, unsigned long size, pgprot_t prot)
{
unsigned long end;
@@ -850,9 +920,14 @@ static void zeromap_pte_range(pte_t * pt
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
- BUG_ON(!pte_none(*pte));
- set_pte(pte, zero_pte);
+ struct pte_modify pmod;
+ pte_t new;
+again:
+ new = ptep_begin_modify(&pmod, mm, pte);
+ BUG_ON(!pte_none(new));
+ new = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
+ if (ptep_commit(&pmod, mm, pte, new))
+ goto again;
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
@@ -872,7 +947,7 @@ static inline int zeromap_pmd_range(stru
pte_t * pte = pte_alloc_map(mm, pmd, base + address);
if (!pte)
return -ENOMEM;
- zeromap_pte_range(pte, base + address, end - address, prot);
+ zeromap_pte_range(mm, pte, base + address, end - address, prot);
pte_unmap(pte);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
@@ -893,7 +968,7 @@ int zeromap_page_range(struct vm_area_st
if (address >= end)
BUG();
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
do {
pmd_t *pmd = pmd_alloc(mm, dir, address);
error = -ENOMEM;
@@ -909,7 +984,7 @@ int zeromap_page_range(struct vm_area_st
* Why flush? zeromap_pte_range has a BUG_ON for !pte_none()
*/
flush_tlb_range(vma, beg, end);
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
return error;
}
@@ -918,8 +993,9 @@ int zeromap_page_range(struct vm_area_st
* mappings are removed. any references to nonexistent pages results
* in null mappings (currently treated as "copy-on-access")
*/
-static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
- unsigned long pfn, pgprot_t prot)
+static inline void remap_pte_range(struct mm_struct *mm, pte_t * pte,
+ unsigned long address, unsigned long size,
+ unsigned long pfn, pgprot_t prot)
{
unsigned long end;
@@ -927,14 +1003,26 @@ static inline void remap_pte_range(pte_t
end = address + size;
if (end > PMD_SIZE)
end = PMD_SIZE;
+ mm_pin_pages(mm);
do {
- BUG_ON(!pte_none(*pte));
- if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
- set_pte(pte, pfn_pte(pfn, prot));
+ struct pte_modify pmod;
+ pte_t new;
+
+again:
+ new = ptep_begin_modify(&pmod, mm, pte);
+ BUG_ON(!pte_none(new));
+ if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) {
+ new = pfn_pte(pfn, prot);
+ if (ptep_commit(&pmod, mm, pte, new))
+ goto again;
+ } else
+ ptep_abort(&pmod, mm, pte);
+
address += PAGE_SIZE;
pfn++;
pte++;
} while (address && (address < end));
+ mm_unpin_pages(mm);
}
static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size,
@@ -952,7 +1040,7 @@ static inline int remap_pmd_range(struct
pte_t * pte = pte_alloc_map(mm, pmd, base + address);
if (!pte)
return -ENOMEM;
- remap_pte_range(pte, base + address, end - address, pfn + (address >> PAGE_SHIFT), prot);
+ remap_pte_range(mm, pte, base + address, end - address, pfn + (address >> PAGE_SHIFT), prot);
pte_unmap(pte);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
@@ -984,7 +1072,7 @@ int remap_pfn_range(struct vm_area_struc
* this region.
*/
vma->vm_flags |= VM_IO | VM_RESERVED;
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
do {
pmd_t *pmd = pmd_alloc(mm, dir, from);
error = -ENOMEM;
@@ -1000,7 +1088,7 @@ int remap_pfn_range(struct vm_area_struc
* Why flush? remap_pte_range has a BUG_ON for !pte_none()
*/
flush_tlb_range(vma, beg, end);
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
return error;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -1019,21 +1107,6 @@ static inline pte_t maybe_mkwrite(pte_t
}
/*
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
- */
-static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
- pte_t *page_table)
-{
- pte_t entry;
-
- flush_cache_page(vma, address);
- entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
- vma);
- ptep_establish(vma, address, page_table, entry);
- update_mmu_cache(vma, address, entry);
-}
-
-/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
@@ -1050,15 +1123,30 @@ static inline void break_cow(struct vm_a
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*
- * We hold the mm semaphore and the page_table_lock on entry and exit
- * with the page_table_lock released.
+ * We hold the mm semaphore and have the page table locked on entry, and exit
+ * with the page table unlocked.
*/
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
+static int do_wp_page(struct pte_modify *pmod, struct mm_struct *mm,
+ struct vm_area_struct * vma, unsigned long address,
+ pte_t *page_table, pmd_t *pmd, pte_t pte)
{
+ pte_t new;
struct page *old_page, *new_page;
- unsigned long pfn = pte_pfn(pte);
- pte_t entry;
+ unsigned long pfn;
+ int ret = VM_FAULT_OOM;
+
+ /* Audit use of mm_pin_pages nesting with ptep_begin_modify, maybe
+ * deadlockable if we do pte locks.
+ */
+ mm_pin_pages(mm);
+
+ /* Make sure the pte hasn't changed under us after pinning */
+ if (ptep_verify(pmod, mm, page_table)) {
+ ret = VM_FAULT_MINOR;
+ goto out_error;
+ }
+
+ pfn = pte_pfn(pte);
if (unlikely(!pfn_valid(pfn))) {
/*
@@ -1066,25 +1154,25 @@ static int do_wp_page(struct mm_struct *
* at least the kernel stops what it's doing before it corrupts
* data, but for the moment just pretend this is OOM.
*/
- pte_unmap(page_table);
printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
address);
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_OOM;
+ goto out_error;
}
+
old_page = pfn_to_page(pfn);
if (!TestSetPageLocked(old_page)) {
int reuse = can_share_swap_page(old_page);
unlock_page(old_page);
if (reuse) {
+ mm_unpin_pages(mm);
flush_cache_page(vma, address);
- entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
- vma);
- ptep_set_access_flags(vma, address, page_table, entry, 1);
- update_mmu_cache(vma, address, entry);
+ new = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), vma);
+ if (!ptep_commit_access_flush(pmod, mm, vma, address,
+ page_table, new, 1))
+ update_mmu_cache(vma, address, new);
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
return VM_FAULT_MINOR;
}
}
@@ -1095,41 +1183,70 @@ static int do_wp_page(struct mm_struct *
*/
if (!PageReserved(old_page))
page_cache_get(old_page);
- spin_unlock(&mm->page_table_lock);
+ ptep_abort(pmod, mm, page_table);
+ mm_unpin_pages(mm);
+ mm_unlock_page_table(mm);
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ ptep_abort(pmod, mm, page_table);
goto no_new_page;
+ }
new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
- if (!new_page)
+ if (!new_page) {
+ ptep_abort(pmod, mm, page_table);
goto no_new_page;
- copy_cow_page(old_page,new_page,address);
+ }
+ copy_cow_page(old_page, new_page, address);
/*
* Re-check the pte - we dropped the lock
*/
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
page_table = pte_offset_map(pmd, address);
- if (likely(pte_same(*page_table, pte))) {
- if (PageReserved(old_page))
- ++mm->rss;
- else
- page_remove_rmap(old_page);
- break_cow(vma, new_page, address, page_table);
- lru_cache_add_active(new_page);
- page_add_anon_rmap(new_page, vma, address);
+ new = ptep_begin_modify(pmod, mm, page_table);
- /* Free the old page.. */
- new_page = old_page;
+ if (unlikely(!pte_same(new, pte))) {
+ ptep_abort(pmod, mm, page_table);
+ goto out;
+ }
+
+ /* break COW */
+ flush_cache_page(vma, address);
+ new = maybe_mkwrite(pte_mkdirty(
+ mk_pte(new_page, vma->vm_page_prot)), vma);
+ page_add_anon_rmap(new_page, vma, address);
+ if (ptep_commit_establish_flush(pmod, mm, vma, address,
+ page_table, new)) {
+ page_remove_rmap(new_page);
+ goto out;
}
+ update_mmu_cache(vma, address, new);
+ if (PageReserved(old_page))
+ ++mm->rss;
+ else
+ page_remove_rmap(old_page);
+
+ /* After lru_cache_add_active new_page may disappear, so don't touch! */
+ lru_cache_add_active(new_page);
+
+ /* Free the old page.. */
+ new_page = old_page;
+
+out:
+ ret = VM_FAULT_MINOR;
pte_unmap(page_table);
+ mm_unlock_page_table(mm);
page_cache_release(new_page);
- page_cache_release(old_page);
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_MINOR;
-
no_new_page:
page_cache_release(old_page);
- return VM_FAULT_OOM;
+ return ret;
+
+out_error:
+ ptep_abort(pmod, mm, page_table);
+ pte_unmap(page_table);
+ mm_unpin_pages(mm);
+ mm_unlock_page_table(mm);
+ return ret;
}
/*
@@ -1201,6 +1318,7 @@ void unmap_mapping_range(struct address_
spin_lock(&mapping->i_mmap_lock);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
+ smp_wmb(); /* For truncate_count */
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
unmap_mapping_range_list(&mapping->i_mmap, &details);
@@ -1329,37 +1447,39 @@ void swapin_readahead(swp_entry_t entry,
}
/*
- * We hold the mm semaphore and the page_table_lock on entry and
- * should release the pagetable lock on exit..
+ * We hold the mm semaphore and the page table locked on entry.
+ * We release the pagetable lock on exit.
*/
-static int do_swap_page(struct mm_struct * mm,
- struct vm_area_struct * vma, unsigned long address,
- pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
+static int do_swap_page(struct pte_modify *pmod, struct mm_struct * mm,
+ struct vm_area_struct * vma, unsigned long address, int write_access,
+ pte_t *page_table, pmd_t *pmd, pte_t orig_pte)
{
+ int used_swap_page = 0;
+ pte_t new, old;
struct page *page;
swp_entry_t entry = pte_to_swp_entry(orig_pte);
- pte_t pte;
int ret = VM_FAULT_MINOR;
+ ptep_abort(pmod, mm, page_table);
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
page = read_swap_cache_async(entry, vma, address);
if (!page) {
/*
- * Back out if somebody else faulted in this pte while
- * we released the page table lock.
+ * Back out if somebody else faulted in this pte.
*/
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
page_table = pte_offset_map(pmd, address);
- if (likely(pte_same(*page_table, orig_pte)))
+ if (likely(pte_same(ptep_atomic_read(page_table),
+ orig_pte)))
ret = VM_FAULT_OOM;
else
ret = VM_FAULT_MINOR;
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
goto out;
}
@@ -1376,71 +1496,83 @@ static int do_swap_page(struct mm_struct
* Back out if somebody else faulted in this pte while we
* released the page table lock.
*/
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
page_table = pte_offset_map(pmd, address);
- if (unlikely(!pte_same(*page_table, orig_pte))) {
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ new = ptep_begin_modify(pmod, mm, page_table);
+ if (unlikely(!pte_same(new, orig_pte))) {
+ ptep_abort(pmod, mm, page_table);
unlock_page(page);
- page_cache_release(page);
- ret = VM_FAULT_MINOR;
- goto out;
+ goto out_failed;
}
/* The page isn't present yet, go ahead with the fault. */
-
+
swap_free(entry);
- if (vm_swap_full())
- remove_exclusive_swap_page(page);
- mm->rss++;
- pte = mk_pte(page, vma->vm_page_prot);
+ new = mk_pte(page, vma->vm_page_prot);
if (write_access && can_share_swap_page(page)) {
- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ new = maybe_mkwrite(pte_mkdirty(new), vma);
write_access = 0;
+ used_swap_page = 1;
}
- unlock_page(page);
flush_icache_page(vma, page);
- set_pte(page_table, pte);
page_add_anon_rmap(page, vma, address);
+ if (ptep_commit(pmod, mm, page_table, new)) {
+ page_remove_rmap(page);
+ swap_duplicate(entry);
+ unlock_page(page);
+ goto out_failed;
+ }
+ if (!used_swap_page && vm_swap_full())
+ remove_exclusive_swap_page(page);
+ unlock_page(page);
+ mm->rss++;
if (write_access) {
- if (do_wp_page(mm, vma, address,
- page_table, pmd, pte) == VM_FAULT_OOM)
- ret = VM_FAULT_OOM;
- goto out;
+ old = new;
+ new = ptep_begin_modify(pmod, mm, page_table);
+ if (likely(pte_same(old, new))) {
+ if (do_wp_page(pmod, mm, vma, address,
+ page_table, pmd, new) == VM_FAULT_OOM)
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+ ptep_abort(pmod, mm, page_table);
}
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, pte);
+ update_mmu_cache(vma, address, new);
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
out:
return ret;
+
+out_failed:
+ pte_unmap(page_table);
+ mm_unlock_page_table(mm);
+ page_cache_release(page);
+ return ret;
}
/*
- * We are called with the MM semaphore and page_table_lock
- * spinlock held to protect against concurrent faults in
- * multithreaded programs.
+ * We are called with the MM semaphore and page table locked
+ * to protect against concurrent faults in multithreaded programs.
*/
static int
-do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
- pte_t *page_table, pmd_t *pmd, int write_access,
- unsigned long addr)
+do_anonymous_page(struct pte_modify *pmod, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long addr,
+ int write_access, pte_t *page_table, pmd_t *pmd)
{
- pte_t entry;
- struct page * page = ZERO_PAGE(addr);
-
- /* Read-only mapping of ZERO_PAGE. */
- entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
+ pte_t new;
+ struct page *page;
- /* ..except if it's a write access */
+ /* XXX: is this really unlikely? The code previously suggested so */
if (write_access) {
/* Allocate our own private page. */
+			ptep_abort(pmod, mm, page_table);
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
if (unlikely(anon_vma_prepare(vma)))
goto no_mem;
@@ -1449,31 +1581,40 @@ do_anonymous_page(struct mm_struct *mm,
goto no_mem;
clear_user_highpage(page, addr);
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
page_table = pte_offset_map(pmd, addr);
+ new = ptep_begin_modify(pmod, mm, page_table);
- if (!pte_none(*page_table)) {
- pte_unmap(page_table);
+ if (unlikely(!pte_none(new))) {
+			ptep_abort(pmod, mm, page_table);
+ page_cache_release(page);
+ goto out;
+ }
+ new = maybe_mkwrite(pte_mkdirty(mk_pte(page,
+ vma->vm_page_prot)), vma);
+ page_add_anon_rmap(page, vma, addr);
+ if (ptep_commit(pmod, mm, page_table, new)) {
+ page_remove_rmap(page);
page_cache_release(page);
- spin_unlock(&mm->page_table_lock);
goto out;
}
+
mm->rss++;
- entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
- vma->vm_page_prot)),
- vma);
- lru_cache_add_active(page);
mark_page_accessed(page);
- page_add_anon_rmap(page, vma, addr);
+ lru_cache_add_active(page);
+ } else {
+ /* Read-only mapping of ZERO_PAGE. */
+ page = ZERO_PAGE(addr);
+ new = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+ if (ptep_commit(pmod, mm, page_table, new))
+ goto out;
}
- set_pte(page_table, entry);
- pte_unmap(page_table);
-
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, addr, entry);
- spin_unlock(&mm->page_table_lock);
+ update_mmu_cache(vma, addr, new);
out:
+ pte_unmap(page_table);
+ mm_unlock_page_table(mm);
return VM_FAULT_MINOR;
no_mem:
return VM_FAULT_OOM;
@@ -1492,27 +1633,29 @@ no_mem:
* spinlock held. Exit with the spinlock released.
*/
static int
-do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
+do_no_page(struct pte_modify *pmod, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ int write_access, pte_t *page_table, pmd_t *pmd, pte_t pte)
{
+ pte_t new;
struct page * new_page;
struct address_space *mapping = NULL;
- pte_t entry;
int sequence = 0;
int ret = VM_FAULT_MINOR;
int anon = 0;
if (!vma->vm_ops || !vma->vm_ops->nopage)
- return do_anonymous_page(mm, vma, page_table,
- pmd, write_access, address);
+ return do_anonymous_page(pmod, mm, vma, address,
+ write_access, page_table, pmd);
+
+	ptep_abort(pmod, mm, page_table);
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
sequence = atomic_read(&mapping->truncate_count);
}
- smp_rmb(); /* Prevent CPU from reordering lock-free ->nopage() */
retry:
new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
@@ -1539,20 +1682,32 @@ retry:
anon = 1;
}
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
+ /* XXX: investigate this further WRT lockless page table issues. */
/*
* For a file-backed vma, someone could have truncated or otherwise
* invalidated this page. If unmap_mapping_range got called,
* retry getting the page.
*/
- if (mapping &&
- (unlikely(sequence != atomic_read(&mapping->truncate_count)))) {
- sequence = atomic_read(&mapping->truncate_count);
- spin_unlock(&mm->page_table_lock);
- page_cache_release(new_page);
- goto retry;
+ if (mapping) {
+ smp_rmb(); /* For truncate_count */
+ if (unlikely(sequence !=
+ atomic_read(&mapping->truncate_count))) {
+ sequence = atomic_read(&mapping->truncate_count);
+ mm_unlock_page_table(mm);
+ page_cache_release(new_page);
+ goto retry;
+ }
}
page_table = pte_offset_map(pmd, address);
+ new = ptep_begin_modify(pmod, mm, page_table);
+
+ /* Only go through if we didn't race with anybody else... */
+ if (unlikely(!pte_none(new))) {
+ /* One of our sibling threads was faster, back out. */
+		ptep_abort(pmod, mm, page_table);
+ goto out_failed;
+ }
/*
* This silly early PAGE_DIRTY setting removes a race
@@ -1564,34 +1719,39 @@ retry:
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
- /* Only go through if we didn't race with anybody else... */
- if (pte_none(*page_table)) {
- if (!PageReserved(new_page))
- ++mm->rss;
- flush_icache_page(vma, new_page);
- entry = mk_pte(new_page, vma->vm_page_prot);
- if (write_access)
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- set_pte(page_table, entry);
- if (anon) {
- lru_cache_add_active(new_page);
- page_add_anon_rmap(new_page, vma, address);
- } else
- page_add_file_rmap(new_page);
- pte_unmap(page_table);
- } else {
- /* One of our sibling threads was faster, back out. */
- pte_unmap(page_table);
- page_cache_release(new_page);
- spin_unlock(&mm->page_table_lock);
- goto out;
- }
+
+ flush_icache_page(vma, new_page);
+ new = mk_pte(new_page, vma->vm_page_prot);
+ if (write_access)
+ new = maybe_mkwrite(pte_mkdirty(new), vma);
+
+ if (anon) {
+ page_add_anon_rmap(new_page, vma, address);
+ } else
+ page_add_file_rmap(new_page);
+
+ if (ptep_commit(pmod, mm, page_table, new)) {
+ page_remove_rmap(new_page);
+ goto out_failed;
+ }
+ if (!PageReserved(new_page))
+ ++mm->rss;
+ if (anon)
+ lru_cache_add_active(new_page);
+
+ pte_unmap(page_table);
/* no need to invalidate: a not-present page shouldn't be cached */
- update_mmu_cache(vma, address, entry);
- spin_unlock(&mm->page_table_lock);
+ update_mmu_cache(vma, address, new);
out:
+ mm_unlock_page_table(mm);
return ret;
+
+out_failed:
+ pte_unmap(page_table);
+ mm_unlock_page_table(mm);
+ page_cache_release(new_page);
+ return VM_FAULT_MINOR;
oom:
page_cache_release(new_page);
ret = VM_FAULT_OOM;
@@ -1603,8 +1763,9 @@ oom:
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
*/
-static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
- unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+static int do_file_page(struct pte_modify *pmod, struct mm_struct * mm,
+ struct vm_area_struct * vma, unsigned long address,
+ int write_access, pte_t *ptep, pmd_t *pmd, pte_t pte)
{
unsigned long pgoff;
int err;
@@ -1616,14 +1777,27 @@ static int do_file_page(struct mm_struct
*/
if (!vma->vm_ops || !vma->vm_ops->populate ||
(write_access && !(vma->vm_flags & VM_SHARED))) {
- pte_clear(pte);
- return do_no_page(mm, vma, address, write_access, pte, pmd);
+ pte_clear(&pte);
+ if (ptep_commit(pmod, mm, ptep, pte)) {
+ pte_unmap(ptep);
+ mm_unlock_page_table(mm);
+ return VM_FAULT_MINOR;
+ }
+ pte = ptep_begin_modify(pmod, mm, ptep);
+ return do_no_page(pmod, mm, vma, address,
+ write_access, ptep, pmd, pte);
}
- pgoff = pte_to_pgoff(*pte);
+ pgoff = pte_to_pgoff(ptep_atomic_read(ptep));
+ /* XXX: is this right? */
+ if (ptep_verify_finish(pmod, mm, ptep)) {
+ pte_unmap(ptep);
+ mm_unlock_page_table(mm);
+ return VM_FAULT_MINOR;
+ }
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap(ptep);
+ mm_unlock_page_table(mm);
err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
if (err == -ENOMEM)
@@ -1642,25 +1816,16 @@ static int do_file_page(struct mm_struct
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * Note the "page_table_lock". It is to protect against kswapd removing
- * pages from under us. Note that kswapd only ever _removes_ pages, never
- * adds them. As such, once we have noticed that the page is not present,
- * we can drop the lock early.
- *
- * The adding of pages is protected by the MM semaphore (which we hold),
- * so we don't need to worry about a page being suddenly been added into
- * our VM.
- *
- * We enter with the pagetable spinlock held, we are supposed to
- * release it when done.
+ * We enter with the page table locked, and exit with it unlocked.
*/
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct * vma, unsigned long address,
int write_access, pte_t *pte, pmd_t *pmd)
{
+ struct pte_modify pmod;
pte_t entry;
- entry = *pte;
+ entry = ptep_begin_modify(&pmod, mm, pte);
if (!pte_present(entry)) {
/*
* If it truly wasn't present, we know that kswapd
@@ -1668,28 +1833,37 @@ static inline int handle_pte_fault(struc
* drop the lock.
*/
if (pte_none(entry))
- return do_no_page(mm, vma, address, write_access, pte, pmd);
+ return do_no_page(&pmod, mm, vma, address,
+ write_access, pte, pmd, entry);
if (pte_file(entry))
- return do_file_page(mm, vma, address, write_access, pte, pmd);
- return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
+ return do_file_page(&pmod, mm, vma, address,
+ write_access, pte, pmd, entry);
+
+ return do_swap_page(&pmod, mm, vma, address,
+ write_access, pte, pmd, entry);
}
if (write_access) {
if (!pte_write(entry))
- return do_wp_page(mm, vma, address, pte, pmd, entry);
+ return do_wp_page(&pmod, mm, vma, address,
+ pte, pmd, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- ptep_set_access_flags(vma, address, pte, entry, write_access);
- update_mmu_cache(vma, address, entry);
+ if (!ptep_commit_access_flush(&pmod, mm, vma, address,
+ pte, entry, write_access)) {
+ /* Success */
+ update_mmu_cache(vma, address, entry);
+ }
+
pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
return VM_FAULT_MINOR;
}
/*
- * By the time we get here, we already hold the mm semaphore
+ * This must be called with mmap_sem held for reading.
*/
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
unsigned long address, int write_access)
@@ -1698,26 +1872,22 @@ int handle_mm_fault(struct mm_struct *mm
pmd_t *pmd;
__set_current_state(TASK_RUNNING);
- pgd = pgd_offset(mm, address);
-
inc_page_state(pgfault);
if (is_vm_hugetlb_page(vma))
return VM_FAULT_SIGBUS; /* mapping truncation does this. */
- /*
- * We need the page table lock to synchronize with kswapd
- * and the SMP-safe atomic PTE updates.
- */
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
+ pgd = pgd_offset(mm, address);
pmd = pmd_alloc(mm, pgd, address);
-
if (pmd) {
pte_t * pte = pte_alloc_map(mm, pmd, address);
if (pte)
- return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
+ return handle_pte_fault(mm, vma, address,
+ write_access, pte, pmd);
}
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
+
return VM_FAULT_OOM;
}
@@ -1734,22 +1904,15 @@ pmd_t fastcall *__pmd_alloc(struct mm_st
{
pmd_t *new;
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
new = pmd_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
if (!new)
return NULL;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pgd_present(*pgd)) {
+ if (pgd_test_and_populate(mm, pgd, new))
pmd_free(new);
- goto out;
- }
- pgd_populate(mm, pgd, new);
-out:
+
return pmd_offset(pgd, address);
}
@@ -1784,7 +1947,8 @@ struct page * vmalloc_to_page(void * vma
pgd_t *pgd = pgd_offset_k(addr);
pmd_t *pmd;
pte_t *ptep, pte;
-
+
+ /* XXX: investigate */
if (!pgd_none(*pgd)) {
pmd = pmd_offset(pgd, addr);
if (!pmd_none(*pmd)) {
diff -puN include/asm-generic/pgtable.h~vm-abstract-pgtable-locking include/asm-generic/pgtable.h
--- linux-2.6/include/asm-generic/pgtable.h~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/include/asm-generic/pgtable.h 2004-10-29 16:40:39.000000000 +1000
@@ -134,4 +134,302 @@ static inline void ptep_mkdirty(pte_t *p
#define pgd_offset_gate(mm, addr) pgd_offset(mm, addr)
#endif
+#ifndef __ASSEMBLY__
+#ifdef __HAVE_ARCH_PTEP_CMPXCHG
+#define mm_lock_page_table(__mm) \
+do { \
+} while (0)
+
+#define mm_unlock_page_table(__mm) \
+do { \
+} while (0)
+
+#define mm_pin_pages(__mm) \
+do { \
+ spin_lock(&__mm->page_table_lock); \
+} while (0)
+
+#define mm_unpin_pages(__mm) \
+do { \
+ spin_unlock(&__mm->page_table_lock); \
+} while (0)
+
+/* mm_lock_page_table doesn't actually take a lock, so this can be 0 */
+#define MM_RELOCK_CHECK 0
+
+struct pte_modify {
+ pte_t oldval;
+};
+
+#ifndef __HAVE_ARCH_PTEP_ATOMIC_READ
+#define ptep_atomic_read(__ptep) \
+({ \
+ *__ptep; \
+})
+#endif
+
+#define ptep_begin_modify(__pmod, __mm, __ptep) \
+({ \
+ (void)__mm; \
+ (__pmod)->oldval = ptep_atomic_read(__ptep); \
+ (__pmod)->oldval; \
+})
+
+#define ptep_abort(__pmod, __mm, __ptep) \
+do {} while (0)
+
+#define ptep_commit(__pmod, __mm, __ptep, __newval) \
+({ \
+ unlikely(ptep_cmpxchg(__ptep, (__pmod)->oldval, __newval)); \
+})
+
+#define ptep_commit_flush(__pmod, __mm, __vma, __address, __ptep, __newval) \
+({ \
+ int ret = ptep_cmpxchg(__ptep, (__pmod)->oldval, __newval); \
+	/* XXX: \
+	 * worthwhile to see if cmpxchg has succeeded before flushing? \
+ * worthwhile to see if pte_val has changed before flushing? \
+ * like so?: \
+ * if (!ret && pte_val((__pmod)->oldval) != pte_val(__newval)) \
+ */ \
+ flush_tlb_page(__vma, __address); \
+ unlikely(ret); \
+})
+
+#define ptep_commit_access_flush(__pmod, __mm, __vma, __address, __ptep, __newval, __dirty) \
+({ \
+ int ret = ptep_cmpxchg(__ptep, (__pmod)->oldval, __newval); \
+ flush_tlb_page(__vma, __address); \
+ unlikely(ret); \
+})
+
+#define ptep_commit_establish_flush(__pmod, __mm, __vma, __address, __ptep, __newval) \
+({ \
+ int ret = ptep_cmpxchg(__ptep, (__pmod)->oldval, __newval); \
+ flush_tlb_page(__vma, __address); \
+ unlikely(ret); \
+})
+
+#define ptep_commit_clear(__pmod, __mm, __ptep, __newval, __oldval) \
+({ \
+ int ret = ptep_cmpxchg(__ptep, (__pmod)->oldval, __newval); \
+ __oldval = (__pmod)->oldval; \
+ unlikely(ret); \
+})
+
+#define ptep_commit_clear_flush(__pmod, __mm, __vma, __address, __ptep, __newval, __oldval) \
+({ \
+ int ret = ptep_cmpxchg(__ptep, (__pmod)->oldval, __newval); \
+ flush_tlb_page(__vma, __address); \
+ __oldval = (__pmod)->oldval; \
+ unlikely(ret); \
+})
+
+#define ptep_commit_clear_flush_young(__pmod, __mm, __vma, __address, __ptep, __young) \
+({ \
+ pte_t oldval = (__pmod)->oldval; \
+ int ret = ptep_cmpxchg(__ptep, oldval, pte_mkold(oldval)); \
+ *__young = pte_young(oldval); \
+ if (likely(!ret) && *__young) \
+ flush_tlb_page(__vma, __address); \
+ unlikely(ret); \
+})
+
+#define ptep_commit_clear_flush_dirty(__pmod, __mm, __vma, __address, __ptep, __dirty) \
+({ \
+ pte_t oldval = (__pmod)->oldval; \
+ int ret = ptep_cmpxchg(__ptep, oldval, pte_mkclean(oldval)); \
+ *__dirty = pte_dirty(oldval); \
+ if (likely(!ret) && *__dirty) \
+ flush_tlb_page(__vma, __address); \
+ unlikely(ret); \
+})
+
+#define ptep_verify(__pmod, __mm, __ptep) \
+({ \
+ /* Prevent writes leaking forward and reads leaking back */ \
+ smp_mb(); \
+ unlikely(pte_val((__pmod)->oldval) != pte_val(ptep_atomic_read(__ptep))); \
+})
+
+#define ptep_verify_finish(__pmod, __mm, __ptep) \
+ ptep_verify(__pmod, __mm, __ptep)
+
+#else /* __HAVE_ARCH_PTEP_CMPXCHG */ /* GENERIC_PTEP_LOCKING follows */
+/* Use the generic mm->page_table_lock serialised scheme */
+/*
+ * XXX: can we make use of this?
+ * At the moment, yes because some code is holding a ptep_begin_modify
+ * transaction across dropping and retaking the mm_lock_page_table (see
+ * mm/memory.c do_??? pagefault routines). A pte cmpxchg system can take
+ * advantage of this (holding the transaction open), but it possibly isn't
+ * exactly clean, and will blow up if ptep_begin_modify takes a lock itself.
+ *
+ * And ptep_begin_modify would probably like to take a lock if an architecture
+ * wants to do per-pte locking (ppc64, maybe).
+ */
+#define MM_RELOCK_CHECK 1
+
+/*
+ * Lock and unlock the pagetable for walking. This guarantees we can safely
+ * walk pgd->pmd->pte, and only that.
+ */
+#define mm_lock_page_table(__mm) \
+do { \
+ spin_lock(&(__mm)->page_table_lock); \
+} while (0)
+
+#define mm_unlock_page_table(__mm) \
+do { \
+ spin_unlock(&(__mm)->page_table_lock); \
+} while (0)
+
+/*
+ * XXX: pin and unpin may be tricky without a page_table_lock.
+ * Use vma locks maybe? Pte page locks? Pte bit?
+ */
+/*
+ * Prevent pages mapped into __mm, __vma from being freed.
+ * Taken inside mm_lock_page_table
+ */
+#define mm_pin_pages(__mm) \
+do { \
+ (void)__mm; \
+} while (0)
+
+#define mm_unpin_pages(__mm) \
+do { \
+ (void)__mm; \
+} while (0)
+
+#define ptep_atomic_read(__ptep) \
+({ \
+ *__ptep; \
+})
+
+/* XXX: will we want pmd/pgd_atomic_read? Yes. (big job) */
+
+/*
+ * A pte modification sequence goes something like this:
+ * struct pte_modify pmod;
+ * pte_t pte;
+ *
+ * mm_lock_page_table(mm);
+ * // walk page table to find ptep
+ * pte = ptep_begin_modify(&pmod, mm, ptep)
+ * if (!pte is valid) {
+ * ptep_abort(&pmod, mm, ptep); // XXX: isn't yet part of the API.
+ * goto out;
+ * }
+ * // modify pte, or make one that we want to install
+ *
+ * if (ptep_commit(&pmod, mm, ptep, pte)) {
+ * // commit failed
+ * goto out;
+ * }
+ *
+ * // At this point, the pte replaced by the commit is guaranteed to be the
+ * // same as the one returned by ptep_begin_modify, although hardware bits
+ * // may have changed. The other ptep_commit_* functions can provide
+ * // protection against hardware bits changing.
+ */
+struct pte_modify {
+};
+
+#define ptep_begin_modify(__pmod, __mm, __ptep) \
+({ \
+ (void)__pmod; \
+ (void)__mm; \
+ ptep_atomic_read(__ptep); \
+})
+
+#define ptep_abort(__pmod, __mm, __ptep) \
+do {} while (0)
+
+#define ptep_commit(__pmod, __mm, __ptep, __newval) \
+({ \
+ set_pte_atomic(__ptep, __newval); \
+ 0; \
+})
+
+#define ptep_commit_flush(__pmod, __mm, __vma, __address, __ptep, __newval) \
+({ \
+ set_pte_atomic(__ptep, __newval); \
+ flush_tlb_page(__vma, __address); \
+ 0; \
+})
+
+#define ptep_commit_access_flush(__pmod, __mm, __vma, __address, __ptep, __newval, __dirty) \
+({ \
+ ptep_set_access_flags(__vma, __address, __ptep, __newval, __dirty); \
+ 0; \
+})
+
+#define ptep_commit_establish_flush(__pmod, __mm, __vma, __address, __ptep, __newval) \
+({ \
+ ptep_establish(__vma, __address, __ptep, __newval); \
+ 0; \
+})
+
+#define ptep_commit_clear(__pmod, __mm, __ptep, __newval, __oldval) \
+({ \
+ __oldval = ptep_get_and_clear(__ptep); \
+ set_pte(__ptep, __newval); \
+ 0; \
+})
+
+#define ptep_commit_clear_flush(__pmod, __mm, __vma, __address, __ptep, __newval, __oldval) \
+({ \
+ __oldval = ptep_clear_flush(__vma, __address, __ptep); \
+ set_pte(__ptep, __newval); \
+ 0; \
+})
+
+#define ptep_commit_clear_flush_young(__pmod, __mm, __vma, __address, __ptep, __young) \
+({ \
+ *__young = ptep_clear_flush_young(__vma, __address, __ptep); \
+ 0; \
+})
+
+#define ptep_commit_clear_flush_dirty(__pmod, __mm, __vma, __address, __ptep, __dirty) \
+({ \
+ *__dirty = ptep_clear_flush_dirty(__vma, __address, __ptep); \
+ 0; \
+})
+
+#define ptep_verify(__pmod, __mm, __ptep) \
+({ \
+ (void)__pmod; \
+ 0; \
+})
+
+#define ptep_verify_finish(__pmod, __mm, __ptep) \
+ ptep_verify(__pmod, __mm, __ptep)
+
+#define pgd_test_and_populate(__mm, ___pgd, ___pmd) \
+({ \
+ int ret = pgd_present(*(___pgd)); \
+ if (likely(!ret)) \
+ pgd_populate(__mm, ___pgd, ___pmd); \
+ unlikely(ret); \
+})
+
+#define pmd_test_and_populate(__mm, ___pmd, ___page) \
+({ \
+ int ret = pmd_present(*(___pmd)); \
+ if (likely(!ret)) \
+ pmd_populate(__mm, ___pmd, ___page); \
+ unlikely(ret); \
+})
+
+#define pmd_test_and_populate_kernel(__mm, ___pmd, ___page) \
+({ \
+ int ret = pmd_present(*(___pmd)); \
+ if (likely(!ret)) \
+ pmd_populate_kernel(__mm, ___pmd, ___page); \
+ unlikely(ret); \
+})
+
+#endif /* GENERIC_PTEP_LOCKING */
+#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_GENERIC_PGTABLE_H */
diff -puN kernel/fork.c~vm-abstract-pgtable-locking kernel/fork.c
--- linux-2.6/kernel/fork.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/kernel/fork.c 2004-10-29 16:28:08.000000000 +1000
@@ -227,7 +227,6 @@ static inline int dup_mmap(struct mm_str
* link in first so that swapoff can see swap entries,
* and try_to_unmap_one's find_vma find the new vma.
*/
- spin_lock(&mm->page_table_lock);
*pprev = tmp;
pprev = &tmp->vm_next;
@@ -237,7 +236,6 @@ static inline int dup_mmap(struct mm_str
mm->map_count++;
retval = copy_page_range(mm, current->mm, tmp);
- spin_unlock(&mm->page_table_lock);
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
@@ -446,7 +444,15 @@ static int copy_mm(unsigned long clone_f
* allows optimizing out ipis; the tlb_gather_mmu code
* is an example.
*/
+ /*
+ * XXX: I think this is only needed for sparc64's tlb and
+ * context switching code - but sparc64 is in big trouble
+	 * now because tlb_gather_mmu can be done without
+	 * holding the page table lock anyway.
+ */
+#if 0
spin_unlock_wait(&oldmm->page_table_lock);
+#endif
goto good_mm;
}
diff -puN kernel/futex.c~vm-abstract-pgtable-locking kernel/futex.c
--- linux-2.6/kernel/futex.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/kernel/futex.c 2004-10-29 16:28:08.000000000 +1000
@@ -204,15 +204,13 @@ static int get_futex_key(unsigned long u
/*
* Do a quick atomic lookup first - this is the fastpath.
*/
- spin_lock(¤t->mm->page_table_lock);
page = follow_page(mm, uaddr, 0);
if (likely(page != NULL)) {
key->shared.pgoff =
page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- spin_unlock(¤t->mm->page_table_lock);
+ follow_page_finish(mm, uaddr);
return 0;
}
- spin_unlock(¤t->mm->page_table_lock);
/*
* Do it the general way.
@@ -505,7 +503,7 @@ static int futex_wait(unsigned long uadd
/*
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
- */
+ */
up_read(¤t->mm->mmap_sem);
/*
@@ -520,6 +518,7 @@ static int futex_wait(unsigned long uadd
/* add_wait_queue is the barrier after __set_current_state. */
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&q.waiters, &wait);
+
/*
* !list_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
diff -puN include/linux/mm.h~vm-abstract-pgtable-locking include/linux/mm.h
--- linux-2.6/include/linux/mm.h~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/include/linux/mm.h 2004-10-29 16:28:08.000000000 +1000
@@ -758,6 +758,7 @@ extern struct vm_area_struct *find_exten
extern struct page * vmalloc_to_page(void *addr);
extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
int write);
+extern void follow_page_finish(struct mm_struct *mm, unsigned long address);
int remap_pfn_range(struct vm_area_struct *, unsigned long,
unsigned long, unsigned long, pgprot_t);
diff -puN include/asm-generic/tlb.h~vm-abstract-pgtable-locking include/asm-generic/tlb.h
--- linux-2.6/include/asm-generic/tlb.h~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/include/asm-generic/tlb.h 2004-10-29 16:28:08.000000000 +1000
@@ -53,7 +53,13 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_g
static inline struct mmu_gather *
tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
{
- struct mmu_gather *tlb = &per_cpu(mmu_gathers, smp_processor_id());
+ /*
+ * XXX: Now calling this without the page_table_lock!
+ * This will blow up at least sparc64 (see sparc64's switch_mm
+	 * and kernel/fork.c:copy_mm for more details).
+ */
+ int cpu = get_cpu();
+ struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu);
tlb->mm = mm;
@@ -97,6 +103,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
/* keep the page table cache within bounds */
check_pgt_cache();
+ put_cpu();
}
static inline unsigned int
diff -puN mm/mmap.c~vm-abstract-pgtable-locking mm/mmap.c
--- linux-2.6/mm/mmap.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/mmap.c 2004-10-29 16:28:08.000000000 +1000
@@ -1575,14 +1575,12 @@ static void free_dangling_pgtables_regio
{
struct mmu_gather *tlb;
- spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
if (is_hugepage_only_range(start, end - start))
hugetlb_free_pgtables(tlb, prev, start, end);
else
free_pgtables(tlb, prev, start, end);
tlb_finish_mmu(tlb, start, end);
- spin_unlock(&mm->page_table_lock);
}
/*
@@ -1866,11 +1864,9 @@ void exit_mmap(struct mm_struct *mm)
* Finally, free the pagetables. By this point, nothing should
* refer to them.
*/
- spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 1);
clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
- spin_unlock(&mm->page_table_lock);
}
/* Insert vm structure into process list sorted by address
diff -puN mm/rmap.c~vm-abstract-pgtable-locking mm/rmap.c
--- linux-2.6/mm/rmap.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/rmap.c 2004-10-29 16:28:08.000000000 +1000
@@ -32,7 +32,7 @@
* page->flags PG_locked (lock_page)
* mapping->i_mmap_lock
* anon_vma->lock
- * mm->page_table_lock
+ * mm_lock_page_table(mm)
* zone->lru_lock (in mark_page_accessed)
* swap_list_lock (in swap_free etc's swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
@@ -101,7 +101,11 @@ int anon_vma_prepare(struct vm_area_stru
locked = NULL;
}
- /* page_table_lock to protect against threads */
+ /* protect against threads */
+ /*
+ * XXX: this only needs to serialise against itself.
+ * Perhaps we should rename the page table lock at some point.
+ */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
@@ -256,6 +260,8 @@ unsigned long page_address_in_vma(struct
static int page_referenced_one(struct page *page,
struct vm_area_struct *vma, unsigned int *mapcount)
{
+ struct pte_modify pmod;
+ pte_t new;
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pgd_t *pgd;
@@ -269,7 +275,7 @@ static int page_referenced_one(struct pa
if (address == -EFAULT)
goto out;
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -280,14 +286,19 @@ static int page_referenced_one(struct pa
goto out_unlock;
pte = pte_offset_map(pmd, address);
- if (!pte_present(*pte))
- goto out_unmap;
-
- if (page_to_pfn(page) != pte_pfn(*pte))
- goto out_unmap;
+ new = ptep_begin_modify(&pmod, mm, pte);
+ if (!pte_present(new))
+ goto out_abort;
+
+ /*
+ * This doesn't need mm_pin_pages, because the anonvma locks
+ * serialise against try_to_unmap.
+ */
+ if (page_to_pfn(page) != pte_pfn(new))
+ goto out_abort;
- if (ptep_clear_flush_young(vma, address, pte))
- referenced++;
+ /* Doesn't matter much if this fails */
+ ptep_commit_clear_flush_young(&pmod, mm, vma, address, pte, &referenced);
if (mm != current->mm && has_swap_token(mm))
referenced++;
@@ -297,9 +308,13 @@ static int page_referenced_one(struct pa
out_unmap:
pte_unmap(pte);
out_unlock:
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
out:
return referenced;
+
+out_abort:
+ ptep_abort(&pmod, mm, pte);
+ goto out_unmap;
}
static int page_referenced_anon(struct page *page)
@@ -420,8 +435,6 @@ int page_referenced(struct page *page, i
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
- *
- * The caller needs to hold the mm->page_table_lock.
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
@@ -448,8 +461,6 @@ void page_add_anon_rmap(struct page *pag
/**
* page_add_file_rmap - add pte mapping to a file page
* @page: the page to add the mapping to
- *
- * The caller needs to hold the mm->page_table_lock.
*/
void page_add_file_rmap(struct page *page)
{
@@ -464,8 +475,6 @@ void page_add_file_rmap(struct page *pag
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
- *
- * Caller needs to hold the mm->page_table_lock.
*/
void page_remove_rmap(struct page *page)
{
@@ -494,12 +503,14 @@ void page_remove_rmap(struct page *page)
*/
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
{
+ struct pte_modify pmod;
+ swp_entry_t entry;
+ pte_t new, old;
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
- pte_t pteval;
int ret = SWAP_AGAIN;
if (!mm->rss)
@@ -509,10 +520,10 @@ static int try_to_unmap_one(struct page
goto out;
/*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
+ * We need to lock the page table to protect from page faults,
+ * munmap, fork, exit, etc...
*/
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -523,27 +534,37 @@ static int try_to_unmap_one(struct page
goto out_unlock;
pte = pte_offset_map(pmd, address);
- if (!pte_present(*pte))
- goto out_unmap;
+ new = ptep_begin_modify(&pmod, mm, pte);
+ if (!pte_present(new))
+ goto out_abort;
- if (page_to_pfn(page) != pte_pfn(*pte))
- goto out_unmap;
+ /*
+ * XXX: don't need to pin pages here because anonvma locking means
+ * this page can't come out from underneath us (ie. we serialise
+ * with other try_to_unmap's).
+ */
+ if (page_to_pfn(page) != pte_pfn(new))
+ goto out_abort;
/*
* If the page is mlock()d, we cannot swap it out.
* If it's recently referenced (perhaps page_referenced
* skipped over this mm) then we should reactivate it.
*/
- if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
- ptep_clear_flush_young(vma, address, pte)) {
+ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) {
ret = SWAP_FAIL;
- goto out_unmap;
+ goto out_abort;
+ }
+
+ if (pte_young(new)) {
+ ret = SWAP_AGAIN;
+ goto out_abort;
}
/*
* Don't pull an anonymous page out from under get_user_pages.
- * GUP carefully breaks COW and raises page count (while holding
- * page_table_lock, as we have here) to make sure that the page
+ * GUP carefully breaks COW and raises page count (while the page
+ * table is locked, as we have here) to make sure that the page
* cannot be freed. If we unmap that page here, a user write
* access to the virtual address will bring back the page, but
* its raised count will (ironically) be taken to mean it's not
@@ -555,22 +576,27 @@ static int try_to_unmap_one(struct page
* to drop page lock: its reference to the page stops existing
* ptes from being unmapped, so swapoff can make progress.
*/
+ /*
+ * XXX: this should be ok, as GUP is doing atomic checking...?
+ * Well maybe not because neither are serialised. But hmm, GUP
+ * and friends need to pin pages anyway, so it may be that these
+ * paths will actually get serialised even without the page table
+ * lock.
+ */
+ /* XXX: Should this be enough? (Obviously a finer lock would be nice) */
+ mm_pin_pages(mm);
if (PageSwapCache(page) &&
page_count(page) != page_mapcount(page) + 2) {
- ret = SWAP_FAIL;
- goto out_unmap;
+ mm_unpin_pages(mm);
+ ret = SWAP_AGAIN;
+ goto out_abort;
}
/* Nuke the page table entry. */
flush_cache_page(vma, address);
- pteval = ptep_clear_flush(vma, address, pte);
-
- /* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
- set_page_dirty(page);
-
+ pte_clear(&new);
if (PageAnon(page)) {
- swp_entry_t entry = { .val = page->private };
+ entry.val = page->private;
/*
* Store the swap location in the pte.
* See handle_pte_fault() ...
@@ -582,9 +608,22 @@ static int try_to_unmap_one(struct page
list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
- set_pte(pte, swp_entry_to_pte(entry));
- BUG_ON(pte_file(*pte));
+ new = swp_entry_to_pte(entry);
+ BUG_ON(pte_file(new));
+ }
+
+ if (ptep_commit_clear_flush(&pmod, mm, vma, address, pte, new, old)) {
+ ret = SWAP_AGAIN;
+ mm_unpin_pages(mm);
+ if (PageAnon(page))
+ free_swap_and_cache(entry);
+ goto out_unmap;
}
+ mm_unpin_pages(mm);
+
+ /* Move the dirty bit to the physical page now the pte is gone. */
+ if (pte_dirty(old))
+ set_page_dirty(page);
mm->rss--;
page_remove_rmap(page);
@@ -593,9 +632,13 @@ static int try_to_unmap_one(struct page
out_unmap:
pte_unmap(pte);
out_unlock:
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
out:
return ret;
+
+out_abort:
+ ptep_abort(&pmod, mm, pte);
+ goto out_unmap;
}
/*
@@ -627,18 +670,11 @@ static void try_to_unmap_cluster(unsigne
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
- pte_t pteval;
struct page *page;
unsigned long address;
unsigned long end;
unsigned long pfn;
- /*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
- */
- spin_lock(&mm->page_table_lock);
-
address = (vma->vm_start + cursor) & CLUSTER_MASK;
end = address + CLUSTER_SIZE;
if (address < vma->vm_start)
@@ -646,6 +682,12 @@ static void try_to_unmap_cluster(unsigne
if (end > vma->vm_end)
end = vma->vm_end;
+ /*
+ * We need to lock the page table to protect from page faults,
+ * munmap, fork, exit, etc...
+ */
+ mm_lock_page_table(mm);
+
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
goto out_unlock;
@@ -656,44 +698,57 @@ static void try_to_unmap_cluster(unsigne
for (pte = pte_offset_map(pmd, address);
address < end; pte++, address += PAGE_SIZE) {
+ struct pte_modify pmod;
+ pte_t new, old;
- if (!pte_present(*pte))
- continue;
+again:
+ new = ptep_begin_modify(&pmod, mm, pte);
+
+ if (!pte_present(new))
+ goto out_abort;
- pfn = pte_pfn(*pte);
+ pfn = pte_pfn(new);
if (!pfn_valid(pfn))
- continue;
+ goto out_abort;
page = pfn_to_page(pfn);
BUG_ON(PageAnon(page));
if (PageReserved(page))
- continue;
+ goto out_abort;
- if (ptep_clear_flush_young(vma, address, pte))
- continue;
+ if (pte_young(new))
+ goto out_abort;
/* Nuke the page table entry. */
flush_cache_page(vma, address);
- pteval = ptep_clear_flush(vma, address, pte);
+ pte_clear(&new);
/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
- set_pte(pte, pgoff_to_pte(page->index));
+ new = pgoff_to_pte(page->index);
+
+ if (ptep_commit_clear_flush(&pmod, mm, vma, address, pte, new, old))
+ goto again;
+ flush_tlb_page(vma, address);
/* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
+ if (pte_dirty(old))
set_page_dirty(page);
page_remove_rmap(page);
page_cache_release(page);
mm->rss--;
(*mapcount)--;
+
+ continue;
+out_abort:
+ ptep_abort(&pmod, mm, pte);
}
pte_unmap(pte);
out_unlock:
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
}
static int try_to_unmap_anon(struct page *page)
diff -puN mm/mremap.c~vm-abstract-pgtable-locking mm/mremap.c
--- linux-2.6/mm/mremap.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/mremap.c 2004-10-29 16:28:08.000000000 +1000
@@ -99,7 +99,7 @@ move_one_page(struct vm_area_struct *vma
mapping = vma->vm_file->f_mapping;
spin_lock(&mapping->i_mmap_lock);
}
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
src = get_one_pte_map_nested(mm, old_addr);
if (src) {
@@ -115,21 +115,28 @@ move_one_page(struct vm_area_struct *vma
spin_unlock(&mapping->i_mmap_lock);
dst = alloc_one_pte_map(mm, new_addr);
if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
spin_lock(&mapping->i_mmap_lock);
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
}
src = get_one_pte_map_nested(mm, old_addr);
}
+
/*
- * Since alloc_one_pte_map can drop and re-acquire
- * page_table_lock, we should re-check the src entry...
+ * Since alloc_one_pte_map can drop and re-lock the
+ * page table, we should re-check the src entry...
*/
if (src) {
if (dst) {
- pte_t pte;
- pte = ptep_clear_flush(vma, old_addr, src);
- set_pte(dst, pte);
+ struct pte_modify pmod;
+ pte_t new, old;
+again:
+ new = ptep_begin_modify(&pmod, mm, src);
+ pte_clear(&new);
+ if (ptep_commit_clear_flush(&pmod, mm, vma,
+ old_addr, src, new, old))
+ goto again;
+ set_pte(dst, old);
} else
error = -ENOMEM;
pte_unmap_nested(src);
@@ -137,7 +144,7 @@ move_one_page(struct vm_area_struct *vma
if (dst)
pte_unmap(dst);
}
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
return error;
diff -puN mm/msync.c~vm-abstract-pgtable-locking mm/msync.c
--- linux-2.6/mm/msync.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/msync.c 2004-10-29 16:28:08.000000000 +1000
@@ -18,27 +18,40 @@
#include <asm/tlbflush.h>
/*
- * Called with mm->page_table_lock held to protect against other
+ * Called with the page table locked to protect against other
* threads/the swapper from ripping pte's out from under us.
*/
-static int filemap_sync_pte(pte_t *ptep, struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
-{
- pte_t pte = *ptep;
- unsigned long pfn = pte_pfn(pte);
+static int filemap_sync_pte(struct mm_struct *mm, pte_t *ptep,
+ struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags)
+{
+ struct pte_modify pmod;
+ pte_t new;
+ unsigned long pfn;
struct page *page;
+ int dirty;
+
+again:
+ new = ptep_begin_modify(&pmod, mm, ptep);
- if (pte_present(pte) && pfn_valid(pfn)) {
+ pfn = pte_pfn(new);
+ if (pte_present(new) && pfn_valid(pfn)) {
page = pfn_to_page(pfn);
- if (!PageReserved(page) &&
- (ptep_clear_flush_dirty(vma, address, ptep) ||
- page_test_and_clear_dirty(page)))
- set_page_dirty(page);
+ if (!PageReserved(page)) {
+ new = pte_mkclean(new);
+ if (ptep_commit_clear_flush_dirty(&pmod, mm, vma, address, ptep, &dirty))
+ goto again;
+ if (dirty || page_test_and_clear_dirty(page))
+ set_page_dirty(page);
+ goto out;
+ }
}
+ ptep_abort(&pmod, mm, ptep);
+out:
return 0;
}
-static int filemap_sync_pte_range(pmd_t * pmd,
+static int filemap_sync_pte_range(struct mm_struct *mm, pmd_t * pmd,
unsigned long address, unsigned long end,
struct vm_area_struct *vma, unsigned int flags)
{
@@ -52,22 +65,25 @@ static int filemap_sync_pte_range(pmd_t
pmd_clear(pmd);
return 0;
}
+
+ mm_pin_pages(mm); /* Required for filemap_sync_pte */
pte = pte_offset_map(pmd, address);
if ((address & PMD_MASK) != (end & PMD_MASK))
end = (address & PMD_MASK) + PMD_SIZE;
error = 0;
do {
- error |= filemap_sync_pte(pte, vma, address, flags);
+ error |= filemap_sync_pte(mm, pte, vma, address, flags);
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
pte_unmap(pte - 1);
+ mm_unpin_pages(mm);
return error;
}
-static inline int filemap_sync_pmd_range(pgd_t * pgd,
+static inline int filemap_sync_pmd_range(struct mm_struct *mm, pgd_t * pgd,
unsigned long address, unsigned long end,
struct vm_area_struct *vma, unsigned int flags)
{
@@ -86,7 +102,7 @@ static inline int filemap_sync_pmd_range
end = (address & PGDIR_MASK) + PGDIR_SIZE;
error = 0;
do {
- error |= filemap_sync_pte_range(pmd, address, end, vma, flags);
+ error |= filemap_sync_pte_range(mm, pmd, address, end, vma, flags);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address && (address < end));
@@ -103,7 +119,7 @@ static int filemap_sync(struct vm_area_s
/* Aquire the lock early; it may be possible to avoid dropping
* and reaquiring it repeatedly.
*/
- spin_lock(&vma->vm_mm->page_table_lock);
+ mm_lock_page_table(vma->vm_mm);
dir = pgd_offset(vma->vm_mm, address);
flush_cache_range(vma, address, end);
@@ -117,7 +133,7 @@ static int filemap_sync(struct vm_area_s
if (address >= end)
BUG();
do {
- error |= filemap_sync_pmd_range(dir, address, end, vma, flags);
+ error |= filemap_sync_pmd_range(vma->vm_mm, dir, address, end, vma, flags);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
} while (address && (address < end));
@@ -127,7 +143,7 @@ static int filemap_sync(struct vm_area_s
*/
flush_tlb_range(vma, end - size, end);
out:
- spin_unlock(&vma->vm_mm->page_table_lock);
+ mm_unlock_page_table(vma->vm_mm);
return error;
}
diff -puN mm/mprotect.c~vm-abstract-pgtable-locking mm/mprotect.c
--- linux-2.6/mm/mprotect.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/mprotect.c 2004-10-29 16:28:08.000000000 +1000
@@ -26,7 +26,7 @@
#include <asm/tlbflush.h>
static inline void
-change_pte_range(pmd_t *pmd, unsigned long address,
+change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long address,
unsigned long size, pgprot_t newprot)
{
pte_t * pte;
@@ -45,16 +45,21 @@ change_pte_range(pmd_t *pmd, unsigned lo
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- if (pte_present(*pte)) {
- pte_t entry;
-
+ struct pte_modify pmod;
+ pte_t new, old;
+again:
+ new = ptep_begin_modify(&pmod, mm, pte);
+ if (pte_present(new)) {
/* Avoid an SMP race with hardware updated dirty/clean
* bits by wiping the pte and then setting the new pte
* into place.
*/
- entry = ptep_get_and_clear(pte);
- set_pte(pte, pte_modify(entry, newprot));
- }
+ new = pte_modify(new, newprot);
+ if (ptep_commit_clear(&pmod, mm, pte, new, old))
+ goto again;
+ } else
+ ptep_abort(&pmod, mm, pte);
+
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
@@ -62,7 +67,7 @@ change_pte_range(pmd_t *pmd, unsigned lo
}
static inline void
-change_pmd_range(pgd_t *pgd, unsigned long address,
+change_pmd_range(struct mm_struct *mm, pgd_t *pgd, unsigned long address,
unsigned long size, pgprot_t newprot)
{
pmd_t * pmd;
@@ -81,7 +86,7 @@ change_pmd_range(pgd_t *pgd, unsigned lo
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
do {
- change_pte_range(pmd, address, end - address, newprot);
+ change_pte_range(mm, pmd, address, end - address, newprot);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address && (address < end));
@@ -93,19 +98,20 @@ change_protection(struct vm_area_struct
{
pgd_t *dir;
unsigned long beg = start;
+ struct mm_struct *mm = current->mm;
dir = pgd_offset(current->mm, start);
flush_cache_range(vma, beg, end);
if (start >= end)
BUG();
- spin_lock(&current->mm->page_table_lock);
+ mm_lock_page_table(mm);
do {
- change_pmd_range(dir, start, end - start, newprot);
+ change_pmd_range(mm, dir, start, end - start, newprot);
start = (start + PGDIR_SIZE) & PGDIR_MASK;
dir++;
} while (start && (start < end));
flush_tlb_range(vma, beg, end);
- spin_unlock(&current->mm->page_table_lock);
+ mm_unlock_page_table(mm);
return;
}
diff -puN mm/swap_state.c~vm-abstract-pgtable-locking mm/swap_state.c
--- linux-2.6/mm/swap_state.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/swap_state.c 2004-10-29 16:28:08.000000000 +1000
@@ -273,7 +273,7 @@ static inline void free_swap_cache(struc
/*
* Perform a free_page(), also freeing any swap cache associated with
* this page if it is the last user of the page. Can not do a lock_page,
- * as we are holding the page_table_lock spinlock.
+ * as the page table is locked.
*/
void free_page_and_swap_cache(struct page *page)
{
diff -puN fs/exec.c~vm-abstract-pgtable-locking fs/exec.c
--- linux-2.6/fs/exec.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/fs/exec.c 2004-10-29 16:28:08.000000000 +1000
@@ -298,10 +298,12 @@ EXPORT_SYMBOL(copy_strings_kernel);
void install_arg_page(struct vm_area_struct *vma,
struct page *page, unsigned long address)
{
+ struct pte_modify pmod;
struct mm_struct *mm = vma->vm_mm;
pgd_t * pgd;
pmd_t * pmd;
pte_t * pte;
+ pte_t new;
if (unlikely(anon_vma_prepare(vma)))
goto out_sig;
@@ -309,29 +311,35 @@ void install_arg_page(struct vm_area_str
flush_dcache_page(page);
pgd = pgd_offset(mm, address);
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
pmd = pmd_alloc(mm, pgd, address);
if (!pmd)
goto out;
pte = pte_alloc_map(mm, pmd, address);
if (!pte)
goto out;
- if (!pte_none(*pte)) {
+again:
+ new = ptep_begin_modify(&pmod, mm, pte);
+ if (!pte_none(new)) {
+ ptep_abort(&pmod, mm, pte);
pte_unmap(pte);
goto out;
}
+ new = pte_mkdirty(pte_mkwrite(mk_pte(page, vma->vm_page_prot)));
+ page_add_anon_rmap(page, vma, address);
+ if (ptep_commit(&pmod, mm, pte, new)) {
+ page_remove_rmap(page);
+ goto again;
+ }
mm->rss++;
lru_cache_add_active(page);
- set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(
- page, vma->vm_page_prot))));
- page_add_anon_rmap(page, vma, address);
pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
/* no need for flush_tlb */
return;
out:
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
out_sig:
__free_page(page);
force_sig(SIGKILL, current);
diff -puN arch/i386/kernel/vm86.c~vm-abstract-pgtable-locking arch/i386/kernel/vm86.c
--- linux-2.6/arch/i386/kernel/vm86.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/arch/i386/kernel/vm86.c 2004-10-29 16:28:08.000000000 +1000
@@ -136,13 +136,13 @@ struct pt_regs * fastcall save_v86_state
static void mark_screen_rdonly(struct task_struct * tsk)
{
+ struct mm_struct *mm = tsk->mm;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte, *mapped;
int i;
- preempt_disable();
- spin_lock(&tsk->mm->page_table_lock);
+ mm_lock_page_table(mm);
pgd = pgd_offset(tsk->mm, 0xA0000);
if (pgd_none(*pgd))
goto out;
@@ -161,14 +161,21 @@ static void mark_screen_rdonly(struct ta
}
pte = mapped = pte_offset_map(pmd, 0xA0000);
for (i = 0; i < 32; i++) {
- if (pte_present(*pte))
- set_pte(pte, pte_wrprotect(*pte));
+ struct pte_modify pmod;
+ pte_t new;
+again:
+ new = ptep_begin_modify(&pmod, mm, pte);
+ if (pte_present(new)) {
+ new = pte_wrprotect(new);
+ if (ptep_commit(&pmod, mm, pte, new))
+ goto again;
+ } else
+ ptep_abort(&pmod, mm, pte);
pte++;
}
pte_unmap(mapped);
out:
- spin_unlock(&tsk->mm->page_table_lock);
- preempt_enable();
+ mm_unlock_page_table(mm);
flush_tlb();
}
diff -puN arch/i386/mm/hugetlbpage.c~vm-abstract-pgtable-locking arch/i386/mm/hugetlbpage.c
--- linux-2.6/arch/i386/mm/hugetlbpage.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/arch/i386/mm/hugetlbpage.c 2004-10-29 16:28:08.000000000 +1000
@@ -40,6 +40,7 @@ static pte_t *huge_pte_offset(struct mm_
static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access)
{
+ struct pte_modify pmod;
pte_t entry;
mm->rss += (HPAGE_SIZE / PAGE_SIZE);
@@ -50,7 +51,11 @@ static void set_huge_pte(struct mm_struc
entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
entry = pte_mkyoung(entry);
mk_pte_huge(entry);
- set_pte(page_table, entry);
+
+ /* XXX: ... */
+ do {
+ ptep_begin_modify(&pmod, mm, page_table);
+ } while (ptep_commit(&pmod, mm, page_table, entry));
}
/*
@@ -231,7 +236,7 @@ int hugetlb_prefault(struct address_spac
BUG_ON(vma->vm_start & ~HPAGE_MASK);
BUG_ON(vma->vm_end & ~HPAGE_MASK);
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
unsigned long idx;
pte_t *pte = huge_pte_alloc(mm, addr);
@@ -279,7 +284,7 @@ int hugetlb_prefault(struct address_spac
set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
}
out:
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
return ret;
}
diff -puN mm/swapfile.c~vm-abstract-pgtable-locking mm/swapfile.c
--- linux-2.6/mm/swapfile.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/swapfile.c 2004-10-29 16:28:08.000000000 +1000
@@ -426,22 +426,9 @@ void free_swap_and_cache(swp_entry_t ent
* share this swap entry, so be cautious and let do_wp_page work out
* what to do if a write is requested later.
*/
-/* vma->vm_mm->page_table_lock is held */
-static void
-unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
- swp_entry_t entry, struct page *page)
-{
- vma->vm_mm->rss++;
- get_page(page);
- set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
- page_add_anon_rmap(page, vma, address);
- swap_free(entry);
-}
-
-/* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
- unsigned long address, unsigned long size, unsigned long offset,
- swp_entry_t entry, struct page *page)
+static unsigned long unuse_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ pmd_t *dir, unsigned long address, unsigned long size,
+ unsigned long offset, swp_entry_t entry, struct page *page)
{
pte_t * pte;
unsigned long end;
@@ -461,12 +448,26 @@ static unsigned long unuse_pmd(struct vm
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
+ struct pte_modify pmod;
+ pte_t new;
/*
* swapoff spends a _lot_ of time in this loop!
* Test inline before going to call unuse_pte.
*/
- if (unlikely(pte_same(*pte, swp_pte))) {
- unuse_pte(vma, offset + address, pte, entry, page);
+again:
+ new = ptep_begin_modify(&pmod, mm, pte);
+ if (unlikely(pte_same(new, swp_pte))) {
+ get_page(page);
+ new = pte_mkold(mk_pte(page, vma->vm_page_prot));
+ if (ptep_commit(&pmod, mm, pte, new)) {
+ put_page(page);
+ goto again;
+ }
+
+ vma->vm_mm->rss++;
+ page_add_anon_rmap(page, vma, offset + address);
+ swap_free(entry);
+
pte_unmap(pte);
/*
@@ -477,7 +478,9 @@ static unsigned long unuse_pmd(struct vm
/* add 1 since address may be 0 */
return 1 + offset + address;
- }
+ } else
+ ptep_abort(&pmod, mm, pte);
+
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
@@ -485,9 +488,8 @@ static unsigned long unuse_pmd(struct vm
return 0;
}
-/* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
- unsigned long address, unsigned long size,
+static unsigned long unuse_pgd(struct mm_struct *mm, struct vm_area_struct *vma,
+ pgd_t *dir, unsigned long address, unsigned long size,
swp_entry_t entry, struct page *page)
{
pmd_t * pmd;
@@ -510,7 +512,7 @@ static unsigned long unuse_pgd(struct vm
if (address >= end)
BUG();
do {
- foundaddr = unuse_pmd(vma, pmd, address, end - address,
+ foundaddr = unuse_pmd(mm, vma, pmd, address, end - address,
offset, entry, page);
if (foundaddr)
return foundaddr;
@@ -520,9 +522,8 @@ static unsigned long unuse_pgd(struct vm
return 0;
}
-/* vma->vm_mm->page_table_lock is held */
-static unsigned long unuse_vma(struct vm_area_struct * vma,
- swp_entry_t entry, struct page *page)
+static unsigned long unuse_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+ swp_entry_t entry, struct page *page)
{
pgd_t *pgdir;
unsigned long start, end;
@@ -538,15 +539,17 @@ static unsigned long unuse_vma(struct vm
start = vma->vm_start;
end = vma->vm_end;
}
+ mm_lock_page_table(vma->vm_mm);
pgdir = pgd_offset(vma->vm_mm, start);
do {
- foundaddr = unuse_pgd(vma, pgdir, start, end - start,
- entry, page);
+ foundaddr = unuse_pgd(mm, vma, pgdir, start,
+ end - start, entry, page);
if (foundaddr)
return foundaddr;
start = (start + PGDIR_SIZE) & PGDIR_MASK;
pgdir++;
} while (start && (start < end));
+ mm_unlock_page_table(vma->vm_mm);
return 0;
}
@@ -568,15 +571,13 @@ static int unuse_process(struct mm_struc
down_read(&mm->mmap_sem);
lock_page(page);
}
- spin_lock(&mm->page_table_lock);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma) {
- foundaddr = unuse_vma(vma, entry, page);
+ foundaddr = unuse_vma(mm, vma, entry, page);
if (foundaddr)
break;
}
}
- spin_unlock(&mm->page_table_lock);
up_read(&mm->mmap_sem);
/*
* Currently unuse_process cannot fail, but leave error handling
diff -puN mm/vmalloc.c~vm-abstract-pgtable-locking mm/vmalloc.c
--- linux-2.6/mm/vmalloc.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/vmalloc.c 2004-10-29 16:28:08.000000000 +1000
@@ -45,6 +45,7 @@ static void unmap_area_pte(pmd_t *pmd, u
do {
pte_t page;
+ /* XXX: make this use ptep_begin_modify */
page = ptep_get_and_clear(pte);
address += PAGE_SIZE;
pte++;
@@ -57,7 +58,7 @@ static void unmap_area_pte(pmd_t *pmd, u
}
static void unmap_area_pmd(pgd_t *dir, unsigned long address,
- unsigned long size)
+ unsigned long size)
{
unsigned long end;
pmd_t *pmd;
@@ -84,8 +85,7 @@ static void unmap_area_pmd(pgd_t *dir, u
}
static int map_area_pte(pte_t *pte, unsigned long address,
- unsigned long size, pgprot_t prot,
- struct page ***pages)
+ unsigned long size, pgprot_t prot, struct page ***pages)
{
unsigned long end;
@@ -95,13 +95,18 @@ static int map_area_pte(pte_t *pte, unsi
end = PMD_SIZE;
do {
+ struct pte_modify pmod;
+ pte_t new;
struct page *page = **pages;
-
- WARN_ON(!pte_none(*pte));
if (!page)
return -ENOMEM;
- set_pte(pte, mk_pte(page, prot));
+again:
+ new = ptep_begin_modify(&pmod, &init_mm, pte);
+ WARN_ON(!pte_none(new));
+ new = mk_pte(page, prot);
+ if (ptep_commit(&pmod, &init_mm, pte, new))
+ goto again;
address += PAGE_SIZE;
pte++;
(*pages)++;
@@ -110,8 +115,7 @@ static int map_area_pte(pte_t *pte, unsi
}
static int map_area_pmd(pmd_t *pmd, unsigned long address,
- unsigned long size, pgprot_t prot,
- struct page ***pages)
+ unsigned long size, pgprot_t prot, struct page ***pages)
{
unsigned long base, end;
@@ -158,7 +162,7 @@ int map_vm_area(struct vm_struct *area,
int err = 0;
dir = pgd_offset_k(address);
- spin_lock(&init_mm.page_table_lock);
+ mm_lock_page_table(&init_mm);
do {
pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
if (!pmd) {
@@ -174,7 +178,7 @@ int map_vm_area(struct vm_struct *area,
dir++;
} while (address && (address < end));
- spin_unlock(&init_mm.page_table_lock);
+ mm_unlock_page_table(&init_mm);
flush_cache_vmap((unsigned long) area->addr, end);
return err;
}
diff -puN mm/hugetlb.c~vm-abstract-pgtable-locking mm/hugetlb.c
--- linux-2.6/mm/hugetlb.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/hugetlb.c 2004-10-29 16:28:08.000000000 +1000
@@ -253,7 +253,7 @@ void zap_hugepage_range(struct vm_area_s
{
struct mm_struct *mm = vma->vm_mm;
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
unmap_hugepage_range(vma, start, start + length);
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
}
diff -puN mm/fremap.c~vm-abstract-pgtable-locking mm/fremap.c
--- linux-2.6/mm/fremap.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/mm/fremap.c 2004-10-29 16:28:08.000000000 +1000
@@ -23,19 +23,28 @@
static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
- pte_t pte = *ptep;
+ struct pte_modify pmod;
+ pte_t new, old;
- if (pte_none(pte))
+again:
+ new = ptep_begin_modify(&pmod, mm, ptep);
+ if (pte_none(new)) {
+ ptep_abort(&pmod, mm, ptep);
return;
- if (pte_present(pte)) {
- unsigned long pfn = pte_pfn(pte);
+ }
+ if (pte_present(new)) {
+ /* XXX: needs mm_pin_pages */
+ unsigned long pfn = pte_pfn(new);
flush_cache_page(vma, addr);
- pte = ptep_clear_flush(vma, addr, ptep);
+ pte_clear(&new);
+ if (ptep_commit_clear_flush(&pmod, mm, vma, addr,
+ ptep, new, old))
+ goto again;
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
if (!PageReserved(page)) {
- if (pte_dirty(pte))
+ if (pte_dirty(old))
set_page_dirty(page);
page_remove_rmap(page);
page_cache_release(page);
@@ -43,9 +52,12 @@ static inline void zap_pte(struct mm_str
}
}
} else {
- if (!pte_file(pte))
- free_swap_and_cache(pte_to_swp_entry(pte));
- pte_clear(ptep);
+ /* XXX: this will need to be done under a lock. Or maybe
+ * we should clear the pte first?
+ */
+ if (!pte_file(new))
+ free_swap_and_cache(pte_to_swp_entry(new));
+ ptep_abort(&pmod, mm, ptep);
}
}
@@ -65,7 +77,7 @@ int install_page(struct mm_struct *mm, s
pte_t pte_val;
pgd = pgd_offset(mm, addr);
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
pmd = pmd_alloc(mm, pgd, addr);
if (!pmd)
@@ -85,6 +97,10 @@ int install_page(struct mm_struct *mm, s
if (!page->mapping || page->index >= size)
goto err_unlock;
+ /*
+ * XXX: locking becomes probably very broken - all this will now
+ * be non atomic with lockless pagetables. Investigate.
+ */
zap_pte(mm, vma, addr, pte);
mm->rss++;
@@ -97,7 +113,7 @@ int install_page(struct mm_struct *mm, s
err = 0;
err_unlock:
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
return err;
}
EXPORT_SYMBOL(install_page);
@@ -117,7 +133,7 @@ int install_file_pte(struct mm_struct *m
pte_t pte_val;
pgd = pgd_offset(mm, addr);
- spin_lock(&mm->page_table_lock);
+ mm_lock_page_table(mm);
pmd = pmd_alloc(mm, pgd, addr);
if (!pmd)
@@ -133,11 +149,11 @@ int install_file_pte(struct mm_struct *m
pte_val = *pte;
pte_unmap(pte);
update_mmu_cache(vma, addr, pte_val);
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
return 0;
err_unlock:
- spin_unlock(&mm->page_table_lock);
+ mm_unlock_page_table(mm);
return err;
}
diff -puN arch/i386/mm/ioremap.c~vm-abstract-pgtable-locking arch/i386/mm/ioremap.c
--- linux-2.6/arch/i386/mm/ioremap.c~vm-abstract-pgtable-locking 2004-10-29 16:28:08.000000000 +1000
+++ linux-2.6-npiggin/arch/i386/mm/ioremap.c 2004-10-29 16:28:08.000000000 +1000
@@ -17,8 +17,9 @@
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
-static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
- unsigned long phys_addr, unsigned long flags)
+static inline void remap_area_pte(pte_t * pte, unsigned long address,
+ unsigned long size, unsigned long phys_addr,
+ unsigned long flags)
{
unsigned long end;
unsigned long pfn;
@@ -31,12 +32,20 @@ static inline void remap_area_pte(pte_t
BUG();
pfn = phys_addr >> PAGE_SHIFT;
do {
- if (!pte_none(*pte)) {
+ struct pte_modify pmod;
+ pte_t new;
+again:
+ new = ptep_begin_modify(&pmod, &init_mm, pte);
+ if (!pte_none(new)) {
printk("remap_area_pte: page already exists\n");
BUG();
}
- set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW |
- _PAGE_DIRTY | _PAGE_ACCESSED | flags)));
+ new = pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW |
+ _PAGE_DIRTY | _PAGE_ACCESSED | flags));
+ if (ptep_commit(&pmod, &init_mm, pte, new)) {
+ printk("remap_area_pte: ptep_commit raced\n");
+ goto again;
+ }
address += PAGE_SIZE;
pfn++;
pte++;
@@ -78,7 +87,7 @@ static int remap_area_pages(unsigned lon
flush_cache_all();
if (address >= end)
BUG();
- spin_lock(&init_mm.page_table_lock);
+ mm_lock_page_table(&init_mm);
do {
pmd_t *pmd;
pmd = pmd_alloc(&init_mm, dir, address);
@@ -92,7 +101,7 @@ static int remap_area_pages(unsigned lon
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
} while (address && (address < end));
- spin_unlock(&init_mm.page_table_lock);
+ mm_unlock_page_table(&init_mm);
flush_tlb_all();
return error;
}
_
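
For reference, here is a minimal sketch of what a backend for the above API
could look like on an architecture with single-word ptes and a cmpxchg
primitive. This is an illustration only: the function names and calling
convention follow the hunks above (and the pte_modify snapshot idea), but the
bodies are assumptions and are not the asm-generic/pgtable.h code carried by
this patch.

/*
 * Illustration only: one possible single-word-pte backend for the
 * transactional pte update API used throughout this patch.  The bodies
 * below are assumptions for illustration, not the implementation
 * introduced in asm-generic/pgtable.h.
 */
struct pte_modify {
	pte_t orig;			/* snapshot taken at begin_modify */
};

static inline pte_t ptep_begin_modify(struct pte_modify *pmod,
			struct mm_struct *mm, pte_t *ptep)
{
	pmod->orig = *ptep;		/* remember what we read */
	return pmod->orig;
}

static inline void ptep_abort(struct pte_modify *pmod,
			struct mm_struct *mm, pte_t *ptep)
{
	/* nothing was published, so there is nothing to undo */
}

/* Returns non-zero if *ptep changed under us; the caller then retries. */
static inline int ptep_commit(struct pte_modify *pmod,
			struct mm_struct *mm, pte_t *ptep, pte_t entry)
{
	unsigned long old = pte_val(pmod->orig);

	/* Publish the new pte only if nobody changed it since begin_modify. */
	return cmpxchg((unsigned long *)ptep, old, pte_val(entry)) != old;
}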