[RFT][PATCH 1/2] pagefault scalability alternative
From: Hugh Dickins @ 2005-08-22 21:29 UTC
To: Christoph Lameter; +Cc: Nick Piggin, Linus Torvalds, Andrew Morton, linux-mm
First remove Christoph's pagefault scalability patches from 2.6.13-rc6-mm1...
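
For reference, the core of what is being reverted is the lockless pte update: ptep_cmpxchg installs or replaces an entry with a compare-and-exchange instead of under mm->page_table_lock (see the asm-generic/pgtable.h hunk below). Here is a rough user-space sketch of that semantics only - not kernel code, not part of the patch - with pte_t modelled as a plain unsigned long and a GCC atomic builtin standing in for the arch cmpxchg:

/*
 * User-space sketch (not kernel code) of the ptep_cmpxchg idea being
 * removed below: install a new pte value only if the entry still holds
 * the value read earlier, without taking mm->page_table_lock.
 * pte_t is modelled as a plain unsigned long; the real arch helpers differ.
 */
#include <stdbool.h>
#include <stdio.h>

typedef unsigned long pte_t;            /* stand-in for the arch pte type */

static bool ptep_cmpxchg(pte_t *ptep, pte_t oldval, pte_t newval)
{
        /* GCC builtin: swap atomically only if *ptep still equals oldval */
        return __sync_bool_compare_and_swap(ptep, oldval, newval);
}

int main(void)
{
        pte_t pte = 0;                  /* 0 plays the role of pte_none() here */

        if (ptep_cmpxchg(&pte, 0, 0x1234))
                printf("installed pte %#lx without the page_table_lock\n", pte);
        else
                printf("lost the race: another CPU populated the entry first\n");
        return 0;
}

If the cmpxchg fails, the fault path simply assumes another CPU did the work; patch 2/2 keeps a spinlock instead, just a finer-grained one.
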
--- 26136m1/arch/i386/Kconfig 2005-08-19 14:30:02.000000000 +0100
+++ 26136m1-/arch/i386/Kconfig 2005-08-20 16:44:38.000000000 +0100
@@ -909,11 +909,6 @@ config HAVE_DEC_LOCK
depends on (SMP || PREEMPT) && X86_CMPXCHG
default y
-config ATOMIC_TABLE_OPS
- bool
- depends on SMP && X86_CMPXCHG && !X86_PAE
- default y
-
# turning this on wastes a bunch of space.
# Summit needs it only when NUMA is on
config BOOT_IOREMAP
--- 26136m1/arch/ia64/Kconfig 2005-08-19 14:30:02.000000000 +0100
+++ 26136m1-/arch/ia64/Kconfig 2005-08-20 16:44:38.000000000 +0100
@@ -297,11 +297,6 @@ config PREEMPT
source "mm/Kconfig"
-config ATOMIC_TABLE_OPS
- bool
- depends on SMP
- default y
-
config HAVE_DEC_LOCK
bool
depends on (SMP || PREEMPT)
--- 26136m1/arch/x86_64/Kconfig 2005-08-19 14:30:04.000000000 +0100
+++ 26136m1-/arch/x86_64/Kconfig 2005-08-20 16:44:38.000000000 +0100
@@ -221,11 +221,6 @@ config SCHED_SMT
cost of slightly increased overhead in some places. If unsure say
N here.
-config ATOMIC_TABLE_OPS
- bool
- depends on SMP
- default y
-
source "kernel/Kconfig.preempt"
config K8_NUMA
--- 26136m1/include/asm-generic/4level-fixup.h 2005-08-19 14:30:11.000000000 +0100
+++ 26136m1-/include/asm-generic/4level-fixup.h 2005-08-20 16:44:38.000000000 +0100
@@ -26,7 +26,6 @@
#define pud_present(pud) 1
#define pud_ERROR(pud) do { } while (0)
#define pud_clear(pud) pgd_clear(pud)
-#define pud_populate pgd_populate
#undef pud_free_tlb
#define pud_free_tlb(tlb, x) do { } while (0)
--- 26136m1/include/asm-generic/pgtable-nopmd.h 2005-08-19 14:30:11.000000000 +0100
+++ 26136m1-/include/asm-generic/pgtable-nopmd.h 2005-08-20 16:44:38.000000000 +0100
@@ -31,11 +31,6 @@ static inline void pud_clear(pud_t *pud)
#define pmd_ERROR(pmd) (pud_ERROR((pmd).pud))
#define pud_populate(mm, pmd, pte) do { } while (0)
-#define __ARCH_HAVE_PUD_TEST_AND_POPULATE
-static inline int pud_test_and_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
-{
- return 1;
-}
/*
* (pmds are folded into puds so this doesn't get actually called,
--- 26136m1/include/asm-generic/pgtable-nopud.h 2005-08-19 14:30:11.000000000 +0100
+++ 26136m1-/include/asm-generic/pgtable-nopud.h 2005-08-20 16:44:38.000000000 +0100
@@ -27,14 +27,8 @@ static inline int pgd_bad(pgd_t pgd) {
static inline int pgd_present(pgd_t pgd) { return 1; }
static inline void pgd_clear(pgd_t *pgd) { }
#define pud_ERROR(pud) (pgd_ERROR((pud).pgd))
-#define pgd_populate(mm, pgd, pud) do { } while (0)
-
-#define __HAVE_ARCH_PGD_TEST_AND_POPULATE
-static inline int pgd_test_and_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
-{
- return 1;
-}
+#define pgd_populate(mm, pgd, pud) do { } while (0)
/*
* (puds are folded into pgds so this doesn't get actually called,
* but the define is needed for a generic inline function.)
--- 26136m1/include/asm-generic/pgtable.h 2005-08-19 14:30:11.000000000 +0100
+++ 26136m1-/include/asm-generic/pgtable.h 2005-08-20 16:44:38.000000000 +0100
@@ -127,191 +127,6 @@ do { \
})
#endif
-#ifdef CONFIG_ATOMIC_TABLE_OPS
-
-/*
- * The architecture does support atomic table operations.
- * We may be able to provide atomic ptep_xchg and ptep_cmpxchg using
- * cmpxchg and xchg.
- */
-#ifndef __HAVE_ARCH_PTEP_XCHG
-#define ptep_xchg(__mm, __address, __ptep, __pteval) \
- __pte(xchg(&pte_val(*(__ptep)), pte_val(__pteval)))
-#endif
-
-#ifndef __HAVE_ARCH_PTEP_CMPXCHG
-#define ptep_cmpxchg(__mm, __address, __ptep,__oldval,__newval) \
- (cmpxchg(&pte_val(*(__ptep)), \
- pte_val(__oldval), \
- pte_val(__newval) \
- ) == pte_val(__oldval) \
- )
-#endif
-
-#ifndef __HAVE_ARCH_PTEP_XCHG_FLUSH
-#define ptep_xchg_flush(__vma, __address, __ptep, __pteval) \
-({ \
- pte_t __pte = ptep_xchg(__vma, __address, __ptep, __pteval); \
- flush_tlb_page(__vma, __address); \
- __pte; \
-})
-#endif
-
-/*
- * page_table_atomic_start and page_table_atomic_stop may be used to
- * define special measures that an arch needs to guarantee atomic
- * operations outside of a spinlock. In the case that an arch does
- * not support atomic page table operations we will fall back to the
- * page table lock.
- */
-#ifndef __HAVE_ARCH_PAGE_TABLE_ATOMIC_START
-#define page_table_atomic_start(mm) do { } while (0)
-#endif
-
-#ifndef __HAVE_ARCH_PAGE_TABLE_ATOMIC_START
-#define page_table_atomic_stop(mm) do { } while (0)
-#endif
-
-/*
- * Fallback functions for atomic population of higher page table
- * structures. These simply acquire the page_table_lock for
- * synchronization. An architecture may override these generic
- * functions to provide atomic populate functions to make these
- * more effective.
- */
-
-#ifndef __HAVE_ARCH_PGD_TEST_AND_POPULATE
-#define pgd_test_and_populate(__mm, __pgd, __pud) \
-({ \
- int __rc; \
- spin_lock(&mm->page_table_lock); \
- __rc = pgd_none(*(__pgd)); \
- if (__rc) pgd_populate(__mm, __pgd, __pud); \
- spin_unlock(&mm->page_table_lock); \
- __rc; \
-})
-#endif
-
-#ifndef __HAVE_ARCH_PUD_TEST_AND_POPULATE
-#define pud_test_and_populate(__mm, __pud, __pmd) \
-({ \
- int __rc; \
- spin_lock(&mm->page_table_lock); \
- __rc = pud_none(*(__pud)); \
- if (__rc) pud_populate(__mm, __pud, __pmd); \
- spin_unlock(&mm->page_table_lock); \
- __rc; \
-})
-#endif
-
-#ifndef __HAVE_ARCH_PMD_TEST_AND_POPULATE
-#define pmd_test_and_populate(__mm, __pmd, __page) \
-({ \
- int __rc; \
- spin_lock(&mm->page_table_lock); \
- __rc = !pmd_present(*(__pmd)); \
- if (__rc) pmd_populate(__mm, __pmd, __page); \
- spin_unlock(&mm->page_table_lock); \
- __rc; \
-})
-#endif
-
-#else
-
-/*
- * No support for atomic operations on the page table.
- * Exchanging of pte values is done by first swapping zeros into
- * a pte and then putting new content into the pte entry.
- * However, these functions will generate an empty pte for a
- * short time frame. This means that the page_table_lock must be held
- * to avoid a page fault that would install a new entry.
- */
-
-/* Fall back to the page table lock to synchronize page table access */
-#define page_table_atomic_start(mm) spin_lock(&(mm)->page_table_lock)
-#define page_table_atomic_stop(mm) spin_unlock(&(mm)->page_table_lock)
-
-#ifndef __HAVE_ARCH_PTEP_XCHG
-#define ptep_xchg(__mm, __address, __ptep, __pteval) \
-({ \
- pte_t __pte = ptep_get_and_clear(__mm, __address, __ptep); \
- set_pte_at(__mm, __address, __ptep, __pteval); \
- __pte; \
-})
-#endif
-
-#ifndef __HAVE_ARCH_PTEP_XCHG_FLUSH
-#ifndef __HAVE_ARCH_PTEP_XCHG
-#define ptep_xchg_flush(__vma, __address, __ptep, __pteval) \
-({ \
- pte_t __pte = ptep_clear_flush(__vma, __address, __ptep); \
- set_pte_at((__vma)->vm_mm, __address, __ptep, __pteval); \
- __pte; \
-})
-#else
-#define ptep_xchg_flush(__vma, __address, __ptep, __pteval) \
-({ \
- pte_t __pte = ptep_xchg((__vma)->vm_mm, __address, __ptep, __pteval);\
- flush_tlb_page(__vma, __address); \
- __pte; \
-})
-#endif
-#endif
-
-/*
- * The fallback function for ptep_cmpxchg avoids any real use of cmpxchg
- * since cmpxchg may not be available on certain architectures. Instead
- * the clearing of a pte is used as a form of locking mechanism.
- * This approach will only work if the page_table_lock is held to insure
- * that the pte is not populated by a page fault generated on another
- * CPU.
- */
-#ifndef __HAVE_ARCH_PTEP_CMPXCHG
-#define ptep_cmpxchg(__mm, __address, __ptep, __old, __new) \
-({ \
- pte_t prev = ptep_get_and_clear(__mm, __address, __ptep); \
- int r = pte_val(prev) == pte_val(__old); \
- set_pte_at(__mm, __address, __ptep, r ? (__new) : prev); \
- r; \
-})
-#endif
-
-/*
- * Fallback functions for atomic population of higher page table
- * structures. These rely on the page_table_lock being held.
- */
-#ifndef __HAVE_ARCH_PGD_TEST_AND_POPULATE
-#define pgd_test_and_populate(__mm, __pgd, __pud) \
-({ \
- int __rc; \
- __rc = pgd_none(*(__pgd)); \
- if (__rc) pgd_populate(__mm, __pgd, __pud); \
- __rc; \
-})
-#endif
-
-#ifndef __HAVE_ARCH_PUD_TEST_AND_POPULATE
-#define pud_test_and_populate(__mm, __pud, __pmd) \
-({ \
- int __rc; \
- __rc = pud_none(*(__pud)); \
- if (__rc) pud_populate(__mm, __pud, __pmd); \
- __rc; \
-})
-#endif
-
-#ifndef __HAVE_ARCH_PMD_TEST_AND_POPULATE
-#define pmd_test_and_populate(__mm, __pmd, __page) \
-({ \
- int __rc; \
- __rc = !pmd_present(*(__pmd)); \
- if (__rc) pmd_populate(__mm, __pmd, __page); \
- __rc; \
-})
-#endif
-
-#endif
-
#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
#define ptep_set_wrprotect(__mm, __address, __ptep) \
({ \
--- 26136m1/include/asm-ia64/pgalloc.h 2005-08-19 14:30:12.000000000 +0100
+++ 26136m1-/include/asm-ia64/pgalloc.h 2005-08-20 16:44:38.000000000 +0100
@@ -1,10 +1,6 @@
#ifndef _ASM_IA64_PGALLOC_H
#define _ASM_IA64_PGALLOC_H
-/* Empty entries of PMD and PGD */
-#define PMD_NONE 0
-#define PUD_NONE 0
-
/*
* This file contains the functions and defines necessary to allocate
* page tables.
@@ -90,21 +86,6 @@ static inline void pgd_free(pgd_t * pgd)
pgtable_quicklist_free(pgd);
}
-/* Atomic populate */
-static inline int
-pud_test_and_populate (struct mm_struct *mm, pud_t *pud_entry, pmd_t *pmd)
-{
- return ia64_cmpxchg8_acq(pud_entry,__pa(pmd), PUD_NONE) == PUD_NONE;
-}
-
-/* Atomic populate */
-static inline int
-pmd_test_and_populate (struct mm_struct *mm, pmd_t *pmd_entry, struct page *pte)
-{
- return ia64_cmpxchg8_acq(pmd_entry, page_to_phys(pte), PMD_NONE) == PMD_NONE;
-}
-
-
static inline void
pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
{
--- 26136m1/include/asm-ia64/pgtable.h 2005-08-19 14:30:12.000000000 +0100
+++ 26136m1-/include/asm-ia64/pgtable.h 2005-08-20 16:44:38.000000000 +0100
@@ -565,8 +565,6 @@ do { \
#define __HAVE_ARCH_PTE_SAME
#define __HAVE_ARCH_PGD_OFFSET_GATE
#define __HAVE_ARCH_LAZY_MMU_PROT_UPDATE
-#define __HAVE_ARCH_PUD_TEST_AND_POPULATE
-#define __HAVE_ARCH_PMD_TEST_AND_POPULATE
#include <asm-generic/pgtable-nopud.h>
#include <asm-generic/pgtable.h>
--- 26136m1/include/linux/page-flags.h 2005-08-19 14:30:13.000000000 +0100
+++ 26136m1-/include/linux/page-flags.h 2005-08-20 16:44:38.000000000 +0100
@@ -132,12 +132,6 @@ struct page_state {
unsigned long pgrotated; /* pages rotated to tail of the LRU */
unsigned long nr_bounce; /* pages for bounce buffers */
- unsigned long spurious_page_faults; /* Faults with no ops */
- unsigned long cmpxchg_fail_flag_update; /* cmpxchg failures for pte flag update */
- unsigned long cmpxchg_fail_flag_reuse; /* cmpxchg failures when cow reuse of pte */
-
- unsigned long cmpxchg_fail_anon_read; /* cmpxchg failures on anonymous read */
- unsigned long cmpxchg_fail_anon_write; /* cmpxchg failures on anonymous write */
};
extern void get_page_state(struct page_state *ret);
--- 26136m1/include/linux/sched.h 2005-08-19 14:30:13.000000000 +0100
+++ 26136m1-/include/linux/sched.h 2005-08-20 16:44:38.000000000 +0100
@@ -227,43 +227,12 @@ arch_get_unmapped_area_topdown(struct fi
extern void arch_unmap_area(struct mm_struct *, unsigned long);
extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
-#ifdef CONFIG_ATOMIC_TABLE_OPS
-/*
- * No spinlock is held during atomic page table operations. The
- * counters are not protected anymore and must also be
- * incremented atomically.
-*/
-#ifdef ATOMIC64_INIT
-#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
-typedef atomic64_t mm_counter_t;
-#else
-/*
- * This may limit process memory to 2^31 * PAGE_SIZE which may be around 8TB
- * if using 4KB page size
- */
-#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
-typedef atomic_t mm_counter_t;
-#endif
-#else
-/*
- * No atomic page table operations. Counters are protected by
- * the page table lock
- */
#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
#define get_mm_counter(mm, member) ((mm)->_##member)
#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
#define inc_mm_counter(mm, member) (mm)->_##member++
#define dec_mm_counter(mm, member) (mm)->_##member--
typedef unsigned long mm_counter_t;
-#endif
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
--- 26136m1/mm/memory.c 2005-08-19 14:30:14.000000000 +0100
+++ 26136m1-/mm/memory.c 2005-08-20 16:54:41.000000000 +0100
@@ -36,8 +36,6 @@
* (Gerhard.Wichert@pdb.siemens.de)
*
* Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
- * Jan 2005 Scalability improvement by reducing the use and the length of time
- * the page table lock is held (Christoph Lameter)
*/
#include <linux/kernel_stat.h>
@@ -553,22 +551,16 @@ static void zap_pte_range(struct mmu_gat
page->index > details->last_index))
continue;
}
- if (unlikely(!page)) {
- ptent = ptep_get_and_clear_full(tlb->mm, addr,
+ ptent = ptep_get_and_clear_full(tlb->mm, addr,
pte, tlb->fullmm);
- tlb_remove_tlb_entry(tlb, pte, addr);
- continue;
- }
- if (unlikely(details) && details->nonlinear_vma &&
- linear_page_index(details->nonlinear_vma,
- addr) != page->index) {
- ptent = ptep_xchg(tlb->mm, addr, pte,
- pgoff_to_pte(page->index));
- } else {
- ptent = ptep_get_and_clear_full(tlb->mm, addr,
- pte, tlb->fullmm);
- }
tlb_remove_tlb_entry(tlb, pte, addr);
+ if (unlikely(!page))
+ continue;
+ if (unlikely(details) && details->nonlinear_vma
+ && linear_page_index(details->nonlinear_vma,
+ addr) != page->index)
+ set_pte_at(tlb->mm, addr, pte,
+ pgoff_to_pte(page->index));
if (pte_dirty(ptent))
set_page_dirty(page);
if (PageAnon(page))
@@ -982,7 +974,7 @@ int get_user_pages(struct task_struct *t
*/
if (ret & VM_FAULT_WRITE)
write_access = 0;
-
+
switch (ret & ~VM_FAULT_WRITE) {
case VM_FAULT_MINOR:
tsk->min_flt++;
@@ -1651,7 +1643,8 @@ void swapin_readahead(swp_entry_t entry,
}
/*
- * We hold the mm semaphore and have started atomic pte operations
+ * We hold the mm semaphore and the page_table_lock on entry and
+ * should release the pagetable lock on exit..
*/
static int do_swap_page(struct mm_struct * mm,
struct vm_area_struct * vma, unsigned long address,
@@ -1663,14 +1656,15 @@ static int do_swap_page(struct mm_struct
int ret = VM_FAULT_MINOR;
pte_unmap(page_table);
- page_table_atomic_stop(mm);
+ spin_unlock(&mm->page_table_lock);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
page = read_swap_cache_async(entry, vma, address);
if (!page) {
/*
- * Back out if somebody else faulted in this pte
+ * Back out if somebody else faulted in this pte while
+ * we released the page table lock.
*/
spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address);
@@ -1693,7 +1687,8 @@ static int do_swap_page(struct mm_struct
lock_page(page);
/*
- * Back out if somebody else faulted in this pte
+ * Back out if somebody else faulted in this pte while we
+ * released the page table lock.
*/
spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address);
@@ -1748,75 +1743,61 @@ out_nomap:
}
/*
- * We are called with atomic operations started and the
- * value of the pte that was read in orig_entry.
+ * We are called with the MM semaphore and page_table_lock
+ * spinlock held to protect against concurrent faults in
+ * multithreaded programs.
*/
static int
do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *page_table, pmd_t *pmd, int write_access,
- unsigned long addr, pte_t orig_entry)
+ unsigned long addr)
{
pte_t entry;
- struct page * page;
+ struct page * page = ZERO_PAGE(addr);
- if (unlikely(!write_access)) {
+ /* Read-only mapping of ZERO_PAGE. */
+ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
- /* Read-only mapping of ZERO_PAGE. */
- entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr),
- vma->vm_page_prot));
+ /* ..except if it's a write access */
+ if (write_access) {
+ /* Allocate our own private page. */
+ pte_unmap(page_table);
+ spin_unlock(&mm->page_table_lock);
- /*
- * If the cmpxchg fails then another cpu may
- * already have populated the entry
- */
- if (ptep_cmpxchg(mm, addr, page_table, orig_entry, entry)) {
- update_mmu_cache(vma, addr, entry);
- lazy_mmu_prot_update(entry);
- } else {
- inc_page_state(cmpxchg_fail_anon_read);
+ if (unlikely(anon_vma_prepare(vma)))
+ goto no_mem;
+ page = alloc_zeroed_user_highpage(vma, addr);
+ if (!page)
+ goto no_mem;
+
+ spin_lock(&mm->page_table_lock);
+ page_table = pte_offset_map(pmd, addr);
+
+ if (!pte_none(*page_table)) {
+ pte_unmap(page_table);
+ page_cache_release(page);
+ spin_unlock(&mm->page_table_lock);
+ goto out;
}
- goto minor_fault;
+ inc_mm_counter(mm, rss);
+ entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
+ vma->vm_page_prot)),
+ vma);
+ lru_cache_add_active(page);
+ SetPageReferenced(page);
+ page_add_anon_rmap(page, vma, addr);
}
- /* This leaves the write case */
- page_table_atomic_stop(mm);
+ set_pte_at(mm, addr, page_table, entry);
pte_unmap(page_table);
- if (unlikely(anon_vma_prepare(vma)))
- goto oom;
- page = alloc_zeroed_user_highpage(vma, addr);
- if (!page)
- goto oom;
-
- entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
- vma->vm_page_prot)),
- vma);
- page_table = pte_offset_map(pmd, addr);
- page_table_atomic_start(mm);
-
- if (!ptep_cmpxchg(mm, addr, page_table, orig_entry, entry)) {
- page_cache_release(page);
- inc_page_state(cmpxchg_fail_anon_write);
- goto minor_fault;
- }
-
- /*
- * These two functions must come after the cmpxchg
- * because if the page is on the LRU then try_to_unmap may come
- * in and unmap the pte.
- */
- page_add_anon_rmap(page, vma, addr);
- lru_cache_add_active(page);
- inc_mm_counter(mm, rss);
+ /* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
lazy_mmu_prot_update(entry);
-
-minor_fault:
- page_table_atomic_stop(mm);
- pte_unmap(page_table);
+ spin_unlock(&mm->page_table_lock);
+out:
return VM_FAULT_MINOR;
-
-oom:
+no_mem:
return VM_FAULT_OOM;
}
@@ -1829,12 +1810,12 @@ oom:
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
- * This is called with the MM semaphore held and atomic pte operations started.
+ * This is called with the MM semaphore held and the page table
+ * spinlock held. Exit with the spinlock released.
*/
static int
do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access, pte_t *page_table,
- pmd_t *pmd, pte_t orig_entry)
+ unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
{
struct page * new_page;
struct address_space *mapping = NULL;
@@ -1845,9 +1826,9 @@ do_no_page(struct mm_struct *mm, struct
if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, page_table,
- pmd, write_access, address, orig_entry);
+ pmd, write_access, address);
pte_unmap(page_table);
- page_table_atomic_stop(mm);
+ spin_unlock(&mm->page_table_lock);
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
@@ -1954,7 +1935,7 @@ oom:
* nonlinear vmas.
*/
static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
- unsigned long address, int write_access, pte_t *pte, pmd_t *pmd, pte_t entry)
+ unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
{
unsigned long pgoff;
int err;
@@ -1967,13 +1948,13 @@ static int do_file_page(struct mm_struct
if (!vma->vm_ops->populate ||
(write_access && !(vma->vm_flags & VM_SHARED))) {
pte_clear(mm, address, pte);
- return do_no_page(mm, vma, address, write_access, pte, pmd, entry);
+ return do_no_page(mm, vma, address, write_access, pte, pmd);
}
- pgoff = pte_to_pgoff(entry);
+ pgoff = pte_to_pgoff(*pte);
pte_unmap(pte);
- page_table_atomic_stop(mm);
+ spin_unlock(&mm->page_table_lock);
err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
if (err == -ENOMEM)
@@ -1992,80 +1973,49 @@ static int do_file_page(struct mm_struct
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * Note that kswapd only ever _removes_ pages, never adds them.
- * We exploit that case if possible to avoid taking the
- * page table lock.
-*/
+ * Note the "page_table_lock". It is to protect against kswapd removing
+ * pages from under us. Note that kswapd only ever _removes_ pages, never
+ * adds them. As such, once we have noticed that the page is not present,
+ * we can drop the lock early.
+ *
+ * The adding of pages is protected by the MM semaphore (which we hold),
+ * so we don't need to worry about a page being suddenly been added into
+ * our VM.
+ *
+ * We enter with the pagetable spinlock held, we are supposed to
+ * release it when done.
+ */
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct * vma, unsigned long address,
int write_access, pte_t *pte, pmd_t *pmd)
{
pte_t entry;
- pte_t new_entry;
entry = *pte;
if (!pte_present(entry)) {
/*
- * Pass the value of the pte to do_no_page and do_file_page
- * This value may be used to verify that the pte is still
- * not present allowing atomic insertion of ptes.
+ * If it truly wasn't present, we know that kswapd
+ * and the PTE updates will not touch it later. So
+ * drop the lock.
*/
if (pte_none(entry))
- return do_no_page(mm, vma, address, write_access,
- pte, pmd, entry);
+ return do_no_page(mm, vma, address, write_access, pte, pmd);
if (pte_file(entry))
- return do_file_page(mm, vma, address, write_access,
- pte, pmd, entry);
- return do_swap_page(mm, vma, address, pte, pmd,
- entry, write_access);
+ return do_file_page(mm, vma, address, write_access, pte, pmd);
+ return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
}
- new_entry = pte_mkyoung(entry);
if (write_access) {
- if (!pte_write(entry)) {
-#ifdef CONFIG_ATOMIC_TABLE_OPS
- /*
- * do_wp_page modifies a pte. We can add a pte without
- * the page_table_lock but not modify a pte since a
- * cmpxchg does not allow us to verify that the page
- * was not changed under us. So acquire the page table
- * lock.
- */
- spin_lock(&mm->page_table_lock);
- if (pte_same(entry, *pte))
- return do_wp_page(mm, vma, address, pte,
- pmd, entry);
- /*
- * pte was changed under us. Another processor may have
- * done what we needed to do.
- */
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_MINOR;
-#else
+ if (!pte_write(entry))
return do_wp_page(mm, vma, address, pte, pmd, entry);
-#endif
- }
- new_entry = pte_mkdirty(new_entry);
+ entry = pte_mkdirty(entry);
}
-
- /*
- * If the cmpxchg fails then another processor may have done
- * the changes for us. If not then another fault will bring
- * another chance to do this again.
- */
- if (ptep_cmpxchg(mm, address, pte, entry, new_entry)) {
- flush_tlb_page(vma, address);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
- } else {
- inc_page_state(cmpxchg_fail_flag_update);
- }
-
+ entry = pte_mkyoung(entry);
+ ptep_set_access_flags(vma, address, pte, entry, write_access);
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
pte_unmap(pte);
- page_table_atomic_stop(mm);
- if (pte_val(new_entry) == pte_val(entry))
- inc_page_state(spurious_page_faults);
+ spin_unlock(&mm->page_table_lock);
return VM_FAULT_MINOR;
}
@@ -2084,90 +2034,33 @@ int __handle_mm_fault(struct mm_struct *
inc_page_state(pgfault);
- if (unlikely(is_vm_hugetlb_page(vma)))
- goto sigbus; /* mapping truncation does this. */
+ if (is_vm_hugetlb_page(vma))
+ return VM_FAULT_SIGBUS; /* mapping truncation does this. */
/*
- * We try to rely on the mmap_sem and the SMP-safe atomic PTE updates.
- * to synchronize with kswapd. However, the arch may fall back
- * in page_table_atomic_start to the page table lock.
- *
- * We may be able to avoid taking and releasing the page_table_lock
- * for the p??_alloc functions through atomic operations so we
- * duplicate the functionality of pmd_alloc, pud_alloc and
- * pte_alloc_map here.
+ * We need the page table lock to synchronize with kswapd
+ * and the SMP-safe atomic PTE updates.
*/
- page_table_atomic_start(mm);
pgd = pgd_offset(mm, address);
- if (unlikely(pgd_none(*pgd))) {
-#ifdef __ARCH_HAS_4LEVEL_HACK
- /* The hack does not allow a clean fall back.
- * We need to insert a pmd entry into a pgd. pgd_test_and_populate is set
- * up to take a pmd entry. pud_none(pgd) == 0, therefore
- * the pud population branch will never be taken.
- */
- pmd_t *new;
-
- page_table_atomic_stop(mm);
- new = pmd_alloc_one(mm, address);
-#else
- pud_t *new;
-
- page_table_atomic_stop(mm);
- new = pud_alloc_one(mm, address);
-#endif
-
- if (!new)
- goto oom;
-
- page_table_atomic_start(mm);
- if (!pgd_test_and_populate(mm, pgd, new))
- pud_free(new);
- }
-
- pud = pud_offset(pgd, address);
- if (unlikely(pud_none(*pud))) {
- pmd_t *new;
-
- page_table_atomic_stop(mm);
- new = pmd_alloc_one(mm, address);
-
- if (!new)
- goto oom;
-
- page_table_atomic_start(mm);
-
- if (!pud_test_and_populate(mm, pud, new))
- pmd_free(new);
- }
-
- pmd = pmd_offset(pud, address);
- if (unlikely(!pmd_present(*pmd))) {
- struct page *new;
-
- page_table_atomic_stop(mm);
- new = pte_alloc_one(mm, address);
-
- if (!new)
- goto oom;
+ spin_lock(&mm->page_table_lock);
- page_table_atomic_start(mm);
+ pud = pud_alloc(mm, pgd, address);
+ if (!pud)
+ goto oom;
- if (!pmd_test_and_populate(mm, pmd, new))
- pte_free(new);
- else {
- inc_page_state(nr_page_table_pages);
- inc_mm_counter(mm, nr_ptes);
- }
- }
+ pmd = pmd_alloc(mm, pud, address);
+ if (!pmd)
+ goto oom;
- pte = pte_offset_map(pmd, address);
+ pte = pte_alloc_map(mm, pmd, address);
+ if (!pte)
+ goto oom;
+
return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
-oom:
- return VM_FAULT_OOM;
-sigbus:
- return VM_FAULT_SIGBUS;
+ oom:
+ spin_unlock(&mm->page_table_lock);
+ return VM_FAULT_OOM;
}
#ifndef __PAGETABLE_PUD_FOLDED
--- 26136m1/mm/mprotect.c 2005-08-19 14:30:14.000000000 +0100
+++ 26136m1-/mm/mprotect.c 2005-08-20 16:44:38.000000000 +0100
@@ -32,19 +32,17 @@ static void change_pte_range(struct mm_s
pte = pte_offset_map(pmd, addr);
do {
- pte_t ptent;
-redo:
- ptent = *pte;
- if (!pte_present(ptent))
- continue;
+ if (pte_present(*pte)) {
+ pte_t ptent;
- /* Deal with a potential SMP race with hardware/arch
- * interrupt updating dirty/clean bits through the use
- * of ptep_cmpxchg.
- */
- if (!ptep_cmpxchg(mm, addr, pte, ptent, pte_modify(ptent, newprot)))
- goto redo;
- lazy_mmu_prot_update(ptent);
+ /* Avoid an SMP race with hardware updated dirty/clean
+ * bits by wiping the pte and then setting the new pte
+ * into place.
+ */
+ ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
+ set_pte_at(mm, addr, pte, ptent);
+ lazy_mmu_prot_update(ptent);
+ }
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap(pte - 1);
}
--- 26136m1/mm/page_alloc.c 2005-08-19 14:30:14.000000000 +0100
+++ 26136m1-/mm/page_alloc.c 2005-08-20 16:44:38.000000000 +0100
@@ -2286,12 +2286,6 @@ static char *vmstat_text[] = {
"pgrotated",
"nr_bounce",
- "spurious_page_faults",
- "cmpxchg_fail_flag_update",
- "cmpxchg_fail_flag_reuse",
-
- "cmpxchg_fail_anon_read",
- "cmpxchg_fail_anon_write",
};
static void *vmstat_start(struct seq_file *m, loff_t *pos)
--- 26136m1/mm/rmap.c 2005-08-19 14:30:14.000000000 +0100
+++ 26136m1-/mm/rmap.c 2005-08-20 16:44:38.000000000 +0100
@@ -539,6 +539,11 @@ static int try_to_unmap_one(struct page
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
+ pteval = ptep_clear_flush(vma, address, pte);
+
+ /* Move the dirty bit to the physical page now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
if (PageAnon(page)) {
swp_entry_t entry = { .val = page->private };
@@ -553,15 +558,10 @@ static int try_to_unmap_one(struct page
list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
- pteval = ptep_xchg_flush(vma, address, pte, swp_entry_to_pte(entry));
+ set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
BUG_ON(pte_file(*pte));
dec_mm_counter(mm, anon_rss);
- } else
- pteval = ptep_clear_flush(vma, address, pte);
-
- /* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
- set_page_dirty(page);
+ }
dec_mm_counter(mm, rss);
page_remove_rmap(page);
@@ -653,15 +653,15 @@ static void try_to_unmap_cluster(unsigne
if (ptep_clear_flush_young(vma, address, pte))
continue;
+ /* Nuke the page table entry. */
flush_cache_page(vma, address, pfn);
+ pteval = ptep_clear_flush(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
- pteval = ptep_xchg_flush(vma, address, pte, pgoff_to_pte(page->index));
- else
- pteval = ptep_clear_flush(vma, address, pte);
+ set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
- /* Move the dirty bit to the physical page now that the pte is gone. */
+ /* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
set_page_dirty(page);
--
[RFT][PATCH 2/2] pagefault scalability alternative
From: Hugh Dickins @ 2005-08-22 21:31 UTC
To: Christoph Lameter; +Cc: Nick Piggin, Linus Torvalds, Andrew Morton, linux-mm
Then add Hugh's pagefault scalability alternative on top.
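
The crux is the pte_offset_map_lock()/pte_unmap_unlock() pairing added to include/linux/mm.h below: with CONFIG_SPLIT_PTLOCK the pte lock lives in the pte page's struct page (its ->private field), otherwise it falls back to mm->page_table_lock. A rough user-space sketch of that locking pattern follows - illustrative types and names only, using pthreads, not the real kernel helpers (which also handle kmap of highmem ptes):

/*
 * User-space sketch (not kernel code) of the pte_offset_map_lock() /
 * pte_unmap_unlock() pattern added below: pick the pte page's own lock
 * when the lock is split, else the single mm-wide lock, take it, operate
 * on the entry, then drop it.  All types and names here are illustrative.
 */
#include <pthread.h>
#include <stdio.h>

struct pte_page {                       /* stand-in for the struct page of a pte table */
        pthread_mutex_t ptl;            /* split lock lives with the page (page->private) */
        unsigned long pte[512];
};

struct mm {
        pthread_mutex_t page_table_lock;        /* the old single per-mm lock */
        struct pte_page *pmd;                   /* one "pmd slot" is enough for the sketch */
};

#define SPLIT_PTLOCK 1                  /* flip to 0 to model the unsplit fallback */

static unsigned long *pte_offset_map_lock(struct mm *mm, struct pte_page *page,
                                          unsigned long index, pthread_mutex_t **ptlp)
{
#if SPLIT_PTLOCK
        *ptlp = &page->ptl;             /* per-page-table lock */
#else
        *ptlp = &mm->page_table_lock;   /* whole-mm lock */
#endif
        pthread_mutex_lock(*ptlp);
        return &page->pte[index];
}

static void pte_unmap_unlock(pthread_mutex_t *ptl)
{
        pthread_mutex_unlock(ptl);
}

int main(void)
{
        struct pte_page page = { .ptl = PTHREAD_MUTEX_INITIALIZER };
        struct mm mm = { .page_table_lock = PTHREAD_MUTEX_INITIALIZER, .pmd = &page };
        pthread_mutex_t *ptl;
        unsigned long *pte;

        pte = pte_offset_map_lock(&mm, mm.pmd, 0, &ptl);
        *pte = 0x1234;                  /* install an entry under the right lock */
        pte_unmap_unlock(ptl);

        printf("pte[0] = %#lx\n", page.pte[0]);
        return 0;
}

With the lock kept in the pte page itself, threads faulting in different pmd ranges take different locks, which is where the scalability is meant to come from; choosing N for SPLIT_PTLOCK still gets the narrowed lock scope without the split.
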
--- 26136m1-/arch/i386/kernel/vm86.c 2005-08-19 14:30:02.000000000 +0100
+++ 26136m1+/arch/i386/kernel/vm86.c 2005-08-22 12:41:30.000000000 +0100
@@ -134,17 +134,16 @@ struct pt_regs * fastcall save_v86_state
return ret;
}
-static void mark_screen_rdonly(struct task_struct * tsk)
+static void mark_screen_rdonly(struct mm_struct *mm)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte, *mapped;
+ pte_t *pte;
+ spinlock_t *ptl;
int i;
- preempt_disable();
- spin_lock(&tsk->mm->page_table_lock);
- pgd = pgd_offset(tsk->mm, 0xA0000);
+ pgd = pgd_offset(mm, 0xA0000);
if (pgd_none_or_clear_bad(pgd))
goto out;
pud = pud_offset(pgd, 0xA0000);
@@ -153,16 +152,14 @@ static void mark_screen_rdonly(struct ta
pmd = pmd_offset(pud, 0xA0000);
if (pmd_none_or_clear_bad(pmd))
goto out;
- pte = mapped = pte_offset_map(pmd, 0xA0000);
+ pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
for (i = 0; i < 32; i++) {
if (pte_present(*pte))
set_pte(pte, pte_wrprotect(*pte));
pte++;
}
- pte_unmap(mapped);
+ pte_unmap_unlock(pte, ptl);
out:
- spin_unlock(&tsk->mm->page_table_lock);
- preempt_enable();
flush_tlb();
}
@@ -306,7 +303,7 @@ static void do_sys_vm86(struct kernel_vm
tsk->thread.screen_bitmap = info->screen_bitmap;
if (info->flags & VM86_SCREEN_BITMAP)
- mark_screen_rdonly(tsk);
+ mark_screen_rdonly(tsk->mm);
__asm__ __volatile__(
"xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
"movl %0,%%esp\n\t"
--- 26136m1-/arch/i386/mm/ioremap.c 2005-08-08 11:56:42.000000000 +0100
+++ 26136m1+/arch/i386/mm/ioremap.c 2005-08-22 12:41:30.000000000 +0100
@@ -28,7 +28,7 @@ static int ioremap_pte_range(pmd_t *pmd,
unsigned long pfn;
pfn = phys_addr >> PAGE_SHIFT;
- pte = pte_alloc_kernel(&init_mm, pmd, addr);
+ pte = pte_alloc_kernel(pmd, addr);
if (!pte)
return -ENOMEM;
do {
@@ -87,14 +87,12 @@ static int ioremap_page_range(unsigned l
flush_cache_all();
phys_addr -= addr;
pgd = pgd_offset_k(addr);
- spin_lock(&init_mm.page_table_lock);
do {
next = pgd_addr_end(addr, end);
err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags);
if (err)
break;
} while (pgd++, addr = next, addr != end);
- spin_unlock(&init_mm.page_table_lock);
flush_tlb_all();
return err;
}
--- 26136m1-/arch/i386/mm/pgtable.c 2005-08-19 14:30:02.000000000 +0100
+++ 26136m1+/arch/i386/mm/pgtable.c 2005-08-22 12:41:30.000000000 +0100
@@ -153,14 +153,15 @@ pte_t *pte_alloc_one_kernel(struct mm_st
struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
- struct page *pte;
+ struct page *page;
#ifdef CONFIG_HIGHPTE
- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+ page = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+ page = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
#endif
- return pte;
+ pte_lock_init(page);
+ return page;
}
void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
@@ -266,3 +267,45 @@ void pgd_free(pgd_t *pgd)
/* in the non-PAE case, free_pgtables() clears user pgd entries */
kmem_cache_free(pgd_cache, pgd);
}
+
+#ifdef CONFIG_HIGHPTE
+/*
+ * This is out-of-line here in order to get the header includes working.
+ * Perhaps we should add a linux/pgtable.h to get around that, though
+ * the problem is really with all that kmap_atomic needs to pull in.
+ */
+pte_t *pte_offset_map(pmd_t *pmd, unsigned long address)
+{
+ struct page *page = pmd_page(*pmd);
+ return (pte_t *)kmap_atomic(page, KM_PTE0) + pte_index(address);
+}
+#endif /* CONFIG_HIGHPTE */
+
+#if defined(CONFIG_SPLIT_PTLOCK) || defined(CONFIG_HIGHPTE)
+/*
+ * This is out-of-line here in order to get the header includes working,
+ * and avoid repeated evaluation of pmd_page when CONFIG_SPLIT_PTLOCK.
+ */
+pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address, spinlock_t **ptlp)
+{
+ struct page *page = pmd_page(*pmd);
+ spinlock_t *ptl;
+ pte_t *pte;
+
+#ifdef CONFIG_SPLIT_PTLOCK
+ ptl = __pte_lockptr(page);
+#else
+ ptl = &mm->page_table_lock;
+#endif
+ *ptlp = ptl;
+
+#ifdef CONFIG_HIGHPTE
+ pte = (pte_t *)kmap_atomic(page, KM_PTE0) + pte_index(address);
+#else
+ pte = (pte_t *)page_address(page) + pte_index(address);
+#endif
+ spin_lock(ptl);
+ return pte;
+}
+#endif /* CONFIG_SPLIT_PTLOCK || CONFIG_HIGHPTE */
--- 26136m1-/arch/i386/oprofile/backtrace.c 2005-08-08 11:56:42.000000000 +0100
+++ 26136m1+/arch/i386/oprofile/backtrace.c 2005-08-22 12:41:30.000000000 +0100
@@ -12,6 +12,7 @@
#include <linux/sched.h>
#include <linux/mm.h>
#include <asm/ptrace.h>
+#include <asm/uaccess.h>
struct frame_head {
struct frame_head * ebp;
@@ -21,26 +22,26 @@ struct frame_head {
static struct frame_head *
dump_backtrace(struct frame_head * head)
{
- oprofile_add_trace(head->ret);
+ struct frame_head khead[2];
- /* frame pointers should strictly progress back up the stack
- * (towards higher addresses) */
- if (head >= head->ebp)
+ /*
+ * Hugh: I've most probably got this wrong, but I believe
+ * it's along the right lines, and should be easily fixed -
+ * and don't forget to run sparse over it, thanks.
+ * As before, check beyond the frame_head too before
+ * accepting it, though I don't really get that logic.
+ */
+ if (__copy_from_user_inatomic(khead, head, sizeof(khead)))
return NULL;
- return head->ebp;
-}
+ oprofile_add_trace(khead[0].ret);
-/* check that the page(s) containing the frame head are present */
-static int pages_present(struct frame_head * head)
-{
- struct mm_struct * mm = current->mm;
-
- /* FIXME: only necessary once per page */
- if (!check_user_page_readable(mm, (unsigned long)head))
- return 0;
+ /* frame pointers should strictly progress back up the stack
+ * (towards higher addresses) */
+ if (head >= khead[0].ebp)
+ return NULL;
- return check_user_page_readable(mm, (unsigned long)(head + 1));
+ return khead[0].ebp;
}
/*
@@ -97,15 +98,6 @@ x86_backtrace(struct pt_regs * const reg
return;
}
-#ifdef CONFIG_SMP
- if (!spin_trylock(&current->mm->page_table_lock))
- return;
-#endif
-
- while (depth-- && head && pages_present(head))
+ while (depth-- && head)
head = dump_backtrace(head);
-
-#ifdef CONFIG_SMP
- spin_unlock(&current->mm->page_table_lock);
-#endif
}
--- 26136m1-/arch/ia64/mm/init.c 2005-08-08 11:56:43.000000000 +0100
+++ 26136m1+/arch/ia64/mm/init.c 2005-08-22 12:41:30.000000000 +0100
@@ -275,26 +275,21 @@ put_kernel_page (struct page *page, unsi
pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()! */
- spin_lock(&init_mm.page_table_lock);
{
pud = pud_alloc(&init_mm, pgd, address);
if (!pud)
goto out;
-
pmd = pmd_alloc(&init_mm, pud, address);
if (!pmd)
goto out;
- pte = pte_alloc_map(&init_mm, pmd, address);
+ pte = pte_alloc_kernel(pmd, address);
if (!pte)
goto out;
- if (!pte_none(*pte)) {
- pte_unmap(pte);
+ if (!pte_none(*pte))
goto out;
- }
set_pte(pte, mk_pte(page, pgprot));
- pte_unmap(pte);
}
- out: spin_unlock(&init_mm.page_table_lock);
+ out:
/* no need for flush_tlb */
return page;
}
--- 26136m1-/arch/x86_64/mm/ioremap.c 2005-08-08 11:56:50.000000000 +0100
+++ 26136m1+/arch/x86_64/mm/ioremap.c 2005-08-22 12:41:30.000000000 +0100
@@ -60,7 +60,7 @@ static inline int remap_area_pmd(pmd_t *
if (address >= end)
BUG();
do {
- pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address);
+ pte_t * pte = pte_alloc_kernel(pmd, address);
if (!pte)
return -ENOMEM;
remap_area_pte(pte, address, end - address, address + phys_addr, flags);
@@ -105,7 +105,6 @@ static int remap_area_pages(unsigned lon
flush_cache_all();
if (address >= end)
BUG();
- spin_lock(&init_mm.page_table_lock);
do {
pud_t *pud;
pud = pud_alloc(&init_mm, pgd, address);
@@ -119,7 +118,6 @@ static int remap_area_pages(unsigned lon
address = (address + PGDIR_SIZE) & PGDIR_MASK;
pgd++;
} while (address && (address < end));
- spin_unlock(&init_mm.page_table_lock);
flush_tlb_all();
return error;
}
--- 26136m1-/fs/exec.c 2005-08-19 14:30:09.000000000 +0100
+++ 26136m1+/fs/exec.c 2005-08-22 12:41:30.000000000 +0100
@@ -309,25 +309,24 @@ void install_arg_page(struct vm_area_str
pud_t * pud;
pmd_t * pmd;
pte_t * pte;
+ spinlock_t *ptl;
if (unlikely(anon_vma_prepare(vma)))
- goto out_sig;
+ goto out;
flush_dcache_page(page);
pgd = pgd_offset(mm, address);
-
- spin_lock(&mm->page_table_lock);
pud = pud_alloc(mm, pgd, address);
if (!pud)
goto out;
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
goto out;
- pte = pte_alloc_map(mm, pmd, address);
+ pte = pte_alloc_map_lock(mm, pmd, address, &ptl);
if (!pte)
goto out;
if (!pte_none(*pte)) {
- pte_unmap(pte);
+ pte_unmap_unlock(pte, ptl);
goto out;
}
inc_mm_counter(mm, rss);
@@ -335,14 +334,11 @@ void install_arg_page(struct vm_area_str
set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
page, vma->vm_page_prot))));
page_add_anon_rmap(page, vma, address);
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
/* no need for flush_tlb */
return;
out:
- spin_unlock(&mm->page_table_lock);
-out_sig:
__free_page(page);
force_sig(SIGKILL, current);
}
--- 26136m1-/fs/hugetlbfs/inode.c 2005-08-08 11:57:11.000000000 +0100
+++ 26136m1+/fs/hugetlbfs/inode.c 2005-08-22 12:41:30.000000000 +0100
@@ -92,7 +92,7 @@ out:
}
/*
- * Called under down_write(mmap_sem), page_table_lock is not held
+ * Called under down_write(mmap_sem)
*/
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
@@ -321,7 +321,7 @@ hugetlb_vmtruncate_list(struct prio_tree
v_length = vma->vm_end - vma->vm_start;
- zap_hugepage_range(vma,
+ unmap_hugepage_range(vma,
vma->vm_start + v_offset,
v_length - v_offset);
}
--- 26136m1-/fs/proc/task_mmu.c 2005-08-19 14:30:10.000000000 +0100
+++ 26136m1+/fs/proc/task_mmu.c 2005-08-22 12:41:30.000000000 +0100
@@ -186,10 +186,11 @@ static void smaps_pte_range(struct vm_ar
struct mem_size_stats *mss)
{
pte_t *pte, ptent;
+ spinlock_t *ptl;
unsigned long pfn;
struct page *page;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
ptent = *pte;
if (pte_none(ptent) || !pte_present(ptent))
@@ -213,8 +214,8 @@ static void smaps_pte_range(struct vm_ar
mss->private_clean += PAGE_SIZE;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
- cond_resched_lock(&vma->vm_mm->page_table_lock);
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
}
static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -272,13 +273,8 @@ static int show_smap(struct seq_file *m,
struct mem_size_stats mss;
memset(&mss, 0, sizeof mss);
-
- if (mm) {
- spin_lock(&mm->page_table_lock);
+ if (mm)
smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
- spin_unlock(&mm->page_table_lock);
- }
-
return show_map_internal(m, v, &mss);
}
@@ -407,9 +403,8 @@ static struct numa_maps *get_numa_maps(c
for_each_node(i)
md->node[i] =0;
- spin_lock(&mm->page_table_lock);
for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
- page = follow_page(mm, vaddr, 0);
+ page = follow_page(mm, vaddr, 0, 0);
if (page) {
int count = page_mapcount(page);
@@ -422,8 +417,8 @@ static struct numa_maps *get_numa_maps(c
md->anon++;
md->node[page_to_nid(page)]++;
}
+ cond_resched();
}
- spin_unlock(&mm->page_table_lock);
return md;
}
--- 26136m1-/include/asm-generic/tlb.h 2005-06-17 20:48:29.000000000 +0100
+++ 26136m1+/include/asm-generic/tlb.h 2005-08-22 12:41:30.000000000 +0100
@@ -135,10 +135,10 @@ static inline void tlb_remove_page(struc
__tlb_remove_tlb_entry(tlb, ptep, address); \
} while (0)
-#define pte_free_tlb(tlb, ptep) \
+#define pte_free_tlb(tlb, page) \
do { \
tlb->need_flush = 1; \
- __pte_free_tlb(tlb, ptep); \
+ __pte_free_tlb(tlb, page); \
} while (0)
#ifndef __ARCH_HAS_4LEVEL_HACK
--- 26136m1-/include/asm-i386/pgalloc.h 2005-06-17 20:48:29.000000000 +0100
+++ 26136m1+/include/asm-i386/pgalloc.h 2005-08-22 12:41:30.000000000 +0100
@@ -27,13 +27,16 @@ static inline void pte_free_kernel(pte_t
free_page((unsigned long)pte);
}
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct page *page)
{
- __free_page(pte);
+ pte_lock_deinit(page);
+ __free_page(page);
}
-
-#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
+#define __pte_free_tlb(tlb, page) do { \
+ pte_lock_deinit(page); \
+ tlb_remove_page((tlb), (page)); \
+} while (0)
#ifdef CONFIG_X86_PAE
/*
--- 26136m1-/include/asm-i386/pgtable.h 2005-08-19 14:30:12.000000000 +0100
+++ 26136m1+/include/asm-i386/pgtable.h 2005-08-22 12:41:30.000000000 +0100
@@ -202,7 +202,8 @@ extern unsigned long pg0[];
#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
-#define pmd_none(x) (!pmd_val(x))
+/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
+#define pmd_none(x) (!(unsigned long)pmd_val(x))
#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
@@ -401,9 +402,8 @@ extern pte_t *lookup_address(unsigned lo
extern void noexec_setup(const char *str);
-#if defined(CONFIG_HIGHPTE)
-#define pte_offset_map(dir, address) \
- ((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
+#ifdef CONFIG_HIGHPTE
+extern pte_t *pte_offset_map(pmd_t *pmd, unsigned long address);
#define pte_offset_map_nested(dir, address) \
((pte_t *)kmap_atomic(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
@@ -416,6 +416,12 @@ extern void noexec_setup(const char *str
#define pte_unmap_nested(pte) do { } while (0)
#endif
+#if defined(CONFIG_HIGHPTE) || defined(CONFIG_SPLIT_PTLOCK)
+#define __HAVE_PTE_OFFSET_MAP_LOCK
+extern pte_t *pte_offset_map_lock(struct mm_struct *mm,
+ pmd_t *pmd, unsigned long address, spinlock_t **ptlp);
+#endif
+
/*
* The i386 doesn't have any external MMU info: the kernel page
* tables contain all the necessary information.
--- 26136m1-/include/asm-ia64/pgalloc.h 2005-08-20 16:44:38.000000000 +0100
+++ 26136m1+/include/asm-ia64/pgalloc.h 2005-08-22 12:41:30.000000000 +0100
@@ -119,7 +119,9 @@ pmd_populate_kernel(struct mm_struct *mm
static inline struct page *pte_alloc_one(struct mm_struct *mm,
unsigned long addr)
{
- return virt_to_page(pgtable_quicklist_alloc());
+ struct page *page = virt_to_page(pgtable_quicklist_alloc());
+ pte_lock_init(page);
+ return page;
}
static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
@@ -128,17 +130,18 @@ static inline pte_t *pte_alloc_one_kerne
return pgtable_quicklist_alloc();
}
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct page *page)
{
- pgtable_quicklist_free(page_address(pte));
+ pte_lock_deinit(page);
+ pgtable_quicklist_free(page_address(page));
}
-static inline void pte_free_kernel(pte_t * pte)
+static inline void pte_free_kernel(pte_t *pte)
{
pgtable_quicklist_free(pte);
}
-#define __pte_free_tlb(tlb, pte) pte_free(pte)
+#define __pte_free_tlb(tlb, page) pte_free(page)
extern void check_pgt_cache(void);
--- 26136m1-/include/asm-x86_64/pgalloc.h 2005-06-17 20:48:29.000000000 +0100
+++ 26136m1+/include/asm-x86_64/pgalloc.h 2005-08-22 12:41:30.000000000 +0100
@@ -18,11 +18,6 @@ static inline void pmd_populate(struct m
set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
}
-extern __inline__ pmd_t *get_pmd(void)
-{
- return (pmd_t *)get_zeroed_page(GFP_KERNEL);
-}
-
extern __inline__ void pmd_free(pmd_t *pmd)
{
BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
@@ -77,10 +72,11 @@ static inline pte_t *pte_alloc_one_kerne
static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
- void *p = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
- if (!p)
+ struct page *page = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+ if (!page)
return NULL;
- return virt_to_page(p);
+ pte_lock_init(page);
+ return page;
}
/* Should really implement gc for free page table pages. This could be
@@ -89,15 +85,19 @@ static inline struct page *pte_alloc_one
extern __inline__ void pte_free_kernel(pte_t *pte)
{
BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
- free_page((unsigned long)pte);
+ free_page((unsigned long)pte);
}
-extern inline void pte_free(struct page *pte)
+extern inline void pte_free(struct page *page)
{
- __free_page(pte);
+ pte_lock_deinit(page);
+ __free_page(page);
}
-#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
+#define __pte_free_tlb(tlb, page) do { \
+ pte_lock_deinit(page); \
+ tlb_remove_page((tlb), (page)); \
+} while (0)
#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
--- 26136m1-/include/linux/hugetlb.h 2005-08-08 11:57:23.000000000 +0100
+++ 26136m1+/include/linux/hugetlb.h 2005-08-22 12:41:30.000000000 +0100
@@ -16,7 +16,6 @@ static inline int is_vm_hugetlb_page(str
int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
-void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
int hugetlb_report_meminfo(char *);
@@ -91,7 +90,6 @@ static inline unsigned long hugetlb_tota
#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
-#define zap_hugepage_range(vma, start, len) BUG()
#define unmap_hugepage_range(vma, start, end) BUG()
#define is_hugepage_mem_enough(size) 0
#define hugetlb_report_meminfo(buf) 0
--- 26136m1-/include/linux/mm.h 2005-08-19 14:30:13.000000000 +0100
+++ 26136m1+/include/linux/mm.h 2005-08-22 12:41:30.000000000 +0100
@@ -709,10 +709,6 @@ static inline void unmap_shared_mapping_
}
extern int vmtruncate(struct inode * inode, loff_t offset);
-extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
-extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
@@ -764,16 +760,15 @@ struct shrinker;
extern struct shrinker *set_shrinker(int, shrinker_t);
extern void remove_shrinker(struct shrinker *shrinker);
-/*
- * On a two-level or three-level page table, this ends up being trivial. Thus
- * the inlining and the symmetry break with pte_alloc_map() that does all
- * of this out-of-line.
- */
+pud_t *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
+pmd_t *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
+
/*
* The following ifdef needed to get the 4level-fixup.h header to work.
* Remove it when 4level-fixup.h has been removed.
*/
-#ifdef CONFIG_MMU
#ifndef __ARCH_HAS_4LEVEL_HACK
static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
@@ -788,8 +783,58 @@ static inline pmd_t *pmd_alloc(struct mm
return __pmd_alloc(mm, pud, address);
return pmd_offset(pud, address);
}
-#endif
-#endif /* CONFIG_MMU */
+#endif /* !__ARCH_HAS_4LEVEL_HACK */
+
+#ifdef CONFIG_SPLIT_PTLOCK
+#define __pte_lockptr(page) ((spinlock_t *)&((page)->private))
+#define pte_lock_init(page) spin_lock_init(__pte_lockptr(page))
+#define pte_lock_deinit(page) ((page)->mapping = NULL)
+#define pte_lockptr(mm, pmd) __pte_lockptr(pmd_page(*(pmd)))
+#else
+#define pte_lock_init(page) do {} while (0)
+#define pte_lock_deinit(page) do {} while (0)
+#define pte_lockptr(mm, pmd) (&(mm)->page_table_lock)
+#endif /* !CONFIG_SPLIT_PTLOCK */
+
+#ifndef __HAVE_PTE_OFFSET_MAP_LOCK
+static inline pte_t *pte_offset_map_lock(struct mm_struct *mm,
+ pmd_t *pmd, unsigned long address, spinlock_t **ptlp)
+{
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ pte_t *pte = pte_offset_map(pmd, address);
+ *ptlp = ptl;
+ spin_lock(ptl);
+ return pte;
+}
+#endif /* !__HAVE_PTE_OFFSET_MAP_LOCK */
+
+#define pte_unmap_unlock(pte, ptl) do { \
+ spin_unlock(ptl); \
+ pte_unmap(pte); \
+} while (0)
+
+static inline pte_t *pte_alloc_map(struct mm_struct *mm,
+ pmd_t *pmd, unsigned long address)
+{
+ if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, address) < 0)
+ return NULL;
+ return pte_offset_map(pmd, address);
+}
+
+static inline pte_t *pte_alloc_map_lock(struct mm_struct *mm,
+ pmd_t *pmd, unsigned long address, spinlock_t **ptlp)
+{
+ if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, address) < 0)
+ return NULL;
+ return pte_offset_map_lock(mm, pmd, address, ptlp);
+}
+
+static inline pte_t *pte_alloc_kernel(pmd_t *pmd, unsigned long address)
+{
+ if (!pmd_present(*pmd) && __pte_alloc_kernel(pmd, address) < 0)
+ return NULL;
+ return pte_offset_kernel(pmd, address);
+}
extern void free_area_init(unsigned long * zones_size);
extern void free_area_init_node(int nid, pg_data_t *pgdat,
@@ -839,6 +884,7 @@ extern int split_vma(struct mm_struct *,
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
struct rb_node **, struct rb_node *);
+extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
unsigned long addr, unsigned long len, pgoff_t pgoff);
extern void exit_mmap(struct mm_struct *);
@@ -929,8 +975,7 @@ extern struct vm_area_struct *find_exten
extern struct page * vmalloc_to_page(void *addr);
extern unsigned long vmalloc_to_pfn(void *addr);
extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
- int write);
-extern int check_user_page_readable(struct mm_struct *mm, unsigned long address);
+ int write, int acquire);
int remap_pfn_range(struct vm_area_struct *, unsigned long,
unsigned long, unsigned long, pgprot_t);
--- 26136m1-/include/linux/rmap.h 2005-08-08 11:57:24.000000000 +0100
+++ 26136m1+/include/linux/rmap.h 2005-08-22 12:41:30.000000000 +0100
@@ -95,7 +95,8 @@ int try_to_unmap(struct page *);
/*
* Called from mm/filemap_xip.c to unmap empty zero page
*/
-pte_t *page_check_address(struct page *, struct mm_struct *, unsigned long);
+pte_t *page_check_address(struct page *, struct mm_struct *,
+ unsigned long, spinlock_t **);
/*
--- 26136m1-/include/linux/sched.h 2005-08-20 16:44:38.000000000 +0100
+++ 26136m1+/include/linux/sched.h 2005-08-22 12:41:30.000000000 +0100
@@ -227,12 +227,42 @@ arch_get_unmapped_area_topdown(struct fi
extern void arch_unmap_area(struct mm_struct *, unsigned long);
extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
+#ifdef CONFIG_SPLIT_PTLOCK
+/*
+ * The mm counters are not protected by its page_table_lock,
+ * so must be incremented atomically - for now, revisit it later.
+ */
+#ifdef ATOMIC64_INIT
+#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
+typedef atomic64_t mm_counter_t;
+#else /* !ATOMIC64_INIT */
+/*
+ * This may limit process memory to 2^31 * PAGE_SIZE which may be around 8TB
+ * if using 4KB page size
+ */
+#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
+typedef atomic_t mm_counter_t;
+#endif /* !ATOMIC64_INIT */
+#else /* !CONFIG_SPLIT_PTLOCK */
+/*
+ * The mm counters are protected by its page_table_lock,
+ * so can be incremented directly.
+ */
#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
#define get_mm_counter(mm, member) ((mm)->_##member)
#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
#define inc_mm_counter(mm, member) (mm)->_##member++
#define dec_mm_counter(mm, member) (mm)->_##member--
typedef unsigned long mm_counter_t;
+#endif /* !CONFIG_SPLIT_PTLOCK */
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
--- 26136m1-/kernel/fork.c 2005-08-19 14:30:13.000000000 +0100
+++ 26136m1+/kernel/fork.c 2005-08-22 12:41:30.000000000 +0100
@@ -190,7 +190,8 @@ static inline int dup_mmap(struct mm_str
struct mempolicy *pol;
down_write(&oldmm->mmap_sem);
- flush_cache_mm(current->mm);
+ down_write(&mm->mmap_sem);
+ flush_cache_mm(oldmm);
mm->locked_vm = 0;
mm->mmap = NULL;
mm->mmap_cache = NULL;
@@ -205,7 +206,7 @@ static inline int dup_mmap(struct mm_str
rb_parent = NULL;
pprev = &mm->mmap;
- for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
+ for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -252,12 +253,8 @@ static inline int dup_mmap(struct mm_str
}
/*
- * Link in the new vma and copy the page table entries:
- * link in first so that swapoff can see swap entries.
- * Note that, exceptionally, here the vma is inserted
- * without holding mm->mmap_sem.
+ * Link in the new vma and copy the page table entries.
*/
- spin_lock(&mm->page_table_lock);
*pprev = tmp;
pprev = &tmp->vm_next;
@@ -266,8 +263,7 @@ static inline int dup_mmap(struct mm_str
rb_parent = &tmp->vm_rb;
mm->map_count++;
- retval = copy_page_range(mm, current->mm, tmp);
- spin_unlock(&mm->page_table_lock);
+ retval = copy_page_range(mm, oldmm, tmp);
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
@@ -278,7 +274,8 @@ static inline int dup_mmap(struct mm_str
retval = 0;
out:
- flush_tlb_mm(current->mm);
+ flush_tlb_mm(oldmm);
+ up_write(&mm->mmap_sem);
up_write(&oldmm->mmap_sem);
return retval;
fail_nomem_policy:
@@ -477,7 +474,7 @@ static int copy_mm(unsigned long clone_f
* allows optimizing out ipis; the tlb_gather_mmu code
* is an example.
*/
- spin_unlock_wait(&oldmm->page_table_lock);
+ spin_unlock_wait(&oldmm->page_table_lock); /* Hugh?? */
goto good_mm;
}
--- 26136m1-/kernel/futex.c 2005-06-17 20:48:29.000000000 +0100
+++ 26136m1+/kernel/futex.c 2005-08-22 12:41:30.000000000 +0100
@@ -204,15 +204,13 @@ static int get_futex_key(unsigned long u
/*
* Do a quick atomic lookup first - this is the fastpath.
*/
- spin_lock(&current->mm->page_table_lock);
- page = follow_page(mm, uaddr, 0);
+ page = follow_page(mm, uaddr, 0, 1);
if (likely(page != NULL)) {
key->shared.pgoff =
page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- spin_unlock(&current->mm->page_table_lock);
+ put_page(page);
return 0;
}
- spin_unlock(&current->mm->page_table_lock);
/*
* Do it the general way.
--- 26136m1-/mm/Kconfig 2005-08-19 14:30:14.000000000 +0100
+++ 26136m1+/mm/Kconfig 2005-08-22 12:41:30.000000000 +0100
@@ -111,3 +111,19 @@ config SPARSEMEM_STATIC
config SPARSEMEM_EXTREME
def_bool y
depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+config SPLIT_PTLOCK
+ bool "Finer-grained page table locking"
+ depends on SMP
+ default y
+ help
+ Heavily threaded applications might benefit from splitting
+ the mm page_table_lock, so that faults on different parts of
+ the user address space can be handled with less contention.
+
+ So far, only i386, ia64 and x86_64 architectures have been
+ converted: the other MMU architectures should fail to build.
+
+ For testing purposes, the patch defaults this option to Y.
+ To test for improvements which come from narrowing the scope
+ of the page_table_lock, without splitting it, choose N.
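
What the option selects shows up mainly through the helpers used all over
the rest of the patch: pte_lockptr(mm, pmd) names whichever lock guards
that page table page, pte_offset_map_lock()/pte_unmap_unlock() map and lock
(unlock and unmap) a pte in one step, and pte_alloc_map_lock() does the
same after allocating.  Their definitions are not in the hunks quoted here;
purely as a sketch of the idea, with an invented per-page field and not
this patch's actual code, the split case could look something like:

        /* Sketch only: with SPLIT_PTLOCK the lock lives with the pte page
         * itself, otherwise it is still the single mm-wide lock. */
        #ifdef CONFIG_SPLIT_PTLOCK
        #define pte_lockptr(mm, pmd)    (&pmd_page(*(pmd))->ptl)
        #else
        #define pte_lockptr(mm, pmd)    (&(mm)->page_table_lock)
        #endif

        #define pte_offset_map_lock(mm, pmd, addr, ptlp)        \
        ({                                                      \
                spinlock_t *__ptl = pte_lockptr(mm, pmd);       \
                pte_t *__pte = pte_offset_map(pmd, addr);       \
                *(ptlp) = __ptl;                                \
                spin_lock(__ptl);                               \
                __pte;                                          \
        })

Whether the lock is really split or stays mm-wide, every caller below is
written against the same interface, which is what makes the N choice in the
help text a useful comparison point.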
--- 26136m1-/mm/filemap_xip.c 2005-08-08 11:57:25.000000000 +0100
+++ 26136m1+/mm/filemap_xip.c 2005-08-22 12:41:30.000000000 +0100
@@ -172,8 +172,10 @@ __xip_unmap (struct address_space * mapp
struct mm_struct *mm;
struct prio_tree_iter iter;
unsigned long address;
+ pmd_t *pmd;
pte_t *pte;
pte_t pteval;
+ spinlock_t *ptl;
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -181,19 +183,13 @@ __xip_unmap (struct address_space * mapp
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- /*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
- */
- pte = page_check_address(ZERO_PAGE(address), mm,
- address);
- if (!IS_ERR(pte)) {
+ pte = page_check_address(ZERO_PAGE(address), mm, address, &ptl);
+ if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
BUG_ON(pte_dirty(pteval));
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
}
}
spin_unlock(&mapping->i_mmap_lock);
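
The new page_check_address() contract used here (and again in rmap.c
below): it returns NULL rather than an ERR_PTR when the page is not mapped
at that address, and on success it comes back with the pte mapped and its
lock held, handing the lock out through the extra spinlock_t ** argument.
Trimmed to the bare pattern that __xip_unmap and the rmap callers share
(this is a fragment of the code above, not anything new):

        spinlock_t *ptl;
        pte_t *pte;

        pte = page_check_address(page, mm, address, &ptl);
        if (!pte)
                return;         /* not mapped here (any more) */
        /* pte is mapped and locked: the mapping cannot change under us */
        /* ... nuke or inspect *pte ... */
        pte_unmap_unlock(pte, ptl);

The old "take mm->page_table_lock around the lookup" dance disappears from
every call site.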
--- 26136m1-/mm/fremap.c 2005-06-17 20:48:29.000000000 +0100
+++ 26136m1+/mm/fremap.c 2005-08-22 12:41:30.000000000 +0100
@@ -64,21 +64,18 @@ int install_page(struct mm_struct *mm, s
pud_t *pud;
pgd_t *pgd;
pte_t pte_val;
+ spinlock_t *ptl;
pgd = pgd_offset(mm, addr);
- spin_lock(&mm->page_table_lock);
-
pud = pud_alloc(mm, pgd, addr);
if (!pud)
- goto err_unlock;
-
+ goto err;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
- goto err_unlock;
-
- pte = pte_alloc_map(mm, pmd, addr);
+ goto err;
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
- goto err_unlock;
+ goto err;
/*
* This page may have been truncated. Tell the
@@ -87,27 +84,25 @@ int install_page(struct mm_struct *mm, s
err = -EINVAL;
inode = vma->vm_file->f_mapping->host;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (!page->mapping || page->index >= size)
- goto err_unlock;
+ if (!page->mapping || page->index >= size) {
+ pte_unmap_unlock(pte, ptl);
+ goto err;
+ }
zap_pte(mm, vma, addr, pte);
-
- inc_mm_counter(mm,rss);
+ inc_mm_counter(mm, rss);
flush_icache_page(vma, page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
page_add_file_rmap(page);
pte_val = *pte;
- pte_unmap(pte);
update_mmu_cache(vma, addr, pte_val);
-
+ pte_unmap_unlock(pte, ptl);
err = 0;
-err_unlock:
- spin_unlock(&mm->page_table_lock);
+err:
return err;
}
EXPORT_SYMBOL(install_page);
-
/*
* Install a file pte to a given virtual memory address, release any
* previously existing mapping.
@@ -121,37 +116,29 @@ int install_file_pte(struct mm_struct *m
pud_t *pud;
pgd_t *pgd;
pte_t pte_val;
+ spinlock_t *ptl;
pgd = pgd_offset(mm, addr);
- spin_lock(&mm->page_table_lock);
-
pud = pud_alloc(mm, pgd, addr);
if (!pud)
- goto err_unlock;
-
+ goto err;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
- goto err_unlock;
-
- pte = pte_alloc_map(mm, pmd, addr);
+ goto err;
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
- goto err_unlock;
+ goto err;
zap_pte(mm, vma, addr, pte);
-
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
pte_val = *pte;
- pte_unmap(pte);
update_mmu_cache(vma, addr, pte_val);
- spin_unlock(&mm->page_table_lock);
- return 0;
-
-err_unlock:
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
+ err = 0;
+err:
return err;
}
-
/***
* sys_remap_file_pages - remap arbitrary pages of a shared backing store
* file within an existing vma.
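
install_page() and install_file_pte() now share one shape: walk and
allocate down to the pmd with no spinlock held at all, then
pte_alloc_map_lock() maps the pte and takes the right pte lock in one step.
Condensed (again a fragment of the two functions above, not new code):

        pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                goto err;       /* nothing locked, nothing to undo */
        /* ... modify *pte, counters, rmap under ptl ... */
        pte_unmap_unlock(pte, ptl);

The err_unlock labels go away because every failure before the pte is
reached happens with no spinlock held.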
--- 26136m1-/mm/hugetlb.c 2005-08-19 14:30:14.000000000 +0100
+++ 26136m1+/mm/hugetlb.c 2005-08-22 12:41:30.000000000 +0100
@@ -268,6 +268,17 @@ static pte_t make_huge_pte(struct vm_are
return entry;
}
+static void add_huge_rss(struct mm_struct *mm, long nbytes)
+{
+ /*
+ * Take the page_table_lock here when updating mm_counter,
+ * though we won't need it in the case when it's an atomic.
+ */
+ spin_lock(&mm->page_table_lock);
+ add_mm_counter(mm, rss, nbytes >> PAGE_SHIFT);
+ spin_unlock(&mm->page_table_lock);
+}
+
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
@@ -276,6 +287,9 @@ int copy_hugetlb_page_range(struct mm_st
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
+ /* Assume we alloc them all because unmap will assume we did */
+ add_huge_rss(dst, end - addr);
+
while (addr < end) {
dst_pte = huge_pte_alloc(dst, addr);
if (!dst_pte)
@@ -285,7 +299,6 @@ int copy_hugetlb_page_range(struct mm_st
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
- add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(dst, addr, dst_pte, entry);
addr += HPAGE_SIZE;
}
@@ -323,20 +336,10 @@ void unmap_hugepage_range(struct vm_area
page = pte_page(pte);
put_page(page);
}
- add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
+ add_huge_rss(mm, start - end);
flush_tlb_range(vma, start, end);
}
-void zap_hugepage_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long length)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- spin_lock(&mm->page_table_lock);
- unmap_hugepage_range(vma, start, start + length);
- spin_unlock(&mm->page_table_lock);
-}
-
int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
{
struct mm_struct *mm = current->mm;
@@ -349,7 +352,9 @@ int hugetlb_prefault(struct address_spac
hugetlb_prefault_arch_hook(mm);
- spin_lock(&mm->page_table_lock);
+ /* Assume we alloc them all because unmap will assume we did */
+ add_huge_rss(mm, vma->vm_end - vma->vm_start);
+
for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
unsigned long idx;
pte_t *pte = huge_pte_alloc(mm, addr);
@@ -386,11 +391,9 @@ int hugetlb_prefault(struct address_spac
goto out;
}
}
- add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
}
out:
- spin_unlock(&mm->page_table_lock);
return ret;
}
--- 26136m1-/mm/memory.c 2005-08-20 16:54:41.000000000 +0100
+++ 26136m1+/mm/memory.c 2005-08-22 12:41:30.000000000 +0100
@@ -260,6 +260,12 @@ void free_pgtables(struct mmu_gather **t
struct vm_area_struct *next = vma->vm_next;
unsigned long addr = vma->vm_start;
+ /*
+ * Make vma invisible to rmap before freeing pgtables.
+ */
+ anon_vma_unlink(vma);
+ unlink_file_vma(vma);
+
if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
@@ -272,6 +278,8 @@ void free_pgtables(struct mmu_gather **t
HPAGE_SIZE)) {
vma = next;
next = vma->vm_next;
+ anon_vma_unlink(vma);
+ unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
@@ -280,65 +288,46 @@ void free_pgtables(struct mmu_gather **t
}
}
-pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
- unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
- if (!pmd_present(*pmd)) {
- struct page *new;
+ struct page *new = pte_alloc_one(mm, address);
+ if (!new)
+ return -ENOMEM;
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
- return NULL;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pmd_present(*pmd)) {
- pte_free(new);
- goto out;
- }
- inc_mm_counter(mm, nr_ptes);
- inc_page_state(nr_page_table_pages);
- pmd_populate(mm, pmd, new);
+ spin_lock(&mm->page_table_lock);
+ if (pmd_present(*pmd)) {
+ pte_free(new);
+ goto out;
}
+ inc_mm_counter(mm, nr_ptes);
+ inc_page_state(nr_page_table_pages);
+ pmd_populate(mm, pmd, new);
out:
- return pte_offset_map(pmd, address);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
}
-pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
- if (!pmd_present(*pmd)) {
- pte_t *new;
-
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one_kernel(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
- return NULL;
+ pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+ if (!new)
+ return -ENOMEM;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pmd_present(*pmd)) {
- pte_free_kernel(new);
- goto out;
- }
- pmd_populate_kernel(mm, pmd, new);
+ spin_lock(&init_mm.page_table_lock);
+ if (pmd_present(*pmd)) {
+ pte_free_kernel(new);
+ goto out;
}
+ pmd_populate_kernel(&init_mm, pmd, new);
out:
- return pte_offset_kernel(pmd, address);
+ spin_unlock(&init_mm.page_table_lock);
+ return 0;
}
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
- *
- * dst->page_table_lock is held on entry and exit,
- * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
*/
static inline void
@@ -357,7 +346,8 @@ copy_one_pte(struct mm_struct *dst_mm, s
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
spin_lock(&mmlist_lock);
- list_add(&dst_mm->mmlist, &src_mm->mmlist);
+ if (list_empty(&dst_mm->mmlist))
+ list_add(&dst_mm->mmlist, &src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
}
@@ -409,26 +399,30 @@ static int copy_pte_range(struct mm_stru
unsigned long addr, unsigned long end)
{
pte_t *src_pte, *dst_pte;
+ spinlock_t *src_ptl, *dst_ptl;
unsigned long vm_flags = vma->vm_flags;
- int progress;
+ int progress = 0;
again:
- dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
+ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
src_pte = pte_offset_map_nested(src_pmd, addr);
+ src_ptl = pte_lockptr(src_mm, src_pmd);
+ spin_lock(src_ptl);
- progress = 0;
- spin_lock(&src_mm->page_table_lock);
do {
/*
* We are holding two locks at this point - either of them
* could generate latencies in another task on another CPU.
*/
- if (progress >= 32 && (need_resched() ||
- need_lockbreak(&src_mm->page_table_lock) ||
- need_lockbreak(&dst_mm->page_table_lock)))
- break;
+ if (progress >= 32) {
+ progress = 0;
+ if (need_resched() ||
+ need_lockbreak(src_ptl) ||
+ need_lockbreak(dst_ptl))
+ break;
+ }
if (pte_none(*src_pte)) {
progress++;
continue;
@@ -436,11 +430,11 @@ again:
copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr);
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
- spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(src_ptl);
pte_unmap_nested(src_pte - 1);
- pte_unmap(dst_pte - 1);
- cond_resched_lock(&dst_mm->page_table_lock);
+ pte_unmap_unlock(dst_pte - 1, dst_ptl);
+ cond_resched();
if (addr != end)
goto again;
return 0;
@@ -519,8 +513,9 @@ static void zap_pte_range(struct mmu_gat
struct zap_details *details)
{
pte_t *pte;
+ spinlock_t *ptl;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(tlb->mm, pmd, addr, &ptl);
do {
pte_t ptent = *pte;
if (pte_none(ptent))
@@ -582,7 +577,7 @@ static void zap_pte_range(struct mmu_gat
free_swap_and_cache(pte_to_swp_entry(ptent));
pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
}
static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -658,10 +653,10 @@ static void unmap_page_range(struct mmu_
*
* Returns the end address of the unmapping (restart addr if interrupted).
*
- * Unmap all pages in the vma list. Called under page_table_lock.
+ * Unmap all pages in the vma list.
*
- * We aim to not hold page_table_lock for too long (for scheduling latency
- * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
+ * We aim to not hold locks for too long (for scheduling latency reasons).
+ * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
* return the ending mmu_gather to the caller.
*
* Only addresses between `start' and `end' will be unmapped.
@@ -723,16 +718,15 @@ unsigned long unmap_vmas(struct mmu_gath
tlb_finish_mmu(*tlbp, tlb_start, start);
if (need_resched() ||
- need_lockbreak(&mm->page_table_lock) ||
(i_mmap_lock && need_lockbreak(i_mmap_lock))) {
if (i_mmap_lock) {
/* must reset count of rss freed */
*tlbp = tlb_gather_mmu(mm, fullmm);
goto out;
}
- spin_unlock(&mm->page_table_lock);
+ preempt_enable();
cond_resched();
- spin_lock(&mm->page_table_lock);
+ preempt_disable();
}
*tlbp = tlb_gather_mmu(mm, fullmm);
@@ -759,37 +753,36 @@ unsigned long zap_page_range(struct vm_a
unsigned long end = address + size;
unsigned long nr_accounted = 0;
- if (is_vm_hugetlb_page(vma)) {
- zap_hugepage_range(vma, address, size);
- return end;
- }
-
lru_add_drain();
- spin_lock(&mm->page_table_lock);
+ preempt_disable();
tlb = tlb_gather_mmu(mm, 0);
end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
tlb_finish_mmu(tlb, address, end);
- spin_unlock(&mm->page_table_lock);
+ preempt_enable();
return end;
}
/*
* Do a quick page-table lookup for a single page.
- * mm->page_table_lock must be held.
*/
-static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
- int read, int write, int accessed)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+ int write, int acquire)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
+ spinlock_t *ptl;
unsigned long pfn;
struct page *page;
page = follow_huge_addr(mm, address, write);
- if (! IS_ERR(page))
- return page;
+ if (!IS_ERR(page)) {
+ if (acquire && !PageReserved(page))
+ page_cache_get(page);
+ goto out;
+ }
+ page = NULL;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
@@ -802,51 +795,37 @@ static struct page *__follow_page(struct
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
- if (pmd_huge(*pmd))
- return follow_huge_pmd(mm, address, pmd, write);
- ptep = pte_offset_map(pmd, address);
+ if (pmd_huge(*pmd)) {
+ page = follow_huge_pmd(mm, address, pmd, write);
+ if (page && acquire && !PageReserved(page))
+ page_cache_get(page);
+ goto out;
+ }
+
+ ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!ptep)
goto out;
pte = *ptep;
- pte_unmap(ptep);
if (pte_present(pte)) {
if (write && !pte_write(pte))
- goto out;
- if (read && !pte_read(pte))
- goto out;
+ goto unlock;
pfn = pte_pfn(pte);
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
- if (accessed) {
- if (write && !pte_dirty(pte) &&!PageDirty(page))
- set_page_dirty(page);
- mark_page_accessed(page);
- }
- return page;
+ if (write && !pte_dirty(pte) &&!PageDirty(page))
+ set_page_dirty(page);
+ if (acquire && !PageReserved(page))
+ page_cache_get(page);
+ mark_page_accessed(page);
}
}
-
+unlock:
+ pte_unmap_unlock(ptep, ptl);
out:
- return NULL;
-}
-
-inline struct page *
-follow_page(struct mm_struct *mm, unsigned long address, int write)
-{
- return __follow_page(mm, address, 0, write, 1);
-}
-
-/*
- * check_user_page_readable() can be called from interrupt context by oprofile,
- * so we need to avoid taking any non-irq-safe locks
- */
-int check_user_page_readable(struct mm_struct *mm, unsigned long address)
-{
- return __follow_page(mm, address, 1, 0, 0) != NULL;
+ return page;
}
-EXPORT_SYMBOL(check_user_page_readable);
static inline int
untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
@@ -943,13 +922,12 @@ int get_user_pages(struct task_struct *t
&start, &len, i);
continue;
}
- spin_lock(&mm->page_table_lock);
do {
int write_access = write;
struct page *page;
- cond_resched_lock(&mm->page_table_lock);
- while (!(page = follow_page(mm, start, write_access))) {
+ cond_resched();
+ while (!(page = follow_page(mm, start, write_access, !!pages))) {
int ret;
/*
@@ -963,7 +941,6 @@ int get_user_pages(struct task_struct *t
page = ZERO_PAGE(start);
break;
}
- spin_unlock(&mm->page_table_lock);
ret = __handle_mm_fault(mm, vma, start, write_access);
/*
@@ -989,13 +966,10 @@ int get_user_pages(struct task_struct *t
default:
BUG();
}
- spin_lock(&mm->page_table_lock);
}
if (pages) {
pages[i] = page;
flush_dcache_page(page);
- if (!PageReserved(page))
- page_cache_get(page);
}
if (vmas)
vmas[i] = vma;
@@ -1003,7 +977,6 @@ int get_user_pages(struct task_struct *t
start += PAGE_SIZE;
len--;
} while (len && start < vma->vm_end);
- spin_unlock(&mm->page_table_lock);
} while (len);
return i;
}
@@ -1013,8 +986,9 @@ static int zeromap_pte_range(struct mm_s
unsigned long addr, unsigned long end, pgprot_t prot)
{
pte_t *pte;
+ spinlock_t *ptl;
- pte = pte_alloc_map(mm, pmd, addr);
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
do {
@@ -1022,7 +996,7 @@ static int zeromap_pte_range(struct mm_s
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, zero_pte);
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
return 0;
}
@@ -1072,14 +1046,12 @@ int zeromap_page_range(struct vm_area_st
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
err = zeromap_pud_range(mm, pgd, addr, next, prot);
if (err)
break;
} while (pgd++, addr = next, addr != end);
- spin_unlock(&mm->page_table_lock);
return err;
}
@@ -1093,8 +1065,9 @@ static int remap_pte_range(struct mm_str
unsigned long pfn, pgprot_t prot)
{
pte_t *pte;
+ spinlock_t *ptl;
- pte = pte_alloc_map(mm, pmd, addr);
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
do {
@@ -1103,7 +1076,7 @@ static int remap_pte_range(struct mm_str
set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
return 0;
}
@@ -1171,7 +1144,6 @@ int remap_pfn_range(struct vm_area_struc
pfn -= addr >> PAGE_SHIFT;
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
err = remap_pud_range(mm, pgd, addr, next,
@@ -1179,7 +1151,6 @@ int remap_pfn_range(struct vm_area_struc
if (err)
break;
} while (pgd++, addr = next, addr != end);
- spin_unlock(&mm->page_table_lock);
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -1198,15 +1169,15 @@ static inline pte_t maybe_mkwrite(pte_t
}
/*
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
+ * We hold the mm semaphore for reading and the pte_lock.
*/
-static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
- pte_t *page_table)
+static inline void break_cow(struct vm_area_struct *vma,
+ struct page *new_page, unsigned long address, pte_t *page_table)
{
pte_t entry;
- entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
- vma);
+ entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
ptep_establish(vma, address, page_table, entry);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
@@ -1217,9 +1188,6 @@ static inline void break_cow(struct vm_a
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
*
- * Goto-purists beware: the only reason for goto's here is that it results
- * in better assembly code.. The "default" path will see no jumps at all.
- *
* Note that this routine assumes that the protection checks have been
* done by the caller (the low-level page fault routine in most cases).
* Thus we can safely just mark it writable once we've done any necessary
@@ -1229,16 +1197,18 @@ static inline void break_cow(struct vm_a
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*
- * We hold the mm semaphore and the page_table_lock on entry and exit
- * with the page_table_lock released.
- */
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte)
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ spinlock_t *ptl, pte_t orig_pte)
{
struct page *old_page, *new_page;
- unsigned long pfn = pte_pfn(pte);
+ unsigned long pfn = pte_pfn(orig_pte);
pte_t entry;
- int ret;
+ int ret = VM_FAULT_MINOR;
if (unlikely(!pfn_valid(pfn))) {
/*
@@ -1246,11 +1216,10 @@ static int do_wp_page(struct mm_struct *
* at least the kernel stops what it's doing before it corrupts
* data, but for the moment just pretend this is OOM.
*/
- pte_unmap(page_table);
printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
address);
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_OOM;
+ ret = VM_FAULT_OOM;
+ goto unlock;
}
old_page = pfn_to_page(pfn);
@@ -1259,24 +1228,22 @@ static int do_wp_page(struct mm_struct *
unlock_page(old_page);
if (reuse) {
flush_cache_page(vma, address, pfn);
- entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)),
- vma);
+ entry = pte_mkyoung(pte_mkdirty(orig_pte));
+ entry = maybe_mkwrite(entry, vma);
ptep_set_access_flags(vma, address, page_table, entry, 1);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
- return VM_FAULT_MINOR|VM_FAULT_WRITE;
+ ret |= VM_FAULT_WRITE;
+ goto unlock;
}
}
- pte_unmap(page_table);
/*
* Ok, we need to copy. Oh, well..
*/
if (!PageReserved(old_page))
page_cache_get(old_page);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(page_table, ptl);
if (unlikely(anon_vma_prepare(vma)))
goto no_new_page;
@@ -1290,13 +1257,12 @@ static int do_wp_page(struct mm_struct *
goto no_new_page;
copy_user_highpage(new_page, old_page, address);
}
+
/*
* Re-check the pte - we dropped the lock
*/
- ret = VM_FAULT_MINOR;
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, address);
- if (likely(pte_same(*page_table, pte))) {
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (likely(pte_same(*page_table, orig_pte))) {
if (PageAnon(old_page))
dec_mm_counter(mm, anon_rss);
if (PageReserved(old_page))
@@ -1312,10 +1278,10 @@ static int do_wp_page(struct mm_struct *
new_page = old_page;
ret |= VM_FAULT_WRITE;
}
- pte_unmap(page_table);
page_cache_release(new_page);
page_cache_release(old_page);
- spin_unlock(&mm->page_table_lock);
+unlock:
+ pte_unmap_unlock(page_table, ptl);
return ret;
no_new_page:
@@ -1388,13 +1354,6 @@ again:
restart_addr = zap_page_range(vma, start_addr,
end_addr - start_addr, details);
-
- /*
- * We cannot rely on the break test in unmap_vmas:
- * on the one hand, we don't want to restart our loop
- * just because that broke out for the page_table_lock;
- * on the other hand, it does no test when vma is small.
- */
need_break = need_resched() ||
need_lockbreak(details->i_mmap_lock);
@@ -1643,38 +1602,43 @@ void swapin_readahead(swp_entry_t entry,
}
/*
- * We hold the mm semaphore and the page_table_lock on entry and
- * should release the pagetable lock on exit..
- */
-static int do_swap_page(struct mm_struct * mm,
- struct vm_area_struct * vma, unsigned long address,
- pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access)
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access, pte_t orig_pte)
{
+ spinlock_t *ptl;
struct page *page;
- swp_entry_t entry = pte_to_swp_entry(orig_pte);
+ swp_entry_t entry;
pte_t pte;
int ret = VM_FAULT_MINOR;
+ if (sizeof(pte_t) > sizeof(unsigned long)) {
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (unlikely(!pte_same(*page_table, orig_pte)))
+ goto unlock;
+ spin_unlock(ptl);
+ }
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+
+ entry = pte_to_swp_entry(orig_pte);
page = lookup_swap_cache(entry);
if (!page) {
swapin_readahead(entry, address, vma);
page = read_swap_cache_async(entry, vma, address);
if (!page) {
/*
- * Back out if somebody else faulted in this pte while
- * we released the page table lock.
+ * Back out if somebody else faulted in this pte
+ * while we released the pte_lock.
*/
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, address);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte)))
ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_MINOR;
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
- goto out;
+ goto unlock;
}
/* Had to read the page from swap area: Major fault */
@@ -1688,14 +1652,11 @@ static int do_swap_page(struct mm_struct
/*
* Back out if somebody else faulted in this pte while we
- * released the page table lock.
+ * released the pte_lock.
*/
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, address);
- if (unlikely(!pte_same(*page_table, orig_pte))) {
- ret = VM_FAULT_MINOR;
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (unlikely(!pte_same(*page_table, orig_pte)))
goto out_nomap;
- }
if (unlikely(!PageUptodate(page))) {
ret = VM_FAULT_SIGBUS;
@@ -1722,7 +1683,7 @@ static int do_swap_page(struct mm_struct
if (write_access) {
if (do_wp_page(mm, vma, address,
- page_table, pmd, pte) == VM_FAULT_OOM)
+ page_table, pmd, ptl, pte) == VM_FAULT_OOM)
ret = VM_FAULT_OOM;
goto out;
}
@@ -1730,72 +1691,70 @@ static int do_swap_page(struct mm_struct
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
lazy_mmu_prot_update(pte);
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+unlock:
+ pte_unmap_unlock(page_table, ptl);
out:
return ret;
+
out_nomap:
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(page_table, ptl);
unlock_page(page);
page_cache_release(page);
- goto out;
+ return ret;
}
/*
- * We are called with the MM semaphore and page_table_lock
- * spinlock held to protect against concurrent faults in
- * multithreaded programs.
- */
-static int
-do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
- pte_t *page_table, pmd_t *pmd, int write_access,
- unsigned long addr)
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access)
{
+ spinlock_t *ptl;
pte_t entry;
- struct page * page = ZERO_PAGE(addr);
-
- /* Read-only mapping of ZERO_PAGE. */
- entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
- /* ..except if it's a write access */
if (write_access) {
+ struct page *page;
+
/* Allocate our own private page. */
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
if (unlikely(anon_vma_prepare(vma)))
goto no_mem;
- page = alloc_zeroed_user_highpage(vma, addr);
+ page = alloc_zeroed_user_highpage(vma, address);
if (!page)
goto no_mem;
- spin_lock(&mm->page_table_lock);
- page_table = pte_offset_map(pmd, addr);
-
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table)) {
- pte_unmap(page_table);
page_cache_release(page);
- spin_unlock(&mm->page_table_lock);
goto out;
}
inc_mm_counter(mm, rss);
- entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
- vma->vm_page_prot)),
- vma);
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
lru_cache_add_active(page);
SetPageReferenced(page);
- page_add_anon_rmap(page, vma, addr);
+ page_add_anon_rmap(page, vma, address);
+ } else {
+ /* Read-only mapping of ZERO_PAGE. */
+ entry = mk_pte(ZERO_PAGE(address), vma->vm_page_prot);
+ entry = pte_wrprotect(entry);
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (!pte_none(*page_table))
+ goto out;
}
- set_pte_at(mm, addr, page_table, entry);
- pte_unmap(page_table);
+ set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, addr, entry);
+ update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- spin_unlock(&mm->page_table_lock);
out:
+ pte_unmap_unlock(page_table, ptl);
return VM_FAULT_MINOR;
no_mem:
return VM_FAULT_OOM;
@@ -1810,25 +1769,23 @@ no_mem:
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
- * This is called with the MM semaphore held and the page table
- * spinlock held. Exit with the spinlock released.
- */
-static int
-do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access)
{
- struct page * new_page;
+ spinlock_t *ptl;
+ struct page *new_page;
struct address_space *mapping = NULL;
pte_t entry;
unsigned int sequence = 0;
int ret = VM_FAULT_MINOR;
int anon = 0;
- if (!vma->vm_ops || !vma->vm_ops->nopage)
- return do_anonymous_page(mm, vma, page_table,
- pmd, write_access, address);
pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
@@ -1836,7 +1793,6 @@ do_no_page(struct mm_struct *mm, struct
smp_rmb(); /* serializes i_size against truncate_count */
}
retry:
- cond_resched();
new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
/*
* No smp_rmb is needed here as long as there's a full
@@ -1869,19 +1825,20 @@ retry:
anon = 1;
}
- spin_lock(&mm->page_table_lock);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
* For a file-backed vma, someone could have truncated or otherwise
* invalidated this page. If unmap_mapping_range got called,
* retry getting the page.
*/
if (mapping && unlikely(sequence != mapping->truncate_count)) {
- sequence = mapping->truncate_count;
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(page_table, ptl);
page_cache_release(new_page);
+ cond_resched();
+ sequence = mapping->truncate_count;
+ smp_rmb();
goto retry;
}
- page_table = pte_offset_map(pmd, address);
/*
* This silly early PAGE_DIRTY setting removes a race
@@ -1908,55 +1865,55 @@ retry:
page_add_anon_rmap(new_page, vma, address);
} else
page_add_file_rmap(new_page);
- pte_unmap(page_table);
} else {
/* One of our sibling threads was faster, back out. */
- pte_unmap(page_table);
page_cache_release(new_page);
- spin_unlock(&mm->page_table_lock);
goto out;
}
/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- spin_unlock(&mm->page_table_lock);
out:
+ pte_unmap_unlock(page_table, ptl);
return ret;
oom:
page_cache_release(new_page);
- ret = VM_FAULT_OOM;
- goto out;
+ return VM_FAULT_OOM;
}
/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
- */
-static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
- unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ int write_access, pte_t orig_pte)
{
- unsigned long pgoff;
+ pgoff_t pgoff;
int err;
- BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
- /*
- * Fall back to the linear mapping if the fs does not support
- * ->populate:
- */
- if (!vma->vm_ops->populate ||
- (write_access && !(vma->vm_flags & VM_SHARED))) {
- pte_clear(mm, address, pte);
- return do_no_page(mm, vma, address, write_access, pte, pmd);
- }
-
- pgoff = pte_to_pgoff(*pte);
+ if (sizeof(pte_t) > sizeof(unsigned long)) {
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ err = !pte_same(*page_table, orig_pte);
+ pte_unmap_unlock(page_table, ptl);
+ if (err)
+ return VM_FAULT_MINOR;
+ } else
+ pte_unmap(page_table);
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ BUG_ON(!vma->vm_ops || !vma->vm_ops->populate);
+ BUG_ON(!(vma->vm_flags & VM_SHARED));
- err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
+ pgoff = pte_to_pgoff(orig_pte);
+ err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
+ vma->vm_page_prot, pgoff, 0);
if (err == -ENOMEM)
return VM_FAULT_OOM;
if (err)
@@ -1973,56 +1930,56 @@ static int do_file_page(struct mm_struct
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * Note the "page_table_lock". It is to protect against kswapd removing
- * pages from under us. Note that kswapd only ever _removes_ pages, never
- * adds them. As such, once we have noticed that the page is not present,
- * we can drop the lock early.
- *
- * The adding of pages is protected by the MM semaphore (which we hold),
- * so we don't need to worry about a page being suddenly been added into
- * our VM.
- *
- * We enter with the pagetable spinlock held, we are supposed to
- * release it when done.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static inline int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct * vma, unsigned long address,
- int write_access, pte_t *pte, pmd_t *pmd)
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pmd_t *pmd, int write_access)
{
pte_t entry;
+ spinlock_t *ptl;
entry = *pte;
if (!pte_present(entry)) {
- /*
- * If it truly wasn't present, we know that kswapd
- * and the PTE updates will not touch it later. So
- * drop the lock.
- */
- if (pte_none(entry))
- return do_no_page(mm, vma, address, write_access, pte, pmd);
+ if (pte_none(entry)) {
+ if (!vma->vm_ops || !vma->vm_ops->nopage)
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, write_access);
+ return do_no_page(mm, vma, address,
+ pte, pmd, write_access);
+ }
if (pte_file(entry))
- return do_file_page(mm, vma, address, write_access, pte, pmd);
- return do_swap_page(mm, vma, address, pte, pmd, entry, write_access);
+ return do_file_page(mm, vma, address,
+ pte, pmd, write_access, entry);
+ return do_swap_page(mm, vma, address,
+ pte, pmd, write_access, entry);
}
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (unlikely(!pte_same(*pte, entry)))
+ goto out;
if (write_access) {
if (!pte_write(entry))
- return do_wp_page(mm, vma, address, pte, pmd, entry);
+ return do_wp_page(mm, vma, address,
+ pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
ptep_set_access_flags(vma, address, pte, entry, write_access);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+out:
+ pte_unmap_unlock(pte, ptl);
return VM_FAULT_MINOR;
}
/*
* By the time we get here, we already hold the mm semaphore
*/
-int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access)
{
pgd_t *pgd;
@@ -2036,30 +1993,19 @@ int __handle_mm_fault(struct mm_struct *
if (is_vm_hugetlb_page(vma))
return VM_FAULT_SIGBUS; /* mapping truncation does this. */
-
- /*
- * We need the page table lock to synchronize with kswapd
- * and the SMP-safe atomic PTE updates.
- */
pgd = pgd_offset(mm, address);
- spin_lock(&mm->page_table_lock);
-
pud = pud_alloc(mm, pgd, address);
if (!pud)
goto oom;
-
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
goto oom;
-
pte = pte_alloc_map(mm, pmd, address);
if (!pte)
goto oom;
-
- return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
- oom:
- spin_unlock(&mm->page_table_lock);
+ return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+oom:
return VM_FAULT_OOM;
}
@@ -2067,29 +2013,22 @@ int __handle_mm_fault(struct mm_struct *
/*
* Allocate page upper directory.
*
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
*/
-pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+pud_t *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
- pud_t *new;
-
- spin_unlock(&mm->page_table_lock);
- new = pud_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
+ pud_t *new = pud_alloc_one(mm, address);
if (!new)
return NULL;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
+ spin_lock(&mm->page_table_lock);
if (pgd_present(*pgd)) {
pud_free(new);
goto out;
}
pgd_populate(mm, pgd, new);
- out:
+out:
+ spin_unlock(&mm->page_table_lock);
return pud_offset(pgd, address);
}
#endif /* __PAGETABLE_PUD_FOLDED */
@@ -2098,23 +2037,15 @@ pud_t fastcall *__pud_alloc(struct mm_st
/*
* Allocate page middle directory.
*
- * We've already handled the fast-path in-line, and we own the
- * page table lock.
+ * We've already handled the fast-path in-line.
*/
-pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+pmd_t *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
- pmd_t *new;
-
- spin_unlock(&mm->page_table_lock);
- new = pmd_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
+ pmd_t *new = pmd_alloc_one(mm, address);
if (!new)
return NULL;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
+ spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
if (pud_present(*pud)) {
pmd_free(new);
@@ -2129,7 +2060,8 @@ pmd_t fastcall *__pmd_alloc(struct mm_st
pgd_populate(mm, pud, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
- out:
+out:
+ spin_unlock(&mm->page_table_lock);
return pmd_offset(pud, address);
}
#endif /* __PAGETABLE_PMD_FOLDED */
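
The memory.c conversion settles on one protocol, spelled out in the new
comments: the fault handlers enter with mmap_sem held for read and the pte
mapped but not locked, and return with it unmapped and unlocked.  Anything
that may have slept in between must relock and re-check before trusting its
earlier snapshot of the pte.  Stripped down, the step that do_wp_page() and
do_swap_page() repeat, and that do_anonymous_page() and do_no_page() mirror
with a pte_none or truncate_count check instead:

        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (unlikely(!pte_same(*page_table, orig_pte))) {
                /* someone else serviced this fault meanwhile: back out */
                pte_unmap_unlock(page_table, ptl);
                goto out;
        }
        /* ... update the pte, mm counters and rmap under ptl ... */
        pte_unmap_unlock(page_table, ptl);

Two details worth noting.  Where a pte is wider than a word (i386 PAE,
say), even the initial read of orig_pte is not atomic, which is what the
sizeof(pte_t) > sizeof(unsigned long) re-checks in do_swap_page() and
do_file_page() are about.  And follow_page() now takes a reference on the
page (the new acquire argument) before the pte lock is dropped, so that
get_user_pages() and the futex fast path can go on using the page without
holding any page table lock at all.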
--- 26136m1-/mm/mempolicy.c 2005-08-19 14:30:14.000000000 +0100
+++ 26136m1+/mm/mempolicy.c 2005-08-22 12:41:30.000000000 +0100
@@ -243,9 +243,9 @@ static int check_pte_range(struct mm_str
{
pte_t *orig_pte;
pte_t *pte;
+ spinlock_t *ptl;
- spin_lock(&mm->page_table_lock);
- orig_pte = pte = pte_offset_map(pmd, addr);
+ orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
do {
unsigned long pfn;
unsigned int nid;
@@ -259,8 +259,7 @@ static int check_pte_range(struct mm_str
if (!test_bit(nid, nodes))
break;
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(orig_pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(orig_pte, ptl);
return addr != end;
}
--- 26136m1-/mm/mmap.c 2005-08-19 14:30:14.000000000 +0100
+++ 26136m1+/mm/mmap.c 2005-08-22 12:41:30.000000000 +0100
@@ -177,26 +177,36 @@ static void __remove_shared_vm_struct(st
}
/*
- * Remove one vm structure and free it.
+ * Unlink a file-based vm structure from its prio_tree
+ * to hide it from rmap before freeing its page tables.
*/
-static void remove_vm_struct(struct vm_area_struct *vma)
+void unlink_file_vma(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
- might_sleep();
if (file) {
struct address_space *mapping = file->f_mapping;
spin_lock(&mapping->i_mmap_lock);
__remove_shared_vm_struct(vma, file, mapping);
spin_unlock(&mapping->i_mmap_lock);
}
+}
+
+/*
+ * Close a vm structure and free it, returning the next.
+ */
+static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+{
+ struct vm_area_struct *next = vma->vm_next;
+
+ might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (file)
- fput(file);
- anon_vma_unlink(vma);
+ if (vma->vm_file)
+ fput(vma->vm_file);
mpol_free(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
+ return next;
}
asmlinkage unsigned long sys_brk(unsigned long brk)
@@ -1599,44 +1609,26 @@ find_extend_vma(struct mm_struct * mm, u
}
#endif
-/* Normal function to fix up a mapping
- * This function is the default for when an area has no specific
- * function. This may be used as part of a more specific routine.
- *
- * By the time this function is called, the area struct has been
- * removed from the process mapping list.
- */
-static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
-{
- size_t len = area->vm_end - area->vm_start;
-
- area->vm_mm->total_vm -= len >> PAGE_SHIFT;
- if (area->vm_flags & VM_LOCKED)
- area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
- vm_stat_unaccount(area);
- remove_vm_struct(area);
-}
-
/*
- * Update the VMA and inode share lists.
- *
- * Ok - we have the memory areas we should free on the 'free' list,
+ * Ok - we have the memory areas we should free on the vma list,
* so release them, and do the vma updates.
*/
-static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
+
do {
- struct vm_area_struct *next = vma->vm_next;
- unmap_vma(mm, vma);
- vma = next;
+ long pages = vma_pages(vma);
+ mm->total_vm -= pages;
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm -= pages;
+ __vm_stat_account(mm, vma->vm_flags, vma->vm_file, -pages);
+ vma = remove_vma(vma);
} while (vma);
validate_mm(mm);
}
/*
* Get rid of page table information in the indicated region.
- *
- * Called with the page table lock held.
*/
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
@@ -1647,14 +1639,14 @@ static void unmap_region(struct mm_struc
unsigned long nr_accounted = 0;
lru_add_drain();
- spin_lock(&mm->page_table_lock);
+ preempt_disable();
tlb = tlb_gather_mmu(mm, 0);
unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
- spin_unlock(&mm->page_table_lock);
+ preempt_enable();
}
/*
@@ -1795,7 +1787,7 @@ int do_munmap(struct mm_struct *mm, unsi
unmap_region(mm, vma, prev, start, end);
/* Fix up all other VM information */
- unmap_vma_list(mm, vma);
+ remove_vma_list(mm, vma);
return 0;
}
@@ -1929,9 +1921,7 @@ void exit_mmap(struct mm_struct *mm)
unsigned long end;
lru_add_drain();
-
- spin_lock(&mm->page_table_lock);
-
+ preempt_disable();
flush_cache_mm(mm);
tlb = tlb_gather_mmu(mm, 1);
/* Use -1 here to ensure all VMAs in the mm are unmapped */
@@ -1939,24 +1929,13 @@ void exit_mmap(struct mm_struct *mm)
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
-
- mm->mmap = mm->mmap_cache = NULL;
- mm->mm_rb = RB_ROOT;
- set_mm_counter(mm, rss, 0);
- mm->total_vm = 0;
- mm->locked_vm = 0;
-
- spin_unlock(&mm->page_table_lock);
+ preempt_enable();
/*
* Walk the list again, actually closing and freeing it
- * without holding any MM locks.
*/
- while (vma) {
- struct vm_area_struct *next = vma->vm_next;
- remove_vm_struct(vma);
- vma = next;
- }
+ while (vma)
+ vma = remove_vma(vma);
BUG_ON(get_mm_counter(mm, nr_ptes) > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
--- 26136m1-/mm/mprotect.c 2005-08-20 16:44:38.000000000 +0100
+++ 26136m1+/mm/mprotect.c 2005-08-22 12:41:30.000000000 +0100
@@ -29,8 +29,9 @@ static void change_pte_range(struct mm_s
unsigned long addr, unsigned long end, pgprot_t newprot)
{
pte_t *pte;
+ spinlock_t *ptl;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
do {
if (pte_present(*pte)) {
pte_t ptent;
@@ -44,7 +45,7 @@ static void change_pte_range(struct mm_s
lazy_mmu_prot_update(ptent);
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
}
static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -88,7 +89,6 @@ static void change_protection(struct vm_
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -96,7 +96,6 @@ static void change_protection(struct vm_
change_pud_range(mm, pgd, addr, next, newprot);
} while (pgd++, addr = next, addr != end);
flush_tlb_range(vma, start, end);
- spin_unlock(&mm->page_table_lock);
}
static int
--- 26136m1-/mm/mremap.c 2005-08-19 14:30:14.000000000 +0100
+++ 26136m1+/mm/mremap.c 2005-08-22 12:41:30.000000000 +0100
@@ -22,35 +22,7 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte = NULL;
-
- pgd = pgd_offset(mm, addr);
- if (pgd_none_or_clear_bad(pgd))
- goto end;
-
- pud = pud_offset(pgd, addr);
- if (pud_none_or_clear_bad(pud))
- goto end;
-
- pmd = pmd_offset(pud, addr);
- if (pmd_none_or_clear_bad(pmd))
- goto end;
-
- pte = pte_offset_map_nested(pmd, addr);
- if (pte_none(*pte)) {
- pte_unmap_nested(pte);
- pte = NULL;
- }
-end:
- return pte;
-}
-
-static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
@@ -68,35 +40,39 @@ static pte_t *get_one_pte_map(struct mm_
if (pmd_none_or_clear_bad(pmd))
return NULL;
- return pte_offset_map(pmd, addr);
+ return pmd;
}
-static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
-
pud = pud_alloc(mm, pgd, addr);
if (!pud)
return NULL;
+
pmd = pmd_alloc(mm, pud, addr);
- if (pmd)
- pte = pte_alloc_map(mm, pmd, addr);
- return pte;
+ if (!pmd)
+ return NULL;
+
+ if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr) < 0)
+ return NULL;
+
+ return pmd;
}
-static int
-move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
- struct vm_area_struct *new_vma, unsigned long new_addr)
+static void move_ptes(struct vm_area_struct *vma,
+ unsigned long old_addr, pmd_t *old_pmd,
+ unsigned long old_end, struct vm_area_struct *new_vma,
+ unsigned long new_addr, pmd_t *new_pmd)
{
struct address_space *mapping = NULL;
struct mm_struct *mm = vma->vm_mm;
- int error = 0;
- pte_t *src, *dst;
+ pte_t *old_pte, *new_pte, pte;
+ spinlock_t *old_ptl, *new_ptl;
if (vma->vm_file) {
/*
@@ -111,74 +87,75 @@ move_one_page(struct vm_area_struct *vma
new_vma->vm_truncate_count != vma->vm_truncate_count)
new_vma->vm_truncate_count = 0;
}
- spin_lock(&mm->page_table_lock);
- src = get_one_pte_map_nested(mm, old_addr);
- if (src) {
- /*
- * Look to see whether alloc_one_pte_map needs to perform a
- * memory allocation. If it does then we need to drop the
- * atomic kmap
- */
- dst = get_one_pte_map(mm, new_addr);
- if (unlikely(!dst)) {
- pte_unmap_nested(src);
- if (mapping)
- spin_unlock(&mapping->i_mmap_lock);
- dst = alloc_one_pte_map(mm, new_addr);
- if (mapping && !spin_trylock(&mapping->i_mmap_lock)) {
- spin_unlock(&mm->page_table_lock);
- spin_lock(&mapping->i_mmap_lock);
- spin_lock(&mm->page_table_lock);
- }
- src = get_one_pte_map_nested(mm, old_addr);
- }
- /*
- * Since alloc_one_pte_map can drop and re-acquire
- * page_table_lock, we should re-check the src entry...
- */
- if (src) {
- if (dst) {
- pte_t pte;
- pte = ptep_clear_flush(vma, old_addr, src);
- /* ZERO_PAGE can be dependant on virtual addr */
- if (pfn_valid(pte_pfn(pte)) &&
- pte_page(pte) == ZERO_PAGE(old_addr))
- pte = pte_wrprotect(mk_pte(ZERO_PAGE(new_addr), new_vma->vm_page_prot));
- set_pte_at(mm, new_addr, dst, pte);
- } else
- error = -ENOMEM;
- pte_unmap_nested(src);
- }
- if (dst)
- pte_unmap(dst);
- }
- spin_unlock(&mm->page_table_lock);
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * pte locks because exclusive mmap_sem prevents deadlock.
+ */
+ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+ new_pte = pte_offset_map_nested(new_pmd, new_addr);
+ new_ptl = pte_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock(new_ptl);
+
+ for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
+ new_pte++, new_addr += PAGE_SIZE) {
+ if (pte_none(*old_pte))
+ continue;
+ pte = ptep_clear_flush(vma, old_addr, old_pte);
+#ifdef CONFIG_MIPS
+ /* ZERO_PAGE can be dependant on virtual addr */
+ if (pfn_valid(pte_pfn(pte)) &&
+ pte_page(pte) == ZERO_PAGE(old_addr))
+ pte = pte_wrprotect(mk_pte(ZERO_PAGE(new_addr),
+ new_vma->vm_page_prot));
+#endif
+ set_pte_at(mm, new_addr, new_pte, pte);
+ }
+
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ pte_unmap_nested(new_pte - 1);
+ pte_unmap_unlock(old_pte - 1, old_ptl);
+
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
- return error;
}
+#define LATENCY_LIMIT (64 * PAGE_SIZE)
+
static unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len)
{
- unsigned long offset;
+ unsigned long extent, next, old_end;
+ pmd_t *old_pmd, *new_pmd;
- flush_cache_range(vma, old_addr, old_addr + len);
+ old_end = old_addr + len;
+ flush_cache_range(vma, old_addr, old_end);
- /*
- * This is not the clever way to do this, but we're taking the
- * easy way out on the assumption that most remappings will be
- * only a few pages.. This also makes error recovery easier.
- */
- for (offset = 0; offset < len; offset += PAGE_SIZE) {
- if (move_one_page(vma, old_addr + offset,
- new_vma, new_addr + offset) < 0)
- break;
+ for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
+ next = (old_addr + PMD_SIZE) & PMD_MASK;
+ if (next - 1 > old_end)
+ next = old_end;
+ extent = next - old_addr;
+ old_pmd = get_old_pmd(vma->vm_mm, old_addr);
+ if (!old_pmd)
+ continue;
+ new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+ if (!new_pmd)
+ break;
+ next = (new_addr + PMD_SIZE) & PMD_MASK;
+ if (extent > next - new_addr)
+ extent = next - new_addr;
+ if (extent > LATENCY_LIMIT)
+ extent = LATENCY_LIMIT;
+ move_ptes(vma, old_addr, old_pmd, old_addr + extent,
+ new_vma, new_addr, new_pmd);
}
- return offset;
+
+ return len + old_addr - old_end; /* how much done */
}
static unsigned long move_vma(struct vm_area_struct *vma,
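
move_ptes() is the one place that holds two pte locks at once.  The comment
explains why the order does not matter: mremap runs with mmap_sem held for
writing, so nobody else can be taking this mm's pte locks two at a time in
the opposite order, and the new_ptl != old_ptl test covers the case where
both ranges are guarded by the same lock.  A userspace toy of that argument
(model only, nothing kernel about it, file name made up):

        /* Two inner locks taken in either order are deadlock-free as long
         * as every double-locker first holds an outer exclusive lock
         * ("mmap_sem" here).  Build with: cc -O2 -pthread move-order.c */
        #include <pthread.h>
        #include <stdio.h>

        static pthread_mutex_t mmap_sem = PTHREAD_MUTEX_INITIALIZER;
        static pthread_mutex_t old_ptl  = PTHREAD_MUTEX_INITIALIZER;
        static pthread_mutex_t new_ptl  = PTHREAD_MUTEX_INITIALIZER;

        static void move(pthread_mutex_t *a, pthread_mutex_t *b)
        {
                pthread_mutex_lock(&mmap_sem); /* serializes all movers */
                pthread_mutex_lock(a);
                if (b != a)                    /* same lock for both ranges? */
                        pthread_mutex_lock(b);
                /* ... move the ptes ... */
                if (b != a)
                        pthread_mutex_unlock(b);
                pthread_mutex_unlock(a);
                pthread_mutex_unlock(&mmap_sem);
        }

        static void *t1(void *x) { (void)x; move(&old_ptl, &new_ptl); return NULL; }
        static void *t2(void *x) { (void)x; move(&new_ptl, &old_ptl); return NULL; }

        int main(void)
        {
                pthread_t a, b;

                pthread_create(&a, NULL, t1, NULL);
                pthread_create(&b, NULL, t2, NULL);
                pthread_join(a, NULL);
                pthread_join(b, NULL);
                puts("no deadlock");
                return 0;
        }

LATENCY_LIMIT then caps how much is moved per acquisition, so the pair of
locks is never held across more than 64 pages at a time.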
--- 26136m1-/mm/msync.c 2005-08-08 11:57:25.000000000 +0100
+++ 26136m1+/mm/msync.c 2005-08-22 12:41:30.000000000 +0100
@@ -17,21 +17,25 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
-/*
- * Called with mm->page_table_lock held to protect against other
- * threads/the swapper from ripping pte's out from under us.
- */
-
static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end)
{
pte_t *pte;
+ spinlock_t *ptl;
+ int progress = 0;
- pte = pte_offset_map(pmd, addr);
+again:
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
unsigned long pfn;
struct page *page;
+ if (progress >= 64) {
+ progress = 0;
+ if (need_resched() || need_lockbreak(ptl))
+ break;
+ }
+ progress++;
if (!pte_present(*pte))
continue;
if (!pte_maybe_dirty(*pte))
@@ -46,8 +50,12 @@ static void sync_pte_range(struct vm_are
if (ptep_clear_flush_dirty(vma, addr, pte) ||
page_test_and_clear_dirty(page))
set_page_dirty(page);
+ progress += 3;
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+ if (addr != end)
+ goto again;
}
static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -96,38 +104,13 @@ static void sync_page_range(struct vm_ar
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- spin_lock(&mm->page_table_lock);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
sync_pud_range(vma, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
- spin_unlock(&mm->page_table_lock);
-}
-
-#ifdef CONFIG_PREEMPT
-static inline void filemap_sync(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
-{
- const size_t chunk = 64 * 1024; /* bytes */
- unsigned long next;
-
- do {
- next = addr + chunk;
- if (next > end || next < addr)
- next = end;
- sync_page_range(vma, addr, next);
- cond_resched();
- } while (addr = next, addr != end);
-}
-#else
-static inline void filemap_sync(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
-{
- sync_page_range(vma, addr, end);
}
-#endif
/*
* MS_SYNC syncs the entire file - including mappings.
@@ -150,7 +133,7 @@ static int msync_interval(struct vm_area
return -EBUSY;
if (file && (vma->vm_flags & VM_SHARED)) {
- filemap_sync(vma, addr, end);
+ sync_page_range(vma, addr, end);
if (flags & MS_SYNC) {
struct address_space *mapping = file->f_mapping;
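
filemap_sync()'s old PREEMPT-only 64k chunking is replaced by doing the
latency limiting inside sync_pte_range() itself, where the pte lock is
actually held: each pte costs 1, a dirty one roughly 4 (the extra
progress += 3), and once progress reaches 64 the lock can be dropped for
cond_resched() or a lock break, resuming at addr.  With 4KB pages that is
at most 64 clean or absent ptes (256KB of address space) or about 16 dirty
ones (64KB) per lock hold, so MS_SYNC over a huge mapping no longer relies
on CONFIG_PREEMPT to avoid hogging a lock or a cpu.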
--- 26136m1-/mm/rmap.c 2005-08-20 16:44:38.000000000 +0100
+++ 26136m1+/mm/rmap.c 2005-08-22 12:41:30.000000000 +0100
@@ -244,37 +244,44 @@ unsigned long page_address_in_vma(struct
/*
* Check that @page is mapped at @address into @mm.
*
- * On success returns with mapped pte and locked mm->page_table_lock.
+ * On success returns with mapped pte and pte_lock.
*/
pte_t *page_check_address(struct page *page, struct mm_struct *mm,
- unsigned long address)
+ unsigned long address, spinlock_t **ptlp)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ spinlock_t *ptl;
- /*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
- */
- spin_lock(&mm->page_table_lock);
pgd = pgd_offset(mm, address);
- if (likely(pgd_present(*pgd))) {
- pud = pud_offset(pgd, address);
- if (likely(pud_present(*pud))) {
- pmd = pmd_offset(pud, address);
- if (likely(pmd_present(*pmd))) {
- pte = pte_offset_map(pmd, address);
- if (likely(pte_present(*pte) &&
- page_to_pfn(page) == pte_pfn(*pte)))
- return pte;
- pte_unmap(pte);
- }
- }
+ if (!pgd_present(*pgd))
+ return NULL;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return NULL;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return NULL;
+
+ pte = pte_offset_map(pmd, address);
+ /* Make a quick check before getting the lock */
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return NULL;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
+ *ptlp = ptl;
+ return pte;
}
- spin_unlock(&mm->page_table_lock);
- return ERR_PTR(-ENOENT);
+ pte_unmap_unlock(pte, ptl);
+ return NULL;
}
/*
@@ -287,28 +294,28 @@ static int page_referenced_one(struct pa
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *pte;
+ spinlock_t *ptl;
int referenced = 0;
address = vma_address(page, vma);
if (address == -EFAULT)
goto out;
- pte = page_check_address(page, mm, address);
- if (!IS_ERR(pte)) {
- if (ptep_clear_flush_young(vma, address, pte))
- referenced++;
+ pte = page_check_address(page, mm, address, &ptl);
+ if (!pte)
+ goto out;
- /* Pretend the page is referenced if the task has the
- swap token and is in the middle of a page fault. */
- if (mm != current->mm && !ignore_token &&
- has_swap_token(mm) &&
- sem_is_read_locked(&mm->mmap_sem))
- referenced++;
+ if (ptep_clear_flush_young(vma, address, pte))
+ referenced++;
- (*mapcount)--;
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
- }
+ /* Pretend the page is referenced if the task has the
+ swap token and is in the middle of a page fault. */
+ if (mm != current->mm && !ignore_token && has_swap_token(mm) &&
+ sem_is_read_locked(&mm->mmap_sem))
+ referenced++;
+
+ (*mapcount)--;
+ pte_unmap_unlock(pte, ptl);
out:
return referenced;
}
@@ -438,7 +445,7 @@ int page_referenced(struct page *page, i
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte_lock.
*/
void page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
@@ -465,7 +472,7 @@ void page_add_anon_rmap(struct page *pag
* page_add_file_rmap - add pte mapping to a file page
* @page: the page to add the mapping to
*
- * The caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte_lock.
*/
void page_add_file_rmap(struct page *page)
{
@@ -481,7 +488,7 @@ void page_add_file_rmap(struct page *pag
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
*
- * Caller needs to hold the mm->page_table_lock.
+ * The caller needs to hold the pte_lock.
*/
void page_remove_rmap(struct page *page)
{
@@ -514,14 +521,15 @@ static int try_to_unmap_one(struct page
unsigned long address;
pte_t *pte;
pte_t pteval;
+ spinlock_t *ptl;
int ret = SWAP_AGAIN;
address = vma_address(page, vma);
if (address == -EFAULT)
goto out;
- pte = page_check_address(page, mm, address);
- if (IS_ERR(pte))
+ pte = page_check_address(page, mm, address, &ptl);
+ if (!pte)
goto out;
/*
@@ -555,7 +563,8 @@ static int try_to_unmap_one(struct page
swap_duplicate(entry);
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
- list_add(&mm->mmlist, &init_mm.mmlist);
+ if (list_empty(&mm->mmlist))
+ list_add(&mm->mmlist, &init_mm.mmlist);
spin_unlock(&mmlist_lock);
}
set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
@@ -568,8 +577,7 @@ static int try_to_unmap_one(struct page
page_cache_release(page);
out_unmap:
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
out:
return ret;
}
@@ -603,19 +611,14 @@ static void try_to_unmap_cluster(unsigne
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte, *original_pte;
+ pte_t *pte;
pte_t pteval;
+ spinlock_t *ptl;
struct page *page;
unsigned long address;
unsigned long end;
unsigned long pfn;
- /*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
- */
- spin_lock(&mm->page_table_lock);
-
address = (vma->vm_start + cursor) & CLUSTER_MASK;
end = address + CLUSTER_SIZE;
if (address < vma->vm_start)
@@ -625,17 +628,17 @@ static void try_to_unmap_cluster(unsigne
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
- goto out_unlock;
+ return;
pud = pud_offset(pgd, address);
if (!pud_present(*pud))
- goto out_unlock;
+ return;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
- goto out_unlock;
+ return;
- for (original_pte = pte = pte_offset_map(pmd, address);
+ for (pte = pte_offset_map_lock(mm, pmd, address, &ptl);
address < end; pte++, address += PAGE_SIZE) {
if (!pte_present(*pte))
@@ -671,9 +674,7 @@ static void try_to_unmap_cluster(unsigne
(*mapcount)--;
}
- pte_unmap(original_pte);
-out_unlock:
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte - 1, ptl);
}
static int try_to_unmap_anon(struct page *page)
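
The calling convention for the new page_check_address, as page_referenced_one
and try_to_unmap_one use it above, is: unlocked peek at the pte first, then
the per-table lock via pte_lockptr, recheck under the lock, and the caller
releases with pte_unmap_unlock.  A minimal caller sketch (the function name
below is made up purely for illustration):

/* Was this page recently referenced through this vma mapping? */
static int was_referenced(struct page *page, struct vm_area_struct *vma,
			  unsigned long address)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	pte_t *pte;
	int young;

	pte = page_check_address(page, mm, address, &ptl);
	if (!pte)
		return 0;	/* not (or no longer) mapped at address */
	/* pte is mapped and ptl is held: the entry cannot change under us */
	young = ptep_clear_flush_young(vma, address, pte);
	pte_unmap_unlock(pte, ptl);	/* drops ptl and unmaps pte */
	return young;
}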
--- 26136m1-/mm/swap_state.c 2005-08-19 14:30:14.000000000 +0100
+++ 26136m1+/mm/swap_state.c 2005-08-22 12:41:30.000000000 +0100
@@ -263,8 +263,7 @@ static inline void free_swap_cache(struc
/*
* Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page. Can not do a lock_page,
- * as we are holding the page_table_lock spinlock.
+ * this page if it is the last user of the page.
*/
void free_page_and_swap_cache(struct page *page)
{
--- 26136m1-/mm/swapfile.c 2005-08-19 14:30:14.000000000 +0100
+++ 26136m1+/mm/swapfile.c 2005-08-22 12:41:30.000000000 +0100
@@ -397,8 +397,6 @@ void free_swap_and_cache(swp_entry_t ent
/*
* Since we're swapping it in, we mark it as old.
- *
- * vma->vm_mm->page_table_lock is held.
*/
static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, swp_entry_t entry, struct page *page)
@@ -420,23 +418,25 @@ static int unuse_pte_range(struct vm_are
unsigned long addr, unsigned long end,
swp_entry_t entry, struct page *page)
{
- pte_t *pte;
pte_t swp_pte = swp_entry_to_pte(entry);
+ pte_t *pte;
+ spinlock_t *ptl;
+ int found = 0;
- pte = pte_offset_map(pmd, addr);
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
/*
* swapoff spends a _lot_ of time in this loop!
* Test inline before going to call unuse_pte.
*/
if (unlikely(pte_same(*pte, swp_pte))) {
- unuse_pte(vma, pte, addr, entry, page);
- pte_unmap(pte);
- return 1;
+ unuse_pte(vma, pte++, addr, entry, page);
+ found = 1;
+ break;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap(pte - 1);
- return 0;
+ pte_unmap_unlock(pte - 1, ptl);
+ return found;
}
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -518,12 +518,10 @@ static int unuse_mm(struct mm_struct *mm
down_read(&mm->mmap_sem);
lock_page(page);
}
- spin_lock(&mm->page_table_lock);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma && unuse_vma(vma, entry, page))
break;
}
- spin_unlock(&mm->page_table_lock);
up_read(&mm->mmap_sem);
/*
* Currently unuse_mm cannot fail, but leave error handling
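
Assembled from the hunk above, unuse_pte_range now has a single unlock
covering both exits: the match path advances pte before breaking, so
pte_unmap_unlock(pte - 1, ptl) is right whether the loop completed or
broke early:

static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pte_t swp_pte = swp_entry_to_pte(entry);
	pte_t *pte;
	spinlock_t *ptl;
	int found = 0;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		/*
		 * swapoff spends a _lot_ of time in this loop!
		 * Test inline before going to call unuse_pte.
		 */
		if (unlikely(pte_same(*pte, swp_pte))) {
			/* advance pte so the unlock below stays correct */
			unuse_pte(vma, pte++, addr, entry, page);
			found = 1;
			break;
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(pte - 1, ptl);
	return found;
}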
--- 26136m1-/mm/vmalloc.c 2005-06-17 20:48:29.000000000 +0100
+++ 26136m1+/mm/vmalloc.c 2005-08-22 12:41:30.000000000 +0100
@@ -88,7 +88,7 @@ static int vmap_pte_range(pmd_t *pmd, un
{
pte_t *pte;
- pte = pte_alloc_kernel(&init_mm, pmd, addr);
+ pte = pte_alloc_kernel(pmd, addr);
if (!pte)
return -ENOMEM;
do {
@@ -146,14 +146,12 @@ int map_vm_area(struct vm_struct *area,
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
- spin_lock(&init_mm.page_table_lock);
do {
next = pgd_addr_end(addr, end);
err = vmap_pud_range(pgd, addr, next, prot, pages);
if (err)
break;
} while (pgd++, addr = next, addr != end);
- spin_unlock(&init_mm.page_table_lock);
flush_cache_vmap((unsigned long) area->addr, end);
return err;
}
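
The vmalloc side is pure lock plumbing: pte_alloc_kernel loses its mm
argument and no longer relies on the caller holding init_mm.page_table_lock,
so map_vm_area drops the lock around the walk.  Inside vmap_pte_range the
allocation now reads simply (rest of the function unchanged):

	pte_t *pte;

	/*
	 * Previously pte_alloc_kernel(&init_mm, pmd, addr), called with
	 * map_vm_area holding init_mm.page_table_lock around the walk.
	 */
	pte = pte_alloc_kernel(pmd, addr);
	if (!pte)
		return -ENOMEM;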
--