* [RFC] atomic pte updates and pae changes, take 2
From: Ben LaHaise @ 2000-10-14 0:17 UTC
To: torvalds, Ingo Molnar; +Cc: linux-mm, linux-kernel
Hey folks
Below is take two of the patch making pte_clear use an atomic xchg in an
effort to avoid losing dirty bits. PAE no longer uses cmpxchg8b for
updates; set_pte is now two ordered long writes separated by a barrier. The
use of long long for ptes is also removed, so gcc should generate better
code. A quick test with filemap_rw shows no measurable difference between
PAE and non-PAE code, and no degradation from the original non-atomic
non-PAE code. This code has been tested in PAE mode on a box with 4GB
(about 48MB of it above the 4GB boundary), and in non-PAE mode on a couple
of other boxes. Linus: comments? Ingo: could you have a look over the
code? Thanks,
-ben
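[Editor's note: before the diff itself, a stand-alone sketch of the two
primitives the patch introduces may help. This is a user-space toy, not
kernel code: xchg_ulong() and the gcc __sync builtin are hypothetical
stand-ins for the kernel's xchg(); the versions in the diff below are the
authoritative ones.]

#include <stdio.h>

typedef struct { unsigned long pte_low, pte_high; } pte_t;

#define barrier() __asm__ __volatile__("" ::: "memory")

/* Hypothetical stand-in for the kernel's xchg(): an atomic exchange. */
static unsigned long xchg_ulong(unsigned long *p, unsigned long v)
{
	return __sync_lock_test_and_set(p, v);
}

/* Swap 0 into the low word atomically: unlike a read followed by a
 * separate clear, this cannot lose a dirty bit that the hardware sets
 * in between. The high word is only touched once the present bit
 * (in the low word) is already gone. */
static pte_t pte_get_and_clear(pte_t *ptep)
{
	pte_t res;

	res.pte_low = xchg_ulong(&ptep->pte_low, 0);
	res.pte_high = ptep->pte_high;
	ptep->pte_high = 0;
	return res;
}

/* High word first: the entry only becomes live once the low word
 * (which carries the present bit) is written. */
static void set_pte(pte_t *ptep, pte_t pte)
{
	ptep->pte_high = pte.pte_high;
	barrier();
	ptep->pte_low = pte.pte_low;
}

int main(void)
{
	pte_t slot = { 0, 0 };
	pte_t pte = { 0x1067, 0x1 };	/* low word holds the flag bits */

	set_pte(&slot, pte);
	pte = pte_get_and_clear(&slot);
	printf("pte: low=%#lx high=%#lx\n", pte.pte_low, pte.pte_high);
	return 0;
}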
diff -ur v2.4.0-test10-pre2/arch/i386/boot/install.sh work-10-2/arch/i386/boot/install.sh
--- v2.4.0-test10-pre2/arch/i386/boot/install.sh Tue Jan 3 06:57:26 1995
+++ work-10-2/arch/i386/boot/install.sh Fri Oct 13 17:19:47 2000
@@ -21,6 +21,7 @@
# User may have a custom install script
+if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi
if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi
# Default install - same as make zlilo
diff -ur v2.4.0-test10-pre2/include/asm-i386/page.h work-10-2/include/asm-i386/page.h
--- v2.4.0-test10-pre2/include/asm-i386/page.h Thu Oct 12 17:42:11 2000
+++ work-10-2/include/asm-i386/page.h Fri Oct 13 17:36:02 2000
@@ -37,20 +37,20 @@
* These are used to make use of C type-checking..
*/
#if CONFIG_X86_PAE
-typedef struct { unsigned long long pte; } pte_t;
+typedef struct { unsigned long pte_low, pte_high; } pte_t;
typedef struct { unsigned long long pmd; } pmd_t;
typedef struct { unsigned long long pgd; } pgd_t;
-#define PTE_MASK (~(unsigned long long) (PAGE_SIZE-1))
+#define pte_val(x) ((x).pte_low | ((unsigned long long)(x).pte_high << 32))
#else
-typedef struct { unsigned long pte; } pte_t;
+typedef struct { unsigned long pte_low; } pte_t;
typedef struct { unsigned long pmd; } pmd_t;
typedef struct { unsigned long pgd; } pgd_t;
-#define PTE_MASK PAGE_MASK
+#define pte_val(x) ((x).pte_low)
#endif
+#define PTE_MASK PAGE_MASK
typedef struct { unsigned long pgprot; } pgprot_t;
-#define pte_val(x) ((x).pte)
#define pmd_val(x) ((x).pmd)
#define pgd_val(x) ((x).pgd)
#define pgprot_val(x) ((x).pgprot)
diff -ur v2.4.0-test10-pre2/include/asm-i386/pgtable-2level.h work-10-2/include/asm-i386/pgtable-2level.h
--- v2.4.0-test10-pre2/include/asm-i386/pgtable-2level.h Fri Dec 3 14:12:23 1999
+++ work-10-2/include/asm-i386/pgtable-2level.h Fri Oct 13 17:41:14 2000
@@ -18,7 +18,7 @@
#define PTRS_PER_PTE 1024
#define pte_ERROR(e) \
- printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
+ printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
#define pmd_ERROR(e) \
printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
#define pgd_ERROR(e) \
@@ -54,5 +54,12 @@
{
return (pmd_t *) dir;
}
+
+#define __HAVE_ARCH_pte_get_and_clear
+#define pte_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0))
+#define pte_same(a, b) ((a).pte_low == (b).pte_low)
+#define pte_page(x) (mem_map+((unsigned long)(((x).pte_low >> PAGE_SHIFT))))
+#define pte_none(x) (!(x).pte_low)
+#define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
#endif /* _I386_PGTABLE_2LEVEL_H */
diff -ur v2.4.0-test10-pre2/include/asm-i386/pgtable-3level.h work-10-2/include/asm-i386/pgtable-3level.h
--- v2.4.0-test10-pre2/include/asm-i386/pgtable-3level.h Mon Dec 6 19:19:13 1999
+++ work-10-2/include/asm-i386/pgtable-3level.h Fri Oct 13 17:39:53 2000
@@ -27,7 +27,7 @@
#define PTRS_PER_PTE 512
#define pte_ERROR(e) \
- printk("%s:%d: bad pte %p(%016Lx).\n", __FILE__, __LINE__, &(e), pte_val(e))
+ printk("%s:%d: bad pte %p(%08lx%08lx).\n", __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
#define pmd_ERROR(e) \
printk("%s:%d: bad pmd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
#define pgd_ERROR(e) \
@@ -45,8 +45,12 @@
extern inline int pgd_bad(pgd_t pgd) { return 0; }
extern inline int pgd_present(pgd_t pgd) { return !pgd_none(pgd); }
-#define set_pte(pteptr,pteval) \
- set_64bit((unsigned long long *)(pteptr),pte_val(pteval))
+extern inline void set_pte(pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ barrier();
+ ptep->pte_low = pte.pte_low;
+}
#define set_pmd(pmdptr,pmdval) \
set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval))
#define set_pgd(pgdptr,pgdval) \
@@ -75,5 +79,35 @@
/* Find an entry in the second-level page table.. */
#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \
__pmd_offset(address))
+
+#define __HAVE_ARCH_pte_get_and_clear
+extern inline pte_t pte_get_and_clear(pte_t *ptep)
+{
+ pte_t res;
+
+ /* xchg acts as a barrier before the high bits are read and cleared */
+ res.pte_low = xchg(&ptep->pte_low, 0);
+ res.pte_high = ptep->pte_high;
+ ptep->pte_high = 0;
+
+ return res;
+}
+
+extern inline int pte_same(pte_t a, pte_t b)
+{
+ return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
+}
+
+#define pte_page(x) (mem_map+(((x).pte_low >> PAGE_SHIFT) | ((x).pte_high << (32 - PAGE_SHIFT))))
+#define pte_none(x) (!(x).pte_low && !(x).pte_high)
+
+extern inline pte_t __mk_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+ pte_t pte;
+
+ pte.pte_high = page_nr >> (32 - PAGE_SHIFT);
+ pte.pte_low = (page_nr << PAGE_SHIFT) | pgprot_val(pgprot);
+ return pte;
+}
#endif /* _I386_PGTABLE_3LEVEL_H */
diff -ur v2.4.0-test10-pre2/include/asm-i386/pgtable.h work-10-2/include/asm-i386/pgtable.h
--- v2.4.0-test10-pre2/include/asm-i386/pgtable.h Mon Oct 2 14:06:43 2000
+++ work-10-2/include/asm-i386/pgtable.h Fri Oct 13 17:41:26 2000
@@ -17,6 +17,10 @@
#include <asm/fixmap.h>
#include <linux/threads.h>
+#ifndef _I386_BITOPS_H
+#include <asm/bitops.h>
+#endif
+
extern pgd_t swapper_pg_dir[1024];
extern void paging_init(void);
@@ -145,6 +149,16 @@
* the page directory entry points directly to a 4MB-aligned block of
* memory.
*/
+#define _PAGE_BIT_PRESENT 0
+#define _PAGE_BIT_RW 1
+#define _PAGE_BIT_USER 2
+#define _PAGE_BIT_PWT 3
+#define _PAGE_BIT_PCD 4
+#define _PAGE_BIT_ACCESSED 5
+#define _PAGE_BIT_DIRTY 6
+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */
+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
+
#define _PAGE_PRESENT 0x001
#define _PAGE_RW 0x002
#define _PAGE_USER 0x004
@@ -231,10 +245,27 @@
extern void __handle_bad_pmd(pmd_t * pmd);
extern void __handle_bad_pmd_kernel(pmd_t * pmd);
-#define pte_none(x) (!pte_val(x))
-#define pte_present(x) (pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE))
+#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
#define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0)
+#define __HAVE_ARCH_pte_test_and_clear_dirty
+static inline int pte_test_and_clear_dirty(pte_t *page_table, pte_t pte)
+{
+ return test_and_clear_bit(_PAGE_BIT_DIRTY, page_table);
+}
+
+#define __HAVE_ARCH_pte_test_and_clear_young
+static inline int pte_test_and_clear_young(pte_t *page_table, pte_t pte)
+{
+ return test_and_clear_bit(_PAGE_BIT_ACCESSED, page_table);
+}
+
+#define __HAVE_ARCH_pte_clear_wrprotect
+static inline void pte_clear_wrprotect(pte_t *page_table)
+{
+ clear_bit(_PAGE_BIT_RW, page_table);
+}
+
#define pmd_none(x) (!pmd_val(x))
#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
@@ -246,49 +277,44 @@
*/
#define page_address(page) ((page)->virtual)
#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
-#define pte_page(x) (mem_map+((unsigned long)((pte_val(x) >> PAGE_SHIFT))))
/*
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
-extern inline int pte_read(pte_t pte) { return pte_val(pte) & _PAGE_USER; }
-extern inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_USER; }
-extern inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; }
-extern inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; }
-extern inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; }
-
-extern inline pte_t pte_rdprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; }
-extern inline pte_t pte_exprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; }
-extern inline pte_t pte_mkclean(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_DIRTY)); return pte; }
-extern inline pte_t pte_mkold(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_ACCESSED)); return pte; }
-extern inline pte_t pte_wrprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_RW)); return pte; }
-extern inline pte_t pte_mkread(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_USER)); return pte; }
-extern inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_USER)); return pte; }
-extern inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; }
-extern inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; }
-extern inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; }
+extern inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
+extern inline int pte_exec(pte_t pte) { return (pte).pte_low & _PAGE_USER; }
+extern inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; }
+extern inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; }
+extern inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; }
+
+extern inline pte_t pte_rdprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
+extern inline pte_t pte_exprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; }
+extern inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
+extern inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
+extern inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; }
+extern inline pte_t pte_mkread(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
+extern inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; }
+extern inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; }
+extern inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; }
+extern inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; }
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
*/
-#define mk_pte(page,pgprot) \
-({ \
- pte_t __pte; \
- \
- set_pte(&__pte, __pte(((page)-mem_map) * \
- (unsigned long long)PAGE_SIZE + pgprot_val(pgprot))); \
- __pte; \
-})
+#define mk_pte(page, pgprot) __mk_pte((page) - mem_map, (pgprot))
/* This takes a physical page address that is used by the remapping functions */
-#define mk_pte_phys(physpage, pgprot) \
-({ pte_t __pte; set_pte(&__pte, __pte(physpage + pgprot_val(pgprot))); __pte; })
+#define mk_pte_phys(physpage, pgprot) __mk_pte((physpage) >> PAGE_SHIFT, pgprot)
extern inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{ set_pte(&pte, __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot))); return pte; }
+{
+ pte.pte_low &= _PAGE_CHG_MASK;
+ pte.pte_low |= pgprot_val(newprot);
+ return pte;
+}
#define page_pte(page) page_pte_prot(page, __pgprot(0))
@@ -324,7 +350,7 @@
#define SWP_TYPE(x) (((x).val >> 1) & 0x3f)
#define SWP_OFFSET(x) ((x).val >> 8)
#define SWP_ENTRY(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
-#define pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) })
+#define pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
#define swp_entry_to_pte(x) ((pte_t) { (x).val })
#define module_map vmalloc
diff -ur v2.4.0-test10-pre2/include/linux/mm.h work-10-2/include/linux/mm.h
--- v2.4.0-test10-pre2/include/linux/mm.h Tue Oct 3 13:40:38 2000
+++ work-10-2/include/linux/mm.h Fri Oct 13 17:41:26 2000
@@ -532,6 +532,43 @@
#define vmlist_modify_lock(mm) vmlist_access_lock(mm)
#define vmlist_modify_unlock(mm) vmlist_access_unlock(mm)
+#ifndef __HAVE_ARCH_pte_test_and_clear_young
+static inline int pte_test_and_clear_young(pte_t *page_table, pte_t pte)
+{
+ if (!pte_young(pte))
+ return 0;
+ set_pte(page_table, pte_mkold(pte));
+ return 1;
+}
+#endif
+
+#ifndef __HAVE_ARCH_pte_test_and_clear_dirty
+static inline int pte_test_and_clear_dirty(pte_t *page_table, pte_t pte)
+{
+ if (!pte_dirty(pte))
+ return 0;
+ set_pte(page_table, pte_mkclean(pte));
+ return 1;
+}
+#endif
+
+#ifndef __HAVE_ARCH_pte_get_and_clear
+static inline pte_t pte_get_and_clear(pte_t *page_table)
+{
+ pte_t pte = *page_table;
+ pte_clear(page_table);
+ return pte;
+}
+#endif
+
+#ifndef __HAVE_ARCH_pte_clear_wrprotect
+static inline void pte_clear_wrprotect(pte_t *page_table)
+{
+ pte_t old_pte = *page_table;
+ set_pte(page_table, pte_wrprotect(old_pte));
+}
+#endif
+
#endif /* __KERNEL__ */
#endif
diff -ur v2.4.0-test10-pre2/mm/filemap.c work-10-2/mm/filemap.c
--- v2.4.0-test10-pre2/mm/filemap.c Tue Oct 3 13:40:38 2000
+++ work-10-2/mm/filemap.c Fri Oct 13 17:19:47 2000
@@ -1475,39 +1475,47 @@
return retval;
}
+/* Called with mm->page_table_lock held to protect against other
+ * threads/the swapper ripping ptes out from under us.
+ */
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
unsigned long pgoff;
- pte_t pte = *ptep;
+ pte_t pte;
struct page *page;
int error;
+ pte = *ptep;
+
if (!(flags & MS_INVALIDATE)) {
if (!pte_present(pte))
- return 0;
- if (!pte_dirty(pte))
- return 0;
+ goto out;
+ if (!pte_test_and_clear_dirty(ptep, pte))
+ goto out;
flush_page_to_ram(pte_page(pte));
flush_cache_page(vma, address);
- set_pte(ptep, pte_mkclean(pte));
flush_tlb_page(vma, address);
page = pte_page(pte);
page_cache_get(page);
} else {
if (pte_none(pte))
- return 0;
+ goto out;
flush_cache_page(vma, address);
- pte_clear(ptep);
+
+ pte = pte_get_and_clear(ptep);
flush_tlb_page(vma, address);
+
if (!pte_present(pte)) {
+ spin_unlock(&vma->vm_mm->page_table_lock);
swap_free(pte_to_swp_entry(pte));
- return 0;
+ spin_lock(&vma->vm_mm->page_table_lock);
+ goto out;
}
page = pte_page(pte);
if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
page_cache_free(page);
- return 0;
+ goto out;
}
}
pgoff = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
@@ -1516,11 +1524,18 @@
printk("weirdness: pgoff=%lu index=%lu address=%lu vm_start=%lu vm_pgoff=%lu\n",
pgoff, page->index, address, vma->vm_start, vma->vm_pgoff);
}
+
+ spin_unlock(&vma->vm_mm->page_table_lock);
lock_page(page);
error = filemap_write_page(vma->vm_file, page, 1);
UnlockPage(page);
page_cache_free(page);
+
+ spin_lock(&vma->vm_mm->page_table_lock);
return error;
+
+out:
+ return 0;
}
static inline int filemap_sync_pte_range(pmd_t * pmd,
@@ -1590,6 +1605,11 @@
unsigned long end = address + size;
int error = 0;
+ /* Acquire the lock early; it may be possible to avoid dropping
+ * and reacquiring it repeatedly.
+ */
+ spin_lock(&vma->vm_mm->page_table_lock);
+
dir = pgd_offset(vma->vm_mm, address);
flush_cache_range(vma->vm_mm, end - size, end);
if (address >= end)
@@ -1600,6 +1620,9 @@
dir++;
} while (address && (address < end));
flush_tlb_range(vma->vm_mm, end - size, end);
+
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
return error;
}
diff -ur v2.4.0-test10-pre2/mm/highmem.c work-10-2/mm/highmem.c
--- v2.4.0-test10-pre2/mm/highmem.c Fri Oct 13 17:18:37 2000
+++ work-10-2/mm/highmem.c Fri Oct 13 17:19:47 2000
@@ -130,10 +130,9 @@
if (pkmap_count[i] != 1)
continue;
pkmap_count[i] = 0;
- pte = pkmap_page_table[i];
+ pte = pte_get_and_clear(pkmap_page_table+i);
if (pte_none(pte))
BUG();
- pte_clear(pkmap_page_table+i);
page = pte_page(pte);
page->virtual = NULL;
}
diff -ur v2.4.0-test10-pre2/mm/memory.c work-10-2/mm/memory.c
--- v2.4.0-test10-pre2/mm/memory.c Tue Oct 3 13:40:38 2000
+++ work-10-2/mm/memory.c Fri Oct 13 17:19:47 2000
@@ -215,30 +215,30 @@
/* copy_one_pte */
if (pte_none(pte))
- goto cont_copy_pte_range;
+ goto cont_copy_pte_range_noset;
if (!pte_present(pte)) {
swap_duplicate(pte_to_swp_entry(pte));
- set_pte(dst_pte, pte);
goto cont_copy_pte_range;
}
ptepage = pte_page(pte);
if ((!VALID_PAGE(ptepage)) ||
- PageReserved(ptepage)) {
- set_pte(dst_pte, pte);
+ PageReserved(ptepage))
goto cont_copy_pte_range;
- }
+
/* If it's a COW mapping, write protect it both in the parent and the child */
if (cow) {
- pte = pte_wrprotect(pte);
- set_pte(src_pte, pte);
+ pte_clear_wrprotect(src_pte);
+ pte = *src_pte;
}
+
/* If it's a shared mapping, mark it clean in the child */
if (vma->vm_flags & VM_SHARED)
pte = pte_mkclean(pte);
- set_pte(dst_pte, pte_mkold(pte));
+ pte = pte_mkold(pte);
get_page(ptepage);
-
-cont_copy_pte_range: address += PAGE_SIZE;
+
+cont_copy_pte_range: set_pte(dst_pte, pte);
+cont_copy_pte_range_noset: address += PAGE_SIZE;
if (address >= end)
goto out;
src_pte++;
@@ -306,10 +306,9 @@
pte_t page;
if (!size)
break;
- page = *pte;
+ page = pte_get_and_clear(pte);
pte++;
size--;
- pte_clear(pte-1);
if (pte_none(page))
continue;
freed += free_pte(page);
@@ -642,7 +641,7 @@
end = PMD_SIZE;
do {
pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
- pte_t oldpage = *pte;
+ pte_t oldpage = pte_get_and_clear(pte);
set_pte(pte, zero_pte);
forget_pte(oldpage);
address += PAGE_SIZE;
@@ -712,8 +711,8 @@
end = PMD_SIZE;
do {
struct page *page;
- pte_t oldpage = *pte;
- pte_clear(pte);
+ pte_t oldpage;
+ oldpage = pte_get_and_clear(pte);
page = virt_to_page(__va(phys_addr));
if ((!VALID_PAGE(page)) || PageReserved(page))
@@ -746,6 +745,7 @@
return 0;
}
+/* Note: this is only safe if the mm semaphore is held when called. */
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
int error = 0;
@@ -867,7 +867,7 @@
/*
* Re-check the pte - we dropped the lock
*/
- if (pte_val(*page_table) == pte_val(pte)) {
+ if (pte_same(*page_table, pte)) {
if (PageReserved(old_page))
++mm->rss;
break_cow(vma, old_page, new_page, address, page_table);
@@ -1214,7 +1214,7 @@
* didn't change from under us..
*/
spin_lock(&mm->page_table_lock);
- if (pte_val(entry) == pte_val(*pte)) {
+ if (pte_same(entry, *pte)) {
if (write_access) {
if (!pte_write(entry))
return do_wp_page(mm, vma, address, pte, entry);
diff -ur v2.4.0-test10-pre2/mm/mprotect.c work-10-2/mm/mprotect.c
--- v2.4.0-test10-pre2/mm/mprotect.c Tue Mar 14 20:45:21 2000
+++ work-10-2/mm/mprotect.c Fri Oct 13 17:19:47 2000
@@ -30,9 +30,16 @@
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- pte_t entry = *pte;
- if (pte_present(entry))
+ if (pte_present(*pte)) {
+ pte_t entry;
+
+ /* Avoid an SMP race with hardware-updated dirty/clean
+ * bits by wiping the pte and then setting the new pte
+ * into place.
+ */
+ entry = pte_get_and_clear(pte);
set_pte(pte, pte_modify(entry, newprot));
+ }
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
diff -ur v2.4.0-test10-pre2/mm/mremap.c work-10-2/mm/mremap.c
--- v2.4.0-test10-pre2/mm/mremap.c Tue Oct 3 13:40:38 2000
+++ work-10-2/mm/mremap.c Fri Oct 13 17:19:47 2000
@@ -63,14 +63,14 @@
pte_t pte;
spin_lock(&mm->page_table_lock);
- pte = *src;
+ pte = pte_get_and_clear(src);
if (!pte_none(pte)) {
- error++;
- if (dst) {
- pte_clear(src);
- set_pte(dst, pte);
- error--;
+ if (!dst) {
+ /* No dest? We must put it back. */
+ dst = src;
+ error++;
}
+ set_pte(dst, pte);
}
spin_unlock(&mm->page_table_lock);
return error;
diff -ur v2.4.0-test10-pre2/mm/swapfile.c work-10-2/mm/swapfile.c
--- v2.4.0-test10-pre2/mm/swapfile.c Tue Aug 8 00:01:36 2000
+++ work-10-2/mm/swapfile.c Fri Oct 13 17:19:47 2000
@@ -223,10 +223,11 @@
if (pte_page(pte) != page)
return;
/* We will be removing the swap cache in a moment, so... */
+ pte = pte_get_and_clear(dir);
set_pte(dir, pte_mkdirty(pte));
return;
}
- if (pte_val(pte) != entry.val)
+ if (pte_to_swp_entry(pte).val != entry.val)
return;
set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
diff -ur v2.4.0-test10-pre2/mm/vmalloc.c work-10-2/mm/vmalloc.c
--- v2.4.0-test10-pre2/mm/vmalloc.c Fri Oct 13 17:18:37 2000
+++ work-10-2/mm/vmalloc.c Fri Oct 13 17:19:47 2000
@@ -34,8 +34,8 @@
if (end > PMD_SIZE)
end = PMD_SIZE;
do {
- pte_t page = *pte;
- pte_clear(pte);
+ pte_t page;
+ page = pte_get_and_clear(pte);
address += PAGE_SIZE;
pte++;
if (pte_none(page))
diff -ur v2.4.0-test10-pre2/mm/vmscan.c work-10-2/mm/vmscan.c
--- v2.4.0-test10-pre2/mm/vmscan.c Fri Oct 13 17:18:37 2000
+++ work-10-2/mm/vmscan.c Fri Oct 13 17:19:47 2000
@@ -55,8 +55,7 @@
onlist = PageActive(page);
/* Don't look at this pte if it's been accessed recently. */
- if (pte_young(pte)) {
- set_pte(page_table, pte_mkold(pte));
+ if (pte_test_and_clear_young(page_table, pte)) {
if (onlist) {
/*
* Transfer the "accessed" bit from the page
@@ -99,6 +98,10 @@
if (PageSwapCache(page)) {
entry.val = page->index;
swap_duplicate(entry);
+ if (pte_dirty(pte))
+ BUG();
+ if (pte_write(pte))
+ BUG();
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
UnlockPage(page);
@@ -109,6 +112,13 @@
goto out_failed;
}
+ /* From this point on, the odds are that we're going to
+ * nuke this pte, so read and clear the pte. This hook
+ * is needed on CPUs which update the accessed and dirty
+ * bits in hardware.
+ */
+ pte = pte_get_and_clear(page_table);
+
/*
* Is it a clean page? Then it must be recoverable
* by just paging it in again, and we can just drop
@@ -124,7 +134,6 @@
*/
if (!pte_dirty(pte)) {
flush_cache_page(vma, address);
- pte_clear(page_table);
goto drop_pte;
}
@@ -134,7 +143,7 @@
* locks etc.
*/
if (!(gfp_mask & __GFP_IO))
- goto out_unlock;
+ goto out_unlock_restore;
/*
* Don't do any of the expensive stuff if
@@ -143,7 +152,7 @@
if (page->zone->free_pages + page->zone->inactive_clean_pages
+ page->zone->inactive_dirty_pages
> page->zone->pages_high + inactive_target)
- goto out_unlock;
+ goto out_unlock_restore;
/*
* Ok, it's really dirty. That means that
@@ -169,7 +178,7 @@
int error;
struct file *file = vma->vm_file;
if (file) get_file(file);
- pte_clear(page_table);
+
mm->rss--;
flush_tlb_page(vma, address);
vmlist_access_unlock(mm);
@@ -191,7 +200,7 @@
*/
entry = get_swap_page();
if (!entry.val)
- goto out_unlock; /* No swap space left */
+ goto out_unlock_restore; /* No swap space left */
if (!(page = prepare_highmem_swapout(page)))
goto out_swap_free;
@@ -215,10 +224,12 @@
page_cache_release(page);
return 1;
out_swap_free:
+ set_pte(page_table, pte);
swap_free(entry);
out_failed:
return 0;
-out_unlock:
+out_unlock_restore:
+ set_pte(page_table, pte);
UnlockPage(page);
return 0;
}
* Re: [RFC] atomic pte updates and pae changes, take 2
From: Linus Torvalds @ 2000-10-14 1:43 UTC
To: Ben LaHaise; +Cc: Ingo Molnar, linux-mm, linux-kernel
On Fri, 13 Oct 2000, Ben LaHaise wrote:
>
> Hey folks
Me likee.
This looks much nicer. The hack turned into something that looks quite
designed.
Ingo, I'd like you to comment on all the PAE issues just in case, but I
personally don't have any real issues any more. Small nit: I dislike the
"__HAVE_ARCH_xxx" approach, and considering that most architectures will
probably want to do something specific anyway I wonder if we should get
rid of that and just make architectures have their own code.
Thanks,
Linus
* Re: [RFC] atomic pte updates and pae changes, take 2
From: David S. Miller @ 2000-10-14 8:26 UTC
To: torvalds; +Cc: bcrl, mingo, linux-mm, linux-kernel
I dislike the "__HAVE_ARCH_xxx" approach, and considering that most
architectures will probably want to do something specific anyway I
wonder if we should get rid of that and just make architectures
have their own code.
Most software-based TLB refill systems could (and sparc64 will) make
all ref/mod bit updates occur in the kernel software fault path so
that none of this special synchronization is necessary.
In such cases it might be nice to have a generic version that all such
ports can share by just not defining __HAVE_ARCH_xxx.
Sure it's a bit ugly, but it does allow code sharing, so it probably
at least deserves a chance :-)
Later,
David S. Miller
davem@redhat.com
* Re: [RFC] atomic pte updates and pae changes, take 2
From: Ingo Molnar @ 2000-10-15 12:07 UTC
To: Linus Torvalds; +Cc: Ben LaHaise, linux-mm, linux-kernel
On Fri, 13 Oct 2000, Linus Torvalds wrote:
> Ingo, I'd like you to comment on all the PAE issues just in case, but
> I personally don't have any real issues any more. [...]
there is one small thing apart from the issue Stephen noticed: the
barrier() between the two 32-bit writes should IMO be smp_wmb() instead.
It should not make any difference on current x86 CPUs, but if any future
x86 SMP implementation relaxes memory ordering, it will be easier to sort
things out.
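[Editor's note: in code, the change Ingo suggests is just the following
sketch against the set_pte() in the patch; smp_wmb() presumably compiles
down to barrier() on today's x86, so behaviour there is unchanged.]

extern inline void set_pte(pte_t *ptep, pte_t pte)
{
	ptep->pte_high = pte.pte_high;
	smp_wmb();	/* was barrier(): make the SMP write-ordering intent explicit */
	ptep->pte_low = pte.pte_low;
}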
Ingo
* Re: [RFC] atomic pte updates and pae changes, take 2
From: Stephen Tweedie @ 2000-10-14 13:33 UTC
To: Ben LaHaise
Cc: torvalds, Ingo Molnar, linux-mm, linux-kernel, Stephen Tweedie
Hi,
On Fri, Oct 13, 2000 at 08:17:42PM -0400, Ben LaHaise wrote:
>
> Below is take two of the patch making pte_clear use an atomic xchg in an
> effort to avoid losing dirty bits. PAE no longer uses cmpxchg8b for
> updates; set_pte is now two ordered long writes separated by a barrier.
Looks good. The only trouble I can see left is that pte_clear() is
still using set_pte(), which doesn't work right for PAE36. set_pte()
is setting the high word first, which is fine for installing a new pte,
but if you do that to clear a pte then you have left the old
page-present bit intact while you've removed half of the pte.
pte_clear() needs to clear the words in the other order (just as
pte_get_and_clear correctly does).
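[Editor's note: a hypothetical PAE pte_clear() along those lines - not
part of the posted patch - would clear the words in the reverse order of
set_pte().]

static inline void pte_clear(pte_t *ptep)
{
	/* Low word first: once the present bit is gone, the MMU has no
	 * business looking at the stale high word. */
	ptep->pte_low = 0;
	barrier();
	ptep->pte_high = 0;
}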
Cheers,
Stephen
* Re: [RFC] atomic pte updates and pae changes, take 2
From: Ingo Molnar @ 2000-10-15 12:01 UTC
To: Stephen Tweedie; +Cc: Ben LaHaise, torvalds, linux-mm, linux-kernel
On Sat, 14 Oct 2000, Stephen Tweedie wrote:
> Looks good. The only trouble I can see left is that pte_clear() is
> still using set_pte(), which doesn't work right for PAE36. set_pte()
> is setting the high word first, which is fine for installing a new
> pte, but if you do that to clear a pte then you have left the old
> page-present bit intact while you've removed half of the pte.
> pte_clear() needs to clear the words in the other order (just as
> pte_get_and_clear correctly does).
yep, that should make it work. The pgd operations must still use 64-bit
instructions - the pmds are fine staying 64-bit for the time being, but
they can be converted to the pte logic as well.
Ingo
* oopses in test10-pre4 (was Re: [RFC] atomic pte updates and pae changes, take 3)
From: Linus Torvalds @ 2000-10-19 20:16 UTC
To: Ben LaHaise; +Cc: Ingo Molnar, linux-mm, linux-kernel
Ben,
you added these two BUG() conditions in your atomic pte patch:
> diff -ur v2.4.0-test10-pre2/mm/vmscan.c work-10-2/mm/vmscan.c
> --- v2.4.0-test10-pre2/mm/vmscan.c Fri Oct 13 17:18:37 2000
> +++ work-10-2/mm/vmscan.c Fri Oct 13 17:19:47 2000
> @@ -99,6 +98,10 @@
> if (PageSwapCache(page)) {
> entry.val = page->index;
> swap_duplicate(entry);
> + if (pte_dirty(pte))
> + BUG();
> + if (pte_write(pte))
> + BUG();
> set_pte(page_table, swp_entry_to_pte(entry));
> drop_pte:
> UnlockPage(page);
> @@ -109,6 +112,13 @@
and people are getting them left and right when they start swapping.
As far as I can tell, the thing you test for is not actually a bug at all.
The pte may be dirty, but that's ok - the swap cache page is obviously
up-to-date, as it's actually physically the very same page.
I think you overlooked the fact that SHM mappings use the page cache, and
it's ok if such pages are dirty and writable - they will get written out
by the shm_swap() logic once there are no mappings active any more.
I like the test per se, because I think it's correct for the "normal"
case of a private page, but I really think those two BUG()'s are not bugs
at all in general, and we should just remove the two tests.
Comments? Anything I've overlooked?
Linus
* Re: oopses in test10-pre4 (was Re: [RFC] atomic pte updates and pae changes, take 3)
From: Ben LaHaise @ 2000-10-20 5:44 UTC
To: Linus Torvalds; +Cc: Ingo Molnar, linux-mm, linux-kernel
On Thu, 19 Oct 2000, Linus Torvalds wrote:
....
> I think you overlooked the fact that SHM mappings use the page cache, and
> it's ok if such pages are dirty and writable - they will get written out
> by the shm_swap() logic once there are no mappings active any more.
>
> I like the test per se, because I think it's correct for the "normal"
> case of a private page, but I really think those two BUG()'s are not bugs
> at all in general, and we should just remove the two tests.
>
> Comments? Anything I've overlooked?
The primary reason I added the BUG was that if this is valid, it means
that the pte has to be removed from the page tables first with
pte_get_and_clear since it can be modified by the other CPU. Although
this may be safe for shm, I think it's very ugly and inconsistent. I'd
rather make the code transfer the dirty bit to the page struct so that we
*know* there is no information loss.
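[Editor's note: in code, the transfer Ben describes might look like the
following sketch; it is in neither patch, and SetPageDirty() is assumed
to be the right way to mark the struct page.]

	pte = ptep_get_and_clear(page_table);
	if (pte_dirty(pte))
		SetPageDirty(page);	/* the hardware dirty bit survives in the struct page */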
If the above is correct, then the following patch should do (untested).
Oh, I think I missed adding pte_same in the generic pgtable.h macros, too.
<doh!> I'm willing to take a closer look if you think it's needed.
-ben
diff -urN v2.4.0-test10-pre4/include/asm-generic/pgtable.h work-foo/include/asm-generic/pgtable.h
--- v2.4.0-test10-pre4/include/asm-generic/pgtable.h Fri Oct 20 00:58:03 2000
+++ work-foo/include/asm-generic/pgtable.h Fri Oct 20 01:42:24 2000
@@ -38,4 +38,6 @@
set_pte(ptep, pte_mkdirty(old_pte));
}
+#define pte_same(left,right) (pte_val(left) == pte_val(right))
+
#endif /* _ASM_GENERIC_PGTABLE_H */
diff -urN v2.4.0-test10-pre4/mm/vmscan.c work-foo/mm/vmscan.c
--- v2.4.0-test10-pre4/mm/vmscan.c Fri Oct 20 00:58:04 2000
+++ work-foo/mm/vmscan.c Fri Oct 20 01:43:54 2000
@@ -87,6 +87,13 @@
if (TryLockPage(page))
goto out_failed;
+ /* From this point on, the odds are that we're going to
+ * nuke this pte, so read and clear the pte. This hook
+ * is needed on CPUs which update the accessed and dirty
+ * bits in hardware.
+ */
+ pte = ptep_get_and_clear(page_table);
+
/*
* Is the page already in the swap cache? If so, then
* we can just drop our reference to it without doing
@@ -98,10 +105,6 @@
if (PageSwapCache(page)) {
entry.val = page->index;
swap_duplicate(entry);
- if (pte_dirty(pte))
- BUG();
- if (pte_write(pte))
- BUG();
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
UnlockPage(page);
@@ -111,13 +114,6 @@
page_cache_release(page);
goto out_failed;
}
-
- /* From this point on, the odds are that we're going to
- * nuke this pte, so read and clear the pte. This hook
- * is needed on CPUs which update the accessed and dirty
- * bits in hardware.
- */
- pte = ptep_get_and_clear(page_table);
/*
* Is it a clean page? Then it must be recoverable
* Re: oopses in test10-pre4 (was Re: [RFC] atomic pte updates and pae changes, take 3)
From: Linus Torvalds @ 2000-10-20 17:18 UTC
To: Ben LaHaise; +Cc: Ingo Molnar, linux-mm, linux-kernel
On Fri, 20 Oct 2000, Ben LaHaise wrote:
>
> The primary reason I added the BUG was that if this is valid, it means
> that the pte has to be removed from the page tables first with
> pte_get_and_clear since it can be modified by the other CPU. Although
> this may be safe for shm, I think it's very ugly and inconsistent. I'd
> rather make the code transfer the dirty bit to the page struct so that we
> *know* there is no information loss.
Note that we should have done this regardless of the BUG() tests: remember
the PAE case, and the fact that it was illegal to do
set_pte(page_table, swp_entry_to_pte(entry));
without having atomically cleared the pte first.
So regardless of any dirty/writable issues, that ptep_get_and_clear()
should be above the test for the PageSwapCache. Thanks for the patch.
Now, I agree 100% with you that we should _also_ make sure that we
transfer the dirty bit from the page tables to "struct page". Even if we
don't actually use that information right now. We _could_ use it: in
particular we could probably fairly easily speed up shared memory handling
by using the same kind of optimization that we do for private mappings -
using the dirty bit in the page table to determine whether we need to
write the page out again or not.
This all needs more thought, I suspect. But for now moving the
ptep_get_and_clear() up, and removing the BUG() is sufficient to get us
where we used to be.
Linus