From: Christoph Lameter
Date: Fri, 28 Jan 2005 13:46:19 -0800 (PST)
Subject: [RFC] No page table lock COW
To: linux-mm@kvack.org

Do not use the page_table_lock for COW.

The patch depends on the following patches having been applied first:

        make_rss_atomic
        pte_cmpxchg
        ptl_drop_first_use
        no_ptl_do_anon_page

Major issue: the patch enables the complete replacement of a pte during
COW without taking the page_table_lock. This means that holding the
page_table_lock no longer ensures that a (read-only) page is not
replaced. The page_table_lock still guarantees that a writable pte is
not replaced. I am not sure whether this issue can be addressed at all
without a full rework of vm locking along the lines of Nick Piggin's
patches. Systems seem to boot fine and survive a couple of tests (none
of them targeted at this problem) that I have run so far.

Index: linux-2.6.10/mm/memory.c
===================================================================
--- linux-2.6.10.orig/mm/memory.c       2005-01-27 17:02:41.000000000 -0800
+++ linux-2.6.10/mm/memory.c    2005-01-27 17:11:39.000000000 -0800
@@ -1256,21 +1256,6 @@ static inline pte_t maybe_mkwrite(pte_t
 }
 
 /*
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
- */
-static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
-               pte_t *page_table)
-{
-       pte_t entry;
-
-       flush_cache_page(vma, address);
-       entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
-                       vma);
-       ptep_establish(vma, address, page_table, entry);
-       update_mmu_cache(vma, address, entry);
-}
-
-/*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
  * and decrementing the shared-page counter for the old page.
@@ -1348,12 +1333,14 @@ static int do_wp_page(struct mm_struct *
        copy_cow_page(old_page,new_page,address);
 
        /*
-        * Re-check the pte - so far we may not have acquired the
-        * page_table_lock
+        * Re-check the pte via a cmpxchg
         */
-       spin_lock(&mm->page_table_lock);
+       page_table_atomic_start(mm);
        page_table = pte_offset_map(pmd, address);
-       if (likely(pte_same(*page_table, pte))) {
+       flush_cache_page(vma, address);
+       entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
+                       vma);
+       if (ptep_cmpxchg(page_table, pte, entry)) {
                if (PageAnon(old_page))
                        update_mm_counter(mm, anon_rss, -1);
                if (PageReserved(old_page)) {
@@ -1363,7 +1350,7 @@ static int do_wp_page(struct mm_struct *
                } else
                        page_remove_rmap(old_page);
 
-               break_cow(vma, new_page, address, page_table);
+               update_mmu_cache(vma, address, entry);
                lru_cache_add_active(new_page);
                page_add_anon_rmap(new_page, vma, address);
 
@@ -1373,7 +1360,7 @@ static int do_wp_page(struct mm_struct *
        pte_unmap(page_table);
        page_cache_release(new_page);
        page_cache_release(old_page);
-       spin_unlock(&mm->page_table_lock);
+       page_table_atomic_stop(mm);
        return VM_FAULT_MINOR;
 
 no_new_page:
@@ -1722,15 +1709,15 @@ static int do_swap_page(struct mm_struct
                /*
                 * Back out if somebody else faulted in this pte
                 */
-               spin_lock(&mm->page_table_lock);
+               page_table_atomic_start(mm);
                page_table = pte_offset_map(pmd, address);
                if (likely(pte_same(*page_table, orig_pte)))
                        ret = VM_FAULT_OOM;
                else
                        ret = VM_FAULT_MINOR;
                pte_unmap(page_table);
-               spin_unlock(&mm->page_table_lock);
-               goto out;
+               page_table_atomic_stop(mm);
+               return ret;
        }
 
        /* Had to read the page from swap area: Major fault */
@@ -1740,56 +1727,46 @@ static int do_swap_page(struct mm_struct
        }
 
        SetPageReferenced(page);
+       /* The lock here is enough to guarantee exclusivity in the
+        * following code. Any other access before the pte is installed
+        * will wait at lock_page.
+        */
        lock_page(page);
 
        /*
         * Back out if somebody else faulted in this pte
         */
-       spin_lock(&mm->page_table_lock);
-       page_table = pte_offset_map(pmd, address);
-       if (unlikely(!pte_same(*page_table, orig_pte))) {
-               pte_unmap(page_table);
-               spin_unlock(&mm->page_table_lock);
-               unlock_page(page);
-               page_cache_release(page);
-               ret = VM_FAULT_MINOR;
-               goto out;
-       }
-
-       /* The page isn't present yet, go ahead with the fault. */
-
-       swap_free(entry);
-       if (vm_swap_full())
-               remove_exclusive_swap_page(page);
-
-       update_mm_counter(mm, rss, 1);
-       acct_update_integrals();
-       update_mem_hiwater();
-
        pte = mk_pte(page, vma->vm_page_prot);
        if (write_access && can_share_swap_page(page)) {
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                write_access = 0;
        }
-       unlock_page(page);
-
        flush_icache_page(vma, page);
-       set_pte(page_table, pte);
-       page_add_anon_rmap(page, vma, address);
-
-       /* No need to invalidate - it was non-present before */
-       update_mmu_cache(vma, address, pte);
-       pte_unmap(page_table);
-       spin_unlock(&mm->page_table_lock);
+       if (ptep_cmpxchg(page_table, orig_pte, pte)) {
+               unlock_page(page);
 
-       if (write_access) {
-               page_table_atomic_start(mm);
-               if (do_wp_page(mm, vma, address,
-                       page_table, pmd, pte) == VM_FAULT_OOM)
-                       ret = VM_FAULT_OOM;
+               page_add_anon_rmap(page, vma, address);
+               swap_free(entry);
+               if (vm_swap_full())
+                       remove_exclusive_swap_page(page);
+               update_mm_counter(mm, rss, 1);
+               acct_update_integrals();
+               update_mem_hiwater();
+               if (write_access) {
+                       if (do_wp_page(mm, vma, address,
+                               page_table, pmd, pte) == VM_FAULT_OOM)
+                               return VM_FAULT_OOM;
+                       return ret;
+               }
+       } else {
+               /* Another thread was racing with us and won */
+               pte_unmap(page_table);
+               unlock_page(page);
+               page_cache_release(page);
        }
-out:
+       page_table_atomic_stop(mm);
        return ret;
 }
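
For illustration only (this is not part of the patch): a minimal
user-space sketch of the re-check-via-cmpxchg pattern that the
do_wp_page() hunk relies on. A plain C11 atomic integer stands in for
the pte, and the helper name pte_cmpxchg_install() is invented for the
sketch; it is not the kernel's ptep_cmpxchg().

/*
 * User-space analogy of the lock-free COW re-check: instead of taking
 * a lock and testing pte_same(), the new value is installed with a
 * single compare-and-swap against the value that was originally read.
 * Builds with any C11 compiler, e.g. cc -std=c11.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long pte;               /* stand-in for *page_table */

/* Invented helper standing in for ptep_cmpxchg(). */
static int pte_cmpxchg_install(_Atomic unsigned long *ptep,
                               unsigned long orig, unsigned long new_entry)
{
        /* Succeeds only if nobody changed the entry since 'orig' was read. */
        return atomic_compare_exchange_strong(ptep, &orig, new_entry);
}

int main(void)
{
        unsigned long orig, writable_copy;

        atomic_store(&pte, 0x1000UL);           /* "read-only" mapping */

        orig = atomic_load(&pte);               /* snapshot, like orig_pte */
        writable_copy = 0x2000UL;               /* entry for the copied page */

        /* ... the page copy would happen here, with no lock held ... */

        if (pte_cmpxchg_install(&pte, orig, writable_copy))
                printf("installed new entry %#lx\n", atomic_load(&pte));
        else
                printf("raced: entry changed underneath us, backing out\n");

        return 0;
}

As in the patch, the failure branch corresponds to another thread
having installed a pte first; the caller backs out and releases its
copy instead of overwriting the other thread's entry.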