Message-ID: <3AB9D2F8.86D8E359@uow.edu.au>
Date: Thu, 22 Mar 2001 21:24:56 +1100
From: Andrew Morton
To: Linus Torvalds
Cc: linux-mm@kvack.org
Subject: Re: 3rd version of R/W mmap_sem patch available
References: <3AB77311.77EB7D60@uow.edu.au>

Linus Torvalds wrote:
>
> On Wed, 21 Mar 2001, Andrew Morton wrote:
> >
> > The handling of mm->rss is racy.  But I think
> > it always has been?
>
> It always has been.  Right now I think we hold the page_table_lock over
> most of them, so that the old patch to fix this might end up being just
> that one place.  Somebody interested in checking?
>

There were two places which needed fixing, and it was pretty trivial.
With this patch, mm_struct.rss handling is race-free on x86.  Some other
archs (notably ia64/ia32) are still a little racy on the exec() path.
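The race is the usual lost update: `mm->rss++' is a read-modify-write,
so two unlocked increments can collapse into one.  The fix on the exec()
path below batches the count privately and folds it into rss once, under
page_table_lock.  A minimal userspace sketch of that pattern, for
illustration only - struct fake_mm, map_pages() and the pthread mutex
are made-up stand-ins, not kernel code:

#include <pthread.h>
#include <stdio.h>

struct fake_mm {			/* stand-in for struct mm_struct */
	unsigned long rss;		/* protected by page_table_lock */
	pthread_mutex_t page_table_lock;
};

static struct fake_mm mm = { 0, PTHREAD_MUTEX_INITIALIZER };

static void *map_pages(void *arg)
{
	unsigned long rss_increment = 0;
	int i;

	for (i = 0; i < 100000; i++)
		rss_increment++;	/* private: no lock needed per page */

	pthread_mutex_lock(&mm.page_table_lock);
	mm.rss += rss_increment;	/* one locked update per caller */
	pthread_mutex_unlock(&mm.page_table_lock);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, map_pages, NULL);
	pthread_create(&t2, NULL, map_pages, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	/* with an unlocked mm.rss++ per page this could print less */
	printf("rss = %lu (always 200000 with the locked update)\n", mm.rss);
	return 0;
}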
I was sorely tempted to make put_dirty_page() require that
tsk->mm->page_table_lock be held by the caller, which would save a
bunch of locking.  But put_dirty_page() is used by architectures which
I don't understand.
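For what it's worth, that rejected convention would look something like
the sketch below: the helper documents (and here asserts) that the
caller already holds the lock, so a loop pays for one lock round-trip
instead of one per page.  This is a hypothetical userspace stand-in -
put_one_page_locked() and the pthread mutex are invented for
illustration - and is not what the patch does:

#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_table_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long rss;

/* Caller must hold page_table_lock; we only assert it, never take it. */
static void put_one_page_locked(void)
{
	/* trylock returns EBUSY when the (non-recursive) lock is held */
	assert(pthread_mutex_trylock(&page_table_lock) == EBUSY);
	rss++;				/* safe: caller serializes us */
}

int main(void)
{
	int i;

	/* one lock round-trip for the whole loop, not one per page */
	pthread_mutex_lock(&page_table_lock);
	for (i = 0; i < 32; i++)
		put_one_page_locked();
	pthread_mutex_unlock(&page_table_lock);

	printf("rss = %lu\n", rss);
	return 0;
}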
The patch also includes a feeble attempt to document some locking rules.

--- linux-2.4.3-pre6/include/linux/sched.h	Thu Mar 22 18:52:52 2001
+++ lk/include/linux/sched.h	Thu Mar 22 19:41:06 2001
@@ -209,9 +209,12 @@
 	atomic_t mm_count;	/* How many references to "struct mm_struct" (users count as 1) */
 	int map_count;				/* number of VMAs */
 	struct rw_semaphore mmap_sem;
-	spinlock_t page_table_lock;
+	spinlock_t page_table_lock;	/* Protects task page tables and mm->rss */
 
-	struct list_head mmlist;	/* List of all active mm's */
+	struct list_head mmlist;	/* List of all active mm's.  These are globally strung
+					 * together off init_mm.mmlist, and are protected
+					 * by mmlist_lock
+					 */
 
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
--- linux-2.4.3-pre6/fs/exec.c	Thu Mar 22 18:52:52 2001
+++ lk/fs/exec.c	Thu Mar 22 19:51:35 2001
@@ -252,6 +252,8 @@
 /*
  * This routine is used to map in a page into an address space: needed by
  * execve() for the initial stack and environment pages.
+ *
+ * tsk->mmap_sem is held for writing.
  */
 void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address)
 {
@@ -291,6 +293,7 @@
 	unsigned long stack_base;
 	struct vm_area_struct *mpnt;
 	int i;
+	unsigned long rss_increment = 0;
 
 	stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE;
 
@@ -322,11 +325,14 @@
 		struct page *page = bprm->page[i];
 		if (page) {
 			bprm->page[i] = NULL;
-			current->mm->rss++;
+			rss_increment++;
 			put_dirty_page(current,page,stack_base);
 		}
 		stack_base += PAGE_SIZE;
 	}
+	spin_lock(&current->mm->page_table_lock);
+	current->mm->rss += rss_increment;
+	spin_unlock(&current->mm->page_table_lock);
 	up_write(&current->mm->mmap_sem);
 	return 0;
--- linux-2.4.3-pre6/mm/memory.c	Thu Mar 22 18:52:52 2001
+++ lk/mm/memory.c	Thu Mar 22 21:13:29 2001
@@ -374,7 +374,6 @@
 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
 		dir++;
 	} while (address && (address < end));
-	spin_unlock(&mm->page_table_lock);
 	/*
 	 * Update rss for the mm_struct (not necessarily current->mm)
 	 * Notice that rss is an unsigned long.
@@ -383,6 +382,7 @@
 		mm->rss -= freed;
 	else
 		mm->rss = 0;
+	spin_unlock(&mm->page_table_lock);
 }
 
@@ -792,6 +792,8 @@
  * - flush the old one
  * - update the page tables
  * - inform the TLB about the new one
+ *
+ * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
  */
 static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry)
 {
@@ -800,6 +802,9 @@
 	update_mmu_cache(vma, address, entry);
 }
 
+/*
+ * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
+ */
 static inline void break_cow(struct vm_area_struct * vma, struct page * old_page, struct page * new_page, unsigned long address,
 	pte_t *page_table)
 {
@@ -1024,8 +1029,7 @@
 }
 
 /*
- * We hold the mm semaphore and the page_table_lock on entry
- * and exit.
+ * We hold the mm semaphore and the page_table_lock on entry and exit.
  */
 static int do_swap_page(struct mm_struct * mm,
 	struct vm_area_struct * vma, unsigned long address,
--- linux-2.4.3-pre6/mm/mmap.c	Thu Mar 22 18:52:52 2001
+++ lk/mm/mmap.c	Thu Mar 22 19:19:08 2001
@@ -889,8 +889,8 @@
 	spin_lock(&mm->page_table_lock);
 	mpnt = mm->mmap;
 	mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
-	spin_unlock(&mm->page_table_lock);
 	mm->rss = 0;
+	spin_unlock(&mm->page_table_lock);
 	mm->total_vm = 0;
 	mm->locked_vm = 0;
--- linux-2.4.3-pre6/mm/vmscan.c	Tue Jan 16 07:36:49 2001
+++ lk/mm/vmscan.c	Thu Mar 22 19:32:11 2001
@@ -25,16 +25,15 @@
 #include <asm/pgalloc.h>
 
 /*
- * The swap-out functions return 1 if they successfully
- * threw something out, and we got a free page. It returns
- * zero if it couldn't do anything, and any other value
- * indicates it decreased rss, but the page was shared.
+ * The swap-out function returns 1 if it successfully
+ * scanned all the pages it was asked to (`count').
+ * It returns zero if it couldn't do anything,
  *
- * NOTE! If it sleeps, it *must* return 1 to make sure we
- * don't continue with the swap-out. Otherwise we may be
- * using a process that no longer actually exists (it might
- * have died while we slept).
+ * rss may decrease because pages are shared, but this
+ * doesn't count as having freed a page.
  */
+
+/* mm->page_table_lock is held. mmap_sem is not held */
 static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
 {
 	pte_t pte;
@@ -129,6 +128,7 @@
 	return;
 }
 
+/* mm->page_table_lock is held. mmap_sem is not held */
 static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
 {
 	pte_t * pte;
@@ -165,6 +165,7 @@
 	return count;
 }
 
+/* mm->page_table_lock is held. mmap_sem is not held */
 static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
 {
 	pmd_t * pmd;
@@ -194,6 +195,7 @@
 	return count;
 }
 
+/* mm->page_table_lock is held. mmap_sem is not held */
 static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
 {
 	pgd_t *pgdir;
@@ -218,6 +220,9 @@
 	return count;
 }
 
+/*
+ * Returns non-zero if we scanned all `count' pages
+ */
 static int swap_out_mm(struct mm_struct * mm, int count)
 {
 	unsigned long address;
--- linux-2.4.3-pre6/mm/swapfile.c	Sun Feb 25 17:37:14 2001
+++ lk/mm/swapfile.c	Thu Mar 22 19:56:36 2001
@@ -209,6 +209,7 @@
  * share this swap entry, so be cautious and let do_wp_page work out
  * what to do if a write is requested later.
  */
+/* tasklist_lock and vma->vm_mm->page_table_lock are held */
 static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
 	pte_t *dir, swp_entry_t entry, struct page* page)
 {
@@ -234,6 +235,7 @@
 	++vma->vm_mm->rss;
 }
 
+/* tasklist_lock and vma->vm_mm->page_table_lock are held */
 static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
 	unsigned long address, unsigned long size, unsigned long offset,
 	swp_entry_t entry, struct page* page)
@@ -261,6 +263,7 @@
 	} while (address && (address < end));
 }
 
+/* tasklist_lock and vma->vm_mm->page_table_lock are held */
 static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
 	unsigned long address, unsigned long size,
 	swp_entry_t entry, struct page* page)
@@ -291,6 +294,7 @@
 	} while (address && (address < end));
 }
 
+/* tasklist_lock and vma->vm_mm->page_table_lock are held */
 static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
 			swp_entry_t entry, struct page* page)
 {