From mboxrd@z Thu Jan  1 00:00:00 1970
From: kanoj@google.engr.sgi.com (Kanoj Sarcar)
Message-Id: <199906280148.SAA94463@google.engr.sgi.com>
Subject: Re: filecache/swapcache questions [RFC] [RFT] [PATCH] kanoj-mm12-2.3.8 Fix swapoff races
Date: Sun, 27 Jun 1999 18:48:47 -0700 (PDT)
In-Reply-To: from "Andrea Arcangeli" at Jun 25, 99 02:26:11 am
MIME-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit
Sender: owner-linux-mm@kvack.org
Return-Path: 
To: Andrea Arcangeli 
Cc: torvalds@transmeta.com, sct@redhat.com, linux-mm@kvack.org, Kanoj Sarcar 
List-ID: 

Linus/Andrea/Stephen,

This is the patch that tries to cure the swapoff races against processes
forking, exiting, and (readahead) swapping in by faulting. Basically, all
these operations are synchronized by the per-process mmap_sem.

Unfortunately, swapoff has to visit all processes, and while doing so it
must hold tasklist_lock, a spinlock; hence it cannot take mmap_sem, a
sleeping mutex. So the patch links all active mm's into a list that
swapoff can visit (with minor restructuring, kswapd could also use this,
although it cannot hold mmap_sem). Additions to and deletions from the
list are protected by a sleeping mutex, so swapoff can grab the individual
mmap_sems while preventing changes to the list. Effectively, process
creation and destruction are locked out while swapoff is running.

To do this, the lock ordering is mm_sem -> mmap_sem. To prevent deadlocks,
care must be taken that a process invoking insert_mmlist/delete_mmlist
does not hold its own mmap_sem. For this, the do_fork path changes so that
it no longer acquires mmap_sem early, but only when it is really needed.
This does not open up a lock-ordering problem between kernel_lock and
mmap_sem, since the kernel_lock is a monitor lock that is released at
schedule time, so no deadlocks are possible.

I have only done basic sanity testing on this; I am hoping Andrea can run
his swapoff stress tests to see whether this patch helps cure the problem
he was seeing.

Thanks.

Kanoj
kanoj@engr.sgi.com
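[Editor's note: the locking scheme described above can be modeled in plain
userspace C, which may make the intended mm_sem -> mmap_sem ordering easier
to see before reading the patch. This is only an illustrative sketch:
pthread mutexes stand in for mm_sem and the per-mm mmap_sem, and the names
fake_mm, list_lock, map_lock, walk_all and forker are invented for the
example; they are not part of the patch or the kernel.]

/* Build with: cc -pthread sketch.c */
#include <pthread.h>
#include <stdio.h>

struct fake_mm {                        /* stand-in for mm_struct */
	struct fake_mm *prev, *next;    /* list of allocated mms */
	pthread_mutex_t map_lock;       /* stand-in for mmap_sem */
	int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER; /* stand-in for mm_sem */
static struct fake_mm init_mm_model = {
	&init_mm_model, &init_mm_model, PTHREAD_MUTEX_INITIALIZER, 0
};

/* Lock order is list_lock -> map_lock, so callers must NOT hold their own map_lock. */
static void insert_mmlist(struct fake_mm *mm)
{
	pthread_mutex_lock(&list_lock);
	mm->prev = &init_mm_model;
	mm->next = init_mm_model.next;
	init_mm_model.next->prev = mm;
	init_mm_model.next = mm;
	pthread_mutex_unlock(&list_lock);
}

static void delete_mmlist(struct fake_mm *mm)
{
	pthread_mutex_lock(&list_lock);
	mm->next->prev = mm->prev;
	mm->prev->next = mm->next;
	pthread_mutex_unlock(&list_lock);
}

/* swapoff side: hold list_lock for the whole walk, take each map_lock in turn. */
static void walk_all(void)
{
	struct fake_mm *mm;

	pthread_mutex_lock(&list_lock);
	for (mm = init_mm_model.next; mm != &init_mm_model; mm = mm->next) {
		pthread_mutex_lock(&mm->map_lock);
		printf("visiting mm %d\n", mm->id);  /* unuse_mm() would run here */
		pthread_mutex_unlock(&mm->map_lock);
	}
	pthread_mutex_unlock(&list_lock);
}

/* fork/exit side: add an mm, work on it under its own map_lock, then remove it. */
static void *forker(void *unused)
{
	struct fake_mm mm;

	(void)unused;
	mm.id = 1;
	pthread_mutex_init(&mm.map_lock, NULL);
	insert_mmlist(&mm);                 /* called without map_lock held */
	pthread_mutex_lock(&mm.map_lock);   /* dup_mmap()-style work would go here */
	pthread_mutex_unlock(&mm.map_lock);
	delete_mmlist(&mm);                 /* again, map_lock is not held */
	pthread_mutex_destroy(&mm.map_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, forker, NULL);
	walk_all();             /* "swapoff" racing against the forker */
	pthread_join(t, NULL);
	walk_all();             /* list is still consistent afterwards */
	return 0;
}

[Holding list_lock across the whole walk is what locks out "process creation
and destruction" while the walker runs, and a forker that called the list
routines while holding its own map_lock could deadlock against the walker;
this is the reason the do_fork path is changed below.]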
--- /usr/tmp/p_rdiff_a009HP/exec.c	Sun Jun 27 16:51:58 1999
+++ fs/exec.c	Sun Jun 27 15:14:43 1999
@@ -399,6 +399,7 @@
 	up(&mm->mmap_sem);
 	mm_release();
 	mmput(old_mm);
+	insert_mmlist(mm);
 	return 0;
 
 	/*
--- /usr/tmp/p_rdiff_a009HP/sched.h	Sun Jun 27 16:52:01 1999
+++ include/linux/sched.h	Fri Jun 25 17:22:56 1999
@@ -170,6 +170,8 @@
 	atomic_t count;
 	int map_count;				/* number of VMAs */
 	struct semaphore mmap_sem;
+	struct mm_struct *prev;			/* list of allocated mms */
+	struct mm_struct *next;			/* list of allocated mms */
 	unsigned long context;
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
@@ -191,7 +193,7 @@
 		swapper_pg_dir, \
 		ATOMIC_INIT(1), 1, \
 		__MUTEX_INITIALIZER(name.mmap_sem), \
-		0, \
+		&init_mm, &init_mm, 0, \
 		0, 0, 0, 0, \
 		0, 0, 0, \
 		0, 0, 0, 0, \
@@ -611,6 +613,7 @@
 /*
  * Routines for handling mm_structs
  */
+extern struct semaphore mm_sem;
 extern struct mm_struct * mm_alloc(void);
 static inline void mmget(struct mm_struct * mm)
 {
@@ -619,6 +622,22 @@
 extern void mmput(struct mm_struct *);
 /* Remove the current tasks stale references to the old mm_struct */
 extern void mm_release(void);
+static inline void insert_mmlist(struct mm_struct * mm)
+{
+	down(&mm_sem);
+	mm->prev = &init_mm;
+	mm->next = init_mm.next;
+	init_mm.next->prev = mm;
+	init_mm.next = mm;
+	up(&mm_sem);
+}
+static inline void delete_mmlist(struct mm_struct * mm)
+{
+	down(&mm_sem);
+	mm->next->prev = mm->prev;
+	mm->prev->next = mm->next;
+	up(&mm_sem);
+}
 
 extern int  copy_thread(int, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
 extern void flush_thread(void);
--- /usr/tmp/p_rdiff_a009HP/fork.c	Sun Jun 27 16:52:04 1999
+++ kernel/fork.c	Sun Jun 27 15:28:34 1999
@@ -351,6 +351,7 @@
 		release_segments(mm);
 		exit_mmap(mm);
 		free_page_tables(mm);
+		delete_mmlist(mm);
 		kmem_cache_free(mm_cachep, mm);
 	}
 }
@@ -383,7 +384,11 @@
 	retval = new_page_tables(tsk);
 	if (retval)
 		goto free_mm;
+	insert_mmlist(mm);
+
+	down(&current->mm->mmap_sem);
 	retval = dup_mmap(mm);
+	up(&current->mm->mmap_sem);
 	if (retval)
 		goto free_pt;
 	up(&mm->mmap_sem);
@@ -549,7 +554,6 @@
 
 	*p = *current;
 
-	down(&current->mm->mmap_sem);
 	lock_kernel();
 
 	retval = -EAGAIN;
@@ -676,7 +680,6 @@
 	++total_forks;
 bad_fork:
 	unlock_kernel();
-	up(&current->mm->mmap_sem);
 fork_out:
 	if ((clone_flags & CLONE_VFORK) && (retval > 0))
 		down(&sem);
--- /usr/tmp/p_rdiff_a009HP/mmap.c	Sun Jun 27 16:52:07 1999
+++ mm/mmap.c	Sun Jun 27 15:20:08 1999
@@ -39,6 +39,8 @@
 /* SLAB cache for vm_area_struct's. */
 kmem_cache_t *vm_area_cachep;
 
+struct semaphore mm_sem;
+
 int sysctl_overcommit_memory;
 
 /* Check that a process has enough memory to allocate a
@@ -812,6 +814,7 @@
 {
 	struct vm_area_struct * mpnt;
 
+	down(&mm->mmap_sem);
 	mpnt = mm->mmap;
 	mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
 	mm->rss = 0;
@@ -843,6 +846,7 @@
 		printk("exit_mmap: map count is %d\n", mm->map_count);
 
 	clear_page_tables(mm, 0, USER_PTRS_PER_PGD);
+	up(&mm->mmap_sem);
 }
 
 /* Insert vm structure into process list sorted by address
@@ -957,6 +961,7 @@
 
 void __init vma_init(void)
 {
+	init_MUTEX(&mm_sem);
 	vm_area_cachep = kmem_cache_create("vm_area_struct",
 					   sizeof(struct vm_area_struct),
 					   0, SLAB_HWCACHE_ALIGN,
--- /usr/tmp/p_rdiff_a009HP/page_alloc.c	Sun Jun 27 16:52:09 1999
+++ mm/page_alloc.c	Sun Jun 27 15:39:58 1999
@@ -385,10 +385,9 @@
 }
 
 /*
- * The tests may look silly, but it essentially makes sure that
- * no other process did a swap-in on us just as we were waiting.
+ * Concurrent swap-in via swapoff is interlocked out.
  *
- * Also, don't bother to add to the swap cache if this page-in
+ * Don't bother to add to the swap cache if this page-in
  * was due to a write access.
  */
 void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
@@ -400,11 +399,6 @@
 	if (!page_map) {
 		swapin_readahead(entry);
 		page_map = read_swap_cache(entry);
-	}
-	if (pte_val(*page_table) != entry) {
-		if (page_map)
-			free_page_and_swap_cache(page_address(page_map));
-		return;
 	}
 	if (!page_map) {
 		set_pte(page_table, BAD_PAGE);
--- /usr/tmp/p_rdiff_a009HP/swapfile.c	Sun Jun 27 16:52:12 1999
+++ mm/swapfile.c	Sun Jun 27 15:27:49 1999
@@ -259,20 +259,20 @@
 	}
 }
 
-static void unuse_process(struct mm_struct * mm, unsigned long entry,
+static void unuse_mm(struct mm_struct * mm, unsigned long entry,
 			unsigned long page)
 {
 	struct vm_area_struct* vma;
 
 	/*
-	 * Go through process' page directory.
+	 * Go through address space page directory.
 	 */
-	if (!mm || mm == &init_mm)
-		return;
+	down(&mm->mmap_sem);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
 		unuse_vma(vma, pgd, entry, page);
 	}
+	up(&mm->mmap_sem);
 	return;
 }
 
@@ -283,8 +283,8 @@
  */
 static int try_to_unuse(unsigned int type)
 {
+	struct mm_struct * mm;
 	struct swap_info_struct * si = &swap_info[type];
-	struct task_struct *p;
 	struct page *page_map;
 	unsigned long entry, page;
 	int i;
@@ -316,10 +316,13 @@
 			return -ENOMEM;
 		}
 		page = page_address(page_map);
-		read_lock(&tasklist_lock);
-		for_each_task(p)
-			unuse_process(p->mm, entry, page);
-		read_unlock(&tasklist_lock);
+		down(&mm_sem);
+		mm = init_mm.next;
+		while (mm != &init_mm) {
+			unuse_mm(mm, entry, page);
+			mm = mm->next;
+		}
+		up(&mm_sem);
 		shm_unuse(entry, page);
 		/* Now get rid of the extra reference to the temporary
 		   page we've been using. */
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/