From: kanoj@google.engr.sgi.com (Kanoj Sarcar)
To: Andrea Arcangeli <andrea@suse.de>
Cc: torvalds@transmeta.com, sct@redhat.com, linux-mm@kvack.org,
Kanoj Sarcar <kanoj@google.engr.sgi.com>
Subject: Re: filecache/swapcache questions [RFC] [RFT] [PATCH] kanoj-mm12-2.3.8 Fix swapoff races
Date: Sun, 27 Jun 1999 18:48:47 -0700 (PDT) [thread overview]
Message-ID: <199906280148.SAA94463@google.engr.sgi.com> (raw)
In-Reply-To: <Pine.LNX.4.10.9906250203110.22024-100000@laser.random> from "Andrea Arcangeli" at Jun 25, 99 02:26:11 am
Linus/Andrea/Stephen,
This is the patch that tries to cure the swapoff races with processes
forking, exiting, and (readahead) swapping by faulting.
Basically, all these operations are synchronized by the process
mmap_sem. Unfortunately, swapoff has to visit all processes, during
which it must hold tasklist_lock, a spinlock. Hence, it can not take
the mmap_sem, a sleeping mutex. So, the patch links up all active
mm's in a list that swapoff can visit (with minor restructuring,
kswapd can also use this, although it can not hold mmap_sem).
Addition/deletions to the list are protected by a sleeping
mutex, hence swapoff can grab the individual mmap_sems, while
preventing changes to the list. Effectively, process creation
and destruction are locked out if swapoff is running.
To do this, the lock ordering is mm_sem -> mmap_sem. To
prevent deadlocks, care must be taken that a process invoking
delete/insert_mmlist does not have its own mmap_sem held. For
this, the do_fork path needs to change so as not to acquire
mmap_sem early, rather only when it is really needed. This does
not open up a resource-ordering problem between kernel_lock and
mmap_sem, since the kernel_lock is a monitor lock that is released
at schedule time, so no deadlocks are possible.
I have just done basic sanity testing on this, I am hoping Andrea
can run his swapoff stress tests to see whether this patch helps
cure the problem he was seeing.
Thanks.
Kanoj
kanoj@engr.sgi.com
--- /usr/tmp/p_rdiff_a009HP/exec.c Sun Jun 27 16:51:58 1999
+++ fs/exec.c Sun Jun 27 15:14:43 1999
@@ -399,6 +399,7 @@
up(&mm->mmap_sem);
mm_release();
mmput(old_mm);
+ insert_mmlist(mm);
return 0;
/*
--- /usr/tmp/p_rdiff_a009HP/sched.h Sun Jun 27 16:52:01 1999
+++ include/linux/sched.h Fri Jun 25 17:22:56 1999
@@ -170,6 +170,8 @@
atomic_t count;
int map_count; /* number of VMAs */
struct semaphore mmap_sem;
+ struct mm_struct *prev; /* list of allocated mms */
+ struct mm_struct *next; /* list of allocated mms */
unsigned long context;
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
@@ -191,7 +193,7 @@
swapper_pg_dir, \
ATOMIC_INIT(1), 1, \
__MUTEX_INITIALIZER(name.mmap_sem), \
- 0, \
+ &init_mm, &init_mm, 0, \
0, 0, 0, 0, \
0, 0, 0, \
0, 0, 0, 0, \
@@ -611,6 +613,7 @@
/*
* Routines for handling mm_structs
*/
+extern struct semaphore mm_sem;
extern struct mm_struct * mm_alloc(void);
static inline void mmget(struct mm_struct * mm)
{
@@ -619,6 +622,22 @@
extern void mmput(struct mm_struct *);
/* Remove the current tasks stale references to the old mm_struct */
extern void mm_release(void);
+static inline void insert_mmlist(struct mm_struct * mm)
+{
+ down(&mm_sem);
+ mm->prev = &init_mm;
+ mm->next = init_mm.next;
+ init_mm.next->prev = mm;
+ init_mm.next = mm;
+ up(&mm_sem);
+}
+static inline void delete_mmlist(struct mm_struct * mm)
+{
+ down(&mm_sem);
+ mm->next->prev = mm->prev;
+ mm->prev->next = mm->next;
+ up(&mm_sem);
+}
extern int copy_thread(int, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
extern void flush_thread(void);
--- /usr/tmp/p_rdiff_a009HP/fork.c Sun Jun 27 16:52:04 1999
+++ kernel/fork.c Sun Jun 27 15:28:34 1999
@@ -351,6 +351,7 @@
release_segments(mm);
exit_mmap(mm);
free_page_tables(mm);
+ delete_mmlist(mm);
kmem_cache_free(mm_cachep, mm);
}
}
@@ -383,7 +384,11 @@
retval = new_page_tables(tsk);
if (retval)
goto free_mm;
+ insert_mmlist(mm);
+
+ down(¤t->mm->mmap_sem);
retval = dup_mmap(mm);
+ up(¤t->mm->mmap_sem);
if (retval)
goto free_pt;
up(&mm->mmap_sem);
@@ -549,7 +554,6 @@
*p = *current;
- down(¤t->mm->mmap_sem);
lock_kernel();
retval = -EAGAIN;
@@ -676,7 +680,6 @@
++total_forks;
bad_fork:
unlock_kernel();
- up(¤t->mm->mmap_sem);
fork_out:
if ((clone_flags & CLONE_VFORK) && (retval > 0))
down(&sem);
--- /usr/tmp/p_rdiff_a009HP/mmap.c Sun Jun 27 16:52:07 1999
+++ mm/mmap.c Sun Jun 27 15:20:08 1999
@@ -39,6 +39,8 @@
/* SLAB cache for vm_area_struct's. */
kmem_cache_t *vm_area_cachep;
+struct semaphore mm_sem;
+
int sysctl_overcommit_memory;
/* Check that a process has enough memory to allocate a
@@ -812,6 +814,7 @@
{
struct vm_area_struct * mpnt;
+ down(&mm->mmap_sem);
mpnt = mm->mmap;
mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
mm->rss = 0;
@@ -843,6 +846,7 @@
printk("exit_mmap: map count is %d\n", mm->map_count);
clear_page_tables(mm, 0, USER_PTRS_PER_PGD);
+ up(&mm->mmap_sem);
}
/* Insert vm structure into process list sorted by address
@@ -957,6 +961,7 @@
void __init vma_init(void)
{
+ init_MUTEX(&mm_sem);
vm_area_cachep = kmem_cache_create("vm_area_struct",
sizeof(struct vm_area_struct),
0, SLAB_HWCACHE_ALIGN,
--- /usr/tmp/p_rdiff_a009HP/page_alloc.c Sun Jun 27 16:52:09 1999
+++ mm/page_alloc.c Sun Jun 27 15:39:58 1999
@@ -385,10 +385,9 @@
}
/*
- * The tests may look silly, but it essentially makes sure that
- * no other process did a swap-in on us just as we were waiting.
+ * Concurrent swap-in via swapoff is interlocked out.
*
- * Also, don't bother to add to the swap cache if this page-in
+ * Don't bother to add to the swap cache if this page-in
* was due to a write access.
*/
void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
@@ -400,11 +399,6 @@
if (!page_map) {
swapin_readahead(entry);
page_map = read_swap_cache(entry);
- }
- if (pte_val(*page_table) != entry) {
- if (page_map)
- free_page_and_swap_cache(page_address(page_map));
- return;
}
if (!page_map) {
set_pte(page_table, BAD_PAGE);
--- /usr/tmp/p_rdiff_a009HP/swapfile.c Sun Jun 27 16:52:12 1999
+++ mm/swapfile.c Sun Jun 27 15:27:49 1999
@@ -259,20 +259,20 @@
}
}
-static void unuse_process(struct mm_struct * mm, unsigned long entry,
+static void unuse_mm(struct mm_struct * mm, unsigned long entry,
unsigned long page)
{
struct vm_area_struct* vma;
/*
- * Go through process' page directory.
+ * Go through address space page directory.
*/
- if (!mm || mm == &init_mm)
- return;
+ down(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
pgd_t * pgd = pgd_offset(mm, vma->vm_start);
unuse_vma(vma, pgd, entry, page);
}
+ up(&mm->mmap_sem);
return;
}
@@ -283,8 +283,8 @@
*/
static int try_to_unuse(unsigned int type)
{
+ struct mm_struct * mm;
struct swap_info_struct * si = &swap_info[type];
- struct task_struct *p;
struct page *page_map;
unsigned long entry, page;
int i;
@@ -316,10 +316,12 @@
return -ENOMEM;
}
page = page_address(page_map);
- read_lock(&tasklist_lock);
- for_each_task(p)
- unuse_process(p->mm, entry, page);
- read_unlock(&tasklist_lock);
+ down(&mm_sem);
+ mm = init_mm.next;
+ while (mm != &init_mm) {
+ unuse_mm(mm, entry, page);
+ }
+ up(&mm_sem);
shm_unuse(entry, page);
/* Now get rid of the extra reference to the temporary
page we've been using. */
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/
next prev parent reply other threads:[~1999-06-28 1:48 UTC|newest]
Thread overview: 60+ messages / expand[flat|nested] mbox.gz Atom feed top
1999-06-21 5:29 filecache/swapcache questions Kanoj Sarcar
1999-06-21 11:25 ` Stephen C. Tweedie
1999-06-21 16:46 ` Kanoj Sarcar
1999-06-21 16:57 ` Stephen C. Tweedie
1999-06-21 17:36 ` Kanoj Sarcar
1999-06-21 17:49 ` Stephen C. Tweedie
1999-06-21 18:46 ` Kanoj Sarcar
1999-06-21 23:44 ` Kanoj Sarcar
1999-06-24 22:23 ` Andrea Arcangeli
1999-06-24 23:55 ` Kanoj Sarcar
1999-06-25 0:26 ` Andrea Arcangeli
1999-06-28 1:48 ` Kanoj Sarcar [this message]
1999-06-28 10:35 ` filecache/swapcache questions [RFC] [RFT] [PATCH] kanoj-mm12-2.3.8 Fix swapoff races Andrea Arcangeli
1999-06-28 17:11 ` filecache/swapcache questions [RFC] [RFT] [PATCH] kanoj-mm12-2.3.8 Kanoj Sarcar
1999-06-28 16:32 ` filecache/swapcache questions [RFC] [RFT] [PATCH] kanoj-mm12-2.3.8 Fix swapoff races Stephen C. Tweedie
1999-06-28 17:25 ` Kanoj Sarcar
1999-06-28 20:40 ` Stephen C. Tweedie
1999-06-28 21:11 ` Kanoj Sarcar
1999-06-28 22:12 ` Stephen C. Tweedie
1999-06-28 23:43 ` Kanoj Sarcar
1999-06-29 11:44 ` Stephen C. Tweedie
1999-06-29 22:01 ` Kanoj Sarcar
1999-06-30 17:28 ` Stephen C. Tweedie
1999-06-30 18:05 ` Kanoj Sarcar
1999-06-28 19:39 ` Chuck Lever
1999-06-28 19:55 ` filecache/swapcache questions [RFC] [RFT] [PATCH] kanoj-mm12-2.3.8 Kanoj Sarcar
1999-06-28 20:33 ` Chuck Lever
1999-06-28 20:51 ` Kanoj Sarcar
1999-06-28 21:32 ` Chuck Lever
1999-06-28 21:38 ` Kanoj Sarcar
1999-06-28 21:50 ` Chuck Lever
1999-06-28 22:15 ` Kanoj Sarcar
1999-06-29 11:23 ` Stephen C. Tweedie
1999-06-29 17:36 ` Kanoj Sarcar
1999-06-28 22:22 ` Stephen C. Tweedie
1999-06-28 22:21 ` Stephen C. Tweedie
1999-06-28 22:57 ` Andrea Arcangeli
1999-06-29 2:13 ` Chuck Lever
1999-06-29 12:01 ` Stephen C. Tweedie
1999-06-29 12:32 ` Andrea Arcangeli
1999-06-30 15:59 ` Stephen C. Tweedie
1999-06-29 1:00 ` Chuck Lever
1999-06-28 22:08 ` Stephen C. Tweedie
1999-06-28 22:59 ` Andrea Arcangeli
1999-06-29 0:53 ` Chuck Lever
1999-06-29 11:14 ` Stephen C. Tweedie
1999-06-28 22:09 ` Stephen C. Tweedie
1999-06-28 20:45 ` filecache/swapcache questions [RFC] [RFT] [PATCH] kanoj-mm12-2.3.8 Fix swapoff races Stephen C. Tweedie
1999-06-28 21:14 ` Chuck Lever
1999-06-28 21:25 ` filecache/swapcache questions [RFC] [RFT] [PATCH] kanoj-mm12-2.3.8 Kanoj Sarcar
1999-06-28 22:15 ` filecache/swapcache questions [RFC] [RFT] [PATCH] kanoj-mm12-2.3.8 Fix swapoff races Stephen C. Tweedie
1999-06-28 22:48 ` Andrea Arcangeli
1999-06-29 1:29 ` Chuck Lever
1999-06-29 11:58 ` Stephen C. Tweedie
1999-06-29 12:09 ` Andrea Arcangeli
1999-06-29 15:27 ` Chuck Lever
1999-06-29 11:55 ` Stephen C. Tweedie
1999-06-29 20:08 ` Andrea Arcangeli
1999-06-28 22:36 ` filecache/swapcache questions Stephen C. Tweedie
1999-06-28 23:24 ` Kanoj Sarcar
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=199906280148.SAA94463@google.engr.sgi.com \
--to=kanoj@google.engr.sgi.com \
--cc=andrea@suse.de \
--cc=linux-mm@kvack.org \
--cc=sct@redhat.com \
--cc=torvalds@transmeta.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox