linux-mm.kvack.org archive mirror
* [patch] minimal page-LRU
@ 1999-08-04 11:33 Andrea Arcangeli
  1999-08-04 11:50 ` Ingo Molnar
  1999-08-04 17:07 ` Andrea Arcangeli
  0 siblings, 2 replies; 4+ messages in thread
From: Andrea Arcangeli @ 1999-08-04 11:33 UTC (permalink / raw)
  To: Linus Torvalds, MOLNAR Ingo, David S. Miller; +Cc: linux-kernel, linux-mm

I cleaned up the page-LRU code, removed everything unrelated, and tried to
stay in sync as much as possible with the current 2.3.12 VM. The patch
should be as _short_ as possible, and it should be as safe as the 2.3.12 VM
is.

The patch reduces the complexity of shrink_mmap (which currently runs in
O(num_physpages)) by an order of magnitude, at the expense of two more
pointers in each struct page. It also makes shrink_mmap SMP-threaded with
respect to the big kernel lock and the page cache. The page-LRU is now
completely orthogonal to the pagecache_lock. The improvement should be
_huge_ on high-memory machines that use most of their memory for shm or
anonymous memory and whose working set doesn't fit entirely in cache.
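
To give a rough idea of the difference, here is a userspace toy (not kernel
code: the page count, the 1-in-64 ratio and the open-coded list are made up
purely for illustration). The stock shrink_mmap has to sweep the whole
mem_map[] with a clock hand to find freeable cache pages, while with the two
extra pointers per struct page only pages actually queued on an LRU list are
ever visited:

#include <stdio.h>

struct page {
	int in_cache;				/* stands in for page->inode/buffers */
	struct page *lru_prev, *lru_next;	/* the two extra pointers */
};

#define NUM_PHYSPAGES	(32 * 1024)
static struct page mem_map[NUM_PHYSPAGES];
static struct page lru_head = { 0, &lru_head, &lru_head };

static void lru_add(struct page *page)
{
	page->lru_next = lru_head.lru_next;
	page->lru_prev = &lru_head;
	lru_head.lru_next->lru_prev = page;
	lru_head.lru_next = page;
}

int main(void)
{
	struct page *p;
	int i, examined_clock = 0, examined_lru = 0;

	/* pretend only 1 page in 64 is freeable page/buffer cache */
	for (i = 0; i < NUM_PHYSPAGES; i += 64) {
		mem_map[i].in_cache = 1;
		lru_add(&mem_map[i]);
	}

	/* old scheme: a full clock pass examines every physical page */
	for (i = 0; i < NUM_PHYSPAGES; i++)
		examined_clock++;

	/* new scheme: only the pages queued on the lru list are visited */
	for (p = lru_head.lru_next; p != &lru_head; p = p->lru_next)
		examined_lru++;

	printf("clock pass: %d pages, lru pass: %d pages\n",
	       examined_clock, examined_lru);
	return 0;
}

With the ratio above, the clock pass touches 64 times more pages than the
LRU pass to reclaim the same amount of freeable memory.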

The patch is against clean 2.3.12 and additionally includes just the
shrink_mmap fix to count the freeing of an anonymous or metadata buffer as
progress.

There are still various minor bugs and races in the current VM (unrelated
to the patch), but I'll address them later (otherwise the patch would grow
too much, and I would like to get the page-LRU code included before the
freeze ;). For example, is_page_shared and do_wp_page are not aware of the
additional reference held by the buffer cache, and there are some locking
issues in do_wp_page/free_page_and_swap_cache (they can't sleep, but they
use delete_from_swap_cache, which _can_ sleep).

diff -ur 2.3.12/fs/buffer.c 2.3.12-lru/fs/buffer.c
--- 2.3.12/fs/buffer.c	Sun Aug  1 18:11:17 1999
+++ 2.3.12-lru/fs/buffer.c	Wed Aug  4 00:40:08 1999
@@ -1252,7 +1252,7 @@
 	if (!PageLocked(page))
 		BUG();
 	if (!page->buffers)
-		return 0;
+		return 1;
 
 	head = page->buffers;
 	bh = head;
@@ -1293,10 +1293,13 @@
 	 */
 	if (!offset) {
 		if (!try_to_free_buffers(page))
+		{
 			atomic_add(PAGE_CACHE_SIZE, &buffermem);
+			return 0;
+		}
 	}
 
-	return 0;
+	return 1;
 }
 
 static void create_empty_buffers(struct page *page, struct inode *inode, unsigned long blocksize)
@@ -1905,6 +1908,7 @@
 static int grow_buffers(int size)
 {
 	unsigned long page;
+	struct page * page_map;
 	struct buffer_head *bh, *tmp;
 	struct buffer_head * insert_point;
 	int isize;
@@ -1947,7 +1951,9 @@
 	free_list[isize].list = bh;
 	spin_unlock(&free_list[isize].lock);
 
-	mem_map[MAP_NR(page)].buffers = bh;
+	page_map = mem_map + MAP_NR(page);
+	page_map->buffers = bh;
+	lru_cache_add(page_map, 0);
 	atomic_add(PAGE_SIZE, &buffermem);
 	return 1;
 }
diff -ur 2.3.12/fs/dcache.c 2.3.12-lru/fs/dcache.c
--- 2.3.12/fs/dcache.c	Tue Jul 13 02:01:39 1999
+++ 2.3.12-lru/fs/dcache.c	Tue Aug  3 18:28:27 1999
@@ -20,6 +20,7 @@
 #include <linux/malloc.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/smp_lock.h>
 
 #include <asm/uaccess.h>
 
@@ -473,9 +474,11 @@
 {
 	if (gfp_mask & __GFP_IO) {
 		int count = 0;
+		lock_kernel();
 		if (priority)
 			count = dentry_stat.nr_unused / priority;
 		prune_dcache(count);
+		unlock_kernel();
 	}
 }
 
diff -ur 2.3.12/include/linux/mm.h 2.3.12-lru/include/linux/mm.h
--- 2.3.12/include/linux/mm.h	Wed Aug  4 12:28:17 1999
+++ 2.3.12-lru/include/linux/mm.h	Wed Aug  4 12:28:42 1999
@@ -125,6 +125,7 @@
 	struct page *next_hash;
 	atomic_t count;
 	unsigned long flags;	/* atomic flags, some possibly updated asynchronously */
+	struct list_head lru;
 	wait_queue_head_t wait;
 	struct page **pprev_hash;
 	struct buffer_head * buffers;
diff -ur 2.3.12/include/linux/swap.h 2.3.12-lru/include/linux/swap.h
--- 2.3.12/include/linux/swap.h	Wed Aug  4 12:28:17 1999
+++ 2.3.12-lru/include/linux/swap.h	Wed Aug  4 12:31:15 1999
@@ -64,6 +64,8 @@
 
 extern int nr_swap_pages;
 extern int nr_free_pages;
+extern int nr_lru_pages;
+extern struct list_head lru_cache, lru_swap_cache;
 extern atomic_t nr_async_pages;
 extern struct inode swapper_inode;
 extern atomic_t page_cache_size;
@@ -160,6 +162,27 @@
 		count--;
 	return  count > 1;
 }
+
+extern spinlock_t pagemap_lru_lock;
+
+/*
+ * Helper macros for lru_pages handling.
+ */
+#define	lru_cache_add(page, swap_cache)					     \
+do {									     \
+	spin_lock(&pagemap_lru_lock);					     \
+	list_add(&(page)->lru, !(swap_cache) ? &lru_cache:&lru_swap_cache); \
+	nr_lru_pages++;							     \
+	spin_unlock(&pagemap_lru_lock);					     \
+} while (0)
+
+#define	lru_cache_del(page)			\
+do {						\
+	spin_lock(&pagemap_lru_lock);		\
+	list_del(&(page)->lru);			\
+	nr_lru_pages--;				\
+	spin_unlock(&pagemap_lru_lock);		\
+} while (0)
 
 #endif /* __KERNEL__*/
 
diff -ur 2.3.12/ipc/shm.c 2.3.12-lru/ipc/shm.c
--- 2.3.12/ipc/shm.c	Thu Jul 22 01:07:28 1999
+++ 2.3.12-lru/ipc/shm.c	Tue Aug  3 16:16:16 1999
@@ -719,10 +719,12 @@
 	int loop = 0;
 	int counter;
 	struct page * page_map;
+	int ret = 0;
 	
+	lock_kernel();
 	counter = shm_rss >> prio;
 	if (!counter || !(swap_nr = get_swap_page()))
-		return 0;
+		goto out_unlock;
 
  check_id:
 	shp = shm_segs[swap_id];
@@ -755,7 +757,7 @@
 	if (--counter < 0) { /* failed */
 		failed:
 		swap_free (swap_nr);
-		return 0;
+		goto out_unlock;
 	}
 	if (page_count(mem_map + MAP_NR(pte_page(page))) != 1)
 		goto check_table;
@@ -768,7 +770,10 @@
 	swap_successes++;
 	shm_swp++;
 	shm_rss--;
-	return 1;
+	ret = 1;
+ out_unlock:
+	unlock_kernel();
+	return ret;
 }
 
 /*
diff -ur 2.3.12/mm/filemap.c 2.3.12-lru/mm/filemap.c
--- 2.3.12/mm/filemap.c	Thu Jul 22 01:07:28 1999
+++ 2.3.12-lru/mm/filemap.c	Wed Aug  4 12:04:11 1999
@@ -33,6 +33,8 @@
  *
  * finished 'unifying' the page and buffer cache and SMP-threaded the
  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
+ *
+ * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  */
 
 atomic_t page_cache_size = ATOMIC_INIT(0);
@@ -40,6 +42,11 @@
 struct page **page_hash_table;
 
 spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
+/*
+ * NOTE: to avoid deadlocking you must never acquire the pagecache_lock with
+ *       the pagemap_lru_lock held.
+ */
+spinlock_t pagemap_lru_lock = SPIN_LOCK_UNLOCKED;
 
 
 void __add_page_to_hash_queue(struct page * page, struct page **p)
@@ -117,6 +124,7 @@
 		}
 		if (page_count(page) != 2)
 			printk("hm, busy page invalidated? (not necesserily a bug)\n");
+		lru_cache_del(page);
 
 		remove_page_from_inode_queue(page);
 		remove_page_from_hash_queue(page);
@@ -151,8 +159,9 @@
 
 			lock_page(page);
 
-			if (inode->i_op->flushpage)
-				inode->i_op->flushpage(inode, page, 0);
+			if (!inode->i_op->flushpage ||
+			    inode->i_op->flushpage(inode, page, 0))
+				lru_cache_del(page);
 
 			/*
 			 * We remove the page from the page cache
@@ -214,91 +223,65 @@
 
 extern atomic_t too_many_dirty_buffers;
 
-int shrink_mmap(int priority, int gfp_mask)
+static inline int shrink_mmap_lru(struct list_head * lru, int * count,
+				  int gfp_mask)
 {
-	static unsigned long clock = 0;
-	unsigned long limit = num_physpages << 1;
+	int ret = 0;
+	LIST_HEAD(young);
+	LIST_HEAD(old);
+	LIST_HEAD(forget);
+	struct list_head * page_lru, * dispose;
 	struct page * page;
-	int count, users;
-
-	count = limit >> priority;
 
-	page = mem_map + clock;
-	do {
-		int referenced;
-
-		/* This works even in the presence of PageSkip because
-		 * the first two entries at the beginning of a hole will
-		 * be marked, not just the first.
-		 */
-		page++;
-		clock++;
-		if (clock >= max_mapnr) {
-			clock = 0;
-			page = mem_map;
-		}
-		if (PageSkip(page)) {
-			/* next_hash is overloaded for PageSkip */
-			page = page->next_hash;
-			clock = page - mem_map;
+	while (*count > 0 && (page_lru = lru->prev) != lru)
+	{
+		page = list_entry(page_lru, struct page, lru);
+		list_del(page_lru);
+
+		if (test_and_clear_bit(PG_referenced, &page->flags))
+		{
+			/* Roll the page at the top of the lru list,
+			 * we could also be more aggressive putting
+			 * the page in the young-dispose-list, so
+			 * avoiding to free young pages in each pass.
+			 */
+			list_add(page_lru, lru);
+			continue;
 		}
-		
-		referenced = test_and_clear_bit(PG_referenced, &page->flags);
+		spin_unlock(&pagemap_lru_lock);
 
+		dispose = &old;
 		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
-			continue;
+			goto dispose_continue;
 
-		count--;
+		(*count)--;
 
-		/*
-		 * Some common cases that we just short-circuit without
-		 * getting the locks - we need to re-check this once we
-		 * have the lock, but that's fine.
-		 */
-		users = page_count(page);
-		if (!users)
-			continue;
-		if (!page->buffers) {
-			if (!page->inode)
-				continue;
-			if (users > 1)
-				continue;
-		}
+		dispose = &young;
+		/* avoid unscalable SMP locking */
+		if (!page->buffers && page_count(page) > 1)
+			goto dispose_continue;
 
-		/*
-		 * ok, now the page looks interesting. Re-check things
-		 * and keep the lock.
-		 */
 		spin_lock(&pagecache_lock);
-		if (!page->inode && !page->buffers) {
-			spin_unlock(&pagecache_lock);
-			continue;
-		}
-		if (!page_count(page)) {
-			spin_unlock(&pagecache_lock);
-			BUG();
-			continue;
-		}
-		get_page(page);
-		if (TryLockPage(page)) {
+		if (TryLockPage(page))
+		{
 			spin_unlock(&pagecache_lock);
-			goto put_continue;
+			goto dispose_continue;
 		}
 
-		/*
-		 * we keep pagecache_lock locked and unlock it in
-		 * each branch, so that the page->inode case doesnt
-		 * have to re-grab it. Here comes the 'real' logic
-		 * to free memory:
-		 */
+		/* avoid freeing the page while it's locked */
+		get_page(page);
 
 		/* Is it a buffer page? */
 		if (page->buffers) {
-			int mem = page->inode ? 0 : PAGE_CACHE_SIZE;
 			spin_unlock(&pagecache_lock);
 			if (!try_to_free_buffers(page))
 				goto unlock_continue;
-			atomic_sub(mem, &buffermem);
+			/* page was locked, inode can't go away under us */
+			if (!page->inode)
+			{
+				atomic_sub(PAGE_CACHE_SIZE, &buffermem);
+				goto made_buffer_progress;
+			}
 			spin_lock(&pagecache_lock);
 		}
 
@@ -307,7 +290,7 @@
 		 * (count == 2 because we added one ourselves above).
 		 */
 		if (page_count(page) != 2)
-			goto spin_unlock_continue;
+			goto cache_unlock_continue;
 
 		/*
 		 * Is it a page swap page? If so, we want to
@@ -316,35 +299,73 @@
 		 */
 		if (PageSwapCache(page)) {
 			spin_unlock(&pagecache_lock);
-			if (referenced && swap_count(page->offset) != 2)
-				goto unlock_continue;
 			__delete_from_swap_cache(page);
-			page_cache_release(page);
-			goto made_progress;
+			goto made_inode_progress;
 		}	
 
 		/* is it a page-cache page? */
-		if (!referenced && page->inode && !pgcache_under_min()) {
-			remove_page_from_inode_queue(page);
-			remove_page_from_hash_queue(page);
-			page->inode = NULL;
-			spin_unlock(&pagecache_lock);
-
-			page_cache_release(page);
-			goto made_progress;
+		if (page->inode)
+		{
+			dispose = &old;
+			if (!pgcache_under_min())
+			{
+				remove_page_from_inode_queue(page);
+				remove_page_from_hash_queue(page);
+				page->inode = NULL;
+				spin_unlock(&pagecache_lock);
+				goto made_inode_progress;
+			}
+			goto cache_unlock_continue;
 		}
-spin_unlock_continue:
+
+		dispose = &forget;
+		printk(KERN_ERR "shrink_mmap: unknown LRU page!\n");
+
+cache_unlock_continue:
 		spin_unlock(&pagecache_lock);
 unlock_continue:
 		UnlockPage(page);
-put_continue:
 		put_page(page);
-	} while (count > 0);
-	return 0;
-made_progress:
+dispose_continue:
+		/* no need of the spinlock to play with the
+		   local dispose lists */
+		list_add(page_lru, dispose);
+		spin_lock(&pagemap_lru_lock);
+	}
+	goto out;
+
+made_inode_progress:
+	page_cache_release(page);
+made_buffer_progress:
 	UnlockPage(page);
 	put_page(page);
-	return 1;
+	ret = 1;
+	spin_lock(&pagemap_lru_lock);
+	/* nr_lru_pages needs the spinlock */
+	nr_lru_pages--;
+
+out:
+	list_splice(&young, lru);
+	list_splice(&old, lru->prev);
+
+	return ret;
+}
+
+int shrink_mmap(int priority, int gfp_mask)
+{
+	int ret = 0, count, i;
+	struct list_head * lru[2] = { &lru_swap_cache, &lru_cache, };
+
+	count = nr_lru_pages / (priority+1);
+
+	spin_lock(&pagemap_lru_lock);
+
+	for (i=0; count > 0 && !ret && i<2; i++)
+		ret = shrink_mmap_lru(lru[i], &count, gfp_mask);
+
+	spin_unlock(&pagemap_lru_lock);
+
+	return ret;
 }
 
 static inline struct page * __find_page_nolock(struct inode * inode, unsigned long offset, struct page *page)
@@ -461,13 +482,14 @@
 {
 	unsigned long flags;
 
-	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error));
-	page->flags = flags |  ((1 << PG_locked) | (1 << PG_referenced));
+	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced));
+	page->flags = flags | (1 << PG_locked);
 	page->owner = current;	/* REMOVEME */
 	get_page(page);
 	page->offset = offset;
 	add_page_to_inode_queue(inode, page);
 	__add_page_to_hash_queue(page, hash);
+	lru_cache_add(page, PageSwapCache(page));
 }
 
 void add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset)
diff -ur 2.3.12/mm/page_alloc.c 2.3.12-lru/mm/page_alloc.c
--- 2.3.12/mm/page_alloc.c	Tue Jul 13 02:02:40 1999
+++ 2.3.12-lru/mm/page_alloc.c	Mon Aug  2 17:21:47 1999
@@ -20,6 +20,9 @@
 
 int nr_swap_pages = 0;
 int nr_free_pages = 0;
+int nr_lru_pages;
+LIST_HEAD(lru_cache);
+LIST_HEAD(lru_swap_cache);
 
 /*
  * Free area management
@@ -127,7 +130,6 @@
 		if (PageLocked(page))
 			PAGE_BUG(page);
 
-		page->flags &= ~(1 << PG_referenced);
 		free_pages_ok(page - mem_map, 0);
 		return 1;
 	}
@@ -145,7 +147,6 @@
 				PAGE_BUG(map);
 			if (PageLocked(map))
 				PAGE_BUG(map);
-			map->flags &= ~(1 << PG_referenced);
 			free_pages_ok(map_nr, order);
 			return 1;
 		}
@@ -269,8 +270,9 @@
  	unsigned long total = 0;
 
 	printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
-	printk("Free: %d (%d %d %d)\n",
+	printk("Free: %d, lru_cache: %d (%d %d %d)\n",
 		nr_free_pages,
+		nr_lru_pages,
 		freepages.min,
 		freepages.low,
 		freepages.high);
diff -ur 2.3.12/mm/swap_state.c 2.3.12-lru/mm/swap_state.c
--- 2.3.12/mm/swap_state.c	Tue Jul 13 02:02:10 1999
+++ 2.3.12-lru/mm/swap_state.c	Wed Aug  4 13:23:04 1999
@@ -214,8 +214,6 @@
 		   page_address(page), page_count(page));
 #endif
 	PageClearSwapCache(page);
-	if (inode->i_op->flushpage)
-		inode->i_op->flushpage(inode, page, 0);
 	remove_inode_page(page);
 }
 
@@ -239,6 +237,15 @@
 	swap_free (entry);
 }
 
+static void delete_from_swap_cache_nolock(struct page *page)
+{
+	if (!swapper_inode.i_op->flushpage ||
+	    swapper_inode.i_op->flushpage(&swapper_inode, page, 0))
+		lru_cache_del(page);
+
+	__delete_from_swap_cache(page);
+}
+
 /*
  * This must be called only on pages that have
  * been verified to be in the swap cache.
@@ -247,7 +254,7 @@
 {
 	lock_page(page);
 
-	__delete_from_swap_cache(page);
+	delete_from_swap_cache_nolock(page);
 
 	UnlockPage(page);
 	page_cache_release(page);
@@ -267,9 +274,7 @@
 	 */
 	lock_page(page);
 	if (PageSwapCache(page) && !is_page_shared(page)) {
-		long entry = page->offset;
-		remove_from_swap_cache(page);
-		swap_free(entry);
+		delete_from_swap_cache_nolock(page);
 		page_cache_release(page);
 	}
 	UnlockPage(page);
diff -ur 2.3.12/mm/vmscan.c 2.3.12-lru/mm/vmscan.c
--- 2.3.12/mm/vmscan.c	Thu Jul 22 01:07:28 1999
+++ 2.3.12-lru/mm/vmscan.c	Wed Aug  4 12:16:35 1999
@@ -323,7 +323,9 @@
 {
 	struct task_struct * p, * pbest;
 	int counter, assign, max_cnt;
+	int ret = 0;
 
+	lock_kernel();
 	/* 
 	 * We make one or two passes through the task list, indexed by 
 	 * assign = {0, 1}:
@@ -373,11 +375,14 @@
 			goto out;
 		}
 
+		ret = 1;
 		if (swap_out_process(pbest, gfp_mask))
-			return 1;
+			goto out;
+		ret = 0;
 	}
 out:
-	return 0;
+	unlock_kernel();
+	return ret;
 }
 
 /*
@@ -394,8 +399,6 @@
 	int priority;
 	int count = SWAP_CLUSTER_MAX;
 
-	lock_kernel();
-
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
 
@@ -423,7 +426,6 @@
 		shrink_dcache_memory(priority, gfp_mask);
 	} while (--priority >= 0);
 done:
-	unlock_kernel();
 
 	return priority >= 0;
 }


I only did a small, not very interesting benchmark. I compiled the kernel
with 2.3.12 and 2.3.12-lru and these are the numbers:

2.3.12:
real    3m0.974s
user    3m22.400s
sys     0m16.350s

2.3.12-lru:
real    2m58.483s
user    3m23.350s
sys     0m15.920s

NOTE: I have 128 MB of RAM, so the kernel almost fits in cache during the
compile and there isn't high I/O activity, so I never expected even a
two-second improvement...

Anyway, I have many old numbers from my 2.2.x LRU patches that show a huge
improvement (and in 2.2.x my shrink_mmap wasn't SMP-threaded...).

Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/


* Re: [patch] minimal page-LRU
  1999-08-04 11:33 [patch] minimal page-LRU Andrea Arcangeli
@ 1999-08-04 11:50 ` Ingo Molnar
  1999-08-04 17:07 ` Andrea Arcangeli
  1 sibling, 0 replies; 4+ messages in thread
From: Ingo Molnar @ 1999-08-04 11:50 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Linus Torvalds, MOLNAR Ingo, David S. Miller, linux-kernel, linux-mm

On Wed, 4 Aug 1999, Andrea Arcangeli wrote:

> I only did a small, not very interesting benchmark. I compiled the kernel
> with 2.3.12 and 2.3.12-lru and these are the numbers:
> 
> 2.3.12:
> real    3m0.974s
> user    3m22.400s
> sys     0m16.350s
> 
> 2.3.12-lru:
> real    2m58.483s
> user    3m23.350s
> sys     0m15.920s
> 
> NOTE: I have 128 MB of RAM, so the kernel almost fits in cache during the
> compile and there isn't high I/O activity, so I never expected even a
> two-second improvement...

even two seconds can be statistical noise (e.g. look at the user-time
numbers, which increased by one second). But it's not so hard to test
high-intensity VM load with kernel compiles. This method is from Davem:
compile the kernel with 'make -jN', where N = 1, 2, 3... increasing. [Also
put 'make -jN' into the top Makefile.] At around N=6 or so you'll fall out
of core with 128M RAM. This is both a good stability and a good performance
test. You can even automate it. This way you'll see the effect on both
'normal' (cached) and 'high load' (swapping) situations. Such a list of
numbers is a much more reliable measurement of performance than just one
arbitrary number.

-- mingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/


* Re: [patch] minimal page-LRU
  1999-08-04 11:33 [patch] minimal page-LRU Andrea Arcangeli
  1999-08-04 11:50 ` Ingo Molnar
@ 1999-08-04 17:07 ` Andrea Arcangeli
  1999-08-04 18:05   ` Andrea Arcangeli
  1 sibling, 1 reply; 4+ messages in thread
From: Andrea Arcangeli @ 1999-08-04 17:07 UTC (permalink / raw)
  To: Linus Torvalds, MOLNAR Ingo, David S. Miller; +Cc: linux-kernel, linux-mm

On Wed, 4 Aug 1999, Andrea Arcangeli wrote:

>tried to stay in sync as much as possible with the current 2.3.12 VM. The
>patch should be as _short_ as possible, and it should be as safe as the
>2.3.12 VM is.

Continuing to think about the code, I just found a race condition between
the page-LRU shrink_mmap and truncate_inode_pages() ;).

The SMP race looks like this:

	shrink_mmap cpu 0			truncate_inode_pages cpu 1
	-----------------			--------------------------
	- remove page from lru list
	- release lru lock
						- grab pagecache_lock
						- increase page count
						- lockdown the page
						- flushpage and remove
						  from lru list, but the
						  page was no longer on
						  any lru list!!

To fix this race I simply must avoid releasing the pagemap_lru lock

	if (the page is not queued in any lru list &&
	    I have not yet locked down the page)

Once I have locked down the page I can also safely release the lru lock
even if the page is not yet queued in any lru list, since
truncate_inode_pages() will wait for me in lock_page() (and if
truncate_inode_pages() is waiting for me, I'll give up at the page_count()
check, which is done atomically with the pagecache_lock held).

I also need to access the local lru queues with the spinlock held, since
truncate_inode_pages() may remove a page from my local lru queue while I am
working on the global lru lists.
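
The locking rule can also be seen in isolation with another userspace toy
(pthread mutexes standing in for the kernel spinlock and the page lock; all
the names here are invented for the sketch, it is not the patched code):
because the page is locked before the lru lock is dropped, the truncate
path, which always takes the page lock before doing its lru_cache_del(),
simply serializes behind shrink_mmap and can never see the page half
removed from the list.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
static int on_lru = 1;

/* stand-in for the lru_cache_del() run by truncate_inode_pages() */
static void *truncate_path(void *unused)
{
	pthread_mutex_lock(&page_lock);		/* truncate locks the page first */
	pthread_mutex_lock(&lru_lock);
	if (on_lru) {
		on_lru = 0;
		puts("truncate: removed page from the lru");
	} else {
		puts("truncate: page already off the lru");
	}
	pthread_mutex_unlock(&lru_lock);
	pthread_mutex_unlock(&page_lock);
	return unused;
}

int main(void)
{
	pthread_t t;

	/* shrink_mmap side */
	pthread_mutex_lock(&lru_lock);
	on_lru = 0;				/* list_del(page_lru) */
	if (pthread_mutex_trylock(&page_lock) == 0) {
		/* page locked: only now is dropping the lru lock safe */
		pthread_mutex_unlock(&lru_lock);
		pthread_create(&t, NULL, truncate_path, NULL);
		/* ... try to free the page ... */
		pthread_mutex_unlock(&page_lock);
		pthread_join(t, NULL);
	} else {
		/* could not lock it: requeue before dropping the lru lock */
		on_lru = 1;
		pthread_mutex_unlock(&lru_lock);
	}
	return 0;
}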

This fix is incremental to the minimal LRU patch against 2.3.12 that I
posted some hours ago.

--- 2.3.12-lru1/mm/filemap.c	Wed Aug  4 18:35:59 1999
+++ 2.3.12-lru/mm/filemap.c	Wed Aug  4 18:37:51 1999
@@ -238,35 +238,42 @@
 		page = list_entry(page_lru, struct page, lru);
 		list_del(page_lru);
 
+		dispose = lru;
 		if (test_and_clear_bit(PG_referenced, &page->flags))
-		{
 			/* Roll the page at the top of the lru list,
 			 * we could also be more aggressive putting
 			 * the page in the young-dispose-list, so
 			 * avoiding to free young pages in each pass.
 			 */
-			list_add(page_lru, lru);
-			continue;
-		}
-		spin_unlock(&pagemap_lru_lock);
+			goto dispose_continue;
 
 		dispose = &old;
+		/* don't account passes over not DMA pages */
 		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
 			goto dispose_continue;
 
 		(*count)--;
 
 		dispose = &young;
+		if (TryLockPage(page))
+			goto dispose_continue;
+
+		/* Release the pagemap_lru lock even if the page is not yet
+		   queued in any lru queue since we have just locked down
+		   the page so nobody else may SMP race with us running
+		   a lru_cache_del() (lru_cache_del() always run with the
+		   page locked down ;). */
+		spin_unlock(&pagemap_lru_lock);
+
 		/* avoid unscalable SMP locking */
 		if (!page->buffers && page_count(page) > 1)
-			goto dispose_continue;
+			goto unlock_noput_continue;
 
+		/* Take the pagecache_lock spinlock held to avoid
+		   other tasks to notice the page while we are looking at its
+		   page count. If it's a pagecache-page we'll free it
+		   in one atomic transaction after checking its page count. */
 		spin_lock(&pagecache_lock);
-		if (TryLockPage(page))
-		{
-			spin_unlock(&pagecache_lock);
-			goto dispose_continue;
-		}
 
 		/* avoid freeing the page while it's locked */
 		get_page(page);
@@ -326,11 +333,21 @@
 unlock_continue:
 		UnlockPage(page);
 		put_page(page);
+dispose_relock_continue:
+		/* even if the dispose list is local, a truncate_inode_page()
+		   may remove a page from its queue so always
+		   synchronize with the lru lock while accesing the
+		   page->lru field */
+		spin_lock(&pagemap_lru_lock);
+		list_add(page_lru, dispose);
+		continue;
+
+unlock_noput_continue:
+		UnlockPage(page);
+		goto dispose_relock_continue;
+
 dispose_continue:
-		/* no need of the spinlock to play with the
-		   local dispose lists */
 		list_add(page_lru, dispose);
-		spin_lock(&pagemap_lru_lock);
 	}
 	goto out;
 

Andrea

PS. I also added some more comments ;).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/


* Re: [patch] minimal page-LRU
  1999-08-04 17:07 ` Andrea Arcangeli
@ 1999-08-04 18:05   ` Andrea Arcangeli
  0 siblings, 0 replies; 4+ messages in thread
From: Andrea Arcangeli @ 1999-08-04 18:05 UTC (permalink / raw)
  To: Linus Torvalds, MOLNAR Ingo, David S. Miller; +Cc: linux-kernel, linux-mm

On Wed, 4 Aug 1999, Andrea Arcangeli wrote:

>Continuing to think about the code, I just found a race condition between
>the page-LRU shrink_mmap and truncate_inode_pages() ;).

I did some more profiling and I noticed a performance problem.

I had to join the two global page-LRU list headers into one, because in the
LRU patch I am proposing for inclusion I removed the code that takes pages
mapped in process space (and thus unfreeable by shrink_mmap) out of the LRU
list (it was increasing the complexity of the code a lot, now that
everything is SMP-threaded).

So with the current code, which still uses the double LRU lists but cannot
skip the mapped pages, if there were 10 MB of swap cache mapped in process
space I would end up trying to free those 10 MB of memory on every
shrink_mmap pass... not good. NOTE: this performance problem can only
trigger with swap cache mapped into process space (I couldn't notice it
with the kernel compile bench).

The reason I was using two lists is that shrinking the swap cache before
touching the page cache preserves the working set _far_ better.

So here is a straightforward patch (incremental to the second one I posted
today) to keep all pages queued on one and the same LRU list. These are
only details; the core of the code doesn't change at all:

diff -ur 2.3.12-lruswap/fs/buffer.c 2.3.12-lru/fs/buffer.c
--- 2.3.12-lruswap/fs/buffer.c	Wed Aug  4 19:17:22 1999
+++ 2.3.12-lru/fs/buffer.c	Wed Aug  4 19:24:08 1999
@@ -1953,7 +1953,7 @@
 
 	page_map = mem_map + MAP_NR(page);
 	page_map->buffers = bh;
-	lru_cache_add(page_map, 0);
+	lru_cache_add(page_map);
 	atomic_add(PAGE_SIZE, &buffermem);
 	return 1;
 }
diff -ur 2.3.12-lruswap/include/linux/swap.h 2.3.12-lru/include/linux/swap.h
--- 2.3.12-lruswap/include/linux/swap.h	Wed Aug  4 19:17:22 1999
+++ 2.3.12-lru/include/linux/swap.h	Wed Aug  4 19:15:26 1999
@@ -65,7 +65,7 @@
 extern int nr_swap_pages;
 extern int nr_free_pages;
 extern int nr_lru_pages;
-extern struct list_head lru_cache, lru_swap_cache;
+extern struct list_head lru_cache;
 extern atomic_t nr_async_pages;
 extern struct inode swapper_inode;
 extern atomic_t page_cache_size;
@@ -168,12 +168,12 @@
 /*
  * Helper macros for lru_pages handling.
  */
-#define	lru_cache_add(page, swap_cache)					     \
-do {									     \
-	spin_lock(&pagemap_lru_lock);					     \
-	list_add(&(page)->lru, !(swap_cache) ? &lru_cache:&lru_swap_cache); \
-	nr_lru_pages++;							     \
-	spin_unlock(&pagemap_lru_lock);					     \
+#define	lru_cache_add(page)			\
+do {						\
+	spin_lock(&pagemap_lru_lock);		\
+	list_add(&(page)->lru, &lru_cache);	\
+	nr_lru_pages++;				\
+	spin_unlock(&pagemap_lru_lock);		\
 } while (0)
 
 #define	lru_cache_del(page)			\
diff -ur 2.3.12-lruswap/mm/filemap.c 2.3.12-lru/mm/filemap.c
--- 2.3.12-lruswap/mm/filemap.c	Wed Aug  4 19:17:23 1999
+++ 2.3.12-lru/mm/filemap.c	Wed Aug  4 19:23:14 1999
@@ -223,22 +223,25 @@
 
 extern atomic_t too_many_dirty_buffers;
 
-static inline int shrink_mmap_lru(struct list_head * lru, int * count,
-				  int gfp_mask)
+int shrink_mmap(int priority, int gfp_mask)
 {
-	int ret = 0;
+	int ret = 0, count;
 	LIST_HEAD(young);
 	LIST_HEAD(old);
 	LIST_HEAD(forget);
 	struct list_head * page_lru, * dispose;
 	struct page * page;
 
-	while (*count > 0 && (page_lru = lru->prev) != lru)
+	count = nr_lru_pages / (priority+1);
+
+	spin_lock(&pagemap_lru_lock);
+
+	while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache)
 	{
 		page = list_entry(page_lru, struct page, lru);
 		list_del(page_lru);
 
-		dispose = lru;
+		dispose = &lru_cache;
 		if (test_and_clear_bit(PG_referenced, &page->flags))
 			/* Roll the page at the top of the lru list,
 			 * we could also be more aggressive putting
@@ -252,7 +255,7 @@
 		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
 			goto dispose_continue;
 
-		(*count)--;
+		count--;
 
 		dispose = &young;
 		if (TryLockPage(page))
@@ -362,23 +365,8 @@
 	nr_lru_pages--;
 
 out:
-	list_splice(&young, lru);
-	list_splice(&old, lru->prev);
-
-	return ret;
-}
-
-int shrink_mmap(int priority, int gfp_mask)
-{
-	int ret = 0, count, i;
-	struct list_head * lru[2] = { &lru_swap_cache, &lru_cache, };
-
-	count = nr_lru_pages / (priority+1);
-
-	spin_lock(&pagemap_lru_lock);
-
-	for (i=0; count > 0 && !ret && i<2; i++)
-		ret = shrink_mmap_lru(lru[i], &count, gfp_mask);
+	list_splice(&young, &lru_cache);
+	list_splice(&old, lru_cache.prev);
 
 	spin_unlock(&pagemap_lru_lock);
 
@@ -506,7 +494,7 @@
 	page->offset = offset;
 	add_page_to_inode_queue(inode, page);
 	__add_page_to_hash_queue(page, hash);
-	lru_cache_add(page, PageSwapCache(page));
+	lru_cache_add(page);
 }
 
 void add_to_page_cache(struct page * page, struct inode * inode, unsigned long offset)
diff -ur 2.3.12-lruswap/mm/page_alloc.c 2.3.12-lru/mm/page_alloc.c
--- 2.3.12-lruswap/mm/page_alloc.c	Wed Aug  4 19:17:23 1999
+++ 2.3.12-lru/mm/page_alloc.c	Wed Aug  4 19:16:33 1999
@@ -22,7 +22,6 @@
 int nr_free_pages = 0;
 int nr_lru_pages;
 LIST_HEAD(lru_cache);
-LIST_HEAD(lru_swap_cache);
 
 /*
  * Free area management



Now the profile numbers are fine. With 6 MB of swap cache mapped in memory
I am running `cat /usr/bin/* /usr/X11R6/bin/* >/dev/null` and a
`cp /dev/hda /dev/null` in the background, and these are the profiling
numbers I get:

andrea@laser:~ > readprofile -m /System.map |sort -nr | head -20
 12536 total                                      0.0193
 12236 cpu_idle                                 145.6667
    62 file_read_actor                            0.8158
    21 ide_set_handler                            0.2917
    16 startup_32                                 0.0976
    12 try_to_free_buffers                        0.0246
    12 kmem_cache_free                            0.0280
    12 do_rw_disk                                 0.0175
    10 ide_do_request                             0.0072
     8 do_generic_file_read                       0.0036
     7 shrink_mmap                                0.0068
     6 free_pages                                 0.0174
     6 block_read_full_page                       0.0118
     5 schedule                                   0.0034
     5 remove_page_from_inode_queue               0.0833
     5 ide_wait_stat                              0.0231
     5 ide_dmaproc                                0.0137
     5 get_unused_buffer_head                     0.0192
     5 get_hash_table                             0.0291
     5 ext2_get_block                             0.0037

As you can see try_to_free_buffers is eating far more CPU than
shrink_mmap.

I also uploaded a complete patch against clean 2.3.12 that merges all three
patches here; if somebody could try it out, please send feedback ;).

	ftp://e-mind.com/pub/andrea/kernel-patches/2.3.12/page_lru-2.3.12-H
	ftp://ftp.suse.com/pub/people/andrea/kernel-patches/2.3.12/page_lru-2.3.12-H

Andrea

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://humbolt.geo.uu.nl/Linux-MM/

