* [PATCH] VM patch 3 for -ac7
From: Rik van Riel @ 2000-06-03 15:17 UTC
To: Alan Cox; +Cc: yoann, gandalf, linux-mm, linux-kernel
Hi,
this patch (against 2.4.0-test1-ac7) fixes the last balancing
problems with the virtual memory subsystem. It adds two negative
feedback loops: one in __alloc_pages, to make sure kswapd is woken
up often enough but not too often, and another in
do_try_to_free_pages, to balance the amount of memory freed against
the number of pages unmapped to "generate" more freeable memory.
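To illustrate how the two loops behave, here is a rough user-space
toy (illustration only; kswapd_pause and FREE_COUNT mirror the patch
below, the helper names and the simplified conditions are made up):

/*
 * Toy simulation of the two negative feedback loops.  The real
 * conditions in the patch are a bit more involved; this just shows
 * the direction the numbers move in.  Build with: cc -o toy toy.c
 */
#include <stdio.h>

#define FREE_COUNT	8

/* Loop 1: adapt the pause between kswapd wakeups. */
static int kswapd_pause = 100;		/* the patch starts this at HZ */

static void alloc_result(int all_zones_were_low)
{
	kswapd_pause++;			/* stretch the pause a little each time */
	if (all_zones_were_low)
		kswapd_pause /= 2;	/* every zone hit pages_min: wake kswapd more often */
}

/* Loop 2: unmap pages in proportion to how short of free pages we still are. */
static int toy_try_to_free(int freed_per_pass)
{
	int count = FREE_COUNT;
	int swap_count = 0;
	int priority;

	for (priority = 6; priority >= 0; priority--) {
		count -= freed_per_pass;	/* pretend shrink_mmap() freed this many */
		if (count <= 0)
			return 1;
		swap_count += count;		/* the bigger the shortfall, the more we unmap */
		printf("priority %d: still short %d pages, will unmap up to %d\n",
		       priority, count, swap_count);
	}
	return 0;
}

int main(void)
{
	alloc_result(0);
	alloc_result(1);
	printf("kswapd_pause is now %d ticks\n", kswapd_pause);
	toy_try_to_free(1);
	return 0;
}

The actual kernel versions of both loops are in the patch below.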
This one seems to really work, but of course I'm interested in
feedback ;)
regards,
Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.
Wanna talk about the kernel? irc.openprojects.net / #kernelnewbies
http://www.conectiva.com/ http://www.surriel.com/
--- linux-2.4.0-t1-ac7/fs/buffer.c.orig Thu Jun 1 10:37:59 2000
+++ linux-2.4.0-t1-ac7/fs/buffer.c Thu Jun 1 14:51:14 2000
@@ -1868,6 +1868,7 @@
}
spin_unlock(&unused_list_lock);
+ wake_up(&buffer_wait);
return iosize;
}
@@ -2004,6 +2005,8 @@
__put_unused_buffer_head(bh[bhind]);
}
spin_unlock(&unused_list_lock);
+ wake_up(&buffer_wait);
+
goto finished;
}
@@ -2181,6 +2184,12 @@
}
/*
+ * Can the buffer be thrown out?
+ */
+#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
+#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
+
+/*
* Sync all the buffers on one page..
*
* If we have old buffers that are locked, we'll
@@ -2190,7 +2199,7 @@
* This all is required so that we can free up memory
* later.
*/
-static void sync_page_buffers(struct buffer_head *bh, int wait)
+static int sync_page_buffers(struct buffer_head *bh, int wait)
{
struct buffer_head * tmp = bh;
@@ -2203,13 +2212,17 @@
} else if (buffer_dirty(p))
ll_rw_block(WRITE, 1, &p);
} while (tmp != bh);
-}
-/*
- * Can the buffer be thrown out?
- */
-#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
-#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
+ do {
+ struct buffer_head *p = tmp;
+ tmp = tmp->b_this_page;
+ if (buffer_busy(p))
+ return 0;
+ } while (tmp != bh);
+
+ /* Success. Now try_to_free_buffers can free the page. */
+ return 1;
+}
/*
* try_to_free_buffers() checks if all the buffers on this particular page
@@ -2227,6 +2240,7 @@
struct buffer_head * tmp, * bh = page->buffers;
int index = BUFSIZE_INDEX(bh->b_size);
+again:
spin_lock(&lru_list_lock);
write_lock(&hash_table_lock);
spin_lock(&free_list[index].lock);
@@ -2272,7 +2286,8 @@
spin_unlock(&free_list[index].lock);
write_unlock(&hash_table_lock);
spin_unlock(&lru_list_lock);
- sync_page_buffers(bh, wait);
+ if (sync_page_buffers(bh, wait))
+ goto again;
return 0;
}
--- linux-2.4.0-t1-ac7/mm/vmscan.c.orig Wed May 31 14:08:50 2000
+++ linux-2.4.0-t1-ac7/mm/vmscan.c Sat Jun 3 10:29:54 2000
@@ -439,12 +439,12 @@
* latency.
*/
#define FREE_COUNT 8
-#define SWAP_COUNT 16
static int do_try_to_free_pages(unsigned int gfp_mask)
{
int priority;
int count = FREE_COUNT;
- int swap_count;
+ int swap_count = 0;
+ int ret = 0;
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
@@ -452,6 +452,7 @@
priority = 64;
do {
while (shrink_mmap(priority, gfp_mask)) {
+ ret = 1;
if (!--count)
goto done;
}
@@ -466,9 +467,12 @@
*/
count -= shrink_dcache_memory(priority, gfp_mask);
count -= shrink_icache_memory(priority, gfp_mask);
- if (count <= 0)
+ if (count <= 0) {
+ ret = 1;
goto done;
+ }
while (shm_swap(priority, gfp_mask)) {
+ ret = 1;
if (!--count)
goto done;
}
@@ -480,24 +484,30 @@
* This will not actually free any pages (they get
* put in the swap cache), so we must not count this
* as a "count" success.
+ *
+ * The amount we page out is the amount of pages we're
+ * short freeing, amplified by the number of times we
+ * failed above. This generates a negative feedback loop:
+ * the more difficult it was to free pages, the easier we
+ * will make it.
*/
- swap_count = SWAP_COUNT;
- while (swap_out(priority, gfp_mask))
+ swap_count += count;
+ while (swap_out(priority, gfp_mask)) {
if (--swap_count < 0)
break;
+ }
} while (--priority >= 0);
/* Always end on a shrink_mmap.. */
while (shrink_mmap(0, gfp_mask)) {
+ ret = 1;
if (!--count)
goto done;
}
- /* We return 1 if we are freed some page */
- return (count != FREE_COUNT);
done:
- return 1;
+ return ret;
}
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
--- linux-2.4.0-t1-ac7/mm/page_alloc.c.orig Wed May 31 14:08:50 2000
+++ linux-2.4.0-t1-ac7/mm/page_alloc.c Fri Jun 2 15:29:21 2000
@@ -222,6 +222,9 @@
{
zone_t **zone = zonelist->zones;
extern wait_queue_head_t kswapd_wait;
+ static int last_woke_kswapd;
+ static int kswapd_pause = HZ;
+ int gfp_mask = zonelist->gfp_mask;
/*
* (If anyone calls gfp from interrupts nonatomically then it
@@ -248,14 +251,28 @@
}
}
- /* All zones are in need of kswapd. */
- if (waitqueue_active(&kswapd_wait))
+ /*
+ * Kswapd should be freeing enough memory to satisfy all allocations
+ * immediately. Calling try_to_free_pages from processes will slow
+ * down the system a lot. On the other hand, waking up kswapd too
+ * often means wasted memory and cpu time.
+ *
+ * We tune the kswapd pause interval in such a way that kswapd is
+ * always just aggressive enough to free the amount of memory we
+ * want freed.
+ */
+ if (waitqueue_active(&kswapd_wait) &&
+ time_after(jiffies, last_woke_kswapd + kswapd_pause)) {
+ kswapd_pause++;
+ last_woke_kswapd = jiffies;
wake_up_interruptible(&kswapd_wait);
+ }
/*
* Ok, we don't have any zones that don't need some
* balancing.. See if we have any that aren't critical..
*/
+again:
zone = zonelist->zones;
for (;;) {
zone_t *z = *(zone++);
@@ -267,16 +284,29 @@
z->low_on_memory = 1;
if (page)
return page;
+ } else {
+ if (kswapd_pause > 0)
+ kswapd_pause--;
}
}
+ /* We didn't kick kswapd often enough... */
+ kswapd_pause /= 2;
+ if (waitqueue_active(&kswapd_wait))
+ wake_up_interruptible(&kswapd_wait);
+ /* If we're low priority, we just wait a bit and try again later. */
+ if ((gfp_mask & __GFP_WAIT) && current->need_resched &&
+ current->state == TASK_RUNNING) {
+ schedule();
+ goto again;
+ }
+
/*
* Uhhuh. All the zones have been critical, which means that
* we'd better do some synchronous swap-out. kswapd has not
* been able to cope..
*/
if (!(current->flags & PF_MEMALLOC)) {
- int gfp_mask = zonelist->gfp_mask;
if (!try_to_free_pages(gfp_mask)) {
if (!(gfp_mask & __GFP_HIGH))
goto fail;
@@ -303,7 +333,6 @@
zone = zonelist->zones;
for (;;) {
zone_t *z = *(zone++);
- int gfp_mask = zonelist->gfp_mask;
if (!z)
break;
if (z->free_pages > z->pages_min) {
--- linux-2.4.0-t1-ac7/mm/filemap.c.orig Wed May 31 14:08:50 2000
+++ linux-2.4.0-t1-ac7/mm/filemap.c Fri Jun 2 15:42:25 2000
@@ -334,13 +334,6 @@
count--;
/*
- * Page is from a zone we don't care about.
- * Don't drop page cache entries in vain.
- */
- if (page->zone->free_pages > page->zone->pages_high)
- goto dispose_continue;
-
- /*
* Avoid unscalable SMP locking for pages we can
* immediate tell are untouchable..
*/
@@ -375,6 +368,13 @@
}
}
+ /*
+ * Page is from a zone we don't care about.
+ * Don't drop page cache entries in vain.
+ */
+ if (page->zone->free_pages > page->zone->pages_high)
+ goto unlock_continue;
+
/* Take the pagecache_lock spinlock held to avoid
other tasks to notice the page while we are looking at its
page count. If it's a pagecache-page we'll free it
@@ -400,8 +400,15 @@
goto made_inode_progress;
}
/* PageDeferswap -> we swap out the page now. */
- if (gfp_mask & __GFP_IO)
- goto async_swap_continue;
+ if (gfp_mask & __GFP_IO) {
+ spin_unlock(&pagecache_lock);
+ /* Do NOT unlock the page ... brw_page does. */
+ ClearPageDirty(page);
+ rw_swap_page(WRITE, page, 0);
+ spin_lock(&pagemap_lru_lock);
+ page_cache_release(page);
+ goto dispose_continue;
+ }
goto cache_unlock_continue;
}
@@ -422,14 +429,6 @@
unlock_continue:
spin_lock(&pagemap_lru_lock);
UnlockPage(page);
- page_cache_release(page);
- goto dispose_continue;
-async_swap_continue:
- spin_unlock(&pagecache_lock);
- /* Do NOT unlock the page ... that is done after IO. */
- ClearPageDirty(page);
- rw_swap_page(WRITE, page, 0);
- spin_lock(&pagemap_lru_lock);
page_cache_release(page);
dispose_continue:
list_add(page_lru, &lru_cache);
--- linux-2.4.0-t1-ac7/include/linux/swap.h.orig Wed May 31 21:00:06 2000
+++ linux-2.4.0-t1-ac7/include/linux/swap.h Thu Jun 1 11:51:25 2000
@@ -166,7 +166,7 @@
* The 2.4 code, however, is mostly simple and stable ;)
*/
#define PG_AGE_MAX 64
-#define PG_AGE_START 5
+#define PG_AGE_START 2
#define PG_AGE_ADV 3
#define PG_AGE_DECL 1
* Re: [PATCH] VM patch 3 for -ac7
From: Zlatko Calusic @ 2000-06-04 17:46 UTC
To: Rik van Riel; +Cc: linux-mm, linux-kernel
Hi, Rik!
I tested all versions of your autotune patch (1-3) and am mostly
satisfied with the direction of the development. But still, I have
some objections and lots of questions. :)
First, something that has been bothering me for a long time now (as
2.3.42 recedes further into the past; I've picked that kernel version
to represent code that doesn't exhibit this particular bad behaviour):
Bulk I/O is performing terribly. Spurious swapping is killing us as we
read big chunks of data from disk. The ext2 optimizations and
anti-fragmentation code are now officially obsolete, because they never
get a chance to take effect. For example, check the following
chunk of "vmstat 1" output:
0 0 0 6976 85140 232 4772 0 0 0 0 101 469 0 0 99
0 1 0 6976 74532 244 15284 0 0 2638 7 290 802 1 4 94
1 0 0 6976 59028 260 30772 0 0 3876 0 347 892 0 5 94
0 1 0 6976 43012 276 46772 0 0 4004 0 356 779 0 5 95
1 0 0 6976 26964 292 62900 0 0 4036 0 355 918 0 6 93
0 1 0 6976 10852 308 78900 0 0 4004 0 355 931 0 5 94
procs memory swap io system cpu
r b w swpd free buff cache si so bi bo in cs us sy id
2 0 0 7304 3128 184 89120 0 56 2978 26 305 780 1 14 85
1 0 1 8084 2916 156 90320 0 220 2659 55 306 764 0 18 82
0 2 0 9448 2112 168 92236 104 312 1873 78 281 790 0 11 88
0 2 0 9916 2656 180 92016 264 212 795 53 199 465 0 4 96
0 1 1 10340 2956 192 92024 0 288 2175 72 268 727 1 10 89
0 2 0 10460 1936 204 92928 24 308 2588 77 296 804 1 6 93
0 1 0 10772 2028 208 93080 16 456 1706 114 252 648 0 8 92
1 0 1 10824 2900 204 92232 0 556 2402 139 298 784 0 5 94
0 2 0 10868 2036 192 93124 24 140 2767 35 301 844 0 9 91
0 2 0 11080 1944 192 93460 16 104 2526 26 286 836 0 6 94
0 1 0 11620 2604 192 93220 4 88 2553 22 277 760 0 10 90
0 1 0 11816 2164 196 93844 0 264 2620 66 292 792 0 9 91
0 2 0 12084 1840 204 94320 80 196 1416 49 232 567 0 5 95
0 1 0 12084 1708 216 94352 240 0 1467 0 219 676 0 1 98
At time T (top of the output), I started reading a big file from the
4k ext2 FS. The machine is completely idle and, as you can see, has
lots of memory free. Before memory fills up (the first few lines),
you can also see that data is coming in at a 16MB/sec pace (bi ~ 4000),
which is _exactly_ the available (and expected) bandwidth.
And *then* we get into trouble. The VM kicks in and starts swapping in
and out at will. The disk heads start thrashing, with sounds similar to
the ones heard when running netscape on a 16MB machine. Of course, the
reading speed drops drastically, and in the end we finish 10 seconds
later than expected. I/O bandwidth is effectively halved, and why?
Because we enlarged the page cache from an already perfectly adequate
90MB (!) to 95MB (by 5%!), and to get that pissy 5MB we were swapping
out like mad, then processes started recollecting their pages back from
disk, then all over again...
Now the question is: is such behaviour expected, or will it get
fixed before the final 2.4.0?
I'm worried that we are going to release the ultimate swapping machine
and say to people: here is the new and great stable kernel! Watch it
swap and never stop. :)
What especially bothers me is that nobody sees the problem. Everybody
is talking about the kernel getting better and better, how things are
stabilizing and response times are improving, but I see new releases
getting worse and worse, with performance going down the drain. Tell me
that I'm an idiot, that the system is supposed to swap all the time,
and then maybe I'll stop bitching. But not before. :)
Second thing: around two years ago, IIRC, Linus and people on this
group decided that we don't need page aging, that it only kills
performance, and so the code was removed, never to be seen again. I
wasn't so sure at the time that it was such a good idea, but when
Andrea's LRU page management went in, I became very satisfied with our
page replacement policies.
Obviously, with zoned memory we got into trouble once again, and now,
as a solution, you're putting page aging back into the kernel. Could
you tell us your reasons? What has changed in the meantime, so that we
didn't need page aging two years ago but need it now?
I didn't see any improvement (as the vmstat output above shows). Yes,
I'm well aware of what you're _trying_ to achieve, but in reality
you've just added lots of logic to the kernel and accomplished nothing.
Not to mention we're back to 2.1.something and history is repeating
itself. :(
Gratuitously adding untested code this far into development doesn't
look like a very good idea to me.
In the end, I hope nobody sees this rather long complaint as an
attack, but rather as a call for a debate on how we could improve the
kernel and hopefully get 2.4.0 out sooner. A 2.4.0 we'll be proud of.
Regards,
--
Zlatko