* [PATCH] VM patch 3 for -ac7
From: Rik van Riel @ 2000-06-03 15:17 UTC
To: Alan Cox; +Cc: yoann, gandalf, linux-mm, linux-kernel
Hi,
this patch (against 2.4.0-test1-ac7) fixes the last balancing
problems with the virtual memory subsystem. It adds two negative
feedback loops: one in __alloc_pages, to make sure kswapd is woken
up often enough but not too often, and another in
do_try_to_free_pages, to balance the amount of memory freed against
the number of pages unmapped to "generate" more freeable memory.
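To illustrate how the two loops behave, here is a rough user-space
toy (illustration only; kswapd_pause and FREE_COUNT mirror the patch
below, the helper names and the simplified conditions are made up):

/*
 * Toy simulation of the two negative feedback loops.  The real
 * conditions in the patch are a bit more involved; this just shows
 * the direction the numbers move in.  Build with: cc -o toy toy.c
 */
#include <stdio.h>

#define FREE_COUNT	8

/* Loop 1: adapt the pause between kswapd wakeups. */
static int kswapd_pause = 100;		/* the patch starts this at HZ */

static void alloc_result(int all_zones_were_low)
{
	kswapd_pause++;			/* stretch the pause a little each time */
	if (all_zones_were_low)
		kswapd_pause /= 2;	/* every zone hit pages_min: wake kswapd more often */
}

/* Loop 2: unmap pages in proportion to how short of free pages we still are. */
static int toy_try_to_free(int freed_per_pass)
{
	int count = FREE_COUNT;
	int swap_count = 0;
	int priority;

	for (priority = 6; priority >= 0; priority--) {
		count -= freed_per_pass;	/* pretend shrink_mmap() freed this many */
		if (count <= 0)
			return 1;
		swap_count += count;		/* the bigger the shortfall, the more we unmap */
		printf("priority %d: still short %d pages, will unmap up to %d\n",
		       priority, count, swap_count);
	}
	return 0;
}

int main(void)
{
	alloc_result(0);
	alloc_result(1);
	printf("kswapd_pause is now %d ticks\n", kswapd_pause);
	toy_try_to_free(1);
	return 0;
}

The actual kernel versions of both loops are in the patch below.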
This one seems to really work, but of course I'm interested in
feedback ;)
regards,
Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.
Wanna talk about the kernel? irc.openprojects.net / #kernelnewbies
http://www.conectiva.com/ http://www.surriel.com/
--- linux-2.4.0-t1-ac7/fs/buffer.c.orig Thu Jun 1 10:37:59 2000
+++ linux-2.4.0-t1-ac7/fs/buffer.c Thu Jun 1 14:51:14 2000
@@ -1868,6 +1868,7 @@
}
spin_unlock(&unused_list_lock);
+ wake_up(&buffer_wait);
return iosize;
}
@@ -2004,6 +2005,8 @@
__put_unused_buffer_head(bh[bhind]);
}
spin_unlock(&unused_list_lock);
+ wake_up(&buffer_wait);
+
goto finished;
}
@@ -2181,6 +2184,12 @@
}
/*
+ * Can the buffer be thrown out?
+ */
+#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
+#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
+
+/*
* Sync all the buffers on one page..
*
* If we have old buffers that are locked, we'll
@@ -2190,7 +2199,7 @@
* This all is required so that we can free up memory
* later.
*/
-static void sync_page_buffers(struct buffer_head *bh, int wait)
+static int sync_page_buffers(struct buffer_head *bh, int wait)
{
struct buffer_head * tmp = bh;
@@ -2203,13 +2212,17 @@
} else if (buffer_dirty(p))
ll_rw_block(WRITE, 1, &p);
} while (tmp != bh);
-}
-/*
- * Can the buffer be thrown out?
- */
-#define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
-#define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
+ do {
+ struct buffer_head *p = tmp;
+ tmp = tmp->b_this_page;
+ if (buffer_busy(p))
+ return 0;
+ } while (tmp != bh);
+
+ /* Success. Now try_to_free_buffers can free the page. */
+ return 1;
+}
/*
* try_to_free_buffers() checks if all the buffers on this particular page
@@ -2227,6 +2240,7 @@
struct buffer_head * tmp, * bh = page->buffers;
int index = BUFSIZE_INDEX(bh->b_size);
+again:
spin_lock(&lru_list_lock);
write_lock(&hash_table_lock);
spin_lock(&free_list[index].lock);
@@ -2272,7 +2286,8 @@
spin_unlock(&free_list[index].lock);
write_unlock(&hash_table_lock);
spin_unlock(&lru_list_lock);
- sync_page_buffers(bh, wait);
+ if (sync_page_buffers(bh, wait))
+ goto again;
return 0;
}
--- linux-2.4.0-t1-ac7/mm/vmscan.c.orig Wed May 31 14:08:50 2000
+++ linux-2.4.0-t1-ac7/mm/vmscan.c Sat Jun 3 10:29:54 2000
@@ -439,12 +439,12 @@
* latency.
*/
#define FREE_COUNT 8
-#define SWAP_COUNT 16
static int do_try_to_free_pages(unsigned int gfp_mask)
{
int priority;
int count = FREE_COUNT;
- int swap_count;
+ int swap_count = 0;
+ int ret = 0;
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
@@ -452,6 +452,7 @@
priority = 64;
do {
while (shrink_mmap(priority, gfp_mask)) {
+ ret = 1;
if (!--count)
goto done;
}
@@ -466,9 +467,12 @@
*/
count -= shrink_dcache_memory(priority, gfp_mask);
count -= shrink_icache_memory(priority, gfp_mask);
- if (count <= 0)
+ if (count <= 0) {
+ ret = 1;
goto done;
+ }
while (shm_swap(priority, gfp_mask)) {
+ ret = 1;
if (!--count)
goto done;
}
@@ -480,24 +484,30 @@
* This will not actually free any pages (they get
* put in the swap cache), so we must not count this
* as a "count" success.
+ *
+ * The amount we page out is the amount of pages we're
+ * short freeing, amplified by the number of times we
+ * failed above. This generates a negative feedback loop:
+ * the more difficult it was to free pages, the easier we
+ * will make it.
*/
- swap_count = SWAP_COUNT;
- while (swap_out(priority, gfp_mask))
+ swap_count += count;
+ while (swap_out(priority, gfp_mask)) {
if (--swap_count < 0)
break;
+ }
} while (--priority >= 0);
/* Always end on a shrink_mmap.. */
while (shrink_mmap(0, gfp_mask)) {
+ ret = 1;
if (!--count)
goto done;
}
- /* We return 1 if we are freed some page */
- return (count != FREE_COUNT);
done:
- return 1;
+ return ret;
}
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
--- linux-2.4.0-t1-ac7/mm/page_alloc.c.orig Wed May 31 14:08:50 2000
+++ linux-2.4.0-t1-ac7/mm/page_alloc.c Fri Jun 2 15:29:21 2000
@@ -222,6 +222,9 @@
{
zone_t **zone = zonelist->zones;
extern wait_queue_head_t kswapd_wait;
+ static int last_woke_kswapd;
+ static int kswapd_pause = HZ;
+ int gfp_mask = zonelist->gfp_mask;
/*
* (If anyone calls gfp from interrupts nonatomically then it
@@ -248,14 +251,28 @@
}
}
- /* All zones are in need of kswapd. */
- if (waitqueue_active(&kswapd_wait))
+ /*
+ * Kswapd should be freeing enough memory to satisfy all allocations
+ * immediately. Calling try_to_free_pages from processes will slow
+ * down the system a lot. On the other hand, waking up kswapd too
+ * often means wasted memory and cpu time.
+ *
+ * We tune the kswapd pause interval in such a way that kswapd is
+ * always just aggressive enough to free the amount of memory we
+ * want freed.
+ */
+ if (waitqueue_active(&kswapd_wait) &&
+ time_after(jiffies, last_woke_kswapd + kswapd_pause)) {
+ kswapd_pause++;
+ last_woke_kswapd = jiffies;
wake_up_interruptible(&kswapd_wait);
+ }
/*
* Ok, we don't have any zones that don't need some
* balancing.. See if we have any that aren't critical..
*/
+again:
zone = zonelist->zones;
for (;;) {
zone_t *z = *(zone++);
@@ -267,16 +284,29 @@
z->low_on_memory = 1;
if (page)
return page;
+ } else {
+ if (kswapd_pause > 0)
+ kswapd_pause--;
}
}
+ /* We didn't kick kswapd often enough... */
+ kswapd_pause /= 2;
+ if (waitqueue_active(&kswapd_wait))
+ wake_up_interruptible(&kswapd_wait);
+ /* If we're low priority, we just wait a bit and try again later. */
+ if ((gfp_mask & __GFP_WAIT) && current->need_resched &&
+ current->state == TASK_RUNNING) {
+ schedule();
+ goto again;
+ }
+
/*
* Uhhuh. All the zones have been critical, which means that
* we'd better do some synchronous swap-out. kswapd has not
* been able to cope..
*/
if (!(current->flags & PF_MEMALLOC)) {
- int gfp_mask = zonelist->gfp_mask;
if (!try_to_free_pages(gfp_mask)) {
if (!(gfp_mask & __GFP_HIGH))
goto fail;
@@ -303,7 +333,6 @@
zone = zonelist->zones;
for (;;) {
zone_t *z = *(zone++);
- int gfp_mask = zonelist->gfp_mask;
if (!z)
break;
if (z->free_pages > z->pages_min) {
--- linux-2.4.0-t1-ac7/mm/filemap.c.orig Wed May 31 14:08:50 2000
+++ linux-2.4.0-t1-ac7/mm/filemap.c Fri Jun 2 15:42:25 2000
@@ -334,13 +334,6 @@
count--;
/*
- * Page is from a zone we don't care about.
- * Don't drop page cache entries in vain.
- */
- if (page->zone->free_pages > page->zone->pages_high)
- goto dispose_continue;
-
- /*
* Avoid unscalable SMP locking for pages we can
* immediate tell are untouchable..
*/
@@ -375,6 +368,13 @@
}
}
+ /*
+ * Page is from a zone we don't care about.
+ * Don't drop page cache entries in vain.
+ */
+ if (page->zone->free_pages > page->zone->pages_high)
+ goto unlock_continue;
+
/* Take the pagecache_lock spinlock held to avoid
other tasks to notice the page while we are looking at its
page count. If it's a pagecache-page we'll free it
@@ -400,8 +400,15 @@
goto made_inode_progress;
}
/* PageDeferswap -> we swap out the page now. */
- if (gfp_mask & __GFP_IO)
- goto async_swap_continue;
+ if (gfp_mask & __GFP_IO) {
+ spin_unlock(&pagecache_lock);
+ /* Do NOT unlock the page ... brw_page does. */
+ ClearPageDirty(page);
+ rw_swap_page(WRITE, page, 0);
+ spin_lock(&pagemap_lru_lock);
+ page_cache_release(page);
+ goto dispose_continue;
+ }
goto cache_unlock_continue;
}
@@ -422,14 +429,6 @@
unlock_continue:
spin_lock(&pagemap_lru_lock);
UnlockPage(page);
- page_cache_release(page);
- goto dispose_continue;
-async_swap_continue:
- spin_unlock(&pagecache_lock);
- /* Do NOT unlock the page ... that is done after IO. */
- ClearPageDirty(page);
- rw_swap_page(WRITE, page, 0);
- spin_lock(&pagemap_lru_lock);
page_cache_release(page);
dispose_continue:
list_add(page_lru, &lru_cache);
--- linux-2.4.0-t1-ac7/include/linux/swap.h.orig Wed May 31 21:00:06 2000
+++ linux-2.4.0-t1-ac7/include/linux/swap.h Thu Jun 1 11:51:25 2000
@@ -166,7 +166,7 @@
* The 2.4 code, however, is mostly simple and stable ;)
*/
#define PG_AGE_MAX 64
-#define PG_AGE_START 5
+#define PG_AGE_START 2
#define PG_AGE_ADV 3
#define PG_AGE_DECL 1
* Re: [PATCH] VM patch 3 for -ac7
From: Zlatko Calusic @ 2000-06-04 17:46 UTC
To: Rik van Riel; +Cc: linux-mm, linux-kernel
Hi, Rik!
I tested all versions of your autotune patch (1-3) and am mostly
satisfied with the direction of the development. But still, I have
some objections and lots of questions. :)
First, something that has been bothering me for a long time now (as
2.3.42 recedes further into the past; I've picked that kernel version
to represent code that doesn't exhibit this particular bad behaviour):
Bulk I/O is performing terribly. Spurious swapping is killing us as we
read big chunks of data from disk. The ext2 optimizations and
anti-fragmentation code are now officially obsolete, because they never
get a chance to take effect. For example, check the following
chunk of "vmstat 1" output:
0 0 0 6976 85140 232 4772 0 0 0 0 101 469 0 0 99
0 1 0 6976 74532 244 15284 0 0 2638 7 290 802 1 4 94
1 0 0 6976 59028 260 30772 0 0 3876 0 347 892 0 5 94
0 1 0 6976 43012 276 46772 0 0 4004 0 356 779 0 5 95
1 0 0 6976 26964 292 62900 0 0 4036 0 355 918 0 6 93
0 1 0 6976 10852 308 78900 0 0 4004 0 355 931 0 5 94
procs memory swap io system cpu
r b w swpd free buff cache si so bi bo in cs us sy id
2 0 0 7304 3128 184 89120 0 56 2978 26 305 780 1 14 85
1 0 1 8084 2916 156 90320 0 220 2659 55 306 764 0 18 82
0 2 0 9448 2112 168 92236 104 312 1873 78 281 790 0 11 88
0 2 0 9916 2656 180 92016 264 212 795 53 199 465 0 4 96
0 1 1 10340 2956 192 92024 0 288 2175 72 268 727 1 10 89
0 2 0 10460 1936 204 92928 24 308 2588 77 296 804 1 6 93
0 1 0 10772 2028 208 93080 16 456 1706 114 252 648 0 8 92
1 0 1 10824 2900 204 92232 0 556 2402 139 298 784 0 5 94
0 2 0 10868 2036 192 93124 24 140 2767 35 301 844 0 9 91
0 2 0 11080 1944 192 93460 16 104 2526 26 286 836 0 6 94
0 1 0 11620 2604 192 93220 4 88 2553 22 277 760 0 10 90
0 1 0 11816 2164 196 93844 0 264 2620 66 292 792 0 9 91
0 2 0 12084 1840 204 94320 80 196 1416 49 232 567 0 5 95
0 1 0 12084 1708 216 94352 240 0 1467 0 219 676 0 1 98
At time T (top of the output), I started reading a big file from the
4k ext2 FS. The machine is completely idle and, as you can see, has
lots of memory free. Before memory fills up (the first few lines),
you can also see that data is coming in at a 16MB/sec pace (bi ~ 4000),
which is _exactly_ the available (and expected) bandwidth.
And *then* we get into trouble. The VM kicks in and starts swapping in
and out at will. The disk heads start thrashing, with sounds similar to
the ones heard when running netscape on a 16MB machine. Of course, the
reading speed drops drastically, and in the end we finish 10 seconds
later than expected. I/O bandwidth is effectively halved, and why?
Because we enlarged the page cache from an already perfectly adequate
90MB (!) to 95MB (by 5%!), and to get that pissy 5MB we were swapping
out like mad, then processes started recollecting their pages back from
disk, then all over again...
Now the question is: is such behaviour expected, or will it get
fixed before the final 2.4.0?
I'm worried that we are going to release the ultimate swapping machine
and say to people: here is the new and great stable kernel! Watch it
swap and never stop. :)
What especially bothers me is that nobody sees the problem. Everybody
is talking about the kernel getting better and better, how things are
stabilizing and response times are improving, but I see new releases
getting worse and worse, with performance going down the drain. Tell me
that I'm an idiot, that the system is supposed to swap all the time,
and then maybe I'll stop bitching. But not before. :)
Second thing: around two years ago, IIRC, Linus and people on this
group decided that we don't need page aging, that it only kills
performance, and so the code was removed, never to be seen again. I
wasn't so sure at the time that it was such a good idea, but when
Andrea's LRU page management went in, I became very satisfied with our
page replacement policies.
Obviously, with zoned memory we got into trouble once again, and now,
as a solution, you're putting page aging back into the kernel. Could
you tell us your reasons? What has changed in the meantime, so that we
didn't need page aging two years ago but need it now?
I didn't see any improvement (as the vmstat output above shows). Yes,
I'm well aware of what you're _trying_ to achieve, but in reality
you've just added lots of logic to the kernel and accomplished nothing.
Not to mention we're back to 2.1.something and history is repeating
itself. :(
Gratuitously adding untested code this far into development doesn't
look like a very good idea to me.
In the end, I hope nobody sees this rather long complaint as an
attack, but rather as a call for a debate on how we could improve the
kernel and hopefully get 2.4.0 out sooner. A 2.4.0 we'll be proud of.
Regards,
--
Zlatko