* [PATCH] 2.3.99-pre6-3+ VM rebalancing
@ 2000-04-23 2:08 Rik van Riel
2000-04-25 1:25 ` Simon Kirby
0 siblings, 1 reply; 26+ messages in thread
From: Rik van Riel @ 2000-04-23 2:08 UTC (permalink / raw)
To: linux-mm; +Cc: Stephen C. Tweedie, Ben LaHaise, linux-kernel
Hi,
the following patch makes the VM in 2.3.99-pre6+ behave more nicely
than in previous versions. It does that by:
- having a global lru queue for shrink_mmap()
- slightly improving the lru scanning
- being less aggressive with lru scanning, so we'll have
more pages in the lru queue and will do better page
aging (and also gives us a bigger buffer of clean pages,
this way big memory hogs have less impact on the rest of
the system)
- freeing some pages from the "wrong" zone when freeing
from one particular zone ... this keeps memory balanced
because __alloc_pages() will allocate most pages from
the least busy zone
It has done some amazing things in test situations on my
machine, but I have no idea what it'll do to kswapd cpu
usage on >1GB machines. I think that the extra freedom in
allocation will offset the slightly more expensive freeing
code almost all of the time.
regards,
Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.
Wanna talk about the kernel? irc.openprojects.net / #kernelnewbies
http://www.conectiva.com/ http://www.surriel.com/
--- linux-2.3.99-pre6-3/mm/filemap.c.orig Mon Apr 17 12:21:46 2000
+++ linux-2.3.99-pre6-3/mm/filemap.c Sat Apr 22 22:14:10 2000
@@ -44,6 +44,7 @@
atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;
+struct list_head lru_cache;
spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
/*
@@ -149,11 +150,16 @@
/* page wholly truncated - free it */
if (offset >= start) {
+ if (TryLockPage(page)) {
+ spin_unlock(&pagecache_lock);
+ get_page(page);
+ wait_on_page(page);
+ put_page(page);
+ goto repeat;
+ }
get_page(page);
spin_unlock(&pagecache_lock);
- lock_page(page);
-
if (!page->buffers || block_flushpage(page, 0))
lru_cache_del(page);
@@ -191,11 +197,13 @@
continue;
/* partial truncate, clear end of page */
+ if (TryLockPage(page)) {
+ spin_unlock(&pagecache_lock);
+ goto repeat;
+ }
get_page(page);
spin_unlock(&pagecache_lock);
- lock_page(page);
-
memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
if (page->buffers)
block_flushpage(page, partial);
@@ -208,6 +216,9 @@
*/
UnlockPage(page);
page_cache_release(page);
+ get_page(page);
+ wait_on_page(page);
+ put_page(page);
goto repeat;
}
spin_unlock(&pagecache_lock);
@@ -215,46 +226,56 @@
int shrink_mmap(int priority, int gfp_mask, zone_t *zone)
{
- int ret = 0, count;
+ int ret = 0, loop = 0, count;
LIST_HEAD(young);
LIST_HEAD(old);
LIST_HEAD(forget);
struct list_head * page_lru, * dispose;
- struct page * page;
-
+ struct page * page = NULL;
+ struct zone_struct * p_zone;
+
if (!zone)
BUG();
- count = nr_lru_pages / (priority+1);
+ count = nr_lru_pages >> priority;
+ if (!count)
+ return ret;
spin_lock(&pagemap_lru_lock);
-
- while (count > 0 && (page_lru = zone->lru_cache.prev) != &zone->lru_cache) {
+again:
+ /* we need pagemap_lru_lock for list_del() ... subtle code below */
+ while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
page = list_entry(page_lru, struct page, lru);
list_del(page_lru);
+ p_zone = page->zone;
- dispose = &zone->lru_cache;
- if (test_and_clear_bit(PG_referenced, &page->flags))
- /* Roll the page at the top of the lru list,
- * we could also be more aggressive putting
- * the page in the young-dispose-list, so
- * avoiding to free young pages in each pass.
- */
- goto dispose_continue;
-
+ /*
+ * These two tests are there to make sure we don't free too
+ * many pages from the "wrong" zone. We free some anyway,
+ * they are the least recently used pages in the system.
+ * When we don't free them, leave them in &old.
+ */
dispose = &old;
- /* don't account passes over not DMA pages */
- if (zone && (!memclass(page->zone, zone)))
+ if (p_zone->free_pages > p_zone->pages_high)
goto dispose_continue;
- count--;
-
+ if (loop > 5 && page->zone != zone)
+ goto dispose_continue;
+
+ /* The page is in use, or was used very recently, put it in
+ * &young to make sure that we won't try to free it the next
+ * time */
dispose = &young;
-
- /* avoid unscalable SMP locking */
if (!page->buffers && page_count(page) > 1)
goto dispose_continue;
+ /* Only count pages that have a chance of being freeable */
+ count--;
+ if (test_and_clear_bit(PG_referenced, &page->flags))
+ goto dispose_continue;
+
+ /* Page not used -> free it; if that fails -> &old */
+ dispose = &old;
if (TryLockPage(page))
goto dispose_continue;
@@ -327,6 +348,7 @@
list_add(page_lru, dispose);
continue;
+ /* we're holding pagemap_lru_lock, so we can just loop again */
dispose_continue:
list_add(page_lru, dispose);
}
@@ -342,9 +364,14 @@
/* nr_lru_pages needs the spinlock */
nr_lru_pages--;
+ loop++;
+ /* wrong zone? not looped too often? roll again... */
+ if (page->zone != zone && loop < (128 >> priority))
+ goto again;
+
out:
- list_splice(&young, &zone->lru_cache);
- list_splice(&old, zone->lru_cache.prev);
+ list_splice(&young, &lru_cache);
+ list_splice(&old, lru_cache.prev);
spin_unlock(&pagemap_lru_lock);
--- linux-2.3.99-pre6-3/mm/page_alloc.c.orig Mon Apr 17 12:21:46 2000
+++ linux-2.3.99-pre6-3/mm/page_alloc.c Sat Apr 22 17:28:31 2000
@@ -25,7 +25,7 @@
#endif
int nr_swap_pages = 0;
-int nr_lru_pages;
+int nr_lru_pages = 0;
pg_data_t *pgdat_list = (pg_data_t *)0;
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -530,6 +530,7 @@
freepages.min += i;
freepages.low += i * 2;
freepages.high += i * 3;
+ memlist_init(&lru_cache);
/*
* Some architectures (with lots of mem and discontinous memory
@@ -609,7 +610,6 @@
unsigned long bitmap_size;
memlist_init(&zone->free_area[i].free_list);
- memlist_init(&zone->lru_cache);
mask += mask;
size = (size + ~mask) & mask;
bitmap_size = size >> i;
--- linux-2.3.99-pre6-3/include/linux/mm.h.orig Mon Apr 17 12:22:22 2000
+++ linux-2.3.99-pre6-3/include/linux/mm.h Sat Apr 22 16:13:15 2000
@@ -15,6 +15,7 @@
extern unsigned long num_physpages;
extern void * high_memory;
extern int page_cluster;
+extern struct list_head lru_cache;
#include <asm/page.h>
#include <asm/pgtable.h>
--- linux-2.3.99-pre6-3/include/linux/mmzone.h.orig Mon Apr 17 12:22:22 2000
+++ linux-2.3.99-pre6-3/include/linux/mmzone.h Sat Apr 22 16:13:02 2000
@@ -31,7 +31,6 @@
char low_on_memory;
char zone_wake_kswapd;
unsigned long pages_min, pages_low, pages_high;
- struct list_head lru_cache;
/*
* free areas of different sizes
--- linux-2.3.99-pre6-3/include/linux/swap.h.orig Mon Apr 17 12:22:23 2000
+++ linux-2.3.99-pre6-3/include/linux/swap.h Sat Apr 22 16:19:38 2000
@@ -166,7 +166,7 @@
#define lru_cache_add(page) \
do { \
spin_lock(&pagemap_lru_lock); \
- list_add(&(page)->lru, &page->zone->lru_cache); \
+ list_add(&(page)->lru, &lru_cache); \
nr_lru_pages++; \
spin_unlock(&pagemap_lru_lock); \
} while (0)
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-23 2:08 [PATCH] 2.3.99-pre6-3+ VM rebalancing Rik van Riel
@ 2000-04-25 1:25 ` Simon Kirby
2000-04-25 15:09 ` Rik van Riel
0 siblings, 1 reply; 26+ messages in thread
From: Simon Kirby @ 2000-04-25 1:25 UTC (permalink / raw)
To: riel; +Cc: linux-mm, Stephen C. Tweedie, Ben LaHaise, linux-kernel
On Sat, Apr 22, 2000 at 11:08:35PM -0300, Rik van Riel wrote:
> Hi,
>
> the following patch makes the VM in 2.3.99-pre6+ behave more nicely
> than in previous versions. It does that by:
>
> - having a global lru queue for shrink_mmap()
> - slightly improving the lru scanning
> - being less aggressive with lru scanning, so we'll have
> more pages in the lru queue and will do better page
> aging (and also gives us a bigger buffer of clean pages,
> this way big memory hogs have less impact on the rest of
> the system)
> - freeing some pages from the "wrong" zone when freeing
> from one particular zone ... this keeps memory balanced
> because __alloc_pages() will allocate most pages from
> the least busy zone
>
> It has done some amazing things in test situations on my
> machine, but I have no idea what it'll do to kswapd cpu
> usage on >1GB machines. I think that the extra freedom in
> allocation will offset the slightly more expensive freeing
> code almost all of the time.
Hi,
This patch seems to help a lot overall in keeping the machine from diving
deep into swap after an average day's work in X (glade, netscape,
mozilla, many rxvts, etc.), but I still see some situations that look
broken. Here's an example from when I was diffing pre6-5 against
pre6-6 while listening to an MP3 (shrunk a bit to avoid wrapping):
r b w swpd free buff cache si so bi bo in cs us sy id
0 0 0 20224 3136 3312 60392 0 0 16 0 126 1173 2 0 98
0 1 0 20024 2572 3340 60292 0 0 253 254 280 1276 2 2 96
0 1 0 19932 3068 3404 60208 0 44 208 11 303 1423 5 2 93
0 1 0 19768 3020 3384 60340 0 32 424 8 335 1567 2 12 85
0 1 0 19780 2912 3284 60472 0 28 357 11 346 1596 3 11 86
1 1 0 19764 2932 3236 60472 0 32 389 8 357 1614 3 11 85
0 1 0 19644 2780 3252 60620 0 0 296 0 316 1551 3 7 90
1 1 0 19596 2892 3340 60352 0 0 211 0 286 1466 3 5 92
0 1 0 19396 2076 3364 61128 0 0 416 0 392 1712 2 7 91
0 0 0 19044 2956 3412 60096 0 0 304 12 356 1605 2 11 87
1 0 0 18952 2824 3420 60240 0 32 364 8 363 1644 1 6 92
0 0 1 17880 3068 3476 59908 0 52 481 13 398 1730 3 9 88
0 1 0 17760 2904 3556 60012 0 24 400 6 378 1667 1 6 93
1 1 0 17652 2772 3612 60032 0 0 275 0 288 1488 2 2 96
0 1 0 17580 2800 3636 59888 0 32 257 8 275 1468 2 1 96
1 1 0 17384 2568 3692 60072 0 0 568 0 364 1659 4 4 92
0 1 0 17164 2528 3668 60164 0 16 413 4 438 1800 1 3 95
0 2 0 17204 2728 3544 60088 0 40 452 10 434 1788 1 5 94
1 1 0 17236 2932 3588 59752 0 32 253 8 333 1591 12 38 50
It seems a bit odd that it is swapping out here when there is a lot of
cache memory available.
Dual processors at 450 MHz w/128 MB ECC SDRAM and a 7200 RPM WD 27.3 GB
IDE drive.
Simon-
[ Stormix Technologies Inc. ][ NetNation Communications Inc. ]
[ sim@stormix.com ][ sim@netnation.com ]
[ Opinions expressed are not necessarily those of my employers. ]
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-25 1:25 ` Simon Kirby
@ 2000-04-25 15:09 ` Rik van Riel
2000-04-25 15:59 ` Andrea Arcangeli
0 siblings, 1 reply; 26+ messages in thread
From: Rik van Riel @ 2000-04-25 15:09 UTC (permalink / raw)
To: Simon Kirby; +Cc: linux-mm, Stephen C. Tweedie, Ben LaHaise, linux-kernel
On Mon, 24 Apr 2000, Simon Kirby wrote:
> On Sat, Apr 22, 2000 at 11:08:35PM -0300, Rik van Riel wrote:
>
> > the following patch makes the VM in 2.3.99-pre6+ behave more nicely
> > than in previous versions. It does that by:
[snip]
> 0 2 0 17204 2728 3544 60088 0 40 452 10 434 1788 1 5 94
> 1 1 0 17236 2932 3588 59752 0 32 253 8 333 1591 12 38 50
>
> It seems a bit odd that it is swapping out here when there is a
> lot of cache memory available.
If you look closer, you'll see that none of the swapped out
stuff is swapped back in again. This shows that the VM
subsystem did make the right choice here...
regards,
Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.
Wanna talk about the kernel? irc.openprojects.net / #kernelnewbies
http://www.conectiva.com/ http://www.surriel.com/
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-25 15:09 ` Rik van Riel
@ 2000-04-25 15:59 ` Andrea Arcangeli
2000-04-25 17:20 ` Rik van Riel
0 siblings, 1 reply; 26+ messages in thread
From: Andrea Arcangeli @ 2000-04-25 15:59 UTC (permalink / raw)
To: riel; +Cc: Simon Kirby, linux-mm, Stephen C. Tweedie, Ben LaHaise, linux-kernel
On Tue, 25 Apr 2000, Rik van Riel wrote:
>If you look closer, you'll see that none of the swapped out
>stuff is swapped back in again. This shows that the VM
>subsystem did make the right choice here...
Swapping out with 50mbyte of cache isn't the right choice unless all the
50mbyte of cache were mapped in memory (and I bet that wasn't the case).
Andrea
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-25 15:59 ` Andrea Arcangeli
@ 2000-04-25 17:20 ` Rik van Riel
2000-04-25 18:36 ` Simon Kirby
0 siblings, 1 reply; 26+ messages in thread
From: Rik van Riel @ 2000-04-25 17:20 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Simon Kirby, linux-mm, Stephen C. Tweedie, Ben LaHaise, linux-kernel
On Tue, 25 Apr 2000, Andrea Arcangeli wrote:
> On Tue, 25 Apr 2000, Rik van Riel wrote:
>
> >If you look closer, you'll see that none of the swapped out
> >stuff is swapped back in again. This shows that the VM
> >subsystem did make the right choice here...
>
> Swapping out with 50mbyte of cache isn't the right choice unless
> all the 50mbyte of cache were mapped in memory (and I bet that
> wasn't the case).
Funny you just state this without explaining why.
If the memory that's swapped out isn't used again
in the next 5 minutes, but the pages in the file
cache _are_ used (e.g. for compiling that kernel you
just unpacked), then it definitely is the right
choice to keep the cached data in memory and swap
out some part of netscape.
regards,
Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.
Wanna talk about the kernel? irc.openprojects.net / #kernelnewbies
http://www.conectiva.com/ http://www.surriel.com/
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-25 17:20 ` Rik van Riel
@ 2000-04-25 18:36 ` Simon Kirby
2000-04-25 18:59 ` Jeff Garzik
0 siblings, 1 reply; 26+ messages in thread
From: Simon Kirby @ 2000-04-25 18:36 UTC (permalink / raw)
To: riel
Cc: Andrea Arcangeli, linux-mm, Stephen C. Tweedie, Ben LaHaise,
linux-kernel
On Tue, Apr 25, 2000 at 02:20:19PM -0300, Rik van Riel wrote:
> On Tue, 25 Apr 2000, Andrea Arcangeli wrote:
> > On Tue, 25 Apr 2000, Rik van Riel wrote:
> >
> > >If you look closer, you'll see that none of the swapped out
> > >stuff is swapped back in again. This shows that the VM
> > >subsystem did make the right choice here...
> >
> > Swapping out with 50mbyte of cache isn't the right choice unless
> > all the 50mbyte of cache were mapped in memory (and I bet that
> > wasn't the case).
>
> Funny you just state this without explaining why.
> If the memory that's swapped out isn't used again
> in the next 5 minutes, but the pages in the file
> cache _are_ used (e.g. for compiling that kernel you
> just unpacked), then it definitely is the right
> choice to keep the cached data in memory and swap
> out some part of netscape.
Well, from the way I look at it...
In the ideal world, everybody would have unlimited quantities of RAM and
swap would be unnecessary. In the desktop world, this is pretty much the
opposite, but RAM is always getting cheaper and newer machines always have
more RAM. The ideal server setup is one where it never has to use swap,
_but also_ where it never has to read in anything at all after it's been
up for a while.
For desktops with low memory, it probably is an advantage to swap out
occasionally to be able to keep more things in cache. However, for
higher-end servers, I don't think it would be an advantage to swap simply
when the cache has used up the remaining free memory and more memory is
needed because it would slow down the response time for running programs
(although this is all a balance, I see :)). I suppose it would make more
of a difference on desktops where people switch between windows in X and
want a speedy response from large programs, whereas on a server it would
probably just be a small daemon that doesn't get used much.
Hrmm.. I guess the ideal solution would be that swappable pages would age
just like cache pages and everything else? Then, if a particular
program's page hasn't been accessed for 60 seconds and there is nothing
older in the page cache, it would swap out... I don't think this is
possible, though, because it would have to keep track of reads to every
page (slow, right?).
Simon-
[ Stormix Technologies Inc. ][ NetNation Communications Inc. ]
[ sim@stormix.com ][ sim@netnation.com ]
[ Opinions expressed are not necessarily those of my employers. ]
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-25 18:36 ` Simon Kirby
@ 2000-04-25 18:59 ` Jeff Garzik
2000-04-25 19:06 ` Simon Kirby
0 siblings, 1 reply; 26+ messages in thread
From: Jeff Garzik @ 2000-04-25 18:59 UTC (permalink / raw)
To: Simon Kirby
Cc: riel, Andrea Arcangeli, linux-mm, Stephen C. Tweedie,
Ben LaHaise, linux-kernel
[-- Attachment #1: Type: text/plain, Size: 1005 bytes --]
Simon Kirby wrote:
> Hrmm.. I guess the ideal solution would be that swappable pages would age
> just like cache pages and everything else? Then, if a particular
> program's page hasn't been accessed for 60 seconds and there is nothing
> older in the page cache, it would swap out...
Again a policy decision... I think such a feature should be present and
enabled by default, but there are some people who would prefer that
their configuration not do this, or would prefer that the timeout for
old pages be far longer than 60 seconds.
The main reason is that there is a noticeable performance increase when you
have so much more physical memory available for page and buffer cache.
I manually force this behavior now with the attached 'fillmem' program,
usually before a big compile on an otherwise quiet machine.
Jeff
--
Jeff Garzik | Nothing cures insomnia like the
Building 1024 | realization that it's time to get up.
MandrakeSoft, Inc. | -- random fortune
[-- Attachment #2: fillmem.c --]
[-- Type: text/plain, Size: 707 bytes --]
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

#define MEGS 140
#define MEG (1024 * 1024)

int main (int argc, char *argv[])
{
    void **data;
    int i, r;
    size_t megs = MEGS;

    if ((argc >= 2) && (atoi(argv[1]) > 0))
        megs = atoi(argv[1]);

    data = malloc (megs * sizeof (void*));
    if (!data) abort();
    memset (data, 0, megs * sizeof (void*));

    srand(time(NULL));

    for (i = 0; i < megs; i++) {
        data[i] = malloc(MEG);
        memset (data[i], i, MEG);
        printf("malloc/memset %03d/%03lu\n", i+1, megs);
    }
    for (i = megs - 1; i >= 0; i--) {
        r = rand() % 200;
        memset (data[i], r, MEG);
        printf("memset #2 %03d/%03lu = %d\n", i+1, megs, r);
    }
    printf("done\n");
    return 0;
}
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-25 18:59 ` Jeff Garzik
@ 2000-04-25 19:06 ` Simon Kirby
2000-04-25 19:34 ` Rik van Riel
2000-04-26 11:01 ` Stephen C. Tweedie
0 siblings, 2 replies; 26+ messages in thread
From: Simon Kirby @ 2000-04-25 19:06 UTC (permalink / raw)
To: Jeff Garzik
Cc: riel, Andrea Arcangeli, linux-mm, Stephen C. Tweedie,
Ben LaHaise, linux-kernel
On Tue, Apr 25, 2000 at 02:59:50PM -0400, Jeff Garzik wrote:
> Simon Kirby wrote:
> > Hrmm.. I guess the ideal solution would be that swappable pages would age
> > just like cache pages and everything else? Then, if a particular
> > program's page hasn't been accessed for 60 seconds and there is nothing
> > older in the page cache, it would swap out...
>
> Again a policy decision... I think such a feature should be present and
> enabled by default, but there are some people who would prefer that
> their configuration not do this, or would prefer that the timeout for
> old pages be far longer than 60 seconds.
Sorry, I made a mistake there while writing..I was going to give an
example and wrote 60 seconds, but I didn't actually mean to limit
anything to 60 seconds. I just meant to make a really big global lru
that contains everything including page cache and swap. :)
Simon-
[ Stormix Technologies Inc. ][ NetNation Communications Inc. ]
[ sim@stormix.com ][ sim@netnation.com ]
[ Opinions expressed are not necessarily those of my employers. ]
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-25 19:06 ` Simon Kirby
@ 2000-04-25 19:34 ` Rik van Riel
2000-04-26 11:01 ` Stephen C. Tweedie
1 sibling, 0 replies; 26+ messages in thread
From: Rik van Riel @ 2000-04-25 19:34 UTC (permalink / raw)
To: Simon Kirby
Cc: Jeff Garzik, Andrea Arcangeli, linux-mm, Stephen C. Tweedie,
Ben LaHaise, linux-kernel
On Tue, 25 Apr 2000, Simon Kirby wrote:
> Sorry, I made a mistake there while writing..I was going to give an
> example and wrote 60 seconds, but I didn't actually mean to limit
> anything to 60 seconds. I just meant to make a really big global lru
> that contains everything including page cache and swap. :)
We already have that big global lru queue (actually, it's a
bit closer to second chance replacement).
For pages which are in the page tables of processes, we
put the pages on the queue when we scan them and find they
haven't been used since the last scan (NRU
replacement). After that, they go through the lru queue
and are reclaimed when it's their turn.
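In rough code, the second-chance pass over that queue looks something
like the sketch below (the helper name is hypothetical; the
PG_referenced test and the list primitives are the ones shrink_mmap()
uses in the patch at the top of this thread):

/* Sketch of second-chance reclaim over a global LRU list: a page that
 * was referenced since the last pass is rotated back to the young end
 * of the list instead of being reclaimed. */
static struct page *second_chance_candidate(struct list_head *lru, int count)
{
    struct list_head *entry;
    struct page *page;

    while (count-- > 0 && (entry = lru->prev) != lru) {
        page = list_entry(entry, struct page, lru);
        list_del(entry);

        if (test_and_clear_bit(PG_referenced, &page->flags)) {
            /* used recently: give it a second chance */
            list_add(entry, lru);
            continue;
        }
        return page;    /* unreferenced: candidate for freeing */
    }
    return NULL;        /* nothing freeable within the scan budget */
}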
regards,
Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.
Wanna talk about the kernel? irc.openprojects.net / #kernelnewbies
http://www.conectiva.com/ http://www.surriel.com/
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-25 19:06 ` Simon Kirby
2000-04-25 19:34 ` Rik van Riel
@ 2000-04-26 11:01 ` Stephen C. Tweedie
2000-04-26 11:15 ` Rik van Riel
2000-04-26 11:25 ` David S. Miller
1 sibling, 2 replies; 26+ messages in thread
From: Stephen C. Tweedie @ 2000-04-26 11:01 UTC (permalink / raw)
To: Simon Kirby
Cc: Jeff Garzik, riel, Andrea Arcangeli, linux-mm,
Stephen C. Tweedie, Ben LaHaise, linux-kernel
Hi,
On Tue, Apr 25, 2000 at 12:06:58PM -0700, Simon Kirby wrote:
>
> Sorry, I made a mistake there while writing..I was going to give an
> example and wrote 60 seconds, but I didn't actually mean to limit
> anything to 60 seconds. I just meant to make a really big global lru
> that contains everything including page cache and swap. :)
Doesn't work. If you do that, a "find / | grep ..." swaps out
everything in your entire system.
Getting the VM to respond properly in a way which doesn't freak out
in the mass-filescan case is non-trivial. Simple LRU over all pages
simply doesn't cut it.
--Stephen
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 11:01 ` Stephen C. Tweedie
@ 2000-04-26 11:15 ` Rik van Riel
2000-04-26 12:29 ` Stephen C. Tweedie
2000-04-26 11:25 ` David S. Miller
1 sibling, 1 reply; 26+ messages in thread
From: Rik van Riel @ 2000-04-26 11:15 UTC (permalink / raw)
To: Stephen C. Tweedie
Cc: Simon Kirby, Jeff Garzik, Andrea Arcangeli, linux-mm,
Ben LaHaise, linux-kernel
On Wed, 26 Apr 2000, Stephen C. Tweedie wrote:
> On Tue, Apr 25, 2000 at 12:06:58PM -0700, Simon Kirby wrote:
> >
> > Sorry, I made a mistake there while writing..I was going to give an
> > example and wrote 60 seconds, but I didn't actually mean to limit
> > anything to 60 seconds. I just meant to make a really big global lru
> > that contains everything including page cache and swap. :)
>
> Doesn't work. If you do that, a "find / | grep ..." swaps out
> everything in your entire system.
>
> Getting the VM to respond properly in a way which doesn't freak out
> in the mass-filescan case is non-trivial. Simple LRU over all pages
> simply doesn't cut it.
It seems to work pretty well, because pages "belonging to" processes
are mapped into the address space of each process and will never go
through swap_out() as long as shrink_mmap() succeeds.
regards,
Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.
Wanna talk about the kernel? irc.openprojects.net / #kernelnewbies
http://www.conectiva.com/ http://www.surriel.com/
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 11:15 ` Rik van Riel
@ 2000-04-26 12:29 ` Stephen C. Tweedie
2000-04-26 12:45 ` David S. Miller
0 siblings, 1 reply; 26+ messages in thread
From: Stephen C. Tweedie @ 2000-04-26 12:29 UTC (permalink / raw)
To: riel
Cc: Stephen C. Tweedie, Simon Kirby, Jeff Garzik, Andrea Arcangeli,
linux-mm, Ben LaHaise, linux-kernel
Hi,
On Wed, Apr 26, 2000 at 08:15:14AM -0300, Rik van Riel wrote:
> On Wed, 26 Apr 2000, Stephen C. Tweedie wrote:
> > On Tue, Apr 25, 2000 at 12:06:58PM -0700, Simon Kirby wrote:
> > >
> > > Sorry, I made a mistake there while writing..I was going to give an
> > > example and wrote 60 seconds, but I didn't actually mean to limit
> > > anything to 60 seconds. I just meant to make a really big global lru
> > > that contains everything including page cache and swap. :)
> >
> > Doesn't work. If you do that, a "find / | grep ..." swaps out
> > everything in your entire system.
> >
> > Getting the VM to respond properly in a way which doesn't freak out
> > in the mass-filescan case is non-trivial. Simple LRU over all pages
> > simply doesn't cut it.
>
> It seems to work pretty well, because pages "belonging to" processes
> are mapped into the address space of each process and will never go
> through swap_out() as long as shrink_mmap() succeeds.
I know. The post wasn't talking about what we do now. It was talking
about a hypothetical LRU which covers "everything including page cache
and swap." LRU over just the page cache pages works fine. If you
start treating swap exactly the same, on a page-by-page LRU, then a
filesystem "find" scan will swap out most of your VM. Bad news.
--Stephen
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 12:29 ` Stephen C. Tweedie
@ 2000-04-26 12:45 ` David S. Miller
0 siblings, 0 replies; 26+ messages in thread
From: David S. Miller @ 2000-04-26 12:45 UTC (permalink / raw)
To: sct; +Cc: riel, sim, jgarzik, andrea, linux-mm, bcrl, linux-kernel
If you start treating swap exactly the same, on a page-by-page LRU,
then a filesystem "find" scan will swap out most of your VM. Bad
news.
I never got the impression from the original posting that swap pages
would be treated "exactly" the same, and any sane LRU implementation
which included swap and anonymous pages would prefer clean page
liberation to dirty page liberation. I consider this a given.
Later,
David S. Miller
davem@redhat.com
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 11:01 ` Stephen C. Tweedie
2000-04-26 11:15 ` Rik van Riel
@ 2000-04-26 11:25 ` David S. Miller
2000-04-26 13:00 ` Stephen C. Tweedie
2000-04-26 13:46 ` Rik van Riel
1 sibling, 2 replies; 26+ messages in thread
From: David S. Miller @ 2000-04-26 11:25 UTC (permalink / raw)
To: sct; +Cc: sim, jgarzik, riel, andrea, linux-mm, bcrl, linux-kernel
On Tue, Apr 25, 2000 at 12:06:58PM -0700, Simon Kirby wrote:
>
> Sorry, I made a mistake there while writing..I was going to give an
> example and wrote 60 seconds, but I didn't actually mean to limit
> anything to 60 seconds. I just meant to make a really big global lru
> that contains everything including page cache and swap. :)
Doesn't work. If you do that, a "find / | grep ..." swaps out
everything in your entire system.
Getting the VM to respond properly in a way which doesn't freak out
in the mass-filescan case is non-trivial. Simple LRU over all pages
simply doesn't cut it.
I believe this is not true at all. Clean pages will be preferred to
toss simply because they are easier to get rid of. In fact, "find / |
grep" is a perfect example of a case where LRU'ing only clean page
cache pages will keep the free page pools in equilibrium and we won't
need to swap anything.
I can say this with confidence, because I actually implemented a one
day hack which centralized all of page cache, swap cache, and all
anonymous pages into the LRU, deleted the crap we call swap_out and
taught the LRU queue processing how to toss pages from user address
spaces. Since I gave a mapping to anonymous pages, this became a
doable and almost trivial task. In these hacks I also created a
multi-list LRU scheme (active, inactive, dirty) so that
try_to_free_pages already had the LRU pool pre-sorted, so it only had
to look at pages which were unreferenced at the onset of memory
pressure. When we're not paging, kswapd would wake up periodically
to do some LRU aging and populate the inactive/dirty LRU queues.
I have to be quite frank, and say that the FreeBSD people are pretty
much on target when they say that our swapping and paging stinks; it
really does.
I am of the opinion that vmscan.c:swap_out() is one of our biggest
problems, because it kills us in the case where a few processes have
a pagecache page mapped, haven't accessed it in a long time, and
swap_out doesn't unmap those pages in time for the LRU shrink_mmap
code to fully toss it. This happens even though these pages are
excellent candidates for freeing. So here is where I came to the
conclusion that LRU needs to have the capability of tossing arbitrary
pages from process address spaces. This is why in my experimental
hacks I just killed swap_out() completely, and taught LRU how to
do all of the things swap_out did. I could do this because the
LRU scanner could go from a page to all mappings of that page, even
for anonymous and swap pages.
Later,
David S. Miller
davem@redhat.com
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 11:25 ` David S. Miller
@ 2000-04-26 13:00 ` Stephen C. Tweedie
2000-04-26 13:11 ` David S. Miller
2000-04-26 13:46 ` Rik van Riel
1 sibling, 1 reply; 26+ messages in thread
From: Stephen C. Tweedie @ 2000-04-26 13:00 UTC (permalink / raw)
To: David S. Miller
Cc: sct, sim, jgarzik, riel, andrea, linux-mm, bcrl, linux-kernel
Hi,
On Wed, Apr 26, 2000 at 04:25:23AM -0700, David S. Miller wrote:
>
> Getting the VM to respond properly in a way which doesn't freak out
> in the mass-filescan case is non-trivial. Simple LRU over all pages
> simply doesn't cut it.
>
> I believe this is not true at all. Clean pages will be preferred to
> toss simply because they are easier to get rid of.
As soon as you differentiate between clean and dirty page again, you
no longer have pure LRU. We're agreeing here --- LRU on its own is not
enough, you need _some_ mechanism to give preference to the eviction of
clean, pure cache pages. Whether it's different queues, or separate
mechanisms for swapout as we have now, is a different issue --- the one
thing we cannot afford is blind LRU without any feedback on the
properties of the pages themselves.
> I am of the opinion that vmscan.c:swap_out() is one of our biggest
> problems, because it kills us in the case where a few processes have
> a pagecache page mapped, haven't accessed it in a long time, and
> swap_out doesn't unmap those pages in time for the LRU shrink_mmap
> code to fully toss it.
Yep
> This happens even though these pages are
> excellent candidates for freeing. So here is where I came to the
> conclusion that LRU needs to have the capability of tossing arbitrary
> pages from process address spaces. This is why in my experimental
> hacks I just killed swap_out() completely, and taught LRU how to
> do all of the things swap_out did. I could do this because the
> LRU scanner could go from a page to all mappings of that page, even
> for anonymous and swap pages.
Doing it isn't the problem. Doing it efficiently is, if you have
fork() and mremap() in the picture. With mremap(), you cannot assume
that the virtual address of an anonymous page is the same in all
processes which have the page mapped.
So, basically, to find all the ptes for a given page, you have to
walk every single vma in every single mm which is a fork()ed
ancestor or descendant of the mm whose address_space you indexed
the page against.
Granted, it's probably faster than the current swap_out mechanism, but
the worst case is still not much fun if you have fragmented address
spaces with a lot of vmas.
Detecting the right vma isn't hard, because the vma's vm_pgoff is
preserved over mremap(). It's the linear scan that is the danger.
--Stephen
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 13:00 ` Stephen C. Tweedie
@ 2000-04-26 13:11 ` David S. Miller
2000-04-26 15:23 ` Stephen C. Tweedie
0 siblings, 1 reply; 26+ messages in thread
From: David S. Miller @ 2000-04-26 13:11 UTC (permalink / raw)
To: sct; +Cc: sim, jgarzik, riel, andrea, linux-mm, bcrl, linux-kernel
Doing it isn't the problem. Doing it efficiently is, if you have
fork() and mremap() in the picture. With mremap(), you cannot assume
that the virtual address of an anonymous page is the same in all
processes which have the page mapped.
Who makes that assumption? The virtual address of a physical page
is:
(page->index - vma->vm_pgoff) << PAGE_SHIFT
Add that to vma->vm_start and if the resulting value is not
>= vma->vm_end, then you have the proper virtual address, always.
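Spelled out as code, that relation is roughly the following (the helper
itself is hypothetical; page->index, vm_pgoff and vm_start are the real
2.3.x fields):

/* Sketch: compute the user virtual address at which `page' would be
 * mapped by `vma', using the relation above.  Returns 0 when the page
 * falls outside the vma. */
static unsigned long page_address_in_vma(struct page *page,
                                         struct vm_area_struct *vma)
{
    unsigned long address;

    /* page->index is the page's offset (in pages) within its backing
     * object; vm_pgoff is the object offset mapped at vma->vm_start. */
    address = vma->vm_start +
              ((page->index - vma->vm_pgoff) << PAGE_SHIFT);

    if (address < vma->vm_start || address >= vma->vm_end)
        return 0;   /* this vma does not map the page */
    return address;
}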
So, basically, to find all the ptes for a given page, you have to
walk every single vma in every single mm which is a fork()ed
ancestor or descendant of the mm whose address_space you indexed
the page against.
If you implement things correctly, this is not true at all.
Detecting the right vma isn't hard, because the vma's vm_pgoff is
preserved over mremap(). It's the linear scan that is the danger.
In my implementation there is no linear scan, only VMA's which
can actually contain the anonymous page in question are scanned.
It's called an anonymous layer, and it provides pseudo backing objects
for VMA's which have at least one privatized anonymous page. Each
such object is no more than a reference count, and an address_space
struct. The anonymous pages are queued into the address_space page
list, and have their page->index fields set appropriately.
When VMA's move around, get duplicated in fork'd processes, etc.
the anon layer gets called and adjusts things appropriately.
Instead of talk, I'll show some code :-) The following is the
anon layer I implemented for 2.3.x in my hacks.
--- ./mm/anon.c.~1~ Tue Apr 25 00:39:55 2000
+++ ./mm/anon.c Tue Apr 25 07:08:28 2000
@@ -0,0 +1,370 @@
+/*
+ * linux/mm/anon.c
+ *
+ * Written by DaveM.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/highmem.h>
+
+/* The anon layer provides a virtual backing object for anonymous
+ * private pages. The anon objects hang off of vmas and are created
+ * at the first cow fault into a private mapping.
+ *
+ * The anon address space is just like the page cache, it holds a
+ * reference to each of the pages attached to it.
+ */
+
+/* The layout of this structure is completely private to the
+ * anon layer. There is no reason to export it so we don't.
+ */
+struct anon_area {
+ atomic_t count;
+ struct address_space mapping;
+};
+
+extern spinlock_t pagecache_lock;
+static kmem_cache_t *anon_cachep = NULL;
+
+static __inline__ void anon_insert_vma(struct vm_area_struct *vma,
+ struct anon_area *anon)
+{
+ struct address_space *mapping = &anon->mapping;
+ struct vm_area_struct *next;
+
+ spin_lock(&mapping->i_shared_lock);
+ next = mapping->i_mmap;
+ if ((vma->vm_anon_next_share = next) != NULL)
+ next->vm_anon_pprev_share = &vma->vm_anon_next_share;
+ mapping->i_mmap = vma;
+ vma->vm_anon_pprev_share = &mapping->i_mmap;
+ spin_unlock(&mapping->i_shared_lock);
+}
+
+static __inline__ void anon_remove_vma(struct vm_area_struct *vma,
+ struct anon_area *anon)
+{
+ struct address_space *mapping = &anon->mapping;
+ struct vm_area_struct *next;
+
+ spin_lock(&mapping->i_shared_lock);
+ next = vma->vm_anon_next_share;
+ if (next)
+ next->vm_anon_pprev_share = vma->vm_anon_pprev_share;
+ *(vma->vm_anon_pprev_share) = next;
+ spin_unlock(&mapping->i_shared_lock);
+}
+
+/* Attach VMA's anon_area to NEW_VMA */
+void anon_dup(struct vm_area_struct *vma, struct vm_area_struct *new_vma)
+{
+ struct anon_area *anon = vma->vm_anon;
+
+ if (anon == NULL)
+ BUG();
+
+ atomic_inc(&anon->count);
+ anon_insert_vma(new_vma, anon);
+ new_vma->vm_anon = anon;
+}
+
+/* Free up all the pages associated with ANON. */
+static void invalidate_anon_pages(struct anon_area *anon)
+{
+ spin_lock(&pagecache_lock);
+
+ for (;;) {
+ struct list_head *entry = anon->mapping.pages.next;
+ struct page *page;
+
+ if (entry == &anon->mapping.pages)
+ break;
+
+ page = list_entry(entry, struct page, list);
+
+ get_page(page);
+ if (TryLockPage(page)) {
+ spin_unlock(&pagecache_lock);
+ lock_page(page);
+ spin_lock(&pagecache_lock);
+ }
+
+ if (PageSwapCache(page)) {
+ spin_unlock(&pagecache_lock);
+ __delete_from_swap_cache(page);
+ spin_lock(&pagecache_lock);
+ }
+
+ put_page(page);
+
+ lru_cache_del(page);
+
+ list_del(&page->list);
+ anon->mapping.nrpages--;
+ ClearPageAnon(page);
+ page->mapping = NULL;
+ UnlockPage(page);
+
+ __free_page(page);
+ }
+
+ spin_unlock(&pagecache_lock);
+
+ if (anon->mapping.nrpages != 0)
+ BUG();
+}
+
+/* VMA has been resized in some way, or one of the anon_area owners
+ * has gone away. Trim the anonymous pages from the anon_area which
+ * have a reference count of one. These pages are no longer
+ * referenced validly by any VMA and thus can be safely disposed.
+ *
+ * This is actually an optimization of sorts, we could just
+ * ignore this situation and let the eventual final anon_put
+ * get rid of the pages.
+ *
+ * It is the caller's responsibility to unmap and free the
+ * pages from the address space of the process before invoking
+ * this. It cannot work otherwise.
+ */
+void anon_trim(struct vm_area_struct *vma)
+{
+ struct anon_area *anon = vma->vm_anon;
+ struct list_head *entry;
+
+ spin_lock(&pagecache_lock);
+
+ entry = anon->mapping.pages.next;
+ while (entry != &anon->mapping.pages) {
+ struct page *page = list_entry(entry, struct page, list);
+ struct list_head *next = entry->next;
+
+ entry = next;
+
+ if (page_count(page) != 1)
+ continue;
+
+ if (TryLockPage(page))
+ continue;
+
+ lru_cache_del(page);
+
+ list_del(&page->list);
+ anon->mapping.nrpages--;
+ ClearPageAnon(page);
+ page->mapping = NULL;
+ UnlockPage(page);
+
+ __free_page(page);
+ }
+
+ spin_unlock(&pagecache_lock);
+}
+
+/* Disassociate VMA from the vm_anon attached to it. */
+void anon_put(struct vm_area_struct *vma)
+{
+ struct anon_area *anon = vma->vm_anon;
+
+ if (anon == NULL)
+ BUG();
+ if (atomic_read(&anon->count) < 1)
+ BUG();
+
+ anon_remove_vma(vma, anon);
+
+ if (atomic_dec_and_test(&anon->count)) {
+ if (anon->mapping.i_mmap != NULL)
+ BUG();
+ invalidate_anon_pages(anon);
+ kmem_cache_free(anon_cachep, anon);
+ } else
+ anon_trim(vma);
+
+ vma->vm_anon = NULL;
+}
+
+
+/* Forcibly delete an anon_area page. This also kills the
+ * original reference made by anon_cow.
+ */
+void anon_page_kill(struct page *page)
+{
+ spin_lock(&pagecache_lock);
+
+ if (TryLockPage(page)) {
+ spin_unlock(&pagecache_lock);
+
+ lock_page(page);
+
+ spin_lock(&pagecache_lock);
+ }
+
+ lru_cache_del(page);
+
+ page->mapping->nrpages--;
+ list_del(&page->list);
+ ClearPageAnon(page);
+ page->mapping = NULL;
+ UnlockPage(page);
+
+ put_page(page);
+ __free_page(page);
+
+ spin_unlock(&pagecache_lock);
+}
+
+static int anon_try_to_free_page(struct page *page)
+{
+ int ret = 0;
+
+ if (page_count(page) <= 1)
+ BUG();
+ if (!PageLocked(page))
+ BUG();
+
+ spin_lock(&pagecache_lock);
+ if (PageSwapCache(page)) {
+ spin_unlock(&pagecache_lock);
+ __delete_from_swap_cache(page);
+ spin_lock(&pagecache_lock);
+ }
+ if (page_count(page) == 2) {
+ struct address_space *mapping = page->mapping;
+
+ mapping->nrpages--;
+ list_del(&page->list);
+
+ ClearPageAnon(page);
+ page->mapping = NULL;
+ ret = 1;
+ }
+ spin_unlock(&pagecache_lock);
+
+ if (ret == 1)
+ __free_page(page);
+
+ return ret;
+}
+
+struct address_space_operations anon_address_space_operations = {
+ try_to_free_page: anon_try_to_free_page
+};
+
+/* SLAB constructor for anon_area structs. */
+static void anon_ctor(void *__p, kmem_cache_t *cache, unsigned long flags)
+{
+ struct anon_area *anon = __p;
+ struct address_space *mapping = &anon->mapping;
+
+ INIT_LIST_HEAD(&mapping->pages);
+ mapping->nrpages = 0;
+ mapping->a_ops = &anon_address_space_operations;
+ mapping->host = anon;
+ spin_lock_init(&mapping->i_shared_lock);
+}
+
+/* Create a new anon_area, and attach it to VMA. */
+static struct anon_area *anon_alloc(struct vm_area_struct *vma)
+{
+ struct anon_area *anon = kmem_cache_alloc(anon_cachep, GFP_KERNEL);
+
+ if (anon) {
+ struct address_space *mapping = &anon->mapping;
+
+ atomic_set(&anon->count, 1);
+ mapping->i_mmap = vma;
+ vma->vm_anon = anon;
+ vma->vm_anon_next_share = NULL;
+ vma->vm_anon_pprev_share = &mapping->i_mmap;
+ }
+
+ return anon;
+}
+
+static void anon_page_insert(struct vm_area_struct *vma, unsigned long address, struct address_space *mapping, struct page *page)
+{
+ page->index = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+ get_page(page);
+
+ spin_lock(&pagecache_lock);
+ SetPageAnon(page);
+ mapping->nrpages++;
+ list_add(&page->list, &mapping->pages);
+ page->mapping = mapping;
+ spin_unlock(&pagecache_lock);
+
+ lru_cache_add(page);
+}
+
+static __inline__ struct anon_area *get_anon(struct vm_area_struct *vma)
+{
+ struct anon_area *anon = vma->vm_anon;
+
+ if (anon == NULL)
+ anon = anon_alloc(vma);
+
+ return anon;
+}
+
+int anon_page_add(struct vm_area_struct *vma, unsigned long address, struct page *page)
+{
+ struct anon_area *anon = get_anon(vma);
+
+ if (anon) {
+ anon_page_insert(vma, address, &anon->mapping, page);
+ return 0;
+ }
+
+ return -1;
+}
+
+/*
+ * We special-case the C-O-W ZERO_PAGE, because it's such
+ * a common occurrence (no need to read the page to know
+ * that it's zero - better for the cache and memory subsystem).
+ */
+static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
+{
+ if (from == ZERO_PAGE(address)) {
+ clear_user_highpage(to, address);
+ return;
+ }
+ copy_user_highpage(to, from, address);
+}
+
+struct page *anon_cow(struct vm_area_struct *vma, unsigned long address, struct page *orig_page)
+{
+ struct anon_area *anon = get_anon(vma);
+
+ if (anon) {
+ struct page *new_page = alloc_page(GFP_HIGHUSER);
+
+ if (new_page) {
+ copy_cow_page(orig_page, new_page, address);
+ anon_page_insert(vma, address, &anon->mapping, new_page);
+ }
+
+ return new_page;
+ }
+
+ return NULL;
+}
+
+void anon_init(void)
+{
+ anon_cachep = kmem_cache_create("anon_area",
+ sizeof(struct anon_area),
+ 0, SLAB_HWCACHE_ALIGN,
+ anon_ctor, NULL);
+ if (!anon_cachep)
+ panic("anon_init: Cannot alloc anon_area cache.");
+}
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 13:11 ` David S. Miller
@ 2000-04-26 15:23 ` Stephen C. Tweedie
2000-04-26 15:25 ` David S. Miller
0 siblings, 1 reply; 26+ messages in thread
From: Stephen C. Tweedie @ 2000-04-26 15:23 UTC (permalink / raw)
To: David S. Miller
Cc: sct, sim, jgarzik, riel, andrea, linux-mm, bcrl, linux-kernel
Hi,
On Wed, Apr 26, 2000 at 06:11:15AM -0700, David S. Miller wrote:
>
> Doing it isn't the problem. Doing it efficiently is, if you have
> fork() and mremap() in the picture. With mremap(), you cannot assume
> that the virtual address of an anonymous page is the same in all
> processes which have the page mapped.
>
> Who makes that assumption?
Nobody does --- that's the point. If you _could_ make that assumption,
then looking up the vma which maps a given page in a given mm would be
easy. But because the assumption doesn't hold, you have to walk all of
the vmas.
> In my implementation there is no linear scan, only VMA's which
> can actually contain the anonymous page in question are scanned.
>
> It's called an anonymous layer, and it provides pseudo backing objects
> for VMA's which have at least one privatized anonymous page.
...
> Instead of talk, I'll show some code :-) The following is the
> anon layer I implemented for 2.3.x in my hacks.
OK --- I'm assuming you allow all of these address spaces to act as
swapper address spaces for the purpose of the swap cache? This looks
good, do you have the rest of the VM changes in a usable (testable)
state?
On fork(), I assume you just leave multiple vmas attached to the same
address space? With things like mprotect, you'll still have a list
of vmas to search for in this design, I'd think.
--Stephen
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 15:23 ` Stephen C. Tweedie
@ 2000-04-26 15:25 ` David S. Miller
2000-04-26 16:09 ` Stephen C. Tweedie
` (2 more replies)
0 siblings, 3 replies; 26+ messages in thread
From: David S. Miller @ 2000-04-26 15:25 UTC (permalink / raw)
To: sct; +Cc: sim, jgarzik, riel, andrea, linux-mm, bcrl, linux-kernel
> Instead of talk, I'll show some code :-) The following is the
> anon layer I implemented for 2.3.x in my hacks.
OK --- I'm assuming you allow all of these address spaces to act as
swapper address spaces for the purpose of the swap cache?
Essentially, this is how it works yes.
This looks good, do you have the rest of the VM changes in a usable
(testable) state?
No, this is why I haven't posted the complete patch for general
consumption. It's in an "almost works" state, very dangerous,
and I don't even try leaving single user mode when I'm testing
it :-)))
On fork(), I assume you just leave multiple vmas attached to the
same address space? With things like mprotect, you'll still have a
list of vmas to search for in this design, I'd think.
At fork, the code which copies the address space just calls
"anon_dup()" for non-NULL vma->vm_anon, to clone the anon_area in the
child's VMA. anon_dup adds a new VMA to the mapping->i_mmap list and
bumps the anon_area reference count.
Actually, come to think of it, the anon_area reference count is
superfluous, because anon->mapping.i_mmap being NULL is equivalent to
the count going to zero. Superb, I can just kill that special
anon_area structure and use "struct address_space *vm_anon;" in the
vm_area_struct.
I'll try to clean up and stabilize my changes and post a patch
in the next few days.
Later,
David S. Miller
davem@redhat.com
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 15:25 ` David S. Miller
@ 2000-04-26 16:09 ` Stephen C. Tweedie
2000-04-27 20:28 ` Simon Kirby
2000-04-27 22:32 ` Jamie Lokier
2 siblings, 0 replies; 26+ messages in thread
From: Stephen C. Tweedie @ 2000-04-26 16:09 UTC (permalink / raw)
To: David S. Miller
Cc: sct, sim, jgarzik, riel, andrea, linux-mm, bcrl, linux-kernel
Hi,
On Wed, Apr 26, 2000 at 08:25:59AM -0700, David S. Miller wrote:
>
> No, this is why I haven't posted the complete patch for general
> consumption. It's in an "almost works" state, very dangerous,
> and I don't even try leaving single user mode when I'm testing
> it :-)))
OK. You might find this useful:
ftp://ftp.uk.linux.org/pub/linux/sct/vm/mtest.c
which is a small utility I wrote while I was testing the
swap cache code. It creates a heap of memory, forks a variable
number of reader and/or writer processes to access that heap,
and touches/modifies the heap randomly from the children. It
is very good at testing the swap code for pages shared over
fork. It's what I use any time I need to push a box into swap
for VM testing.
--Stephen
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 15:25 ` David S. Miller
2000-04-26 16:09 ` Stephen C. Tweedie
@ 2000-04-27 20:28 ` Simon Kirby
2000-04-27 22:32 ` Jamie Lokier
2 siblings, 0 replies; 26+ messages in thread
From: Simon Kirby @ 2000-04-27 20:28 UTC (permalink / raw)
To: David S. Miller; +Cc: sct, jgarzik, riel, andrea, linux-mm, bcrl, linux-kernel
On Wed, Apr 26, 2000 at 08:25:59AM -0700, David S. Miller wrote:
> I'll try to clean up and stabilize my changes and post a patch
> in the next few days.
This sounds really interesting. Anything we can play with yet? :)
Simon-
[ Stormix Technologies Inc. ][ NetNation Communications Inc. ]
[ sim@stormix.com ][ sim@netnation.com ]
[ Opinions expressed are not necessarily those of my employers. ]
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 15:25 ` David S. Miller
2000-04-26 16:09 ` Stephen C. Tweedie
2000-04-27 20:28 ` Simon Kirby
@ 2000-04-27 22:32 ` Jamie Lokier
2 siblings, 0 replies; 26+ messages in thread
From: Jamie Lokier @ 2000-04-27 22:32 UTC (permalink / raw)
To: David S. Miller
Cc: sct, sim, jgarzik, riel, andrea, linux-mm, bcrl, linux-kernel
David S. Miller wrote:
> On fork(), I assume you just leave multiple vmas attached to the
> same address space? With things like mprotect, you'll still have a
> list of vmas to search for in this design, I'd think.
>
> At fork, the code which copies the address space just calls
> "anon_dup()" for non-NULL vma->vm_anon, to clone the anon_area in the
> child's VMA. anon_dup adds a new VMA to the mapping->i_mmap list and
> bumps the anon_area reference count.
Fwiw, I don't think you need separate anon-layer structures at all.
struct address_space is enough. However, you do need a (small) tree of
those.
Here's a conceptual design I came up with today. It uses only
address_space and vmas:
- Each vma points to an address_space, as does each page.
- There's an address_space for each file/shm and each new anon mapping.
- Each address_space may have a *parent* address_space.
- Each address_space has a list of all its child spaces and all the
vmas which use it directly.
- For private mappings only, the first copy-on-write for a given vma
creates a *new* address_space. All privately modified pages go in
the new address space, and of course these spaces are swappable.
The new space's parent is the old space.
- When vmas are duplicated for fork(), the new vma points to the same
address_space as the old one. If it's a private mapping, both vmas
are flagged so that the first c-o-w will generate new address_spaces.
- To get from vma to a page (e.g. to map the page), look up the page
in the vma's address_space, then in its parent if necessary etc. up
to the root of that tree.
At first this looks like it might slow down page lookup at fault time,
but it's not that bad. Now that you can get from struct page to all its
ptes, you don't in general have to unmap ptes for swapping to work.
This means you often know which address_space must contain the page,
or at least which one to check first.
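As a rough sketch, the fault-time lookup in this scheme walks from the
vma's own address_space up the parent chain (the `parent' field and the
lookup_page() helper are hypothetical; struct address_space has no such
field today):

/* Sketch of the lookup described above: check the private (child)
 * address_space first, then fall back to its parents, up to the root
 * file/shm/anon object. */
static struct page *cow_tree_lookup(struct address_space *mapping,
                                    unsigned long index)
{
    struct page *page;

    for (; mapping != NULL; mapping = mapping->parent) {
        /* hash lookup of (mapping, index), as the page cache does */
        page = lookup_page(mapping, index);
        if (page)
            return page;    /* privately modified copy, or the
                             * original further up the tree */
    }
    return NULL;    /* not resident: fault it in from backing store */
}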
A few optimisations keep the tree in shape, but they're not necessary:
- When you've modified all the pages in a private mapping, the parent
address_space is no longer required by this mapping. So cut the
tree there. That will release the parent if nobody else refers to
it.
- If a child is the only reference to its parent, they can be merged.
As with David's code, the big advantage is you can now easily find all
the page table entries for a given page. So swapping gets simpler.
Especially, the dynamics of swapping get simpler and so have fewer
instabilities.
Although it's possible to have large address_space trees, this is
unlikely. I would expect trees only 2 or 3 deep for normal cases.
There's no problem with working out the offset for anonymous mappings.
It's simply vm_pgoff == index, however the mapping was mremapped etc.
In some sense, I think this tree structure is actually the minimum you
need to traverse to find all the vmas that may currently map the page,
if you do not wish to maintain structures describing smaller regions.
The advantage over David's anon layer is that there isn't one :-)
It's also pretty close to what we have already.
Would this scheme work? Comments, please.
thanks,
-- Jamie
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 11:25 ` David S. Miller
2000-04-26 13:00 ` Stephen C. Tweedie
@ 2000-04-26 13:46 ` Rik van Riel
2000-04-26 14:33 ` David S. Miller
1 sibling, 1 reply; 26+ messages in thread
From: Rik van Riel @ 2000-04-26 13:46 UTC (permalink / raw)
To: David S. Miller; +Cc: sct, sim, jgarzik, andrea, linux-mm, bcrl, linux-kernel
On Wed, 26 Apr 2000, David S. Miller wrote:
> I have to be quite frank, and say that the FreeBSD people are
> pretty much on target when they say that our swapping and paging
> stinks, it really does.
Hehe ;)
> I am of the opinion that vmscan.c:swap_out() is one of our
> biggest problems, because it kills us in the case where a few
> processes have a pagecache page mapped, haven't accessed it in a
> long time, and swap_out doesn't unmap those pages in time for
> the LRU shrink_mmap code to fully toss it.
Please take a look at the patch I sent to the list a few
minutes ago. The "anti-hog" code, using swap_out() as a
primary mechanism for achieving its goal, seems to bring
some amazing results ... for one, memory hogs no longer
have a big performance impact on small processes.
I believe that it will be pretty much impossible to achieve
"fair" balancing with any VM code which weighs all pages the
same. And before you start crying that all pages should be
weighed the same to protect the performance of that important
memory-hogging server process: the fact that it'll be the only
process waiting for disk, and that its pages are aged better,
often makes the memory hog run faster as well! ;)
regards,
Rik
--
The Internet is not a network of computers. It is a network
of people. That is its real strength.
Wanna talk about the kernel? irc.openprojects.net / #kernelnewbies
http://www.conectiva.com/ http://www.surriel.com/
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 13:46 ` Rik van Riel
@ 2000-04-26 14:33 ` David S. Miller
2000-04-26 16:31 ` Andi Kleen
0 siblings, 1 reply; 26+ messages in thread
From: David S. Miller @ 2000-04-26 14:33 UTC (permalink / raw)
To: riel; +Cc: sct, sim, jgarzik, andrea, linux-mm, bcrl, linux-kernel
> > I am of the opinion that vmscan.c:swap_out() is one of our
> > biggest problems, because it kills us in the case where a few
> > processes have a pagecache page mapped, haven't accessed it in a
> > long time, and swap_out doesn't unmap those pages in time for
> > the LRU shrink_mmap code to fully toss it.
> Please take a look at the patch I sent to the list a few
> minutes ago. The "anti-hog" code, using swap_out() as a
> primary mechanism for achieving its goal, seems to bring
> some amazing results ... for one, memory hogs no longer
> have a big performance impact on small processes.
It's a nice change considering we are so close to 2.4.x
but long term I still contend that swap_out is a hack that
should die soon.
> I believe that it will be pretty much impossible to achieve
> "fair" balancing with any VM code which weighs all pages the
> same. And before you start crying that all pages should be
> weighed the same to protect the performance of that important
> memory-hogging server process: the fact that it'll be the only
> process waiting for disk, and that its pages are aged better,
> often makes the memory hog run faster as well! ;)
Let's start at square one. I have never suggested that we weigh
all pages the same. Global aging of all pages, on the other hand,
is something completely different. It doesn't treat all pages the
same; it treats them all differently based upon how people are using
the page.
Inactive pages are inactive pages are inactive pages, regardless of
who has used them or what they are being used specifically for. Let
me give a rough outline of what kind of paging algorithm I am
suggesting:
check_page_references(page)
{
	if (TestAndClearReferenced(page) ||
	    (page->mapping->i_mmap &&
	     test_and_clear_pgtable_references(page)))
		return 1;
	return 0;
}

populate_inactive_list(long goal_pages)
{
	for each active lru page {
		if (!check_page_references(page)) {
			add_to_inactive_lru(page)
			if (--goal_pages <= 0)
				break;
		} else {
			add page back to head of active lru
		}
	}
}

free_inactive_pages(long goal_pages)
{
	for each inactive lru page {
		if (check_page_references(page)) {
			add page back to active lru
		} else if (page_dirty_somewhere(page)) {
			add page to head of dirty lru
		} else {
			if (page->buffers) {
				... deal with them just like current code ...
			}
			mapping = page->mapping;
			if (!mapping->a_ops->try_to_free_page(page)) {
				add page to head of inactive lru
			} else {
				if (--goal_pages <= 0)
					break;
			}
		}
	}
}

swap_out_dirty_pages(long goal_pages)
{
	for each dirty lru page {
		try to swap it out, you get the idea
	}
}
Some salient points about what is missing in this suggested
infrastructure:
1) There is no metric mentioned for handling pages that reactivate
themselves often (i.e. get referenced while they are on the
inactive list); one is certainly needed.
A simple scheme would be a counter in the page struct which we
increment (up to some MAX value) when the page gets moved back
to the active list from the inactive list. Then the inactive list
population decrements this counter when it finds the page
unreferenced, and only if the counter comes down to zero does it
actually move the page to the inactive list.
Some more heuristics could be added to this simple scheme, such
as adding to this counter the number of references seen at a
reactivation event (a rough sketch of the basic counter follows
after point 2 below).
2) There are no methods mentioned to control when we actually do
the page table walks, or when to delete the user mappings for an
inactive page to get the reference counts down to just the mapping
owning it, etc. These sorts of heuristics would be needed for a
well-tuned implementation.
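To make point 1 concrete, here is a loose sketch of the reactivation
counter (the reactivations field and the list helpers are invented
names, not existing kernel code):

#define MAX_REACTIVATIONS	4	/* "some MAX value" */

/* Called when a page on the inactive list is found referenced again. */
static void reactivate_page(struct page * page)
{
	if (page->reactivations < MAX_REACTIVATIONS)
		page->reactivations++;
	add_to_active_lru(page);	/* invented helper */
}

/* Called by the inactive-list populator on an unreferenced active page;
 * only pages whose counter has drained to zero actually move. */
static int may_deactivate_page(struct page * page)
{
	if (page->reactivations) {
		page->reactivations--;
		return 0;
	}
	return 1;
}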
Next, let's assume we have the above and the general try_to_free_pages
toplevel code does something like:
try_to_free_pages()
{
	goal = number_of_freepages_we_would_like_to_have -
	       nr_free_pages;
	free_inactive_pages(goal);
	populate_inactive_list(sysctl_inactive_list_goal /* or whatever */);
	if (nr_free_pages >= goal)
		return;
	goal = number_of_freepages_we_would_like_to_have -
	       nr_free_pages;
	swap_out_dirty_pages(goal);
}
[ AMAZING, astute readers will notice that all of this looks
suspiciously similar to sys/vm/vm_paging.c in the FreeBSD
sources, and this is not by accident.
Sometimes I wonder if I am the only person who went and checked
out what they were doing when the accusations went flying around
that our paging sucks. ]
You get the idea, and next we have kswapd wake up periodically to
just do populate_inactive_list() runs to keep the inactive lru
list ready to go at the onset of future paging. Of course, kswapd
does force try_to_free_pages runs when memory starts to run low, just
like it does now.
Now, what will a scheme like the above (remember, swap and
anonymous pages are in these LRU queues too) do in the memory hog
case you mentioned?
The big problem I have with the memory hog hacks is that they need
to classify _processes_ to work effectively in some set of cases,
when what we are really concerned about is classification of
_pages_. The system does this naturally by setting dirty/referenced
state on the page->flags and the ptes which map those pages.
See? The global LRU scheme dynamically figures out what page usage is
like; it doesn't need to classify processes in a certain way, because
the per-page reference and dirty state will drive the page liberation
to just do the right thing.
The anon layer I posted earlier today also allows us to provide
the strict swap reservation people cry for from time to time: since
we track all anonymous pages, we can do a "nr_swap_pages--" check
and fail if it would hit zero. The only hard part about this would
be adding a way to specify the boot-time swap device before the
first process is executed, or we could just ignore this issue and
only worry about swap space reservation once swap is actually
enabled during the init scripts.
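As a minimal sketch of that reservation (the lock and the helper names
are assumptions, not code from the anon layer patch; nr_swap_pages is
the existing count of free swap pages):

static spinlock_t swap_reserve_lock = SPIN_LOCK_UNLOCKED;	/* invented */

/* Reserve backing store for one anonymous page; fail the allocation
 * if no uncommitted swap is left. */
int reserve_anon_backing(void)
{
	int ret = -ENOMEM;

	spin_lock(&swap_reserve_lock);
	if (nr_swap_pages > 0) {
		nr_swap_pages--;
		ret = 0;
	}
	spin_unlock(&swap_reserve_lock);
	return ret;
}

/* Give the reservation back when the anonymous page (or its swap
 * slot) is freed. */
void unreserve_anon_backing(void)
{
	spin_lock(&swap_reserve_lock);
	nr_swap_pages++;
	spin_unlock(&swap_reserve_lock);
}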
Later,
David S. Miller
davem@redhat.com
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 14:33 ` David S. Miller
@ 2000-04-26 16:31 ` Andi Kleen
2000-04-26 15:28 ` David S. Miller
0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2000-04-26 16:31 UTC (permalink / raw)
To: David S. Miller; +Cc: riel, linux-mm
"David S. Miller" <davem@redhat.com> writes:
>
> See? The global LRU scheme dynamically figures out what page usage is
> like, it doesn't need to classify processes in a certain way, because
> the per-page reference and dirty state will drive the page liberation
> to just do the right thing.
But is that still fair? A memory hog could rapidly allocate and
dirty pages, killing the small innocent daemon which just needs to
get some work done.
At least the FreeBSD code I have here has a way to limit the maximum
swapout per process and to increase it based on the resident pages
rlimit. Linux with your new dancing scheme will probably need this too.
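Purely as an illustration of that kind of limit on the Linux side (this
is not how FreeBSD implements it, and the per-mm swapout_budget field
and its refill are invented for this sketch):

/* Sketch only: cap how many pages swap_out() may steal from a process
 * per pass, unless it is over its RSS rlimit. */
static int may_swap_out(struct task_struct * tsk)
{
	struct mm_struct * mm = tsk->mm;
	unsigned long rss_limit = tsk->rlim[RLIMIT_RSS].rlim_cur >> PAGE_SHIFT;

	if (mm->rss > rss_limit)
		return 1;	/* over the rlimit: always fair game */

	if (mm->swapout_budget > 0) {	/* invented per-mm budget */
		mm->swapout_budget--;
		return 1;
	}
	return 0;
}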
-Andi
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 16:31 ` Andi Kleen
@ 2000-04-26 15:28 ` David S. Miller
2000-04-26 15:41 ` Andi Kleen
0 siblings, 1 reply; 26+ messages in thread
From: David S. Miller @ 2000-04-26 15:28 UTC (permalink / raw)
To: ak; +Cc: riel, linux-mm
> But is that still fair? A memory hog could rapidly allocate and
> dirty pages, killing the small innocent daemon which just needs to
> get some work done.
If the daemon is actually doing anything, he'll reference his
pages which will cause us to not liberate them. If he's not doing
anything, why should we keep his pages around?
Later,
David S. Miller
davem@redhat.com
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH] 2.3.99-pre6-3+ VM rebalancing
2000-04-26 15:28 ` David S. Miller
@ 2000-04-26 15:41 ` Andi Kleen
0 siblings, 0 replies; 26+ messages in thread
From: Andi Kleen @ 2000-04-26 15:41 UTC (permalink / raw)
To: David S. Miller; +Cc: ak, riel, linux-mm
On Wed, Apr 26, 2000 at 08:28:21AM -0700, David S. Miller wrote:
> From: Andi Kleen <ak@suse.de>
> Date: 26 Apr 2000 18:31:50 +0200
>
> But is that still fair? A memory hog could rapidly allocate and
> dirty pages, killing the small innocent daemon which just needs to
> get some work done.
>
> If the daemon is actually doing anything, he'll reference his
> pages which will cause us to not liberate them. If he's not doing
> anything, why should we keep his pages around?
What if he isn't doing stuff quickly enough compared to a memory
hog that spends significant parts of the CPU just dirtying pages?
I imagine that the page scanning intervals will be too slow: if
you age more often you eat too much CPU [at least on Intel/SMP every
pte access is a locked transfer on the bus], and if you do it too
seldom the memory hog can easily kill the system.
-Andi
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/
^ permalink raw reply [flat|nested] 26+ messages in thread
end of thread, other threads:[~2000-04-27 22:32 UTC | newest]
Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2000-04-23 2:08 [PATCH] 2.3.99-pre6-3+ VM rebalancing Rik van Riel
2000-04-25 1:25 ` Simon Kirby
2000-04-25 15:09 ` Rik van Riel
2000-04-25 15:59 ` Andrea Arcangeli
2000-04-25 17:20 ` Rik van Riel
2000-04-25 18:36 ` Simon Kirby
2000-04-25 18:59 ` Jeff Garzik
2000-04-25 19:06 ` Simon Kirby
2000-04-25 19:34 ` Rik van Riel
2000-04-26 11:01 ` Stephen C. Tweedie
2000-04-26 11:15 ` Rik van Riel
2000-04-26 12:29 ` Stephen C. Tweedie
2000-04-26 12:45 ` David S. Miller
2000-04-26 11:25 ` David S. Miller
2000-04-26 13:00 ` Stephen C. Tweedie
2000-04-26 13:11 ` David S. Miller
2000-04-26 15:23 ` Stephen C. Tweedie
2000-04-26 15:25 ` David S. Miller
2000-04-26 16:09 ` Stephen C. Tweedie
2000-04-27 20:28 ` Simon Kirby
2000-04-27 22:32 ` Jamie Lokier
2000-04-26 13:46 ` Rik van Riel
2000-04-26 14:33 ` David S. Miller
2000-04-26 16:31 ` Andi Kleen
2000-04-26 15:28 ` David S. Miller
2000-04-26 15:41 ` Andi Kleen