Message-ID: <392AA3D5.FD6B5399@norran.net>
Date: Tue, 23 May 2000 17:29:25 +0200
From: Roger Larsson
To: Rik van Riel
Cc: linux-mm@kvack.org, Matthew Dillon
Subject: [PATCH--] Re: Linux VM/IO balancing (fwd to linux-mm?) (fwd)

From: Matthew Dillon

> The algorithm is a *modified* LRU. Let's say you decide on a weighting
> between 0 and 10. When a page is first allocated (either to the
> buffer cache or for anonymous memory) its statistical weight is
> set to the middle (5). If the page is used often the statistical
> weight slowly rises to its maximum (10). If the page remains idle
> (or was just used once) the statistical weight slowly drops to its
> minimum (0).

My patches have been approaching this for a while... [slowly...]

The currently included patch divides the LRU into four lists [0..3].
New pages are added at level 1. A scan is performed, and referenced
pages are moved up. Pages are moved down by list balancing, but I have
been playing with other ideas.

These patches should be a good continuation point. (Two small
user-space sketches of the aging ideas follow after the patch.)

Patches are against pre9-3 with Quintela's patches applied.

/RogerL

--
Home page:
  http://www.norran.net/nra02596/

[Attachment: patch-2.3.99-pre9.3-q-shrink_mmap.3]

diff -aur linux-2.3.99-pre9.3-q/include/linux/mm.h linux/include/linux/mm.h
--- linux-2.3.99-pre9.3-q/include/linux/mm.h	Fri May 12 21:16:14 2000
+++ linux/include/linux/mm.h	Mon May 22 23:30:15 2000
@@ -15,7 +15,7 @@
 extern unsigned long num_physpages;
 extern void * high_memory;
 extern int page_cluster;
-extern struct list_head lru_cache;
+extern struct list_head * new_lru_cache;
 
 #include
 #include
@@ -456,6 +456,8 @@
 extern void remove_inode_page(struct page *);
 extern unsigned long page_unuse(struct page *);
 extern int shrink_mmap(int, int);
+extern int age_mmap(void);
+extern void lru_cache_init(void);
 extern void truncate_inode_pages(struct address_space *, loff_t);
 
 /* generic vm_area_ops exported for stackable file systems */
diff -aur linux-2.3.99-pre9.3-q/include/linux/swap.h linux/include/linux/swap.h
--- linux-2.3.99-pre9.3-q/include/linux/swap.h	Fri May 12 21:16:13 2000
+++ linux/include/linux/swap.h	Mon May 22 23:30:19 2000
@@ -166,7 +166,7 @@
 #define lru_cache_add(page) \
 do { \
 	spin_lock(&pagemap_lru_lock); \
-	list_add(&(page)->lru, &lru_cache); \
+	list_add(&(page)->lru, new_lru_cache); \
 	nr_lru_pages++; \
 	spin_unlock(&pagemap_lru_lock); \
 } while (0)
diff -aur linux-2.3.99-pre9.3-q/mm/filemap.c linux/mm/filemap.c
--- linux-2.3.99-pre9.3-q/mm/filemap.c	Mon May 22 22:25:51 2000
+++ linux/mm/filemap.c	Tue May 23 16:36:41 2000
@@ -44,7 +44,10 @@
 atomic_t page_cache_size = ATOMIC_INIT(0);
 unsigned int page_hash_bits;
 struct page **page_hash_table;
-struct list_head lru_cache;
+
+#define NR_OF_LRU_LISTS 4
+static struct list_head lru_cache[NR_OF_LRU_LISTS]; // pages removed from low
+struct list_head * new_lru_cache = &lru_cache[1];   // here goes new pages
 
 static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
 /*
@@ -249,20 +252,19 @@
  * before doing sync writes. We can only do sync writes if we can
  * wait for IO (__GFP_IO set).
  */
-int shrink_mmap(int priority, int gfp_mask)
+static unsigned long shrink_mmap_referenced_moved; /* debug only? */
+
+static inline int
+ _shrink_mmap(struct list_head *lru_head,
+              int gfp_mask, int *toscan, int *toskipwait)
 {
-	int ret = 0, count, nr_dirty;
-	struct list_head * page_lru;
+	int ret = 0, count = *toscan, nr_dirty = *toskipwait;
+
+	struct list_head * page_lru = lru_head;
 	struct page * page = NULL;
-
-	count = nr_lru_pages / (priority + 1);
-	nr_dirty = priority;
-	/* we need pagemap_lru_lock for list_del() ... subtle code below */
-	spin_lock(&pagemap_lru_lock);
-	while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
+	while (count > 0 && (page_lru = page_lru->prev) != lru_head) {
 		page = list_entry(page_lru, struct page, lru);
-		list_del(page_lru);
 		count--;
 
 		if (PageTestandClearReferenced(page))
@@ -273,10 +275,12 @@
 		 * immediate tell are untouchable..
 		 */
 		if (!page->buffers && page_count(page) > 1)
-			goto dispose_continue;
+			continue;
 
+		/* Lock this lru page, reentrant
+		 * will be disposed correctly when unlocked */
 		if (TryLockPage(page))
-			goto dispose_continue;
+			continue;
 
 		/* Release the pagemap_lru lock even if the page is not yet
 		   queued in any lru queue since we have just locked down
@@ -352,26 +356,171 @@
 		spin_lock(&pagemap_lru_lock);
 		UnlockPage(page);
 		page_cache_release(page);
+		continue;
+
 dispose_continue:
-		list_add(page_lru, &lru_cache);
-	}
-	goto out;
+		/* have the pagemap_lru_lock, lru cannot change */
+		/* Move it to top of this list with referenced cleared
+		 * to give a chance for progress even when running without
+		 * kswapd...
+		 */
+		{
+			struct list_head * page_lru_to_move = page_lru;
+			page_lru = page_lru->next; /* continues with page_lru.prev */
+			list_del(page_lru_to_move);
+			list_add(page_lru_to_move, lru_head);
+			shrink_mmap_referenced_moved++;
+		}
+		continue;
 
 made_inode_progress:
-	page_cache_release(page);
+		page_cache_release(page);
 made_buffer_progress:
-	UnlockPage(page);
-	page_cache_release(page);
-	ret = 1;
+		/* like to have the lru lock before UnlockPage */
+		spin_lock(&pagemap_lru_lock);
+
+		UnlockPage(page);
+
+		/* lru manipulation needs the spin lock */
+		{
+			struct list_head * page_lru_to_free = page_lru;
+			page_lru = page_lru->next; /* continues with page_lru.prev */
+			list_del(page_lru_to_free);
+		}
+
+		/* nr_lru_pages needs the spinlock */
+		page_cache_release(page);
+		ret++;
+		nr_lru_pages--;
+
+	}
+
+	*toskipwait = nr_dirty;
+	*toscan = count;
+	return ret;
+}
+
+int shrink_mmap(int priority, int gfp_mask)
+{
+	int ret = 0, count, ix, nr_dirty;
+
+	count = nr_lru_pages >> priority;
+	nr_dirty = 1 << priority; /* magic number */
+
+	/* we need pagemap_lru_lock for list_del() ... subtle code below */
+	/* NOTE: we could have a lock per list... */
 	spin_lock(&pagemap_lru_lock);
-	/* nr_lru_pages needs the spinlock */
-	nr_lru_pages--;
-out:
+	for (ix = 0; ix < NR_OF_LRU_LISTS; ix ++)
+		ret += _shrink_mmap(&lru_cache[ix], gfp_mask, &count, &nr_dirty);
+
 	spin_unlock(&pagemap_lru_lock);
 	return ret;
 }
+
+/* called with pagemap_lru_lock locked! */
+
+struct age_pyramid
+{
+	int number_of;
+	int moved_up;
+	int moved_down;
+};
+
+static inline void
+ _age_mmap_list(struct list_head *page_lru_head,
+                struct list_head *dispose_up,
+                struct list_head *dispose_down,
+                struct age_pyramid *statistics)
+{
+	struct list_head * page_lru = page_lru_head;
+	struct list_head * dispose = NULL;
+	struct page * page;
+
+	int balance = nr_lru_pages / NR_OF_LRU_LISTS;
+
+	statistics->number_of = 0;
+	statistics->moved_up = 0;
+	statistics->moved_down = 0;
+
+	while ((page_lru = page_lru->prev) != page_lru_head) {
+		page = list_entry(page_lru, struct page, lru);
+
+		balance--;
+
+		if (PageTestandClearReferenced(page)) {
+			dispose = dispose_up;
+			statistics->moved_up++;
+		}
+		else if (balance < 0) {
+			dispose = dispose_down;
+			statistics->moved_down++;
+		}
+		else {
+			statistics->number_of++;
+		}
+
+		/* dispose the page */
+		if (dispose)
+		{
+			struct list_head * page_lru_to_move = page_lru;
+			page_lru = page_lru->next; /* continues with page_lru.prev */
+			list_del(page_lru_to_move);
+			list_add(page_lru_to_move, dispose);
+		}
+	}
+}
+
+int age_mmap(void)
+{
+	LIST_HEAD(referenced);
+
+	struct list_head * dispose_up;
+
+	int moved_pre, ix;
+	struct age_pyramid stat[NR_OF_LRU_LISTS];
+
+	spin_lock(&pagemap_lru_lock);
+
+	/* Scanned in shrink_mmap ? */
+	moved_pre = shrink_mmap_referenced_moved;
+	shrink_mmap_referenced_moved = 0;
+
+	/* Scan new list first */
+	dispose_up = &referenced;
+	for (ix = NR_OF_LRU_LISTS - 1; ix > 0; ix--) {
+		_age_mmap_list(&lru_cache[ix],
+			       dispose_up,
+			       &lru_cache[ix - 1],
+			       &stat[ix]);
+		dispose_up = &lru_cache[ix];
+	}
+	_age_mmap_list(&lru_cache[ix], dispose_up, NULL, &stat[ix]);
+
+	/* referenced pages in top list go top of lru_cache (same list) */
+	list_splice(&referenced, &lru_cache[NR_OF_LRU_LISTS - 1]);
+
+	spin_unlock(&pagemap_lru_lock);
+
+	printk(KERN_DEBUG "age_mmap: [%4d %4d<%4d>%4d %4d<%4d>%4d %4d<%4d>%4d / %6d]\n",
+	       moved_pre,
+	       stat[0].moved_down, stat[0].number_of, stat[0].moved_up,
+	       stat[1].moved_down, stat[1].number_of, stat[1].moved_up,
+	       stat[2].moved_down, stat[2].number_of, stat[2].moved_up,
+	       nr_lru_pages);
+
+	return 0; /* until it is known what to return... */
+}
+
+
+void lru_cache_init(void)
+{
+	int ix;
+	for (ix = 0; ix < NR_OF_LRU_LISTS; ix++)
+		INIT_LIST_HEAD(&lru_cache[ix]);
+}
+
 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 {
diff -aur linux-2.3.99-pre9.3-q/mm/page_alloc.c linux/mm/page_alloc.c
--- linux-2.3.99-pre9.3-q/mm/page_alloc.c	Fri May 12 20:21:20 2000
+++ linux/mm/page_alloc.c	Mon May 22 22:37:55 2000
@@ -497,7 +497,8 @@
 	freepages.min += i;
 	freepages.low += i * 2;
 	freepages.high += i * 3;
-	memlist_init(&lru_cache);
+
+	lru_cache_init();
 
 	/*
 	 * Some architectures (with lots of mem and discontinous memory
diff -aur linux-2.3.99-pre9.3-q/mm/vmscan.c linux/mm/vmscan.c
--- linux-2.3.99-pre9.3-q/mm/vmscan.c	Mon May 22 22:25:51 2000
+++ linux/mm/vmscan.c	Tue May 23 00:11:23 2000
@@ -363,7 +363,7 @@
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = (nr_threads << 2) >> (priority >> 2);
+	counter = (nr_threads << 1) >> (priority >> 1);
 	if (counter < 1)
 		counter = 1;
 
@@ -435,17 +435,15 @@
 {
 	int priority;
 	int count = FREE_COUNT;
-	int swap_count;
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
 
-	priority = 64;
+	priority = 6;
 	do {
-		while (shrink_mmap(priority, gfp_mask)) {
-			if (!--count)
-				goto done;
-		}
+		count -= shrink_mmap(priority, gfp_mask);
+		if (count <= 0)
+			goto done;
 
 		/* Try to get rid of some shared memory pages.. */
@@ -472,18 +470,20 @@
 		 * put in the swap cache), so we must not count this
 		 * as a "count" success.
 		 */
-		swap_count = SWAP_COUNT;
-		while (swap_out(priority, gfp_mask))
-			if (--swap_count < 0)
-				break;
+		{
+			int swap_count = SWAP_COUNT;
+			while (swap_out(priority, gfp_mask))
+				if (--swap_count < 0)
+					break;
+		}
 
 	} while (--priority >= 0);
 
 	/* Always end on a shrink_mmap.. */
-	while (shrink_mmap(0, gfp_mask)) {
-		if (!--count)
-			goto done;
-	}
+	count -= shrink_mmap(0, gfp_mask);
+	if (count <= 0)
+		goto done;
+
 	/* We return 1 if we are freed some page */
 	return (count != FREE_COUNT);
 
@@ -543,15 +543,52 @@
 			if (!zone->size || !zone->zone_wake_kswapd)
 				continue;
 			if (zone->free_pages < zone->pages_low)
-				something_to_do = 1;
-			do_try_to_free_pages(GFP_KSWAPD);
+				something_to_do += 100;
+
+			something_to_do += 1;
+
+			printk(KERN_DEBUG
+			       "kswapd: low memory zone: %s (%ld)\n",
+			       zone->name, zone->free_pages
+			       );
 		}
 		pgdat = pgdat->node_next;
 	} while (pgdat);
 
-	if (!something_to_do) {
+	if (something_to_do) {
+		do_try_to_free_pages(GFP_KSWAPD);
+		run_task_queue(&tq_disk);
+		printk(KERN_DEBUG "kswapd: to do %d - done\n", something_to_do);
+	}
+
+	/* Always sleep - give requestors a chance to use
+	 * freed pages. Even low prio ones - they might be
+	 * the only ones.
+	 */
+
+	/* Do not depend on kswapd.. it should be able to sleep
+	 * sleep time is possible to trim:
+	 *  - function of pages aged?
+	 *  - function of something_to_do?
+	 *  - free_pages?
+	 * right now 1s periods, wakeup possible */
+	if (something_to_do < MAX_NR_ZONES/2 + 1)
+	{
+		static long aging_time = 0L;
+		if (aging_time == 0L) {
+			aging_time = 1L*HZ;
+		}
+
 		tsk->state = TASK_INTERRUPTIBLE;
-		interruptible_sleep_on(&kswapd_wait);
+		aging_time = interruptible_sleep_on_timeout(&kswapd_wait, 1*HZ);
+
+		/* age after wakeup, slept at least one jiffie or have
+		 * been waken up - a lot might have happened.
+		 */
+		(void)age_mmap();
+	}
+	else if (tsk->need_resched) {
+		schedule();
 	}
 }
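
For anyone who wants to play with the idea outside the kernel, here is a
small stand-alone user-space sketch (illustrative only, not part of the
patch) of the 0..10 weighting scheme Matthew Dillon describes above. The
struct toy_page, the constants and the aging rule are all made up for
illustration:

/* toy_weight.c - illustrative only, not kernel code.
 * Models the description above: new pages start at weight 5, referenced
 * pages drift toward 10, idle pages drift toward 0 (reclaim candidates).
 */
#include <stdio.h>

#define WEIGHT_MIN  0
#define WEIGHT_MAX  10
#define WEIGHT_INIT 5		/* new pages start in the middle */

struct toy_page {
	int weight;		/* statistical weight, 0..10 */
	int referenced;		/* touched since the last aging pass? */
};

static void toy_age(struct toy_page *p)
{
	if (p->referenced) {
		if (p->weight < WEIGHT_MAX)
			p->weight++;	/* frequently used: drift toward 10 */
		p->referenced = 0;
	} else if (p->weight > WEIGHT_MIN) {
		p->weight--;		/* idle: drift toward 0 */
	}
}

int main(void)
{
	struct toy_page hot  = { WEIGHT_INIT, 0 };
	struct toy_page cold = { WEIGHT_INIT, 0 };
	int pass;

	for (pass = 0; pass < 8; pass++) {
		hot.referenced = 1;	/* "hot" is touched every pass */
		toy_age(&hot);
		toy_age(&cold);
		printf("pass %d: hot=%2d cold=%2d\n",
		       pass, hot.weight, cold.weight);
	}
	return 0;
}

After a few passes "hot" saturates at 10 while "cold" reaches 0, which is
exactly the separation a reclaim pass wants to see.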
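
And a second sketch, in the same spirit, of how the four lru_cache[] lists
in the patch are meant to behave: new pages enter at level 1, referenced
pages are promoted one list per aging pass, idle pages drift toward list 0,
which shrink_mmap() scans first. The unconditional demotion below is a
simplification made up for illustration; the real _age_mmap_list() only
starts demoting unreferenced pages once a list has yielded its share
(nr_lru_pages / NR_OF_LRU_LISTS):

/* toy_lru_lists.c - illustrative only, not kernel code.
 * A page's "level" stands in for which of the four lists it sits on.
 */
#include <stdio.h>

#define NR_OF_LRU_LISTS 4
#define NEW_PAGE_LEVEL  1	/* same idea as new_lru_cache = &lru_cache[1] */

struct toy_lru_page {
	int level;		/* 0 = scanned for reclaim first, 3 = safest */
	int referenced;
};

static void toy_age_pass(struct toy_lru_page *p)
{
	if (p->referenced) {
		if (p->level < NR_OF_LRU_LISTS - 1)
			p->level++;	/* referenced: promote one list */
		p->referenced = 0;
	} else if (p->level > 0) {
		p->level--;		/* idle: demote toward list 0
					 * (simplified; the patch only demotes
					 * to balance the list sizes) */
	}
}

int main(void)
{
	struct toy_lru_page used_once = { NEW_PAGE_LEVEL, 0 };
	struct toy_lru_page reused    = { NEW_PAGE_LEVEL, 0 };
	int pass;

	for (pass = 0; pass < 4; pass++) {
		reused.referenced = 1;	/* touched before every pass */
		toy_age_pass(&used_once);
		toy_age_pass(&reused);
		printf("pass %d: used_once=%d reused=%d\n",
		       pass, used_once.level, reused.level);
	}
	return 0;
}

Pages that were used only once fall back to list 0 almost immediately,
while repeatedly referenced pages climb to list 3 and stay there.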