diff -Naur linux-2.6.12.6/fs/proc/proc_misc.c linux-2.6.12.6.with.clock.pro2/fs/proc/proc_misc.c
--- linux-2.6.12.6/fs/proc/proc_misc.c	2005-08-30 00:55:27.000000000 +0800
+++ linux-2.6.12.6.with.clock.pro2/fs/proc/proc_misc.c	2005-09-12 16:44:01.000000000 +0800
@@ -125,11 +125,13 @@
 	unsigned long free;
 	unsigned long committed;
 	unsigned long allowed;
+	unsigned long active_limit;
 	struct vmalloc_info vmi;
 	long cached;
 
 	get_page_state(&ps);
 	get_zone_counts(&active, &inactive, &free);
+	active_limit = get_active_limit();
 
 /*
  * display in kilobytes.
@@ -158,6 +160,7 @@
 		"SwapCached: %8lu kB\n"
 		"Active: %8lu kB\n"
 		"Inactive: %8lu kB\n"
+		"ActiveLimit: %8lu kB\n"
 		"HighTotal: %8lu kB\n"
 		"HighFree: %8lu kB\n"
 		"LowTotal: %8lu kB\n"
@@ -181,6 +184,7 @@
 		K(total_swapcache_pages),
 		K(active),
 		K(inactive),
+		K(active_limit),
 		K(i.totalhigh),
 		K(i.freehigh),
 		K(i.totalram-i.totalhigh),
@@ -219,6 +223,20 @@
 	.release	= seq_release,
 };
 
+extern struct seq_operations refaults_op;
+static int refaults_open(struct inode *inode, struct file *file)
+{
+	(void)inode;
+	return seq_open(file, &refaults_op);
+}
+
+static struct file_operations refaults_file_operations = {
+	.open		= refaults_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 static int version_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
@@ -588,6 +606,7 @@
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
 	create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
+	create_seq_entry("refaults",S_IRUGO, &refaults_file_operations);
 	create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
 	create_seq_entry("diskstats", 0, &proc_diskstats_operations);
 #ifdef CONFIG_MODULES
diff -Naur linux-2.6.12.6/include/linux/mmzone.h linux-2.6.12.6.with.clock.pro2/include/linux/mmzone.h
--- linux-2.6.12.6/include/linux/mmzone.h	2005-08-30 00:55:27.000000000 +0800
+++ linux-2.6.12.6.with.clock.pro2/include/linux/mmzone.h	2005-09-12 16:11:21.000000000 +0800
@@ -143,7 +143,9 @@
 	unsigned long		nr_inactive;
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	int			all_unreclaimable; /* All pages pinned */
-
+	unsigned long		active_limit;
+	unsigned long		active_scanned;
+
 	/*
 	 * prev_priority holds the scanning priority for this zone.  It is
 	 * defined as the scanning priority at which we achieved our reclaim
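The proc_misc.c hunks above export the summed per-zone active_limit as a new "ActiveLimit:" line in /proc/meminfo and register a /proc/refaults file for the refault-distance histogram. As a quick illustration of the meminfo side (a userspace sketch, not part of the patch; only the field name comes from the hunk above):

	#include <stdio.h>

	int main(void)
	{
		char line[128];
		unsigned long kb;
		FILE *f = fopen("/proc/meminfo", "r");

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f)) {
			/* "ActiveLimit:" is the format string added above */
			if (sscanf(line, "ActiveLimit: %lu kB", &kb) == 1)
				printf("active limit: %lu kB\n", kb);
		}
		fclose(f);
		return 0;
	}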
diff -Naur linux-2.6.12.6/include/linux/page-flags.h linux-2.6.12.6.with.clock.pro2/include/linux/page-flags.h
--- linux-2.6.12.6/include/linux/page-flags.h	2005-08-30 00:55:27.000000000 +0800
+++ linux-2.6.12.6.with.clock.pro2/include/linux/page-flags.h	2005-09-12 16:06:53.000000000 +0800
@@ -77,6 +77,8 @@
 #define PG_nosave_free		19	/* Free, should not be written */
 #define PG_uncached		20	/* Page has been mapped as uncached */
 
+#define PG_new			21	/* Newly allocated page */
+
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
  * allowed.
@@ -306,6 +308,11 @@
 #define SetPageUncached(page)	set_bit(PG_uncached, &(page)->flags)
 #define ClearPageUncached(page)	clear_bit(PG_uncached, &(page)->flags)
 
+#define PageNew(page)		test_bit(PG_new, &(page)->flags)
+#define SetPageNew(page)	set_bit(PG_new, &(page)->flags)
+#define ClearPageNew(page)	clear_bit(PG_new, &(page)->flags)
+#define TestClearPageNew(page)	test_and_clear_bit(PG_new, &(page)->flags)
+
 struct page;	/* forward declaration */
 
 int test_clear_page_dirty(struct page *page);
diff -Naur linux-2.6.12.6/include/linux/swap.h linux-2.6.12.6.with.clock.pro2/include/linux/swap.h
--- linux-2.6.12.6/include/linux/swap.h	2005-08-30 00:55:27.000000000 +0800
+++ linux-2.6.12.6.with.clock.pro2/include/linux/swap.h	2005-09-12 16:23:00.000000000 +0800
@@ -153,6 +153,17 @@
 /* linux/mm/memory.c */
 extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
 
+/* linux/mm/nonresident.c */
+extern int do_remember_page(struct address_space *, unsigned long);
+extern int recently_evicted(struct address_space *, unsigned long);
+extern void init_nonresident(void);
+
+/* linux/mm/clockpro.c */
+extern void remember_page(struct page *, struct address_space *, unsigned long);
+extern int page_is_hot(struct page *, struct address_space *, unsigned long);
+extern unsigned long get_active_limit(void);
+DECLARE_PER_CPU(unsigned long, evicted_pages);
+
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalhigh_pages;
@@ -288,6 +299,14 @@
 #define grab_swap_token()  do { } while(0)
 #define has_swap_token(x) 0
 
+/* linux/mm/nonresident.c */
+#define init_nonresident()	do { } while (0)
+#define recently_evicted(x,y)	0
+
+/* linux/mm/clockpro.c */
+#define remember_page(x,y,z)	do { } while (0)
+#define page_is_hot(x,y,z)	0
+
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
diff -Naur linux-2.6.12.6/init/main.c linux-2.6.12.6.with.clock.pro2/init/main.c
--- linux-2.6.12.6/init/main.c	2005-08-30 00:55:27.000000000 +0800
+++ linux-2.6.12.6.with.clock.pro2/init/main.c	2005-09-12 15:53:24.000000000 +0800
@@ -47,6 +47,7 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
+#include <linux/swap.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -488,6 +489,7 @@
 	}
 #endif
 	vfs_caches_init_early();
+	init_nonresident();
 	mem_init();
 	kmem_cache_init();
 	numa_policy_init();
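Together the swap.h declarations split the work in two: mm/nonresident.c only remembers evicted (mapping, index) pairs and answers recently_evicted() queries, while mm/clockpro.c layers the CLOCK-Pro decision on top of that with remember_page() and page_is_hot(). A sketch of the intended call pattern, mirroring what the filemap.c and vmscan.c hunks further down actually do (illustrative kernel-style code, not part of the patch; the example_* names are made up):

	#include <linux/mm.h>
	#include <linux/swap.h>

	/* On eviction, record the page's identity before it leaves the page cache. */
	static void example_evict(struct page *page)
	{
		remember_page(page, page->mapping, page->index);
	}

	/* On (re)insertion, let the refault history pick the starting LRU list. */
	static void example_insert(struct page *page, struct address_space *mapping,
				   unsigned long offset)
	{
		if (page_is_hot(page, mapping, offset))
			lru_cache_add_active(page);	/* short reuse distance: start hot */
		else
			lru_cache_add(page);		/* cold or unknown: start inactive */
	}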
diff -Naur linux-2.6.12.6/mm/clockpro.c linux-2.6.12.6.with.clock.pro2/mm/clockpro.c
--- linux-2.6.12.6/mm/clockpro.c	1970-01-01 08:00:00.000000000 +0800
+++ linux-2.6.12.6.with.clock.pro2/mm/clockpro.c	2005-09-12 16:23:00.000000000 +0800
@@ -0,0 +1,113 @@
+/*
+ * mm/clockpro.c
+ * (C) 2005 Red Hat, Inc
+ * Written by Rik van Riel
+ * Released under the GPL, see the file COPYING for details.
+ *
+ * Helper functions to implement CLOCK-Pro page replacement policy.
+ * For details see: http://linux-mm.org/wiki/AdvancedPageReplacement
+ */
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/percpu.h>
+
+DEFINE_PER_CPU(unsigned long, evicted_pages);
+static unsigned long get_evicted(void)
+{
+	unsigned long total = 0;
+	int cpu;
+
+	for (cpu = first_cpu(cpu_online_map); cpu < NR_CPUS; cpu++)
+		total += per_cpu(evicted_pages, cpu);
+
+	return total;
+}
+
+static unsigned long estimate_pageable_memory(void)
+{
+	static unsigned long next_check;
+	static unsigned long total;
+	unsigned long active, inactive, free;
+
+	if (time_after(jiffies, next_check)) {
+		get_zone_counts(&active, &inactive, &free);
+		total = active + inactive + free;
+		next_check = jiffies + HZ/10;
+	}
+
+	return total;
+}
+
+static void decay_clockpro_variables(void)
+{
+	struct zone * zone;
+	int cpu;
+
+	for (cpu = first_cpu(cpu_online_map); cpu < NR_CPUS; cpu++)
+		per_cpu(evicted_pages, cpu) /= 2;
+
+	for_each_zone(zone)
+		zone->active_scanned /= 2;
+}
+
+int page_is_hot(struct page * page, struct address_space * mapping,
+		unsigned long index)
+{
+	unsigned long long distance;
+	unsigned long long evicted;
+	int refault_distance;
+	struct zone *zone;
+
+	/* Was the page recently evicted ? */
+	refault_distance = recently_evicted(mapping, index);
+	if (refault_distance < 0)
+		return 0;
+
+	distance = estimate_pageable_memory() + refault_distance;
+	evicted = get_evicted();
+	zone = page_zone(page);
+
+	/* Only consider recent history for the calculation below. */
+	if (unlikely(evicted > distance))
+		decay_clockpro_variables();
+
+	/*
+	 * Estimate whether the inter-reference distance of the tested
+	 * page is smaller than the inter-reference distance of the
+	 * oldest page on the active list.
+	 *
+	 *    distance           zone->nr_active
+	 *   ----------  <  ----------------------
+	 *    evicted        zone->active_scanned
+	 */
+	if (distance * zone->active_scanned < evicted * zone->nr_active) {
+		if (zone->active_limit > zone->present_pages / 8)
+			zone->active_limit--;
+		return 1;
+	}
+
+	/* Increase the active limit more slowly. */
+	if ((evicted & 1) && zone->active_limit < zone->present_pages * 7 / 8)
+		zone->active_limit++;
+	return 0;
+}
+
+void remember_page(struct page * page, struct address_space * mapping,
+		unsigned long index)
+{
+	struct zone * zone = page_zone(page);
+	if (do_remember_page(mapping, index) && (index & 1) &&
+	    zone->active_limit < zone->present_pages * 7 / 8)
+		zone->active_limit++;
+}
+
+unsigned long get_active_limit(void)
+{
+	unsigned long total = 0;
+	struct zone * zone;
+
+	for_each_zone(zone)
+		total += zone->active_limit;
+
+	return total;
+}
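To make the comparison in page_is_hot() concrete, here is the same test with made-up numbers (a standalone userspace illustration; all values are hypothetical):

	#include <stdio.h>

	int main(void)
	{
		/* refault distance of 3000 pages on top of ~120000 pageable pages */
		unsigned long long distance = 120000 + 3000;
		unsigned long long evicted  = 100000;	/* evictions since the last decay */
		unsigned long nr_active      = 60000;	/* pages on the zone's active list */
		unsigned long active_scanned = 30000;	/* active-list pages scanned (decayed) */

		/*
		 * distance / evicted  <  nr_active / active_scanned
		 * is evaluated cross-multiplied to avoid division:
		 * 123000 * 30000 (3.69e9)  <  100000 * 60000 (6e9)  ->  hot
		 */
		if (distance * active_scanned < evicted * nr_active)
			printf("page would be promoted to the active list\n");
		else
			printf("page would start on the inactive list\n");
		return 0;
	}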
diff -Naur linux-2.6.12.6/mm/filemap.c linux-2.6.12.6.with.clock.pro2/mm/filemap.c
--- linux-2.6.12.6/mm/filemap.c	2005-08-30 00:55:27.000000000 +0800
+++ linux-2.6.12.6.with.clock.pro2/mm/filemap.c	2005-09-12 16:13:34.000000000 +0800
@@ -383,6 +383,7 @@
 	if (!error) {
 		page_cache_get(page);
 		SetPageLocked(page);
+		SetPageNew(page);
 		page->mapping = mapping;
 		page->index = offset;
 		mapping->nrpages++;
@@ -400,8 +401,13 @@
 		pgoff_t offset, int gfp_mask)
 {
 	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0)
-		lru_cache_add(page);
+
+	if (ret == 0) {
+		if (page_is_hot(page, mapping, offset))
+			lru_cache_add_active(page);
+		else
+			lru_cache_add(page);
+	}
 	return ret;
 }
 
@@ -722,7 +728,7 @@
 	unsigned long offset;
 	unsigned long last_index;
 	unsigned long next_index;
-	unsigned long prev_index;
+//	unsigned long prev_index;
 	loff_t isize;
 	struct page *cached_page;
 	int error;
@@ -731,7 +737,7 @@
 	cached_page = NULL;
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	next_index = index;
-	prev_index = ra.prev_page;
+//	prev_index = ra.prev_page;
 	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 
@@ -782,9 +788,9 @@
 		 * When (part of) the same page is read multiple times
 		 * in succession, only mark it as accessed the first time.
 		 */
-		if (prev_index != index)
-			mark_page_accessed(page);
-		prev_index = index;
+//		if (prev_index != index)
+		mark_page_accessed(page);
+//		prev_index = index;
 
 		/*
 		 * Ok, we have the page, and it's up-to-date, so
diff -Naur linux-2.6.12.6/mm/Makefile linux-2.6.12.6.with.clock.pro2/mm/Makefile
--- linux-2.6.12.6/mm/Makefile	2005-08-30 00:55:27.000000000 +0800
+++ linux-2.6.12.6.with.clock.pro2/mm/Makefile	2005-09-12 16:12:16.000000000 +0800
@@ -12,7 +12,8 @@
 			   readahead.o slab.o swap.o truncate.o vmscan.o \
 			   prio_tree.o $(mmu-y)
 
-obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o \
+			   nonresident.o clockpro.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SHMEM) += shmem.o
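The last three filemap.c hunks drop the prev_index check, so every read that touches a page now calls mark_page_accessed() instead of only the first read in a run of accesses to the same page; that gives the replacement code one reference per access. For reference, mark_page_accessed() in 2.6.12 behaves roughly as below (paraphrased from mm/swap.c, not part of this patch):

	/*
	 * Referenced once: set PG_referenced.  Referenced again while still
	 * on the inactive list: move the page to the active list.
	 */
	void mark_page_accessed(struct page *page)
	{
		if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
			activate_page(page);
			ClearPageReferenced(page);
		} else if (!PageReferenced(page))
			SetPageReferenced(page);
	}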
truncate + * or exit, we let the pages fall out of the non-resident set through + * normal replacement. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Number of non-resident pages per hash bucket. Never smaller than 15. */ +#if (L1_CACHE_BYTES < 64) +#define NR_BUCKET_BYTES 64 +#else +#define NR_BUCKET_BYTES L1_CACHE_BYTES +#endif +#define NUM_NR ((NR_BUCKET_BYTES - sizeof(atomic_t))/sizeof(u32)) + +struct nr_bucket +{ + atomic_t hand; + u32 page[NUM_NR]; +} ____cacheline_aligned; + +/* Histogram for non-resident refault hits. [NUM_NR] means "not found". */ +DEFINE_PER_CPU(unsigned long[NUM_NR+1], refault_histogram); + +/* The non-resident page hash table. */ +static struct nr_bucket * nonres_table; +static unsigned int nonres_shift; +static unsigned int nonres_mask; + +struct nr_bucket * nr_hash(void * mapping, unsigned long index) +{ + unsigned long bucket; + unsigned long hash; + + hash = hash_ptr(mapping, BITS_PER_LONG); + hash = 37 * hash + hash_long(index, BITS_PER_LONG); + bucket = hash & nonres_mask; + + return nonres_table + bucket; +} + +static u32 nr_cookie(struct address_space * mapping, unsigned long index) +{ + unsigned long cookie = hash_ptr(mapping, BITS_PER_LONG); + cookie = 37 * cookie + hash_long(index, BITS_PER_LONG); + + if (mapping->host) { + cookie = 37 * cookie + hash_long(mapping->host->i_ino, BITS_PER_LONG); + } + + return (u32)(cookie >> (BITS_PER_LONG - 32)); +} + +int recently_evicted(struct address_space * mapping, unsigned long index) +{ + struct nr_bucket * nr_bucket; + int distance; + u32 wanted; + int i; + + prefetch(mapping->host); + nr_bucket = nr_hash(mapping, index); + + prefetch(nr_bucket); + wanted = nr_cookie(mapping, index); + + for (i = 0; i < NUM_NR; i++) { + if (nr_bucket->page[i] == wanted) { + nr_bucket->page[i] = 0; + /* Return the distance between entry and clock hand. */ + distance = atomic_read(&nr_bucket->hand) + NUM_NR - i; + distance = (distance % NUM_NR); + __get_cpu_var(refault_histogram)[distance]++; + return (distance + 1) * (1 << nonres_shift); + } + } + /* If this page was evicted, it was longer ago than our history. */ + __get_cpu_var(refault_histogram)[NUM_NR]++; + return -1; +} + +int do_remember_page(struct address_space * mapping, unsigned long index) +{ + struct nr_bucket * nr_bucket; + u32 nrpage; + int i; + + prefetch(mapping->host); + nr_bucket = nr_hash(mapping, index); + + prefetchw(nr_bucket); + nrpage = nr_cookie(mapping, index); + + /* Atomically find the next array index. */ + preempt_disable(); + retry: + i = atomic_inc_return(&nr_bucket->hand); + if (unlikely(i >= NUM_NR)) { + if (i == NUM_NR) + atomic_set(&nr_bucket->hand, -1); + goto retry; + } + preempt_enable(); + + __get_cpu_var(evicted_pages)++; + + /* Statistics may want to know whether the entry was in use. */ + return xchg(&nr_bucket->page[i], nrpage); +} + +/* + * For interactive workloads, we remember about as many non-resident pages + * as we have actual memory pages. For server workloads with large inter- + * reference distances we could benefit from remembering more. + */ +static __initdata unsigned long nonresident_factor = 1; +void __init init_nonresident(void) +{ + int target; + int i; + + /* + * Calculate the non-resident hash bucket target. Use a power of + * two for the division because alloc_large_system_hash rounds up. 
diff -Naur linux-2.6.12.6/mm/page_alloc.c linux-2.6.12.6.with.clock.pro2/mm/page_alloc.c
--- linux-2.6.12.6/mm/page_alloc.c	2005-08-30 00:55:27.000000000 +0800
+++ linux-2.6.12.6.with.clock.pro2/mm/page_alloc.c	2005-09-12 16:14:51.000000000 +0800
@@ -1713,6 +1713,8 @@
 		zone->nr_scan_inactive = 0;
 		zone->nr_active = 0;
 		zone->nr_inactive = 0;
+		zone->active_limit = zone->present_pages * 2 / 3;
+
 		if (!size)
 			continue;
 
diff -Naur linux-2.6.12.6/mm/swap_state.c linux-2.6.12.6.with.clock.pro2/mm/swap_state.c
--- linux-2.6.12.6/mm/swap_state.c	2005-08-30 00:55:27.000000000 +0800
+++ linux-2.6.12.6.with.clock.pro2/mm/swap_state.c	2005-09-12 16:17:11.000000000 +0800
@@ -323,6 +323,7 @@
 			struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *found_page, *new_page = NULL;
+	int active;
 	int err;
 
 	do {
@@ -354,12 +355,16 @@
 		 * the just freed swap entry for an existing page.
 		 * May fail (-ENOMEM) if radix-tree node allocation failed.
 		 */
+		active = page_is_hot(new_page, &swapper_space, entry.val);
 		err = add_to_swap_cache(new_page, entry);
 		if (!err) {
 			/*
 			 * Initiate read into locked page and return.
 			 */
-			lru_cache_add_active(new_page);
+			if (active) {
+				lru_cache_add_active(new_page);
+			} else
+				lru_cache_add(new_page);
 			swap_readpage(NULL, new_page);
 			return new_page;
 		}
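The page_alloc.c hunk seeds each zone's active_limit at two thirds of the zone, and the clamps in page_is_hot()/remember_page() then keep it between one eighth and seven eighths of the zone. Worked out for a hypothetical 1 GB zone of 4 KB pages (numbers invented for illustration):

	#include <stdio.h>

	int main(void)
	{
		unsigned long present_pages = 262144;			/* 1 GB / 4 KB */
		unsigned long initial = present_pages * 2 / 3;		/* 174762 pages */
		unsigned long floor   = present_pages / 8;		/* 32768 pages  */
		unsigned long ceiling = present_pages * 7 / 8;		/* 229376 pages */

		printf("active_limit starts at %lu pages\n", initial);
		printf("and is then adjusted between %lu and %lu pages\n",
		       floor, ceiling);
		return 0;
	}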
diff -Naur linux-2.6.12.6/mm/vmscan.c linux-2.6.12.6.with.clock.pro2/mm/vmscan.c
--- linux-2.6.12.6/mm/vmscan.c	2005-08-30 00:55:27.000000000 +0800
+++ linux-2.6.12.6.with.clock.pro2/mm/vmscan.c	2005-09-12 16:22:43.000000000 +0800
@@ -376,12 +376,14 @@
 	while (!list_empty(page_list)) {
 		struct address_space *mapping;
 		struct page *page;
+		struct zone *zone;
 		int may_enter_fs;
 		int referenced;
 
 		cond_resched();
 
 		page = lru_to_page(page_list);
+		zone = page_zone(page);
 		list_del(&page->lru);
 
 		if (TestSetPageLocked(page))
@@ -509,6 +511,7 @@
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
 			swp_entry_t swap = { .val = page->private };
+			remember_page(page, &swapper_space, page->private);
 			__delete_from_swap_cache(page);
 			write_unlock_irq(&mapping->tree_lock);
 			swap_free(swap);
@@ -517,6 +520,7 @@
 		}
 #endif /* CONFIG_SWAP */
 
+		remember_page(page, page->mapping, page->index);
 		__remove_from_page_cache(page);
 		write_unlock_irq(&mapping->tree_lock);
 		__put_page(page);
@@ -698,6 +702,7 @@
 	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
 				    &l_hold, &pgscanned);
 	zone->pages_scanned += pgscanned;
+	zone->active_scanned += pgscanned;
 	zone->nr_active -= pgmoved;
 	spin_unlock_irq(&zone->lru_lock);
 
@@ -813,10 +818,14 @@
 	unsigned long nr_inactive;
 
 	/*
-	 * Add one to `nr_to_scan' just to make sure that the kernel will
-	 * slowly sift through the active list.
+	 * Scan the active list if we have too many active pages.
+	 * The limit is automatically adjusted through refaults
+	 * measuring how well the VM did in the past.
 	 */
-	zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
+	if (zone->nr_active > zone->active_limit)
+		zone->nr_scan_active += zone->nr_active - zone->active_limit;
+	else if (sc->priority < DEF_PRIORITY - 2)
+		zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
 	nr_active = zone->nr_scan_active;
 	if (nr_active >= sc->swap_cluster_max)
 		zone->nr_scan_active = 0;
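With the last hunk, the active list is only scanned aggressively once it has grown past active_limit; the old "slowly sift through the active list" behaviour is kept only after the reclaim priority has risen. A worked example of the new scan goal (hypothetical numbers, DEF_PRIORITY as in 2.6.12):

	#include <stdio.h>

	#define DEF_PRIORITY 12

	int main(void)
	{
		unsigned long nr_active = 180000, active_limit = 160000;
		unsigned long nr_scan_active = 0;
		int priority = DEF_PRIORITY;	/* first pass, no memory pressure yet */

		if (nr_active > active_limit)
			nr_scan_active += nr_active - active_limit;	/* 20000 pages */
		else if (priority < DEF_PRIORITY - 2)
			nr_scan_active += (nr_active >> priority) + 1;	/* old behaviour */

		printf("scan goal for the active list: %lu pages\n", nr_scan_active);
		return 0;
	}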