--- fs/proc/array.c.~1~	Sat Dec 19 00:44:22 1998
+++ fs/proc/array.c	Sat Dec 19 14:40:16 1998
@@ -60,6 +60,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -415,6 +416,28 @@
 		i.freeswap >> 10);
 }
 
+static int get_swapstats(char * buffer)
+{
+	unsigned long *w = swapstats.kswap_wakeups;
+
+	return sprintf(buffer,
+		"ProcFreeTry: %8lu\n"
+		"ProcFreeSucc: %8lu\n"
+		"ProcShrinkTry: %8lu\n"
+		"ProcShrinkSucc: %8lu\n"
+		"KswapFreeTry: %8lu\n"
+		"KswapFreeSucc: %8lu\n"
+		"KswapWakeups: %8lu %lu %lu %lu\n",
+		swapstats.gfp_freepage_attempts,
+		swapstats.gfp_freepage_successes,
+		swapstats.gfp_shrink_attempts,
+		swapstats.gfp_shrink_successes,
+		swapstats.kswap_freepage_attempts,
+		swapstats.kswap_freepage_successes,
+		w[0], w[1], w[2], w[3]
+		);
+}
+
 static int get_version(char * buffer)
 {
 	extern char *linux_banner;
@@ -1301,6 +1324,9 @@
 		case PROC_MEMINFO:
 			return get_meminfo(page);
 
+		case PROC_SWAPSTATS:
+			return get_swapstats(page);
+
 #ifdef CONFIG_PCI_OLD_PROC
 		case PROC_PCI:
 			return get_pci_list(page);
@@ -1386,7 +1412,7 @@
 static int process_unauthorized(int type, int pid)
 {
 	struct task_struct *p;
-	uid_t euid;	/* Save the euid keep the lock short */
+	uid_t euid=0;	/* Save the euid keep the lock short */
 
 	read_lock(&tasklist_lock);
--- fs/proc/root.c.~1~	Sat Dec 19 00:44:22 1998
+++ fs/proc/root.c	Sat Dec 19 13:10:27 1998
@@ -494,6 +494,11 @@
 	S_IFREG | S_IRUGO, 1, 0, 0,
 	0, &proc_array_inode_operations
 };
+static struct proc_dir_entry proc_root_swapstats = {
+	PROC_SWAPSTATS, 9, "swapstats",
+	S_IFREG | S_IRUGO, 1, 0, 0,
+	0, &proc_array_inode_operations
+};
 static struct proc_dir_entry proc_root_kmsg = {
 	PROC_KMSG, 4, "kmsg",
 	S_IFREG | S_IRUSR, 1, 0, 0,
@@ -654,6 +659,7 @@
 	proc_register(&proc_root, &proc_root_loadavg);
 	proc_register(&proc_root, &proc_root_uptime);
 	proc_register(&proc_root, &proc_root_meminfo);
+	proc_register(&proc_root, &proc_root_swapstats);
 	proc_register(&proc_root, &proc_root_kmsg);
 	proc_register(&proc_root, &proc_root_version);
 	proc_register(&proc_root, &proc_root_cpuinfo);
--- include/linux/mm.h.~1~	Fri Nov 27 12:36:29 1998
+++ include/linux/mm.h	Sat Dec 19 15:05:14 1998
@@ -11,6 +11,7 @@
 extern unsigned long max_mapnr;
 extern unsigned long num_physpages;
 extern void * high_memory;
+extern int page_cluster;
 
 #include
 #include
--- include/linux/proc_fs.h.~1~	Sat Dec 19 00:55:10 1998
+++ include/linux/proc_fs.h	Sat Dec 19 15:20:25 1998
@@ -53,7 +53,8 @@
 	PROC_STRAM,
 	PROC_SOUND,
 	PROC_MTRR, /* whether enabled or not */
-	PROC_FS
+	PROC_FS,
+	PROC_SWAPSTATS
 };
 
 enum pid_directory_inos {
--- include/linux/swap.h.~1~	Sat Dec 19 00:42:54 1998
+++ include/linux/swap.h	Sat Dec 19 13:57:47 1998
@@ -61,6 +61,15 @@
 extern unsigned long page_cache_size;
 extern int buffermem;
 
+struct swap_stats
+{
+	long proc_freepage_attempts;
+	long proc_freepage_successes;
+	long kswap_freepage_attempts;
+	long kswap_freepage_successes;
+};
+extern struct swap_stats swap_stats;
+
 /* Incomplete types for prototype declarations: */
 struct task_struct;
 struct vm_area_struct;
@@ -69,8 +78,12 @@
 /* linux/ipc/shm.c */
 extern int shm_swap (int, int);
 
+/* linux/mm/swap.c */
+extern void swap_setup (void);
+
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(unsigned int gfp_mask, int count);
+extern void try_to_shrink_cache(int);
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, unsigned long, char *, int);
@@ -87,6 +100,7 @@
 extern int add_to_swap_cache(struct page *, unsigned long);
 extern int swap_duplicate(unsigned long);
 extern int swap_check_entry(unsigned long);
+struct page * lookup_swap_cache(unsigned long);
 extern struct page * read_swap_cache_async(unsigned long, int);
 #define read_swap_cache(entry) read_swap_cache_async(entry, 1);
 extern int FASTCALL(swap_count(unsigned long));
--- include/linux/swapctl.h~	Sat Dec 19 00:55:55 1998
+++ include/linux/swapctl.h	Sat Dec 19 16:19:20 1998
@@ -22,11 +22,19 @@
 
 typedef struct swapstat_v1
 {
-	unsigned int	wakeups;
-	unsigned int	pages_reclaimed;
-	unsigned int	pages_shm;
-	unsigned int	pages_mmap;
-	unsigned int	pages_swap;
+	unsigned long	wakeups;
+	unsigned long	pages_reclaimed;
+	unsigned long	pages_shm;
+	unsigned long	pages_mmap;
+	unsigned long	pages_swap;
+
+	unsigned long	gfp_freepage_attempts;
+	unsigned long	gfp_freepage_successes;
+	unsigned long	gfp_shrink_attempts;
+	unsigned long	gfp_shrink_successes;
+	unsigned long	kswap_freepage_attempts;
+	unsigned long	kswap_freepage_successes;
+	unsigned long	kswap_wakeups[4];
 } swapstat_v1;
 typedef swapstat_v1 swapstat_t;
 extern swapstat_t swapstats;
--- include/linux/sysctl.h.~1~	Sat Dec 19 00:44:22 1998
+++ include/linux/sysctl.h	Sat Dec 19 00:45:09 1998
@@ -103,7 +103,8 @@
 	VM_BUFFERMEM=6,		/* struct: Set buffer memory thresholds */
 	VM_PAGECACHE=7,		/* struct: Set cache memory thresholds */
 	VM_PAGERDAEMON=8,	/* struct: Control kswapd behaviour */
-	VM_PGT_CACHE=9		/* struct: Set page table cache parameters */
+	VM_PGT_CACHE=9,		/* struct: Set page table cache parameters */
+	VM_PAGE_CLUSTER=10	/* int: set number of pages to swap together */
 };
--- kernel/sysctl.c.~1~	Fri Nov 27 12:36:42 1998
+++ kernel/sysctl.c	Sat Dec 19 00:45:09 1998
@@ -216,6 +216,8 @@
 	 &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
 	{VM_PGT_CACHE, "pagetable_cache",
 	 &pgt_cache_water, 2*sizeof(int), 0600, NULL, &proc_dointvec},
+	{VM_PAGE_CLUSTER, "page-cluster",
+	 &page_cluster, sizeof(int), 0600, NULL, &proc_dointvec},
 	{0}
 };
--- mm/filemap.c.~1~	Sat Dec 19 00:43:23 1998
+++ mm/filemap.c	Sat Dec 19 13:37:37 1998
@@ -200,7 +200,11 @@
 	struct page * page;
 	int count;
 
+#if 0
 	count = (limit<<1) >> (priority);
+#else
+	count = (limit<<2) >> (priority);
+#endif
 
 	page = mem_map + clock;
 	do {
@@ -212,13 +216,26 @@
 		if (shrink_one_page(page, gfp_mask))
 			return 1;
+		/*
+		 * If the page we looked at was recyclable but we didn't
+		 * reclaim it (presumably due to PG_referenced), don't
+		 * count it as scanned. This way, the more referenced
+		 * page cache pages we encounter, the more rapidly we
+		 * will age them.
+		 */
+
+#if 1
+		if (atomic_read(&page->count) != 1 ||
+		    (!page->inode && !page->buffers))
+#endif
+			count--;
 		page++;
 		clock++;
 		if (clock >= max_mapnr) {
 			clock = 0;
 			page = mem_map;
 		}
-	} while (--count >= 0);
+	} while (count >= 0);
 	return 0;
 }
 
@@ -962,7 +979,7 @@
 	struct file * file = area->vm_file;
 	struct dentry * dentry = file->f_dentry;
 	struct inode * inode = dentry->d_inode;
-	unsigned long offset;
+	unsigned long offset, reada, i;
 	struct page * page, **hash;
 	unsigned long old_page, new_page;
 
@@ -1023,7 +1040,19 @@
 		return new_page;
 
 no_cached_page:
-	new_page = __get_free_page(GFP_USER);
+	/*
+	 * Try to read in an entire cluster at once.
+	 */
+	reada = offset;
+	reada >>= PAGE_SHIFT;
+	reada = (reada / page_cluster) * page_cluster;
+	reada <<= PAGE_SHIFT;
+
+	for (i=0; i<page_cluster; i++)
[... remainder of this hunk and the start of the next are missing from the posted copy ...]
 	if (inode->i_op->readpage(file, page) != 0)
 		goto failure;
 
-	/*
-	 * Do a very limited read-ahead if appropriate
-	 */
-	if (PageLocked(page))
-		new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0);
 	goto found_page;
 
 page_locked_wait:
@@ -1625,7 +1649,7 @@
 	if (!page) {
 		if (!new)
 			goto out;
-		page_cache = get_free_page(GFP_KERNEL);
+		page_cache = get_free_page(GFP_USER);
 		if (!page_cache)
 			goto out;
 		page = mem_map + MAP_NR(page_cache);
--- mm/page_alloc.c.~1~	Fri Nov 27 12:36:42 1998
+++ mm/page_alloc.c	Sat Dec 19 15:14:23 1998
@@ -241,7 +241,17 @@
 		goto nopage;
 	}
 
-	if (freepages.min > nr_free_pages) {
+	/* Try this if you want, but it seems to result in too
+	 * much IO activity during builds, and does not
+	 * substantially reduce the number of times we invoke
+	 * kswapd. --sct */
+#if 0
+	if (nr_free_pages < freepages.high &&
+	    !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
+		try_to_shrink_cache(gfp_mask);
+#endif
+
+	if (nr_free_pages < freepages.min) {
 		int freed;
 		freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
 		/*
@@ -359,6 +369,37 @@
 	return start_mem;
 }
 
+/*
+ * Primitive swap readahead code. We simply read an aligned block of
+ * (page_cluster) entries in the swap area. This method is chosen
+ * because it doesn't cost us any seek time. We also make sure to queue
+ * the 'original' request together with the readahead ones...
+ */
+void swapin_readahead(unsigned long entry) {
+	int i;
+	struct page *new_page;
+	unsigned long offset = SWP_OFFSET(entry);
+	struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
+
+	offset = (offset/page_cluster) * page_cluster;
+
+	for (i = 0; i < page_cluster; i++) {
+		if (offset >= swapdev->max
+		    || nr_free_pages - atomic_read(&nr_async_pages) <
+			(freepages.high + freepages.low)/2)
+			return;
+		if (!swapdev->swap_map[offset] ||
+		    swapdev->swap_map[offset] == SWAP_MAP_BAD ||
+		    test_bit(offset, swapdev->swap_lockmap))
+			continue;
+		new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
+		if (new_page != NULL)
+			__free_page(new_page);
+		offset++;
+	}
+	return;
+}
+
 /*
  * The tests may look silly, but it essentially makes sure that
  * no other process did a swap-in on us just as we were waiting.
@@ -370,10 +411,12 @@
 	pte_t * page_table, unsigned long entry, int write_access)
 {
 	unsigned long page;
-	struct page *page_map;
-
-	page_map = read_swap_cache(entry);
+	struct page *page_map = lookup_swap_cache(entry);
+	if (!page_map) {
+		swapin_readahead(entry);
+		page_map = read_swap_cache(entry);
+	}
 	if (pte_val(*page_table) != entry) {
 		if (page_map)
 			free_page_and_swap_cache(page_address(page_map));
--- mm/page_io.c.~1~	Fri Nov 27 12:36:42 1998
+++ mm/page_io.c	Sat Dec 19 00:45:09 1998
@@ -60,7 +60,7 @@
 	}
 
 	/* Don't allow too many pending pages in flight.. */
-	if (atomic_read(&nr_async_pages) > SWAP_CLUSTER_MAX)
+	if (atomic_read(&nr_async_pages) > pager_daemon.swap_cluster)
 		wait = 1;
 
 	p = &swap_info[type];
--- mm/swap.c.~1~	Sat Dec 19 00:42:55 1998
+++ mm/swap.c	Sat Dec 19 12:49:51 1998
@@ -39,6 +39,9 @@
 	144	/* freepages.high */
 };
 
+/* How many pages do we try to swap or page in/out together? */
+int page_cluster = 16;	/* Default value modified in swap_setup() */
+
 /* We track the number of pages currently being asynchronously swapped
    out, so that we don't try to swap TOO many pages out at once */
 atomic_t nr_async_pages = ATOMIC_INIT(0);
@@ -61,13 +64,13 @@
 swapstat_t swapstats = {0};
 
 buffer_mem_t buffer_mem = {
-	5,	/* minimum percent buffer */
+	1,	/* minimum percent buffer */
 	10,	/* borrow percent buffer */
 	60	/* maximum percent buffer */
 };
 
 buffer_mem_t page_cache = {
-	5,	/* minimum percent page cache */
+	1,	/* minimum percent page cache */
 	15,	/* borrow percent page cache */
 	75	/* maximum */
 };
 
@@ -77,3 +80,19 @@
 	SWAP_CLUSTER_MAX,	/* minimum number of tries */
 	SWAP_CLUSTER_MAX,	/* do swap I/O in clusters of this size */
 };
+
+
+/*
+ * Perform any setup for the swap system
+ */
+
+void __init swap_setup(void)
+{
+	/* Use a smaller cluster for memory <16MB or <32MB */
+	if (num_physpages < ((16 * 1024 * 1024) >> PAGE_SHIFT))
+		page_cluster = 4;
+	else if (num_physpages < ((32 * 1024 * 1024) >> PAGE_SHIFT))
+		page_cluster = 8;
+	else
+		page_cluster = 16;
+}
--- mm/swap_state.c.~1~	Fri Nov 27 12:36:42 1998
+++ mm/swap_state.c	Sat Dec 19 13:35:07 1998
@@ -258,7 +258,7 @@
  * incremented.
  */
 
-static struct page * lookup_swap_cache(unsigned long entry)
+struct page * lookup_swap_cache(unsigned long entry)
 {
 	struct page *found;
 
@@ -305,7 +305,7 @@
 	if (found_page)
 		goto out;
 
-	new_page_addr = __get_free_page(GFP_KERNEL);
+	new_page_addr = __get_free_page(GFP_USER);
 	if (!new_page_addr)
 		goto out;	/* Out of memory */
 	new_page = mem_map + MAP_NR(new_page_addr);
--- mm/vmscan.c.~1~	Sat Dec 19 00:43:24 1998
+++ mm/vmscan.c	Sat Dec 19 14:58:49 1998
@@ -25,6 +25,11 @@
  */
 static struct task_struct * kswapd_task = NULL;
 
+/*
+ * Flag to start low-priority background kswapping
+ */
+static int kswap_default_wakeup;
+
 static void init_swap_timer(void);
 
 /*
@@ -424,21 +429,36 @@
  */
 static int do_try_to_free_page(int gfp_mask)
 {
+	static int state = 0;
 	int i=6;
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
-
-	do {
-		if (shrink_mmap(i, gfp_mask))
-			return 1;
-		if (shm_swap(i, gfp_mask))
-			return 1;
-		if (swap_out(i, gfp_mask))
-			return 1;
-		shrink_dcache_memory(i, gfp_mask);
+
+	if (current != kswapd_task)
+		if (shrink_mmap(6, gfp_mask))
+			return 1;
+
+	switch (state) {
+		do {
+		case 0:
+			if (shrink_mmap(i, gfp_mask))
+				return 1;
+			state = 1;
+		case 1:
+			if (shm_swap(i, gfp_mask))
+				return 1;
+			state = 2;
+		case 2:
+			if (swap_out(i, gfp_mask))
+				return 1;
+			state = 3;
+		case 3:
+			shrink_dcache_memory(i, gfp_mask);
+			state = 0;
 		i--;
-	} while (i >= 0);
+		} while (i >= 0);
+	}
 	return 0;
 }
@@ -453,6 +473,8 @@
 	int i;
 	char *revision="$Revision: 1.5 $", *s, *e;
 
+	swap_setup();
+
 	if ((s = strchr(revision, ':')) &&
 	    (e = strchr(s, '$')))
 		s++, i = e - s;
@@ -514,9 +536,11 @@
 		/* max one hundreth of a second */
 		end_time = jiffies + (HZ-1)/100;
 		do {
+			swapstats.kswap_freepage_attempts++;
 			if (!do_try_to_free_page(0))
 				break;
-			if (nr_free_pages > freepages.high + SWAP_CLUSTER_MAX)
+			swapstats.kswap_freepage_successes++;
+			if (nr_free_pages > freepages.high + pager_daemon.swap_cluster)
 				break;
 		} while (time_before_eq(jiffies,end_time));
 	}
@@ -544,9 +568,11 @@
 	if (!(current->flags & PF_MEMALLOC)) {
 		current->flags |= PF_MEMALLOC;
 		do {
+			swapstats.gfp_freepage_attempts++;
 			retval = do_try_to_free_page(gfp_mask);
 			if (!retval)
 				break;
+			swapstats.gfp_freepage_successes++;
 			count--;
 		} while (count > 0);
 		current->flags &= ~PF_MEMALLOC;
@@ -556,6 +582,24 @@
 }
 
 /*
+ * Try to shrink the page cache slightly, on low-priority memory
+ * allocation. If this fails, it's a hint that maybe kswapd might want
+ * to start doing something useful.
+ */
+void try_to_shrink_cache(int gfp_mask)
+{
+	int i;
+	for (i = 0; i < 16; i++) {
+		swapstats.gfp_shrink_attempts++;
+		if (shrink_mmap(6, gfp_mask))
+			swapstats.gfp_shrink_successes++;
+		else
+			kswap_default_wakeup = 1;
+	}
+}
+
+
+/*
  * Wake up kswapd according to the priority
  *	0 - no wakeup
  *	1 - wake up as a low-priority process
@@ -598,15 +642,22 @@
 	 * that we'd better give kswapd a realtime
 	 * priority.
 	 */
+	want_wakeup = 0;
 	pages = nr_free_pages;
 	if (pages < freepages.high)
-		want_wakeup = 1;
-	if (pages < freepages.low)
+		want_wakeup = kswap_default_wakeup;
+	if (pages < freepages.low) {
 		want_wakeup = 2;
+		kswap_default_wakeup = 1;
+	}
 	if (pages < freepages.min)
 		want_wakeup = 3;
-
+
+	/* If you increase the maximum want_wakeup, expand the
+	   swapstats.kswap_wakeups[] table in swapctl.h */
+	swapstats.kswap_wakeups[want_wakeup]++;
+
 	kswapd_wakeup(p,want_wakeup);
 }
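
A note on the clustering arithmetic: both swapin_readahead() and the filemap_nopage() read-ahead above round the faulting offset down to a page_cluster boundary, so a cluster always starts on a fixed grid and still covers the page that actually faulted. The following standalone program is only an illustrative sketch of that calculation, not part of the patch; it assumes 4K pages and hypothetical names (cluster_start, PAGE_SHIFT redefined locally).

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12			/* assumed 4K pages, for illustration only */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Round a byte offset down to the first page of its cluster, mirroring
 * the patch's "reada = (reada / page_cluster) * page_cluster". */
static unsigned long cluster_start(unsigned long byte_offset, int page_cluster)
{
	unsigned long pgoff = byte_offset >> PAGE_SHIFT;

	pgoff = (pgoff / page_cluster) * page_cluster;
	return pgoff << PAGE_SHIFT;
}

int main(void)
{
	/* With page_cluster = 16, a fault in page 37 starts reading at page 32. */
	unsigned long start = cluster_start(37 * PAGE_SIZE + 123, 16);

	assert(start == 32 * PAGE_SIZE);
	printf("cluster starts at page %lu\n", start >> PAGE_SHIFT);
	return 0;
}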
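
The patch also exposes two user-visible knobs: the /proc/swapstats counters registered in fs/proc/root.c, and the "page-cluster" entry in the vm sysctl table, which should appear as /proc/sys/vm/page-cluster (mode 0600, so root is needed to read or write it). The helper below is a rough userspace sketch for poking at both, not part of the patch; it just assumes procfs is mounted in the usual place. Run it as root; given a numeric argument it writes that value back as the new cluster size.

#include <stdio.h>
#include <stdlib.h>

static void dump_file(const char *path)
{
	char buf[512];
	size_t n;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		fwrite(buf, 1, n, stdout);
	fclose(f);
}

int main(int argc, char **argv)
{
	dump_file("/proc/swapstats");
	dump_file("/proc/sys/vm/page-cluster");

	/* Optional: write a new page-cluster value. */
	if (argc > 1) {
		FILE *f = fopen("/proc/sys/vm/page-cluster", "w");

		if (!f) {
			perror("/proc/sys/vm/page-cluster");
			return 1;
		}
		fprintf(f, "%d\n", atoi(argv[1]));
		fclose(f);
	}
	return 0;
}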