--- linux-2.4.7/fs/exec.c.orig	Thu Jul 26 13:22:26 2001
+++ linux-2.4.7/fs/exec.c	Thu Jul 26 16:19:02 2001
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 #define __NO_VERSION__
 #include
 
@@ -277,8 +278,9 @@
 	flush_dcache_page(page);
 	flush_page_to_ram(page);
 	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
+	page_add_pmap(page, pte);
+	spin_unlock(&pagemap_lru_lock);
 	tsk->mm->rss++;
-	spin_unlock(&tsk->mm->page_table_lock);
 	/* no need for flush_tlb */
 	return;
--- linux-2.4.7/mm/filemap.c.orig	Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/filemap.c	Thu Jul 26 13:24:37 2001
@@ -520,7 +520,9 @@
 	page->index = index;
 	add_page_to_inode_queue(mapping, page);
 	add_page_to_hash_queue(page, page_hash(mapping, index));
-	lru_cache_add(page);
+	/* XXX: already on the inactive_dirty list ... deuglify! */
+	if (!PageSwapCache(page))
+		lru_cache_add(page);
 	spin_unlock(&pagecache_lock);
 }
--- linux-2.4.7/mm/memory.c.orig	Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/memory.c	Fri Jul 27 11:08:40 2001
@@ -76,6 +76,7 @@
 static inline void free_one_pmd(pmd_t * dir)
 {
 	pte_t * pte;
+	struct page * page;
 
 	if (pmd_none(*dir))
 		return;
@@ -86,6 +87,10 @@
 	}
 	pte = pte_offset(dir, 0);
 	pmd_clear(dir);
+	page = virt_to_page(pte);
+	/* Clear the reverse mapping stuff on this page table page. */
+	page->mapping = 0;
+	page->index = 0;
 	pte_free(pte);
 }
@@ -216,7 +221,9 @@
 			goto cont_copy_pte_range_noset;
 		if (!pte_present(pte)) {
 			swap_duplicate(pte_to_swp_entry(pte));
-			goto cont_copy_pte_range;
+			/* Swapped out, skip the pmap stuff. */
+			set_pte(dst_pte, pte);
+			goto cont_copy_pte_range_noset;
 		}
 		ptepage = pte_page(pte);
 		if ((!VALID_PAGE(ptepage)) ||
@@ -236,6 +243,7 @@
 			get_page(ptepage);
 cont_copy_pte_range:		set_pte(dst_pte, pte);
+			page_add_pmap(ptepage, dst_pte);
 cont_copy_pte_range_noset:	address += PAGE_SIZE;
 		if (address >= end)
 			goto out_unlock;
@@ -312,6 +320,8 @@
 		if (!size)
 			break;
 		page = ptep_get_and_clear(pte);
+		if (pte_present(page))
+			page_remove_pmap(pte_page(page), pte);
 		pte++;
 		size--;
 		if (pte_none(page))
@@ -849,6 +859,7 @@
  * - flush the old one
  * - update the page tables
  * - inform the TLB about the new one
+ * - update the reverse mappings (if the page changes)
  *
  * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
  */
@@ -868,7 +879,9 @@
 		copy_cow_page(old_page,new_page,address);
 		flush_page_to_ram(new_page);
 		flush_cache_page(vma, address);
+		page_remove_pmap(old_page, page_table);
 		establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
+		page_add_pmap(new_page, page_table);
 	}
 
 /*
@@ -1139,6 +1152,7 @@
 		flush_page_to_ram(page);
 	flush_icache_page(vma, page);
 	set_pte(page_table, pte);
+	page_add_pmap(page, page_table);
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, pte);
@@ -1153,14 +1167,13 @@
 static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
 {
 	pte_t entry;
+	struct page *page = ZERO_PAGE(addr);
 
 	/* Read-only mapping of ZERO_PAGE. */
 	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
 
 	/* ..except if it's a write access */
 	if (write_access) {
-		struct page *page;
-
 		/* Allocate our own private page. */
 		spin_unlock(&mm->page_table_lock);
 		page = alloc_page(GFP_HIGHUSER);
@@ -1178,6 +1191,7 @@
 	}
 
 	set_pte(page_table, entry);
+	page_add_pmap(page, page_table); /* Ignores empty_zero_page ;) */
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, addr, entry);
@@ -1240,6 +1254,7 @@
 		    !(vma->vm_flags & VM_SHARED))
 			entry = pte_wrprotect(entry);
 		set_pte(page_table, entry);
+		page_add_pmap(new_page, page_table);
 	} else {
 		/* One of our sibling threads was faster, back out. */
 		page_cache_release(new_page);
@@ -1372,6 +1387,7 @@
 pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
 	if (!pmd_present(*pmd)) {
+		struct page * page;
 		pte_t *new;
 
 		/* "fast" allocation can happen without dropping the lock.. */
@@ -1392,6 +1408,10 @@
 				goto out;
 			}
 		}
+		/* Add reverse pte mapping pointers for pmap.c. */
+		page = virt_to_page(new);
+		page->mapping = (void *)mm;
+		page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
 		pmd_populate(mm, pmd, new);
 	}
 out:
--- linux-2.4.7/mm/mremap.c.orig	Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/mremap.c	Thu Jul 26 13:24:37 2001
@@ -63,6 +63,7 @@
 	pte_t pte;
 
 	if (!pte_none(*src)) {
+		page_remove_pmap(pte_page(*src), src);
 		pte = ptep_get_and_clear(src);
 		if (!dst) {
 			/* No dest?  We must put it back. */
@@ -70,6 +71,7 @@
 			error++;
 		}
 		set_pte(dst, pte);
+		page_add_pmap(pte_page(pte), dst);
 	}
 	return error;
 }
--- linux-2.4.7/mm/page_alloc.c.orig	Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/page_alloc.c	Thu Jul 26 13:24:37 2001
@@ -87,6 +87,8 @@
 		BUG();
 	if (PageInactiveClean(page))
 		BUG();
+	if (page->pte_chain)
+		BUG();
 
 	page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
 	page->age = PAGE_AGE_START;
--- linux-2.4.7/mm/swap.c.orig	Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/swap.c	Thu Jul 26 13:24:37 2001
@@ -164,22 +164,12 @@
 void deactivate_page_nolock(struct page * page)
 {
 	/*
-	 * One for the cache, one for the extra reference the
-	 * caller has and (maybe) one for the buffers.
-	 *
-	 * This isn't perfect, but works for just about everything.
-	 * Besides, as long as we don't move unfreeable pages to the
-	 * inactive_clean list it doesn't need to be perfect...
-	 */
-	int maxcount = (page->buffers ? 3 : 2);
-	page->age = 0;
-	ClearPageReferenced(page);
-
-	/*
 	 * Don't touch it if it's not on the active list.
 	 * (some pages aren't on any list at all)
 	 */
-	if (PageActive(page) && page_count(page) <= maxcount && !page_ramdisk(page)) {
+	if (PageActive(page) && !page_ramdisk(page)) {
+		page->age = 0;
+		ClearPageReferenced(page);
 		del_page_from_active_list(page);
 		add_page_to_inactive_dirty_list(page);
 	}
@@ -266,8 +256,6 @@
  */
 void lru_cache_del(struct page * page)
 {
-	if (!PageLocked(page))
-		BUG();
 	spin_lock(&pagemap_lru_lock);
 	__lru_cache_del(page);
 	spin_unlock(&pagemap_lru_lock);
--- linux-2.4.7/mm/swapfile.c.orig	Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/swapfile.c	Thu Jul 26 13:24:37 2001
@@ -232,6 +232,7 @@
 	set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 	swap_free(entry);
 	get_page(page);
+	page_add_pmap(page, dir);
 	++vma->vm_mm->rss;
 }
--- linux-2.4.7/mm/swap_state.c.orig	Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/swap_state.c	Thu Jul 26 13:55:13 2001
@@ -165,7 +165,13 @@
 		if (exclusive_swap_page(page))
 			delete_from_swap_cache_nolock(page);
 		UnlockPage(page);
-	}
+		/*
+		 * If we are the only user and it was an anonymous page
+		 * without swap backing, remove the page from the list.
+		 * SMP/fork() safe because we hold the mm->page_table_lock.
+		 */
+	} else if (page_count(page) == 1)
+		lru_cache_del(page);
 	page_cache_release(page);
 }
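
The pte_alloc() change above is what makes the rest of the patch possible: every page
table page remembers which mm it belongs to (page->mapping) and which virtual range it
maps (page->index), so a bare pte pointer can later be turned back into an (mm, virtual
address) pair. Below is a minimal sketch of that reverse calculation, under the same
x86-style assumption flagged by the XXX comment in mm/pmap.c further down
(sizeof(pte_t) * PTRS_PER_PTE == PAGE_SIZE); the helper name is invented for illustration
and is not part of the patch.

/* Illustrative only -- mirrors the arithmetic later used by pmap_remove(). */
static unsigned long example_ptep_to_address(pte_t *ptep)
{
	struct page *pte_page = virt_to_page(ptep);	/* the page table page */
	unsigned long offset = (unsigned long)ptep & ~PAGE_MASK;

	/*
	 * Each pte_t maps PAGE_SIZE bytes of virtual memory, so the entry's
	 * index within the page table page gives the low bits of the address;
	 * page->index (set in pte_alloc() above) gives the pmd-aligned base.
	 */
	return pte_page->index + (offset / sizeof(pte_t)) * PAGE_SIZE;
}

On x86 this yields the same value as the low_address = ((unsigned long)ptep & ~PAGE_MASK)
* PTRS_PER_PTE expression in pmap_remove(), since sizeof(pte_t) * PTRS_PER_PTE equals
PAGE_SIZE there.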
+ */ + } else if (page_count(page) == 1) + lru_cache_del(page); page_cache_release(page); } --- linux-2.4.7/mm/vmscan.c.orig Thu Jul 26 13:22:09 2001 +++ linux-2.4.7/mm/vmscan.c Tue Jul 31 18:12:07 2001 @@ -27,30 +27,71 @@ #define MAX(a,b) ((a) > (b) ? (a) : (b)) /* - * The swap-out function returns 1 if it successfully - * scanned all the pages it was asked to (`count'). - * It returns zero if it couldn't do anything, + * Try_to_swap_out does nothing but unmap a page table entry to + * a page. It has 4 return values: + * SWAP_SUCCESS - we succeeded in unmapping this page table entry + * SWAP_AGAIN - we failed a try_lock, try again later + * SWAP_FAIL - we cannot swap out this page any time soon + * (mlocked page, no swap space left, ...) + * SWAP_ERROR - an error occurred * - * rss may decrease because pages are shared, but this - * doesn't count as having freed a page. + * NOTE: we have to use trylock everywhere, since our locking + * order is opposite of the page fault handler, etc. */ -/* mm->page_table_lock is held. mmap_sem is not held */ -static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page) +/* mm->page_table_lock needs to be held. mmap_sem is not held */ +int try_to_swap_out(struct mm_struct * mm, unsigned long address, pte_t * page_table, struct page *page) { pte_t pte; swp_entry_t entry; + struct vm_area_struct * vma; + int ret; - /* Don't look at this pte if it's been accessed recently. */ - if (ptep_test_and_clear_young(page_table)) { - page->age += PAGE_AGE_ADV; - if (page->age > PAGE_AGE_MAX) - page->age = PAGE_AGE_MAX; - return; + /* + * Try to get the mm->page_table_lock. Ideally we'd + * change the page fault code so we never grab the + * pagemap_lru_lock or page_table_lock while holding + * the mm->page_table_lock, but for now we trylock... + */ + if (!spin_trylock(&mm->page_table_lock)) { + ret = SWAP_AGAIN; + goto out_unlock; + } + + /* + * First, check for various error conditions. + * These should be BUG() or panic() once the pmap + * VM stabilises... + */ + ret = SWAP_ERROR; + + if (!mm || !page_table || !page) { + printk ("try_to_swap_out: called with null argument...\n"); + goto out_unlock; + } + + if (!PageLocked(page)) { + printk("try_to_swap_out: page not locked!\n"); + goto out_unlock; + } + + vma = find_vma(mm, address); + if (!vma) { + printk ("try_to_swap_out: page not in a VMA?!\n"); + goto out_unlock; + } + + /* + * OK, no errors occurred. There are still various + * opportunities for failure, though ... + */ + ret = SWAP_FAIL; + + /* mlock()ed VMA */ + if (vma->vm_flags & VM_LOCKED) { + goto out_unlock; } - if (TryLockPage(page)) - return; /* From this point on, the odds are that we're going to * nuke this pte, so read and clear the pte. This hook @@ -74,11 +115,9 @@ set_pte(page_table, swp_entry_to_pte(entry)); drop_pte: mm->rss--; - if (!page->age) - deactivate_page(page); - UnlockPage(page); page_cache_release(page); - return; + ret = SWAP_SUCCESS; + goto out_unlock; } /* @@ -120,206 +159,18 @@ goto out_unlock_restore; /* No swap space left */ /* Add it to the swap cache and mark it dirty */ + /* XXX: SMP deadlock due to pagecache_lock / pagemap_lru_lock order */ add_to_swap_cache(page, entry); set_page_dirty(page); goto set_swap_pte; out_unlock_restore: set_pte(page_table, pte); - UnlockPage(page); - return; -} - -/* mm->page_table_lock is held. 
-/* mm->page_table_lock is held.  mmap_sem is not held */
-static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
-{
-	pte_t * pte;
-	unsigned long pmd_end;
-
-	if (pmd_none(*dir))
-		return count;
-	if (pmd_bad(*dir)) {
-		pmd_ERROR(*dir);
-		pmd_clear(dir);
-		return count;
-	}
-
-	pte = pte_offset(dir, address);
-
-	pmd_end = (address + PMD_SIZE) & PMD_MASK;
-	if (end > pmd_end)
-		end = pmd_end;
-
-	do {
-		if (pte_present(*pte)) {
-			struct page *page = pte_page(*pte);
-
-			if (VALID_PAGE(page) && !PageReserved(page)) {
-				try_to_swap_out(mm, vma, address, pte, page);
-				if (!--count)
-					break;
-			}
-		}
-		address += PAGE_SIZE;
-		pte++;
-	} while (address && (address < end));
-	mm->swap_address = address + PAGE_SIZE;
-	return count;
-}
-
-/* mm->page_table_lock is held.  mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
-{
-	pmd_t * pmd;
-	unsigned long pgd_end;
-
-	if (pgd_none(*dir))
-		return count;
-	if (pgd_bad(*dir)) {
-		pgd_ERROR(*dir);
-		pgd_clear(dir);
-		return count;
-	}
-
-	pmd = pmd_offset(dir, address);
-
-	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
-	if (pgd_end && (end > pgd_end))
-		end = pgd_end;
-
-	do {
-		count = swap_out_pmd(mm, vma, pmd, address, end, count);
-		if (!count)
-			break;
-		address = (address + PMD_SIZE) & PMD_MASK;
-		pmd++;
-	} while (address && (address < end));
-	return count;
-}
-
-/* mm->page_table_lock is held.  mmap_sem is not held */
-static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
-{
-	pgd_t *pgdir;
-	unsigned long end;
-
-	/* Don't swap out areas which are locked down */
-	if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
-		return count;
-
-	pgdir = pgd_offset(mm, address);
-
-	end = vma->vm_end;
-	if (address >= end)
-		BUG();
-	do {
-		count = swap_out_pgd(mm, vma, pgdir, address, end, count);
-		if (!count)
-			break;
-		address = (address + PGDIR_SIZE) & PGDIR_MASK;
-		pgdir++;
-	} while (address && (address < end));
-	return count;
-}
-
-/*
- * Returns non-zero if we scanned all `count' pages
- */
-static int swap_out_mm(struct mm_struct * mm, int count)
-{
-	unsigned long address;
-	struct vm_area_struct* vma;
-
-	if (!count)
-		return 1;
-	/*
-	 * Go through process' page directory.
-	 */
-
-	/*
-	 * Find the proper vm-area after freezing the vma chain
-	 * and ptes.
- */ - spin_lock(&mm->page_table_lock); - address = mm->swap_address; - vma = find_vma(mm, address); - if (vma) { - if (address < vma->vm_start) - address = vma->vm_start; - - for (;;) { - count = swap_out_vma(mm, vma, address, count); - if (!count) - goto out_unlock; - vma = vma->vm_next; - if (!vma) - break; - address = vma->vm_start; - } - } - /* Reset to 0 when we reach the end of address space */ - mm->swap_address = 0; - out_unlock: spin_unlock(&mm->page_table_lock); - return !count; -} - -#define SWAP_MM_SHIFT 4 -#define SWAP_SHIFT 5 -#define SWAP_MIN 8 - -static inline int swap_amount(struct mm_struct *mm) -{ - int nr = mm->rss >> SWAP_SHIFT; - if (nr < SWAP_MIN) { - nr = SWAP_MIN; - if (nr > mm->rss) - nr = mm->rss; - } - return nr; -} - -static void swap_out(unsigned int priority, int gfp_mask) -{ - int counter; - int retval = 0; - struct mm_struct *mm = current->mm; - - /* Always start by trying to penalize the process that is allocating memory */ - if (mm) - retval = swap_out_mm(mm, swap_amount(mm)); - - /* Then, look at the other mm's */ - counter = (mmlist_nr << SWAP_MM_SHIFT) >> priority; - do { - struct list_head *p; - - spin_lock(&mmlist_lock); - p = init_mm.mmlist.next; - if (p == &init_mm.mmlist) - goto empty; - - /* Move it to the back of the queue.. */ - list_del(p); - list_add_tail(p, &init_mm.mmlist); - mm = list_entry(p, struct mm_struct, mmlist); - - /* Make sure the mm doesn't disappear when we drop the lock.. */ - atomic_inc(&mm->mm_users); - spin_unlock(&mmlist_lock); - - /* Walk about 6% of the address space each time */ - retval |= swap_out_mm(mm, swap_amount(mm)); - mmput(mm); - } while (--counter >= 0); - return; - -empty: - spin_unlock(&mmlist_lock); + return ret; } - /** * reclaim_page - reclaims one page from the inactive_clean list * @zone: reclaim a page from this zone @@ -395,6 +246,16 @@ del_page_from_inactive_clean_list(page); UnlockPage(page); page->age = PAGE_AGE_START; + /* + * The bugs below cannot happen because other processes would + * need the pagecache_lock to find the page. When they find it, + * they need to increase the page count, which makes us move + * the page back to the active list in the code above. + * + * Thus, these checks check other code... + */ + if (page->pte_chain) + BUG(); if (page_count(page) != 1) printk("VM: reclaim_page, found page with count %d!\n", page_count(page)); @@ -454,7 +315,6 @@ /* Page is or was in use? Move it to the active list. */ if (PageReferenced(page) || page->age > 0 || - (!page->buffers && page_count(page) > 1) || page_ramdisk(page)) { del_page_from_inactive_dirty_list(page); add_page_to_active_list(page); @@ -472,6 +332,30 @@ } /* + * Try to remove all the mappings processes have to + * this page. Pages can have "hidden" users, if that + * is the case the page gets moved back to the active + * list. + * + * This test is just an optimisation to move unfreeable + * pages back to the active list and prevent us from + * doing (expensive) disk IO. The "real" test is done + * in reclaim_page() and protected by the pagetable_lock. + */ + switch (page_remove_all_pmaps(page)) { + case SWAP_AGAIN: + UnlockPage(page); + continue; + case SWAP_FAIL: + case SWAP_ERROR: + goto page_active; + case SWAP_SUCCESS: + maxcount = (page->buffers ? 2 : 1); + if (page_count(page) > maxcount) + goto page_active; + } + + /* * Dirty swap-cache page? Write it out if * last copy.. 
@@ -630,20 +514,22 @@
 }
 
 /**
- * refill_inactive_scan - scan the active list and find pages to deactivate
- * @priority: the priority at which to scan
+ * refill_inactive - scan the active list and find pages to deactivate
+ * @maxscan: the priority at which to scan (gets converted to pages)
  * @target: number of pages to deactivate, zero for background aging
  *
  * This function will scan a portion of the active list to find
  * unused pages, those pages will then be moved to the inactive list.
  */
-int refill_inactive_scan(unsigned int priority, int target)
+int refill_inactive(unsigned int maxscan, int target)
 {
 	struct list_head * page_lru;
 	struct page * page;
-	int maxscan = nr_active_pages >> priority;
-	int page_active = 0;
-	int nr_deactivated = 0;
+	int referenced, page_active, nr_deactivated;
+
+	/* Convert maxscan to the maximum number of pages to scan. */
+	maxscan = nr_active_pages >> maxscan;
+	nr_deactivated = 0;
 
 	/*
 	 * When we are background aging, we try to increase the page aging
@@ -666,23 +552,15 @@
 		}
 
 		/* Do aging on the pages. */
-		if (PageTestandClearReferenced(page)) {
-			age_page_up_nolock(page);
+		referenced = page_referenced(page);
+		if (referenced) {
+			page->age += (referenced + PAGE_AGE_ADV);
+			if (page->age > PAGE_AGE_MAX)
+				page->age = PAGE_AGE_MAX;
 			page_active = 1;
 		} else {
 			age_page_down_ageonly(page);
-			/*
-			 * Since we don't hold a reference on the page
-			 * ourselves, we have to do our test a bit more
-			 * strict then deactivate_page(). This is needed
-			 * since otherwise the system could hang shuffling
-			 * unfreeable pages from the active list to the
-			 * inactive_dirty list and back again...
-			 *
-			 * SUBTLE: we can have buffer pages with count 1.
-			 */
-			if (page->age == 0 && page_count(page) <=
-						(page->buffers ? 2 : 1)) {
+			if (page->age == 0) {
 				deactivate_page_nolock(page);
 				page_active = 0;
 			} else {
@@ -841,7 +719,7 @@
 
 static int do_try_to_free_pages(unsigned int gfp_mask, int user)
 {
-	int ret = 0;
+	int target, maxscan, ret = 0;
 
 	/*
 	 * If we're low on free pages, move pages from the
@@ -859,10 +737,15 @@
 
 	/*
 	 * If needed, we move pages from the active list
-	 * to the inactive list.
-	 */
-	if (inactive_shortage())
-		ret += refill_inactive(gfp_mask, user);
+	 * to the inactive list. Note that user processes
+	 * only scan a small part of the active list, so
+	 * that multiple page freeers at the same time
+	 * won't upset page aging.
+	 */
+	target = inactive_shortage();
+	maxscan = user ? DEF_PRIORITY : 0;
+	if (target)
+		ret += refill_inactive(maxscan, target);
 
 	/*
 	 * Reclaim unused slab cache if memory is low.
@@ -928,7 +811,7 @@
 		recalculate_vm_stats();
 
 		/* Do background page aging. */
-		refill_inactive_scan(DEF_PRIORITY, 0);
+		refill_inactive(0, 0);
 	}
 
 	run_task_queue(&tq_disk);
--- linux-2.4.7/mm/Makefile.orig	Thu Jul 26 13:22:14 2001
+++ linux-2.4.7/mm/Makefile	Thu Jul 26 13:24:37 2001
@@ -14,7 +14,7 @@
 obj-y	 := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
 	    vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
 	    page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
-	    shmem.o
+	    shmem.o pmap.o
 
 obj-$(CONFIG_HIGHMEM) += highmem.o
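
The new mm/pmap.c below keeps the per-page reverse mappings in a singly linked pte_chain
list hanging off the new page->pte_chain field. A minimal sketch of how such a chain is
walked, assuming the struct layout defined in the new file and that the caller holds the
pagemap_lru_lock; the helper name is invented for illustration.

/* Illustrative only: count how many ptes currently map this page. */
static int example_count_pmaps(struct page *page)
{
	struct pte_chain *pc;
	int nr = 0;

	/* Walk the singly linked chain; the newest mappings sit at the head. */
	for (pc = page->pte_chain; pc != NULL; pc = pc->next)
		nr++;

	return nr;
}

A singly linked list keeps the overhead at one two-pointer node (8 bytes on 32-bit) per
mapping; the TODO at the end of the patch already lists folding the single node of an
unshared page into a direct pte_t * as future work.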
+ * + * + * The functions in this file provide a physical to virtual + * reverse mapping, which is handy for NUMA memory migration, + * process migration, cache coherency on some machines and + * for easier swapout balancing. + * + * The emphasis in this implementation is on simplicity and + * low overhead. + * + * XXX: integrate the *pte* functions into this file so PPC, S/390 + * and others can override some things here and stop pretending they + * have the exact same page table functionality others have ?? + */ + +/* + * On locking: + * - to keep the change in both pte and pte_chain atomic, we surround + * the changing of the two by the pagemap_lru_lock; we might want to + * change this in the future if there turns out to be contention + * - because swapout locking order is opposite to the locking order + * used by page faults, the swapout path always uses trylock + */ +#include +#include + +#include + +#define DEBUG + +/* + * For shared pages, we have a series of pte_chain structures. + * They are a singly linked list to minimise memory overhead, + * this should not be an issue for most uses. For fork-after-exec + * we'll be in the start of the linked list and 90% of processes + * seem to be short-lived and will be in the start of the list. + * + * Turning this into a doubly-linked list with forward mapping from + * each process to the pte_chain structure could be a benifit on + * workloads where the system has lots of programs which exit after + * about an equal time, say apache with a few hundred children. + * OTOH, in those cases the child processes shouldn't be exiting all + * that often. + */ +struct pte_chain { + struct pte_chain * next; + pte_t * ptep; +}; + +static struct pte_chain * pte_chain_freelist; + +static struct pte_chain * pte_chain_alloc(void); +static void pte_chain_free(struct pte_chain *, struct pte_chain *, struct page *); + +/* + * Quick test_and_clear referenced for all mappings to a page. + * + * The caller needs to hold the pagemap_lru_lock. + */ +int page_referenced(struct page * page) +{ + struct pte_chain * pte_chain = page->pte_chain; + int referenced = 0; + + if (PageReferenced(page)) + referenced++; + + while (pte_chain) { + if (ptep_test_and_clear_young(pte_chain->ptep)) + referenced++; + pte_chain = pte_chain->next; + } + + return referenced; +} + +/* + * Add a new pte reverse mapping to a page. New pages get added + * to the pageout lists, reserved and nonexistant pages (eg. + * mmaped devices) get skipped. In order to prevent races with + * the pageout code this function should only get called after + * the page table entry has been set up. + * + * The caller needs to hold the mm->page_table_lock. 
+ */ +void page_add_pmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pte_chain; + struct page * pte_page = virt_to_page(ptep); +#ifdef DEBUG + struct mm_struct * mm = (void *) pte_page->mapping; + + if (!mm) + BUG(); +#endif + + if (!page || !ptep) + BUG(); + + if (!pte_present(*ptep)) + return; + + if (!VALID_PAGE(page) || PageReserved(page)) + return; + + spin_lock(&pagemap_lru_lock); +#ifdef DEBUG + pte_chain = page->pte_chain; + while (pte_chain) { + if (pte_chain->ptep == ptep) { + printk("page_add_pmap: pmap for this (page, *pte) already present!\n"); + BUG(); + } + pte_chain = pte_chain->next; + } +#endif + pte_chain = pte_chain_alloc(); + + pte_chain->ptep = ptep; + + pte_chain->next = page->pte_chain; + page->pte_chain = pte_chain; + + /* + * We can get called with new pages, which are not on any of + * the pageout lists yet, in that case we add the page here. + */ + if ((page->flags & ((1<page_table_lock + */ +int page_remove_pmap(struct page * page, pte_t * ptep) +{ + struct pte_chain * pte_chain; + struct pte_chain * prev_pte_chain = NULL; + int ret = SWAP_ERROR; + + if (!page || !ptep) + BUG(); + + if (!VALID_PAGE(page) || PageReserved(page)) + return SWAP_SUCCESS; + + spin_lock(&pagemap_lru_lock); + pte_chain = page->pte_chain; + while (pte_chain) { + if (pte_chain->ptep == ptep) { + pte_chain_free(pte_chain, prev_pte_chain, page); + ret = SWAP_SUCCESS; + goto out; + } + + prev_pte_chain = pte_chain; + pte_chain = pte_chain->next; + } + goto notfound; +out: + spin_unlock(&pagemap_lru_lock); + return ret; + +notfound: + /* Not found, should never happen. */ + printk("page_remove_pmap: pte_chain %p not present...\n", ptep); + printk("page_remove_pmap: only found: "); + pte_chain = page->pte_chain; + while (pte_chain) { + printk("%p ", pte_chain->ptep); + pte_chain = pte_chain->next; + } + printk("\n"); + panic("page_remove_pmap: giving up.\n"); +} + +/* + * Worker function for page_remove_all_pmaps(). + */ +static int pmap_remove(struct page * page, struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain) +{ + pte_t * ptep = pte_chain->ptep; + struct page * pte_page; + struct mm_struct * mm; + unsigned long address, low_address; + + /* Calculate the arguments to try_to_swap_out... */ + pte_page = virt_to_page(ptep); + /* XXX: get this right for non-x86 .. per-arch pte_addr macros? */ + low_address = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; + address = pte_page->index + low_address; + mm = (void *) pte_page->mapping; + if (!mm) { + printk("pmap_remove: NULL mm, %lx address\n", address); + BUG(); + } + + return try_to_swap_out(mm, address, ptep, page); +} + +/** + * page_remove_all_pmaps - remove all mappings to a page + * @page: the page to remove mappings from + * + * This function tries to remove all mappings to a page, it can fail + * if it missed a trylock (SWAP_AGAIN) or if it runs out of swap or + * encounters an unswappable page (SWAP_FAIL). + * + * The caller must hold both the pagemap_lru_lock and the page lock for + * the specified page. + */ +int page_remove_all_pmaps(struct page * page) +{ + struct pte_chain * pte_chain = page->pte_chain; + struct pte_chain * prev_pte_chain = NULL; + int ret = SWAP_SUCCESS; + + /* These pages should never end up on the pageout lists. */ + if (!VALID_PAGE(page) || PageReserved(page)) + BUG(); + + if (!PageLocked(page)) + BUG(); + + while (pte_chain) { + switch (pmap_remove(page, pte_chain, prev_pte_chain)) { + case SWAP_SUCCESS: + /* Free the current pte_chain ... 
+				pte_chain->ptep = NULL;
+				pte_chain_free(pte_chain, prev_pte_chain, page);
+				/* ... and start at the head of the new list. */
+				pte_chain = page->pte_chain;
+				break;
+			case SWAP_AGAIN:
+				/* Skip this pte if we missed a trylock. */
+				prev_pte_chain = pte_chain;
+				pte_chain = pte_chain->next;
+				ret = SWAP_AGAIN;
+				break;
+			case SWAP_FAIL:
+				/* Give up if the page is unswappable. */
+				return SWAP_FAIL;
+			case SWAP_ERROR:
+				/* Aieee, invalid arguments ... */
+				printk("page_remove_all_pmaps: SWAP_ERROR\n");
+				return SWAP_ERROR;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Helper function to unlink freed pte_chain structures and add
+ * them to the freelist. Protected by the pagemap_lru_lock.
+ */
+static void pte_chain_free(struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain, struct page * page)
+{
+	if (prev_pte_chain)
+		prev_pte_chain->next = pte_chain->next;
+	else if (page)
+		page->pte_chain = pte_chain->next;
+
+	pte_chain->next = pte_chain_freelist;
+	pte_chain_freelist = pte_chain;
+}
+
+/*
+ * When we cannot allocate a new pte_chain structure, we simply unmap
+ * some other page table entries in the system and use those.
+ *
+ * TODO: implementation -- Rik
+ */
+static void pte_chain_reclaim(void)
+{
+	panic("Implement pte_chain_reclaim, you lazy bastard!\n");
+}
+
+/*
+ * Allocates a pageful of new pte_chains. If the page allocation
+ * fails we simply reclaim pte_chain structures which are in use
+ * in the system. Always succeeds.
+ */
+static void alloc_new_pte_chains(void)
+{
+	struct pte_chain * pte_chain;
+	void * page = (void *) get_zeroed_page(GFP_ATOMIC);
+
+	if (page) {
+		int count = PAGE_SIZE / sizeof(struct pte_chain);
+		pte_chain = page;
+		do {
+			pte_chain_free(pte_chain, NULL, NULL);
+			pte_chain++;
+		} while (--count > 0);
+	} else {
+		/* Reclaim pte_chain structures which are in use. */
+		pte_chain_reclaim();
+	}
+}
+
+/*
+ * Grab a pte_chain off the freelist, allocating new pte_chains
+ * if necessary. We are protected by the pagemap_lru_lock.
+ */
+static struct pte_chain * pte_chain_alloc(void)
+{
+	struct pte_chain * pte_chain;
+
+	/*
+	 * If we run out of free pte_chain structures, we try to
+	 * allocate a page of memory and convert that into new
+	 * pte_chain structures.
+	 */
+	if (!pte_chain_freelist) {
+		alloc_new_pte_chains();
+	}
+
+	/* Remove pte_chain from list and return it. */
+	pte_chain = pte_chain_freelist;
+	pte_chain_freelist = pte_chain->next;
+	pte_chain->next = 0;
+
+	return pte_chain;
+}
--- linux-2.4.7/include/linux/mm.h.orig	Thu Jul 26 13:22:46 2001
+++ linux-2.4.7/include/linux/mm.h	Thu Jul 26 13:50:49 2001
@@ -130,6 +130,9 @@
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int write_access);
 };
 
+/* Incomplete declaration; pte_chain is internal to mm/pmap.c */
+struct pte_chain;
+
 /*
  * Each physical page in the system has a struct page associated with
  * it to keep track of whatever it is we are using the page for at the
@@ -157,6 +160,7 @@
 	struct list_head lru;		/* Pageout list, eg. active_list;
 					   protected by pagemap_lru_lock !! */
 	unsigned long age;		/* Page aging counter. */
+	struct pte_chain * pte_chain;	/* Reverse pte mapping pointer. */
 	wait_queue_head_t wait;		/* Page locked?  Stand in line... */
 	struct page **pprev_hash;	/* Complement to *next_hash. */
 	struct buffer_head * buffers;	/* Buffer maps us to a disk block. */
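
With the pte_chain field added to struct page above, "is any process still mapping this
page?" becomes a plain pointer test, which is what the new BUG() checks in mm/page_alloc.c
and in reclaim_page() rely on. A trivial sketch of that invariant, with a hypothetical
helper name that is not part of the patch.

/*
 * Illustrative only: by the time a page is freed or reclaimed, every
 * reverse mapping must already be gone.
 */
static inline void example_check_unmapped(struct page *page)
{
	if (page->pte_chain != NULL)
		BUG();	/* someone still has a pte pointing at this page */
}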
--- linux-2.4.7/include/linux/swap.h.orig	Thu Jul 26 13:22:50 2001
+++ linux-2.4.7/include/linux/swap.h	Thu Jul 26 13:52:48 2001
@@ -98,6 +98,18 @@
 
 struct zone_t;
 
+/* linux/mm/pmap.c */
+extern int page_referenced(struct page *);
+extern void page_add_pmap(struct page *, pte_t *);
+extern int page_remove_pmap(struct page *, pte_t *);
+extern int page_remove_all_pmaps(struct page *);
+
+/* page_remove_all_pmaps and try_to_swap_out return values */
+#define SWAP_SUCCESS	0	/* unmapped every user of the page */
+#define SWAP_AGAIN	1	/* missed a trylock, try again later */
+#define SWAP_FAIL	2	/* cannot swap this page out, reactivate */
+#define SWAP_ERROR	3	/* illegal arguments or misc error */
+
 /* linux/mm/swap.c */
 extern int memory_pressure;
 extern void age_page_up(struct page *);
@@ -116,6 +128,7 @@
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
+extern int try_to_swap_out(struct mm_struct *, unsigned long, pte_t *, struct page *);
 extern struct page * reclaim_page(zone_t *);
 extern wait_queue_head_t kswapd_wait;
 extern wait_queue_head_t kreclaimd_wait;
@@ -261,8 +274,8 @@
  */
 #define INACTIVE_SHIFT 6
 #define inactive_min(a,b) ((a) < (b) ? (a) : (b))
-#define inactive_target inactive_min((memory_pressure >> INACTIVE_SHIFT), \
-				(num_physpages / 4))
+#define inactive_target inactive_min(((memory_pressure >> INACTIVE_SHIFT), \
+				(num_physpages / 4))
 
 /*
  * Ugly ugly ugly HACK to make sure the inactive lists
--- linux-2.4.7/TODO.orig	Thu Jul 26 13:58:00 2001
+++ linux-2.4.7/TODO	Thu Jul 26 13:25:41 2001
@@ -0,0 +1,32 @@
+		reverse mapping TODO
+
+- page_add_pmap / page_remove_pmap / ... LOCAL LOCKING !
+- remove add_to_swap_cache() SMP deadlock pagemap_lru_lock/pagecache_lock
+- make pmap_remove()/pte_alloc() portable ... per-arch pte_addr macros ?
+- no pte_chain struct for unshared pages, direct pte_t *
+- implement pte_chain_reclaim()
+
+- explicit swapout clustering in try_to_swap_out()
+- defragmentation for __alloc_pages() ...
+- swap chaining ???? (swappable swap chains?) (why? ;))
+
+
+	pte / get / pmap order
+
+  -- adding a page
+1. increment page count
+2. setup page table entry
+3. add page to pmap
+
+  -- removing a page
+1. remove pmap
+2. clear page table entry
+3. decrement page count
+
+This interferes with the pageout code in only one way: if
+the pageout code catches the page without our pmap entry
+but with our incremented page count, it'll move the page
+to the active list and will not get around to swapping it
+out right now. This is a very narrow race window so the
+chances of it happening are small and impact on the system
+should be zero.
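
A sketch of the "pte / get / pmap order" described in the TODO above. The helper names
are invented for illustration; get_page(), set_pte(), pte_clear() and
page_cache_release() are the standard 2.4 primitives already used elsewhere in this
patch, and TLB flushing is omitted to keep the sketch short.

/* Illustrative only: the mapping / unmapping order the TODO describes. */
static void example_map_page(struct page *page, pte_t *ptep, pte_t pteval)
{
	get_page(page);			/* 1. increment page count */
	set_pte(ptep, pteval);		/* 2. set up the page table entry */
	page_add_pmap(page, ptep);	/* 3. add the page to the pmap */
}

static void example_unmap_page(struct page *page, pte_t *ptep)
{
	page_remove_pmap(page, ptep);	/* 1. remove the pmap entry */
	pte_clear(ptep);		/* 2. clear the page table entry */
	page_cache_release(page);	/* 3. decrement the page count */
}

Between steps 2 and 3 of example_map_page() the pageout code can observe a page whose
count is raised but whose pte_chain does not yet contain the new mapping; as the
paragraph above notes, the worst that happens is that the page is put back on the
active list for one more round.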