Date: Tue, 28 Jan 2003 14:41:04 -0600
From: Dave McCracken
To: Andrew Morton
Cc: Linux Memory Management List
Subject: [PATCH 2.5.59-mm6] Speed up task exit
Message-ID: <64880000.1043786464@baldur.austin.ibm.com>

Andrew, this builds on my first patch, which eliminated the page_table_lock
during page table cleanup on exit.

I took a good hard look at clear_page_tables and realized it spends a lot of
its time walking through empty slots looking for pte pages to free.  It
occurred to me that if we just kept a count of mapped pages and swap entries
for each pte page, we'd know right away when that pte page becomes freeable.
This patch tracks those counts and frees each pte page as soon as it's
unused, eliminating the need for clear_page_tables entirely.  Doing this
gained another 5% in my fork/exit timing tests, so the combined patch gives
me a 10% improvement in fork/exit.

Tracking the counts was the last straw in overloading struct page with pte
page info, so I created a 'struct ptpage' to use when the struct page
describes a page table page.  It's a bit of a hack, but I think it will make
things more understandable in the long run.
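To make the bookkeeping concrete before the patch itself: below is a tiny,
self-contained userspace sketch (illustrative only, not kernel code and not
part of the patch; the toy_* names are invented here) of the idea that a pte
page carrying per-page map and swap counts can be freed the instant both
counts reach zero, instead of being discovered later by a full
clear_page_tables() walk.

/*
 * Toy model of eager pte-page reclaim: each page-table page keeps a count
 * of present mappings and of swap entries, so it can be freed the moment
 * both counts drop to zero.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_ptpage {
    unsigned long mapcount;     /* present ptes in this page table page */
    unsigned long swapcount;    /* swap-entry ptes in this page table page */
};

/* Called whenever a pte in this page table page is cleared. */
static int toy_pte_cleared(struct toy_ptpage *pt, int was_swap)
{
    if (was_swap)
        pt->swapcount--;
    else
        pt->mapcount--;
    /* Eager reclaim: nothing left, free the page table page right now. */
    if (pt->mapcount == 0 && pt->swapcount == 0) {
        free(pt);
        return 1;   /* caller must also clear the pmd entry */
    }
    return 0;
}

int main(void)
{
    struct toy_ptpage *pt = calloc(1, sizeof(*pt));

    pt->mapcount = 2;   /* two present pages */
    pt->swapcount = 1;  /* one pte holds a swap entry */

    toy_pte_cleared(pt, 0);
    toy_pte_cleared(pt, 1);
    if (toy_pte_cleared(pt, 0))
        printf("pte page freed as soon as it became empty\n");
    return 0;
}

In the patch below the same two counters live in struct ptpage,
increment_rss()/decrement_rss() and increment_swapcount()/
decrement_swapcount() maintain them, and zap_pte_range() and
unmap_all_pages() free the pte page as soon as both reach zero.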
Dave McCracken

======================================================================
Dave McCracken          IBM Linux Base Kernel Team      1-512-838-3059
dmccr@us.ibm.com                                          T/L 678-3059

[Attachment: exit-2.5.59-mm6-2.diff]

--- 2.5.59-mm6/./include/asm-generic/tlb.h	2003-01-16 20:21:33.000000000 -0600
+++ 2.5.59-mm6-test/./include/asm-generic/tlb.h	2003-01-27 11:10:49.000000000 -0600
@@ -84,13 +84,6 @@
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-    int freed = tlb->freed;
-    struct mm_struct *mm = tlb->mm;
-    int rss = mm->rss;
-
-    if (rss < freed)
-        freed = rss;
-    mm->rss = rss - freed;
     tlb_flush_mmu(tlb, start, end);
 
     /* keep the page table cache within bounds */
--- 2.5.59-mm6/./include/asm-generic/rmap.h	2003-01-16 20:22:19.000000000 -0600
+++ 2.5.59-mm6-test/./include/asm-generic/rmap.h	2003-01-27 11:10:49.000000000 -0600
@@ -26,7 +26,8 @@
  */
 #include
 
-static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address)
+static inline void
+pgtable_add_rmap(struct ptpage * page, struct mm_struct * mm, unsigned long address)
 {
 #ifdef BROKEN_PPC_PTE_ALLOC_ONE
     /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */
@@ -35,30 +36,31 @@
     if (!mem_init_done)
         return;
 #endif
-    page->mapping = (void *)mm;
-    page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
+    page->mm = mm;
+    page->virtual = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
     inc_page_state(nr_page_table_pages);
 }
 
-static inline void pgtable_remove_rmap(struct page * page)
+static inline void
+pgtable_remove_rmap(struct ptpage * page)
 {
-    page->mapping = NULL;
-    page->index = 0;
+    page->mm = NULL;
+    page->virtual = 0;
     dec_page_state(nr_page_table_pages);
 }
 
 static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
 {
-    struct page * page = kmap_atomic_to_page(ptep);
-    return (struct mm_struct *) page->mapping;
+    struct ptpage * page = (struct ptpage *)kmap_atomic_to_page(ptep);
+    return page->mm;
 }
 
 static inline unsigned long ptep_to_address(pte_t * ptep)
 {
-    struct page * page = kmap_atomic_to_page(ptep);
+    struct ptpage * page = (struct ptpage *)kmap_atomic_to_page(ptep);
     unsigned long low_bits;
     low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE;
-    return page->index + low_bits;
+    return page->virtual + low_bits;
 }
 
 #if CONFIG_HIGHPTE
--- 2.5.59-mm6/./include/linux/mm.h	2003-01-27 11:01:12.000000000 -0600
+++ 2.5.59-mm6-test/./include/linux/mm.h	2003-01-28 10:35:07.000000000 -0600
@@ -196,6 +196,16 @@
  */
 #include
 
+struct ptpage {
+    unsigned long flags;        /* atomic flags, some possibly
+                                   updated asynchronously */
+    atomic_t count;             /* Usage count, see below. */
+    struct mm_struct *mm;       /* mm_struct this page belongs to */
+    unsigned long virtual;      /* virtual address this page maps */
+    unsigned long mapcount;     /* Number of pages mapped to this page */
+    unsigned long swapcount;    /* Number of swap pages in this page */
+};
+
 /*
  * Methods to modify the page usage count.
 *
@@ -365,6 +375,11 @@
 void shmem_lock(struct file * file, int lock);
 int shmem_zero_setup(struct vm_area_struct *);
 
+void increment_rss(struct ptpage *ptpage);
+void decrement_rss(struct ptpage *ptpage);
+void increment_swapcount(struct ptpage *ptpage);
+void decrement_swapcount(struct ptpage *ptpage);
+
 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
             unsigned long size);
 int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
@@ -372,7 +387,6 @@
         unsigned long end_addr, unsigned long *nr_accounted);
 void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
             unsigned long address, unsigned long size);
-void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
             struct vm_area_struct *vma);
 int remap_page_range(struct vm_area_struct *vma, unsigned long from,
--- 2.5.59-mm6/./include/asm-i386/pgalloc.h	2003-01-27 11:01:11.000000000 -0600
+++ 2.5.59-mm6-test/./include/asm-i386/pgalloc.h	2003-01-27 11:10:49.000000000 -0600
@@ -10,10 +10,10 @@
 #define pmd_populate_kernel(mm, pmd, pte) \
     set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
 
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct ptpage *pte)
 {
     set_pmd(pmd, __pmd(_PAGE_TABLE +
-        ((unsigned long long)page_to_pfn(pte) <<
+        ((unsigned long long)page_to_pfn((struct page *)pte) <<
             (unsigned long long) PAGE_SHIFT)));
 }
 /*
@@ -24,20 +24,20 @@
 void pgd_free(pgd_t *pgd);
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
-struct page *pte_alloc_one(struct mm_struct *, unsigned long);
+struct ptpage *pte_alloc_one(struct mm_struct *, unsigned long);
 
 static inline void pte_free_kernel(pte_t *pte)
 {
     free_page((unsigned long)pte);
 }
 
-static inline void pte_free(struct page *pte)
+static inline void pte_free(struct ptpage *pte)
 {
-    __free_page(pte);
+    __free_page((struct page *)pte);
 }
 
 
-#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte))
+#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),((struct page *)pte))
 
 /*
  * allocating and freeing a pmd is trivial: the 1-entry pmd is
--- 2.5.59-mm6/./include/asm-i386/pgtable.h	2003-01-27 11:01:11.000000000 -0600
+++ 2.5.59-mm6-test/./include/asm-i386/pgtable.h	2003-01-27 11:10:49.000000000 -0600
@@ -229,6 +229,8 @@
 #define pmd_page(pmd)	(pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
 #endif /* !CONFIG_DISCONTIGMEM */
 
+#define pmd_ptpage(pmd) ((struct ptpage *)pmd_page(pmd))
+
 #define pmd_large(pmd) \
     ((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
 
--- 2.5.59-mm6/./arch/i386/mm/pgtable.c	2003-01-27 11:01:08.000000000 -0600
+++ 2.5.59-mm6-test/./arch/i386/mm/pgtable.c	2003-01-27 11:10:49.000000000 -0600
@@ -145,24 +145,26 @@
     return pte;
 }
 
-struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+struct ptpage *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
     int count = 0;
-    struct page *pte;
+    struct ptpage *pte;
 
     do {
 #if CONFIG_HIGHPTE
-        pte = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0);
+        pte = (struct ptpage *)alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0);
 #else
-        pte = alloc_pages(GFP_KERNEL, 0);
+        pte = (struct ptpage *)alloc_pages(GFP_KERNEL, 0);
 #endif
-        if (pte)
-            clear_highpage(pte);
-        else {
+        if (pte) {
+            clear_highpage((struct page *)pte);
+            pte->mapcount = pte->swapcount = 0;
+            break;
+        } else {
             current->state = TASK_UNINTERRUPTIBLE;
             schedule_timeout(HZ);
         }
-    } while (!pte && (count++ < 10));
+    } while (count++ < 10);
     return pte;
 }
 
--- 2.5.59-mm6/./fs/exec.c	2003-01-27 11:01:10.000000000 -0600
+++ 2.5.59-mm6-test/./fs/exec.c	2003-01-28 10:44:04.000000000 -0600
@@ -317,7 +317,7 @@
     set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
     pte_chain = page_add_rmap(page, pte, pte_chain);
     pte_unmap(pte);
-    tsk->mm->rss++;
+    increment_rss(pmd_ptpage(*pmd));
     spin_unlock(&tsk->mm->page_table_lock);
 
     /* no need for flush_tlb */
--- 2.5.59-mm6/./mm/fremap.c	2003-01-16 20:21:34.000000000 -0600
+++ 2.5.59-mm6-test/./mm/fremap.c	2003-01-28 10:54:01.000000000 -0600
@@ -19,9 +19,11 @@
 static inline void zap_pte(struct mm_struct *mm, pte_t *ptep)
 {
     pte_t pte = *ptep;
+    struct ptpage *ptpage;
 
     if (pte_none(pte))
         return;
+    ptpage = (struct ptpage *)kmap_atomic_to_page((void *)ptep);
     if (pte_present(pte)) {
         unsigned long pfn = pte_pfn(pte);
 
@@ -33,12 +35,13 @@
                     set_page_dirty(page);
                 page_remove_rmap(page, ptep);
                 page_cache_release(page);
-                mm->rss--;
+                decrement_rss(ptpage);
             }
         }
     } else {
         free_swap_and_cache(pte_to_swp_entry(pte));
         pte_clear(ptep);
+        decrement_swapcount(ptpage);
     }
 }
 
@@ -69,7 +72,6 @@
 
     zap_pte(mm, pte);
 
-    mm->rss++;
     flush_page_to_ram(page);
     flush_icache_page(vma, page);
     entry = mk_pte(page, protection_map[prot]);
@@ -78,6 +80,7 @@
     set_pte(pte, entry);
     pte_chain = page_add_rmap(page, pte, pte_chain);
     pte_unmap(pte);
+    increment_rss(pmd_ptpage(*pmd));
     flush_tlb_page(vma, addr);
 
     spin_unlock(&mm->page_table_lock);
--- 2.5.59-mm6/./mm/swapfile.c	2003-01-16 20:21:44.000000000 -0600
+++ 2.5.59-mm6-test/./mm/swapfile.c	2003-01-28 11:14:00.000000000 -0600
@@ -379,20 +379,23 @@
  */
 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
 static void
-unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
+unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_t *dir,
     swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
 {
     pte_t pte = *dir;
+    struct ptpage *ptpage;
 
     if (likely(pte_to_swp_entry(pte).val != entry.val))
         return;
     if (unlikely(pte_none(pte) || pte_present(pte)))
         return;
+    ptpage = pmd_ptpage(*pmd);
     get_page(page);
     set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
     *pte_chainp = page_add_rmap(page, dir, *pte_chainp);
+    increment_rss(ptpage);
+    decrement_swapcount(ptpage);
     swap_free(entry);
-    ++vma->vm_mm->rss;
 }
 
 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
@@ -423,8 +426,7 @@
          */
         if (pte_chain == NULL)
             pte_chain = pte_chain_alloc(GFP_ATOMIC);
-        unuse_pte(vma, offset+address-vma->vm_start,
-            pte, entry, page, &pte_chain);
+        unuse_pte(vma, dir, pte, entry, page, &pte_chain);
         address += PAGE_SIZE;
         pte++;
     } while (address && (address < end));
--- 2.5.59-mm6/./mm/memory.c	2003-01-16 20:22:06.000000000 -0600
+++ 2.5.59-mm6-test/./mm/memory.c	2003-01-28 11:02:33.000000000 -0600
@@ -64,81 +64,46 @@
 void * high_memory;
 struct page *highmem_start_page;
 
-/*
- * We special-case the C-O-W ZERO_PAGE, because it's such
- * a common occurrence (no need to read the page to know
- * that it's zero - better for the cache and memory subsystem).
- */
-static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
+void increment_rss(struct ptpage *ptpage)
 {
-    if (from == ZERO_PAGE(address)) {
-        clear_user_highpage(to, address);
-        return;
-    }
-    copy_user_highpage(to, from, address);
+    ptpage->mapcount++;
+    ptpage->mm->rss++;
 }
 
-/*
- * Note: this doesn't free the actual pages themselves. That
- * has been handled earlier when unmapping all the memory regions.
- */
-static inline void free_one_pmd(struct mmu_gather *tlb, pmd_t * dir)
+void decrement_rss(struct ptpage *ptpage)
 {
-    struct page *page;
-
-    if (pmd_none(*dir))
-        return;
-    if (pmd_bad(*dir)) {
-        pmd_ERROR(*dir);
-        pmd_clear(dir);
-        return;
-    }
-    page = pmd_page(*dir);
-    pmd_clear(dir);
-    pgtable_remove_rmap(page);
-    pte_free_tlb(tlb, page);
+    ptpage->mapcount--;
+    ptpage->mm->rss--;
 }
 
-static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir)
+void increment_swapcount(struct ptpage *ptpage)
 {
-    int j;
-    pmd_t * pmd;
+    ptpage->swapcount++;
+}
 
-    if (pgd_none(*dir))
-        return;
-    if (pgd_bad(*dir)) {
-        pgd_ERROR(*dir);
-        pgd_clear(dir);
-        return;
-    }
-    pmd = pmd_offset(dir, 0);
-    pgd_clear(dir);
-    for (j = 0; j < PTRS_PER_PMD ; j++)
-        free_one_pmd(tlb, pmd+j);
-    pmd_free_tlb(tlb, pmd);
+void decrement_swapcount(struct ptpage *ptpage)
+{
+    ptpage->swapcount--;
 }
 
 /*
- * This function clears all user-level page tables of a process - this
- * is needed by execve(), so that old pages aren't in the way.
- *
- * Must be called with pagetable lock held.
+ * We special-case the C-O-W ZERO_PAGE, because it's such
+ * a common occurrence (no need to read the page to know
+ * that it's zero - better for the cache and memory subsystem).
  */
-void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr)
+static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
 {
-    pgd_t * page_dir = tlb->mm->pgd;
-
-    page_dir += first;
-    do {
-        free_one_pgd(tlb, page_dir);
-        page_dir++;
-    } while (--nr);
+    if (from == ZERO_PAGE(address)) {
+        clear_user_highpage(to, address);
+        return;
+    }
+    copy_user_highpage(to, from, address);
 }
 
 pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
     if (!pmd_present(*pmd)) {
-        struct page *new;
+        struct ptpage *new;
 
         spin_unlock(&mm->page_table_lock);
         new = pte_alloc_one(mm, address);
@@ -182,7 +147,6 @@
             pte_free_kernel(new);
             goto out;
         }
-        pgtable_add_rmap(virt_to_page(new), mm, address);
         pmd_populate_kernel(mm, pmd, new);
     }
 out:
@@ -252,6 +216,7 @@
 
     do {
         pte_t * src_pte, * dst_pte;
+        struct ptpage *ptpage;
 
         /* copy_pte_range */
 
@@ -272,6 +237,7 @@
                 goto nomem;
             spin_lock(&src->page_table_lock);
             src_pte = pte_offset_map_nested(src_pmd, address);
+            ptpage = pmd_ptpage(*dst_pmd);
             do {
                 pte_t pte = *src_pte;
                 struct page *page;
@@ -285,6 +251,7 @@
                 if (!pte_present(pte)) {
                     swap_duplicate(pte_to_swp_entry(pte));
                     set_pte(dst_pte, pte);
+                    increment_swapcount(ptpage);
                     goto cont_copy_pte_range_noset;
                 }
                 pfn = pte_pfn(pte);
@@ -311,7 +278,7 @@
                 pte = pte_mkclean(pte);
                 pte = pte_mkold(pte);
                 get_page(page);
-                dst->rss++;
+                increment_rss(ptpage);
 
 cont_copy_pte_range:
                 set_pte(dst_pte, pte);
@@ -374,6 +341,7 @@
 {
     unsigned long offset;
     pte_t *ptep;
+    struct ptpage *ptpage;
 
     if (pmd_none(*pmd))
         return;
@@ -382,6 +350,7 @@
         pmd_clear(pmd);
         return;
     }
+    ptpage = pmd_ptpage(*pmd);
     ptep = pte_offset_map(pmd, address);
     offset = address & ~PMD_MASK;
     if (offset + size > PMD_SIZE)
@@ -406,13 +375,21 @@
                         mark_page_accessed(page);
                     tlb->freed++;
                     page_remove_rmap(page, ptep);
+                    decrement_rss(ptpage);
                     tlb_remove_page(tlb, page);
                 }
             }
         } else {
             free_swap_and_cache(pte_to_swp_entry(pte));
+            decrement_swapcount(ptpage);
             pte_clear(ptep);
         }
+        if (!ptpage->mapcount && !ptpage->swapcount) {
+            pmd_clear(pmd);
+            pgtable_remove_rmap(ptpage);
+            pte_free_tlb(tlb, ptpage);
+            break;
+        }
     }
     pte_unmap(ptep-1);
 }
@@ -596,6 +573,170 @@
     spin_unlock(&mm->page_table_lock);
 }
 
+/**
+ * unmap_all_pages - unmap all the pages for an mm_struct
+ * @mm: the mm_struct to unmap
+ *
+ * This function is only called when an mm_struct is about to be
+ * released.  It walks through all vmas and removes their pages
+ * from the page table.  It understands shared pte pages and will
+ * decrement the count appropriately.
+ */
+void unmap_all_pages(struct mm_struct *mm)
+{
+    struct vm_area_struct *vma;
+    pgd_t *pgd;
+    pmd_t *pmd;
+    pte_t *pte;
+    struct ptpage *ptpage;
+    struct page *pagevec[16];
+    int npages = 0;
+    unsigned long address;
+    unsigned long vm_end, pmd_end, pte_end;
+
+    lru_add_drain();
+
+    vma = mm->mmap;
+
+    /* On the off chance that the first vma is hugetlb... */
+    if (is_vm_hugetlb_page(vma)) {
+        unmap_hugepage_range(vma, vma->vm_start, vma->vm_end);
+        vma = vma->vm_next;
+        mm->map_count--;
+    }
+
+    for (;;) {
+        if (!vma)
+            goto out;
+
+        address = vma->vm_start;
+next_vma:
+        vm_end = vma->vm_end;
+        mm->map_count--;
+        /*
+         * Advance the vma pointer to the next vma.
+         * To facilitate coalescing adjacent vmas, the
+         * pointer always points to the next one
+         * beyond the range we're currently working
+         * on, which means vma will be null on the
+         * last iteration.
+         */
+        vma = vma->vm_next;
+        if (vma) {
+            /*
+             * Go ahead and include hugetlb vmas
+             * in the range we process.  The pmd
+             * entry will be cleared by close, so
+             * we'll just skip over them.  This is
+             * easier than trying to avoid them.
+             */
+            if (is_vm_hugetlb_page(vma))
+                unmap_hugepage_range(vma, vma->vm_start, vma->vm_end);
+
+            /*
+             * Coalesce adjacent vmas and process
+             * them all in one iteration.
+             */
+            if (vma->vm_start == vm_end) {
+                goto next_vma;
+            }
+        }
+        pgd = pgd_offset(mm, address);
+        do {
+            if (pgd_none(*pgd))
+                goto skip_pgd;
+
+            if (pgd_bad(*pgd)) {
+                pgd_ERROR(*pgd);
+                pgd_clear(pgd);
+skip_pgd:
+                address = (address + PGDIR_SIZE) & PGDIR_MASK;
+                if (address > vm_end)
+                    address = vm_end;
+                goto next_pgd;
+            }
+            pmd = pmd_offset(pgd, address);
+            if (vm_end > ((address + PGDIR_SIZE) & PGDIR_MASK))
+                pmd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
+            else
+                pmd_end = vm_end;
+
+            do {
+                if (pmd_none(*pmd))
+                    goto skip_pmd;
+                if (pmd_bad(*pmd)) {
+                    pmd_ERROR(*pmd);
+                    pmd_clear(pmd);
+skip_pmd:
+                    address = (address + PMD_SIZE) & PMD_MASK;
+                    if (address > pmd_end)
+                        address = pmd_end;
+                    goto next_pmd;
+                }
+                ptpage = pmd_ptpage(*pmd);
+                pte = pte_offset_map(pmd, address);
+                if (pmd_end > ((address + PMD_SIZE) & PMD_MASK))
+                    pte_end = (address + PMD_SIZE) & PMD_MASK;
+                else
+                    pte_end = pmd_end;
+                do {
+                    pte_t pteval = *pte;
+
+                    if (pte_none(pteval))
+                        goto next_pte;
+                    if (pte_present(pteval)) {
+                        unsigned long pfn = pte_pfn(pteval);
+                        if (pfn_valid(pfn)) {
+                            struct page *page = pfn_to_page(pfn);
+                            if (!PageReserved(page)) {
+                                if (pte_dirty(pteval))
+                                    set_page_dirty(page);
+                                if (page->mapping &&
+                                    pte_young(pteval) &&
+                                    !PageSwapCache(page))
+                                    mark_page_accessed(page);
+                                page_remove_rmap(page, pte);
+                                decrement_rss(ptpage);
+                                pagevec[npages++] = page;
+                                if (npages == 16) {
+                                    free_pages_and_swap_cache(pagevec, npages);
+                                    npages = 0;
+                                }
+
+                            }
+                        }
+                    } else {
+                        free_swap_and_cache(pte_to_swp_entry(pteval));
+                        decrement_swapcount(ptpage);
+                    }
+                    pte_clear(pte);
+                    if (!ptpage->mapcount && !ptpage->swapcount) {
+                        pmd_clear(pmd);
+                        pgtable_remove_rmap(ptpage);
+                        pte_free(ptpage);
+                        address = pte_end;
+                        break;
+                    }
next_pte:
+                    address += PAGE_SIZE;
+                    pte++;
+                } while (address < pte_end);
+                pte_unmap(pte-1);
next_pmd:
+                pmd++;
+            } while (address < pmd_end);
next_pgd:
+            pgd++;
+        } while (address < vm_end);
+    }
+
+out:
+    if (npages)
+        free_pages_and_swap_cache(pagevec, npages);
+
+    flush_tlb_mm(mm);
+}
+
 /*
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
@@ -962,8 +1103,6 @@
     spin_lock(&mm->page_table_lock);
     page_table = pte_offset_map(pmd, address);
     if (pte_same(*page_table, pte)) {
-        if (PageReserved(old_page))
-            ++mm->rss;
         page_remove_rmap(old_page, page_table);
         break_cow(vma, new_page, address, page_table);
         pte_chain = page_add_rmap(new_page, page_table, pte_chain);
@@ -1114,6 +1253,7 @@
     swp_entry_t entry = pte_to_swp_entry(orig_pte);
     pte_t pte;
     int ret = VM_FAULT_MINOR;
+    struct ptpage *ptpage;
     struct pte_chain *pte_chain = NULL;
 
     pte_unmap(page_table);
@@ -1172,7 +1312,6 @@
     if (vm_swap_full())
         remove_exclusive_swap_page(page);
 
-    mm->rss++;
     pte = mk_pte(page, vma->vm_page_prot);
     if (write_access && can_share_swap_page(page))
         pte = pte_mkdirty(pte_mkwrite(pte));
@@ -1182,6 +1321,9 @@
     flush_icache_page(vma, page);
     set_pte(page_table, pte);
     pte_chain = page_add_rmap(page, page_table, pte_chain);
+    ptpage = pmd_ptpage(*pmd);
+    increment_rss(ptpage);
+    decrement_swapcount(ptpage);
 
     /* No need to invalidate - it was non-present before */
     update_mmu_cache(vma, address, pte);
@@ -1242,7 +1384,6 @@
         ret = VM_FAULT_MINOR;
         goto out;
     }
-    mm->rss++;
    flush_page_to_ram(page);
     entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
     lru_cache_add_active(page);
@@ -1253,6 +1394,7 @@
     /* ignores ZERO_PAGE */
     pte_chain = page_add_rmap(page, page_table, pte_chain);
     pte_unmap(page_table);
+    increment_rss(pmd_ptpage(*pmd));
 
     /* No need to invalidate - it was non-present before */
     update_mmu_cache(vma, addr, entry);
@@ -1332,7 +1474,6 @@
     */
     /* Only go through if we didn't race with anybody else... */
     if (pte_none(*page_table)) {
-        ++mm->rss;
         flush_page_to_ram(new_page);
         flush_icache_page(vma, new_page);
         entry = mk_pte(new_page, vma->vm_page_prot);
@@ -1341,6 +1482,7 @@
         set_pte(page_table, entry);
         pte_chain = page_add_rmap(new_page, page_table, pte_chain);
         pte_unmap(page_table);
+        increment_rss(pmd_ptpage(*pmd));
     } else {
         /* One of our sibling threads was faster, back out. */
         pte_unmap(page_table);
--- 2.5.59-mm6/./mm/mremap.c	2003-01-16 20:22:15.000000000 -0600
+++ 2.5.59-mm6-test/./mm/mremap.c	2003-01-28 11:05:22.000000000 -0600
@@ -94,8 +94,10 @@
         page = pte_page(*src);
 
     if (!pte_none(*src)) {
-        if (page)
+        if (page) {
             page_remove_rmap(page, src);
+            decrement_rss((struct ptpage *)kmap_atomic_to_page((void *)src));
+        }
         pte = ptep_get_and_clear(src);
         if (!dst) {
             /* No dest? We must put it back. */
@@ -103,8 +105,10 @@
             error++;
         }
         set_pte(dst, pte);
-        if (page)
+        if (page) {
             *pte_chainp = page_add_rmap(page, dst, *pte_chainp);
+            increment_rss((struct ptpage *)kmap_atomic_to_page((void *)dst));
+        }
     }
     return error;
 }
--- 2.5.59-mm6/./mm/mmap.c	2003-01-27 11:01:12.000000000 -0600
+++ 2.5.59-mm6-test/./mm/mmap.c	2003-01-27 11:12:51.000000000 -0600
@@ -23,6 +23,8 @@
 #include
 #include
 
+extern void unmap_all_pages(struct mm_struct *mm);
+
 /*
  * WARNING: the debugging will use recursive algorithms so never enable this
  * unless you know what you are doing.
@@ -1006,69 +1008,6 @@
 }
 #endif
 
-/*
- * Try to free as many page directory entries as we can,
- * without having to work very hard at actually scanning
- * the page tables themselves.
- *
- * Right now we try to free page tables if we have a nice
- * PGDIR-aligned area that got free'd up. We could be more
- * granular if we want to, but this is fast and simple,
- * and covers the bad cases.
- *
- * "prev", if it exists, points to a vma before the one
- * we just free'd - but there's no telling how much before.
- */
-static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
-    unsigned long start, unsigned long end)
-{
-    unsigned long first = start & PGDIR_MASK;
-    unsigned long last = end + PGDIR_SIZE - 1;
-    unsigned long start_index, end_index;
-    struct mm_struct *mm = tlb->mm;
-
-    if (!prev) {
-        prev = mm->mmap;
-        if (!prev)
-            goto no_mmaps;
-        if (prev->vm_end > start) {
-            if (last > prev->vm_start)
-                last = prev->vm_start;
-            goto no_mmaps;
-        }
-    }
-    for (;;) {
-        struct vm_area_struct *next = prev->vm_next;
-
-        if (next) {
-            if (next->vm_start < start) {
-                prev = next;
-                continue;
-            }
-            if (last > next->vm_start)
-                last = next->vm_start;
-        }
-        if (prev->vm_end > first)
-            first = prev->vm_end + PGDIR_SIZE - 1;
-        break;
-    }
-no_mmaps:
-    if (last < first)	/* for arches with discontiguous pgd indices */
-        return;
-    /*
-     * If the PGD bits are not consecutive in the virtual address, the
-     * old method of shifting the VA >> by PGDIR_SHIFT doesn't work.
-     */
-    start_index = pgd_index(first);
-    if (start_index < FIRST_USER_PGD_NR)
-        start_index = FIRST_USER_PGD_NR;
-    end_index = pgd_index(last);
-    if (end_index > start_index) {
-        clear_page_tables(tlb, start_index, end_index - start_index);
-        flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK);
-    }
-}
-
 /* Normal function to fix up a mapping
  * This function is the default for when an area has no specific
  * function.  This may be used as part of a more specific routine.
@@ -1134,7 +1073,6 @@
     tlb = tlb_gather_mmu(mm, 0);
     unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted);
     vm_unacct_memory(nr_accounted);
-    free_pgtables(tlb, prev, start, end);
     tlb_finish_mmu(tlb, start, end);
 }
 
@@ -1382,25 +1320,16 @@
 /* Release all mmaps. */
 void exit_mmap(struct mm_struct *mm)
 {
-    struct mmu_gather *tlb;
     struct vm_area_struct *vma;
-    unsigned long nr_accounted = 0;
 
     profile_exit_mmap(mm);
 
     lru_add_drain();
 
-    spin_lock(&mm->page_table_lock);
-
-    tlb = tlb_gather_mmu(mm, 1);
     flush_cache_mm(mm);
-    /* Use ~0UL here to ensure all VMAs in the mm are unmapped */
-    mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
-                    ~0UL, &nr_accounted);
-    vm_unacct_memory(nr_accounted);
-    BUG_ON(mm->map_count);	/* This is just debugging */
-    clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
-    tlb_finish_mmu(tlb, 0, TASK_SIZE);
+    unmap_all_pages(mm);
+
+    BUG_ON(mm->map_count);	/* This is just debugging */
 
     vma = mm->mmap;
     mm->mmap = mm->mmap_cache = NULL;
@@ -1409,14 +1338,20 @@
     mm->total_vm = 0;
     mm->locked_vm = 0;
 
-    spin_unlock(&mm->page_table_lock);
-
     /*
      * Walk the list again, actually closing and freeing it
      * without holding any MM locks.
     */
     while (vma) {
         struct vm_area_struct *next = vma->vm_next;
+
+        /*
+         * If the VMA has been charged for, account for its
+         * removal
+         */
+        if (vma->vm_flags & VM_ACCOUNT)
+            vm_unacct_memory((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
+
         remove_shared_vm_struct(vma);
         if (vma->vm_ops) {
             if (vma->vm_ops->close)
--- 2.5.59-mm6/./mm/rmap.c	2003-01-16 20:22:43.000000000 -0600
+++ 2.5.59-mm6-test/./mm/rmap.c	2003-01-28 10:54:31.000000000 -0600
@@ -328,6 +328,7 @@
 static int try_to_unmap_one(struct page * page, pte_addr_t paddr)
 {
     pte_t *ptep = rmap_ptep_map(paddr);
+    struct ptpage *ptpage = (struct ptpage *)kmap_atomic_to_page((void *)ptep);
     unsigned long address = ptep_to_address(ptep);
     struct mm_struct * mm = ptep_to_mm(ptep);
     struct vm_area_struct * vma;
@@ -338,6 +339,15 @@
         BUG();
 
     /*
+     * If this mm is in the process of exiting, skip this page
+     * for now to let the exit finish.
+     */
+    if (atomic_read(&mm->mm_users) == 0) {
+        rmap_ptep_unmap(ptep);
+        return SWAP_AGAIN;
+    }
+
+    /*
      * We need the page_table_lock to protect us from page faults,
      * munmap, fork, etc...
      */
@@ -364,19 +374,20 @@
     flush_cache_page(vma, address);
     pte = ptep_get_and_clear(ptep);
     flush_tlb_page(vma, address);
+    decrement_rss(ptpage);
 
     /* Store the swap location in the pte.  See handle_pte_fault() ... */
     if (PageSwapCache(page)) {
         swp_entry_t entry = { .val = page->index };
         swap_duplicate(entry);
         set_pte(ptep, swp_entry_to_pte(entry));
+        increment_swapcount(ptpage);
     }
 
     /* Move the dirty bit to the physical page now the pte is gone. */
     if (pte_dirty(pte))
         set_page_dirty(page);
 
-    mm->rss--;
     page_cache_release(page);
     ret = SWAP_SUCCESS;