From: Rik van Riel <riel@conectiva.com.br>
To: Dave McCracken <dmccr@us.ibm.com>
Cc: linux-mm@kvack.org
Subject: Re: Rmap code?
Date: Wed, 22 Aug 2001 13:49:38 -0300 (BRST)
Message-ID: <Pine.LNX.4.33L.0108221348240.31410-200000@duckman.distro.conectiva>
In-Reply-To: <7040000.998430236@baldur>
[-- Attachment #1: Type: TEXT/PLAIN, Size: 712 bytes --]
[due to popular demand, cc'd to linux-mm ... if you feel
like playing with this code, please don't duplicate work;
tell the others]
On Tue, 21 Aug 2001, Dave McCracken wrote:
> On the MM Wiki page you said you have some code already written that does
> reverse mapping. Could you possibly send me a copy of what you've done so
> I can play with it? I'd like to experiment with what's possible, and it'd
> make it a whole lot easier if I started with your ideas so far.
Here you are. Note the TODO file in the top level
directory, where some of the problems are noted
down.
Rik
--
IA64: a worthy successor to the i860.
http://www.surriel.com/
http://www.conectiva.com/ http://distro.conectiva.com/
[-- Attachment #2: 2.4.7-pmap --]
[-- Type: TEXT/PLAIN, Size: 34514 bytes --]
--- linux-2.4.7/fs/exec.c.orig Thu Jul 26 13:22:26 2001
+++ linux-2.4.7/fs/exec.c Thu Jul 26 16:19:02 2001
@@ -34,6 +34,7 @@
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
+#include <linux/swap.h>
#define __NO_VERSION__
#include <linux/module.h>
@@ -277,8 +278,9 @@
flush_dcache_page(page);
flush_page_to_ram(page);
set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
+ page_add_pmap(page, pte);
+ spin_unlock(&pagemap_lru_lock);
tsk->mm->rss++;
- spin_unlock(&tsk->mm->page_table_lock);
/* no need for flush_tlb */
return;
--- linux-2.4.7/mm/filemap.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/filemap.c Thu Jul 26 13:24:37 2001
@@ -520,7 +520,9 @@
page->index = index;
add_page_to_inode_queue(mapping, page);
add_page_to_hash_queue(page, page_hash(mapping, index));
- lru_cache_add(page);
+ /* XXX: already on the inactive_dirty list ... deuglify! */
+ if (!PageSwapCache(page))
+ lru_cache_add(page);
spin_unlock(&pagecache_lock);
}
--- linux-2.4.7/mm/memory.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/memory.c Fri Jul 27 11:08:40 2001
@@ -76,6 +76,7 @@
static inline void free_one_pmd(pmd_t * dir)
{
pte_t * pte;
+ struct page * page;
if (pmd_none(*dir))
return;
@@ -86,6 +87,10 @@
}
pte = pte_offset(dir, 0);
pmd_clear(dir);
+ page = virt_to_page(pte);
+ /* Clear the reverse mapping stuff on this page table page. */
+ page->mapping = 0;
+ page->index = 0;
pte_free(pte);
}
@@ -216,7 +221,9 @@
goto cont_copy_pte_range_noset;
if (!pte_present(pte)) {
swap_duplicate(pte_to_swp_entry(pte));
- goto cont_copy_pte_range;
+ /* Swapped out, skip the pmap stuff. */
+ set_pte(dst_pte, pte);
+ goto cont_copy_pte_range_noset;
}
ptepage = pte_page(pte);
if ((!VALID_PAGE(ptepage)) ||
@@ -236,6 +243,7 @@
get_page(ptepage);
cont_copy_pte_range: set_pte(dst_pte, pte);
+ page_add_pmap(ptepage, dst_pte);
cont_copy_pte_range_noset: address += PAGE_SIZE;
if (address >= end)
goto out_unlock;
@@ -312,6 +320,8 @@
if (!size)
break;
page = ptep_get_and_clear(pte);
+ if (pte_present(page))
+ page_remove_pmap(pte_page(page), pte);
pte++;
size--;
if (pte_none(page))
@@ -849,6 +859,7 @@
* - flush the old one
* - update the page tables
* - inform the TLB about the new one
+ * - update the reverse mappings (if the page changes)
*
* We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
*/
@@ -868,7 +879,9 @@
copy_cow_page(old_page,new_page,address);
flush_page_to_ram(new_page);
flush_cache_page(vma, address);
+ page_remove_pmap(old_page, page_table);
establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
+ page_add_pmap(new_page, page_table);
}
/*
@@ -1139,6 +1152,7 @@
flush_page_to_ram(page);
flush_icache_page(vma, page);
set_pte(page_table, pte);
+ page_add_pmap(page, page_table);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
@@ -1153,14 +1167,13 @@
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
pte_t entry;
+ struct page *page = ZERO_PAGE(addr);
/* Read-only mapping of ZERO_PAGE. */
entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
/* ..except if it's a write access */
if (write_access) {
- struct page *page;
-
/* Allocate our own private page. */
spin_unlock(&mm->page_table_lock);
page = alloc_page(GFP_HIGHUSER);
@@ -1178,6 +1191,7 @@
}
set_pte(page_table, entry);
+ page_add_pmap(page, page_table); /* Ignores empty_zero_page ;) */
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
@@ -1240,6 +1254,7 @@
!(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
set_pte(page_table, entry);
+ page_add_pmap(new_page, page_table);
} else {
/* One of our sibling threads was faster, back out. */
page_cache_release(new_page);
@@ -1372,6 +1387,7 @@
pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
if (!pmd_present(*pmd)) {
+ struct page * page;
pte_t *new;
/* "fast" allocation can happen without dropping the lock.. */
@@ -1392,6 +1408,10 @@
goto out;
}
}
+ /* Add reverse pte mapping pointers for pmap.c. */
+ page = virt_to_page(new);
+ page->mapping = (void *)mm;
+ page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
pmd_populate(mm, pmd, new);
}
out:
--- linux-2.4.7/mm/mremap.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/mremap.c Thu Jul 26 13:24:37 2001
@@ -63,6 +63,7 @@
pte_t pte;
if (!pte_none(*src)) {
+ page_remove_pmap(pte_page(*src), src);
pte = ptep_get_and_clear(src);
if (!dst) {
/* No dest? We must put it back. */
@@ -70,6 +71,7 @@
error++;
}
set_pte(dst, pte);
+ page_add_pmap(pte_page(pte), dst);
}
return error;
}
--- linux-2.4.7/mm/page_alloc.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/page_alloc.c Thu Jul 26 13:24:37 2001
@@ -87,6 +87,8 @@
BUG();
if (PageInactiveClean(page))
BUG();
+ if (page->pte_chain)
+ BUG();
page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
page->age = PAGE_AGE_START;
--- linux-2.4.7/mm/swap.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/swap.c Thu Jul 26 13:24:37 2001
@@ -164,22 +164,12 @@
void deactivate_page_nolock(struct page * page)
{
/*
- * One for the cache, one for the extra reference the
- * caller has and (maybe) one for the buffers.
- *
- * This isn't perfect, but works for just about everything.
- * Besides, as long as we don't move unfreeable pages to the
- * inactive_clean list it doesn't need to be perfect...
- */
- int maxcount = (page->buffers ? 3 : 2);
- page->age = 0;
- ClearPageReferenced(page);
-
- /*
* Don't touch it if it's not on the active list.
* (some pages aren't on any list at all)
*/
- if (PageActive(page) && page_count(page) <= maxcount && !page_ramdisk(page)) {
+ if (PageActive(page) && !page_ramdisk(page)) {
+ page->age = 0;
+ ClearPageReferenced(page);
del_page_from_active_list(page);
add_page_to_inactive_dirty_list(page);
}
@@ -266,8 +256,6 @@
*/
void lru_cache_del(struct page * page)
{
- if (!PageLocked(page))
- BUG();
spin_lock(&pagemap_lru_lock);
__lru_cache_del(page);
spin_unlock(&pagemap_lru_lock);
--- linux-2.4.7/mm/swapfile.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/swapfile.c Thu Jul 26 13:24:37 2001
@@ -232,6 +232,7 @@
set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
get_page(page);
+ page_add_pmap(page, dir);
++vma->vm_mm->rss;
}
--- linux-2.4.7/mm/swap_state.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/swap_state.c Thu Jul 26 13:55:13 2001
@@ -165,7 +165,13 @@
if (exclusive_swap_page(page))
delete_from_swap_cache_nolock(page);
UnlockPage(page);
- }
+ /*
+ * If we are the only user and it was an anonymous page
+ * without swap backing, remove the page from the list.
+ * SMP/fork() safe because we hold the mm->page_table_lock.
+ */
+ } else if (page_count(page) == 1)
+ lru_cache_del(page);
page_cache_release(page);
}
--- linux-2.4.7/mm/vmscan.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/vmscan.c Tue Jul 31 18:12:07 2001
@@ -27,30 +27,70 @@
#define MAX(a,b) ((a) > (b) ? (a) : (b))
/*
- * The swap-out function returns 1 if it successfully
- * scanned all the pages it was asked to (`count').
- * It returns zero if it couldn't do anything,
+ * try_to_swap_out does nothing but unmap one page table entry
+ * pointing to a page. It has 4 return values:
+ * SWAP_SUCCESS - we succeeded in unmapping this page table entry
+ * SWAP_AGAIN - we failed a try_lock, try again later
+ * SWAP_FAIL - we cannot swap out this page any time soon
+ * (mlocked page, no swap space left, ...)
+ * SWAP_ERROR - an error occurred
*
- * rss may decrease because pages are shared, but this
- * doesn't count as having freed a page.
+ * NOTE: we have to use trylock everywhere, since our locking
+ * order is opposite of the page fault handler, etc.
*/
-/* mm->page_table_lock is held. mmap_sem is not held */
-static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
+/* Grabs mm->page_table_lock itself via trylock; the caller holds the
+ * pagemap_lru_lock and the page lock. mmap_sem is not held. */
+int try_to_swap_out(struct mm_struct * mm, unsigned long address, pte_t * page_table, struct page *page)
{
pte_t pte;
swp_entry_t entry;
+ struct vm_area_struct * vma;
+ int ret;
- /* Don't look at this pte if it's been accessed recently. */
- if (ptep_test_and_clear_young(page_table)) {
- page->age += PAGE_AGE_ADV;
- if (page->age > PAGE_AGE_MAX)
- page->age = PAGE_AGE_MAX;
- return;
+ /*
+ * Try to get the mm->page_table_lock. Ideally we'd
+ * change the page fault code so we never grab the
+ * pagemap_lru_lock or page_table_lock while holding
+ * the mm->page_table_lock, but for now we trylock...
+ */
+ if (!spin_trylock(&mm->page_table_lock))
+ return SWAP_AGAIN;
+
+ /*
+ * First, check for various error conditions.
+ * These should be BUG() or panic() once the pmap
+ * VM stabilises...
+ */
+ ret = SWAP_ERROR;
+
+ if (!mm || !page_table || !page) {
+ printk ("try_to_swap_out: called with null argument...\n");
+ goto out_unlock;
+ }
+
+ if (!PageLocked(page)) {
+ printk("try_to_swap_out: page not locked!\n");
+ goto out_unlock;
+ }
+
+ vma = find_vma(mm, address);
+ if (!vma) {
+ printk ("try_to_swap_out: page not in a VMA?!\n");
+ goto out_unlock;
+ }
+
+ /*
+ * OK, no errors occurred. There are still various
+ * opportunities for failure, though ...
+ */
+ ret = SWAP_FAIL;
+
+ /* mlock()ed VMA */
+ if (vma->vm_flags & VM_LOCKED) {
+ goto out_unlock;
}
- if (TryLockPage(page))
- return;
/* From this point on, the odds are that we're going to
* nuke this pte, so read and clear the pte. This hook
@@ -74,11 +115,9 @@
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
mm->rss--;
- if (!page->age)
- deactivate_page(page);
- UnlockPage(page);
page_cache_release(page);
- return;
+ ret = SWAP_SUCCESS;
+ goto out_unlock;
}
/*
@@ -120,206 +159,18 @@
goto out_unlock_restore; /* No swap space left */
/* Add it to the swap cache and mark it dirty */
+ /* XXX: SMP deadlock due to pagecache_lock / pagemap_lru_lock order */
add_to_swap_cache(page, entry);
set_page_dirty(page);
goto set_swap_pte;
out_unlock_restore:
set_pte(page_table, pte);
- UnlockPage(page);
- return;
-}
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
-{
- pte_t * pte;
- unsigned long pmd_end;
-
- if (pmd_none(*dir))
- return count;
- if (pmd_bad(*dir)) {
- pmd_ERROR(*dir);
- pmd_clear(dir);
- return count;
- }
-
- pte = pte_offset(dir, address);
-
- pmd_end = (address + PMD_SIZE) & PMD_MASK;
- if (end > pmd_end)
- end = pmd_end;
-
- do {
- if (pte_present(*pte)) {
- struct page *page = pte_page(*pte);
-
- if (VALID_PAGE(page) && !PageReserved(page)) {
- try_to_swap_out(mm, vma, address, pte, page);
- if (!--count)
- break;
- }
- }
- address += PAGE_SIZE;
- pte++;
- } while (address && (address < end));
- mm->swap_address = address + PAGE_SIZE;
- return count;
-}
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
-{
- pmd_t * pmd;
- unsigned long pgd_end;
-
- if (pgd_none(*dir))
- return count;
- if (pgd_bad(*dir)) {
- pgd_ERROR(*dir);
- pgd_clear(dir);
- return count;
- }
-
- pmd = pmd_offset(dir, address);
-
- pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
- if (pgd_end && (end > pgd_end))
- end = pgd_end;
-
- do {
- count = swap_out_pmd(mm, vma, pmd, address, end, count);
- if (!count)
- break;
- address = (address + PMD_SIZE) & PMD_MASK;
- pmd++;
- } while (address && (address < end));
- return count;
-}
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
-{
- pgd_t *pgdir;
- unsigned long end;
-
- /* Don't swap out areas which are locked down */
- if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
- return count;
-
- pgdir = pgd_offset(mm, address);
-
- end = vma->vm_end;
- if (address >= end)
- BUG();
- do {
- count = swap_out_pgd(mm, vma, pgdir, address, end, count);
- if (!count)
- break;
- address = (address + PGDIR_SIZE) & PGDIR_MASK;
- pgdir++;
- } while (address && (address < end));
- return count;
-}
-
-/*
- * Returns non-zero if we scanned all `count' pages
- */
-static int swap_out_mm(struct mm_struct * mm, int count)
-{
- unsigned long address;
- struct vm_area_struct* vma;
-
- if (!count)
- return 1;
- /*
- * Go through process' page directory.
- */
-
- /*
- * Find the proper vm-area after freezing the vma chain
- * and ptes.
- */
- spin_lock(&mm->page_table_lock);
- address = mm->swap_address;
- vma = find_vma(mm, address);
- if (vma) {
- if (address < vma->vm_start)
- address = vma->vm_start;
-
- for (;;) {
- count = swap_out_vma(mm, vma, address, count);
- if (!count)
- goto out_unlock;
- vma = vma->vm_next;
- if (!vma)
- break;
- address = vma->vm_start;
- }
- }
- /* Reset to 0 when we reach the end of address space */
- mm->swap_address = 0;
-
out_unlock:
spin_unlock(&mm->page_table_lock);
- return !count;
-}
-
-#define SWAP_MM_SHIFT 4
-#define SWAP_SHIFT 5
-#define SWAP_MIN 8
-
-static inline int swap_amount(struct mm_struct *mm)
-{
- int nr = mm->rss >> SWAP_SHIFT;
- if (nr < SWAP_MIN) {
- nr = SWAP_MIN;
- if (nr > mm->rss)
- nr = mm->rss;
- }
- return nr;
-}
-
-static void swap_out(unsigned int priority, int gfp_mask)
-{
- int counter;
- int retval = 0;
- struct mm_struct *mm = current->mm;
-
- /* Always start by trying to penalize the process that is allocating memory */
- if (mm)
- retval = swap_out_mm(mm, swap_amount(mm));
-
- /* Then, look at the other mm's */
- counter = (mmlist_nr << SWAP_MM_SHIFT) >> priority;
- do {
- struct list_head *p;
-
- spin_lock(&mmlist_lock);
- p = init_mm.mmlist.next;
- if (p == &init_mm.mmlist)
- goto empty;
-
- /* Move it to the back of the queue.. */
- list_del(p);
- list_add_tail(p, &init_mm.mmlist);
- mm = list_entry(p, struct mm_struct, mmlist);
-
- /* Make sure the mm doesn't disappear when we drop the lock.. */
- atomic_inc(&mm->mm_users);
- spin_unlock(&mmlist_lock);
-
- /* Walk about 6% of the address space each time */
- retval |= swap_out_mm(mm, swap_amount(mm));
- mmput(mm);
- } while (--counter >= 0);
- return;
-
-empty:
- spin_unlock(&mmlist_lock);
+ return ret;
}
-
/**
* reclaim_page - reclaims one page from the inactive_clean list
* @zone: reclaim a page from this zone
@@ -395,6 +246,16 @@
del_page_from_inactive_clean_list(page);
UnlockPage(page);
page->age = PAGE_AGE_START;
+ /*
+ * The bugs below cannot happen because other processes would
+ * need the pagecache_lock to find the page. When they find it,
+ * they need to increase the page count, which makes us move
+ * the page back to the active list in the code above.
+ *
+ * Thus, these checks check other code...
+ */
+ if (page->pte_chain)
+ BUG();
if (page_count(page) != 1)
printk("VM: reclaim_page, found page with count %d!\n",
page_count(page));
@@ -454,7 +315,6 @@
/* Page is or was in use? Move it to the active list. */
if (PageReferenced(page) || page->age > 0 ||
- (!page->buffers && page_count(page) > 1) ||
page_ramdisk(page)) {
del_page_from_inactive_dirty_list(page);
add_page_to_active_list(page);
@@ -472,6 +332,30 @@
}
/*
+ * Try to remove all the mappings processes have to
+ * this page. Pages can have "hidden" users, if that
+ * is the case the page gets moved back to the active
+ * list.
+ *
+ * This test is just an optimisation to move unfreeable
+ * pages back to the active list and prevent us from
+ * doing (expensive) disk IO. The "real" test is done
+ * in reclaim_page() and protected by the pagecache_lock.
+ */
+ switch (page_remove_all_pmaps(page)) {
+ case SWAP_AGAIN:
+ UnlockPage(page);
+ continue;
+ case SWAP_FAIL:
+ case SWAP_ERROR:
+ goto page_active;
+ case SWAP_SUCCESS:
+ maxcount = (page->buffers ? 2 : 1);
+ if (page_count(page) > maxcount)
+ goto page_active;
+ }
+
+ /*
* Dirty swap-cache page? Write it out if
* last copy..
*/
@@ -630,20 +514,22 @@
}
/**
- * refill_inactive_scan - scan the active list and find pages to deactivate
- * @priority: the priority at which to scan
+ * refill_inactive - scan the active list and find pages to deactivate
+ * @maxscan: the priority at which to scan (gets converted to pages)
* @target: number of pages to deactivate, zero for background aging
*
* This function will scan a portion of the active list to find
* unused pages, those pages will then be moved to the inactive list.
*/
-int refill_inactive_scan(unsigned int priority, int target)
+int refill_inactive(unsigned int maxscan, int target)
{
struct list_head * page_lru;
struct page * page;
- int maxscan = nr_active_pages >> priority;
- int page_active = 0;
- int nr_deactivated = 0;
+ int referenced, page_active, nr_deactivated;
+
+ /* Convert maxscan to the maximum number of pages to scan. */
+ maxscan = nr_active_pages >> maxscan;
+ nr_deactivated = 0;
/*
* When we are background aging, we try to increase the page aging
@@ -666,23 +552,15 @@
}
/* Do aging on the pages. */
- if (PageTestandClearReferenced(page)) {
- age_page_up_nolock(page);
+ referenced = page_referenced(page);
+ if (referenced) {
+ page->age += (referenced + PAGE_AGE_ADV);
+ if (page->age > PAGE_AGE_MAX)
+ page->age = PAGE_AGE_MAX;
page_active = 1;
} else {
age_page_down_ageonly(page);
- /*
- * Since we don't hold a reference on the page
- * ourselves, we have to do our test a bit more
- * strict then deactivate_page(). This is needed
- * since otherwise the system could hang shuffling
- * unfreeable pages from the active list to the
- * inactive_dirty list and back again...
- *
- * SUBTLE: we can have buffer pages with count 1.
- */
- if (page->age == 0 && page_count(page) <=
- (page->buffers ? 2 : 1)) {
+ if (page->age == 0) {
deactivate_page_nolock(page);
page_active = 0;
} else {
@@ -841,7 +719,7 @@
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
- int ret = 0;
+ int target, maxscan, ret = 0;
/*
* If we're low on free pages, move pages from the
@@ -859,10 +737,15 @@
/*
* If needed, we move pages from the active list
- * to the inactive list.
- */
- if (inactive_shortage())
- ret += refill_inactive(gfp_mask, user);
+ * to the inactive list. Note that user processes
+ * only scan a small part of the active list, so
+ * that multiple page freers at the same time
+ * won't upset page aging.
+ */
+ target = inactive_shortage();
+ maxscan = user ? DEF_PRIORITY : 0;
+ if (target)
+ ret += refill_inactive(maxscan, target);
/*
* Reclaim unused slab cache if memory is low.
@@ -928,7 +811,7 @@
recalculate_vm_stats();
/* Do background page aging. */
- refill_inactive_scan(DEF_PRIORITY, 0);
+ refill_inactive(0, 0);
}
run_task_queue(&tq_disk);
--- linux-2.4.7/mm/Makefile.orig Thu Jul 26 13:22:14 2001
+++ linux-2.4.7/mm/Makefile Thu Jul 26 13:24:37 2001
@@ -14,7 +14,7 @@
obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
- shmem.o
+ shmem.o pmap.o
obj-$(CONFIG_HIGHMEM) += highmem.o
--- linux-2.4.7/mm/pmap.c.orig Thu Jul 26 13:24:05 2001
+++ linux-2.4.7/mm/pmap.c Thu Jul 26 13:24:37 2001
@@ -0,0 +1,348 @@
+/*
+ * mm/pmap.c - physical to virtual reverse mappings
+ *
+ * Copyright 2001, Rik van Riel (riel@conectiva.com.br)
+ *
+ * Released under the General Public License (GPL).
+ *
+ *
+ * The functions in this file provide a physical to virtual
+ * reverse mapping, which is handy for NUMA memory migration,
+ * process migration, cache coherency on some machines and
+ * for easier swapout balancing.
+ *
+ * The emphasis in this implementation is on simplicity and
+ * low overhead.
+ *
+ * XXX: integrate the *pte* functions into this file so PPC, S/390
+ * and others can override some things here and stop pretending they
+ * have the exact same page table functionality others have ??
+ */
+
+/*
+ * On locking:
+ * - to keep the change in both pte and pte_chain atomic, we surround
+ * the changing of the two by the pagemap_lru_lock; we might want to
+ * change this in the future if there turns out to be contention
+ * - because swapout locking order is opposite to the locking order
+ * used by page faults, the swapout path always uses trylock
+ */
+#include <linux/mm.h>
+#include <linux/swap.h>
+
+#include <asm/smplock.h>
+
+#define DEBUG
+
+/*
+ * For shared pages, we have a series of pte_chain structures.
+ * They are a singly linked list to minimise memory overhead;
+ * this should not be an issue for most uses. For fork-after-exec
+ * we'll be at the start of the linked list, and since 90% of
+ * processes seem to be short-lived they'll stay near the start.
+ *
+ * Turning this into a doubly-linked list with forward mapping from
+ * each process to the pte_chain structure could be a benefit on
+ * workloads where the system has lots of programs which exit after
+ * about an equal time, say apache with a few hundred children.
+ * OTOH, in those cases the child processes shouldn't be exiting all
+ * that often.
+ */
+struct pte_chain {
+ struct pte_chain * next;
+ pte_t * ptep;
+};
+
+static struct pte_chain * pte_chain_freelist;
+
+static struct pte_chain * pte_chain_alloc(void);
+static void pte_chain_free(struct pte_chain *, struct pte_chain *, struct page *);
+
+/*
+ * Quick test_and_clear referenced for all mappings to a page.
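+ *
+ * Returns the number of users that referenced the page since the
+ * last scan; refill_inactive() feeds this into the page aging.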
+ *
+ * The caller needs to hold the pagemap_lru_lock.
+ */
+int page_referenced(struct page * page)
+{
+ struct pte_chain * pte_chain = page->pte_chain;
+ int referenced = 0;
+
+ if (PageReferenced(page))
+ referenced++;
+
+ while (pte_chain) {
+ if (ptep_test_and_clear_young(pte_chain->ptep))
+ referenced++;
+ pte_chain = pte_chain->next;
+ }
+
+ return referenced;
+}
+
+/*
+ * Add a new pte reverse mapping to a page. New pages get added
+ * to the pageout lists, reserved and nonexistent pages (e.g.
+ * mmapped devices) get skipped. In order to prevent races with
+ * the pageout code this function should only get called after
+ * the page table entry has been set up.
+ *
+ * The caller needs to hold the mm->page_table_lock.
+ */
+void page_add_pmap(struct page * page, pte_t * ptep)
+{
+ struct pte_chain * pte_chain;
+ struct page * pte_page = virt_to_page(ptep);
+#ifdef DEBUG
+ struct mm_struct * mm = (void *) pte_page->mapping;
+
+ if (!mm)
+ BUG();
+#endif
+
+ if (!page || !ptep)
+ BUG();
+
+ if (!pte_present(*ptep))
+ return;
+
+ if (!VALID_PAGE(page) || PageReserved(page))
+ return;
+
+ spin_lock(&pagemap_lru_lock);
+#ifdef DEBUG
+ pte_chain = page->pte_chain;
+ while (pte_chain) {
+ if (pte_chain->ptep == ptep) {
+ printk("page_add_pmap: pmap for this (page, *pte) already present!\n");
+ BUG();
+ }
+ pte_chain = pte_chain->next;
+ }
+#endif
+ pte_chain = pte_chain_alloc();
+
+ pte_chain->ptep = ptep;
+
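+ /* New mappings go at the head of the chain; see the note above struct pte_chain. */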
+ pte_chain->next = page->pte_chain;
+ page->pte_chain = pte_chain;
+
+ /*
+ * We can get called with new pages, which are not on any of
+ * the pageout lists yet, in that case we add the page here.
+ */
+ if ((page->flags & ((1<<PG_active)|(1<<PG_inactive_dirty)|
+ (1<<PG_inactive_clean))) == 0)
+ add_page_to_active_list(page);
+
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * page_remove_pmap - remove one mapping to a page
+ * @page: page to remove mapping from
+ * @ptep: page table entry to remove
+ *
+ * This function removes a reverse pte mapping structure from the
+ * pte_chain of a page. After this the caller can clear the page
+ * table entry and free the page.
+ *
+ * The caller needs to hold the mm->page_table_lock
+ */
+int page_remove_pmap(struct page * page, pte_t * ptep)
+{
+ struct pte_chain * pte_chain;
+ struct pte_chain * prev_pte_chain = NULL;
+ int ret = SWAP_ERROR;
+
+ if (!page || !ptep)
+ BUG();
+
+ if (!VALID_PAGE(page) || PageReserved(page))
+ return SWAP_SUCCESS;
+
+ spin_lock(&pagemap_lru_lock);
+ pte_chain = page->pte_chain;
+ while (pte_chain) {
+ if (pte_chain->ptep == ptep) {
+ pte_chain_free(pte_chain, prev_pte_chain, page);
+ ret = SWAP_SUCCESS;
+ goto out;
+ }
+
+ prev_pte_chain = pte_chain;
+ pte_chain = pte_chain->next;
+ }
+ goto notfound;
+out:
+ spin_unlock(&pagemap_lru_lock);
+ return ret;
+
+notfound:
+ /* Not found, should never happen. */
+ printk("page_remove_pmap: pte_chain for pte %p not present...\n", ptep);
+ printk("page_remove_pmap: only found: ");
+ pte_chain = page->pte_chain;
+ while (pte_chain) {
+ printk("%p ", pte_chain->ptep);
+ pte_chain = pte_chain->next;
+ }
+ printk("\n");
+ panic("page_remove_pmap: giving up.\n");
+}
+
+/*
+ * Worker function for page_remove_all_pmaps().
+ */
+static int pmap_remove(struct page * page, struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain)
+{
+ pte_t * ptep = pte_chain->ptep;
+ struct page * pte_page;
+ struct mm_struct * mm;
+ unsigned long address, low_address;
+
+ /* Calculate the arguments to try_to_swap_out... */
+ pte_page = virt_to_page(ptep);
+ /* XXX: get this right for non-x86 .. per-arch pte_addr macros? */
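+ /* The pte's byte offset within its page times PTRS_PER_PTE equals the
+ * virtual offset, as long as PAGE_SIZE / sizeof(pte_t) == PTRS_PER_PTE. */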
+ low_address = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE;
+ address = pte_page->index + low_address;
+ mm = (void *) pte_page->mapping;
+ if (!mm) {
+ printk("pmap_remove: NULL mm, %lx address\n", address);
+ BUG();
+ }
+
+ return try_to_swap_out(mm, address, ptep, page);
+}
+
+/**
+ * page_remove_all_pmaps - remove all mappings to a page
+ * @page: the page to remove mappings from
+ *
+ * This function tries to remove all mappings to a page, it can fail
+ * if it missed a trylock (SWAP_AGAIN) or if it runs out of swap or
+ * encounters an unswappable page (SWAP_FAIL).
+ *
+ * The caller must hold both the pagemap_lru_lock and the page lock for
+ * the specified page.
+ */
+int page_remove_all_pmaps(struct page * page)
+{
+ struct pte_chain * pte_chain = page->pte_chain;
+ struct pte_chain * prev_pte_chain = NULL;
+ int ret = SWAP_SUCCESS;
+
+ /* These pages should never end up on the pageout lists. */
+ if (!VALID_PAGE(page) || PageReserved(page))
+ BUG();
+
+ if (!PageLocked(page))
+ BUG();
+
+ while (pte_chain) {
+ switch (pmap_remove(page, pte_chain, prev_pte_chain)) {
+ case SWAP_SUCCESS:
+ /* Free the current pte_chain ... */
+ pte_chain->ptep = NULL;
+ pte_chain_free(pte_chain, prev_pte_chain, page);
+ /* ... and start at the head of the new list. */
+ pte_chain = page->pte_chain;
+ break;
+ case SWAP_AGAIN:
+ /* Skip this pte if we missed a trylock. */
+ prev_pte_chain = pte_chain;
+ pte_chain = pte_chain->next;
+ ret = SWAP_AGAIN;
+ break;
+ case SWAP_FAIL:
+ /* Give up if the page is unswappable. */
+ return SWAP_FAIL;
+ case SWAP_ERROR:
+ /* Aieee, invalid arguments ... */
+ printk("page_remove_all_pmaps: SWAP_ERROR\n");
+ return SWAP_ERROR;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Helper function to unlink freed pte_chain structures and add
+ * them to the freelist. Protected by the pagemap_lru_lock.
+ */
+static void pte_chain_free(struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain, struct page * page)
+{
+ if (prev_pte_chain)
+ prev_pte_chain->next = pte_chain->next;
+ else if (page)
+ page->pte_chain = pte_chain->next;
+
+ pte_chain->next = pte_chain_freelist;
+ pte_chain_freelist = pte_chain;
+}
+
+/*
+ * When we cannot allocate a new pte_chain structure, we simply unmap
+ * some other page table entries in the system and use those.
+ *
+ * TODO: implementation -- Rik
+ */
+static void pte_chain_reclaim(void)
+{
+ panic("Implement pte_chain_reclaim, you lazy bastard!\n");
+}
+
+/*
+ * Allocates a pageful of new pte_chains. If the page allocation
+ * fails we simply reclaim pte_chain structures which are in use
+ * in the system. Always succeeds.
+ */
+static void alloc_new_pte_chains(void)
+{
+ struct pte_chain * pte_chain;
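+ /* GFP_ATOMIC, because we get called with the pagemap_lru_lock held. */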
+ void * page = (void *) get_zeroed_page(GFP_ATOMIC);
+ if (page) {
+ int count = PAGE_SIZE / sizeof(struct pte_chain);
+ pte_chain = page;
+ do {
+ pte_chain_free(pte_chain, NULL, NULL);
+ pte_chain++;
+ } while (--count > 0);
+ } else {
+ /* Reclaim pte_chain structures which are in use. */
+ pte_chain_reclaim();
+ }
+}
+
+/*
+ * Grab a pte_chain off the freelist, allocating new pte_chains
+ * if necessary. We are protected by the pagemap_lru_lock.
+ */
+static struct pte_chain * pte_chain_alloc(void)
+{
+ struct pte_chain * pte_chain;
+
+ /*
+ * If we run out of free pte_chain structures, we try to
+ * allocate a page of memory and convert that into new
+ * pte_chain structures.
+ */
+ if (!pte_chain_freelist) {
+ alloc_new_pte_chains();
+ }
+
+ /* Remove pte_chain from list and return it. */
+ pte_chain = pte_chain_freelist;
+ pte_chain_freelist = pte_chain->next;
+ pte_chain->next = 0;
+
+ return pte_chain;
+}
--- linux-2.4.7/include/linux/mm.h.orig Thu Jul 26 13:22:46 2001
+++ linux-2.4.7/include/linux/mm.h Thu Jul 26 13:50:49 2001
@@ -130,6 +130,9 @@
struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int write_access);
};
+/* Incomplete declaration; pte_chain is internal to mm/pmap.c */
+struct pte_chain;
+
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
@@ -157,6 +160,7 @@
struct list_head lru; /* Pageout list, eg. active_list;
protected by pagemap_lru_lock !! */
unsigned long age; /* Page aging counter. */
+ struct pte_chain * pte_chain; /* Reverse pte mapping pointer. */
wait_queue_head_t wait; /* Page locked? Stand in line... */
struct page **pprev_hash; /* Complement to *next_hash. */
struct buffer_head * buffers; /* Buffer maps us to a disk block. */
--- linux-2.4.7/include/linux/swap.h.orig Thu Jul 26 13:22:50 2001
+++ linux-2.4.7/include/linux/swap.h Thu Jul 26 13:52:48 2001
@@ -98,6 +98,18 @@
struct zone_t;
+/* linux/mm/pmap.c */
+extern int page_referenced(struct page *);
+extern void page_add_pmap(struct page *, pte_t *);
+extern int page_remove_pmap(struct page *, pte_t *);
+extern int page_remove_all_pmaps(struct page *);
+
+/* page_remove_all_pmaps and try_to_swap_out return values */
+#define SWAP_SUCCESS 0 /* unmapped every user of the page */
+#define SWAP_AGAIN 1 /* missed a trylock, try again later */
+#define SWAP_FAIL 2 /* cannot swap this page out, reactivate */
+#define SWAP_ERROR 3 /* illegal arguments or misc error */
+
/* linux/mm/swap.c */
extern int memory_pressure;
extern void age_page_up(struct page *);
@@ -116,6 +128,7 @@
extern void swap_setup(void);
/* linux/mm/vmscan.c */
+extern int try_to_swap_out(struct mm_struct *, unsigned long, pte_t *, struct page *);
extern struct page * reclaim_page(zone_t *);
extern wait_queue_head_t kswapd_wait;
extern wait_queue_head_t kreclaimd_wait;
@@ -261,8 +274,8 @@
*/
#define INACTIVE_SHIFT 6
#define inactive_min(a,b) ((a) < (b) ? (a) : (b))
-#define inactive_target inactive_min((memory_pressure >> INACTIVE_SHIFT), \
- (num_physpages / 4))
+#define inactive_target inactive_min((memory_pressure >> INACTIVE_SHIFT), \
+ (num_physpages / 4))
/*
* Ugly ugly ugly HACK to make sure the inactive lists
--- linux-2.4.7/TODO.orig Thu Jul 26 13:58:00 2001
+++ linux-2.4.7/TODO Thu Jul 26 13:25:41 2001
@@ -0,0 +1,50 @@
+ reverse mapping TODO
+
+- page_add_pmap / page_remove_pmap / ... LOCAL LOCKING !
+- remove add_to_swap_cache() SMP deadlock pagemap_lru_lock/pagecache_lock
+- make pmap_remove()/pte_alloc() portable ... per-arch pte_addr macros ?
+- no pte_chain struct for unshared pages, direct pte_t *
+- implement pte_chain_reclaim()
+
+- explicit swapout clustering in try_to_swap_out()
+- defragmentation for __alloc_pages() ...
+- swap chaining ???? (swappable swap chains?) (why? ;))
+
+
+ pte / get / pmap order
+
+ -- adding a page
+1. increment page count
+2. setup page table entry
+3. add page to pmap
+
+ -- removing a page
+1. remove pmap
+2. clear page table entry
+3. decrement page count
+
+This interferes with the pageout code in only one way: if
+the pageout code catches the page without our pmap entry
+but with our incremented page count, it'll move the page
+to the active list and will not get around to swapping it
+out right now. This is a very narrow race window, so the
+chances of it happening are small and the impact on the
+system should be zero.
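+
+In code, the ordering above amounts to roughly the sketch below.
+(install_page/remove_page are made-up names for illustration; the
+real call sites are spread over mm/memory.c and friends.)
+
+	static void install_page(struct page * page, pte_t * ptep, pte_t pte)
+	{
+		get_page(page);			/* 1. increment page count */
+		set_pte(ptep, pte);		/* 2. set up the page table entry */
+		page_add_pmap(page, ptep);	/* 3. add page to pmap */
+	}
+
+	static void remove_page(struct page * page, pte_t * ptep)
+	{
+		page_remove_pmap(page, ptep);	/* 1. remove pmap */
+		pte_clear(ptep);		/* 2. clear page table entry */
+		page_cache_release(page);	/* 3. decrement page count */
+	}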