From: Rik van Riel <riel@conectiva.com.br>
To: Dave McCracken <dmccr@us.ibm.com>
Cc: linux-mm@kvack.org
Subject: Re: Rmap code?
Date: Wed, 22 Aug 2001 13:49:38 -0300 (BRST)
Message-ID: <Pine.LNX.4.33L.0108221348240.31410-200000@duckman.distro.conectiva>
In-Reply-To: <7040000.998430236@baldur>
[-- Attachment #1: Type: TEXT/PLAIN, Size: 712 bytes --]
[due to popular demand, cc'd to linux-mm ... if you feel
like playing with this code, please don't duplicate work;
tell the others]
On Tue, 21 Aug 2001, Dave McCracken wrote:
> On the MM Wiki page you said you have some code already written that does
> reverse mapping. Could you possibly send me a copy of what you've done so
> I can play with it? I'd like to experiment with what's possible, and it'd
> make it a whole lot easier if I started with your ideas so far.
Here you are. Note the TODO file in the top level
directory, where some of the problems are noted
down.
Rik
--
IA64: a worthy successor to the i860.
http://www.surriel.com/
http://www.conectiva.com/ http://distro.conectiva.com/
[-- Attachment #2: 2.4.7-pmap --]
[-- Type: TEXT/PLAIN, Size: 34514 bytes --]
--- linux-2.4.7/fs/exec.c.orig Thu Jul 26 13:22:26 2001
+++ linux-2.4.7/fs/exec.c Thu Jul 26 16:19:02 2001
@@ -34,6 +34,7 @@
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
+#include <linux/swap.h>
#define __NO_VERSION__
#include <linux/module.h>
@@ -277,8 +278,9 @@
flush_dcache_page(page);
flush_page_to_ram(page);
set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
+ page_add_pmap(page, pte);
+ spin_unlock(&pagemap_lru_lock);
tsk->mm->rss++;
- spin_unlock(&tsk->mm->page_table_lock);
/* no need for flush_tlb */
return;
--- linux-2.4.7/mm/filemap.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/filemap.c Thu Jul 26 13:24:37 2001
@@ -520,7 +520,9 @@
page->index = index;
add_page_to_inode_queue(mapping, page);
add_page_to_hash_queue(page, page_hash(mapping, index));
- lru_cache_add(page);
+ /* XXX: already on the inactive_dirty list ... deuglify! */
+ if (!PageSwapCache(page))
+ lru_cache_add(page);
spin_unlock(&pagecache_lock);
}
--- linux-2.4.7/mm/memory.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/memory.c Fri Jul 27 11:08:40 2001
@@ -76,6 +76,7 @@
static inline void free_one_pmd(pmd_t * dir)
{
pte_t * pte;
+ struct page * page;
if (pmd_none(*dir))
return;
@@ -86,6 +87,10 @@
}
pte = pte_offset(dir, 0);
pmd_clear(dir);
+ page = virt_to_page(pte);
+ /* Clear the reverse mapping stuff on this page table page. */
+ page->mapping = 0;
+ page->index = 0;
pte_free(pte);
}
@@ -216,7 +221,9 @@
goto cont_copy_pte_range_noset;
if (!pte_present(pte)) {
swap_duplicate(pte_to_swp_entry(pte));
- goto cont_copy_pte_range;
+ /* Swapped out, skip the pmap stuff. */
+ set_pte(dst_pte, pte);
+ goto cont_copy_pte_range_noset;
}
ptepage = pte_page(pte);
if ((!VALID_PAGE(ptepage)) ||
@@ -236,6 +243,7 @@
get_page(ptepage);
cont_copy_pte_range: set_pte(dst_pte, pte);
+ page_add_pmap(ptepage, dst_pte);
cont_copy_pte_range_noset: address += PAGE_SIZE;
if (address >= end)
goto out_unlock;
@@ -312,6 +320,8 @@
if (!size)
break;
page = ptep_get_and_clear(pte);
+ if (pte_present(page))
+ page_remove_pmap(pte_page(page), pte);
pte++;
size--;
if (pte_none(page))
@@ -849,6 +859,7 @@
* - flush the old one
* - update the page tables
* - inform the TLB about the new one
+ * - update the reverse mappings (if the page changes)
*
* We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
*/
@@ -868,7 +879,9 @@
copy_cow_page(old_page,new_page,address);
flush_page_to_ram(new_page);
flush_cache_page(vma, address);
+ page_remove_pmap(old_page, page_table);
establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
+ page_add_pmap(new_page, page_table);
}
/*
@@ -1139,6 +1152,7 @@
flush_page_to_ram(page);
flush_icache_page(vma, page);
set_pte(page_table, pte);
+ page_add_pmap(page, page_table);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
@@ -1153,14 +1167,13 @@
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
pte_t entry;
+ struct page *page = ZERO_PAGE(addr);
/* Read-only mapping of ZERO_PAGE. */
entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
/* ..except if it's a write access */
if (write_access) {
- struct page *page;
-
/* Allocate our own private page. */
spin_unlock(&mm->page_table_lock);
page = alloc_page(GFP_HIGHUSER);
@@ -1178,6 +1191,7 @@
}
set_pte(page_table, entry);
+ page_add_pmap(page, page_table); /* Ignores empty_zero_page ;) */
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
@@ -1240,6 +1254,7 @@
!(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
set_pte(page_table, entry);
+ page_add_pmap(new_page, page_table);
} else {
/* One of our sibling threads was faster, back out. */
page_cache_release(new_page);
@@ -1372,6 +1387,7 @@
pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
if (!pmd_present(*pmd)) {
+ struct page * page;
pte_t *new;
/* "fast" allocation can happen without dropping the lock.. */
@@ -1392,6 +1408,10 @@
goto out;
}
}
+ /* Add reverse pte mapping pointers for pmap.c. */
+ page = virt_to_page(new);
+ page->mapping = (void *)mm;
+ page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
pmd_populate(mm, pmd, new);
}
out:
--- linux-2.4.7/mm/mremap.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/mremap.c Thu Jul 26 13:24:37 2001
@@ -63,6 +63,7 @@
pte_t pte;
if (!pte_none(*src)) {
+ page_remove_pmap(pte_page(*src), src);
pte = ptep_get_and_clear(src);
if (!dst) {
/* No dest? We must put it back. */
@@ -70,6 +71,7 @@
error++;
}
set_pte(dst, pte);
+ page_add_pmap(pte_page(pte), dst);
}
return error;
}
--- linux-2.4.7/mm/page_alloc.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/page_alloc.c Thu Jul 26 13:24:37 2001
@@ -87,6 +87,8 @@
BUG();
if (PageInactiveClean(page))
BUG();
+ if (page->pte_chain)
+ BUG();
page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
page->age = PAGE_AGE_START;
--- linux-2.4.7/mm/swap.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/swap.c Thu Jul 26 13:24:37 2001
@@ -164,22 +164,12 @@
void deactivate_page_nolock(struct page * page)
{
/*
- * One for the cache, one for the extra reference the
- * caller has and (maybe) one for the buffers.
- *
- * This isn't perfect, but works for just about everything.
- * Besides, as long as we don't move unfreeable pages to the
- * inactive_clean list it doesn't need to be perfect...
- */
- int maxcount = (page->buffers ? 3 : 2);
- page->age = 0;
- ClearPageReferenced(page);
-
- /*
* Don't touch it if it's not on the active list.
* (some pages aren't on any list at all)
*/
- if (PageActive(page) && page_count(page) <= maxcount && !page_ramdisk(page)) {
+ if (PageActive(page) && !page_ramdisk(page)) {
+ page->age = 0;
+ ClearPageReferenced(page);
del_page_from_active_list(page);
add_page_to_inactive_dirty_list(page);
}
@@ -266,8 +256,6 @@
*/
void lru_cache_del(struct page * page)
{
- if (!PageLocked(page))
- BUG();
spin_lock(&pagemap_lru_lock);
__lru_cache_del(page);
spin_unlock(&pagemap_lru_lock);
--- linux-2.4.7/mm/swapfile.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/swapfile.c Thu Jul 26 13:24:37 2001
@@ -232,6 +232,7 @@
set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
get_page(page);
+ page_add_pmap(page, dir);
++vma->vm_mm->rss;
}
--- linux-2.4.7/mm/swap_state.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/swap_state.c Thu Jul 26 13:55:13 2001
@@ -165,7 +165,13 @@
if (exclusive_swap_page(page))
delete_from_swap_cache_nolock(page);
UnlockPage(page);
- }
+ /*
+ * If we are the only user and it was an anonymous page
+ * without swap backing, remove the page from the list.
+ * SMP/fork() safe because we hold the mm->page_table_lock.
+ */
+ } else if (page_count(page) == 1)
+ lru_cache_del(page);
page_cache_release(page);
}
--- linux-2.4.7/mm/vmscan.c.orig Thu Jul 26 13:22:09 2001
+++ linux-2.4.7/mm/vmscan.c Tue Jul 31 18:12:07 2001
@@ -27,30 +27,70 @@
#define MAX(a,b) ((a) > (b) ? (a) : (b))
/*
- * The swap-out function returns 1 if it successfully
- * scanned all the pages it was asked to (`count').
- * It returns zero if it couldn't do anything,
+ * try_to_swap_out does nothing but unmap one page table entry
+ * pointing to a page. It has 4 return values:
+ * SWAP_SUCCESS - we succeeded in unmapping this page table entry
+ * SWAP_AGAIN - we failed a try_lock, try again later
+ * SWAP_FAIL - we cannot swap out this page any time soon
+ * (mlocked page, no swap space left, ...)
+ * SWAP_ERROR - an error occurred
*
- * rss may decrease because pages are shared, but this
- * doesn't count as having freed a page.
+ * NOTE: we have to use trylock everywhere, since our locking
+ * order is opposite of the page fault handler, etc.
*/
-/* mm->page_table_lock is held. mmap_sem is not held */
-static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page)
+/* Grabs mm->page_table_lock itself via trylock; the caller holds the
+ * pagemap_lru_lock and the page lock. mmap_sem is not held. */
+int try_to_swap_out(struct mm_struct * mm, unsigned long address, pte_t * page_table, struct page *page)
{
pte_t pte;
swp_entry_t entry;
+ struct vm_area_struct * vma;
+ int ret;
- /* Don't look at this pte if it's been accessed recently. */
- if (ptep_test_and_clear_young(page_table)) {
- page->age += PAGE_AGE_ADV;
- if (page->age > PAGE_AGE_MAX)
- page->age = PAGE_AGE_MAX;
- return;
+ /*
+ * Try to get the mm->page_table_lock. Ideally we'd
+ * change the page fault code so we never grab the
+ * pagemap_lru_lock or page_table_lock while holding
+ * the mm->page_table_lock, but for now we trylock...
+ */
+ if (!spin_trylock(&mm->page_table_lock))
+ return SWAP_AGAIN;
+
+ /*
+ * First, check for various error conditions.
+ * These should be BUG() or panic() once the pmap
+ * VM stabilises...
+ */
+ ret = SWAP_ERROR;
+
+ if (!mm || !page_table || !page) {
+ printk ("try_to_swap_out: called with null argument...\n");
+ goto out_unlock;
+ }
+
+ if (!PageLocked(page)) {
+ printk("try_to_swap_out: page not locked!\n");
+ goto out_unlock;
+ }
+
+ vma = find_vma(mm, address);
+ if (!vma) {
+ printk ("try_to_swap_out: page not in a VMA?!\n");
+ goto out_unlock;
+ }
+
+ /*
+ * OK, no errors occurred. There are still various
+ * opportunities for failure, though ...
+ */
+ ret = SWAP_FAIL;
+
+ /* mlock()ed VMA */
+ if (vma->vm_flags & VM_LOCKED) {
+ goto out_unlock;
}
- if (TryLockPage(page))
- return;
/* From this point on, the odds are that we're going to
* nuke this pte, so read and clear the pte. This hook
@@ -74,11 +115,9 @@
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
mm->rss--;
- if (!page->age)
- deactivate_page(page);
- UnlockPage(page);
page_cache_release(page);
- return;
+ ret = SWAP_SUCCESS;
+ goto out_unlock;
}
/*
@@ -120,206 +159,18 @@
goto out_unlock_restore; /* No swap space left */
/* Add it to the swap cache and mark it dirty */
+ /* XXX: SMP deadlock due to pagecache_lock / pagemap_lru_lock order */
add_to_swap_cache(page, entry);
set_page_dirty(page);
goto set_swap_pte;
out_unlock_restore:
set_pte(page_table, pte);
- UnlockPage(page);
- return;
-}
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count)
-{
- pte_t * pte;
- unsigned long pmd_end;
-
- if (pmd_none(*dir))
- return count;
- if (pmd_bad(*dir)) {
- pmd_ERROR(*dir);
- pmd_clear(dir);
- return count;
- }
-
- pte = pte_offset(dir, address);
-
- pmd_end = (address + PMD_SIZE) & PMD_MASK;
- if (end > pmd_end)
- end = pmd_end;
-
- do {
- if (pte_present(*pte)) {
- struct page *page = pte_page(*pte);
-
- if (VALID_PAGE(page) && !PageReserved(page)) {
- try_to_swap_out(mm, vma, address, pte, page);
- if (!--count)
- break;
- }
- }
- address += PAGE_SIZE;
- pte++;
- } while (address && (address < end));
- mm->swap_address = address + PAGE_SIZE;
- return count;
-}
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count)
-{
- pmd_t * pmd;
- unsigned long pgd_end;
-
- if (pgd_none(*dir))
- return count;
- if (pgd_bad(*dir)) {
- pgd_ERROR(*dir);
- pgd_clear(dir);
- return count;
- }
-
- pmd = pmd_offset(dir, address);
-
- pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
- if (pgd_end && (end > pgd_end))
- end = pgd_end;
-
- do {
- count = swap_out_pmd(mm, vma, pmd, address, end, count);
- if (!count)
- break;
- address = (address + PMD_SIZE) & PMD_MASK;
- pmd++;
- } while (address && (address < end));
- return count;
-}
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
-{
- pgd_t *pgdir;
- unsigned long end;
-
- /* Don't swap out areas which are locked down */
- if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
- return count;
-
- pgdir = pgd_offset(mm, address);
-
- end = vma->vm_end;
- if (address >= end)
- BUG();
- do {
- count = swap_out_pgd(mm, vma, pgdir, address, end, count);
- if (!count)
- break;
- address = (address + PGDIR_SIZE) & PGDIR_MASK;
- pgdir++;
- } while (address && (address < end));
- return count;
-}
-
-/*
- * Returns non-zero if we scanned all `count' pages
- */
-static int swap_out_mm(struct mm_struct * mm, int count)
-{
- unsigned long address;
- struct vm_area_struct* vma;
-
- if (!count)
- return 1;
- /*
- * Go through process' page directory.
- */
-
- /*
- * Find the proper vm-area after freezing the vma chain
- * and ptes.
- */
- spin_lock(&mm->page_table_lock);
- address = mm->swap_address;
- vma = find_vma(mm, address);
- if (vma) {
- if (address < vma->vm_start)
- address = vma->vm_start;
-
- for (;;) {
- count = swap_out_vma(mm, vma, address, count);
- if (!count)
- goto out_unlock;
- vma = vma->vm_next;
- if (!vma)
- break;
- address = vma->vm_start;
- }
- }
- /* Reset to 0 when we reach the end of address space */
- mm->swap_address = 0;
-
out_unlock:
spin_unlock(&mm->page_table_lock);
- return !count;
-}
-
-#define SWAP_MM_SHIFT 4
-#define SWAP_SHIFT 5
-#define SWAP_MIN 8
-
-static inline int swap_amount(struct mm_struct *mm)
-{
- int nr = mm->rss >> SWAP_SHIFT;
- if (nr < SWAP_MIN) {
- nr = SWAP_MIN;
- if (nr > mm->rss)
- nr = mm->rss;
- }
- return nr;
-}
-
-static void swap_out(unsigned int priority, int gfp_mask)
-{
- int counter;
- int retval = 0;
- struct mm_struct *mm = current->mm;
-
- /* Always start by trying to penalize the process that is allocating memory */
- if (mm)
- retval = swap_out_mm(mm, swap_amount(mm));
-
- /* Then, look at the other mm's */
- counter = (mmlist_nr << SWAP_MM_SHIFT) >> priority;
- do {
- struct list_head *p;
-
- spin_lock(&mmlist_lock);
- p = init_mm.mmlist.next;
- if (p == &init_mm.mmlist)
- goto empty;
-
- /* Move it to the back of the queue.. */
- list_del(p);
- list_add_tail(p, &init_mm.mmlist);
- mm = list_entry(p, struct mm_struct, mmlist);
-
- /* Make sure the mm doesn't disappear when we drop the lock.. */
- atomic_inc(&mm->mm_users);
- spin_unlock(&mmlist_lock);
-
- /* Walk about 6% of the address space each time */
- retval |= swap_out_mm(mm, swap_amount(mm));
- mmput(mm);
- } while (--counter >= 0);
- return;
-
-empty:
- spin_unlock(&mmlist_lock);
+ return ret;
}
-
/**
* reclaim_page - reclaims one page from the inactive_clean list
* @zone: reclaim a page from this zone
@@ -395,6 +246,16 @@
del_page_from_inactive_clean_list(page);
UnlockPage(page);
page->age = PAGE_AGE_START;
+ /*
+ * The bugs below cannot happen because other processes would
+ * need the pagecache_lock to find the page. When they find it,
+ * they need to increase the page count, which makes us move
+ * the page back to the active list in the code above.
+ *
+ * Thus, these checks check other code...
+ */
+ if (page->pte_chain)
+ BUG();
if (page_count(page) != 1)
printk("VM: reclaim_page, found page with count %d!\n",
page_count(page));
@@ -454,7 +315,6 @@
/* Page is or was in use? Move it to the active list. */
if (PageReferenced(page) || page->age > 0 ||
- (!page->buffers && page_count(page) > 1) ||
page_ramdisk(page)) {
del_page_from_inactive_dirty_list(page);
add_page_to_active_list(page);
@@ -472,6 +332,30 @@
}
/*
+ * Try to remove all the mappings processes have to
+ * this page. Pages can have "hidden" users, if that
+ * is the case the page gets moved back to the active
+ * list.
+ *
+ * This test is just an optimisation to move unfreeable
+ * pages back to the active list and prevent us from
+ * doing (expensive) disk IO. The "real" test is done
+ * in reclaim_page() and protected by the pagecache_lock.
+ */
+ switch (page_remove_all_pmaps(page)) {
+ case SWAP_AGAIN:
+ UnlockPage(page);
+ continue;
+ case SWAP_FAIL:
+ case SWAP_ERROR:
+ goto page_active;
+ case SWAP_SUCCESS:
+ maxcount = (page->buffers ? 2 : 1);
+ if (page_count(page) > maxcount)
+ goto page_active;
+ }
+
+ /*
* Dirty swap-cache page? Write it out if
* last copy..
*/
@@ -630,20 +514,22 @@
}
/**
- * refill_inactive_scan - scan the active list and find pages to deactivate
- * @priority: the priority at which to scan
+ * refill_inactive - scan the active list and find pages to deactivate
+ * @maxscan: the priority at which to scan (gets converted to pages)
* @target: number of pages to deactivate, zero for background aging
*
* This function will scan a portion of the active list to find
* unused pages, those pages will then be moved to the inactive list.
*/
-int refill_inactive_scan(unsigned int priority, int target)
+int refill_inactive(unsigned int maxscan, int target)
{
struct list_head * page_lru;
struct page * page;
- int maxscan = nr_active_pages >> priority;
- int page_active = 0;
- int nr_deactivated = 0;
+ int referenced, page_active, nr_deactivated;
+
+ /* Convert maxscan to the maximum number of pages to scan. */
+ maxscan = nr_active_pages >> maxscan;
+ nr_deactivated = 0;
/*
* When we are background aging, we try to increase the page aging
@@ -666,23 +552,15 @@
}
/* Do aging on the pages. */
- if (PageTestandClearReferenced(page)) {
- age_page_up_nolock(page);
+ referenced = page_referenced(page);
+ if (referenced) {
+ page->age += (referenced + PAGE_AGE_ADV);
+ if (page->age > PAGE_AGE_MAX)
+ page->age = PAGE_AGE_MAX;
page_active = 1;
} else {
age_page_down_ageonly(page);
- /*
- * Since we don't hold a reference on the page
- * ourselves, we have to do our test a bit more
- * strict then deactivate_page(). This is needed
- * since otherwise the system could hang shuffling
- * unfreeable pages from the active list to the
- * inactive_dirty list and back again...
- *
- * SUBTLE: we can have buffer pages with count 1.
- */
- if (page->age == 0 && page_count(page) <=
- (page->buffers ? 2 : 1)) {
+ if (page->age == 0) {
deactivate_page_nolock(page);
page_active = 0;
} else {
@@ -841,7 +719,7 @@
static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
- int ret = 0;
+ int target, maxscan, ret = 0;
/*
* If we're low on free pages, move pages from the
@@ -859,10 +737,15 @@
/*
* If needed, we move pages from the active list
- * to the inactive list.
- */
- if (inactive_shortage())
- ret += refill_inactive(gfp_mask, user);
+ * to the inactive list. Note that user processes
+ * only scan a small part of the active list, so
+ * that multiple page freers at the same time
+ * won't upset page aging.
+ */
+ target = inactive_shortage();
+ maxscan = user ? DEF_PRIORITY : 0;
+ if (target)
+ ret += refill_inactive(maxscan, target);
/*
* Reclaim unused slab cache if memory is low.
@@ -928,7 +811,7 @@
recalculate_vm_stats();
/* Do background page aging. */
- refill_inactive_scan(DEF_PRIORITY, 0);
+ refill_inactive(0, 0);
}
run_task_queue(&tq_disk);
--- linux-2.4.7/mm/Makefile.orig Thu Jul 26 13:22:14 2001
+++ linux-2.4.7/mm/Makefile Thu Jul 26 13:24:37 2001
@@ -14,7 +14,7 @@
obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
- shmem.o
+ shmem.o pmap.o
obj-$(CONFIG_HIGHMEM) += highmem.o
--- linux-2.4.7/mm/pmap.c.orig Thu Jul 26 13:24:05 2001
+++ linux-2.4.7/mm/pmap.c Thu Jul 26 13:24:37 2001
@@ -0,0 +1,348 @@
+/*
+ * mm/pmap.c - physical to virtual reverse mappings
+ *
+ * Copyright 2001, Rik van Riel (riel@conectiva.com.br)
+ *
+ * Released under the General Public License (GPL).
+ *
+ *
+ * The functions in this file provide a physical to virtual
+ * reverse mapping, which is handy for NUMA memory migration,
+ * process migration, cache coherency on some machines and
+ * for easier swapout balancing.
+ *
+ * The emphasis in this implementation is on simplicity and
+ * low overhead.
+ *
+ * XXX: integrate the *pte* functions into this file so PPC, S/390
+ * and others can override some things here and stop pretending they
+ * have the exact same page table functionality others have ??
+ */
+
+/*
+ * On locking:
+ * - to keep the change in both pte and pte_chain atomic, we surround
+ * the changing of the two by the pagemap_lru_lock; we might want to
+ * change this in the future if there turns out to be contention
+ * - because swapout locking order is opposite to the locking order
+ * used by page faults, the swapout path always uses trylock
+ */
+#include <linux/mm.h>
+#include <linux/swap.h>
+
+#include <asm/smplock.h>
+
+#define DEBUG
+
+/*
+ * For shared pages, we have a series of pte_chain structures.
+ * They are a singly linked list to minimise memory overhead;
+ * this should not be an issue for most uses. For fork-after-exec
+ * we'll be at the start of the linked list, and since 90% of
+ * processes seem to be short-lived they'll stay near the start.
+ *
+ * Turning this into a doubly-linked list with forward mapping from
+ * each process to the pte_chain structure could be a benefit on
+ * workloads where the system has lots of programs which exit after
+ * about an equal time, say apache with a few hundred children.
+ * OTOH, in those cases the child processes shouldn't be exiting all
+ * that often.
+ */
+struct pte_chain {
+ struct pte_chain * next;
+ pte_t * ptep;
+};
+
+static struct pte_chain * pte_chain_freelist;
+
+static struct pte_chain * pte_chain_alloc(void);
+static void pte_chain_free(struct pte_chain *, struct pte_chain *, struct page *);
+
+/*
+ * Quick test_and_clear referenced for all mappings to a page.
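+ *
+ * Returns the number of users that referenced the page since the
+ * last scan; refill_inactive() feeds this into the page aging.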
+ *
+ * The caller needs to hold the pagemap_lru_lock.
+ */
+int page_referenced(struct page * page)
+{
+ struct pte_chain * pte_chain = page->pte_chain;
+ int referenced = 0;
+
+ if (PageReferenced(page))
+ referenced++;
+
+ while (pte_chain) {
+ if (ptep_test_and_clear_young(pte_chain->ptep))
+ referenced++;
+ pte_chain = pte_chain->next;
+ }
+
+ return referenced;
+}
+
+/*
+ * Add a new pte reverse mapping to a page. New pages get added
+ * to the pageout lists, reserved and nonexistent pages (e.g.
+ * mmapped devices) get skipped. In order to prevent races with
+ * the pageout code this function should only get called after
+ * the page table entry has been set up.
+ *
+ * The caller needs to hold the mm->page_table_lock.
+ */
+void page_add_pmap(struct page * page, pte_t * ptep)
+{
+ struct pte_chain * pte_chain;
+ struct page * pte_page = virt_to_page(ptep);
+#ifdef DEBUG
+ struct mm_struct * mm = (void *) pte_page->mapping;
+
+ if (!mm)
+ BUG();
+#endif
+
+ if (!page || !ptep)
+ BUG();
+
+ if (!pte_present(*ptep))
+ return;
+
+ if (!VALID_PAGE(page) || PageReserved(page))
+ return;
+
+ spin_lock(&pagemap_lru_lock);
+#ifdef DEBUG
+ pte_chain = page->pte_chain;
+ while (pte_chain) {
+ if (pte_chain->ptep == ptep) {
+ printk("page_add_pmap: pmap for this (page, *pte) already present!\n");
+ BUG();
+ }
+ pte_chain = pte_chain->next;
+ }
+#endif
+ pte_chain = pte_chain_alloc();
+
+ pte_chain->ptep = ptep;
+
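+ /* New mappings go at the head of the chain; see the note above struct pte_chain. */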
+ pte_chain->next = page->pte_chain;
+ page->pte_chain = pte_chain;
+
+ /*
+ * We can get called with new pages, which are not on any of
+ * the pageout lists yet, in that case we add the page here.
+ */
+ if ((page->flags & ((1<<PG_active)|(1<<PG_inactive_dirty)|
+ (1<<PG_inactive_clean))) == 0)
+ add_page_to_active_list(page);
+
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * page_remove_pmap - remove one mapping to a page
+ * @page: page to remove mapping from
+ * @ptep: page table entry to remove
+ *
+ * This function removes a reverse pte mapping structure from the
+ * pte_chain of a page. After this the caller can clear the page
+ * table entry and free the page.
+ *
+ * The caller needs to hold the mm->page_table_lock
+ */
+int page_remove_pmap(struct page * page, pte_t * ptep)
+{
+ struct pte_chain * pte_chain;
+ struct pte_chain * prev_pte_chain = NULL;
+ int ret = SWAP_ERROR;
+
+ if (!page || !ptep)
+ BUG();
+
+ if (!VALID_PAGE(page) || PageReserved(page))
+ return SWAP_SUCCESS;
+
+ spin_lock(&pagemap_lru_lock);
+ pte_chain = page->pte_chain;
+ while (pte_chain) {
+ if (pte_chain->ptep == ptep) {
+ pte_chain_free(pte_chain, prev_pte_chain, page);
+ ret = SWAP_SUCCESS;
+ goto out;
+ }
+
+ prev_pte_chain = pte_chain;
+ pte_chain = pte_chain->next;
+ }
+ goto notfound;
+out:
+ spin_unlock(&pagemap_lru_lock);
+ return ret;
+
+notfound:
+ /* Not found, should never happen. */
+ printk("page_remove_pmap: pte_chain for pte %p not present...\n", ptep);
+ printk("page_remove_pmap: only found: ");
+ pte_chain = page->pte_chain;
+ while (pte_chain) {
+ printk("%p ", pte_chain->ptep);
+ pte_chain = pte_chain->next;
+ }
+ printk("\n");
+ panic("page_remove_pmap: giving up.\n");
+}
+
+/*
+ * Worker function for page_remove_all_pmaps().
+ */
+static int pmap_remove(struct page * page, struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain)
+{
+ pte_t * ptep = pte_chain->ptep;
+ struct page * pte_page;
+ struct mm_struct * mm;
+ unsigned long address, low_address;
+
+ /* Calculate the arguments to try_to_swap_out... */
+ pte_page = virt_to_page(ptep);
+ /* XXX: get this right for non-x86 .. per-arch pte_addr macros? */
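+ /* The pte's byte offset within its page times PTRS_PER_PTE equals the
+ * virtual offset, as long as PAGE_SIZE / sizeof(pte_t) == PTRS_PER_PTE. */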
+ low_address = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE;
+ address = pte_page->index + low_address;
+ mm = (void *) pte_page->mapping;
+ if (!mm) {
+ printk("pmap_remove: NULL mm, %lx address\n", address);
+ BUG();
+ }
+
+ return try_to_swap_out(mm, address, ptep, page);
+}
+
+/**
+ * page_remove_all_pmaps - remove all mappings to a page
+ * @page: the page to remove mappings from
+ *
+ * This function tries to remove all mappings to a page, it can fail
+ * if it missed a trylock (SWAP_AGAIN) or if it runs out of swap or
+ * encounters an unswappable page (SWAP_FAIL).
+ *
+ * The caller must hold both the pagemap_lru_lock and the page lock for
+ * the specified page.
+ */
+int page_remove_all_pmaps(struct page * page)
+{
+ struct pte_chain * pte_chain = page->pte_chain;
+ struct pte_chain * prev_pte_chain = NULL;
+ int ret = SWAP_SUCCESS;
+
+ /* These pages should never end up on the pageout lists. */
+ if (!VALID_PAGE(page) || PageReserved(page))
+ BUG();
+
+ if (!PageLocked(page))
+ BUG();
+
+ while (pte_chain) {
+ switch (pmap_remove(page, pte_chain, prev_pte_chain)) {
+ case SWAP_SUCCESS:
+ /* Free the current pte_chain ... */
+ pte_chain->ptep = NULL;
+ pte_chain_free(pte_chain, prev_pte_chain, page);
+ /* ... and start at the head of the new list. */
+ pte_chain = page->pte_chain;
+ break;
+ case SWAP_AGAIN:
+ /* Skip this pte if we missed a trylock. */
+ prev_pte_chain = pte_chain;
+ pte_chain = pte_chain->next;
+ ret = SWAP_AGAIN;
+ break;
+ case SWAP_FAIL:
+ /* Give up if the page is unswappable. */
+ return SWAP_FAIL;
+ case SWAP_ERROR:
+ /* Aieee, invalid arguments ... */
+ printk("page_remove_all_pmaps: SWAP_ERROR\n");
+ return SWAP_ERROR;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Helper function to unlink freed pte_chain structures and add
+ * them to the freelist. Protected by the pagemap_lru_lock.
+ */
+static void pte_chain_free(struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain, struct page * page)
+{
+ if (prev_pte_chain)
+ prev_pte_chain->next = pte_chain->next;
+ else if (page)
+ page->pte_chain = pte_chain->next;
+
+ pte_chain->next = pte_chain_freelist;
+ pte_chain_freelist = pte_chain;
+}
+
+/*
+ * When we cannot allocate a new pte_chain structure, we simply unmap
+ * some other page table entries in the system and use those.
+ *
+ * TODO: implementation -- Rik
+ */
+static void pte_chain_reclaim(void)
+{
+ panic("Implement pte_chain_reclaim, you lazy bastard!\n");
+}
+
+/*
+ * Allocates a pageful of new pte_chains. If the page allocation
+ * fails we simply reclaim pte_chain structures which are in use
+ * in the system. Always succeeds.
+ */
+static void alloc_new_pte_chains(void)
+{
+ struct pte_chain * pte_chain;
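+ /* GFP_ATOMIC, because we get called with the pagemap_lru_lock held. */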
+ void * page = (void *) get_zeroed_page(GFP_ATOMIC);
+ if (page) {
+ int count = PAGE_SIZE / sizeof(struct pte_chain);
+ pte_chain = page;
+ do {
+ pte_chain_free(pte_chain, NULL, NULL);
+ pte_chain++;
+ } while (--count > 0);
+ } else {
+ /* Reclaim pte_chain structures which are in use. */
+ pte_chain_reclaim();
+ }
+}
+
+/*
+ * Grab a pte_chain off the freelist, allocating new pte_chains
+ * if necessary. We are protected by the pagemap_lru_lock.
+ */
+static struct pte_chain * pte_chain_alloc(void)
+{
+ struct pte_chain * pte_chain;
+
+ /*
+ * If we run out of free pte_chain structures, we try to
+ * allocate a page of memory and convert that into new
+ * pte_chain structures.
+ */
+ if (!pte_chain_freelist) {
+ alloc_new_pte_chains();
+ }
+
+ /* Remove pte_chain from list and return it. */
+ pte_chain = pte_chain_freelist;
+ pte_chain_freelist = pte_chain->next;
+ pte_chain->next = 0;
+
+ return pte_chain;
+}
--- linux-2.4.7/include/linux/mm.h.orig Thu Jul 26 13:22:46 2001
+++ linux-2.4.7/include/linux/mm.h Thu Jul 26 13:50:49 2001
@@ -130,6 +130,9 @@
struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int write_access);
};
+/* Incomplete declaration; pte_chain is internal to mm/pmap.c */
+struct pte_chain;
+
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
@@ -157,6 +160,7 @@
struct list_head lru; /* Pageout list, eg. active_list;
protected by pagemap_lru_lock !! */
unsigned long age; /* Page aging counter. */
+ struct pte_chain * pte_chain; /* Reverse pte mapping pointer. */
wait_queue_head_t wait; /* Page locked? Stand in line... */
struct page **pprev_hash; /* Complement to *next_hash. */
struct buffer_head * buffers; /* Buffer maps us to a disk block. */
--- linux-2.4.7/include/linux/swap.h.orig Thu Jul 26 13:22:50 2001
+++ linux-2.4.7/include/linux/swap.h Thu Jul 26 13:52:48 2001
@@ -98,6 +98,18 @@
struct zone_t;
+/* linux/mm/pmap.c */
+extern int page_referenced(struct page *);
+extern void page_add_pmap(struct page *, pte_t *);
+extern int page_remove_pmap(struct page *, pte_t *);
+extern int page_remove_all_pmaps(struct page *);
+
+/* page_remove_all_pmaps and try_to_swap_out return values */
+#define SWAP_SUCCESS 0 /* unmapped every user of the page */
+#define SWAP_AGAIN 1 /* missed a trylock, try again later */
+#define SWAP_FAIL 2 /* cannot swap this page out, reactivate */
+#define SWAP_ERROR 3 /* illegal arguments or misc error */
+
/* linux/mm/swap.c */
extern int memory_pressure;
extern void age_page_up(struct page *);
@@ -116,6 +128,7 @@
extern void swap_setup(void);
/* linux/mm/vmscan.c */
+extern int try_to_swap_out(struct mm_struct *, unsigned long, pte_t *, struct page *);
extern struct page * reclaim_page(zone_t *);
extern wait_queue_head_t kswapd_wait;
extern wait_queue_head_t kreclaimd_wait;
@@ -261,8 +274,8 @@
*/
#define INACTIVE_SHIFT 6
#define inactive_min(a,b) ((a) < (b) ? (a) : (b))
-#define inactive_target inactive_min((memory_pressure >> INACTIVE_SHIFT), \
- (num_physpages / 4))
+#define inactive_target inactive_min((memory_pressure >> INACTIVE_SHIFT), \
+ (num_physpages / 4))
/*
* Ugly ugly ugly HACK to make sure the inactive lists
--- linux-2.4.7/TODO.orig Thu Jul 26 13:58:00 2001
+++ linux-2.4.7/TODO Thu Jul 26 13:25:41 2001
@@ -0,0 +1,50 @@
+ reverse mapping TODO
+
+- page_add_pmap / page_remove_pmap / ... LOCAL LOCKING !
+- remove add_to_swap_cache() SMP deadlock pagemap_lru_lock/pagecache_lock
+- make pmap_remove()/pte_alloc() portable ... per-arch pte_addr macros ?
+- no pte_chain struct for unshared pages, direct pte_t *
+- implement pte_chain_reclaim()
+
+- explicit swapout clustering in try_to_swap_out()
+- defragmentation for __alloc_pages() ...
+- swap chaining ???? (swappable swap chains?) (why? ;))
+
+
+ pte / get / pmap order
+
+ -- adding a page
+1. increment page count
+2. setup page table entry
+3. add page to pmap
+
+ -- removing a page
+1. remove pmap
+2. clear page table entry
+3. decrement page count
+
+This interferes with the pageout code in only one way: if
+the pageout code catches the page without our pmap entry
+but with our incremented page count, it'll move the page
+to the active list and will not get around to swapping it
+out right now. This is a very narrow race window, so the
+chances of it happening are small and the impact on the
+system should be zero.
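+
+In code, the ordering above amounts to roughly the sketch below.
+(install_page/remove_page are made-up names for illustration; the
+real call sites are spread over mm/memory.c and friends.)
+
+	static void install_page(struct page * page, pte_t * ptep, pte_t pte)
+	{
+		get_page(page);			/* 1. increment page count */
+		set_pte(ptep, pte);		/* 2. set up the page table entry */
+		page_add_pmap(page, ptep);	/* 3. add page to pmap */
+	}
+
+	static void remove_page(struct page * page, pte_t * ptep)
+	{
+		page_remove_pmap(page, ptep);	/* 1. remove pmap */
+		pte_clear(ptep);		/* 2. clear page table entry */
+		page_cache_release(page);	/* 3. decrement page count */
+	}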