From: Dave McCracken <dmccr@us.ibm.com>
To: Linux Memory Management <linux-mm@kvack.org>
Subject: [PATCH 2.5.62] Partial object-based rmap implementation
Date: Thu, 20 Feb 2003 10:13:31 -0600
Message-ID: <8390000.1045757611@baldur.austin.ibm.com>
There's been a fair amount of discussion about the advantages of doing
object-based rmap. I've been looking into it, and we already have the pieces
to do it for file-backed objects, i.e. the ones whose struct page points to a
real address_space object. The stumbling block has always been anonymous
pages.
At Martin Bligh's suggestion, I coded up an object-based implementation for
non-anon pages while leaving the pte_chain code intact for anon pages. My
fork/exit microbenchmark shows roughly a 50% improvement for tasks whose
mappings are composed of file-backed and/or shared pages. This is the code
that Martin included in 2.5.62-mjb2 and reported his performance results on.
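The split between the two schemes hinges on the new PG_anon page flag: every
path that instantiates an anonymous page sets it, and the rmap entry points
branch on it. Schematically (matching the page_referenced() hunk below, with
the existing pte_chain walk elided):

	int page_referenced(struct page *page)
	{
		int referenced = !!TestClearPageReferenced(page);

		if (!PageAnon(page))
			/* file-backed: walk the vmas on page->mapping */
			return referenced + page_referenced_obj(page);

		/* anonymous: fall through to the PageDirect/pte_chain walk */
		...
	}

try_to_unmap() takes the same branch into try_to_unmap_obj().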
Anyway, here's the patch if anyone wants to check it out.
Dave McCracken
======================================================================
Dave McCracken IBM Linux Base Kernel Team 1-512-838-3059
dmccr@us.ibm.com T/L 678-3059
[-- Attachment #2: objrmap-2.5.62-3.diff --]
--- 2.5.62/./include/linux/mm.h 2003-02-17 16:55:50.000000000 -0600
+++ 2.5.62-objrmap/./include/linux/mm.h 2003-02-19 12:00:47.000000000 -0600
@@ -171,6 +171,7 @@
struct pte_chain *chain;/* Reverse pte mapping pointer.
* protected by PG_chainlock */
pte_addr_t direct;
+ atomic_t mapcount;
} pte;
unsigned long private; /* mapping-private opaque data */
--- 2.5.62/./include/linux/page-flags.h 2003-02-17 16:56:25.000000000 -0600
+++ 2.5.62-objrmap/./include/linux/page-flags.h 2003-02-18 10:22:26.000000000 -0600
@@ -74,6 +74,7 @@
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */
+#define PG_anon 20 /* Anonymous page */
/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -256,6 +257,10 @@
#define SetPageCompound(page) set_bit(PG_compound, &(page)->flags)
#define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags)
+#define PageAnon(page) test_bit(PG_anon, &(page)->flags)
+#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags)
+#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags)
+
/*
* The PageSwapCache predicate doesn't use a PG_flag at this time,
* but it may again do so one day.
--- 2.5.62/./fs/exec.c 2003-02-17 16:56:12.000000000 -0600
+++ 2.5.62-objrmap/./fs/exec.c 2003-02-18 11:46:33.000000000 -0600
@@ -316,6 +316,7 @@
lru_cache_add_active(page);
flush_dcache_page(page);
flush_page_to_ram(page);
+ SetPageAnon(page);
set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
pte_chain = page_add_rmap(page, pte, pte_chain);
pte_unmap(pte);
--- 2.5.62/./mm/page_alloc.c 2003-02-17 16:55:51.000000000 -0600
+++ 2.5.62-objrmap/./mm/page_alloc.c 2003-02-18 10:22:26.000000000 -0600
@@ -220,6 +220,8 @@
bad_page(function, page);
if (PageDirty(page))
ClearPageDirty(page);
+ if (PageAnon(page))
+ ClearPageAnon(page);
}
/*
--- 2.5.62/./mm/swapfile.c 2003-02-17 16:56:01.000000000 -0600
+++ 2.5.62-objrmap/./mm/swapfile.c 2003-02-19 16:39:24.000000000 -0600
@@ -390,6 +390,7 @@
return;
get_page(page);
set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
+ SetPageAnon(page);
*pte_chainp = page_add_rmap(page, dir, *pte_chainp);
swap_free(entry);
++vma->vm_mm->rss;
--- 2.5.62/./mm/memory.c 2003-02-17 16:56:14.000000000 -0600
+++ 2.5.62-objrmap/./mm/memory.c 2003-02-18 10:22:26.000000000 -0600
@@ -988,6 +988,7 @@
++mm->rss;
page_remove_rmap(old_page, page_table);
break_cow(vma, new_page, address, page_table);
+ SetPageAnon(new_page);
pte_chain = page_add_rmap(new_page, page_table, pte_chain);
lru_cache_add_active(new_page);
@@ -1197,6 +1198,7 @@
flush_page_to_ram(page);
flush_icache_page(vma, page);
set_pte(page_table, pte);
+ SetPageAnon(page);
pte_chain = page_add_rmap(page, page_table, pte_chain);
/* No need to invalidate - it was non-present before */
@@ -1263,6 +1265,7 @@
entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
lru_cache_add_active(page);
mark_page_accessed(page);
+ SetPageAnon(page);
}
set_pte(page_table, entry);
@@ -1334,6 +1337,7 @@
copy_user_highpage(page, new_page, address);
page_cache_release(new_page);
lru_cache_add_active(page);
+ SetPageAnon(page);
new_page = page;
}
--- 2.5.62/./mm/rmap.c 2003-02-17 16:56:58.000000000 -0600
+++ 2.5.62-objrmap/./mm/rmap.c 2003-02-19 12:05:48.000000000 -0600
@@ -86,6 +86,89 @@
* If the page has a single-entry pte_chain, collapse that back to a PageDirect
* representation. This way, it's only done under memory pressure.
*/
+static inline int
+page_referenced_obj_one(struct vm_area_struct *vma, struct page *page)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long loffset;
+ unsigned long address;
+ int referenced = 0;
+
+ loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
+ if (loffset < vma->vm_pgoff)
+ goto out;
+
+ address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
+
+ if (address >= vma->vm_end)
+ goto out;
+
+ if (!spin_trylock(&mm->page_table_lock)) {
+ referenced = 1;
+ goto out;
+ }
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd)) {
+ goto out_unlock;
+ }
+ pmd = pmd_offset(pgd, address);
+ if (!pmd_present(*pmd)) {
+ goto out_unlock;
+ }
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte)) {
+ goto out_unmap;
+ }
+ if (page_to_pfn(page) != pte_pfn(*pte)) {
+ goto out_unmap;
+ }
+ if (ptep_test_and_clear_young(pte))
+ referenced++;
+out_unmap:
+ pte_unmap(pte);
+
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+
+out:
+ return referenced;
+}
+
+static int
+page_referenced_obj(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct vm_area_struct *vma;
+ int referenced = 0;
+
+ if (atomic_read(&page->pte.mapcount) == 0)
+ return 0;
+
+ if (!mapping)
+ BUG();
+
+ if (PageSwapCache(page))
+ BUG();
+
+ if (down_trylock(&mapping->i_shared_sem))
+ return 1;
+
+ list_for_each_entry(vma, &mapping->i_mmap, shared) {
+ referenced += page_referenced_obj_one(vma, page);
+ }
+
+ list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+ referenced += page_referenced_obj_one(vma, page);
+ }
+
+ up(&mapping->i_shared_sem);
+
+ return referenced;
+}
+
int page_referenced(struct page * page)
{
struct pte_chain * pc;
@@ -94,6 +177,10 @@
if (TestClearPageReferenced(page))
referenced++;
+ if (!PageAnon(page)) {
+ referenced += page_referenced_obj(page);
+ goto out;
+ }
if (PageDirect(page)) {
pte_t *pte = rmap_ptep_map(page->pte.direct);
if (ptep_test_and_clear_young(pte))
@@ -127,6 +214,7 @@
__pte_chain_free(pc);
}
}
+out:
return referenced;
}
@@ -157,6 +245,15 @@
if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
return pte_chain;
+ if (!PageAnon(page)) {
+ if (!page->mapping)
+ BUG();
+ if (PageSwapCache(page))
+ BUG();
+ atomic_inc(&page->pte.mapcount);
+ return pte_chain;
+ }
+
pte_chain_lock(page);
#ifdef DEBUG_RMAP
@@ -245,6 +342,17 @@
if (!page_mapped(page))
return; /* remap_page_range() from a driver? */
+ if (!PageAnon(page)) {
+ if (!page->mapping)
+ BUG();
+ if (PageSwapCache(page))
+ BUG();
+ if (atomic_read(&page->pte.mapcount) == 0)
+ BUG();
+ atomic_dec(&page->pte.mapcount);
+ return;
+ }
+
pte_chain_lock(page);
if (PageDirect(page)) {
@@ -310,6 +418,112 @@
return;
}
+static inline int
+try_to_unmap_obj_one(struct vm_area_struct *vma, struct page *page)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+ pte_t pteval;
+ unsigned long loffset;
+ unsigned long address;
+ int ret = SWAP_SUCCESS;
+
+ loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
+ if (loffset < vma->vm_pgoff)
+ goto out;
+
+ address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT);
+
+ if (address >= vma->vm_end)
+ goto out;
+
+ if (!spin_trylock(&mm->page_table_lock)) {
+ ret = SWAP_AGAIN;
+ goto out;
+ }
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd)) {
+ goto out_unlock;
+ }
+ pmd = pmd_offset(pgd, address);
+ if (!pmd_present(*pmd)) {
+ goto out_unlock;
+ }
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte)) {
+ goto out_unmap;
+ }
+ if (page_to_pfn(page) != pte_pfn(*pte)) {
+ goto out_unmap;
+ }
+
+ if (vma->vm_flags & VM_LOCKED) {
+ ret = SWAP_FAIL;
+ goto out_unmap;
+ }
+
+ flush_cache_page(vma, address);
+ pteval = ptep_get_and_clear(pte);
+ flush_tlb_page(vma, address);
+
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ if (atomic_read(&page->pte.mapcount) == 0)
+ BUG();
+
+ mm->rss--;
+ atomic_dec(&page->pte.mapcount);
+ page_cache_release(page);
+
+out_unmap:
+ pte_unmap(pte);
+
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+
+out:
+ return ret;
+}
+
+static int
+try_to_unmap_obj(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct vm_area_struct *vma;
+ int ret = SWAP_SUCCESS;
+
+ if (!mapping)
+ BUG();
+
+ if (PageSwapCache(page))
+ BUG();
+
+ if (down_trylock(&mapping->i_shared_sem))
+ return SWAP_AGAIN;
+
+ list_for_each_entry(vma, &mapping->i_mmap, shared) {
+ ret = try_to_unmap_obj_one(vma, page);
+ if (ret != SWAP_SUCCESS)
+ goto out;
+ }
+
+ list_for_each_entry(vma, &mapping->i_mmap_shared, shared) {
+ ret = try_to_unmap_obj_one(vma, page);
+ if (ret != SWAP_SUCCESS)
+ goto out;
+ }
+
+ if (atomic_read(&page->pte.mapcount) != 0)
+ BUG();
+
+out:
+ up(&mapping->i_shared_sem);
+ return ret;
+}
+
/**
* try_to_unmap_one - worker function for try_to_unmap
* @page: page to unmap
@@ -414,6 +628,11 @@
if (!page->mapping)
BUG();
+ if (!PageAnon(page)) {
+ ret = try_to_unmap_obj(page);
+ goto out;
+ }
+
if (PageDirect(page)) {
ret = try_to_unmap_one(page, page->pte.direct);
if (ret == SWAP_SUCCESS) {