* [RFC][PATCH 1/7] CART Implementation v3
2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 2/7] " a.p.zijlstra
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
To: linux-mm
[-- Attachment #1: cart-nonresident.patch --]
[-- Type: text/plain, Size: 12877 bytes --]
Index: linux-2.6-git/include/linux/swap.h
===================================================================
--- linux-2.6-git.orig/include/linux/swap.h
+++ linux-2.6-git/include/linux/swap.h
@@ -154,6 +154,22 @@ extern void out_of_memory(unsigned int _
/* linux/mm/memory.c */
extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
+/* linux/mm/nonresident.c */
+#define NR_b1 0
+#define NR_b2 1
+#define NR_free 2
+#define NR_lost 3
+
+#define NR_listid 3
+#define NR_found 0x80000000
+
+
+extern int nonresident_put(struct address_space *, unsigned long, int, int);
+extern int nonresident_find(struct address_space *, unsigned long);
+extern unsigned int nonresident_count(int);
+extern unsigned int nonresident_total(void);
+extern void nonresident_init(void);
+
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
extern unsigned long totalhigh_pages;
@@ -292,6 +308,13 @@ static inline swp_entry_t get_swap_page(
#define grab_swap_token() do { } while(0)
#define has_swap_token(x) 0
+/* linux/mm/nonresident.c */
+#define nonresident_put(w,x,y,z) 0
+#define nonresident_find(x,y) 0
+#define nonresident_count(x) 0
+#define nonresident_total() 0
+#define nonresident_init() do { } while (0)
+
#endif /* CONFIG_SWAP */
#endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */
Index: linux-2.6-git/init/main.c
===================================================================
--- linux-2.6-git.orig/init/main.c
+++ linux-2.6-git/init/main.c
@@ -47,6 +47,7 @@
#include <linux/rmap.h>
#include <linux/mempolicy.h>
#include <linux/key.h>
+#include <linux/swap.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -494,6 +495,7 @@ asmlinkage void __init start_kernel(void
}
#endif
vfs_caches_init_early();
+ nonresident_init();
mem_init();
kmem_cache_init();
setup_per_cpu_pageset();
Index: linux-2.6-git/mm/Makefile
===================================================================
--- linux-2.6-git.orig/mm/Makefile
+++ linux-2.6-git/mm/Makefile
@@ -12,7 +12,8 @@ obj-y := bootmem.o filemap.o mempool.o
readahead.o slab.o swap.o truncate.o vmscan.o \
prio_tree.o $(mmu-y)
-obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o \
+ nonresident.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
Index: linux-2.6-git/mm/nonresident.c
===================================================================
--- /dev/null
+++ linux-2.6-git/mm/nonresident.c
@@ -0,0 +1,372 @@
+/*
+ * mm/nonresident.c
+ * (C) 2004,2005 Red Hat, Inc
+ * Written by Rik van Riel <riel@redhat.com>
+ * Released under the GPL, see the file COPYING for details.
+ * Adapted by Peter Zijlstra <a.p.zijlstra@chello.nl> for use by ARC-like
+ * algorithms.
+ *
+ * Keeps track of whether a non-resident page was recently evicted
+ * and should be immediately promoted to the active list. This also
+ * helps automatically tune the inactive target.
+ *
+ * The pageout code stores a recently evicted page in this cache
+ * by calling nonresident_put(mapping/mm, index/vaddr, ...)
+ * and can look it up again by calling nonresident_find()
+ * with the same mapping/index.
+ *
+ * Note that there is no way to invalidate pages after e.g. truncate
+ * or exit; we let such pages fall out of the non-resident set through
+ * normal replacement.
+ *
+ *
+ * Modified to work with ARC-like algorithms that:
+ * - need to balance two FIFOs; |b1| + |b2| = c.
+ *
+ * The bucket contains four singly linked cyclic lists (CLOCKs) and each
+ * clock has a tail hand. By selecting a victim clock upon insertion it
+ * is possible to balance them.
+ *
+ * The first two lists are used for B1/B2, and a third serves as a free slot list.
+ * The fourth list is unused.
+ *
+ * The slot looks like this:
+ * struct slot_t {
+ * u32 cookie : 24; // LSB
+ * u32 index : 6;
+ * u32 listid : 2;
+ * };
+ *
+ * The bucket is guarded by a spinlock.
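+ *
+ * (A standalone sketch of this slot encoding is appended after the patch.)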
+ */
+#include <linux/swap.h>
+#include <linux/mm.h>
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/hash.h>
+#include <linux/prefetch.h>
+#include <linux/kernel.h>
+
+#define TARGET_SLOTS 64
+#define NR_CACHELINES (TARGET_SLOTS*sizeof(u32) / L1_CACHE_BYTES)
+#define NR_SLOTS (((NR_CACHELINES * L1_CACHE_BYTES) - sizeof(spinlock_t) - 2*sizeof(u16)) / sizeof(u32))
+#if 0
+#if NR_SLOTS < (TARGET_SLOTS / 2)
+#warning very small slot size
+#if NR_SLOTS <= 0
+#error no room for slots left
+#endif
+#endif
+#endif
+
+#define BUILD_MASK(bits, shift) (((1 << (bits)) - 1) << (shift))
+
+#define LISTID_BITS 2
+#define LISTID_SHIFT (sizeof(u32)*8 - LISTID_BITS)
+#define LISTID_MASK BUILD_MASK(LISTID_BITS, LISTID_SHIFT)
+
+#define SET_LISTID(x, flg) ((x) = ((x) & ~LISTID_MASK) | ((flg) << LISTID_SHIFT))
+#define GET_LISTID(x) (((x) & LISTID_MASK) >> LISTID_SHIFT)
+
+#define INDEX_BITS 6 /* ceil(log2(NR_SLOTS)) */
+#define INDEX_SHIFT (LISTID_SHIFT - INDEX_BITS)
+#define INDEX_MASK BUILD_MASK(INDEX_BITS, INDEX_SHIFT)
+
+#define SET_INDEX(x, idx) ((x) = ((x) & ~INDEX_MASK) | ((idx) << INDEX_SHIFT))
+#define GET_INDEX(x) (((x) & INDEX_MASK) >> INDEX_SHIFT)
+
+#define COOKIE_MASK BUILD_MASK(sizeof(u32)*8 - LISTID_BITS - INDEX_BITS, 0)
+
+struct nr_bucket
+{
+ spinlock_t lock;
+ u8 hand[4];
+ u32 slot[NR_SLOTS];
+} ____cacheline_aligned;
+
+/* The non-resident page hash table. */
+static struct nr_bucket * nonres_table;
+static unsigned int nonres_shift;
+static unsigned int nonres_mask;
+
+/* hash the address into a bucket */
+static struct nr_bucket * nr_hash(void * mapping, unsigned long index)
+{
+ unsigned long bucket;
+ unsigned long hash;
+
+ hash = (unsigned long)mapping + 37 * index;
+ bucket = hash_long(hash, nonres_shift);
+
+ return nonres_table + bucket;
+}
+
+/* hash the address and inode into a cookie */
+static u32 nr_cookie(struct address_space * mapping, unsigned long index)
+{
+ unsigned long hash;
+
+ hash = 37 * (unsigned long)mapping + index;
+
+ if (mapping && mapping->host)
+ hash = 37 * hash + mapping->host->i_ino;
+
+ return hash_long(hash, sizeof(u32)*8 - LISTID_BITS - INDEX_BITS);
+}
+
+static DEFINE_PER_CPU(unsigned int[4], nonres_count);
+
+/*
+ * remove current (b from 'abc'):
+ *
+ * initial swap(2,3)
+ *
+ * 1: -> [2],a 1: -> [2],a
+ * * 2: -> [3],b 2: -> [1],c
+ * 3: -> [1],c * 3: -> [3],b
+ *
+ * 3 is now free for use.
+ *
+ * @nr_bucket: bucket to operate in
+ * @listid: list that the deletee belongs to
+ * @pos: slot position of deletee
+ * @slot: possible pointer to slot
+ *
+ * returns pointer to removed slot, NULL when list empty.
+ */
+static u32 * __nonresident_del(struct nr_bucket *nr_bucket, int listid, u8 pos, u32 *slot)
+{
+ int next_pos;
+ u32 *next;
+
+ if (slot == NULL) {
+ slot = &nr_bucket->slot[pos];
+ if (GET_LISTID(*slot) != listid)
+ return NULL;
+ }
+
+ --__get_cpu_var(nonres_count[listid]);
+
+ next_pos = GET_INDEX(*slot);
+ if (pos == next_pos) {
+ next = slot;
+ goto out;
+ }
+
+ next = &nr_bucket->slot[next_pos];
+ *next = xchg(slot, *next);
+
+ if (next_pos == nr_bucket->hand[listid])
+ nr_bucket->hand[listid] = pos;
+out:
+ BUG_ON(GET_INDEX(*next) != next_pos);
+ return next;
+}
+
+static inline u32 * __nonresident_pop(struct nr_bucket *nr_bucket, int listid)
+{
+ return __nonresident_del(nr_bucket, listid, nr_bucket->hand[listid], NULL);
+}
+
+/*
+ * insert before (d before b in 'abc')
+ *
+ * initial set 4 swap(2,4)
+ *
+ * 1: -> [2],a 1: -> [2],a 1: -> [2],a
+ * * 2: -> [3],b 2: -> [3],b 2: -> [4],d
+ * 3: -> [1],c 3: -> [1],c 3: -> [1],c
+ * 4: -> [4],nil 4: -> [4],d * 4: -> [3],b
+ *
+ * leaving us with 'adbc'.
+ *
+ * @nr_bucket: bucket to operate in
+ * @listid: list to insert into
+ * @pos: position to insert before
+ * @slot: slot to insert
+ */
+static void __nonresident_insert(struct nr_bucket *nr_bucket, int listid, u8 *pos, u32 *slot)
+{
+ u32 *head;
+
+ SET_LISTID(*slot, listid);
+
+ head = &nr_bucket->slot[*pos];
+
+ *pos = GET_INDEX(*slot);
+ if (GET_LISTID(*head) == listid)
+ *slot = xchg(head, *slot);
+
+ ++__get_cpu_var(nonres_count[listid]);
+}
+
+static inline void __nonresident_push(struct nr_bucket *nr_bucket, int listid, u32 *slot)
+{
+ __nonresident_insert(nr_bucket, listid, &nr_bucket->hand[listid], slot);
+}
+
+/*
+ * Remembers a page by putting a hash-cookie on the @listid list.
+ *
+ * @mapping: page_mapping()
+ * @index: page_index()
+ * @listid: list to put the page on (NR_b1, NR_b2 or NR_free).
+ * @listid_evict: list to take a slot from when NR_free is empty.
+ *
+ * returns the list the free slot was taken from.
+ */
+int nonresident_put(struct address_space * mapping, unsigned long index, int listid, int listid_evict)
+{
+ struct nr_bucket *nr_bucket;
+ u32 cookie;
+ unsigned long flags;
+ u32 *slot;
+ int evict = NR_free;
+
+ prefetch(mapping->host);
+ nr_bucket = nr_hash(mapping, index);
+
+ spin_lock_prefetch(nr_bucket); // prefetchw_range(nr_bucket, NR_CACHELINES);
+ cookie = nr_cookie(mapping, index);
+
+ spin_lock_irqsave(&nr_bucket->lock, flags);
+ slot = __nonresident_pop(nr_bucket, evict);
+ if (!slot) {
+ evict = listid_evict;
+ slot = __nonresident_pop(nr_bucket, evict);
+ if (!slot) {
+ evict ^= 1;
+ slot = __nonresident_pop(nr_bucket, evict);
+ }
+ }
+ BUG_ON(!slot);
+ SET_INDEX(cookie, GET_INDEX(*slot));
+ cookie = xchg(slot, cookie);
+ __nonresident_push(nr_bucket, listid, slot);
+ spin_unlock_irqrestore(&nr_bucket->lock, flags);
+
+ return evict;
+}
+
+/*
+ * Searches for a page on the first two lists and, if found, moves its slot to the free list.
+ *
+ * @mapping: page_mapping()
+ * @index: page_index()
+ *
+ * returns the listid of the list the item was found on, with NR_found set; 0 when not found.
+ */
+int nonresident_find(struct address_space * mapping, unsigned long index)
+{
+ struct nr_bucket * nr_bucket;
+ u32 wanted;
+ int j;
+ u8 i;
+ unsigned long flags;
+
+ prefetch(mapping->host);
+ nr_bucket = nr_hash(mapping, index);
+
+ spin_lock_prefetch(nr_bucket); // prefetch_range(nr_bucket, NR_CACHELINES);
+ wanted = nr_cookie(mapping, index) & COOKIE_MASK;
+
+ spin_lock_irqsave(&nr_bucket->lock, flags);
+ for (i = 0; i < 2; ++i) {
+ j = nr_bucket->hand[i];
+ do {
+ u32 *slot = &nr_bucket->slot[j];
+ if (GET_LISTID(*slot) != i)
+ break;
+
+ if ((*slot & COOKIE_MASK) == wanted) {
+ slot = __nonresident_del(nr_bucket, i, j, slot);
+ /* __nonresident_push(nr_bucket, i, slot); */
+ __nonresident_push(nr_bucket, NR_free, slot);
+ spin_unlock_irqrestore(&nr_bucket->lock, flags);
+ return i | NR_found;
+ }
+
+ j = GET_INDEX(*slot);
+ } while (j != nr_bucket->hand[i]);
+ }
+ spin_unlock_irqrestore(&nr_bucket->lock, flags);
+
+ return 0;
+}
+
+unsigned int nonresident_count(int listid)
+{
+ unsigned int ret = 0;
+ int cpu;
+
+ preempt_disable();
+ for_each_cpu(cpu) {
+ ret += per_cpu(nonres_count[listid], cpu);
+ }
+ preempt_enable();
+
+ return ret;
+}
+
+unsigned int nonresident_total(void)
+{
+ return (1 << nonres_shift) * NR_SLOTS;
+}
+
+/*
+ * For interactive workloads, we remember about as many non-resident pages
+ * as we have actual memory pages. For server workloads with large inter-
+ * reference distances we could benefit from remembering more.
+ */
+static __initdata unsigned long nonresident_factor = 1;
+void __init nonresident_init(void)
+{
+ int target;
+ int i, j;
+
+ /*
+ * Calculate the non-resident hash bucket target. Use a power of
+ * two for the division because alloc_large_system_hash rounds up.
+ */
+ target = nr_all_pages * nonresident_factor;
+ target /= (sizeof(struct nr_bucket) / sizeof(u32));
+
+ nonres_table = alloc_large_system_hash("Non-resident page tracking",
+ sizeof(struct nr_bucket),
+ target,
+ 0,
+ HASH_EARLY | HASH_HIGHMEM,
+ &nonres_shift,
+ &nonres_mask,
+ 0);
+
+ for (i = 0; i < (1 << nonres_shift); i++) {
+ spin_lock_init(&nonres_table[i].lock);
+ for (j = 0; j < 4; ++j)
+ nonres_table[i].hand[j] = 0;
+
+ for (j = 0; j < NR_SLOTS; ++j) {
+ nonres_table[i].slot[j] = 0;
+ SET_LISTID(nonres_table[i].slot[j], NR_free);
+ if (j < NR_SLOTS - 1)
+ SET_INDEX(nonres_table[i].slot[j], j+1);
+ else /* j == NR_SLOTS - 1 */
+ SET_INDEX(nonres_table[i].slot[j], 0);
+ }
+ }
+
+ for_each_cpu(i) {
+ for (j=0; j<4; ++j)
+ per_cpu(nonres_count[j], i) = 0;
+ }
+}
+
+static int __init set_nonresident_factor(char * str)
+{
+ if (!str)
+ return 0;
+ nonresident_factor = simple_strtoul(str, &str, 0);
+ return 1;
+}
+
+__setup("nonresident_factor=", set_nonresident_factor);
* [RFC][PATCH 2/7] CART Implementation v3
2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 1/7] " a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 3/7] " a.p.zijlstra
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
To: linux-mm
[-- Attachment #1: cart-nonresident-stats.patch --]
[-- Type: text/plain, Size: 3566 bytes --]
Index: linux-2.6-git/fs/proc/proc_misc.c
===================================================================
--- linux-2.6-git.orig/fs/proc/proc_misc.c
+++ linux-2.6-git/fs/proc/proc_misc.c
@@ -233,6 +233,20 @@ static struct file_operations proc_zonei
.release = seq_release,
};
+extern struct seq_operations nonresident_op;
+static int nonresident_open(struct inode *inode, struct file *file)
+{
+ (void)inode;
+ return seq_open(file, &nonresident_op);
+}
+
+static struct file_operations nonresident_file_operations = {
+ .open = nonresident_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
static int version_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
@@ -602,6 +616,7 @@ void __init proc_misc_init(void)
create_seq_entry("interrupts", 0, &proc_interrupts_operations);
create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
+ create_seq_entry("nonresident",S_IRUGO, &nonresident_file_operations);
create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations);
create_seq_entry("diskstats", 0, &proc_diskstats_operations);
Index: linux-2.6-git/mm/nonresident.c
===================================================================
--- linux-2.6-git.orig/mm/nonresident.c
+++ linux-2.6-git/mm/nonresident.c
@@ -370,3 +370,83 @@ static int __init set_nonresident_factor
}
__setup("nonresident_factor=", set_nonresident_factor);
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+
+static void *stats_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos < 0 || *pos >= (1 << nonres_shift))
+ return NULL;
+
+ m->private = (void*)(unsigned long)*pos;
+
+ return pos;
+}
+
+static void *stats_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ if (*pos < (1 << nonres_shift)-1) {
+ (*pos)++;
+ m->private = (void *)((unsigned long)m->private + 1);
+ return pos;
+ }
+ return NULL;
+}
+
+static void stats_stop(struct seq_file *m, void *arg)
+{
+}
+
+static void bucket_stats(struct nr_bucket * nr_bucket, int * b1, int * b2, int * free)
+{
+ unsigned long flags;
+ unsigned int i, b[3] = {0, 0, 0};
+
+ spin_lock_irqsave(&nr_bucket->lock, flags);
+ for (i = 0; i < 3; ++i) {
+ unsigned int j = nr_bucket->hand[i];
+ do
+ {
+ u32 *slot = &nr_bucket->slot[j];
+ if (GET_LISTID(*slot) != i)
+ break;
+ j = GET_INDEX(*slot);
+ ++b[i];
+ } while (j != nr_bucket->hand[i]);
+ }
+ spin_unlock_irqrestore(&nr_bucket->lock, flags);
+
+ *b1=b[0];
+ *b2=b[1];
+ *free=b[2];
+}
+
+static int stats_show(struct seq_file *m, void *arg)
+{
+ unsigned int index = (unsigned long)m->private;
+ struct nr_bucket *nr_bucket = &nonres_table[index];
+ int b1, b2, free;
+
+ bucket_stats(nr_bucket, &b1, &b2, &free);
+ seq_printf(m, "%d\t%d\t%d", b1, b2, free);
+ if (index == 0) {
+ seq_printf(m, "\t%d\t%d\t%d",
+ nonresident_count(NR_b1),
+ nonresident_count(NR_b2),
+ nonresident_count(NR_free));
+ }
+ seq_printf(m,"\n");
+
+ return 0;
+}
+
+struct seq_operations nonresident_op = {
+ .start = stats_start,
+ .next = stats_next,
+ .stop = stats_stop,
+ .show = stats_show,
+};
+
+#endif /* CONFIG_PROC_FS */
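
With this applied, each line of /proc/nonresident corresponds to one hash
bucket and shows the number of slots on its B1, B2 and free clocks; the
first line additionally appends the global (per-CPU summed) counts.
Roughly along these lines, with all numbers made up:

	4	6	52	1021	4382	55210
	0	12	50
	3	9	50
	...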
* [RFC][PATCH 3/7] CART Implementation v3
2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 1/7] " a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 2/7] " a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 4/7] " a.p.zijlstra
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
To: linux-mm
[-- Attachment #1: cart-cart.patch --]
[-- Type: text/plain, Size: 22755 bytes --]
Index: linux-2.6-git/include/linux/mm_inline.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_inline.h
+++ linux-2.6-git/include/linux/mm_inline.h
@@ -1,40 +1,15 @@
-
-static inline void
-add_page_to_active_list(struct zone *zone, struct page *page)
-{
- list_add(&page->lru, &zone->active_list);
- zone->nr_active++;
-}
-
-static inline void
-add_page_to_inactive_list(struct zone *zone, struct page *page)
-{
- list_add(&page->lru, &zone->inactive_list);
- zone->nr_inactive++;
-}
-
-static inline void
-del_page_from_active_list(struct zone *zone, struct page *page)
-{
- list_del(&page->lru);
- zone->nr_active--;
-}
-
-static inline void
-del_page_from_inactive_list(struct zone *zone, struct page *page)
-{
- list_del(&page->lru);
- zone->nr_inactive--;
-}
-
static inline void
del_page_from_lru(struct zone *zone, struct page *page)
{
list_del(&page->lru);
- if (PageActive(page)) {
- ClearPageActive(page);
+ if (TestClearPageActive(page)) {
zone->nr_active--;
} else {
zone->nr_inactive--;
}
+ if (TestClearPageLongTerm(page)) {
+ /* zone->nr_longterm--; */
+ } else {
+ zone->nr_shortterm--;
+ }
}
Index: linux-2.6-git/include/linux/mmzone.h
===================================================================
--- linux-2.6-git.orig/include/linux/mmzone.h
+++ linux-2.6-git/include/linux/mmzone.h
@@ -143,13 +143,18 @@ struct zone {
ZONE_PADDING(_pad1_)
/* Fields commonly accessed by the page reclaim scanner */
- spinlock_t lru_lock;
- struct list_head active_list;
- struct list_head inactive_list;
+ spinlock_t lru_lock;
+ struct list_head active_list; /* The T1 list of CART */
+ struct list_head inactive_list; /* The T2 list of CART */
unsigned long nr_scan_active;
unsigned long nr_scan_inactive;
unsigned long nr_active;
unsigned long nr_inactive;
+ unsigned long nr_evicted_active;
+ unsigned long nr_evicted_inactive;
+ unsigned long nr_shortterm; /* number of short term pages */
+ unsigned long nr_p; /* p from the CART paper */
+ unsigned long nr_q; /* q from the CART paper */
unsigned long pages_scanned; /* since last reclaim */
int all_unreclaimable; /* All pages pinned */
Index: linux-2.6-git/include/linux/page-flags.h
===================================================================
--- linux-2.6-git.orig/include/linux/page-flags.h
+++ linux-2.6-git/include/linux/page-flags.h
@@ -76,6 +76,9 @@
#define PG_nosave_free 18 /* Free, should not be written */
#define PG_uncached 19 /* Page has been mapped as uncached */
+#define PG_longterm 20 /* Filter bit for CART, see mm/cart.c */
+#define PG_new 21 /* New page, skip first reference */
+
/*
* Global page accounting. One instance per CPU. Only unsigned longs are
* allowed.
@@ -305,6 +308,18 @@ extern void __mod_page_state(unsigned lo
#define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags)
#define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags)
+#define PageLongTerm(page) test_bit(PG_longterm, &(page)->flags)
+#define SetPageLongTerm(page) set_bit(PG_longterm, &(page)->flags)
+#define TestSetPageLongTerm(page) test_and_set_bit(PG_longterm, &(page)->flags)
+#define ClearPageLongTerm(page) clear_bit(PG_longterm, &(page)->flags)
+#define TestClearPageLongTerm(page) test_and_clear_bit(PG_longterm, &(page)->flags)
+
+#define PageNew(page) test_bit(PG_new, &(page)->flags)
+#define SetPageNew(page) set_bit(PG_new, &(page)->flags)
+#define TestSetPageNew(page) test_and_set_bit(PG_new, &(page)->flags)
+#define ClearPageNew(page) clear_bit(PG_new, &(page)->flags)
+#define TestClearPageNew(page) test_and_clear_bit(PG_new, &(page)->flags)
+
struct page; /* forward declaration */
int test_clear_page_dirty(struct page *page);
Index: linux-2.6-git/include/linux/swap.h
===================================================================
--- linux-2.6-git.orig/include/linux/swap.h
+++ linux-2.6-git/include/linux/swap.h
@@ -7,6 +7,7 @@
#include <linux/mmzone.h>
#include <linux/list.h>
#include <linux/sched.h>
+#include <linux/mm.h>
#include <asm/atomic.h>
#include <asm/page.h>
@@ -170,6 +171,23 @@ extern unsigned int nonresident_count(in
extern unsigned int nonresident_total(void);
extern void nonresident_init(void);
+/* linux/mm/cart.c */
+extern void cart_init(void);
+extern void __cart_insert(struct zone *, struct page *);
+extern unsigned long cart_replace(struct zone *, struct list_head *, unsigned long, unsigned long *);
+extern void cart_reinsert(struct zone *, struct list_head *);
+extern void __cart_rotate_reclaimable(struct zone *, struct page *);
+extern void __cart_remember(struct zone *, struct page*);
+
+static inline void cart_remember(struct page *page)
+{
+ unsigned long flags;
+ struct zone *zone = page_zone(page);
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ __cart_remember(zone, page);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+}
+
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
extern unsigned long totalhigh_pages;
Index: linux-2.6-git/init/main.c
===================================================================
--- linux-2.6-git.orig/init/main.c
+++ linux-2.6-git/init/main.c
@@ -497,6 +497,7 @@ asmlinkage void __init start_kernel(void
vfs_caches_init_early();
nonresident_init();
mem_init();
+ cart_init();
kmem_cache_init();
setup_per_cpu_pageset();
numa_policy_init();
Index: linux-2.6-git/mm/Makefile
===================================================================
--- linux-2.6-git.orig/mm/Makefile
+++ linux-2.6-git/mm/Makefile
@@ -13,7 +13,7 @@ obj-y := bootmem.o filemap.o mempool.o
prio_tree.o $(mmu-y)
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o \
- nonresident.o
+ nonresident.o cart.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
Index: linux-2.6-git/mm/cart.c
===================================================================
--- /dev/null
+++ linux-2.6-git/mm/cart.c
@@ -0,0 +1,623 @@
+/*
+ * mm/cart.c
+ *
+ * Written by Peter Zijlstra <a.p.zijlstra@chello.nl>
+ * Released under the GPL, see the file COPYING for details.
+ *
+ * This file contains a Page Replacement Algorithm based on CART
+ * Please refer to the CART paper here -
+ * http://www.almaden.ibm.com/cs/people/dmodha/clockfast.pdf
+ *
+ * The algorithm was adapted to work for Linux, which poses the following
+ * extra constraints:
+ * - multiple memory zones,
+ * - async pageout (PG_writeback).
+ *
+ * The multiple memory zones are handled by decoupling the T lists from the
+ * B lists, keeping T lists per zone while having global B lists. See
+ * mm/nonresident.c for the B list implementation. List sizes are scaled on
+ * comparison.
+ *
+ * Asynchronous pageout poses the problem of pages that are selected
+ * for eviction but cannot (yet) be freed. Since they are not evicted
+ * they can be referenced again, so we have to keep them on the resident
+ * lists. This is handled in the reinsert and rotate_reclaimable functions.
+ *
+ * Also, for efficiency's sake the replace operation is batched, to
+ * avoid holding the heavily contended zone->lru_lock while calling the
+ * rather slow page_referenced().
+ *
+ * Doubts:
+ * - can a page be on both a T and a B list? The paper does imply this.
+ *
+ * Beyond CART:
+ * - PG_new
+ * - 'S' pages from B1.
+ *
+ * Todo:
+ * - page accounting
+ * - finetune
+ *
+ * All functions that are prefixed with '__' assume that zone->lru_lock is taken.
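+ *
+ * (A standalone sketch of the p adaptation rules is appended after this patch.)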
+ */
+
+#include <linux/swap.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/mm_inline.h>
+#include <linux/rmap.h>
+#include <linux/buffer_head.h>
+#include <linux/pagevec.h>
+
+#define cart_cT ((zone)->nr_active + (zone)->nr_inactive + (zone)->free_pages)
+#define cart_cB ((zone)->present_pages)
+
+#define T2B(x) (((x) * cart_cB) / cart_cT)
+#define B2T(x) (((x) * cart_cT) / cart_cB)
+
+#define size_T1 ((zone)->nr_active)
+#define size_T2 ((zone)->nr_inactive)
+
+#define list_T1 (&(zone)->active_list)
+#define list_T2 (&(zone)->inactive_list)
+
+#define cart_p ((zone)->nr_p)
+#define cart_q ((zone)->nr_q)
+
+#define size_B1 ((zone)->nr_evicted_active)
+#define size_B2 ((zone)->nr_evicted_inactive)
+/* #define size_B2 (cart_cB - size_B1) */
+
+#define nr_Ns ((zone)->nr_shortterm)
+#define nr_Nl (size_T1 + size_T2 - nr_Ns)
+
+/* Called from init/main.c to initialize the cart parameters */
+void __init cart_init(void)
+{
+ struct zone *zone;
+ for_each_zone(zone) {
+ zone->nr_evicted_active = 0;
+ zone->nr_evicted_inactive = 0;
+ zone->nr_shortterm = 0;
+ zone->nr_p = 0;
+ zone->nr_q = 0;
+ }
+}
+
+static inline void __cart_q_inc(struct zone *zone, unsigned long dq)
+{
+ /* if (|T2| + |B2| + |T1| - ns >= c) q = min(q + 1, 2c - |T1|) */
+ /* |B2| + nl >= c */
+ if (B2T(size_B2) + nr_Nl >= cart_cT)
+ cart_q = min(cart_q + dq, 2*cart_cB - T2B(size_T1));
+}
+
+static inline void __cart_q_dec(struct zone *zone, unsigned long dq)
+{
+ /* q = max(q - 1, c - |T1|) */
+ unsigned long target = cart_cB - T2B(size_T1);
+ if (cart_q <= target)
+ cart_q = target;
+ else if (cart_q >= dq)
+ cart_q -= dq;
+ else
+ cart_q = 0;
+}
+
+/*
+ * Insert page into @zones CART and update adaptive parameters.
+ *
+ * @zone: target zone.
+ * @page: new page.
+ */
+void __cart_insert(struct zone *zone, struct page *page)
+{
+ unsigned int rflags;
+ unsigned long ratio;
+
+ /*
+ * Note: we could give hints to the insertion process using the LRU
+ * specific PG_flags like: PG_active, PG_longterm and PG_referenced.
+ */
+
+ rflags = nonresident_find(page_mapping(page), page_index(page));
+
+ if (rflags & NR_found) {
+ rflags &= NR_listid;
+ if (rflags == NR_b1) {
+ if (likely(size_B1)) --size_B1;
+
+ /* p = min(p + max(1, ns/|B1|), c) */
+ ratio = (nr_Ns / (B2T(size_B1) + 1)) ?: 1UL;
+ cart_p += ratio;
+ if (unlikely(cart_p > cart_cT))
+ cart_p = cart_cT;
+
+ SetPageLongTerm(page);
+ } else if (rflags == NR_b2) {
+ if (likely(size_B2)) --size_B2;
+
+ /* p = max(p - max(1, nl/|B2|), 0) */
+ ratio = (nr_Nl / (B2T(size_B2) + 1)) ?: 1UL;
+ if (cart_p >= ratio)
+ cart_p -= ratio;
+ else
+ cart_p = 0UL;
+
+ SetPageLongTerm(page);
+ __cart_q_inc(zone, 1);
+ }
+ /* ++nr_Nl; */
+ } else {
+ ClearPageLongTerm(page);
+ ++nr_Ns;
+ }
+
+ SetPageNew(page);
+ ClearPageReferenced(page);
+ SetPageActive(page);
+ list_add_tail(&page->lru, list_T1);
+ ++size_T1;
+ BUG_ON(!PageLRU(page));
+}
+
+#define head_to_page(list) list_entry((list)->next, struct page, lru)
+#define tail_to_page(list) list_entry((list)->prev, struct page, lru)
+
+#ifdef ARCH_HAS_PREFETCH
+#define prefetch_next_lru_page(_page, _base, _field) \
+ do { \
+ if ((_page)->lru.next != _base) { \
+ struct page *next; \
+ \
+ next = head_to_page(&(_page->lru)); \
+ prefetch(&next->_field); \
+ } \
+ } while (0)
+
+#define prefetch_prev_lru_page(_page, _base, _field) \
+ do { \
+ if ((_page)->lru.prev != _base) { \
+ struct page *prev; \
+ \
+ prev = tail_to_page(&(_page->lru)); \
+ prefetch(&prev->_field); \
+ } \
+ } while (0)
+#else
+#define prefetch_next_lru_page(_page, _base, _field) do { } while (0)
+#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+#ifdef ARCH_HAS_PREFETCHW
+#define prefetchw_next_lru_page(_page, _base, _field) \
+ do { \
+ if ((_page)->lru.next != _base) { \
+ struct page *next; \
+ \
+ next = head_to_page(&(_page->lru)); \
+ prefetchw(&next->_field); \
+ } \
+ } while (0)
+
+#define prefetchw_prev_lru_page(_page, _base, _field) \
+ do { \
+ if ((_page)->lru.prev != _base) { \
+ struct page *prev; \
+ \
+ prev = tail_to_page(&(_page->lru)); \
+ prefetchw(&prev->_field); \
+ } \
+ } while (0)
+#else
+#define prefetchw_next_lru_page(_page, _base, _field) do { } while (0)
+#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+/*
+ * Try to grab a specified number of pages off a list head.
+ * Ups the refcount on the grabbed pages.
+ *
+ * @dst: temp list to hold pages.
+ * @src: list to take pages from.
+ * @nr_dst: target number of pages to grab.
+ * @nr_scanned: counter that keeps the number of pages touched.
+ *
+ * returns how many pages were moved to @dst.
+ */
+static unsigned long cart_isolate(struct zone *zone, struct list_head *dst, struct list_head *src,
+ unsigned long nr_dst, unsigned long *nr_scanned)
+{
+ struct page *page;
+ unsigned long nr_taken = 0, scan = 0;
+
+ spin_lock_irq(&zone->lru_lock);
+ while (scan++ < nr_dst && !list_empty(src)) {
+ page = head_to_page(src);
+ if (!get_page_testone(page)) {
+ list_move_tail(&page->lru, dst);
+ ++nr_taken;
+ } else {
+ list_move_tail(&page->lru, src);
+ __put_page(page);
+ }
+ }
+ spin_unlock_irq(&zone->lru_lock);
+
+ *nr_scanned += scan;
+ return nr_taken;
+}
+
+/*
+ * Add page to a release pagevec, temporarily dropping the zone lock to release the pagevec when it is full.
+ *
+ * @zone: @pages zone
+ * @page: page to be released
+ * @pvec: pagevec to collect pages in
+ */
+static void __cart_page_release(struct zone *zone, struct page *page, struct pagevec *pvec)
+{
+ if (!pagevec_add(pvec, page)) {
+ spin_unlock_irq(&zone->lru_lock);
+ if (buffer_heads_over_limit)
+ pagevec_strip(pvec);
+ __pagevec_release(pvec);
+ spin_lock_irq(&zone->lru_lock);
+ }
+}
+
+/*
+ * Append @list to the tail of @head and release the pages.
+ *
+ * @zone: zone of pages.
+ * @list: temp list of pages (1 ref).
+ * @head: zone pagelist.
+ * @pvec: pagevec for release.
+ */
+static void __cart_list_splice_tail_release(struct zone *zone, struct list_head *list,
+ struct list_head *head, struct pagevec *pvec)
+{
+ while (!list_empty(list)) { /* head = { head, list } */
+ struct page *page = head_to_page(list);
+ list_move_tail(&page->lru, head);
+ __cart_page_release(zone, page, pvec);
+ }
+}
+
+static void __cart_list_splice_release(struct zone *zone, struct list_head *list,
+ struct list_head *head, struct pagevec *pvec)
+{
+ while (!list_empty(list)) { /* head = { list, head } */
+ struct page *page = tail_to_page(list);
+ list_move(&page->lru, head);
+ __cart_page_release(zone, page, pvec);
+ }
+}
+
+/*
+ * Rebalance the T2 list:
+ * move referenced pages to T1's tail,
+ * take unreffed pages for reclaim.
+ *
+ * @zone: target zone
+ * @l_t2_head: temp list of reclaimable pages (1 ref)
+ * @nr_dst: quantum of pages
+ * @nr_scanned: counter that keeps the number of pages touched.
+ * @pvec: pagevec used to release pages
+ *
+ * returns whether there are pages on @l_t2_head
+ */
+static unsigned long cart_rebalance_T2(struct zone *zone, struct list_head *l_t2_head,
+ unsigned long nr_dst, unsigned long *nr_scanned,
+ struct pagevec *pvec)
+{
+ struct page *page;
+ LIST_HEAD(l_hold);
+ LIST_HEAD(l_t1);
+ unsigned long nr, dq;
+
+ do {
+ nr = cart_isolate(zone, &l_hold, list_T2, nr_dst, nr_scanned);
+ if (!nr)
+ break;
+
+ dq = 0;
+ while (!list_empty(&l_hold)) {
+ page = head_to_page(&l_hold);
+ prefetchw_next_lru_page(page, &l_hold, flags);
+
+ if (page_referenced(page, 0, 0)) {
+ list_move_tail(&page->lru, &l_t1);
+ SetPageActive(page);
+ ++dq;
+ } else {
+ list_move_tail(&page->lru, l_t2_head);
+ }
+ }
+
+ spin_lock_irq(&zone->lru_lock);
+ __cart_list_splice_tail_release(zone, &l_t1, list_T1, pvec);
+ size_T2 -= dq;
+ size_T1 += dq;
+ __cart_q_inc(zone, dq);
+ spin_unlock_irq(&zone->lru_lock);
+ } while (list_empty(l_t2_head));
+
+ return !list_empty(l_t2_head);
+}
+
+/*
+ * Rebalance the T1 list:
+ * move referenced pages to the tail and possibly mark them 'L',
+ * move unreffed 'L' pages to the tail of T2,
+ * take unreffed 'S' pages for reclaim.
+ *
+ * @zone: target zone
+ * @l_t1_head: temp list of reclaimable pages (1 ref)
+ * @nr_dst: quantum of pages
+ * @nr_scanned: counter that keeps the number of pages touched.
+ *
+ * returns whether there are pages on @l_t1_head
+ */
+static unsigned long cart_rebalance_T1(struct zone *zone, struct list_head *l_t1_head,
+ unsigned long nr_dst, unsigned long *nr_scanned,
+ struct pagevec *pvec)
+{
+ struct page *page;
+ LIST_HEAD(l_hold);
+ LIST_HEAD(l_t1);
+ LIST_HEAD(l_t2);
+ unsigned long nr, dq, dsl;
+ int new, referenced;
+
+ do {
+ nr = cart_isolate(zone, &l_hold, list_T1, nr_dst, nr_scanned);
+ if (!nr)
+ break;
+
+ dq = 0;
+ dsl = 0;
+ while (!list_empty(&l_hold)) {
+ page = head_to_page(&l_hold);
+ prefetchw_next_lru_page(page, &l_hold, flags);
+
+ referenced = page_referenced(page, 0, 0);
+ new = TestClearPageNew(page);
+
+ if (referenced) {
+ list_move_tail(&page->lru, &l_t1);
+ // XXX: we race a bit here; do we mind and put it under lru_lock?
+ /* ( |T1| >= min(p + 1, |B1|) ) and ( filter = 'S' ) */
+ if ((size_T1 - dq) >= min(cart_p + 1, B2T(size_B1)) &&
+ !PageLongTerm(page) && !new) {
+ SetPageLongTerm(page);
+ ++dsl;
+ }
+ } else if (PageLongTerm(page)) {
+ list_move_tail(&page->lru, &l_t2);
+ ClearPageActive(page);
+ ++dq;
+ } else {
+ list_move_tail(&page->lru, l_t1_head);
+ }
+ }
+
+ spin_lock_irq(&zone->lru_lock);
+ __cart_list_splice_tail_release(zone, &l_t2, list_T2, pvec);
+ __cart_list_splice_tail_release(zone, &l_t1, list_T1, pvec);
+ nr_Ns -= dsl;
+ /* nr_Nl += dsl; */
+ size_T1 -= dq;
+ size_T2 += dq;
+ __cart_q_dec(zone, dq);
+ spin_unlock_irq(&zone->lru_lock);
+ } while (list_empty(l_t1_head));
+
+ return !list_empty(l_t1_head);
+}
+
+/*
+ * Try to reclaim a specified number of pages.
+ *
+ * Clears PG_lru of reclaimed pages, but does take 1 reference.
+ *
+ * @zone: target zone to reclaim pages from.
+ * @dst: temp list of reclaimable pages (1 ref).
+ * @nr_dst: target number of pages to reclaim.
+ * @nr_scanned: counter that keeps the number of pages touched.
+ *
+ * returns number of pages on @dst.
+ */
+unsigned long cart_replace(struct zone *zone, struct list_head *dst,
+ unsigned long nr_dst, unsigned long *nr_scanned)
+{
+ struct page *page;
+ LIST_HEAD(l_t1);
+ LIST_HEAD(l_t2);
+ struct pagevec pvec;
+ unsigned long nr;
+ unsigned int nonres_total;
+
+ pagevec_init(&pvec, 1);
+
+ /*
+ * Rebalance nonresident lists to zone size
+ */
+ nonres_total = nonresident_total();
+
+ spin_lock_irq(&zone->lru_lock);
+ size_B1 = (nonresident_count(NR_b1) * cart_cB) / nonres_total;
+ size_B2 = (nonresident_count(NR_b2) * cart_cB) / nonres_total;
+ spin_unlock_irq(&zone->lru_lock);
+
+ for (nr = 0; nr < nr_dst;) {
+ if (list_empty(&l_t2))
+ cart_rebalance_T2(zone, &l_t2, nr_dst/2, nr_scanned, &pvec);
+ if (list_empty(&l_t1))
+ cart_rebalance_T1(zone, &l_t1, nr_dst/2, nr_scanned, &pvec);
+
+ if (list_empty(&l_t1) && list_empty(&l_t2))
+ break;
+
+ spin_lock_irq(&zone->lru_lock);
+ while (!list_empty(&l_t1) &&
+ (size_T1 > cart_p || !size_T2) && nr < nr_dst) {
+ page = head_to_page(&l_t1);
+ prefetchw_next_lru_page(page, &l_t1, flags);
+
+ if (!TestClearPageLRU(page))
+ BUG();
+ if (PageLongTerm(page))
+ BUG();
+ --nr_Ns;
+ --size_T1;
+ ++nr;
+ list_move(&page->lru, dst);
+ }
+ while (!list_empty(&l_t2) &&
+ size_T1 <= cart_p && nr < nr_dst) {
+ page = head_to_page(&l_t2);
+ prefetchw_next_lru_page(page, &l_t2, flags);
+
+ if (!TestClearPageLRU(page))
+ BUG();
+ if (!PageLongTerm(page))
+ BUG();
+ /* --nr_Nl; */
+ --size_T2;
+ ++nr;
+ list_move(&page->lru, dst);
+ }
+ spin_unlock_irq(&zone->lru_lock);
+
+ cond_resched();
+ }
+
+ spin_lock_irq(&zone->lru_lock);
+ __cart_list_splice_release(zone, &l_t1, list_T1, &pvec);
+ __cart_list_splice_release(zone, &l_t2, list_T2, &pvec);
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+
+ return nr;
+}
+
+/*
+ * Re-insert pages that were elected for replacement but could not be freed
+ * because writeback has started against them. Rotate as referenced to let
+ * the reclaim path make progress.
+ *
+ * Assumes PG_lru is clear and will set for re-inserted pages.
+ * Assumes the pages have 1 reference taken and will return it.
+ *
+ * @zone: target zone for re-insertion.
+ * @list: list of left over pages.
+ */
+void cart_reinsert(struct zone *zone, struct list_head *list)
+{
+ struct pagevec pvec;
+
+ pagevec_init(&pvec, 1);
+ spin_lock_irq(&zone->lru_lock);
+ while (!list_empty(list))
+ {
+ struct page *page = tail_to_page(list);
+ prefetchw_prev_lru_page(page, list, flags);
+
+ if (TestSetPageLRU(page))
+ BUG();
+ if (!PageActive(page)) { /* T2 */
+ if (!PageLongTerm(page))
+ BUG();
+
+ SetPageActive(page);
+ list_move_tail(&page->lru, list_T1);
+ ++size_T1;
+
+ __cart_q_inc(zone, 1);
+ } else { /* T1 */
+ if (!PageLongTerm(page))
+ ++nr_Ns;
+ /* else ++nr_Nl; */
+
+ list_move_tail(&page->lru, list_T1);
+ ++size_T1;
+
+ /* ( |T1| >= min(p + 1, |B1| ) and ( filter = 'S' ) */
+ /*
+ * Don't modify page state; it wasn't actually referenced
+ */
+ }
+ __cart_page_release(zone, page, &pvec);
+ }
+
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+}
+
+/*
+ * Make page available for direct reclaim.
+ *
+ * @zone: page's zone
+ * @page: page
+ */
+void __cart_rotate_reclaimable(struct zone *zone, struct page *page)
+{
+ if (PageLongTerm(page)) {
+ if (TestClearPageActive(page)) {
+ --size_T1;
+ ++size_T2;
+ __cart_q_dec(zone, 1);
+ }
+ list_move(&page->lru, list_T2);
+ } else {
+ if (!PageActive(page))
+ BUG();
+ list_move(&page->lru, list_T1);
+ }
+}
+
+/*
+ * Puts pages on the non-resident lists on swap-out
+ * XXX: lose the reliance on zone->lru_lock !!!
+ *
+ * @zone: dead pages zone.
+ * @page: dead page.
+ */
+void __cart_remember(struct zone *zone, struct page *page)
+{
+ int listid;
+ int listid_evict;
+
+ if (PageActive(page)) {
+ listid = NR_b1;
+ ++size_B1;
+ } else {
+ listid = NR_b2;
+ ++size_B2;
+ }
+
+ /*
+ * Assume |B1| + |B2| == c + 1, since |B1_j| + |B2_j| := c_j.
+ * The list_empty check is done on the Bn_j side.
+ */
+ /* |B1| > max(0, q) */
+ if (size_B1 > cart_q)
+ listid_evict = NR_b1;
+ else
+ listid_evict = NR_b2;
+
+ listid_evict = nonresident_put(page_mapping(page),
+ page_index(page),
+ listid, listid_evict);
+
+ switch (listid_evict) {
+ case NR_b1:
+ if (likely(size_B1)) --size_B1;
+ break;
+
+ case NR_b2:
+ if (likely(size_B2)) --size_B2;
+ break;
+ }
+}
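
To make the adaptation rules quoted above in __cart_insert() a little more
concrete ("p = min(p + max(1, ns/|B1|), c)" on a B1 hit, "p = max(p -
max(1, nl/|B2|), 0)" on a B2 hit), here is a minimal userspace sketch of
just those two updates. The struct and the numbers are illustrative only;
the B-list scaling (B2T()) and the q bookkeeping of the real code are
deliberately left out, and the GCC ?: idiom for max(1, x) is kept as in
the patch.

#include <stdio.h>

/* Illustrative CART state; in the patch these live in struct zone. */
struct cart_state {
	unsigned long p;	/* target size of T1 (cart_p) */
	unsigned long c;	/* total cache size (cart_cT) */
	unsigned long ns;	/* short-term resident pages (nr_Ns) */
	unsigned long nl;	/* long-term resident pages (nr_Nl) */
};

/* Hit in B1: grow p by max(1, ns/|B1|), clamped to c. */
static void cart_hit_b1(struct cart_state *s, unsigned long b1)
{
	unsigned long ratio = (s->ns / (b1 + 1)) ?: 1UL;	/* GCC ?: extension */
	s->p += ratio;
	if (s->p > s->c)
		s->p = s->c;
}

/* Hit in B2: shrink p by max(1, nl/|B2|), clamped to 0. */
static void cart_hit_b2(struct cart_state *s, unsigned long b2)
{
	unsigned long ratio = (s->nl / (b2 + 1)) ?: 1UL;
	if (s->p >= ratio)
		s->p -= ratio;
	else
		s->p = 0UL;
}

int main(void)
{
	struct cart_state s = { .p = 100, .c = 1000, .ns = 400, .nl = 600 };

	cart_hit_b1(&s, 50);	/* ratio = max(1, 400/51) = 7 */
	printf("after B1 hit: p = %lu\n", s.p);	/* 107 */

	cart_hit_b2(&s, 200);	/* ratio = max(1, 600/201) = 2 */
	printf("after B2 hit: p = %lu\n", s.p);	/* 105 */
	return 0;
}

Roughly, a hit in B1 argues for a larger T1 target, a hit in B2 for a
smaller one.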
* [RFC][PATCH 4/7] CART Implementation v3
2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
` (2 preceding siblings ...)
2005-09-11 20:25 ` [RFC][PATCH 3/7] " a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 5/7] " a.p.zijlstra
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
To: linux-mm
[-- Attachment #1: cart-cart-r.patch --]
[-- Type: text/plain, Size: 4094 bytes --]
Index: linux-2.6-git/mm/cart.c
===================================================================
--- linux-2.6-git.orig/mm/cart.c
+++ linux-2.6-git/mm/cart.c
@@ -63,6 +63,7 @@
#define cart_p ((zone)->nr_p)
#define cart_q ((zone)->nr_q)
+#define cart_r ((zone)->nr_r)
#define size_B1 ((zone)->nr_evicted_active)
#define size_B2 ((zone)->nr_evicted_inactive)
@@ -81,6 +82,7 @@ void __init cart_init(void)
zone->nr_shortterm = 0;
zone->nr_p = 0;
zone->nr_q = 0;
+ zone->nr_r = 0;
}
}
@@ -123,6 +125,12 @@ void __cart_insert(struct zone *zone, st
rflags = nonresident_find(page_mapping(page), page_index(page));
if (rflags & NR_found) {
+ ratio = (nr_Nl / (nr_Ns + 1)) ?: 1;
+ if (cart_r > ratio)
+ cart_r -= ratio;
+ else
+ cart_r = 0UL;
+
rflags &= NR_listid;
if (rflags == NR_b1) {
if (likely(size_B1)) --size_B1;
@@ -149,6 +157,11 @@ void __cart_insert(struct zone *zone, st
}
/* ++nr_Nl; */
} else {
+ ratio = (nr_Ns / (nr_Nl + 1)) ?: 1;
+ cart_r += ratio;
+ if (cart_r > cart_cT)
+ cart_r = cart_cT;
+
ClearPageLongTerm(page);
++nr_Ns;
}
@@ -360,6 +373,7 @@ static unsigned long cart_rebalance_T2(s
* returns whether there are pages on @l_t1_head
*/
static unsigned long cart_rebalance_T1(struct zone *zone, struct list_head *l_t1_head,
+ struct list_head *l_new,
unsigned long nr_dst, unsigned long *nr_scanned,
struct pagevec *pvec)
{
@@ -384,7 +398,11 @@ static unsigned long cart_rebalance_T1(s
referenced = page_referenced(page, 0, 0);
new = TestClearPageNew(page);
- if (referenced) {
+ if (cart_r < nr_Nl && PageLongTerm(page) && new) {
+ list_move_tail(&page->lru, l_new);
+ ClearPageActive(page);
+ ++dq;
+ } else if (referenced) {
list_move_tail(&page->lru, &l_t1);
// XXX: we race a bit here; do we mind and put it under lru_lock?
/* ( |T1| >= min(p + 1, |B1|) ) and ( filter = 'S' ) */
@@ -432,6 +450,7 @@ unsigned long cart_replace(struct zone *
unsigned long nr_dst, unsigned long *nr_scanned)
{
struct page *page;
+ LIST_HEAD(l_new);
LIST_HEAD(l_t1);
LIST_HEAD(l_t2);
struct pagevec pvec;
@@ -454,12 +473,24 @@ unsigned long cart_replace(struct zone *
if (list_empty(&l_t2))
cart_rebalance_T2(zone, &l_t2, nr_dst/2, nr_scanned, &pvec);
if (list_empty(&l_t1))
- cart_rebalance_T1(zone, &l_t1, nr_dst/2, nr_scanned, &pvec);
+ cart_rebalance_T1(zone, &l_t1, &l_new, nr_dst/2, nr_scanned, &pvec);
if (list_empty(&l_t1) && list_empty(&l_t2))
break;
spin_lock_irq(&zone->lru_lock);
+ while (!list_empty(&l_new) && nr < nr_dst) {
+ page = head_to_page(&l_new);
+ prefetchw_next_lru_page(page, &l_new, flags);
+
+ if (!TestClearPageLRU(page))
+ BUG();
+ if (!PageLongTerm(page))
+ BUG();
+ --size_T2;
+ ++nr;
+ list_move(&page->lru, dst);
+ }
while (!list_empty(&l_t1) &&
(size_T1 > cart_p || !size_T2) && nr < nr_dst) {
page = head_to_page(&l_t1);
@@ -496,6 +527,7 @@ unsigned long cart_replace(struct zone *
spin_lock_irq(&zone->lru_lock);
__cart_list_splice_release(zone, &l_t1, list_T1, &pvec);
__cart_list_splice_release(zone, &l_t2, list_T2, &pvec);
+ __cart_list_splice_release(zone, &l_new, list_T2, &pvec);
spin_unlock_irq(&zone->lru_lock);
pagevec_release(&pvec);
Index: linux-2.6-git/include/linux/mmzone.h
===================================================================
--- linux-2.6-git.orig/include/linux/mmzone.h
+++ linux-2.6-git/include/linux/mmzone.h
@@ -155,6 +155,7 @@ struct zone {
unsigned long nr_shortterm; /* number of short term pages */
unsigned long nr_p; /* p from the CART paper */
unsigned long nr_q; /* q from the CART paper */
+ unsigned long nr_r;
unsigned long pages_scanned; /* since last reclaim */
int all_unreclaimable; /* All pages pinned */
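
Reading only from the diff above: nr_r is a new per-zone counter that
tracks the short-term/long-term balance. A hit in the non-resident history
decreases it by max(1, nl/(ns+1)); a completely new page increases it by
max(1, ns/(nl+1)), clamped to cT. cart_rebalance_T1() then treats r < nl
as a hint to route pages that are both new and long-term onto a separate
l_new list, which cart_replace() drains ahead of the T1/T2 loops; at
least, that is how the diff reads.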
* [RFC][PATCH 5/7] CART Implementation v3
2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
` (3 preceding siblings ...)
2005-09-11 20:25 ` [RFC][PATCH 4/7] " a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 6/7] " a.p.zijlstra
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
To: linux-mm
[-- Attachment #1: cart-cart-stats.patch --]
[-- Type: text/plain, Size: 4590 bytes --]
Index: linux-2.6-git/fs/proc/proc_misc.c
===================================================================
--- linux-2.6-git.orig/fs/proc/proc_misc.c
+++ linux-2.6-git/fs/proc/proc_misc.c
@@ -233,6 +233,20 @@ static struct file_operations proc_zonei
.release = seq_release,
};
+extern struct seq_operations cart_op;
+static int cart_open(struct inode *inode, struct file *file)
+{
+ (void)inode;
+ return seq_open(file, &cart_op);
+}
+
+static struct file_operations cart_file_operations = {
+ .open = cart_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
extern struct seq_operations nonresident_op;
static int nonresident_open(struct inode *inode, struct file *file)
{
@@ -616,6 +630,7 @@ void __init proc_misc_init(void)
create_seq_entry("interrupts", 0, &proc_interrupts_operations);
create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
+ create_seq_entry("cart",S_IRUGO, &cart_file_operations);
create_seq_entry("nonresident",S_IRUGO, &nonresident_file_operations);
create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations);
Index: linux-2.6-git/mm/cart.c
===================================================================
--- linux-2.6-git.orig/mm/cart.c
+++ linux-2.6-git/mm/cart.c
@@ -653,3 +653,96 @@ void __cart_remember(struct zone *zone,
break;
}
}
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+
+static void *stats_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos != 0)
+ return NULL;
+
+ lru_add_drain();
+
+ return pos;
+}
+
+static void *stats_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ return NULL;
+}
+
+static void stats_stop(struct seq_file *m, void *arg)
+{
+}
+
+static int stats_show(struct seq_file *m, void *arg)
+{
+ struct zone *zone;
+ for_each_zone(zone) {
+ spin_lock_irq(&zone->lru_lock);
+ seq_printf(m, "\n\n======> zone: %lu <=====\n", (unsigned long)zone);
+ seq_printf(m, "struct zone values:\n");
+ seq_printf(m, " zone->nr_active: %lu\n", zone->nr_active);
+ seq_printf(m, " zone->nr_inactive: %lu\n", zone->nr_inactive);
+ seq_printf(m, " zone->nr_evicted_active: %lu\n", zone->nr_evicted_active);
+ seq_printf(m, " zone->nr_evicted_inactive: %lu\n", zone->nr_evicted_inactive);
+ seq_printf(m, " zone->nr_shortterm: %lu\n", zone->nr_shortterm);
+ seq_printf(m, " zone->cart_p: %lu\n", zone->nr_p);
+ seq_printf(m, " zone->cart_q: %lu\n", zone->nr_q);
+ seq_printf(m, " zone->cart_r: %lu\n", zone->nr_r);
+ seq_printf(m, " zone->present_pages: %lu\n", zone->present_pages);
+ seq_printf(m, " zone->free_pages: %lu\n", zone->free_pages);
+ seq_printf(m, " zone->pages_min: %lu\n", zone->pages_min);
+ seq_printf(m, " zone->pages_low: %lu\n", zone->pages_low);
+ seq_printf(m, " zone->pages_high: %lu\n", zone->pages_high);
+
+ seq_printf(m, "\n");
+ seq_printf(m, "implicit values:\n");
+ seq_printf(m, " zone->nr_longterm: %lu\n", nr_Nl);
+ seq_printf(m, " zone->cart_cT: %lu\n", cart_cT);
+ seq_printf(m, " zone->cart_cB: %lu\n", cart_cB);
+
+ seq_printf(m, "\n");
+ seq_printf(m, "counted values:\n");
+
+ {
+ struct page *page;
+ unsigned long active = 0, s1 = 0, l1 = 0;
+ unsigned long inactive = 0, s2 = 0, l2 = 0;
+ unsigned long a1=0,i1=0,a2=0,i2=0;
+ list_for_each_entry(page, &zone->active_list, lru) {
+ ++active;
+ if (PageLongTerm(page)) ++l1;
+ else ++s1;
+ if (PageActive(page)) ++a1;
+ else ++i1;
+ }
+ list_for_each_entry(page, &zone->inactive_list, lru) {
+ ++inactive;
+ if (PageLongTerm(page)) ++l2;
+ else ++s2;
+ if (PageActive(page)) ++a2;
+ else ++i2;
+ }
+ seq_printf(m, " zone->nr_active: %lu (%lu, %lu)(%lu, %lu)\n", active, s1, l1, a1, i1);
+ seq_printf(m, " zone->nr_inactive: %lu (%lu, %lu)(%lu, %lu)\n", inactive, s2, l2, a2, i2);
+ seq_printf(m, " zone->nr_shortterm: %lu\n", s1+s2);
+ seq_printf(m, " zone->nr_longterm: %lu\n", l1+l2);
+ }
+
+ spin_unlock_irq(&zone->lru_lock);
+ }
+
+ return 0;
+}
+
+struct seq_operations cart_op = {
+ .start = stats_start,
+ .next = stats_next,
+ .stop = stats_stop,
+ .show = stats_show,
+};
+
+#endif /* CONFIG_PROC_FS */
* [RFC][PATCH 6/7] CART Implementation v3
2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
` (4 preceding siblings ...)
2005-09-11 20:25 ` [RFC][PATCH 5/7] " a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 7/7] " a.p.zijlstra
2005-09-11 23:05 ` [RFC][PATCH 0/7] " Peter Zijlstra
7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
To: linux-mm
[-- Attachment #1: cart-use-once.patch --]
[-- Type: text/plain, Size: 4902 bytes --]
Index: linux-2.6-git/mm/filemap.c
===================================================================
--- linux-2.6-git.orig/mm/filemap.c
+++ linux-2.6-git/mm/filemap.c
@@ -723,7 +723,6 @@ void do_generic_mapping_read(struct addr
unsigned long offset;
unsigned long last_index;
unsigned long next_index;
- unsigned long prev_index;
loff_t isize;
struct page *cached_page;
int error;
@@ -732,7 +731,6 @@ void do_generic_mapping_read(struct addr
cached_page = NULL;
index = *ppos >> PAGE_CACHE_SHIFT;
next_index = index;
- prev_index = ra.prev_page;
last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
@@ -779,13 +777,7 @@ page_ok:
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- /*
- * When (part of) the same page is read multiple times
- * in succession, only mark it as accessed the first time.
- */
- if (prev_index != index)
- mark_page_accessed(page);
- prev_index = index;
+ mark_page_accessed(page);
/*
* Ok, we have the page, and it's up-to-date, so
Index: linux-2.6-git/mm/shmem.c
===================================================================
--- linux-2.6-git.orig/mm/shmem.c
+++ linux-2.6-git/mm/shmem.c
@@ -1500,11 +1500,8 @@ static void do_shmem_file_read(struct fi
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- /*
- * Mark the page accessed if we read the beginning.
- */
- if (!offset)
- mark_page_accessed(page);
+
+ mark_page_accessed(page);
} else
page = ZERO_PAGE(0);
Index: linux-2.6-git/mm/swap.c
===================================================================
--- linux-2.6-git.orig/mm/swap.c
+++ linux-2.6-git/mm/swap.c
@@ -97,37 +97,12 @@ int rotate_reclaimable_page(struct page
}
/*
- * FIXME: speed this up?
- */
-void fastcall activate_page(struct page *page)
-{
- struct zone *zone = page_zone(page);
-
- spin_lock_irq(&zone->lru_lock);
- if (PageLRU(page) && !PageActive(page)) {
- del_page_from_inactive_list(zone, page);
- SetPageActive(page);
- add_page_to_active_list(zone, page);
- inc_page_state(pgactivate);
- }
- spin_unlock_irq(&zone->lru_lock);
-}
-
-/*
* Mark a page as having seen activity.
- *
- * inactive,unreferenced -> inactive,referenced
- * inactive,referenced -> active,unreferenced
- * active,unreferenced -> active,referenced
*/
void fastcall mark_page_accessed(struct page *page)
{
- if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
- activate_page(page);
- ClearPageReferenced(page);
- } else if (!PageReferenced(page)) {
+ if (!PageReferenced(page))
SetPageReferenced(page);
- }
}
EXPORT_SYMBOL(mark_page_accessed);
Index: linux-2.6-git/mm/swapfile.c
===================================================================
--- linux-2.6-git.orig/mm/swapfile.c
+++ linux-2.6-git/mm/swapfile.c
@@ -408,7 +408,7 @@ static void unuse_pte(struct vm_area_str
* Move the page to the active list so it is not
* immediately swapped out again after swapon.
*/
- activate_page(page);
+ SetPageReferenced(page);
}
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -508,7 +508,7 @@ static int unuse_mm(struct mm_struct *mm
* Activate page so shrink_cache is unlikely to unmap its
* ptes while lock is dropped, so swapoff can make progress.
*/
- activate_page(page);
+ SetPageReferenced(page);
unlock_page(page);
down_read(&mm->mmap_sem);
lock_page(page);
Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c
+++ linux-2.6-git/mm/vmscan.c
@@ -235,27 +235,6 @@ static int shrink_slab(unsigned long sca
return ret;
}
-/* Called without lock on whether page is mapped, so answer is unstable */
-static inline int page_mapping_inuse(struct page *page)
-{
- struct address_space *mapping;
-
- /* Page is in somebody's page tables. */
- if (page_mapped(page))
- return 1;
-
- /* Be more reluctant to reclaim swapcache than pagecache */
- if (PageSwapCache(page))
- return 1;
-
- mapping = page_mapping(page);
- if (!mapping)
- return 0;
-
- /* File is mmap'd by somebody? */
- return mapping_mapped(mapping);
-}
-
static inline int is_page_cache_freeable(struct page *page)
{
return page_count(page) - !!PagePrivate(page) == 2;
@@ -408,8 +387,8 @@ static int shrink_list(struct list_head
goto keep_locked;
referenced = page_referenced(page, 1, sc->priority <= 0);
- /* In active use or really unfreeable? Activate it. */
- if (referenced && page_mapping_inuse(page))
+
+ if (referenced)
goto activate_locked;
#ifdef CONFIG_SWAP
* [RFC][PATCH 7/7] CART Implementation v3
2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
` (5 preceding siblings ...)
2005-09-11 20:25 ` [RFC][PATCH 6/7] " a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
2005-09-11 23:05 ` [RFC][PATCH 0/7] " Peter Zijlstra
7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
To: linux-mm
[-- Attachment #1: cart-use-cart.patch --]
[-- Type: text/plain, Size: 15757 bytes --]
Index: linux-2.6-git/fs/exec.c
===================================================================
--- linux-2.6-git.orig/fs/exec.c
+++ linux-2.6-git/fs/exec.c
@@ -331,7 +331,7 @@ void install_arg_page(struct vm_area_str
goto out;
}
inc_mm_counter(mm, rss);
- lru_cache_add_active(page);
+ lru_cache_add(page);
set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
page, vma->vm_page_prot))));
page_add_anon_rmap(page, vma, address);
Index: linux-2.6-git/include/linux/swap.h
===================================================================
--- linux-2.6-git.orig/include/linux/swap.h
+++ linux-2.6-git/include/linux/swap.h
@@ -199,8 +199,6 @@ extern unsigned int nr_free_pagecache_pa
/* linux/mm/swap.c */
extern void FASTCALL(lru_cache_add(struct page *));
-extern void FASTCALL(lru_cache_add_active(struct page *));
-extern void FASTCALL(activate_page(struct page *));
extern void FASTCALL(mark_page_accessed(struct page *));
extern void lru_add_drain(void);
extern int rotate_reclaimable_page(struct page *page);
Index: linux-2.6-git/mm/memory.c
===================================================================
--- linux-2.6-git.orig/mm/memory.c
+++ linux-2.6-git/mm/memory.c
@@ -1304,7 +1304,7 @@ static int do_wp_page(struct mm_struct *
page_remove_rmap(old_page);
flush_cache_page(vma, address, pfn);
break_cow(vma, new_page, address, page_table);
- lru_cache_add_active(new_page);
+ lru_cache_add(new_page);
page_add_anon_rmap(new_page, vma, address);
/* Free the old page.. */
@@ -1782,8 +1782,7 @@ do_anonymous_page(struct mm_struct *mm,
entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
vma->vm_page_prot)),
vma);
- lru_cache_add_active(page);
- SetPageReferenced(page);
+ lru_cache_add(page);
page_add_anon_rmap(page, vma, addr);
}
@@ -1903,7 +1902,7 @@ retry:
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
set_pte_at(mm, address, page_table, entry);
if (anon) {
- lru_cache_add_active(new_page);
+ lru_cache_add(new_page);
page_add_anon_rmap(new_page, vma, address);
} else
page_add_file_rmap(new_page);
Index: linux-2.6-git/mm/swap.c
===================================================================
--- linux-2.6-git.orig/mm/swap.c
+++ linux-2.6-git/mm/swap.c
@@ -78,16 +78,13 @@ int rotate_reclaimable_page(struct page
return 1;
if (PageDirty(page))
return 1;
- if (PageActive(page))
- return 1;
if (!PageLRU(page))
return 1;
zone = page_zone(page);
spin_lock_irqsave(&zone->lru_lock, flags);
- if (PageLRU(page) && !PageActive(page)) {
- list_del(&page->lru);
- list_add_tail(&page->lru, &zone->inactive_list);
+ if (PageLRU(page)) {
+ __cart_rotate_reclaimable(zone, page);
inc_page_state(pgrotated);
}
if (!test_clear_page_writeback(page))
@@ -112,7 +109,6 @@ EXPORT_SYMBOL(mark_page_accessed);
* @page: the page to add
*/
static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
-static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
void fastcall lru_cache_add(struct page *page)
{
@@ -124,25 +120,12 @@ void fastcall lru_cache_add(struct page
put_cpu_var(lru_add_pvecs);
}
-void fastcall lru_cache_add_active(struct page *page)
-{
- struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
-
- page_cache_get(page);
- if (!pagevec_add(pvec, page))
- __pagevec_lru_add_active(pvec);
- put_cpu_var(lru_add_active_pvecs);
-}
-
void lru_add_drain(void)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
- pvec = &__get_cpu_var(lru_add_active_pvecs);
- if (pagevec_count(pvec))
- __pagevec_lru_add_active(pvec);
put_cpu_var(lru_add_pvecs);
}
@@ -278,7 +261,9 @@ void __pagevec_lru_add(struct pagevec *p
}
if (TestSetPageLRU(page))
BUG();
- add_page_to_inactive_list(zone, page);
+ if (TestClearPageActive(page))
+ BUG();
+ __cart_insert(zone, page);
}
if (zone)
spin_unlock_irq(&zone->lru_lock);
@@ -288,33 +273,6 @@ void __pagevec_lru_add(struct pagevec *p
EXPORT_SYMBOL(__pagevec_lru_add);
-void __pagevec_lru_add_active(struct pagevec *pvec)
-{
- int i;
- struct zone *zone = NULL;
-
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- struct zone *pagezone = page_zone(page);
-
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
- }
- if (TestSetPageLRU(page))
- BUG();
- if (TestSetPageActive(page))
- BUG();
- add_page_to_active_list(zone, page);
- }
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- release_pages(pvec->pages, pvec->nr, pvec->cold);
- pagevec_reinit(pvec);
-}
-
/*
* Try to drop buffers from the pages in a pagevec
*/
@@ -396,9 +354,6 @@ static void lru_drain_cache(unsigned int
/* CPU is dead, so no locking needed. */
if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
- pvec = &per_cpu(lru_add_active_pvecs, cpu);
- if (pagevec_count(pvec))
- __pagevec_lru_add_active(pvec);
}
/* Drop the CPU's cached committed space back into the central pool. */
Index: linux-2.6-git/mm/swap_state.c
===================================================================
--- linux-2.6-git.orig/mm/swap_state.c
+++ linux-2.6-git/mm/swap_state.c
@@ -359,7 +359,7 @@ struct page *read_swap_cache_async(swp_e
/*
* Initiate read into locked page and return.
*/
- lru_cache_add_active(new_page);
+ lru_cache_add(new_page);
swap_readpage(NULL, new_page);
return new_page;
}
Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c
+++ linux-2.6-git/mm/vmscan.c
@@ -376,8 +376,6 @@ static int shrink_list(struct list_head
if (TestSetPageLocked(page))
goto keep;
- BUG_ON(PageActive(page));
-
sc->nr_scanned++;
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
@@ -498,6 +496,7 @@ static int shrink_list(struct list_head
#ifdef CONFIG_SWAP
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page->private };
+ cart_remember(page);
__delete_from_swap_cache(page);
write_unlock_irq(&mapping->tree_lock);
swap_free(swap);
@@ -506,11 +505,13 @@ static int shrink_list(struct list_head
}
#endif /* CONFIG_SWAP */
+ cart_remember(page);
__remove_from_page_cache(page);
write_unlock_irq(&mapping->tree_lock);
__put_page(page);
free_it:
+ ClearPageActive(page);
unlock_page(page);
reclaimed++;
if (!pagevec_add(&freed_pvec, page))
@@ -535,55 +536,6 @@ keep:
}
/*
- * zone->lru_lock is heavily contended. Some of the functions that
- * shrink the lists perform better by taking out a batch of pages
- * and working on them outside the LRU lock.
- *
- * For pagecache intensive workloads, this function is the hottest
- * spot in the kernel (apart from copy_*_user functions).
- *
- * Appropriate locks must be held before calling this function.
- *
- * @nr_to_scan: The number of pages to look through on the list.
- * @src: The LRU list to pull pages off.
- * @dst: The temp list to put pages on to.
- * @scanned: The number of pages that were scanned.
- *
- * returns how many pages were moved onto *@dst.
- */
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
- struct list_head *dst, int *scanned)
-{
- int nr_taken = 0;
- struct page *page;
- int scan = 0;
-
- while (scan++ < nr_to_scan && !list_empty(src)) {
- page = lru_to_page(src);
- prefetchw_prev_lru_page(page, src, flags);
-
- if (!TestClearPageLRU(page))
- BUG();
- list_del(&page->lru);
- if (get_page_testone(page)) {
- /*
- * It is being freed elsewhere
- */
- __put_page(page);
- SetPageLRU(page);
- list_add(&page->lru, src);
- continue;
- } else {
- list_add(&page->lru, dst);
- nr_taken++;
- }
- }
-
- *scanned = scan;
- return nr_taken;
-}
-
-/*
* shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
*/
static void shrink_cache(struct zone *zone, struct scan_control *sc)
@@ -595,19 +547,15 @@ static void shrink_cache(struct zone *zo
pagevec_init(&pvec, 1);
lru_add_drain();
- spin_lock_irq(&zone->lru_lock);
while (max_scan > 0) {
- struct page *page;
int nr_taken;
- int nr_scan;
+ unsigned long nr_scan = 0;
int nr_freed;
- nr_taken = isolate_lru_pages(sc->swap_cluster_max,
- &zone->inactive_list,
- &page_list, &nr_scan);
- zone->nr_inactive -= nr_taken;
+ nr_taken = cart_replace(zone, &page_list, sc->swap_cluster_max,
+ &nr_scan);
+ /* we race a bit, do we care? */
zone->pages_scanned += nr_scan;
- spin_unlock_irq(&zone->lru_lock);
if (nr_taken == 0)
goto done;
@@ -623,220 +571,44 @@ static void shrink_cache(struct zone *zo
mod_page_state_zone(zone, pgsteal, nr_freed);
sc->nr_to_reclaim -= nr_freed;
- spin_lock_irq(&zone->lru_lock);
/*
* Put back any unfreeable pages.
*/
- while (!list_empty(&page_list)) {
- page = lru_to_page(&page_list);
- if (TestSetPageLRU(page))
- BUG();
- list_del(&page->lru);
- if (PageActive(page))
- add_page_to_active_list(zone, page);
- else
- add_page_to_inactive_list(zone, page);
- if (!pagevec_add(&pvec, page)) {
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
+ cart_reinsert(zone, &page_list);
}
- spin_unlock_irq(&zone->lru_lock);
done:
pagevec_release(&pvec);
}
/*
- * This moves pages from the active list to the inactive list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone->lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone->lru_lock around each page. It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_count against each page.
- * But we had to alter page->flags anyway.
- */
-static void
-refill_inactive_zone(struct zone *zone, struct scan_control *sc)
-{
- int pgmoved;
- int pgdeactivate = 0;
- int pgscanned;
- int nr_pages = sc->nr_to_scan;
- LIST_HEAD(l_hold); /* The pages which were snipped off */
- LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
- LIST_HEAD(l_active); /* Pages to go onto the active_list */
- struct page *page;
- struct pagevec pvec;
- int reclaim_mapped = 0;
- long mapped_ratio;
- long distress;
- long swap_tendency;
-
- lru_add_drain();
- spin_lock_irq(&zone->lru_lock);
- pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
- &l_hold, &pgscanned);
- zone->pages_scanned += pgscanned;
- zone->nr_active -= pgmoved;
- spin_unlock_irq(&zone->lru_lock);
-
- /*
- * `distress' is a measure of how much trouble we're having reclaiming
- * pages. 0 -> no problems. 100 -> great trouble.
- */
- distress = 100 >> zone->prev_priority;
-
- /*
- * The point of this algorithm is to decide when to start reclaiming
- * mapped memory instead of just pagecache. Work out how much memory
- * is mapped.
- */
- mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
- /*
- * Now decide how much we really want to unmap some pages. The mapped
- * ratio is downgraded - just because there's a lot of mapped memory
- * doesn't necessarily mean that page reclaim isn't succeeding.
- *
- * The distress ratio is important - we don't want to start going oom.
- *
- * A 100% value of vm_swappiness overrides this algorithm altogether.
- */
- swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
- /*
- * Now use this metric to decide whether to start moving mapped memory
- * onto the inactive list.
- */
- if (swap_tendency >= 100)
- reclaim_mapped = 1;
-
- while (!list_empty(&l_hold)) {
- cond_resched();
- page = lru_to_page(&l_hold);
- list_del(&page->lru);
- if (page_mapped(page)) {
- if (!reclaim_mapped ||
- (total_swap_pages == 0 && PageAnon(page)) ||
- page_referenced(page, 0, sc->priority <= 0)) {
- list_add(&page->lru, &l_active);
- continue;
- }
- }
- list_add(&page->lru, &l_inactive);
- }
-
- pagevec_init(&pvec, 1);
- pgmoved = 0;
- spin_lock_irq(&zone->lru_lock);
- while (!list_empty(&l_inactive)) {
- page = lru_to_page(&l_inactive);
- prefetchw_prev_lru_page(page, &l_inactive, flags);
- if (TestSetPageLRU(page))
- BUG();
- if (!TestClearPageActive(page))
- BUG();
- list_move(&page->lru, &zone->inactive_list);
- pgmoved++;
- if (!pagevec_add(&pvec, page)) {
- zone->nr_inactive += pgmoved;
- spin_unlock_irq(&zone->lru_lock);
- pgdeactivate += pgmoved;
- pgmoved = 0;
- if (buffer_heads_over_limit)
- pagevec_strip(&pvec);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- zone->nr_inactive += pgmoved;
- pgdeactivate += pgmoved;
- if (buffer_heads_over_limit) {
- spin_unlock_irq(&zone->lru_lock);
- pagevec_strip(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
-
- pgmoved = 0;
- while (!list_empty(&l_active)) {
- page = lru_to_page(&l_active);
- prefetchw_prev_lru_page(page, &l_active, flags);
- if (TestSetPageLRU(page))
- BUG();
- BUG_ON(!PageActive(page));
- list_move(&page->lru, &zone->active_list);
- pgmoved++;
- if (!pagevec_add(&pvec, page)) {
- zone->nr_active += pgmoved;
- pgmoved = 0;
- spin_unlock_irq(&zone->lru_lock);
- __pagevec_release(&pvec);
- spin_lock_irq(&zone->lru_lock);
- }
- }
- zone->nr_active += pgmoved;
- spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
-
- mod_page_state_zone(zone, pgrefill, pgscanned);
- mod_page_state(pgdeactivate, pgdeactivate);
-}
-
-/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void
shrink_zone(struct zone *zone, struct scan_control *sc)
{
unsigned long nr_active;
- unsigned long nr_inactive;
/*
* Add one to `nr_to_scan' just to make sure that the kernel will
* slowly sift through the active list.
*/
- zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
+ zone->nr_scan_active += ((zone->nr_active + zone->nr_inactive) >> sc->priority) + 1;
nr_active = zone->nr_scan_active;
if (nr_active >= sc->swap_cluster_max)
zone->nr_scan_active = 0;
else
nr_active = 0;
- zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
- nr_inactive = zone->nr_scan_inactive;
- if (nr_inactive >= sc->swap_cluster_max)
- zone->nr_scan_inactive = 0;
- else
- nr_inactive = 0;
sc->nr_to_reclaim = sc->swap_cluster_max;
- while (nr_active || nr_inactive) {
- if (nr_active) {
- sc->nr_to_scan = min(nr_active,
- (unsigned long)sc->swap_cluster_max);
- nr_active -= sc->nr_to_scan;
- refill_inactive_zone(zone, sc);
- }
-
- if (nr_inactive) {
- sc->nr_to_scan = min(nr_inactive,
- (unsigned long)sc->swap_cluster_max);
- nr_inactive -= sc->nr_to_scan;
- shrink_cache(zone, sc);
- if (sc->nr_to_reclaim <= 0)
- break;
- }
+ while (nr_active) {
+ sc->nr_to_scan = min(nr_active,
+ (unsigned long)sc->swap_cluster_max);
+ nr_active -= sc->nr_to_scan;
+ shrink_cache(zone, sc);
+ if (sc->nr_to_reclaim <= 0)
+ break;
}
throttle_vm_writeout();
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: email@kvack.org
* Re: [RFC][PATCH 0/7] CART Implementation v3
2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
` (6 preceding siblings ...)
2005-09-11 20:25 ` [RFC][PATCH 7/7] " a.p.zijlstra
@ 2005-09-11 23:05 ` Peter Zijlstra
7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2005-09-11 23:05 UTC (permalink / raw)
To: linux-mm
[-- Attachment #1: Type: text/plain, Size: 1225 bytes --]
On Sun, 2005-09-11 at 22:25 +0200, a.p.zijlstra@chello.nl wrote:
> Hi All,
>
> Here are my latest efforts at implementing CART, an advanced page replacement
> policy.
>
> It seems pretty stable, except for a spurious OOM. However, it has yet to
> run on anything other than UML.
>
> A complete CART implementation should be present in cart-cart.patch.
> The cart-cart-r.patch improves on it by keeping a third adaptive parameter
> which measures the number of fresh pages (those in none of |T1| u |T2| u |B1| u |B2|).
> When the number of fresh pages drops below the number of longterm pages
> we start reclaiming pages that have only just been inserted.
>
> This works very well for a simple looped linear scan larger than the total
> resident set. Also it doesn't seem to regress normal workloads.
>
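To make the quoted description a bit more concrete, here is a minimal
user-space sketch of that fresh-page bookkeeping; the struct and every
name below are hypothetical illustrations, not code from the patches:

struct cart_state {
        unsigned long nr_t1, nr_t2;     /* resident lists |T1|, |T2| */
        unsigned long nr_b1, nr_b2;     /* non-resident lists |B1|, |B2| */
        unsigned long nr_total;         /* total pages under management */
        unsigned long nr_longterm;      /* pages marked longterm */
};

/* "Fresh" pages are those in none of T1, T2, B1, B2. */
static unsigned long nr_fresh(const struct cart_state *s)
{
        unsigned long tracked = s->nr_t1 + s->nr_t2 + s->nr_b1 + s->nr_b2;

        return s->nr_total > tracked ? s->nr_total - tracked : 0;
}

/*
 * Once the number of fresh pages drops below the number of longterm
 * pages, pages that have only just been inserted become eligible for
 * reclaim straight away.
 */
static int reclaim_fresh_insertions(const struct cart_state *s)
{
        return nr_fresh(s) < s->nr_longterm;
}
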
Some numbers. All runs were done in a UML instance with mem=64M and 128M of
swap space, using a synchronous ubd.
linux-2.6.13-rc7
make -j4
real 107m15.351s
user 24m4.820s
sys 12m16.590s
scan 60 16
real 3m39.432s
user 0m4.990s
sys 0m21.920s
linux-2.6.13-rc7-cart
make -j4
real 93m18.035s
user 22m44.280s
sys 9m20.220s
scan 60 16
real 1m47.857s
user 0m4.690s
sys 0m11.690s
--
Peter Zijlstra <a.p.zijlstra@chello.nl>
[-- Attachment #2: scan.c --]
[-- Type: text/x-csrc, Size: 748 bytes --]
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

/*
 * Dirty a `size' MB anonymous mapping `loops' times; this is the
 * program behind the "scan 60 16" runs above.
 */
int main(int argc, char **argv)
{
        char *ptr;
        size_t bytes;
        int size = -1;
        int loops = -1;

        if (argc > 1)
                size = atoi(argv[1]);
        if (argc > 2)
                loops = atoi(argv[2]);

        if (size < 0) {
                printf("no size specified\n");
                return 0;
        }
        if (loops < 0) {
                printf("no loops specified\n");
                return 0;
        }

        printf("Size: %dMB\n", size);
        printf("Loops: %d\n", loops);

        bytes = (size_t)size * 1024 * 1024;
        ptr = mmap(NULL, bytes, PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (ptr != MAP_FAILED) {        /* mmap() fails with MAP_FAILED, not NULL */
                for (; loops; --loops) {
                        size_t i;

                        /* Touch every byte so the whole mapping becomes resident. */
                        for (i = 0; i < bytes; ++i)
                                ptr[i] = loops;
                        printf(".");
                        fflush(stdout);
                }
                printf("\n");
                munmap(ptr, bytes);
        }
        return 0;
}
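The "scan 60 16" lines in the timings above correspond to running this
program with a 60MB mapping and 16 passes over it; assuming the obvious
build step, that would be something like:

        gcc -o scan scan.c
        ./scan 60 16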