linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH 0/7] CART Implementation v3
@ 2005-09-11 20:25 a.p.zijlstra
  2005-09-11 20:25 ` [RFC][PATCH 1/7] " a.p.zijlstra
                   ` (7 more replies)
  0 siblings, 8 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
  To: linux-mm

Hi All,

Here my latest efforts on implementing CART, an advanced page replacement 
policy.

It seems pretty stable, except for a spurious OOM. However it yet has to
run on something other than UML.

A complete CART implementation should be present in cart-cart.patch. 
The cart-cart-r.patch improves thereon by keeping a 3th adaptive parameter
which measures the amount of fresh pages (not in |T1| u |T2| u |B1| u |B2|).
When the amount of fresh pages drops below the number of longterm pages
we start to reclaim pages that have just been inserted.

This works very well for a simple looped linear scan larger than the total 
resident set. Also it doesn't seem to regress normal workloads.

More test{s,ing} needed.

Kind regards,

Peter Zijlstra

--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC][PATCH 1/7] CART Implementation v3
  2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
  2005-09-11 20:25 ` [RFC][PATCH 2/7] " a.p.zijlstra
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
  To: linux-mm

[-- Attachment #1: cart-nonresident.patch --]
[-- Type: text/plain, Size: 12877 bytes --]

Index: linux-2.6-git/include/linux/swap.h
===================================================================
--- linux-2.6-git.orig/include/linux/swap.h
+++ linux-2.6-git/include/linux/swap.h
@@ -154,6 +154,22 @@ extern void out_of_memory(unsigned int _
 /* linux/mm/memory.c */
 extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
 
+/* linux/mm/nonresident.c */
+#define NR_b1		0
+#define NR_b2		1
+#define NR_free		2
+#define NR_lost		3
+
+#define NR_listid	3
+#define NR_found	0x80000000
+
+
+extern int nonresident_put(struct address_space *, unsigned long, int, int);
+extern int nonresident_find(struct address_space *, unsigned long);
+extern unsigned int nonresident_count(int);
+extern unsigned int nonresident_total(void);
+extern void nonresident_init(void);
+
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalhigh_pages;
@@ -292,6 +308,13 @@ static inline swp_entry_t get_swap_page(
 #define grab_swap_token()  do { } while(0)
 #define has_swap_token(x) 0
 
+/* linux/mm/nonresident.c */
+#define nonresident_put(w,x,y,z) 0
+#define nonresident_find(x,y) 0
+#define nonresident_count(x) 0
+#define nonresident_total() 0
+#define nonresident_init() do { } while (0)
+
 #endif /* CONFIG_SWAP */
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
Index: linux-2.6-git/init/main.c
===================================================================
--- linux-2.6-git.orig/init/main.c
+++ linux-2.6-git/init/main.c
@@ -47,6 +47,7 @@
 #include <linux/rmap.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
+#include <linux/swap.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -494,6 +495,7 @@ asmlinkage void __init start_kernel(void
 	}
 #endif
 	vfs_caches_init_early();
+	nonresident_init();
 	mem_init();
 	kmem_cache_init();
 	setup_per_cpu_pageset();
Index: linux-2.6-git/mm/Makefile
===================================================================
--- linux-2.6-git.orig/mm/Makefile
+++ linux-2.6-git/mm/Makefile
@@ -12,7 +12,8 @@ obj-y			:= bootmem.o filemap.o mempool.o
 			   readahead.o slab.o swap.o truncate.o vmscan.o \
 			   prio_tree.o $(mmu-y)
 
-obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o \
+				nonresident.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
Index: linux-2.6-git/mm/nonresident.c
===================================================================
--- /dev/null
+++ linux-2.6-git/mm/nonresident.c
@@ -0,0 +1,372 @@
+/*
+ * mm/nonresident.c
+ * (C) 2004,2005 Red Hat, Inc
+ * Written by Rik van Riel <riel@redhat.com>
+ * Released under the GPL, see the file COPYING for details.
+ * Adapted by Peter Zijlstra <a.p.zijlstra@chello.nl> for use by ARC
+ * like algorithms.
+ *
+ * Keeps track of whether a non-resident page was recently evicted
+ * and should be immediately promoted to the active list. This also
+ * helps automatically tune the inactive target.
+ *
+ * The pageout code stores a recently evicted page in this cache
+ * by calling remember_page(mapping/mm, index/vaddr)
+ * and can look it up in the cache by calling recently_evicted()
+ * with the same arguments.
+ *
+ * Note that there is no way to invalidate pages after eg. truncate
+ * or exit, we let the pages fall out of the non-resident set through
+ * normal replacement.
+ *
+ *
+ * Modified to work with ARC like algorithms who:
+ *  - need to balance two FIFOs; |b1| + |b2| = c,
+ *
+ * The bucket contains four single linked cyclic lists (CLOCKS) and each
+ * clock has a tail hand. By selecting a victim clock upon insertion it
+ * is possible to balance them.
+ *
+ * The first two lists are used for B1/B2 and a third for a free slot list.
+ * The fourth list is unused.
+ *
+ * The slot looks like this:
+ * struct slot_t {
+ *         u32 cookie : 24; // LSB
+ *         u32 index  :  6;
+ *         u32 listid :  2;
+ * };
+ *
+ * The bucket is guarded by a spinlock.
+ */
+#include <linux/swap.h>
+#include <linux/mm.h>
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/hash.h>
+#include <linux/prefetch.h>
+#include <linux/kernel.h>
+
+#define TARGET_SLOTS	64
+#define NR_CACHELINES  (TARGET_SLOTS*sizeof(u32) / L1_CACHE_BYTES)
+#define NR_SLOTS	(((NR_CACHELINES * L1_CACHE_BYTES) - sizeof(spinlock_t) - 2*sizeof(u16)) / sizeof(u32))
+#if 0
+#if NR_SLOTS < (TARGET_SLOTS / 2)
+#warning very small slot size
+#if NR_SLOTS <= 0
+#error no room for slots left
+#endif
+#endif
+#endif
+
+#define BUILD_MASK(bits, shift) (((1 << (bits)) - 1) << (shift))
+
+#define LISTID_BITS		2
+#define LISTID_SHIFT		(sizeof(u32)*8 - LISTID_BITS)
+#define LISTID_MASK		BUILD_MASK(LISTID_BITS, LISTID_SHIFT)
+
+#define SET_LISTID(x, flg)	((x) = ((x) & ~LISTID_MASK) | ((flg) << LISTID_SHIFT))
+#define GET_LISTID(x)		(((x) & LISTID_MASK) >> LISTID_SHIFT)
+
+#define INDEX_BITS		6  /* ceil(log2(NR_SLOTS)) */
+#define INDEX_SHIFT		(LISTID_SHIFT - INDEX_BITS)
+#define INDEX_MASK		BUILD_MASK(INDEX_BITS, INDEX_SHIFT)
+
+#define SET_INDEX(x, idx)	((x) = ((x) & ~INDEX_MASK) | ((idx) << INDEX_SHIFT))
+#define GET_INDEX(x)		(((x) & INDEX_MASK) >> INDEX_SHIFT)
+
+#define COOKIE_MASK		BUILD_MASK(sizeof(u32)*8 - LISTID_BITS - INDEX_BITS, 0)
+
+struct nr_bucket
+{
+	spinlock_t lock;
+	u8 hand[4];
+	u32 slot[NR_SLOTS];
+} ____cacheline_aligned;
+
+/* The non-resident page hash table. */
+static struct nr_bucket * nonres_table;
+static unsigned int nonres_shift;
+static unsigned int nonres_mask;
+
+/* hash the address into a bucket */
+static struct nr_bucket * nr_hash(void * mapping, unsigned long index)
+{
+	unsigned long bucket;
+	unsigned long hash;
+
+	hash = (unsigned long)mapping + 37 * index;
+	bucket = hash_long(hash, nonres_shift);
+
+	return nonres_table + bucket;
+}
+
+/* hash the address and inode into a cookie */
+static u32 nr_cookie(struct address_space * mapping, unsigned long index)
+{
+	unsigned long hash;
+
+	hash = 37 * (unsigned long)mapping + index;
+
+	if (mapping && mapping->host)
+		hash = 37 * hash + mapping->host->i_ino;
+
+	return hash_long(hash, sizeof(u32)*8 - LISTID_BITS - INDEX_BITS);
+}
+
+static DEFINE_PER_CPU(unsigned int[4], nonres_count);
+
+/*
+ * remove current (b from 'abc'):
+ *
+ *    initial        swap(2,3)
+ *
+ *   1: -> [2],a     1: -> [2],a
+ * * 2: -> [3],b     2: -> [1],c
+ *   3: -> [1],c   * 3: -> [3],b
+ *
+ *   3 is now free for use.
+ *
+ * @nr_bucket: bucket to operate in
+ * @listid: list that the deletee belongs to
+ * @pos: slot position of deletee
+ * @slot: possible pointer to slot
+ *
+ * returns pointer to removed slot, NULL when list empty.
+ */
+static u32 * __nonresident_del(struct nr_bucket *nr_bucket, int listid, u8 pos, u32 *slot)
+{
+	int next_pos;
+	u32 *next;
+
+	if (slot == NULL) {
+		slot = &nr_bucket->slot[pos];
+		if (GET_LISTID(*slot) != listid)
+			return NULL;
+	}
+
+	--__get_cpu_var(nonres_count[listid]);
+
+	next_pos = GET_INDEX(*slot);
+	if (pos == next_pos) {
+		next = slot;
+		goto out;
+	}
+
+	next = &nr_bucket->slot[next_pos];
+	*next = xchg(slot, *next);
+
+	if (next_pos == nr_bucket->hand[listid])
+		nr_bucket->hand[listid] = pos;
+out:
+	BUG_ON(GET_INDEX(*next) != next_pos);
+	return next;
+}
+
+static inline u32 * __nonresident_pop(struct nr_bucket *nr_bucket, int listid)
+{
+	return __nonresident_del(nr_bucket, listid, nr_bucket->hand[listid], NULL);
+}
+
+/*
+ * insert before (d before b in 'abc')
+ *
+ *    initial          set 4         swap(2,4)
+ *
+ *   1: -> [2],a     1: -> [2],a    1: -> [2],a
+ * * 2: -> [3],b     2: -> [3],b    2: -> [4],d
+ *   3: -> [1],c     3: -> [1],c    3: -> [1],c
+ *   4: -> [4],nil   4: -> [4],d  * 4: -> [3],b
+ *
+ *   leaving us with 'adbc'.
+ *
+ * @nr_bucket: bucket to operator in
+ * @listid: list to insert into
+ * @pos: position to insert before
+ * @slot: slot to insert
+ */
+static void __nonresident_insert(struct nr_bucket *nr_bucket, int listid, u8 *pos, u32 *slot)
+{
+	u32 *head;
+
+	SET_LISTID(*slot, listid);
+
+	head = &nr_bucket->slot[*pos];
+
+	*pos = GET_INDEX(*slot);
+	if (GET_LISTID(*head) == listid)
+		*slot = xchg(head, *slot);
+
+	++__get_cpu_var(nonres_count[listid]);
+}
+
+static inline void __nonresident_push(struct nr_bucket *nr_bucket, int listid, u32 *slot)
+{
+	__nonresident_insert(nr_bucket, listid, &nr_bucket->hand[listid], slot);
+}
+
+/*
+ * Remembers a page by putting a hash-cookie on the @listid list.
+ *
+ * @mapping: page_mapping()
+ * @index: page_index()
+ * @listid: list to put the page on (NR_b1, NR_b2 and NR_free).
+ * @listid_evict: list to get a free page from when NR_free is empty.
+ *
+ * returns the list an empty page was taken from.
+ */
+int nonresident_put(struct address_space * mapping, unsigned long index, int listid, int listid_evict)
+{
+	struct nr_bucket *nr_bucket;
+	u32 cookie;
+	unsigned long flags;
+	u32 *slot;
+	int evict = NR_free;
+
+	prefetch(mapping->host);
+	nr_bucket = nr_hash(mapping, index);
+
+	spin_lock_prefetch(nr_bucket); // prefetchw_range(nr_bucket, NR_CACHELINES);
+	cookie = nr_cookie(mapping, index);
+
+	spin_lock_irqsave(&nr_bucket->lock, flags);
+	slot = __nonresident_pop(nr_bucket, evict);
+	if (!slot) {
+		evict = listid_evict;
+		slot = __nonresident_pop(nr_bucket, evict);
+		if (!slot) {
+			evict ^= 1;
+			slot = __nonresident_pop(nr_bucket, evict);
+		}
+	}
+	BUG_ON(!slot);
+	SET_INDEX(cookie, GET_INDEX(*slot));
+	cookie = xchg(slot, cookie);
+	__nonresident_push(nr_bucket, listid, slot);
+	spin_unlock_irqrestore(&nr_bucket->lock, flags);
+
+	return evict;
+}
+
+/*
+ * Searches a page on the first two lists, and places it on the free list.
+ *
+ * @mapping: page_mapping()
+ * @index: page_index()
+ *
+ * returns listid of the list the item was found on with NR_found set if found.
+ */
+int nonresident_find(struct address_space * mapping, unsigned long index)
+{
+	struct nr_bucket * nr_bucket;
+	u32 wanted;
+	int j;
+	u8 i;
+	unsigned long flags;
+
+	prefetch(mapping->host);
+	nr_bucket = nr_hash(mapping, index);
+
+	spin_lock_prefetch(nr_bucket); // prefetch_range(nr_bucket, NR_CACHELINES);
+	wanted = nr_cookie(mapping, index) & COOKIE_MASK;
+
+	spin_lock_irqsave(&nr_bucket->lock, flags);
+	for (i = 0; i < 2; ++i) {
+		j = nr_bucket->hand[i];
+		do {
+			u32 *slot = &nr_bucket->slot[j];
+			if (GET_LISTID(*slot) != i)
+				break;
+
+			if ((*slot & COOKIE_MASK) == wanted) {
+				slot = __nonresident_del(nr_bucket, i, j, slot);
+				/* __nonresident_push(nr_bucket, i, slot); */
+				__nonresident_push(nr_bucket, NR_free, slot);
+				return i | NR_found;
+			}
+
+			j = GET_INDEX(*slot);
+		} while (j != nr_bucket->hand[i]);
+	}
+	spin_unlock_irqrestore(&nr_bucket->lock, flags);
+
+	return 0;
+}
+
+unsigned int nonresident_count(int listid)
+{
+	unsigned int ret = 0;
+	int cpu;
+
+	preempt_disable();
+	for_each_cpu(cpu) {
+		ret += per_cpu(nonres_count[listid], cpu);
+	}
+	preempt_enable();
+
+	return ret;
+}
+
+unsigned int nonresident_total(void)
+{
+	return (1 << nonres_shift) * NR_SLOTS;
+}
+
+/*
+ * For interactive workloads, we remember about as many non-resident pages
+ * as we have actual memory pages.  For server workloads with large inter-
+ * reference distances we could benefit from remembering more.
+ */
+static __initdata unsigned long nonresident_factor = 1;
+void __init nonresident_init(void)
+{
+	int target;
+	int i, j;
+
+	/*
+	 * Calculate the non-resident hash bucket target. Use a power of
+	 * two for the division because alloc_large_system_hash rounds up.
+	 */
+	target = nr_all_pages * nonresident_factor;
+	target /= (sizeof(struct nr_bucket) / sizeof(u32));
+
+	nonres_table = alloc_large_system_hash("Non-resident page tracking",
+					sizeof(struct nr_bucket),
+					target,
+					0,
+					HASH_EARLY | HASH_HIGHMEM,
+					&nonres_shift,
+					&nonres_mask,
+					0);
+
+	for (i = 0; i < (1 << nonres_shift); i++) {
+		spin_lock_init(&nonres_table[i].lock);
+		for (j = 0; j < 4; ++j)
+			nonres_table[i].hand[j] = 0;
+
+		for (j = 0; j < NR_SLOTS; ++j) {
+			nonres_table[i].slot[j] = 0;
+			SET_LISTID(nonres_table[i].slot[j], NR_free);
+			if (j < NR_SLOTS - 1)
+				SET_INDEX(nonres_table[i].slot[j], j+1);
+			else /* j == NR_SLOTS - 1 */
+				SET_INDEX(nonres_table[i].slot[j], 0);
+		}
+	}
+
+	for_each_cpu(i) {
+		for (j=0; j<4; ++j)
+			per_cpu(nonres_count[j], i) = 0;
+	}
+}
+
+static int __init set_nonresident_factor(char * str)
+{
+	if (!str)
+		return 0;
+	nonresident_factor = simple_strtoul(str, &str, 0);
+	return 1;
+}
+
+__setup("nonresident_factor=", set_nonresident_factor);

--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC][PATCH 2/7] CART Implementation v3
  2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
  2005-09-11 20:25 ` [RFC][PATCH 1/7] " a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
  2005-09-11 20:25 ` [RFC][PATCH 3/7] " a.p.zijlstra
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
  To: linux-mm

[-- Attachment #1: cart-nonresident-stats.patch --]
[-- Type: text/plain, Size: 3566 bytes --]

Index: linux-2.6-git/fs/proc/proc_misc.c
===================================================================
--- linux-2.6-git.orig/fs/proc/proc_misc.c
+++ linux-2.6-git/fs/proc/proc_misc.c
@@ -233,6 +233,20 @@ static struct file_operations proc_zonei
 	.release	= seq_release,
 };
 
+extern struct seq_operations nonresident_op;
+static int nonresident_open(struct inode *inode, struct file *file)
+{
+       (void)inode;
+       return seq_open(file, &nonresident_op);
+}
+
+static struct file_operations nonresident_file_operations = {
+       .open           = nonresident_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+};
+
 static int version_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
@@ -602,6 +616,7 @@ void __init proc_misc_init(void)
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
 	create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
+	create_seq_entry("nonresident",S_IRUGO, &nonresident_file_operations);
 	create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
 	create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations);
 	create_seq_entry("diskstats", 0, &proc_diskstats_operations);
Index: linux-2.6-git/mm/nonresident.c
===================================================================
--- linux-2.6-git.orig/mm/nonresident.c
+++ linux-2.6-git/mm/nonresident.c
@@ -370,3 +370,83 @@ static int __init set_nonresident_factor
 }
 
 __setup("nonresident_factor=", set_nonresident_factor);
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+
+static void *stats_start(struct seq_file *m, loff_t *pos)
+{
+	if (*pos < 0 || *pos >= (1 << nonres_shift))
+		return NULL;
+
+	m->private = (void*)(unsigned long)*pos;
+
+	return pos;
+}
+
+static void *stats_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	if (*pos < (1 << nonres_shift)-1) {
+		(*pos)++;
+		(unsigned long)m->private++;
+		return pos;
+	}
+	return NULL;
+}
+
+static void stats_stop(struct seq_file *m, void *arg)
+{
+}
+
+static void bucket_stats(struct nr_bucket * nr_bucket, int * b1, int * b2, int * free)
+{
+	unsigned long flags;
+	unsigned int i, b[3] = {0, 0, 0};
+
+	spin_lock_irqsave(&nr_bucket->lock, flags);
+	for (i = 0; i < 3; ++i) {
+		unsigned int j = nr_bucket->hand[i];
+		do
+		{
+			u32 *slot = &nr_bucket->slot[j];
+			if (GET_LISTID(*slot) != i)
+				break;
+			j = GET_INDEX(*slot);
+			++b[i];
+		} while (j != nr_bucket->hand[i]);
+	}
+	spin_unlock_irqrestore(&nr_bucket->lock, flags);
+
+	*b1=b[0];
+	*b2=b[1];
+	*free=b[2];
+}
+
+static int stats_show(struct seq_file *m, void *arg)
+{
+	unsigned int index = (unsigned long)m->private;
+	struct nr_bucket *nr_bucket = &nonres_table[index];
+	int b1, b2, free;
+
+	bucket_stats(nr_bucket, &b1, &b2, &free);
+	seq_printf(m, "%d\t%d\t%d", b1, b2, free);
+	if (index == 0) {
+		seq_printf(m, "\t%d\t%d\t%d",
+			   nonresident_count(NR_b1),
+			   nonresident_count(NR_b2),
+			   nonresident_count(NR_free));
+	}
+	seq_printf(m,"\n");
+
+	return 0;
+}
+
+struct seq_operations nonresident_op = {
+	.start = stats_start,
+	.next = stats_next,
+	.stop = stats_stop,
+	.show = stats_show,
+};
+
+#endif /* CONFIG_PROC_FS */

--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC][PATCH 3/7] CART Implementation v3
  2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
  2005-09-11 20:25 ` [RFC][PATCH 1/7] " a.p.zijlstra
  2005-09-11 20:25 ` [RFC][PATCH 2/7] " a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
  2005-09-11 20:25 ` [RFC][PATCH 4/7] " a.p.zijlstra
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
  To: linux-mm

[-- Attachment #1: cart-cart.patch --]
[-- Type: text/plain, Size: 22755 bytes --]

Index: linux-2.6-git/include/linux/mm_inline.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_inline.h
+++ linux-2.6-git/include/linux/mm_inline.h
@@ -1,40 +1,15 @@
-
-static inline void
-add_page_to_active_list(struct zone *zone, struct page *page)
-{
-	list_add(&page->lru, &zone->active_list);
-	zone->nr_active++;
-}
-
-static inline void
-add_page_to_inactive_list(struct zone *zone, struct page *page)
-{
-	list_add(&page->lru, &zone->inactive_list);
-	zone->nr_inactive++;
-}
-
-static inline void
-del_page_from_active_list(struct zone *zone, struct page *page)
-{
-	list_del(&page->lru);
-	zone->nr_active--;
-}
-
-static inline void
-del_page_from_inactive_list(struct zone *zone, struct page *page)
-{
-	list_del(&page->lru);
-	zone->nr_inactive--;
-}
-
 static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
 	list_del(&page->lru);
-	if (PageActive(page)) {
-		ClearPageActive(page);
+	if (TestClearPageActive(page)) {
 		zone->nr_active--;
 	} else {
 		zone->nr_inactive--;
 	}
+	if (TestClearPageLongTerm(page)) {
+		/* zone->nr_longterm--; */
+	} else {
+		zone->nr_shortterm--;
+	}
 }
Index: linux-2.6-git/include/linux/mmzone.h
===================================================================
--- linux-2.6-git.orig/include/linux/mmzone.h
+++ linux-2.6-git/include/linux/mmzone.h
@@ -143,13 +143,18 @@ struct zone {
 	ZONE_PADDING(_pad1_)
 
 	/* Fields commonly accessed by the page reclaim scanner */
-	spinlock_t		lru_lock;	
-	struct list_head	active_list;
-	struct list_head	inactive_list;
+	spinlock_t		lru_lock;
+	struct list_head	active_list;	/* The T1 list of CART */
+	struct list_head	inactive_list;  /* The T2 list of CART */
 	unsigned long		nr_scan_active;
 	unsigned long		nr_scan_inactive;
 	unsigned long		nr_active;
 	unsigned long		nr_inactive;
+	unsigned long 		nr_evicted_active;
+	unsigned long 		nr_evicted_inactive;
+	unsigned long 		nr_shortterm;	/* number of short term pages */
+	unsigned long		nr_p;		/* p from the CART paper */
+	unsigned long 		nr_q;		/* q from the cart paper */
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	int			all_unreclaimable; /* All pages pinned */
 
Index: linux-2.6-git/include/linux/page-flags.h
===================================================================
--- linux-2.6-git.orig/include/linux/page-flags.h
+++ linux-2.6-git/include/linux/page-flags.h
@@ -76,6 +76,9 @@
 #define PG_nosave_free		18	/* Free, should not be written */
 #define PG_uncached		19	/* Page has been mapped as uncached */
 
+#define PG_longterm		20	/* Filter bit for CART see mm/cart.c */
+#define PG_new			21	/* New page, skip first reference */
+
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
  * allowed.
@@ -305,6 +308,18 @@ extern void __mod_page_state(unsigned lo
 #define SetPageUncached(page)	set_bit(PG_uncached, &(page)->flags)
 #define ClearPageUncached(page)	clear_bit(PG_uncached, &(page)->flags)
 
+#define PageLongTerm(page)	test_bit(PG_longterm, &(page)->flags)
+#define SetPageLongTerm(page)	set_bit(PG_longterm, &(page)->flags)
+#define TestSetPageLongTerm(page) test_and_set_bit(PG_longterm, &(page)->flags)
+#define ClearPageLongTerm(page)	clear_bit(PG_longterm, &(page)->flags)
+#define TestClearPageLongTerm(page) test_and_clear_bit(PG_longterm, &(page)->flags)
+
+#define PageNew(page)	test_bit(PG_new, &(page)->flags)
+#define SetPageNew(page)	set_bit(PG_new, &(page)->flags)
+#define TestSetPageNew(page) test_and_set_bit(PG_new, &(page)->flags)
+#define ClearPageNew(page)	clear_bit(PG_new, &(page)->flags)
+#define TestClearPageNew(page) test_and_clear_bit(PG_new, &(page)->flags)
+
 struct page;	/* forward declaration */
 
 int test_clear_page_dirty(struct page *page);
Index: linux-2.6-git/include/linux/swap.h
===================================================================
--- linux-2.6-git.orig/include/linux/swap.h
+++ linux-2.6-git/include/linux/swap.h
@@ -7,6 +7,7 @@
 #include <linux/mmzone.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/mm.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -170,6 +171,23 @@ extern unsigned int nonresident_count(in
 extern unsigned int nonresident_total(void);
 extern void nonresident_init(void);
 
+/* linux/mm/cart.c */
+extern void cart_init(void);
+extern void __cart_insert(struct zone *, struct page *);
+extern unsigned long cart_replace(struct zone *, struct list_head *, unsigned long, unsigned long *);
+extern void cart_reinsert(struct zone *, struct list_head *);
+extern void __cart_rotate_reclaimable(struct zone *, struct page *);
+extern void __cart_remember(struct zone *, struct page*);
+
+static inline void cart_remember(struct page *page)
+{
+	unsigned long flags;
+	struct zone *zone = page_zone(page);
+	spin_lock_irqsave(&zone->lru_lock, flags);
+	__cart_remember(zone, page);
+	spin_unlock_irqrestore(&zone->lru_lock, flags);
+}
+
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalhigh_pages;
Index: linux-2.6-git/init/main.c
===================================================================
--- linux-2.6-git.orig/init/main.c
+++ linux-2.6-git/init/main.c
@@ -497,6 +497,7 @@ asmlinkage void __init start_kernel(void
 	vfs_caches_init_early();
 	nonresident_init();
 	mem_init();
+	cart_init();
 	kmem_cache_init();
 	setup_per_cpu_pageset();
 	numa_policy_init();
Index: linux-2.6-git/mm/Makefile
===================================================================
--- linux-2.6-git.orig/mm/Makefile
+++ linux-2.6-git/mm/Makefile
@@ -13,7 +13,7 @@ obj-y			:= bootmem.o filemap.o mempool.o
 			   prio_tree.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o \
-				nonresident.o
+				nonresident.o cart.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
Index: linux-2.6-git/mm/cart.c
===================================================================
--- /dev/null
+++ linux-2.6-git/mm/cart.c
@@ -0,0 +1,623 @@
+/*
+ * mm/cart.c
+ *
+ * Written by Peter Zijlstra <a.p.zijlstra@chello.nl>
+ * Released under the GPL, see the file COPYING for details.
+ *
+ * This file contains a Page Replacement Algorithm based on CART
+ * Please refer to the CART paper here -
+ *   http://www.almaden.ibm.com/cs/people/dmodha/clockfast.pdf
+ *
+ * The algorithm was adapted to work for linux which poses the following
+ * extra constraints:
+ *  - multiple memory zones,
+ *  - async pageout (PG_writeback).
+ *
+ * The multiple memory zones are handled by decoupling the T lists from the
+ * B lists, keeping T lists per zone while having global B lists. See
+ * mm/nonresident.c for the B list implementation. List sizes are scaled on
+ * comparison.
+ *
+ * The asynchronous pageout makes for the problem of pages that are selected
+ * for eviction but can not (yet) be freed. And since they are not evicted
+ * they could be referenced again, so we have to keep them on the resident
+ * lists. This is handled in the reinsert and rotate_reclaimable functions.
+ *
+ * Also for efficiency's sake the replace operation is batched. This to
+ * avoid holding the much contended zone->lru_lock while calling the
+ * rather slow page_referenced().
+ *
+ * Doubts:
+ *  - can a page be on both a T and a B list? The paper does imply this.
+ *
+ * Beyond CART:
+ *  - PG_new
+ *  - 'S' pages from B1.
+ *
+ * Todo:
+ *  - page accounting
+ *  - finetune
+ *
+ * All functions that are prefixed with '__' assume that zone->lru_lock is taken.
+ */
+
+#include <linux/swap.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/mm_inline.h>
+#include <linux/rmap.h>
+#include <linux/buffer_head.h>
+#include <linux/pagevec.h>
+
+#define cart_cT ((zone)->nr_active + (zone)->nr_inactive + (zone)->free_pages)
+#define cart_cB ((zone)->present_pages)
+
+#define T2B(x) (((x) * cart_cB) / cart_cT)
+#define B2T(x) (((x) * cart_cT) / cart_cB)
+
+#define size_T1 ((zone)->nr_active)
+#define size_T2 ((zone)->nr_inactive)
+
+#define list_T1 (&(zone)->active_list)
+#define list_T2 (&(zone)->inactive_list)
+
+#define cart_p ((zone)->nr_p)
+#define cart_q ((zone)->nr_q)
+
+#define size_B1 ((zone)->nr_evicted_active)
+#define size_B2 ((zone)->nr_evicted_inactive)
+/* #define size_B2 (cart_cB - size_B1) */
+
+#define nr_Ns ((zone)->nr_shortterm)
+#define nr_Nl (size_T1 + size_T2 - nr_Ns)
+
+/* Called from init/main.c to initialize the cart parameters */
+void __init cart_init(void)
+{
+	struct zone *zone;
+	for_each_zone(zone) {
+		zone->nr_evicted_active = 0;
+		zone->nr_evicted_inactive = 0;
+		zone->nr_shortterm = 0;
+		zone->nr_p = 0;
+		zone->nr_q = 0;
+	}
+}
+
+static inline void __cart_q_inc(struct zone *zone, unsigned long dq)
+{
+	/* if (|T2| + |B2| + |T1| - ns >= c) q = min(q + 1, 2c - |T1|) */
+	/* |B2| + nl >= c */
+	if (B2T(size_B2) + nr_Nl >= cart_cT)
+		cart_q = min(cart_q + dq, 2*cart_cB - T2B(size_T1));
+}
+
+static inline void __cart_q_dec(struct zone *zone, unsigned long dq)
+{
+	/* q = max(q - 1, c - |T1|) */
+	unsigned long target = cart_cB - T2B(size_T1);
+	if (cart_q <= target)
+		cart_q = target;
+	else if (cart_q >= dq)
+		cart_q -= dq;
+	else
+		cart_q = 0;
+}
+
+/*
+ * Insert page into @zones CART and update adaptive parameters.
+ *
+ * @zone: target zone.
+ * @page: new page.
+ */
+void __cart_insert(struct zone *zone, struct page *page)
+{
+	unsigned int rflags;
+	unsigned long ratio;
+
+	/*
+	 * Note: we could give hints to the insertion process using the LRU
+	 * specific PG_flags like: PG_active, PG_longterm and PG_referenced.
+	 */
+
+	rflags = nonresident_find(page_mapping(page), page_index(page));
+
+	if (rflags & NR_found) {
+		rflags &= NR_listid;
+		if (rflags == NR_b1) {
+			if (likely(size_B1)) --size_B1;
+
+			/* p = min(p + max(1, ns/|B1|), c) */
+			ratio = (nr_Ns / (B2T(size_B1) + 1)) ?: 1UL;
+			cart_p += ratio;
+			if (unlikely(cart_p > cart_cT))
+				cart_p = cart_cT;
+
+			SetPageLongTerm(page);
+		} else if (rflags == NR_b2) {
+			if (likely(size_B2)) --size_B2;
+
+			/* p = max(p - max(1, nl/|B2|), 0) */
+			ratio = (nr_Nl / (B2T(size_B2) + 1)) ?: 1UL;
+			if (cart_p >= ratio)
+				cart_p -= ratio;
+			else
+				cart_p = 0UL;
+
+			SetPageLongTerm(page);
+			__cart_q_inc(zone, 1);
+		}
+		/* ++nr_Nl; */
+	} else {
+		ClearPageLongTerm(page);
+		++nr_Ns;
+	}
+
+	SetPageNew(page);
+	ClearPageReferenced(page);
+	SetPageActive(page);
+	list_add_tail(&page->lru, list_T1);
+	++size_T1;
+	BUG_ON(!PageLRU(page));
+}
+
+#define head_to_page(list) list_entry((list)->next, struct page, lru)
+#define tail_to_page(list) list_entry((list)->prev, struct page, lru)
+
+#ifdef ARCH_HAS_PREFETCH
+#define prefetch_next_lru_page(_page, _base, _field)			\
+	do {								\
+		if ((_page)->lru.next != _base) {			\
+			struct page *next;				\
+			\
+			next = head_to_page(&(_page->lru));		\
+			prefetch(&next->_field);			\
+		}							\
+	} while (0)
+
+#define prefetch_prev_lru_page(_page, _base, _field)			\
+	do {								\
+		if ((_page)->lru.prev != _base) {			\
+			struct page *prev;				\
+			\
+			prev = tail_to_page(&(_page->lru));		\
+			prefetch(&prev->_field);			\
+		}							\
+	} while (0)
+#else
+#define prefetch_next_lru_page(_page, _base, _field) do { } while (0)
+#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+#ifdef ARCH_HAS_PREFETCHW
+#define prefetchw_next_lru_page(_page, _base, _field)			\
+	do {								\
+		if ((_page)->lru.next != _base) {			\
+			struct page *next;				\
+			\
+			next = head_to_page(&(_page->lru));		\
+			prefetchw(&next->_field);			\
+		}							\
+	} while (0)
+
+#define prefetchw_prev_lru_page(_page, _base, _field)			\
+	do {								\
+		if ((_page)->lru.prev != _base) {			\
+			struct page *prev;				\
+			\
+			prev = tail_to_page(&(_page->lru));		\
+			prefetchw(&prev->_field);			\
+		}							\
+	} while (0)
+#else
+#define prefetchw_next_lru_page(_page, _base, _field) do { } while (0)
+#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+/*
+ * Try to grab a specified number of pages of a list head.
+ * Ups the refcount on the grabbed pages.
+ *
+ * @dst: temp list to hold pages.
+ * @src: list to take pages from.
+ * @nr_dst: target number of pages to grab.
+ * @nr_scanned: counter that keeps the number of pages touched.
+ *
+ * returns how many pages were moved to @dst.
+ */
+static unsigned long cart_isolate(struct zone *zone, struct list_head *dst, struct list_head *src,
+				  unsigned long nr_dst, unsigned long *nr_scanned)
+{
+	struct page *page;
+	unsigned long nr_taken = 0, scan = 0;
+
+	spin_lock_irq(&zone->lru_lock);
+	while (scan++ < nr_dst && !list_empty(src)) {
+		page = head_to_page(src);
+		if (!get_page_testone(page)) {
+			list_move_tail(&page->lru, dst);
+			++nr_taken;
+		} else {
+			list_move_tail(&page->lru, src);
+			__put_page(page);
+		}
+	}
+	spin_unlock_irq(&zone->lru_lock);
+
+	*nr_scanned += scan;
+	return nr_taken;
+}
+
+/*
+ * Add page to a release pagevec, temp. drop zone lock to release pagevec if full.
+ *
+ * @zone: @pages zone
+ * @page: page to be released
+ * @pvec: pagevec to collect pages in
+ */
+static void __cart_page_release(struct zone *zone, struct page *page, struct pagevec *pvec)
+{
+	if (!pagevec_add(pvec, page)) {
+		spin_unlock_irq(&zone->lru_lock);
+		if (buffer_heads_over_limit)
+			pagevec_strip(pvec);
+		__pagevec_release(pvec);
+		spin_lock_irq(&zone->lru_lock);
+	}
+}
+
+/*
+ * Append list to head and releases pages.
+ *
+ * @zone: zone of pages.
+ * @list: temp list of pages (1 ref).
+ * @head: zone pagelist.
+ * @pvec: pagevec for release.
+ */
+static void __cart_list_splice_tail_release(struct zone *zone, struct list_head *list,
+					    struct list_head *head, struct pagevec *pvec)
+{
+	while (!list_empty(list)) {	/* head = { head, list } */
+		struct page *page = head_to_page(list);
+		list_move_tail(&page->lru, head);
+		__cart_page_release(zone, page, pvec);
+	}
+}
+
+static void __cart_list_splice_release(struct zone *zone, struct list_head *list,
+				       struct list_head *head, struct pagevec *pvec)
+{
+	while (!list_empty(list)) {	/* head = { list, head } */
+		struct page *page = tail_to_page(list);
+		list_move(&page->lru, head);
+		__cart_page_release(zone, page, pvec);
+	}
+}
+
+/*
+ * Rebalance the T2 list:
+ *  move referenced pages to T1s' tail,
+ *  take unreffed pages for reclaim.
+ *
+ * @zone: target zone
+ * @l_t2_head: temp list of reclaimable pages (1 ref)
+ * @nr_dst: quantum of pages
+ * @nr_scanned: counter that keeps the number of pages touched.
+ * @pvec: pagevec used to release pages
+ *
+ * returns whether there are pages on @l_t2_head
+ */
+static unsigned long cart_rebalance_T2(struct zone *zone, struct list_head *l_t2_head,
+				       unsigned long nr_dst, unsigned long *nr_scanned,
+				       struct pagevec *pvec)
+{
+	struct page *page;
+	LIST_HEAD(l_hold);
+	LIST_HEAD(l_t1);
+	unsigned long nr, dq;
+
+	do {
+		nr = cart_isolate(zone, &l_hold, list_T2, nr_dst, nr_scanned);
+		if (!nr)
+			break;
+
+		dq = 0;
+		while (!list_empty(&l_hold)) {
+			page = head_to_page(&l_hold);
+			prefetchw_next_lru_page(page, &l_hold, flags);
+
+			if (page_referenced(page, 0, 0)) {
+				list_move_tail(&page->lru, &l_t1);
+				SetPageActive(page);
+				++dq;
+			} else {
+				list_move_tail(&page->lru, l_t2_head);
+			}
+		}
+
+		spin_lock_irq(&zone->lru_lock);
+		__cart_list_splice_tail_release(zone, &l_t1, list_T1, pvec);
+		size_T2 -= dq;
+		size_T1 += dq;
+		__cart_q_inc(zone, dq);
+		spin_unlock_irq(&zone->lru_lock);
+	} while (list_empty(l_t2_head));
+
+	return !list_empty(l_t2_head);
+}
+
+/*
+ * Rebalance the T1 list:
+ *  move referenced pages to tail and possibly mark 'L',
+ *  move unreffed 'L' pages to tail T2,
+ *  take unreffed 'S' pages for reclaim.
+ *
+ * @zone: target zone
+ * @l_t1_head: temp list of reclaimable pages (1 ref)
+ * @nr_dst: quantum of pages
+ * @nr_scanned: counter that keeps the number of pages touched.
+ *
+ * returns whether there are pages on @l_t1_head
+ */
+static unsigned long cart_rebalance_T1(struct zone *zone, struct list_head *l_t1_head,
+				       unsigned long nr_dst, unsigned long *nr_scanned,
+				       struct pagevec *pvec)
+{
+	struct page *page;
+	LIST_HEAD(l_hold);
+	LIST_HEAD(l_t1);
+	LIST_HEAD(l_t2);
+	unsigned long nr, dq, dsl;
+	int new, referenced;
+
+	do {
+		nr = cart_isolate(zone, &l_hold, list_T1, nr_dst, nr_scanned);
+		if (!nr)
+			break;
+
+		dq = 0;
+		dsl = 0;
+		while (!list_empty(&l_hold)) {
+			page = head_to_page(&l_hold);
+			prefetchw_next_lru_page(page, &l_hold, flags);
+
+			referenced = page_referenced(page, 0, 0);
+			new = TestClearPageNew(page);
+
+			if (referenced) {
+				list_move_tail(&page->lru, &l_t1);
+				// XXX: we race a bit here; do we mind and put it under lru_lock?
+				/* ( |T1| >= min(p + 1, |B1|) ) and ( filter = 'S' ) */
+				if ((size_T1 - dq) >= min(cart_p + 1, B2T(size_B1)) &&
+				    !PageLongTerm(page) && !new) {
+					SetPageLongTerm(page);
+					++dsl;
+				}
+			} else if (PageLongTerm(page)) {
+				list_move_tail(&page->lru, &l_t2);
+				ClearPageActive(page);
+				++dq;
+			} else {
+				list_move_tail(&page->lru, l_t1_head);
+			}
+		}
+
+		spin_lock_irq(&zone->lru_lock);
+		__cart_list_splice_tail_release(zone, &l_t2, list_T2, pvec);
+		__cart_list_splice_tail_release(zone, &l_t1, list_T1, pvec);
+		nr_Ns -= dsl;
+		/* nr_Nl += dsl; */
+		size_T1 -= dq;
+		size_T2 += dq;
+		__cart_q_dec(zone, dq);
+		spin_unlock_irq(&zone->lru_lock);
+	} while (list_empty(l_t1_head));
+
+	return !list_empty(l_t1_head);
+}
+
+/*
+ * Try to reclaim a specified number of pages.
+ *
+ * Clears PG_lru of reclaimed pages, but does take 1 reference.
+ *
+ * @zone: target zone to reclaim pages from.
+ * @dst: temp list of reclaimable pages (1 ref).
+ * @nr_dst: target number of pages to reclaim.
+ * @nr_scanned: counter that keeps the number of pages touched.
+ *
+ * returns number of pages on @dst.
+ */
+unsigned long cart_replace(struct zone *zone, struct list_head *dst,
+			   unsigned long nr_dst, unsigned long *nr_scanned)
+{
+	struct page *page;
+	LIST_HEAD(l_t1);
+	LIST_HEAD(l_t2);
+	struct pagevec pvec;
+	unsigned long nr;
+	unsigned int nonres_total;
+
+	pagevec_init(&pvec, 1);
+
+	/*
+	 * Rebalance nonresident lists to zone size
+	 */
+	nonres_total = nonresident_total();
+
+	spin_lock_irq(&zone->lru_lock);
+	size_B1 = (nonresident_count(NR_b1) * cart_cB) / nonres_total;
+	size_B2 = (nonresident_count(NR_b2) * cart_cB) / nonres_total;
+	spin_unlock_irq(&zone->lru_lock);
+
+	for (nr = 0; nr < nr_dst;) {
+		if (list_empty(&l_t2))
+			cart_rebalance_T2(zone, &l_t2, nr_dst/2, nr_scanned, &pvec);
+		if (list_empty(&l_t1))
+			cart_rebalance_T1(zone, &l_t1, nr_dst/2, nr_scanned, &pvec);
+
+		if (list_empty(&l_t1) && list_empty(&l_t2))
+			break;
+
+		spin_lock_irq(&zone->lru_lock);
+		while (!list_empty(&l_t1) &&
+		       (size_T1 > cart_p || !size_T2) && nr < nr_dst) {
+			page = head_to_page(&l_t1);
+			prefetchw_next_lru_page(page, &l_t1, flags);
+
+			if (!TestClearPageLRU(page))
+				BUG();
+			if (PageLongTerm(page))
+				BUG();
+			--nr_Ns;
+			--size_T1;
+			++nr;
+			list_move(&page->lru, dst);
+		}
+		while (!list_empty(&l_t2) &&
+		       size_T1 <= cart_p && nr < nr_dst) {
+			page = head_to_page(&l_t2);
+			prefetchw_next_lru_page(page, &l_t2, flags);
+
+			if (!TestClearPageLRU(page))
+				BUG();
+			if (!PageLongTerm(page))
+				BUG();
+			/* --nr_Nl; */
+			--size_T2;
+			++nr;
+			list_move(&page->lru, dst);
+		}
+		spin_unlock_irq(&zone->lru_lock);
+
+		cond_resched();
+	}
+
+	spin_lock_irq(&zone->lru_lock);
+	__cart_list_splice_release(zone, &l_t1, list_T1, &pvec);
+	__cart_list_splice_release(zone, &l_t2, list_T2, &pvec);
+	spin_unlock_irq(&zone->lru_lock);
+	pagevec_release(&pvec);
+
+	return nr;
+}
+
+/*
+ * Re-insert pages that were elected for replacement but could not be freed,
+ * writeback has started agains them. Rotate as referenced to let the relaim
+ * path make progress.
+ *
+ * Assumes PG_lru is clear and will set for re-inserted pages.
+ * Assumes the pages have 1 reference taken and will return it.
+ *
+ * @zone: target zone for re-insertion.
+ * @list: list of left over pages.
+ */
+void cart_reinsert(struct zone *zone, struct list_head *list)
+{
+	struct pagevec pvec;
+
+	pagevec_init(&pvec, 1);
+	spin_lock_irq(&zone->lru_lock);
+	while (!list_empty(list))
+	{
+		struct page *page = tail_to_page(list);
+		prefetchw_prev_lru_page(page, list, flags);
+
+		if (TestSetPageLRU(page))
+			BUG();
+		if (!PageActive(page)) { /* T2 */
+			if (!PageLongTerm(page))
+				BUG();
+
+			SetPageActive(page);
+			list_move_tail(&page->lru, list_T1);
+			++size_T1;
+
+			__cart_q_inc(zone, 1);
+		} else { /* T1 */
+			if (!PageLongTerm(page))
+				++nr_Ns;
+			/* else ++nr_Nl; */
+
+			list_move_tail(&page->lru, list_T1);
+			++size_T1;
+
+			/* ( |T1| >= min(p + 1, |B1| ) and ( filter = 'S' ) */
+			/*
+			 * Don't modify page state; it wasn't actually referenced
+			 */
+		}
+		__cart_page_release(zone, page, &pvec);
+	}
+
+	spin_unlock_irq(&zone->lru_lock);
+	pagevec_release(&pvec);
+}
+
+/*
+ * Make page available for direct reclaim.
+ *
+ * @zone: page's zone
+ * @page: page
+ */
+void __cart_rotate_reclaimable(struct zone *zone, struct page *page)
+{
+	if (PageLongTerm(page)) {
+		if (TestClearPageActive(page)) {
+			--size_T1;
+			++size_T2;
+			__cart_q_dec(zone, 1);
+		}
+		list_move(&page->lru, list_T2);
+	} else {
+		if (!PageActive(page))
+			BUG();
+		list_move(&page->lru, list_T1);
+	}
+}
+
+/*
+ * Puts pages on the non-resident lists on swap-out
+ * XXX: lose the reliance on zone->lru_lock !!!
+ *
+ * @zone: dead pages zone.
+ * @page: dead page.
+ */
+void __cart_remember(struct zone *zone, struct page *page)
+{
+	int listid;
+	int listid_evict;
+
+	if (PageActive(page)) {
+		listid = NR_b1;
+		++size_B1;
+	} else {
+		listid = NR_b2;
+		++size_B2;
+	}
+
+	/*
+	 * Assume |B1| + |B2| == c + 1, since |B1_j| + |B2_j| := c_j.
+	 * The list_empty check is done on the Bn_j side.
+	 */
+	/* |B1| > max(0, q) */
+	if (size_B1 > cart_q)
+		listid_evict = NR_b1;
+	else
+		listid_evict = NR_b2;
+
+	listid_evict = nonresident_put(page_mapping(page),
+				       page_index(page),
+				       listid, listid_evict);
+
+	switch (listid_evict) {
+		case NR_b1:
+			if (likely(size_B1)) --size_B1;
+			break;
+
+		case NR_b2:
+			if (likely(size_B2)) --size_B2;
+			break;
+	}
+}

--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC][PATCH 4/7] CART Implementation v3
  2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
                   ` (2 preceding siblings ...)
  2005-09-11 20:25 ` [RFC][PATCH 3/7] " a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
  2005-09-11 20:25 ` [RFC][PATCH 5/7] " a.p.zijlstra
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
  To: linux-mm

[-- Attachment #1: cart-cart-r.patch --]
[-- Type: text/plain, Size: 4094 bytes --]

Index: linux-2.6-git/mm/cart.c
===================================================================
--- linux-2.6-git.orig/mm/cart.c
+++ linux-2.6-git/mm/cart.c
@@ -63,6 +63,7 @@
 
 #define cart_p ((zone)->nr_p)
 #define cart_q ((zone)->nr_q)
+#define cart_r ((zone)->nr_r)
 
 #define size_B1 ((zone)->nr_evicted_active)
 #define size_B2 ((zone)->nr_evicted_inactive)
@@ -81,6 +82,7 @@ void __init cart_init(void)
 		zone->nr_shortterm = 0;
 		zone->nr_p = 0;
 		zone->nr_q = 0;
+		zone->nr_r = 0;
 	}
 }
 
@@ -123,6 +125,12 @@ void __cart_insert(struct zone *zone, st
 	rflags = nonresident_find(page_mapping(page), page_index(page));
 
 	if (rflags & NR_found) {
+		ratio = (nr_Nl / (nr_Ns + 1)) ?: 1;
+		if (cart_r > ratio)
+			cart_r -= ratio;
+		else
+			cart_r = 0UL;
+
 		rflags &= NR_listid;
 		if (rflags == NR_b1) {
 			if (likely(size_B1)) --size_B1;
@@ -149,6 +157,11 @@ void __cart_insert(struct zone *zone, st
 		}
 		/* ++nr_Nl; */
 	} else {
+		ratio = (nr_Ns / (nr_Nl + 1)) ?: 1;
+		cart_r += ratio;
+		if (cart_r > cart_cT)
+			cart_r = cart_cT;
+
 		ClearPageLongTerm(page);
 		++nr_Ns;
 	}
@@ -360,6 +373,7 @@ static unsigned long cart_rebalance_T2(s
  * returns whether there are pages on @l_t1_head
  */
 static unsigned long cart_rebalance_T1(struct zone *zone, struct list_head *l_t1_head,
+				       struct list_head *l_new,
 				       unsigned long nr_dst, unsigned long *nr_scanned,
 				       struct pagevec *pvec)
 {
@@ -384,7 +398,11 @@ static unsigned long cart_rebalance_T1(s
 			referenced = page_referenced(page, 0, 0);
 			new = TestClearPageNew(page);
 
-			if (referenced) {
+			if (cart_r < nr_Nl && PageLongTerm(page) && new) {
+				list_move_tail(&page->lru, l_new);
+				ClearPageActive(page);
+				++dq;
+			} else if (referenced) {
 				list_move_tail(&page->lru, &l_t1);
 				// XXX: we race a bit here; do we mind and put it under lru_lock?
 				/* ( |T1| >= min(p + 1, |B1|) ) and ( filter = 'S' ) */
@@ -432,6 +450,7 @@ unsigned long cart_replace(struct zone *
 			   unsigned long nr_dst, unsigned long *nr_scanned)
 {
 	struct page *page;
+	LIST_HEAD(l_new);
 	LIST_HEAD(l_t1);
 	LIST_HEAD(l_t2);
 	struct pagevec pvec;
@@ -454,12 +473,24 @@ unsigned long cart_replace(struct zone *
 		if (list_empty(&l_t2))
 			cart_rebalance_T2(zone, &l_t2, nr_dst/2, nr_scanned, &pvec);
 		if (list_empty(&l_t1))
-			cart_rebalance_T1(zone, &l_t1, nr_dst/2, nr_scanned, &pvec);
+			cart_rebalance_T1(zone, &l_t1, &l_new, nr_dst/2, nr_scanned, &pvec);
 
 		if (list_empty(&l_t1) && list_empty(&l_t2))
 			break;
 
 		spin_lock_irq(&zone->lru_lock);
+		while (!list_empty(&l_new) && nr < nr_dst) {
+			page = head_to_page(&l_new);
+			prefetchw_next_lru_page(page, &l_new, flags);
+
+			if (!TestClearPageLRU(page))
+				BUG();
+			if (!PageLongTerm(page))
+				BUG();
+			--size_T2;
+			++nr;
+			list_move(&page->lru, dst);
+		}
 		while (!list_empty(&l_t1) &&
 		       (size_T1 > cart_p || !size_T2) && nr < nr_dst) {
 			page = head_to_page(&l_t1);
@@ -496,6 +527,7 @@ unsigned long cart_replace(struct zone *
 	spin_lock_irq(&zone->lru_lock);
 	__cart_list_splice_release(zone, &l_t1, list_T1, &pvec);
 	__cart_list_splice_release(zone, &l_t2, list_T2, &pvec);
+	__cart_list_splice_release(zone, &l_new, list_T2, &pvec);
 	spin_unlock_irq(&zone->lru_lock);
 	pagevec_release(&pvec);
 
Index: linux-2.6-git/include/linux/mmzone.h
===================================================================
--- linux-2.6-git.orig/include/linux/mmzone.h
+++ linux-2.6-git/include/linux/mmzone.h
@@ -155,6 +155,7 @@ struct zone {
 	unsigned long 		nr_shortterm;	/* number of short term pages */
 	unsigned long		nr_p;		/* p from the CART paper */
 	unsigned long 		nr_q;		/* q from the cart paper */
+	unsigned long		nr_r;
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	int			all_unreclaimable; /* All pages pinned */
 

--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC][PATCH 5/7] CART Implementation v3
  2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
                   ` (3 preceding siblings ...)
  2005-09-11 20:25 ` [RFC][PATCH 4/7] " a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
  2005-09-11 20:25 ` [RFC][PATCH 6/7] " a.p.zijlstra
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
  To: linux-mm

[-- Attachment #1: cart-cart-stats.patch --]
[-- Type: text/plain, Size: 4590 bytes --]

Index: linux-2.6-git/fs/proc/proc_misc.c
===================================================================
--- linux-2.6-git.orig/fs/proc/proc_misc.c
+++ linux-2.6-git/fs/proc/proc_misc.c
@@ -233,6 +233,20 @@ static struct file_operations proc_zonei
 	.release	= seq_release,
 };
 
+extern struct seq_operations cart_op;
+static int cart_open(struct inode *inode, struct file *file)
+{
+       (void)inode;
+       return seq_open(file, &cart_op);
+}
+
+static struct file_operations cart_file_operations = {
+       .open           = cart_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+};
+
 extern struct seq_operations nonresident_op;
 static int nonresident_open(struct inode *inode, struct file *file)
 {
@@ -616,6 +630,7 @@ void __init proc_misc_init(void)
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
 	create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
+	create_seq_entry("cart",S_IRUGO, &cart_file_operations);
 	create_seq_entry("nonresident",S_IRUGO, &nonresident_file_operations);
 	create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
 	create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations);
Index: linux-2.6-git/mm/cart.c
===================================================================
--- linux-2.6-git.orig/mm/cart.c
+++ linux-2.6-git/mm/cart.c
@@ -653,3 +653,96 @@ void __cart_remember(struct zone *zone, 
 			break;
 	}
 }
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+
+static void *stats_start(struct seq_file *m, loff_t *pos)
+{
+	if (*pos != 0)
+		return NULL;
+
+	lru_add_drain();
+
+	return pos;
+}
+
+static void *stats_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	return NULL;
+}
+
+static void stats_stop(struct seq_file *m, void *arg)
+{
+}
+
+static int stats_show(struct seq_file *m, void *arg)
+{
+	struct zone *zone;
+	for_each_zone(zone) {
+		spin_lock_irq(&zone->lru_lock);
+		seq_printf(m, "\n\n======> zone: %lu <=====\n", (unsigned long)zone);
+		seq_printf(m, "struct zone values:\n");
+		seq_printf(m, "  zone->nr_active: %lu\n", zone->nr_active);
+		seq_printf(m, "  zone->nr_inactive: %lu\n", zone->nr_inactive);
+		seq_printf(m, "  zone->nr_evicted_active: %lu\n", zone->nr_evicted_active);
+		seq_printf(m, "  zone->nr_evicted_inactive: %lu\n", zone->nr_evicted_inactive);
+		seq_printf(m, "  zone->nr_shortterm: %lu\n", zone->nr_shortterm);
+		seq_printf(m, "  zone->cart_p: %lu\n", zone->nr_p);
+		seq_printf(m, "  zone->cart_q: %lu\n", zone->nr_q);
+		seq_printf(m, "  zone->cart_r: %lu\n", zone->nr_r);
+		seq_printf(m, "  zone->present_pages: %lu\n", zone->present_pages);
+		seq_printf(m, "  zone->free_pages: %lu\n", zone->free_pages);
+		seq_printf(m, "  zone->pages_min: %lu\n", zone->pages_min);
+		seq_printf(m, "  zone->pages_low: %lu\n", zone->pages_low);
+		seq_printf(m, "  zone->pages_high: %lu\n", zone->pages_high);
+
+		seq_printf(m, "\n");
+		seq_printf(m, "implicit values:\n");
+		seq_printf(m, "  zone->nr_longterm: %lu\n", nr_Nl);
+		seq_printf(m, "  zone->cart_cT: %lu\n", cart_cT);
+		seq_printf(m, "  zone->cart_cB: %lu\n", cart_cB);
+
+		seq_printf(m, "\n");
+		seq_printf(m, "counted values:\n");
+
+		{
+			struct page *page;
+			unsigned long active = 0, s1 = 0, l1 = 0;
+			unsigned long inactive = 0, s2 = 0, l2 = 0;
+			unsigned long a1=0,i1=0,a2=0,i2=0;
+			list_for_each_entry(page, &zone->active_list, lru) {
+				++active;
+				if (PageLongTerm(page)) ++l1;
+				else ++s1;
+				if (PageActive(page)) ++a1;
+				else ++i1;
+			}
+			list_for_each_entry(page, &zone->inactive_list, lru) {
+				++inactive;
+				if (PageLongTerm(page)) ++l2;
+				else ++s2;
+				if (PageActive(page)) ++a2;
+				else ++i2;
+			}
+			seq_printf(m, "  zone->nr_active: %lu (%lu, %lu)(%lu, %lu)\n", active, s1, l1, a1, i1);
+			seq_printf(m, "  zone->nr_inactive: %lu (%lu, %lu)(%lu, %lu)\n", inactive, s2, l2, a2, i2);
+			seq_printf(m, "  zone->nr_shortterm: %lu\n", s1+s2);
+			seq_printf(m, "  zone->nr_longterm: %lu\n", l1+l2);
+		}
+
+		spin_unlock_irq(&zone->lru_lock);
+	}
+
+	return 0;
+}
+
+struct seq_operations cart_op = {
+	.start = stats_start,
+	.next = stats_next,
+	.stop = stats_stop,
+	.show = stats_show,
+};
+
+#endif /* CONFIG_PROC_FS */

--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC][PATCH 6/7] CART Implementation v3
  2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
                   ` (4 preceding siblings ...)
  2005-09-11 20:25 ` [RFC][PATCH 5/7] " a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
  2005-09-11 20:25 ` [RFC][PATCH 7/7] " a.p.zijlstra
  2005-09-11 23:05 ` [RFC][PATCH 0/7] " Peter Zijlstra
  7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
  To: linux-mm

[-- Attachment #1: cart-use-once.patch --]
[-- Type: text/plain, Size: 4902 bytes --]

Index: linux-2.6-git/mm/filemap.c
===================================================================
--- linux-2.6-git.orig/mm/filemap.c
+++ linux-2.6-git/mm/filemap.c
@@ -723,7 +723,6 @@ void do_generic_mapping_read(struct addr
 	unsigned long offset;
 	unsigned long last_index;
 	unsigned long next_index;
-	unsigned long prev_index;
 	loff_t isize;
 	struct page *cached_page;
 	int error;
@@ -732,7 +731,6 @@ void do_generic_mapping_read(struct addr
 	cached_page = NULL;
 	index = *ppos >> PAGE_CACHE_SHIFT;
 	next_index = index;
-	prev_index = ra.prev_page;
 	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
 	offset = *ppos & ~PAGE_CACHE_MASK;
 
@@ -779,13 +777,7 @@ page_ok:
 		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
 
-		/*
-		 * When (part of) the same page is read multiple times
-		 * in succession, only mark it as accessed the first time.
-		 */
-		if (prev_index != index)
-			mark_page_accessed(page);
-		prev_index = index;
+		mark_page_accessed(page);
 
 		/*
 		 * Ok, we have the page, and it's up-to-date, so
Index: linux-2.6-git/mm/shmem.c
===================================================================
--- linux-2.6-git.orig/mm/shmem.c
+++ linux-2.6-git/mm/shmem.c
@@ -1500,11 +1500,8 @@ static void do_shmem_file_read(struct fi
 			 */
 			if (mapping_writably_mapped(mapping))
 				flush_dcache_page(page);
-			/*
-			 * Mark the page accessed if we read the beginning.
-			 */
-			if (!offset)
-				mark_page_accessed(page);
+
+			mark_page_accessed(page);
 		} else
 			page = ZERO_PAGE(0);
 
Index: linux-2.6-git/mm/swap.c
===================================================================
--- linux-2.6-git.orig/mm/swap.c
+++ linux-2.6-git/mm/swap.c
@@ -97,37 +97,12 @@ int rotate_reclaimable_page(struct page 
 }
 
 /*
- * FIXME: speed this up?
- */
-void fastcall activate_page(struct page *page)
-{
-	struct zone *zone = page_zone(page);
-
-	spin_lock_irq(&zone->lru_lock);
-	if (PageLRU(page) && !PageActive(page)) {
-		del_page_from_inactive_list(zone, page);
-		SetPageActive(page);
-		add_page_to_active_list(zone, page);
-		inc_page_state(pgactivate);
-	}
-	spin_unlock_irq(&zone->lru_lock);
-}
-
-/*
  * Mark a page as having seen activity.
- *
- * inactive,unreferenced	->	inactive,referenced
- * inactive,referenced		->	active,unreferenced
- * active,unreferenced		->	active,referenced
  */
 void fastcall mark_page_accessed(struct page *page)
 {
-	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
-		activate_page(page);
-		ClearPageReferenced(page);
-	} else if (!PageReferenced(page)) {
+	if (!PageReferenced(page))
 		SetPageReferenced(page);
-	}
 }
 
 EXPORT_SYMBOL(mark_page_accessed);
Index: linux-2.6-git/mm/swapfile.c
===================================================================
--- linux-2.6-git.orig/mm/swapfile.c
+++ linux-2.6-git/mm/swapfile.c
@@ -408,7 +408,7 @@ static void unuse_pte(struct vm_area_str
 	 * Move the page to the active list so it is not
 	 * immediately swapped out again after swapon.
 	 */
-	activate_page(page);
+	SetPageReferenced(page);
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -508,7 +508,7 @@ static int unuse_mm(struct mm_struct *mm
 		 * Activate page so shrink_cache is unlikely to unmap its
 		 * ptes while lock is dropped, so swapoff can make progress.
 		 */
-		activate_page(page);
+		SetPageReferenced(page);
 		unlock_page(page);
 		down_read(&mm->mmap_sem);
 		lock_page(page);
Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c
+++ linux-2.6-git/mm/vmscan.c
@@ -235,27 +235,6 @@ static int shrink_slab(unsigned long sca
 	return ret;
 }
 
-/* Called without lock on whether page is mapped, so answer is unstable */
-static inline int page_mapping_inuse(struct page *page)
-{
-	struct address_space *mapping;
-
-	/* Page is in somebody's page tables. */
-	if (page_mapped(page))
-		return 1;
-
-	/* Be more reluctant to reclaim swapcache than pagecache */
-	if (PageSwapCache(page))
-		return 1;
-
-	mapping = page_mapping(page);
-	if (!mapping)
-		return 0;
-
-	/* File is mmap'd by somebody? */
-	return mapping_mapped(mapping);
-}
-
 static inline int is_page_cache_freeable(struct page *page)
 {
 	return page_count(page) - !!PagePrivate(page) == 2;
@@ -408,8 +387,8 @@ static int shrink_list(struct list_head 
 			goto keep_locked;
 
 		referenced = page_referenced(page, 1, sc->priority <= 0);
-		/* In active use or really unfreeable?  Activate it. */
-		if (referenced && page_mapping_inuse(page))
+
+		if (referenced)
 			goto activate_locked;
 
 #ifdef CONFIG_SWAP

--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC][PATCH 7/7] CART Implementation v3
  2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
                   ` (5 preceding siblings ...)
  2005-09-11 20:25 ` [RFC][PATCH 6/7] " a.p.zijlstra
@ 2005-09-11 20:25 ` a.p.zijlstra
  2005-09-11 23:05 ` [RFC][PATCH 0/7] " Peter Zijlstra
  7 siblings, 0 replies; 9+ messages in thread
From: a.p.zijlstra @ 2005-09-11 20:25 UTC (permalink / raw)
  To: linux-mm

[-- Attachment #1: cart-use-cart.patch --]
[-- Type: text/plain, Size: 15757 bytes --]

Index: linux-2.6-git/fs/exec.c
===================================================================
--- linux-2.6-git.orig/fs/exec.c
+++ linux-2.6-git/fs/exec.c
@@ -331,7 +331,7 @@ void install_arg_page(struct vm_area_str
 		goto out;
 	}
 	inc_mm_counter(mm, rss);
-	lru_cache_add_active(page);
+	lru_cache_add(page);
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
 	page_add_anon_rmap(page, vma, address);
Index: linux-2.6-git/include/linux/swap.h
===================================================================
--- linux-2.6-git.orig/include/linux/swap.h
+++ linux-2.6-git/include/linux/swap.h
@@ -199,8 +199,6 @@ extern unsigned int nr_free_pagecache_pa
 
 /* linux/mm/swap.c */
 extern void FASTCALL(lru_cache_add(struct page *));
-extern void FASTCALL(lru_cache_add_active(struct page *));
-extern void FASTCALL(activate_page(struct page *));
 extern void FASTCALL(mark_page_accessed(struct page *));
 extern void lru_add_drain(void);
 extern int rotate_reclaimable_page(struct page *page);
Index: linux-2.6-git/mm/memory.c
===================================================================
--- linux-2.6-git.orig/mm/memory.c
+++ linux-2.6-git/mm/memory.c
@@ -1304,7 +1304,7 @@ static int do_wp_page(struct mm_struct *
 			page_remove_rmap(old_page);
 		flush_cache_page(vma, address, pfn);
 		break_cow(vma, new_page, address, page_table);
-		lru_cache_add_active(new_page);
+		lru_cache_add(new_page);
 		page_add_anon_rmap(new_page, vma, address);
 
 		/* Free the old page.. */
@@ -1782,8 +1782,7 @@ do_anonymous_page(struct mm_struct *mm, 
 		entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
 							 vma->vm_page_prot)),
 				      vma);
-		lru_cache_add_active(page);
-		SetPageReferenced(page);
+		lru_cache_add(page);
 		page_add_anon_rmap(page, vma, addr);
 	}
 
@@ -1903,7 +1902,7 @@ retry:
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
-			lru_cache_add_active(new_page);
+			lru_cache_add(new_page);
 			page_add_anon_rmap(new_page, vma, address);
 		} else
 			page_add_file_rmap(new_page);
Index: linux-2.6-git/mm/swap.c
===================================================================
--- linux-2.6-git.orig/mm/swap.c
+++ linux-2.6-git/mm/swap.c
@@ -78,16 +78,13 @@ int rotate_reclaimable_page(struct page 
 		return 1;
 	if (PageDirty(page))
 		return 1;
-	if (PageActive(page))
-		return 1;
 	if (!PageLRU(page))
 		return 1;
 
 	zone = page_zone(page);
 	spin_lock_irqsave(&zone->lru_lock, flags);
-	if (PageLRU(page) && !PageActive(page)) {
-		list_del(&page->lru);
-		list_add_tail(&page->lru, &zone->inactive_list);
+	if (PageLRU(page)) {
+		__cart_rotate_reclaimable(zone, page);
 		inc_page_state(pgrotated);
 	}
 	if (!test_clear_page_writeback(page))
@@ -112,7 +109,6 @@ EXPORT_SYMBOL(mark_page_accessed);
  * @page: the page to add
  */
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
-static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
 
 void fastcall lru_cache_add(struct page *page)
 {
@@ -124,25 +120,12 @@ void fastcall lru_cache_add(struct page 
 	put_cpu_var(lru_add_pvecs);
 }
 
-void fastcall lru_cache_add_active(struct page *page)
-{
-	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
-
-	page_cache_get(page);
-	if (!pagevec_add(pvec, page))
-		__pagevec_lru_add_active(pvec);
-	put_cpu_var(lru_add_active_pvecs);
-}
-
 void lru_add_drain(void)
 {
 	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
 
 	if (pagevec_count(pvec))
 		__pagevec_lru_add(pvec);
-	pvec = &__get_cpu_var(lru_add_active_pvecs);
-	if (pagevec_count(pvec))
-		__pagevec_lru_add_active(pvec);
 	put_cpu_var(lru_add_pvecs);
 }
 
@@ -278,7 +261,9 @@ void __pagevec_lru_add(struct pagevec *p
 		}
 		if (TestSetPageLRU(page))
 			BUG();
-		add_page_to_inactive_list(zone, page);
+		if (TestClearPageActive(page))
+			BUG();
+		__cart_insert(zone, page);
 	}
 	if (zone)
 		spin_unlock_irq(&zone->lru_lock);
@@ -288,33 +273,6 @@ void __pagevec_lru_add(struct pagevec *p
 
 EXPORT_SYMBOL(__pagevec_lru_add);
 
-void __pagevec_lru_add_active(struct pagevec *pvec)
-{
-	int i;
-	struct zone *zone = NULL;
-
-	for (i = 0; i < pagevec_count(pvec); i++) {
-		struct page *page = pvec->pages[i];
-		struct zone *pagezone = page_zone(page);
-
-		if (pagezone != zone) {
-			if (zone)
-				spin_unlock_irq(&zone->lru_lock);
-			zone = pagezone;
-			spin_lock_irq(&zone->lru_lock);
-		}
-		if (TestSetPageLRU(page))
-			BUG();
-		if (TestSetPageActive(page))
-			BUG();
-		add_page_to_active_list(zone, page);
-	}
-	if (zone)
-		spin_unlock_irq(&zone->lru_lock);
-	release_pages(pvec->pages, pvec->nr, pvec->cold);
-	pagevec_reinit(pvec);
-}
-
 /*
  * Try to drop buffers from the pages in a pagevec
  */
@@ -396,9 +354,6 @@ static void lru_drain_cache(unsigned int
 	/* CPU is dead, so no locking needed. */
 	if (pagevec_count(pvec))
 		__pagevec_lru_add(pvec);
-	pvec = &per_cpu(lru_add_active_pvecs, cpu);
-	if (pagevec_count(pvec))
-		__pagevec_lru_add_active(pvec);
 }
 
 /* Drop the CPU's cached committed space back into the central pool. */
Index: linux-2.6-git/mm/swap_state.c
===================================================================
--- linux-2.6-git.orig/mm/swap_state.c
+++ linux-2.6-git/mm/swap_state.c
@@ -359,7 +359,7 @@ struct page *read_swap_cache_async(swp_e
 			/*
 			 * Initiate read into locked page and return.
 			 */
-			lru_cache_add_active(new_page);
+			lru_cache_add(new_page);
 			swap_readpage(NULL, new_page);
 			return new_page;
 		}
Index: linux-2.6-git/mm/vmscan.c
===================================================================
--- linux-2.6-git.orig/mm/vmscan.c
+++ linux-2.6-git/mm/vmscan.c
@@ -376,8 +376,6 @@ static int shrink_list(struct list_head 
 		if (TestSetPageLocked(page))
 			goto keep;
 
-		BUG_ON(PageActive(page));
-
 		sc->nr_scanned++;
 		/* Double the slab pressure for mapped and swapcache pages */
 		if (page_mapped(page) || PageSwapCache(page))
@@ -498,6 +496,7 @@ static int shrink_list(struct list_head 
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
 			swp_entry_t swap = { .val = page->private };
+			cart_remember(page);
 			__delete_from_swap_cache(page);
 			write_unlock_irq(&mapping->tree_lock);
 			swap_free(swap);
@@ -506,11 +505,13 @@ static int shrink_list(struct list_head 
 		}
 #endif /* CONFIG_SWAP */
 
+		cart_remember(page);
 		__remove_from_page_cache(page);
 		write_unlock_irq(&mapping->tree_lock);
 		__put_page(page);
 
 free_it:
+		ClearPageActive(page);
 		unlock_page(page);
 		reclaimed++;
 		if (!pagevec_add(&freed_pvec, page))
@@ -535,55 +536,6 @@ keep:
 }
 
 /*
- * zone->lru_lock is heavily contended.  Some of the functions that
- * shrink the lists perform better by taking out a batch of pages
- * and working on them outside the LRU lock.
- *
- * For pagecache intensive workloads, this function is the hottest
- * spot in the kernel (apart from copy_*_user functions).
- *
- * Appropriate locks must be held before calling this function.
- *
- * @nr_to_scan:	The number of pages to look through on the list.
- * @src:	The LRU list to pull pages off.
- * @dst:	The temp list to put pages on to.
- * @scanned:	The number of pages that were scanned.
- *
- * returns how many pages were moved onto *@dst.
- */
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
-			     struct list_head *dst, int *scanned)
-{
-	int nr_taken = 0;
-	struct page *page;
-	int scan = 0;
-
-	while (scan++ < nr_to_scan && !list_empty(src)) {
-		page = lru_to_page(src);
-		prefetchw_prev_lru_page(page, src, flags);
-
-		if (!TestClearPageLRU(page))
-			BUG();
-		list_del(&page->lru);
-		if (get_page_testone(page)) {
-			/*
-			 * It is being freed elsewhere
-			 */
-			__put_page(page);
-			SetPageLRU(page);
-			list_add(&page->lru, src);
-			continue;
-		} else {
-			list_add(&page->lru, dst);
-			nr_taken++;
-		}
-	}
-
-	*scanned = scan;
-	return nr_taken;
-}
-
-/*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
 static void shrink_cache(struct zone *zone, struct scan_control *sc)
@@ -595,19 +547,15 @@ static void shrink_cache(struct zone *zo
 	pagevec_init(&pvec, 1);
 
 	lru_add_drain();
-	spin_lock_irq(&zone->lru_lock);
 	while (max_scan > 0) {
-		struct page *page;
 		int nr_taken;
-		int nr_scan;
+		unsigned long nr_scan = 0;
 		int nr_freed;
 
-		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-					     &zone->inactive_list,
-					     &page_list, &nr_scan);
-		zone->nr_inactive -= nr_taken;
+		nr_taken = cart_replace(zone, &page_list, sc->swap_cluster_max,
+					&nr_scan);
+		/* we race a bit, do we care? */
 		zone->pages_scanned += nr_scan;
-		spin_unlock_irq(&zone->lru_lock);
 
 		if (nr_taken == 0)
 			goto done;
@@ -623,220 +571,44 @@ static void shrink_cache(struct zone *zo
 		mod_page_state_zone(zone, pgsteal, nr_freed);
 		sc->nr_to_reclaim -= nr_freed;
 
-		spin_lock_irq(&zone->lru_lock);
 		/*
 		 * Put back any unfreeable pages.
 		 */
-		while (!list_empty(&page_list)) {
-			page = lru_to_page(&page_list);
-			if (TestSetPageLRU(page))
-				BUG();
-			list_del(&page->lru);
-			if (PageActive(page))
-				add_page_to_active_list(zone, page);
-			else
-				add_page_to_inactive_list(zone, page);
-			if (!pagevec_add(&pvec, page)) {
-				spin_unlock_irq(&zone->lru_lock);
-				__pagevec_release(&pvec);
-				spin_lock_irq(&zone->lru_lock);
-			}
-		}
+		cart_reinsert(zone, &page_list);
   	}
-	spin_unlock_irq(&zone->lru_lock);
 done:
 	pagevec_release(&pvec);
 }
 
 /*
- * This moves pages from the active list to the inactive list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone->lru_lock across the whole operation.  But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone->lru_lock around each page.  It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_count against each page.
- * But we had to alter page->flags anyway.
- */
-static void
-refill_inactive_zone(struct zone *zone, struct scan_control *sc)
-{
-	int pgmoved;
-	int pgdeactivate = 0;
-	int pgscanned;
-	int nr_pages = sc->nr_to_scan;
-	LIST_HEAD(l_hold);	/* The pages which were snipped off */
-	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
-	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
-	struct page *page;
-	struct pagevec pvec;
-	int reclaim_mapped = 0;
-	long mapped_ratio;
-	long distress;
-	long swap_tendency;
-
-	lru_add_drain();
-	spin_lock_irq(&zone->lru_lock);
-	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
-				    &l_hold, &pgscanned);
-	zone->pages_scanned += pgscanned;
-	zone->nr_active -= pgmoved;
-	spin_unlock_irq(&zone->lru_lock);
-
-	/*
-	 * `distress' is a measure of how much trouble we're having reclaiming
-	 * pages.  0 -> no problems.  100 -> great trouble.
-	 */
-	distress = 100 >> zone->prev_priority;
-
-	/*
-	 * The point of this algorithm is to decide when to start reclaiming
-	 * mapped memory instead of just pagecache.  Work out how much memory
-	 * is mapped.
-	 */
-	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
-	/*
-	 * Now decide how much we really want to unmap some pages.  The mapped
-	 * ratio is downgraded - just because there's a lot of mapped memory
-	 * doesn't necessarily mean that page reclaim isn't succeeding.
-	 *
-	 * The distress ratio is important - we don't want to start going oom.
-	 *
-	 * A 100% value of vm_swappiness overrides this algorithm altogether.
-	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
-	/*
-	 * Now use this metric to decide whether to start moving mapped memory
-	 * onto the inactive list.
-	 */
-	if (swap_tendency >= 100)
-		reclaim_mapped = 1;
-
-	while (!list_empty(&l_hold)) {
-		cond_resched();
-		page = lru_to_page(&l_hold);
-		list_del(&page->lru);
-		if (page_mapped(page)) {
-			if (!reclaim_mapped ||
-			    (total_swap_pages == 0 && PageAnon(page)) ||
-			    page_referenced(page, 0, sc->priority <= 0)) {
-				list_add(&page->lru, &l_active);
-				continue;
-			}
-		}
-		list_add(&page->lru, &l_inactive);
-	}
-
-	pagevec_init(&pvec, 1);
-	pgmoved = 0;
-	spin_lock_irq(&zone->lru_lock);
-	while (!list_empty(&l_inactive)) {
-		page = lru_to_page(&l_inactive);
-		prefetchw_prev_lru_page(page, &l_inactive, flags);
-		if (TestSetPageLRU(page))
-			BUG();
-		if (!TestClearPageActive(page))
-			BUG();
-		list_move(&page->lru, &zone->inactive_list);
-		pgmoved++;
-		if (!pagevec_add(&pvec, page)) {
-			zone->nr_inactive += pgmoved;
-			spin_unlock_irq(&zone->lru_lock);
-			pgdeactivate += pgmoved;
-			pgmoved = 0;
-			if (buffer_heads_over_limit)
-				pagevec_strip(&pvec);
-			__pagevec_release(&pvec);
-			spin_lock_irq(&zone->lru_lock);
-		}
-	}
-	zone->nr_inactive += pgmoved;
-	pgdeactivate += pgmoved;
-	if (buffer_heads_over_limit) {
-		spin_unlock_irq(&zone->lru_lock);
-		pagevec_strip(&pvec);
-		spin_lock_irq(&zone->lru_lock);
-	}
-
-	pgmoved = 0;
-	while (!list_empty(&l_active)) {
-		page = lru_to_page(&l_active);
-		prefetchw_prev_lru_page(page, &l_active, flags);
-		if (TestSetPageLRU(page))
-			BUG();
-		BUG_ON(!PageActive(page));
-		list_move(&page->lru, &zone->active_list);
-		pgmoved++;
-		if (!pagevec_add(&pvec, page)) {
-			zone->nr_active += pgmoved;
-			pgmoved = 0;
-			spin_unlock_irq(&zone->lru_lock);
-			__pagevec_release(&pvec);
-			spin_lock_irq(&zone->lru_lock);
-		}
-	}
-	zone->nr_active += pgmoved;
-	spin_unlock_irq(&zone->lru_lock);
-	pagevec_release(&pvec);
-
-	mod_page_state_zone(zone, pgrefill, pgscanned);
-	mod_page_state(pgdeactivate, pgdeactivate);
-}
-
-/*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
 static void
 shrink_zone(struct zone *zone, struct scan_control *sc)
 {
 	unsigned long nr_active;
-	unsigned long nr_inactive;
 
 	/*
 	 * Add one to `nr_to_scan' just to make sure that the kernel will
 	 * slowly sift through the active list.
 	 */
-	zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
+	zone->nr_scan_active += ((zone->nr_active + zone->nr_inactive) >> sc->priority) + 1;
 	nr_active = zone->nr_scan_active;
 	if (nr_active >= sc->swap_cluster_max)
 		zone->nr_scan_active = 0;
 	else
 		nr_active = 0;
 
-	zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
-	nr_inactive = zone->nr_scan_inactive;
-	if (nr_inactive >= sc->swap_cluster_max)
-		zone->nr_scan_inactive = 0;
-	else
-		nr_inactive = 0;
 
 	sc->nr_to_reclaim = sc->swap_cluster_max;
 
-	while (nr_active || nr_inactive) {
-		if (nr_active) {
-			sc->nr_to_scan = min(nr_active,
-					(unsigned long)sc->swap_cluster_max);
-			nr_active -= sc->nr_to_scan;
-			refill_inactive_zone(zone, sc);
-		}
-
-		if (nr_inactive) {
-			sc->nr_to_scan = min(nr_inactive,
-					(unsigned long)sc->swap_cluster_max);
-			nr_inactive -= sc->nr_to_scan;
-			shrink_cache(zone, sc);
-			if (sc->nr_to_reclaim <= 0)
-				break;
-		}
+	while (nr_active) {
+		sc->nr_to_scan = min(nr_active,
+				     (unsigned long)sc->swap_cluster_max);
+		nr_active -= sc->nr_to_scan;
+		shrink_cache(zone, sc);
+		if (sc->nr_to_reclaim <= 0)
+			break;
 	}
 
 	throttle_vm_writeout();

--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC][PATCH 0/7] CART Implementation v3
  2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
                   ` (6 preceding siblings ...)
  2005-09-11 20:25 ` [RFC][PATCH 7/7] " a.p.zijlstra
@ 2005-09-11 23:05 ` Peter Zijlstra
  7 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2005-09-11 23:05 UTC (permalink / raw)
  To: linux-mm

[-- Attachment #1: Type: text/plain, Size: 1225 bytes --]

On Sun, 2005-09-11 at 22:25 +0200, a.p.zijlstra@chello.nl wrote:
> Hi All,
> 
> Here my latest efforts on implementing CART, an advanced page replacement 
> policy.
> 
> It seems pretty stable, except for a spurious OOM. However it yet has to
> run on something other than UML.
> 
> A complete CART implementation should be present in cart-cart.patch. 
> The cart-cart-r.patch improves thereon by keeping a 3th adaptive parameter
> which measures the amount of fresh pages (not in |T1| u |T2| u |B1| u |B2|).
> When the amount of fresh pages drops below the number of longterm pages
> we start to reclaim pages that have just been inserted.
> 
> This works very well for a simple looped linear scan larger than the total 
> resident set. Also it doesn't seem to regress normal workloads.
> 

Some numbers. All run in an UML with mem=64M and 128M of swapspace, sync
ubd.

linux-2.6.13-rc7

make -j4

real    107m15.351s
user    24m4.820s
sys     12m16.590s

scan 60 16

real    3m39.432s
user    0m4.990s
sys     0m21.920s

linux-2.6.13-rc7-cart

make -j4 

real    93m18.035s
user    22m44.280s
sys     9m20.220s

scan 60 16

real    1m47.857s
user    0m4.690s
sys     0m11.690s


-- 
Peter Zijlstra <a.p.zijlstra@chello.nl>

[-- Attachment #2: scan.c --]
[-- Type: text/x-csrc, Size: 748 bytes --]

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

int main(int argc, char **argv)
{
	char *ptr;
	int size = -1;
	int loops = -1;
	if (argc > 1) {
		size = atoi(argv[1]);
	}
	if (argc > 2) {
		loops = atoi(argv[2]);
	}

	if (size < 0) {
		printf("no size specified\n");
		return 0;
	}
	if (loops < 0) {
		printf("no loops specified\n");
		return 0;
	}

	printf("Size: %dMB\n", size);
	printf("Loops: %d\n", loops);
	size *= 1024*1024;

	ptr = (char*)mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (ptr) {
		for (;loops; --loops) {
			int i;
			for (i=0; i<size; ++i) {
				*(ptr + i) = loops;
			}
			printf(".");
			fflush(stdout);
		}
		printf("\n");
		munmap(ptr, size);
	}
	return 0;
}

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2005-09-11 23:04 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-09-11 20:25 [RFC][PATCH 0/7] CART Implementation v3 a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 1/7] " a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 2/7] " a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 3/7] " a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 4/7] " a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 5/7] " a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 6/7] " a.p.zijlstra
2005-09-11 20:25 ` [RFC][PATCH 7/7] " a.p.zijlstra
2005-09-11 23:05 ` [RFC][PATCH 0/7] " Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox