linux-mm.kvack.org archive mirror
* [PATCH 4/6] CKRM: Add guarantee support for mem controller
@ 2005-06-24 22:25 Chandra Seetharaman
  2005-06-30  5:52 ` [ckrm-tech] " YAMAMOTO Takashi
  0 siblings, 1 reply; 10+ messages in thread
From: Chandra Seetharaman @ 2005-06-24 22:25 UTC (permalink / raw)
  To: ckrm-tech, linux-mm

Patch 4 of 6 in the series that adds a memory controller under the CKRM framework.
This patch provides guarantee support for the controller.
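
For reviewers, here is a minimal user-space sketch of the per-zone guarantee
arithmetic this patch introduces. It restates the math of set_zone_guarantees()
from ckrm_memcore.c in the diff below; struct zone_sample, czone_guarantee()
and the sample numbers are made up for illustration and are not part of the
patch.

#include <stdio.h>

/* Hypothetical stand-ins for the per-zone figures the kernel reads from
 * struct zone at run time. */
struct zone_sample {
	const char *name;
	unsigned long nr_active, nr_inactive, free_pages;
	unsigned long pages_low, present_pages;
};

/*
 * Restates set_zone_guarantees(): a class guarantee 'guar' (in pages,
 * already divided by nr_dontcare in the patch) is split across zones in
 * proportion to each zone's share of the LRU + free pages, then reduced
 * by the zone's pages_low fraction.
 */
static long czone_guarantee(long guar, const struct zone_sample *z,
			    unsigned long tot_lru_pages)
{
	unsigned long long tmp;
	long cz_guar;

	if (z->present_pages == 0)
		return 0;

	tmp = (unsigned long long)guar *
		(z->nr_active + z->nr_inactive + z->free_pages);
	cz_guar = (long)(tmp / tot_lru_pages);

	tmp = (unsigned long long)z->pages_low * cz_guar;
	return cz_guar - (long)(tmp / z->present_pages);
}

int main(void)
{
	/* made-up numbers, only to show the proportional split */
	struct zone_sample zones[2] = {
		{ "DMA",     1000,  2000,  1000,   32,   4096 },
		{ "Normal", 60000, 80000, 20000, 1024, 200000 },
	};
	unsigned long tot_lru = 164000;	/* sum of the LRU + free figures above */
	long guar = 50000;		/* class guarantee, in pages */

	for (int i = 0; i < 2; i++)
		printf("%-6s guarantee = %ld pages\n", zones[i].name,
		       czone_guarantee(guar, &zones[i], tot_lru));
	return 0;
}

In the patch itself, 'guar' comes from pg_unused (or implicit_guar for
don't-care classes) divided by nr_dontcare, the system-wide total is
ckrm_tot_lru_pages, and the 64-bit divisions are done with do_div().
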
----------------------------------------

 include/linux/ckrm_mem.h        |   28 +++-
 include/linux/ckrm_mem_inline.h |  255 +++++++++++++++++++++++++++----------
 include/linux/mm.h              |    2 
 include/linux/mm_inline.h       |   10 +
 include/linux/mmzone.h          |    2 
 kernel/ckrm/ckrm_memcore.c      |  105 ++++++++++++---
 kernel/ckrm/ckrm_memctlr.c      |  274 +++++++++++++++++++++++++++++++-------
 mm/page_alloc.c                 |    4 
 mm/swap.c                       |    3 
 mm/vmscan.c                     |   99 +++++++++++++-
 10 files changed, 636 insertions(+), 146 deletions(-)

Content-Disposition: inline; filename=11-04-mem_limit-guar

Index: linux-2.6.12/include/linux/ckrm_mem.h
===================================================================
--- linux-2.6.12.orig/include/linux/ckrm_mem.h
+++ linux-2.6.12/include/linux/ckrm_mem.h
@@ -26,6 +26,21 @@
 #include <linux/mmzone.h>
 #include <linux/ckrm_rc.h>
 
+struct ckrm_zone {
+	struct list_head active_list;
+	struct list_head inactive_list;
+
+	unsigned long nr_active;
+	unsigned long nr_inactive;
+	unsigned long active_over;
+	unsigned long inactive_over;
+
+	struct list_head guar_list;	/* list of all over guar classes */
+	struct zone *zone;
+	int guar;			/* guarantee(# pages) of this czone */
+	struct ckrm_mem_res *memcls;
+};
+
 struct ckrm_mem_res {
 	unsigned long flags;
 	struct ckrm_core_class *core;	/* the core i am part of... */
@@ -46,20 +61,29 @@ struct ckrm_mem_res {
 	int hier;			/* hiearchy level, root = 0 */
 	int implicit_guar;		/* for classes with don't care guar */
 	int nr_dontcare;		/* # of dont care children */
+
+	struct ckrm_zone ckrm_zone[MAX_NR_ZONES];
 };
 
 extern atomic_t ckrm_mem_real_count;
 extern struct ckrm_res_ctlr mem_rcbs;
 extern struct ckrm_mem_res *ckrm_mem_root_class;
+extern struct list_head ckrm_memclass_list;
+extern spinlock_t ckrm_mem_lock;
+extern spinlock_t ckrm_overguar_lock[MAX_NR_ZONES];
+extern int ckrm_nr_mem_classes;
+extern unsigned int ckrm_tot_lru_pages;
 
 extern void ckrm_mem_migrate_mm(struct mm_struct *, struct ckrm_mem_res *);
 extern void ckrm_mem_migrate_all_pages(struct ckrm_mem_res *,
 						struct ckrm_mem_res *);
 extern void memclass_release(struct kref *);
-extern void incr_use_count(struct ckrm_mem_res *, int, int);
-extern void decr_use_count(struct ckrm_mem_res *, int, int);
+extern void add_use_count(struct ckrm_mem_res *, int, int, int);
+extern void sub_use_count(struct ckrm_mem_res *, int, int, int);
 extern int ckrm_class_limit_ok(struct ckrm_mem_res *);
 
+extern struct ckrm_zone *ckrm_get_max_overguar_czone(int);
+
 #else
 
 #define ckrm_mem_migrate_mm(a, b)			do {} while (0)
Index: linux-2.6.12/include/linux/ckrm_mem_inline.h
===================================================================
--- linux-2.6.12.orig/include/linux/ckrm_mem_inline.h
+++ linux-2.6.12/include/linux/ckrm_mem_inline.h
@@ -40,24 +40,43 @@ ckrm_memclass(struct ckrm_core_class *cl
 }
 
 static inline void
-ckrm_set_page_class(struct page *page, struct ckrm_mem_res *cls)
+set_page_ckrmzone( struct page *page, struct ckrm_zone *cz)
 {
-	if (!cls) {
-		if (!ckrm_mem_root_class)
-			return;
-		cls = ckrm_mem_root_class;
-	}
-	if (page->ckrm_class)
-		kref_put(&page->ckrm_class->nr_users, memclass_release);
-	page->ckrm_class = cls;
-	kref_get(&cls->nr_users);
-	incr_use_count(cls, 0, page_zonenum(page));
+	page->ckrm_zone = cz;
+}
+
+static inline struct ckrm_zone *
+page_ckrmzone(struct page *page)
+{
+	return page->ckrm_zone;
+}
+
+/*
+ * Currently, a shared page that is shared by multiple classes is charged
+ * to a class with max available guarantee. Simply replace this function
+ * for other policies.
+ */
+static inline int
+ckrm_mem_share_compare(struct ckrm_mem_res *a, struct ckrm_mem_res *b)
+{
+	if (a == NULL)
+		return -(b != NULL);
+	if (b == NULL)
+		return 1;
+	if (a->pg_guar == b->pg_guar)
+		return 0;
+	if (a->pg_guar == CKRM_SHARE_DONTCARE)
+		return 1;
+	if (b->pg_guar == CKRM_SHARE_DONTCARE)
+		return -1;
+	return (a->pg_unused - b->pg_unused);
 }
 
 static inline void
 ckrm_change_page_class(struct page *page, struct ckrm_mem_res *newcls)
 {
-	struct ckrm_mem_res *oldcls = page->ckrm_class;
+	struct ckrm_zone *old_czone = page_ckrmzone(page), *new_czone;
+	struct ckrm_mem_res *oldcls;
 	int zindex = page_zonenum(page);
 
 	if  (!newcls) {
@@ -66,27 +85,62 @@ ckrm_change_page_class(struct page *page
 		newcls = ckrm_mem_root_class;
 	}
 
+	oldcls = old_czone->memcls;
 	if (oldcls == newcls)
 		return;
 
 	if (oldcls) {
 		kref_put(&oldcls->nr_users, memclass_release);
-		decr_use_count(oldcls, 0, zindex);
+		sub_use_count(oldcls, 0, zindex, 1);
 	}
 
-	page->ckrm_class = newcls;
+	new_czone = &newcls->ckrm_zone[page_zonenum(page)];
+	set_page_ckrmzone(page, new_czone);
 	kref_get(&newcls->nr_users);
-	incr_use_count(newcls, 0, zindex);
+	add_use_count(newcls, 0, zindex, 1);
+
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		old_czone->nr_active--;
+		new_czone->nr_active++;
+		list_add(&page->lru, &new_czone->active_list);
+	} else {
+		old_czone->nr_inactive--;
+		new_czone->nr_inactive++;
+		list_add(&page->lru, &new_czone->inactive_list);
+	}
 }
 
 static inline void
-ckrm_clear_page_class(struct page *page)
+ckrm_set_page_class(struct page *page, struct ckrm_mem_res *cls)
 {
-	struct ckrm_mem_res *cls = page->ckrm_class;
-	if (cls) {
-		decr_use_count(cls, 0, page_zonenum(page));
-		kref_put(&cls->nr_users, memclass_release);
+	struct ckrm_zone *czone;
+
+	if (page_ckrmzone(page))
+		ckrm_change_page_class(page, cls); /* or die ??!! */
+
+	if (!cls) {
+		if (!ckrm_mem_root_class) {
+			set_page_ckrmzone(page, NULL);
+			return;
+		}
+		cls = ckrm_mem_root_class;
 	}
+	czone = &cls->ckrm_zone[page_zonenum(page)];
+	set_page_ckrmzone(page, czone);
+	kref_get(&cls->nr_users);
+	add_use_count(cls, 0, page_zonenum(page), 1);
+}
+
+static inline void
+ckrm_clear_page_class(struct page *page)
+{
+	struct ckrm_zone *czone = page_ckrmzone(page);
+	if (czone == NULL)
+		return;
+	sub_use_count(czone->memcls, 0, page_zonenum(page), 1);
+	kref_put(&czone->memcls->nr_users, memclass_release);
+	set_page_ckrmzone(page, NULL);
 }
 
 static inline void
@@ -94,18 +148,27 @@ ckrm_mem_inc_active(struct page *page)
 {
 	struct ckrm_mem_res *cls = ckrm_task_memclass(current)
 						?: ckrm_mem_root_class;
+	struct ckrm_zone *czone;
 
-	if (!cls)
-		return;
-	ckrm_set_page_class(page, cls);
+	if (cls == NULL)
+  		return;
+
+  	ckrm_set_page_class(page, cls);
+	czone = page_ckrmzone(page);
+	czone->nr_active++;
+	list_add(&page->lru, &czone->active_list);
 }
 
 static inline void
 ckrm_mem_dec_active(struct page *page)
 {
-	if (page->ckrm_class == NULL)
-		return;
-	ckrm_clear_page_class(page);
+	struct ckrm_zone *czone = page_ckrmzone(page);
+	if (czone == NULL)
+  		return;
+
+	list_del(&page->lru);
+	czone->nr_active--;
+  	ckrm_clear_page_class(page);
 }
 
 static inline void
@@ -113,26 +176,69 @@ ckrm_mem_inc_inactive(struct page *page)
 {
 	struct ckrm_mem_res *cls = ckrm_task_memclass(current)
 					?: ckrm_mem_root_class;
-
-	if (!cls)
+	struct ckrm_zone *czone;
+	if (cls == NULL)
 		return;
+
 	ckrm_set_page_class(page, cls);
+	czone = page_ckrmzone(page);
+	czone->nr_inactive++;
+	list_add(&page->lru, &czone->inactive_list);
 }
 
 static inline void
 ckrm_mem_dec_inactive(struct page *page)
 {
-	if (!page->ckrm_class)
+	struct ckrm_zone *czone = page_ckrmzone(page);
+	if (czone == NULL)
 		return;
+
+	czone->nr_inactive--;
+	list_del(&page->lru);
 	ckrm_clear_page_class(page);
 }
 
 static inline void
 ckrm_page_init(struct page *page)
 {
-	page->ckrm_class = NULL;
+	set_page_ckrmzone(page, NULL);
+}
+
+static inline void
+ckrm_zone_add_active(struct ckrm_zone *czone, int cnt)
+{
+	czone->nr_active += cnt;
+	add_use_count(czone->memcls, 0, zone_idx(czone->zone), cnt);
+	while (cnt--)
+		kref_get(&czone->memcls->nr_users);
+}
+
+static inline void
+ckrm_zone_add_inactive(struct ckrm_zone *czone, int cnt)
+{
+	czone->nr_inactive += cnt;
+	add_use_count(czone->memcls, 0, zone_idx(czone->zone), cnt);
+	while (cnt--)
+		kref_get(&czone->memcls->nr_users);
 }
 
+static inline void
+ckrm_zone_sub_active(struct ckrm_zone *czone, int cnt)
+{
+	czone->nr_active -= cnt;
+	sub_use_count(czone->memcls, 0, zone_idx(czone->zone), cnt);
+	while (cnt--)
+		kref_put(&czone->memcls->nr_users, memclass_release);
+}
+
+static inline void
+ckrm_zone_sub_inactive(struct ckrm_zone *czone, int cnt)
+{
+	czone->nr_inactive -= cnt;
+	sub_use_count(czone->memcls, 0, zone_idx(czone->zone), cnt);
+	while (cnt--)
+		kref_put(&czone->memcls->nr_users, memclass_release);
+}
 
 /* task/mm initializations/cleanup */
 
@@ -193,43 +299,30 @@ ckrm_mm_init(struct mm_struct *mm)
 static inline void
 ckrm_mm_setclass(struct mm_struct *mm, struct ckrm_mem_res *cls)
 {
-	if (cls) {
-		mm->memclass = cls;
-		kref_get(&cls->nr_users);
-	}
+	if (!cls)
+		return;
+	mm->memclass = cls;
+	kref_get(&cls->nr_users);
 }
 
 static inline void
 ckrm_mm_clearclass(struct mm_struct *mm)
 {
-	if (mm->memclass) {
-		kref_put(&mm->memclass->nr_users, memclass_release);
-		mm->memclass = NULL;
-	}
-}
-
-#else
-
-static inline void
-ckrm_task_mm_init(struct task_struct *tsk)
-{
+	if (!mm->memclass)
+		return;
+	kref_put(&mm->memclass->nr_users, memclass_release);
+	mm->memclass = NULL;
 }
 
-static inline void
-ckrm_task_mm_set(struct mm_struct * mm, struct task_struct *task)
-{
-}
+static inline void ckrm_init_lists(struct zone *zone) 			{}
 
-static inline void
-ckrm_task_mm_change(struct task_struct *tsk,
-		struct mm_struct *oldmm, struct mm_struct *newmm)
+static inline void ckrm_add_tail_inactive(struct page *page)
 {
+	 struct ckrm_zone *ckrm_zone = page_ckrmzone(page);
+	 list_add_tail(&page->lru, &ckrm_zone->inactive_list);
 }
 
-static inline void
-ckrm_task_mm_clear(struct task_struct *tsk, struct mm_struct *mm)
-{
-}
+#else
 
 static inline void *
 ckrm_task_memclass(struct task_struct *tsk)
@@ -237,27 +330,49 @@ ckrm_task_memclass(struct task_struct *t
 	return NULL;
 }
 
-static inline void
-ckrm_mm_init(struct mm_struct *mm)
-{
-}
+static inline void ckrm_clear_page_class(struct page *p)		{}
+
+static inline void ckrm_mem_inc_active(struct page *p)			{}
+static inline void ckrm_mem_dec_active(struct page *p)			{}
+static inline void ckrm_mem_inc_inactive(struct page *p)		{}
+static inline void ckrm_mem_dec_inactive(struct page *p)		{}
+
+#define ckrm_zone_add_active(a, b)	do {} while (0)
+#define ckrm_zone_add_inactive(a, b)	do {} while (0)
+#define ckrm_zone_sub_active(a, b)	do {} while (0)
+#define ckrm_zone_sub_inactive(a, b)	do {} while (0)
+#define set_page_ckrmzone(a, b)		do {} while (0)
+
+#define ckrm_class_limit_ok(a)						(1)
+
+static inline void ckrm_page_init(struct page *p)			{}
+static inline void ckrm_task_mm_init(struct task_struct *tsk)		{}
+static inline void ckrm_task_mm_set(struct mm_struct * mm,
+					struct task_struct *task)	{}
+static inline void ckrm_task_mm_change(struct task_struct *tsk,
+		struct mm_struct *oldmm, struct mm_struct *newmm)	{}
+static inline void ckrm_task_mm_clear(struct task_struct *tsk,
+						struct mm_struct *mm)	{}
+
+static inline void ckrm_mm_init(struct mm_struct *mm)			{}
 
 /* using #define instead of static inline as the prototype requires   *
  * data structures that is available only with the controller enabled */
-#define ckrm_mm_setclass(a, b) do { } while(0)
-#define ckrm_class_limit_ok(a)	(1)
+#define ckrm_mm_setclass(a, b) 					do {} while(0)
 
-static inline void
-ckrm_mm_clearclass(struct mm_struct *mm)
+static inline void ckrm_mm_clearclass(struct mm_struct *mm)		{}
+
+static inline void ckrm_init_lists(struct zone *zone)
 {
+	INIT_LIST_HEAD(&zone->active_list);
+	INIT_LIST_HEAD(&zone->inactive_list);
 }
 
-static inline void ckrm_mem_inc_active(struct page *p)		{}
-static inline void ckrm_mem_dec_active(struct page *p)		{}
-static inline void ckrm_mem_inc_inactive(struct page *p)	{}
-static inline void ckrm_mem_dec_inactive(struct page *p)	{}
-static inline void ckrm_page_init(struct page *p)		{}
-static inline void ckrm_clear_page_class(struct page *p)	{}
+static inline void ckrm_add_tail_inactive(struct page *page)
+{
+	 struct zone *zone = page_zone(page);
+	 list_add_tail(&page->lru, &zone->inactive_list);
+}
 
 #endif
 #endif /* _LINUX_CKRM_MEM_INLINE_H_ */
Index: linux-2.6.12/include/linux/mm.h
===================================================================
--- linux-2.6.12.orig/include/linux/mm.h
+++ linux-2.6.12/include/linux/mm.h
@@ -259,7 +259,7 @@ struct page {
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
 #ifdef CONFIG_CKRM_RES_MEM
-	struct ckrm_mem_res *ckrm_class;
+	struct ckrm_zone *ckrm_zone;
 #endif
 };
 
Index: linux-2.6.12/include/linux/mm_inline.h
===================================================================
--- linux-2.6.12.orig/include/linux/mm_inline.h
+++ linux-2.6.12/include/linux/mm_inline.h
@@ -3,7 +3,9 @@
 static inline void
 add_page_to_active_list(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_add(&page->lru, &zone->active_list);
+#endif
 	zone->nr_active++;
 	ckrm_mem_inc_active(page);
 }
@@ -11,7 +13,9 @@ add_page_to_active_list(struct zone *zon
 static inline void
 add_page_to_inactive_list(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_add(&page->lru, &zone->inactive_list);
+#endif
 	zone->nr_inactive++;
 	ckrm_mem_inc_inactive(page);
 }
@@ -19,7 +23,9 @@ add_page_to_inactive_list(struct zone *z
 static inline void
 del_page_from_active_list(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_del(&page->lru);
+#endif
 	zone->nr_active--;
 	ckrm_mem_dec_active(page);
 }
@@ -27,7 +33,9 @@ del_page_from_active_list(struct zone *z
 static inline void
 del_page_from_inactive_list(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_del(&page->lru);
+#endif
 	zone->nr_inactive--;
 	ckrm_mem_dec_inactive(page);
 }
@@ -35,7 +43,9 @@ del_page_from_inactive_list(struct zone 
 static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_del(&page->lru);
+#endif
 	if (PageActive(page)) {
 		ClearPageActive(page);
 		zone->nr_active--;
Index: linux-2.6.12/include/linux/mmzone.h
===================================================================
--- linux-2.6.12.orig/include/linux/mmzone.h
+++ linux-2.6.12/include/linux/mmzone.h
@@ -135,8 +135,10 @@ struct zone {
 
 	/* Fields commonly accessed by the page reclaim scanner */
 	spinlock_t		lru_lock;	
+#ifndef CONFIG_CKRM_RES_MEM
 	struct list_head	active_list;
 	struct list_head	inactive_list;
+#endif
 	unsigned long		nr_scan_active;
 	unsigned long		nr_scan_inactive;
 	unsigned long		nr_active;
Index: linux-2.6.12/kernel/ckrm/ckrm_memcore.c
===================================================================
--- linux-2.6.12.orig/kernel/ckrm/ckrm_memcore.c
+++ linux-2.6.12/kernel/ckrm/ckrm_memcore.c
@@ -38,14 +38,20 @@
 #define CKRM_MEM_MAX_HIERARCHY 2 /* allows only upto 2 levels - 0, 1 & 2 */
 
 /* all 1-level memory_share_class are chained together */
-static LIST_HEAD(ckrm_memclass_list);
-static spinlock_t ckrm_mem_lock; /* protects list above */
-static unsigned int ckrm_tot_lru_pages; /* # of pages in the system */
-
-static int ckrm_nr_mem_classes = 0;
-
+LIST_HEAD(ckrm_memclass_list);
+spinlock_t ckrm_mem_lock; /* protects list above */
+unsigned int ckrm_tot_lru_pages; /* # of pages in the system */
+int ckrm_nr_mem_classes = 0;
 struct ckrm_mem_res *ckrm_mem_root_class;
 atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
+
+struct list_head ckrm_overguar_list[MAX_NR_ZONES];
+spinlock_t ckrm_overguar_lock[MAX_NR_ZONES]; /* protects the above list */
+
+EXPORT_SYMBOL_GPL(ckrm_memclass_list);
+EXPORT_SYMBOL_GPL(ckrm_mem_lock);
+EXPORT_SYMBOL_GPL(ckrm_tot_lru_pages);
+EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes);
 EXPORT_SYMBOL_GPL(ckrm_mem_root_class);
 EXPORT_SYMBOL_GPL(ckrm_mem_real_count);
 
@@ -80,6 +86,9 @@ set_ckrm_tot_pages(void)
 static void
 mem_res_initcls_one(struct ckrm_mem_res *res)
 {
+	int zindex = 0;
+	struct zone *zone;
+
 	memset(res, 0, sizeof(struct ckrm_mem_res));
 
 	res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
@@ -91,14 +100,65 @@ mem_res_initcls_one(struct ckrm_mem_res 
 
 	res->pg_guar = CKRM_SHARE_DONTCARE;
 	res->pg_limit = CKRM_SHARE_DONTCARE;
+	res->implicit_guar = CKRM_SHARE_DONTCARE;
 
 	INIT_LIST_HEAD(&res->mcls_list);
 
+	for_each_zone(zone) {
+		INIT_LIST_HEAD(&res->ckrm_zone[zindex].active_list);
+		INIT_LIST_HEAD(&res->ckrm_zone[zindex].inactive_list);
+		INIT_LIST_HEAD(&res->ckrm_zone[zindex].guar_list);
+		res->ckrm_zone[zindex].nr_active = 0;
+		res->ckrm_zone[zindex].nr_inactive = 0;
+		res->ckrm_zone[zindex].zone = zone;
+		res->ckrm_zone[zindex].memcls = res;
+		zindex++;
+	}
+
 	res->pg_unused = 0;
 	res->nr_dontcare = 1; /* for default class */
 	kref_init(&res->nr_users);
 }
 
+static inline void
+set_zone_guarantees(struct ckrm_mem_res *cls)
+{
+	int i, guar, total;
+	struct zone *zone;
+	u64 temp;
+
+	if (cls->pg_guar == CKRM_SHARE_DONTCARE)
+		/* no guarantee for this class. use implicit guarantee */
+		guar = cls->implicit_guar / cls->nr_dontcare;
+	else
+		guar = cls->pg_unused / cls->nr_dontcare;
+
+	i = 0;
+	for_each_zone(zone) {
+
+		if (zone->present_pages == 0) {
+			cls->ckrm_zone[i].guar = 0;
+			i++;
+			continue;
+		}
+
+		/*
+		 * Guarantee for a ckrmzone is calculated from the class's
+		 * guarantee and the pages_low of the zone proportional
+		 * to the ckrmzone.
+		 */
+		total = zone->nr_active + zone->nr_inactive + zone->free_pages;
+		temp = (u64) guar * total;
+		do_div(temp, ckrm_tot_lru_pages);
+		cls->ckrm_zone[i].guar = (int) temp;
+
+		temp = zone->pages_low * cls->ckrm_zone[i].guar;
+		do_div(temp, zone->present_pages);
+		cls->ckrm_zone[i].guar -= (int) temp;
+		i++;
+	}
+}
+
 static void
 set_impl_guar_children(struct ckrm_mem_res *parres)
 {
@@ -106,11 +166,10 @@ set_impl_guar_children(struct ckrm_mem_r
 	struct ckrm_mem_res *cres;
 	int nr_dontcare = 1; /* for defaultclass */
 	int guar, impl_guar;
-	int resid = mem_rcbs.resid;
 
 	ckrm_lock_hier(parres->core);
 	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
-		cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
+		cres = ckrm_memclass(child);
 		/* treat NULL cres as don't care as that child is just being
 		 * created.
 		 * FIXME: need a better way to handle this case.
@@ -125,13 +184,14 @@ set_impl_guar_children(struct ckrm_mem_r
 	impl_guar = guar / parres->nr_dontcare;
 
 	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
-		cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
+		cres = ckrm_memclass(child);
 		if (cres && cres->pg_guar == CKRM_SHARE_DONTCARE) {
 			cres->implicit_guar = impl_guar;
 			set_impl_guar_children(cres);
 		}
 	}
 	ckrm_unlock_hier(parres->core);
+	set_zone_guarantees(parres);
 
 }
 
@@ -142,7 +202,7 @@ mem_res_alloc(struct ckrm_core_class *co
 
 	BUG_ON(mem_rcbs.resid == -1);
 
-	pres = ckrm_get_res_class(parent, mem_rcbs.resid, struct ckrm_mem_res);
+	pres = ckrm_memclass(parent);
 	if (pres && (pres->hier == CKRM_MEM_MAX_HIERARCHY)) {
 		printk(KERN_ERR "MEM_RC: only allows hieararchy of %d\n",
 						CKRM_MEM_MAX_HIERARCHY);
@@ -184,6 +244,7 @@ mem_res_alloc(struct ckrm_core_class *co
 				pres->implicit_guar : pres->pg_unused;
 			res->implicit_guar = guar / pres->nr_dontcare;
 		}
+		set_zone_guarantees(res);
 		ckrm_nr_mem_classes++;
 	} else
 		printk(KERN_ERR "MEM_RC: alloc: GFP_ATOMIC failed\n");
@@ -206,8 +267,7 @@ child_maxlimit_changed_local(struct ckrm
 	/* run thru parent's children and get new max_limit of parent */
 	ckrm_lock_hier(parres->core);
 	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
-		childres = ckrm_get_res_class(child, mem_rcbs.resid,
-				struct ckrm_mem_res);
+		childres = ckrm_memclass(child);
 		if (maxlimit < childres->shares.my_limit)
 			maxlimit = childres->shares.my_limit;
 	}
@@ -225,7 +285,6 @@ recalc_and_propagate(struct ckrm_mem_res
 {
 	struct ckrm_core_class *child = NULL;
 	struct ckrm_mem_res *cres;
-	int resid = mem_rcbs.resid;
 	struct ckrm_shares *self = &res->shares;
 
 	if (parres) {
@@ -266,10 +325,11 @@ recalc_and_propagate(struct ckrm_mem_res
 	} else
 		res->pg_unused = 0;
 
+	set_zone_guarantees(res);
 	/* propagate to children */
 	ckrm_lock_hier(res->core);
 	while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
-		cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
+		cres = ckrm_memclass(child);
 		recalc_and_propagate(cres, res);
 	}
 	ckrm_unlock_hier(res->core);
@@ -281,14 +341,14 @@ mem_res_free(void *my_res)
 {
 	struct ckrm_mem_res *res = my_res;
 	struct ckrm_mem_res *pres;
+	int i;
 
 	if (!res)
 		return;
 
 	ckrm_mem_migrate_all_pages(res, ckrm_mem_root_class);
 
-	pres = ckrm_get_res_class(res->parent, mem_rcbs.resid,
-			struct ckrm_mem_res);
+	pres = ckrm_memclass(res->parent);
 
 	if (pres) {
 		child_guarantee_changed(&pres->shares,
@@ -308,6 +368,9 @@ mem_res_free(void *my_res)
 	res->pg_limit = 0;
 	res->pg_unused = 0;
 
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		BUG_ON(!list_empty(&res->ckrm_zone[i].guar_list));
+
 	spin_lock_irq(&ckrm_mem_lock);
 	list_del_init(&res->mcls_list);
 	spin_unlock_irq(&ckrm_mem_lock);
@@ -329,8 +392,7 @@ mem_set_share_values(void *my_res, struc
 	if (!res)
 		return -EINVAL;
 
-	parres = ckrm_get_res_class(res->parent, mem_rcbs.resid,
-		struct ckrm_mem_res);
+	parres = ckrm_memclass(res->parent);
 
 	rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);
 
@@ -494,10 +556,15 @@ int __init
 init_ckrm_mem_res(void)
 {
 	struct ckrm_classtype *clstype;
-	int resid = mem_rcbs.resid;
+	int i, resid = mem_rcbs.resid;
 
 	set_ckrm_tot_pages();
 	spin_lock_init(&ckrm_mem_lock);
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		INIT_LIST_HEAD(&ckrm_overguar_list[i]);
+		spin_lock_init(&ckrm_overguar_lock[i]);
+	}
+
 	clstype = ckrm_find_classtype_by_name("taskclass");
 	if (clstype == NULL) {
 		printk(KERN_INFO " Unknown ckrm classtype<taskclass>");
Index: linux-2.6.12/kernel/ckrm/ckrm_memctlr.c
===================================================================
--- linux-2.6.12.orig/kernel/ckrm/ckrm_memctlr.c
+++ linux-2.6.12/kernel/ckrm/ckrm_memctlr.c
@@ -14,53 +14,98 @@
  *
  */
 
+#include <linux/swap.h>
+#include <linux/pagemap.h>
 #include <linux/ckrm_mem_inline.h>
 
+extern struct list_head ckrm_overguar_list[];
+
+static inline void
+ckrm_add_to_guar_list(struct ckrm_zone *czone, int zindex)
+{
+	int usage;
+	unsigned long flags;
+
+	/* fast path, after this czone gets into the list */
+	if (!list_empty(&czone->guar_list))
+		return;
+
+	usage = czone->nr_active + czone->nr_inactive;
+	if (usage > czone->guar) {
+		spin_lock_irqsave(&ckrm_overguar_lock[zindex], flags);
+		if (list_empty(&czone->guar_list))
+			list_add_tail(&czone->guar_list,
+				&ckrm_overguar_list[zindex]);
+		spin_unlock_irqrestore(&ckrm_overguar_lock[zindex], flags);
+	}
+}
+
+static inline void
+ckrm_del_from_guar_list(struct ckrm_zone *czone, int zindex)
+{
+	int usage;
+	unsigned long flags;
+
+	/* fast path, return immediately if we are not in the list */
+	if (list_empty(&czone->guar_list))
+		return;
+
+	usage = czone->nr_active + czone->nr_inactive;
+	if (usage <= czone->guar) {
+		spin_lock_irqsave(&ckrm_overguar_lock[zindex], flags);
+		if (!list_empty(&czone->guar_list))
+			list_del_init(&czone->guar_list);
+		spin_unlock_irqrestore(&ckrm_overguar_lock[zindex], flags);
+	}
+}
+
 void
-incr_use_count(struct ckrm_mem_res *cls, int borrow, int zindex)
+add_use_count(struct ckrm_mem_res *cls, int borrow, int zindex, int cnt)
 {
 	int i, pg_total = 0;
-	struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent,
-				mem_rcbs.resid, struct ckrm_mem_res);
+	struct ckrm_mem_res *parcls = ckrm_memclass(cls->parent);
 
 	if (!cls)
 		return;
 
-	cls->pg_total[zindex]++;
+	cls->pg_total[zindex] += cnt;
 	for (i = 0; i < MAX_NR_ZONES; i++)
 		pg_total += cls->pg_total[i];
 	if (borrow)
-		cls->pg_lent[zindex]++;
+		cls->pg_lent[zindex] += cnt;
 
-	parcls = ckrm_get_res_class(cls->parent,
-				mem_rcbs.resid, struct ckrm_mem_res);
+	parcls = ckrm_memclass(cls->parent);
 	if (parcls && ((cls->pg_guar == CKRM_SHARE_DONTCARE) ||
 			(pg_total > cls->pg_unused))) {
-		incr_use_count(parcls, 1, zindex);
-		cls->pg_borrowed[zindex]++;
+		add_use_count(parcls, 1, zindex, cnt);
+		cls->pg_borrowed[zindex] += cnt;
 	} else
-		atomic_inc(&ckrm_mem_real_count);
+		atomic_add(cnt, &ckrm_mem_real_count);
+	ckrm_add_to_guar_list(&cls->ckrm_zone[zindex], zindex);
 	return;
 }
 
 void
-decr_use_count(struct ckrm_mem_res *cls, int borrowed, int zindex)
+sub_use_count(struct ckrm_mem_res *cls, int borrowed, int zindex, int cnt)
 {
+	int borrow_cnt = 0;
+
 	if (!cls)
 		return;
-	cls->pg_total[zindex]--;
+	cls->pg_total[zindex] -= cnt;
 	if (borrowed)
-		cls->pg_lent[zindex]--;
-	if (cls->pg_borrowed > 0) {
-		struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent,
-				mem_rcbs.resid, struct ckrm_mem_res);
+		cls->pg_lent[zindex] -= cnt;
+	if (cls->pg_borrowed[zindex] > 0) {
+		struct ckrm_mem_res *parcls = ckrm_memclass(cls->parent);
 		if (parcls) {
-			decr_use_count(parcls, 1, zindex);
-			cls->pg_borrowed[zindex]--;
-			return;
+			borrow_cnt = min(cnt, cls->pg_borrowed[zindex]);
+			sub_use_count(parcls, 1, zindex, borrow_cnt);
+			cls->pg_borrowed[zindex] -= borrow_cnt;
 		}
 	}
-	atomic_dec(&ckrm_mem_real_count);
+	atomic_sub(cnt - borrow_cnt, &ckrm_mem_real_count);
+	ckrm_del_from_guar_list(&cls->ckrm_zone[zindex], zindex);
+	return;
 }
 
 int
@@ -73,8 +118,7 @@ ckrm_class_limit_ok(struct ckrm_mem_res 
 	for (i = 0; i < MAX_NR_ZONES; i++)
 		pg_total += cls->pg_total[i];
 	if (cls->pg_limit == CKRM_SHARE_DONTCARE) {
-		struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent,
-					mem_rcbs.resid, struct ckrm_mem_res);
+		struct ckrm_mem_res *parcls = ckrm_memclass(cls->parent);
 		ret = (parcls ? ckrm_class_limit_ok(parcls) : 0);
 	} else
 		ret = (pg_total <= cls->pg_limit);
@@ -82,8 +126,88 @@ ckrm_class_limit_ok(struct ckrm_mem_res 
 	return ret;
 }
 
+static int
+ckrm_mem_evaluate_page_anon(struct page* page)
+{
+	struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
+	struct ckrm_mem_res* maxshareclass = NULL;
+	struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
+	struct vm_area_struct *vma;
+	struct mm_struct* mm;
+	int ret = 0;
+
+	if (!spin_trylock(&anon_vma->lock))
+		return 0;
+	BUG_ON(list_empty(&anon_vma->head));
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		mm = vma->vm_mm;
+		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
+				mm->memclass) < 0) {
+			maxshareclass = mm->memclass;
+		}
+	}
+	spin_unlock(&anon_vma->lock);
+
+	if (!maxshareclass)
+		maxshareclass = ckrm_mem_root_class;
+	if (pgcls != maxshareclass) {
+		ckrm_change_page_class(page, maxshareclass);
+		ret = 1;
+	}
+	return ret;
+}
+
+static int
+ckrm_mem_evaluate_page_file(struct page* page)
+{
+	struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
+	struct ckrm_mem_res* maxshareclass = NULL;
+	struct address_space *mapping = page->mapping;
+	struct vm_area_struct *vma = NULL;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct prio_tree_iter iter;
+	struct mm_struct* mm;
+	int ret = 0;
+
+	if (!mapping)
+		return 0;
+
+	if (!spin_trylock(&mapping->i_mmap_lock))
+		return 0;
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap,
+					pgoff, pgoff) {
+		mm = vma->vm_mm;
+		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
+				mm->memclass)<0)
+			maxshareclass = mm->memclass;
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+
+	if (!maxshareclass)
+		maxshareclass = ckrm_mem_root_class;
+	if (pgcls != maxshareclass) {
+		ckrm_change_page_class(page, maxshareclass);
+		ret = 1;
+	}
+	return ret;
+}
+
+static int
+ckrm_mem_evaluate_page(struct page* page)
+{
+	int ret = 0;
+	if (page->mapping) {
+		if (PageAnon(page))
+			ret = ckrm_mem_evaluate_page_anon(page);
+		else
+			ret = ckrm_mem_evaluate_page_file(page);
+	}
+	return ret;
+}
+
 static void migrate_list(struct list_head *list,
-	struct ckrm_mem_res* from, struct ckrm_mem_res* to)
+	struct ckrm_mem_res* from, struct ckrm_mem_res* def)
 {
 	struct page *page;
 	struct list_head *pos, *next;
@@ -92,21 +216,26 @@ static void migrate_list(struct list_hea
 	while (pos != list) {
 		next = pos->next;
 		page = list_entry(pos, struct page, lru);
-		if (page->ckrm_class == from)
-			ckrm_change_page_class(page, to);
+		if (!ckrm_mem_evaluate_page(page))
+			ckrm_change_page_class(page, def);
 		pos = next;
 	}
 }
 
 void
-ckrm_mem_migrate_all_pages(struct ckrm_mem_res* from, struct ckrm_mem_res* to)
+ckrm_mem_migrate_all_pages(struct ckrm_mem_res* from,
+					struct ckrm_mem_res* def)
 {
+	int i;
 	struct zone *zone;
+	struct ckrm_zone *ckrm_zone;
 
-	for_each_zone(zone) {
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		ckrm_zone = &from->ckrm_zone[i];
+		zone = ckrm_zone->zone;
 		spin_lock_irq(&zone->lru_lock);
-		migrate_list(&zone->inactive_list, from, to);
-		migrate_list(&zone->active_list, from, to);
+		migrate_list(&ckrm_zone->inactive_list, from, def);
+		migrate_list(&ckrm_zone->active_list, from, def);
 		spin_unlock_irq(&zone->lru_lock);
 	}
 	return;
@@ -131,8 +260,13 @@ class_migrate_pmd(struct mm_struct* mm, 
 		pte = pte_offset_map(pmdir, address);
 		if (pte_present(*pte)) {
 			struct page *page = pte_page(*pte);
-			if (page->mapping)
+			struct ckrm_zone *czone = page_ckrmzone(page);
+			if (page->mapping && czone) {
+				struct zone *zone = czone->zone;
+				spin_lock_irq(&zone->lru_lock);
 				ckrm_change_page_class(page, mm->memclass);
+				spin_unlock_irq(&zone->lru_lock);
+			}
 		}
 		address += PAGE_SIZE;
 		pte_unmap(pte);
@@ -193,7 +327,9 @@ class_migrate_vma(struct mm_struct* mm, 
 void
 ckrm_mem_migrate_mm(struct mm_struct* mm, struct ckrm_mem_res *def)
 {
+	struct task_struct *task;
 	struct vm_area_struct *vma;
+	struct ckrm_mem_res *maxshareclass = def;
 
 	/* We leave the mm->memclass untouched since we believe that one
 	 * mm with no task associated will be deleted soon or attach
@@ -202,18 +338,72 @@ ckrm_mem_migrate_mm(struct mm_struct* mm
 	if (list_empty(&mm->tasklist))
 		return;
 
-	if (mm->memclass)
-		kref_put(&mm->memclass->nr_users, memclass_release);
-	mm->memclass = def ?: ckrm_mem_root_class;
-	kref_get(&mm->memclass->nr_users);
-
-	/* Go through all VMA to migrate pages */
-	down_read(&mm->mmap_sem);
-	vma = mm->mmap;
-	while(vma) {
-		class_migrate_vma(mm, vma);
-		vma = vma->vm_next;
+	list_for_each_entry(task, &mm->tasklist, mm_peers) {
+		struct ckrm_mem_res* cls = ckrm_task_memclass(task);
+		if (!cls)
+			continue;
+		if (!maxshareclass ||
+				ckrm_mem_share_compare(maxshareclass,cls)<0 )
+			maxshareclass = cls;
+	}
+
+	if (maxshareclass && (mm->memclass != maxshareclass)) {
+		if (mm->memclass)
+			kref_put(&mm->memclass->nr_users, memclass_release);
+		mm->memclass = maxshareclass;
+		kref_get(&maxshareclass->nr_users);
+
+		/* Go through all VMA to migrate pages */
+		down_read(&mm->mmap_sem);
+		vma = mm->mmap;
+		while(vma) {
+			class_migrate_vma(mm, vma);
+			vma = vma->vm_next;
+		}
+		up_read(&mm->mmap_sem);
 	}
-	up_read(&mm->mmap_sem);
 	return;
 }
+
+/*
+ * Returns the ckrm zone whose usage is over its guarantee and is
+ * is the most among all the ckrm zones who are over their respective
+ * guarantees.
+ *
+ * While returning holds a reference to the class, Caller is responsible
+ * for dropping the reference(kref_put), when it is done with the ckrm
+ * zone.
+ */
+struct ckrm_zone *
+ckrm_get_max_overguar_czone(int zindex)
+{
+	struct ckrm_zone *czone;
+	struct ckrm_zone *maxczone = &ckrm_mem_root_class->ckrm_zone[zindex];
+	int max_overguar = 0, usage, cnt;
+	struct ckrm_mem_res *cls;
+
+	kref_get(&maxczone->memcls->nr_users);
+
+	spin_lock_irq(&ckrm_overguar_lock[zindex]);
+	list_for_each_entry(czone, &ckrm_overguar_list[zindex], guar_list) {
+		cls = czone->memcls;
+		usage = czone->nr_active + czone->nr_inactive;
+		if ((usage - czone->guar) > max_overguar) {
+			kref_put(&maxczone->memcls->nr_users, memclass_release);
+			max_overguar = usage - czone->guar;
+			maxczone = czone;
+			kref_get(&maxczone->memcls->nr_users);
+		}
+	}
+	spin_unlock_irq(&ckrm_overguar_lock[zindex]);
+	BUG_ON(maxczone == NULL);
+
+	/* calculate active_over and inactive_over */
+	cnt = maxczone->nr_active - (2 * maxczone->guar / 3);
+	maxczone->active_over = (cnt > 0) ? cnt : SWAP_CLUSTER_MAX;
+	cnt = maxczone->active_over + maxczone->nr_inactive
+			 - (maxczone->guar / 3);
+	maxczone->inactive_over = (cnt > 0) ? cnt : SWAP_CLUSTER_MAX;
+
+	return maxczone;
+}
Index: linux-2.6.12/mm/page_alloc.c
===================================================================
--- linux-2.6.12.orig/mm/page_alloc.c
+++ linux-2.6.12/mm/page_alloc.c
@@ -358,7 +358,6 @@ free_pages_bulk(struct zone *zone, int c
 		/* have to delete it as __free_pages_bulk list manipulates */
 		list_del(&page->lru);
 		__free_pages_bulk(page, zone, order);
-		ckrm_clear_page_class(page);
 		ret++;
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
@@ -1713,8 +1712,7 @@ static void __init free_area_init_core(s
 		}
 		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
 				zone_names[j], realsize, batch);
-		INIT_LIST_HEAD(&zone->active_list);
-		INIT_LIST_HEAD(&zone->inactive_list);
+		ckrm_init_lists(zone);
 		zone->nr_scan_active = 0;
 		zone->nr_scan_inactive = 0;
 		zone->nr_active = 0;
Index: linux-2.6.12/mm/swap.c
===================================================================
--- linux-2.6.12.orig/mm/swap.c
+++ linux-2.6.12/mm/swap.c
@@ -30,6 +30,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/init.h>
+#include <linux/ckrm_mem_inline.h>
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
@@ -87,7 +88,7 @@ int rotate_reclaimable_page(struct page 
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	if (PageLRU(page) && !PageActive(page)) {
 		list_del(&page->lru);
-		list_add_tail(&page->lru, &zone->inactive_list);
+		ckrm_add_tail_inactive(page);
 		inc_page_state(pgrotated);
 	}
 	if (!test_clear_page_writeback(page))
Index: linux-2.6.12/mm/vmscan.c
===================================================================
--- linux-2.6.12.orig/mm/vmscan.c
+++ linux-2.6.12/mm/vmscan.c
@@ -33,6 +33,7 @@
 #include <linux/cpuset.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
+#include <linux/ckrm_mem.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -79,6 +80,11 @@ struct scan_control {
 	 * In this context, it doesn't matter that we scan the
 	 * whole list at once. */
 	int swap_cluster_max;
+
+#ifdef CONFIG_CKRM_RES_MEM
+	int ckrm_active;
+	int ckrm_inactive;
+#endif
 };
 
 /*
@@ -586,6 +592,7 @@ static int isolate_lru_pages(int nr_to_s
 			continue;
 		} else {
 			list_add(&page->lru, dst);
+			set_page_ckrmzone(page, NULL);
 			nr_taken++;
 		}
 	}
@@ -597,11 +604,23 @@ static int isolate_lru_pages(int nr_to_s
 /*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
+#ifdef CONFIG_CKRM_RES_MEM
+static void shrink_cache(struct ckrm_zone *ckrm_zone, struct scan_control *sc)
+#else
 static void shrink_cache(struct zone *zone, struct scan_control *sc)
+#endif
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
 	int max_scan = sc->nr_to_scan;
+#ifdef CONFIG_CKRM_RES_MEM
+	struct zone *zone = ckrm_zone->zone;
+	struct list_head *inactive_list = &ckrm_zone->inactive_list;
+	struct list_head *active_list = &ckrm_zone->active_list;
+#else
+	struct list_head *inactive_list = &zone->inactive_list;
+	struct list_head *active_list = &zone->active_list;
+#endif
 
 	pagevec_init(&pvec, 1);
 
@@ -614,9 +633,10 @@ static void shrink_cache(struct zone *zo
 		int nr_freed;
 
 		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-					     &zone->inactive_list,
+					     inactive_list,
 					     &page_list, &nr_scan);
 		zone->nr_inactive -= nr_taken;
+		ckrm_zone_sub_inactive(ckrm_zone, nr_taken);
 		zone->pages_scanned += nr_scan;
 		spin_unlock_irq(&zone->lru_lock);
 
@@ -643,10 +663,16 @@ static void shrink_cache(struct zone *zo
 			if (TestSetPageLRU(page))
 				BUG();
 			list_del(&page->lru);
-			if (PageActive(page))
-				add_page_to_active_list(zone, page);
-			else
-				add_page_to_inactive_list(zone, page);
+			if (PageActive(page)) {
+				ckrm_zone_add_active(ckrm_zone, 1);
+				zone->nr_active++;
+				list_add(&page->lru, active_list);
+			} else {
+				ckrm_zone_add_inactive(ckrm_zone, 1);
+				zone->nr_inactive++;
+				list_add(&page->lru, inactive_list);
+			}
+			set_page_ckrmzone(page, ckrm_zone);
 			if (!pagevec_add(&pvec, page)) {
 				spin_unlock_irq(&zone->lru_lock);
 				__pagevec_release(&pvec);
@@ -677,7 +703,11 @@ done:
  * But we had to alter page->flags anyway.
  */
 static void
+#ifdef CONFIG_CKRM_RES_MEM
+refill_inactive_zone(struct ckrm_zone *ckrm_zone, struct scan_control *sc)
+#else
 refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+#endif
 {
 	int pgmoved;
 	int pgdeactivate = 0;
@@ -692,13 +722,22 @@ refill_inactive_zone(struct zone *zone, 
 	long mapped_ratio;
 	long distress;
 	long swap_tendency;
+#ifdef CONFIG_CKRM_RES_MEM
+	struct zone *zone = ckrm_zone->zone;
+	struct list_head *active_list = &ckrm_zone->active_list;
+	struct list_head *inactive_list = &ckrm_zone->inactive_list;
+#else
+	struct list_head *active_list = &zone->active_list;
+	struct list_head *inactive_list = &zone->inactive_list;
+#endif
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
-	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
+	pgmoved = isolate_lru_pages(nr_pages, active_list,
 				    &l_hold, &pgscanned);
 	zone->pages_scanned += pgscanned;
 	zone->nr_active -= pgmoved;
+	ckrm_zone_sub_active(ckrm_zone, pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 
 	/*
@@ -757,10 +796,12 @@ refill_inactive_zone(struct zone *zone, 
 			BUG();
 		if (!TestClearPageActive(page))
 			BUG();
-		list_move(&page->lru, &zone->inactive_list);
+		list_move(&page->lru, inactive_list);
+		set_page_ckrmzone(page, ckrm_zone);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			zone->nr_inactive += pgmoved;
+			ckrm_zone_add_inactive(ckrm_zone, pgmoved);
 			spin_unlock_irq(&zone->lru_lock);
 			pgdeactivate += pgmoved;
 			pgmoved = 0;
@@ -771,6 +812,7 @@ refill_inactive_zone(struct zone *zone, 
 		}
 	}
 	zone->nr_inactive += pgmoved;
+	ckrm_zone_add_inactive(ckrm_zone, pgmoved);
 	pgdeactivate += pgmoved;
 	if (buffer_heads_over_limit) {
 		spin_unlock_irq(&zone->lru_lock);
@@ -785,10 +827,12 @@ refill_inactive_zone(struct zone *zone, 
 		if (TestSetPageLRU(page))
 			BUG();
 		BUG_ON(!PageActive(page));
-		list_move(&page->lru, &zone->active_list);
+		list_move(&page->lru, active_list);
+		set_page_ckrmzone(page, ckrm_zone);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			zone->nr_active += pgmoved;
+			ckrm_zone_add_active(ckrm_zone, pgmoved);
 			pgmoved = 0;
 			spin_unlock_irq(&zone->lru_lock);
 			__pagevec_release(&pvec);
@@ -796,6 +840,7 @@ refill_inactive_zone(struct zone *zone, 
 		}
 	}
 	zone->nr_active += pgmoved;
+	ckrm_zone_add_active(ckrm_zone, pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 	pagevec_release(&pvec);
 
@@ -803,6 +848,29 @@ refill_inactive_zone(struct zone *zone, 
 	mod_page_state(pgdeactivate, pgdeactivate);
 }
 
+#ifdef CONFIG_CKRM_RES_MEM
+static void
+shrink_ckrmzone(struct ckrm_zone *czone, struct scan_control *sc)
+{
+	while (sc->ckrm_active || sc->ckrm_inactive) {
+		if (sc->ckrm_active) {
+			sc->nr_to_scan = (unsigned long)min(sc->ckrm_active,
+					SWAP_CLUSTER_MAX);
+			sc->ckrm_active -= sc->nr_to_scan;
+			refill_inactive_zone(czone, sc);
+		}
+		if (sc->ckrm_inactive) {
+			sc->nr_to_scan = (unsigned long)min(sc->ckrm_inactive,
+					SWAP_CLUSTER_MAX);
+			sc->ckrm_inactive -= sc->nr_to_scan;
+			shrink_cache(czone, sc);
+			if (sc->nr_to_reclaim <= 0)
+				break;
+		}
+	}
+}
+#endif
+
 /*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -832,6 +900,20 @@ shrink_zone(struct zone *zone, struct sc
 
 	sc->nr_to_reclaim = sc->swap_cluster_max;
 
+#ifdef CONFIG_CKRM_RES_MEM
+	while (nr_active || nr_inactive) {
+		int zindex = zone_idx(zone);
+		struct ckrm_zone *czone;
+
+		czone = ckrm_get_max_overguar_czone(zindex);
+		sc->ckrm_active = min(nr_active, czone->active_over);
+		sc->ckrm_inactive = min(nr_inactive, czone->inactive_over);
+		nr_active -= sc->ckrm_active;
+		nr_inactive -= sc->ckrm_inactive;
+		shrink_ckrmzone(czone, sc);
+		kref_put(&czone->memcls->nr_users, memclass_release);
+	}
+#else
 	while (nr_active || nr_inactive) {
 		if (nr_active) {
 			sc->nr_to_scan = min(nr_active,
@@ -849,6 +931,7 @@ shrink_zone(struct zone *zone, struct sc
 				break;
 		}
 	}
+#endif
 
 	throttle_vm_writeout();
 }

-- 

----------------------------------------------------------------------
    Chandra Seetharaman               | Be careful what you choose....
              - sekharan@us.ibm.com   |      .......you may get it.
----------------------------------------------------------------------



* [PATCH 4/6] CKRM: Add guarantee support for mem controller
@ 2005-05-19  0:32 Chandra Seetharaman
  0 siblings, 0 replies; 10+ messages in thread
From: Chandra Seetharaman @ 2005-05-19  0:32 UTC (permalink / raw)
  To: ckrm-tech, linux-mm

Patch 4 of 6 in the series that adds a memory controller under the CKRM framework.
This patch provides guarantee support for the controller.
----------------------------------------
The following changes have been made since the last release:
 - fix for a bug in the statistics counting, which was leading to OOM failures
 - design change to allow multiple execution threads in the reclamation path
 - instead of going through the whole list of classes, go through only the
   classes that are over their guarantee (see the sketch after this changelog)
 - fix for a bug in the limit handling path, that was leading to multiple list
   deletions
 - added a config parameter "state" to turn the mem controller on/off. The
   memory controller is now off by default, and is turned on as soon as a new
   class is defined or the default class's shares are modified
 - code cleanup
----------------------------------------
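
To make the over-guarantee change in the changelog concrete, here is a small
sketch (plain C, outside the kernel) of the selection that
ckrm_get_max_overguar_czone() in the diff below performs: among the per-zone
ckrm_zone structures currently over their guarantee, pick the one that is over
by the largest amount, falling back to the root class's zone. struct
czone_sample and max_overguar() are made-up names for illustration; the
locking and the kref get/put on the owning class are omitted here.

#include <stdio.h>

/* Cut-down stand-in for struct ckrm_zone; the real one also carries the
 * per-class LRU lists, the guar_list linkage and a pointer to its class. */
struct czone_sample {
	unsigned long nr_active;
	unsigned long nr_inactive;
	long guar;			/* guarantee, in pages */
};

/*
 * Mirrors the scan in ckrm_get_max_overguar_czone(): return the zone whose
 * usage (active + inactive) exceeds its guarantee by the most, or the
 * fallback (the root class's ckrm_zone in the patch) if none is over.
 */
static struct czone_sample *
max_overguar(struct czone_sample *zones, int n, struct czone_sample *fallback)
{
	struct czone_sample *max = fallback;
	long max_over = 0;
	int i;

	for (i = 0; i < n; i++) {
		long usage = zones[i].nr_active + zones[i].nr_inactive;

		if (usage - zones[i].guar > max_over) {
			max_over = usage - zones[i].guar;
			max = &zones[i];
		}
	}
	return max;
}

int main(void)
{
	/* two made-up classes in one memory zone */
	struct czone_sample candidates[2] = {
		{ 4000, 3000, 6000 },	/* over its guarantee by  1000 pages */
		{ 9000, 8000, 5000 },	/* over its guarantee by 12000 pages */
	};
	struct czone_sample root = { 0, 0, 0 };
	struct czone_sample *pick = max_overguar(candidates, 2, &root);

	printf("selected zone is over by %ld pages\n",
	       (long)(pick->nr_active + pick->nr_inactive) - pick->guar);
	return 0;
}

The patch then derives the reclaim targets from the selected zone:
active_over is roughly the active pages above two thirds of the guarantee,
and inactive_over is that overshoot plus the inactive pages above one third
of the guarantee; SWAP_CLUSTER_MAX is used when the computed value would be
zero or negative, and shrink_zone() caps its scan counts by these values.
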

 include/linux/ckrm_mem.h        |   28 +++-
 include/linux/ckrm_mem_inline.h |  257 ++++++++++++++++++++++++++-----------
 include/linux/mm.h              |    2 
 include/linux/mm_inline.h       |   10 +
 include/linux/mmzone.h          |    2 
 kernel/ckrm/ckrm_memcore.c      |  105 ++++++++++++---
 kernel/ckrm/ckrm_memctlr.c      |  276 +++++++++++++++++++++++++++++++++-------
 mm/page_alloc.c                 |    4 
 mm/swap.c                       |    3 
 mm/vmscan.c                     |   99 +++++++++++++-
 10 files changed, 637 insertions(+), 149 deletions(-)

Content-Disposition: inline; filename=11-04-mem_limit-guar

Index: linux-2612-rc3/include/linux/ckrm_mem.h
===================================================================
--- linux-2612-rc3.orig/include/linux/ckrm_mem.h
+++ linux-2612-rc3/include/linux/ckrm_mem.h
@@ -26,6 +26,21 @@
 #include <linux/mmzone.h>
 #include <linux/ckrm_rc.h>
 
+struct ckrm_zone {
+	struct list_head active_list;
+	struct list_head inactive_list;
+
+	unsigned long nr_active;
+	unsigned long nr_inactive;
+	unsigned long active_over;
+	unsigned long inactive_over;
+
+	struct list_head guar_list;	/* list of all over guar classes */
+	struct zone *zone;
+	int guar;			/* guarantee(# pages) of this czone */
+	struct ckrm_mem_res *memcls;
+};
+
 struct ckrm_mem_res {
 	unsigned long flags;
 	struct ckrm_core_class *core;	/* the core i am part of... */
@@ -46,20 +61,29 @@ struct ckrm_mem_res {
 	int hier;			/* hiearchy level, root = 0 */
 	int implicit_guar;		/* for classes with don't care guar */
 	int nr_dontcare;		/* # of dont care children */
+
+	struct ckrm_zone ckrm_zone[MAX_NR_ZONES];
 };
 
 extern atomic_t ckrm_mem_real_count;
 extern struct ckrm_res_ctlr mem_rcbs;
 extern struct ckrm_mem_res *ckrm_mem_root_class;
+extern struct list_head ckrm_memclass_list;
+extern spinlock_t ckrm_mem_lock;
+extern spinlock_t ckrm_overguar_lock[MAX_NR_ZONES];
+extern int ckrm_nr_mem_classes;
+extern unsigned int ckrm_tot_lru_pages;
 
 extern void ckrm_mem_migrate_mm(struct mm_struct *, struct ckrm_mem_res *);
 extern void ckrm_mem_migrate_all_pages(struct ckrm_mem_res *,
 						struct ckrm_mem_res *);
 extern void memclass_release(struct kref *);
-extern void incr_use_count(struct ckrm_mem_res *, int, int);
-extern void decr_use_count(struct ckrm_mem_res *, int, int);
+extern void add_use_count(struct ckrm_mem_res *, int, int, int);
+extern void sub_use_count(struct ckrm_mem_res *, int, int, int);
 extern int ckrm_class_limit_ok(struct ckrm_mem_res *);
 
+extern struct ckrm_zone *ckrm_get_max_overguar_czone(int);
+
 #else
 
 #define ckrm_mem_migrate_mm(a, b)			do {} while (0)
Index: linux-2612-rc3/include/linux/ckrm_mem_inline.h
===================================================================
--- linux-2612-rc3.orig/include/linux/ckrm_mem_inline.h
+++ linux-2612-rc3/include/linux/ckrm_mem_inline.h
@@ -40,24 +40,43 @@ ckrm_memclass(struct ckrm_core_class *cl
 }
 
 static inline void
-ckrm_set_page_class(struct page *page, struct ckrm_mem_res *cls)
+set_page_ckrmzone( struct page *page, struct ckrm_zone *cz)
 {
-	if (!cls) {
-		if (!ckrm_mem_root_class)
-			return;
-		cls = ckrm_mem_root_class;
-	}
-	if (page->ckrm_class)
-		kref_put(&page->ckrm_class->nr_users, memclass_release);
-	page->ckrm_class = cls;
-	kref_get(&cls->nr_users);
-	incr_use_count(cls, 0, page_zonenum(page));
+	page->ckrm_zone = cz;
+}
+
+static inline struct ckrm_zone *
+page_ckrmzone(struct page *page)
+{
+	return page->ckrm_zone;
+}
+
+/*
+ * Currently, a shared page that is shared by multiple classes is charged
+ * to a class with max available guarantee. Simply replace this function
+ * for other policies.
+ */
+static inline int
+ckrm_mem_share_compare(struct ckrm_mem_res *a, struct ckrm_mem_res *b)
+{
+	if (a == NULL)
+		return -(b != NULL);
+	if (b == NULL)
+		return 1;
+	if (a->pg_guar == b->pg_guar)
+		return 0;
+	if (a->pg_guar == CKRM_SHARE_DONTCARE)
+		return 1;
+	if (b->pg_guar == CKRM_SHARE_DONTCARE)
+		return -1;
+	return (a->pg_unused - b->pg_unused);
 }
 
 static inline void
 ckrm_change_page_class(struct page *page, struct ckrm_mem_res *newcls)
 {
-	struct ckrm_mem_res *oldcls = page->ckrm_class;
+	struct ckrm_zone *old_czone = page_ckrmzone(page), *new_czone;
+	struct ckrm_mem_res *oldcls;
 	int zindex = page_zonenum(page);
 
 	if  (!newcls) {
@@ -66,27 +85,62 @@ ckrm_change_page_class(struct page *page
 		newcls = ckrm_mem_root_class;
 	}
 
+	oldcls = old_czone->memcls;
 	if (oldcls == newcls)
 		return;
 
 	if (oldcls) {
 		kref_put(&oldcls->nr_users, memclass_release);
-		decr_use_count(oldcls, 0, zindex);
+		sub_use_count(oldcls, 0, zindex, 1);
 	}
 
-	page->ckrm_class = newcls;
+	new_czone = &newcls->ckrm_zone[page_zonenum(page)];
+	set_page_ckrmzone(page, new_czone);
 	kref_get(&newcls->nr_users);
-	incr_use_count(newcls, 0, zindex);
+	add_use_count(newcls, 0, zindex, 1);
+
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		old_czone->nr_active--;
+		new_czone->nr_active++;
+		list_add(&page->lru, &new_czone->active_list);
+	} else {
+		old_czone->nr_inactive--;
+		new_czone->nr_inactive++;
+		list_add(&page->lru, &new_czone->inactive_list);
+	}
 }
 
 static inline void
-ckrm_clear_page_class(struct page *page)
+ckrm_set_page_class(struct page *page, struct ckrm_mem_res *cls)
 {
-	struct ckrm_mem_res *cls = page->ckrm_class;
-	if (cls) {
-		decr_use_count(cls, 0, page_zonenum(page));
-		kref_put(&cls->nr_users, memclass_release);
+	struct ckrm_zone *czone;
+
+	if (page_ckrmzone(page))
+		ckrm_change_page_class(page, cls); /* or die ??!! */
+
+	if (!cls) {
+		if (!ckrm_mem_root_class) {
+			set_page_ckrmzone(page, NULL);
+			return;
+		}
+		cls = ckrm_mem_root_class;
 	}
+	czone = &cls->ckrm_zone[page_zonenum(page)];
+	set_page_ckrmzone(page, czone);
+	kref_get(&cls->nr_users);
+	add_use_count(cls, 0, page_zonenum(page), 1);
+}
+
+static inline void
+ckrm_clear_page_class(struct page *page)
+{
+	struct ckrm_zone *czone = page_ckrmzone(page);
+	if (czone == NULL)
+		return;
+	sub_use_count(czone->memcls, 0, page_zonenum(page), 1);
+	kref_put(&czone->memcls->nr_users, memclass_release);
+	set_page_ckrmzone(page, NULL);
 }
 
 static inline void
@@ -94,18 +148,27 @@ ckrm_mem_inc_active(struct page *page)
 {
 	struct ckrm_mem_res *cls = ckrm_task_memclass(current)
 						?: ckrm_mem_root_class;
-
-	if (!cls)
-		return;
-	ckrm_set_page_class(page, cls);
+	struct ckrm_zone *czone;
+  
+	if (cls == NULL)
+  		return;
+
+  	ckrm_set_page_class(page, cls);
+	czone = page_ckrmzone(page);
+	czone->nr_active++;
+	list_add(&page->lru, &czone->active_list);
 }
 
 static inline void
 ckrm_mem_dec_active(struct page *page)
 {
-	if (page->ckrm_class == NULL)
-		return;
-	ckrm_clear_page_class(page);
+	struct ckrm_zone *czone = page_ckrmzone(page);
+	if (czone == NULL)
+  		return;
+
+	list_del(&page->lru);
+	czone->nr_active--;
+  	ckrm_clear_page_class(page);
 }
 
 static inline void
@@ -113,26 +176,69 @@ ckrm_mem_inc_inactive(struct page *page)
 {
 	struct ckrm_mem_res *cls = ckrm_task_memclass(current)
 					?: ckrm_mem_root_class;
-
-	if (!cls)
+	struct ckrm_zone *czone;
+	if (cls == NULL)
 		return;
+
 	ckrm_set_page_class(page, cls);
+	czone = page_ckrmzone(page);
+	czone->nr_inactive++;
+	list_add(&page->lru, &czone->inactive_list);
 }
 
 static inline void
 ckrm_mem_dec_inactive(struct page *page)
 {
-	if (!page->ckrm_class)
+	struct ckrm_zone *czone = page_ckrmzone(page);
+	if (czone == NULL)
 		return;
+
+	czone->nr_inactive--;
+	list_del(&page->lru);
 	ckrm_clear_page_class(page);
 }
 
 static inline void
 ckrm_page_init(struct page *page)
 {
-	page->ckrm_class = NULL;
+	set_page_ckrmzone(page, NULL);
 }
 
+static inline void
+ckrm_zone_add_active(struct ckrm_zone *czone, int cnt)
+{
+	czone->nr_active += cnt;
+	add_use_count(czone->memcls, 0, zone_idx(czone->zone), cnt);
+	while (cnt--)
+		kref_get(&czone->memcls->nr_users);
+}
+
+static inline void
+ckrm_zone_add_inactive(struct ckrm_zone *czone, int cnt)
+{
+	czone->nr_inactive += cnt;
+	add_use_count(czone->memcls, 0, zone_idx(czone->zone), cnt);
+	while (cnt--)
+		kref_get(&czone->memcls->nr_users);
+}
+
+static inline void
+ckrm_zone_sub_active(struct ckrm_zone *czone, int cnt)
+{
+	czone->nr_active -= cnt;
+	sub_use_count(czone->memcls, 0, zone_idx(czone->zone), cnt);
+	while (cnt--)
+		kref_put(&czone->memcls->nr_users, memclass_release);
+}
+
+static inline void
+ckrm_zone_sub_inactive(struct ckrm_zone *czone, int cnt)
+{
+	czone->nr_inactive -= cnt;
+	sub_use_count(czone->memcls, 0, zone_idx(czone->zone), cnt);
+	while (cnt--)
+		kref_put(&czone->memcls->nr_users, memclass_release);
+}
 
 /* task/mm initializations/cleanup */
 
@@ -193,43 +299,30 @@ ckrm_mm_init(struct mm_struct *mm)
 static inline void
 ckrm_mm_setclass(struct mm_struct *mm, struct ckrm_mem_res *cls)
 {
-	if (cls) {
-		mm->memclass = cls;
-		kref_get(&cls->nr_users);
-	}
+	if (!cls)
+		return;
+	mm->memclass = cls;
+	kref_get(&cls->nr_users);
 }
 
 static inline void
 ckrm_mm_clearclass(struct mm_struct *mm)
 {
-	if (mm->memclass) {
-		kref_put(&mm->memclass->nr_users, memclass_release);
-		mm->memclass = NULL;
-	}
-}
-
-#else
-
-static inline void
-ckrm_task_mm_init(struct task_struct *tsk)
-{
+	if (!mm->memclass)
+		return;
+	kref_put(&mm->memclass->nr_users, memclass_release);
+	mm->memclass = NULL;
 }
 
-static inline void
-ckrm_task_mm_set(struct mm_struct * mm, struct task_struct *task)
-{
-}
+static inline void ckrm_init_lists(struct zone *zone) 			{}
 
-static inline void
-ckrm_task_mm_change(struct task_struct *tsk,
-		struct mm_struct *oldmm, struct mm_struct *newmm)
+static inline void ckrm_add_tail_inactive(struct page *page)
 {
+	 struct ckrm_zone *ckrm_zone = page_ckrmzone(page);
+	 list_add_tail(&page->lru, &ckrm_zone->inactive_list);
 }
 
-static inline void
-ckrm_task_mm_clear(struct task_struct *tsk, struct mm_struct *mm)
-{
-}
+#else
 
 static inline void *
 ckrm_task_memclass(struct task_struct *tsk)
@@ -237,27 +330,47 @@ ckrm_task_memclass(struct task_struct *t
 	return NULL;
 }
 
-static inline void
-ckrm_mm_init(struct mm_struct *mm)
-{
-}
+static inline void ckrm_clear_page_class(struct page *p)		{}
+
+static inline void ckrm_mem_inc_active(struct page *p)			{}
+static inline void ckrm_mem_dec_active(struct page *p)			{}
+static inline void ckrm_mem_inc_inactive(struct page *p)		{}
+static inline void ckrm_mem_dec_inactive(struct page *p)		{}
+
+#define ckrm_zone_add_active(a, b)	do {} while (0)
+#define ckrm_zone_add_inactive(a, b)	do {} while (0)
+#define ckrm_zone_sub_active(a, b)	do {} while (0)
+#define ckrm_zone_sub_inactive(a, b)	do {} while (0)
+
+#define ckrm_class_limit_ok(a)						(1)
+
+static inline void ckrm_page_init(struct page *p)			{}
+static inline void ckrm_task_mm_init(struct task_struct *tsk)		{}
+static inline void ckrm_task_mm_set(struct mm_struct * mm,
+					struct task_struct *task)	{}
+static inline void ckrm_task_mm_change(struct task_struct *tsk,
+		struct mm_struct *oldmm, struct mm_struct *newmm)	{}
+static inline void ckrm_task_mm_clear(struct task_struct *tsk,
+						struct mm_struct *mm)	{}
+
+static inline void ckrm_mm_init(struct mm_struct *mm)			{}
 
 /* using #define instead of static inline as the prototype requires   *
  * data structures that is available only with the controller enabled */
-#define ckrm_mm_setclass(a, b) do { } while(0)
-#define ckrm_class_limit_ok(a)	(1)
+#define ckrm_mm_setclass(a, b) 					do {} while(0)
 
-static inline void
-ckrm_mm_clearclass(struct mm_struct *mm)
+static inline void ckrm_mm_clearclass(struct mm_struct *mm)		{}
+
+static inline void ckrm_init_lists(struct zone *zone)
 {
+	INIT_LIST_HEAD(&zone->active_list);
+	INIT_LIST_HEAD(&zone->inactive_list);
 }
 
-static inline void ckrm_mem_inc_active(struct page *p)		{}
-static inline void ckrm_mem_dec_active(struct page *p)		{}
-static inline void ckrm_mem_inc_inactive(struct page *p)	{}
-static inline void ckrm_mem_dec_inactive(struct page *p)	{}
-static inline void ckrm_page_init(struct page *p)		{}
-static inline void ckrm_clear_page_class(struct page *p)	{}
-
+static inline void ckrm_add_tail_inactive(struct page *page)
+{
+	 struct zone *zone = page_zone(page);
+	 list_add_tail(&page->lru, &zone->inactive_list);
+}
 #endif 
 #endif /* _LINUX_CKRM_MEM_INLINE_H_ */
Index: linux-2612-rc3/include/linux/mm.h
===================================================================
--- linux-2612-rc3.orig/include/linux/mm.h
+++ linux-2612-rc3/include/linux/mm.h
@@ -259,7 +259,7 @@ struct page {
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
 #ifdef CONFIG_CKRM_RES_MEM
-	struct ckrm_mem_res *ckrm_class;
+	struct ckrm_zone *ckrm_zone;
 #endif
 };
 
Index: linux-2612-rc3/include/linux/mm_inline.h
===================================================================
--- linux-2612-rc3.orig/include/linux/mm_inline.h
+++ linux-2612-rc3/include/linux/mm_inline.h
@@ -3,7 +3,9 @@
 static inline void
 add_page_to_active_list(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_add(&page->lru, &zone->active_list);
+#endif
 	zone->nr_active++;
 	ckrm_mem_inc_active(page);
 }
@@ -11,7 +13,9 @@ add_page_to_active_list(struct zone *zon
 static inline void
 add_page_to_inactive_list(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_add(&page->lru, &zone->inactive_list);
+#endif
 	zone->nr_inactive++;
 	ckrm_mem_inc_inactive(page);
 }
@@ -19,7 +23,9 @@ add_page_to_inactive_list(struct zone *z
 static inline void
 del_page_from_active_list(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_del(&page->lru);
+#endif
 	zone->nr_active--;
 	ckrm_mem_dec_active(page);
 }
@@ -27,7 +33,9 @@ del_page_from_active_list(struct zone *z
 static inline void
 del_page_from_inactive_list(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_del(&page->lru);
+#endif
 	zone->nr_inactive--;
 	ckrm_mem_dec_inactive(page);
 }
@@ -35,7 +43,9 @@ del_page_from_inactive_list(struct zone 
 static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_del(&page->lru);
+#endif
 	if (PageActive(page)) {
 		ClearPageActive(page);
 		zone->nr_active--;
Index: linux-2612-rc3/include/linux/mmzone.h
===================================================================
--- linux-2612-rc3.orig/include/linux/mmzone.h
+++ linux-2612-rc3/include/linux/mmzone.h
@@ -135,8 +135,10 @@ struct zone {
 
 	/* Fields commonly accessed by the page reclaim scanner */
 	spinlock_t		lru_lock;	
+#ifndef CONFIG_CKRM_RES_MEM
 	struct list_head	active_list;
 	struct list_head	inactive_list;
+#endif
 	unsigned long		nr_scan_active;
 	unsigned long		nr_scan_inactive;
 	unsigned long		nr_active;
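
The mm_inline.h and mmzone.h hunks above move the LRU list heads out of
struct zone when the controller is built in: the zone keeps only its page
counters, while the pages themselves are linked onto the per-class ckrm_zone
lists by the ckrm_mem_inc_*/ckrm_mem_dec_* helpers.  A minimal user-space
sketch of that split (simplified stand-in structures, not the kernel types):

#include <stdio.h>

struct list_node { struct list_node *next; };

struct fake_zone {
	long nr_active;
	struct list_node *active_list;	/* used only when the controller is off */
};

struct fake_czone {
	long nr_active;
	struct list_node *active_list;	/* per-class list when the controller is on */
};

static void add_active(struct fake_zone *zone, struct fake_czone *czone,
		       struct list_node *page, int ckrm_enabled)
{
	zone->nr_active++;			/* the zone keeps the count either way */
	if (ckrm_enabled) {
		czone->nr_active++;
		page->next = czone->active_list;	/* page lives on the class list */
		czone->active_list = page;
	} else {
		page->next = zone->active_list;		/* classic per-zone LRU */
		zone->active_list = page;
	}
}

int main(void)
{
	struct fake_zone z = { 0, NULL };
	struct fake_czone cz = { 0, NULL };
	struct list_node p = { NULL };

	add_active(&z, &cz, &p, 1);
	printf("zone count %ld, class count %ld\n", z.nr_active, cz.nr_active);
	return 0;
}
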
Index: linux-2612-rc3/kernel/ckrm/ckrm_memcore.c
===================================================================
--- linux-2612-rc3.orig/kernel/ckrm/ckrm_memcore.c
+++ linux-2612-rc3/kernel/ckrm/ckrm_memcore.c
@@ -38,14 +38,20 @@
 #define CKRM_MEM_MAX_HIERARCHY 2 /* allows only upto 2 levels - 0, 1 & 2 */
 
 /* all 1-level memory_share_class are chained together */
-static LIST_HEAD(ckrm_memclass_list);
-static spinlock_t ckrm_mem_lock; /* protects list above */
-static unsigned int ckrm_tot_lru_pages; /* # of pages in the system */
-
-static int ckrm_nr_mem_classes = 0;
-
+LIST_HEAD(ckrm_memclass_list);
+spinlock_t ckrm_mem_lock; /* protects list above */
+unsigned int ckrm_tot_lru_pages; /* # of pages in the system */
+int ckrm_nr_mem_classes = 0;
 struct ckrm_mem_res *ckrm_mem_root_class;
 atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
+
+struct list_head ckrm_overguar_list[MAX_NR_ZONES];
+spinlock_t ckrm_overguar_lock[MAX_NR_ZONES]; /* protects the above list */
+
+EXPORT_SYMBOL_GPL(ckrm_memclass_list);
+EXPORT_SYMBOL_GPL(ckrm_mem_lock);
+EXPORT_SYMBOL_GPL(ckrm_tot_lru_pages);
+EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes);
 EXPORT_SYMBOL_GPL(ckrm_mem_root_class);
 EXPORT_SYMBOL_GPL(ckrm_mem_real_count);
 
@@ -80,6 +86,9 @@ set_ckrm_tot_pages(void)
 static void
 mem_res_initcls_one(struct ckrm_mem_res *res)
 {
+	int zindex = 0;
+	struct zone *zone;
+
 	memset(res, 0, sizeof(struct ckrm_mem_res));
 
 	res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
@@ -91,14 +100,65 @@ mem_res_initcls_one(struct ckrm_mem_res 
 
 	res->pg_guar = CKRM_SHARE_DONTCARE;
 	res->pg_limit = CKRM_SHARE_DONTCARE;
+	res->implicit_guar = CKRM_SHARE_DONTCARE;
 
 	INIT_LIST_HEAD(&res->mcls_list);
 
+	for_each_zone(zone) {
+		INIT_LIST_HEAD(&res->ckrm_zone[zindex].active_list);
+		INIT_LIST_HEAD(&res->ckrm_zone[zindex].inactive_list);
+		INIT_LIST_HEAD(&res->ckrm_zone[zindex].guar_list);
+		res->ckrm_zone[zindex].nr_active = 0;
+		res->ckrm_zone[zindex].nr_inactive = 0;
+		res->ckrm_zone[zindex].zone = zone;
+		res->ckrm_zone[zindex].memcls = res;
+		zindex++;
+	}
+
 	res->pg_unused = 0;
 	res->nr_dontcare = 1; /* for default class */
 	kref_init(&res->nr_users);
 }
 
+static inline void
+set_zone_guarantees(struct ckrm_mem_res *cls)
+{
+	int i, guar, total;
+	struct zone *zone;
+	u64 temp;
+
+	if (cls->pg_guar == CKRM_SHARE_DONTCARE)
+		/* no guarantee for this class. use implicit guarantee */
+		guar = cls->implicit_guar / cls->nr_dontcare;
+	else
+		guar = cls->pg_unused / cls->nr_dontcare;
+
+	i = 0;
+	for_each_zone(zone) {
+
+		if (zone->present_pages == 0) {
+			cls->ckrm_zone[i].guar = 0;
+			i++;
+			continue;
+		}
+
+		/*
+		 * Guarantee for a ckrmzone is calculated from the class's
+		 * guarantee and the pages_low of the zone proportional
+		 * to the ckrmzone.
+		 */
+		total = zone->nr_active + zone->nr_inactive + zone->free_pages;
+		temp = (u64) guar * total;
+		do_div(temp, ckrm_tot_lru_pages);
+		cls->ckrm_zone[i].guar = (int) temp;
+
+		temp = zone->pages_low * cls->ckrm_zone[i].guar;
+		do_div(temp, zone->present_pages);
+		cls->ckrm_zone[i].guar -= (int) temp;
+		i++;
+	}
+}
+
 static void
 set_impl_guar_children(struct ckrm_mem_res *parres)
 {
@@ -106,11 +166,10 @@ set_impl_guar_children(struct ckrm_mem_r
 	struct ckrm_mem_res *cres;
 	int nr_dontcare = 1; /* for defaultclass */
 	int guar, impl_guar;
-	int resid = mem_rcbs.resid;
 
 	ckrm_lock_hier(parres->core);
 	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
-		cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
+		cres = ckrm_memclass(child);
 		/* treat NULL cres as don't care as that child is just being
 		 * created.
 		 * FIXME: need a better way to handle this case.
@@ -125,13 +184,14 @@ set_impl_guar_children(struct ckrm_mem_r
 	impl_guar = guar / parres->nr_dontcare;
 
 	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
-		cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
+		cres = ckrm_memclass(child);
 		if (cres && cres->pg_guar == CKRM_SHARE_DONTCARE) {
 			cres->implicit_guar = impl_guar;
 			set_impl_guar_children(cres);
 		}
 	}
 	ckrm_unlock_hier(parres->core);
+	set_zone_guarantees(parres);
 
 }
 
@@ -142,7 +202,7 @@ mem_res_alloc(struct ckrm_core_class *co
 
 	BUG_ON(mem_rcbs.resid == -1);
 
-	pres = ckrm_get_res_class(parent, mem_rcbs.resid, struct ckrm_mem_res);
+	pres = ckrm_memclass(parent);
 	if (pres && (pres->hier == CKRM_MEM_MAX_HIERARCHY)) {
 		printk(KERN_ERR "MEM_RC: only allows hieararchy of %d\n",
 						CKRM_MEM_MAX_HIERARCHY);
@@ -184,6 +244,7 @@ mem_res_alloc(struct ckrm_core_class *co
 				pres->implicit_guar : pres->pg_unused;
 			res->implicit_guar = guar / pres->nr_dontcare;
 		}
+		set_zone_guarantees(res);
 		ckrm_nr_mem_classes++;
 	} else
 		printk(KERN_ERR "MEM_RC: alloc: GFP_ATOMIC failed\n");
@@ -206,8 +267,7 @@ child_maxlimit_changed_local(struct ckrm
 	/* run thru parent's children and get new max_limit of parent */
 	ckrm_lock_hier(parres->core);
 	while ((child = ckrm_get_next_child(parres->core, child)) != NULL) {
-		childres = ckrm_get_res_class(child, mem_rcbs.resid,
-				struct ckrm_mem_res);
+		childres = ckrm_memclass(child);
 		if (maxlimit < childres->shares.my_limit)
 			maxlimit = childres->shares.my_limit;
 	}
@@ -225,7 +285,6 @@ recalc_and_propagate(struct ckrm_mem_res
 {
 	struct ckrm_core_class *child = NULL;
 	struct ckrm_mem_res *cres;
-	int resid = mem_rcbs.resid;
 	struct ckrm_shares *self = &res->shares;
 
 	if (parres) {
@@ -266,10 +325,11 @@ recalc_and_propagate(struct ckrm_mem_res
 	} else
 		res->pg_unused = 0;
 
+	set_zone_guarantees(res);
 	/* propagate to children */
 	ckrm_lock_hier(res->core);
 	while ((child = ckrm_get_next_child(res->core, child)) != NULL) {
-		cres = ckrm_get_res_class(child, resid, struct ckrm_mem_res);
+		cres = ckrm_memclass(child);
 		recalc_and_propagate(cres, res);
 	}
 	ckrm_unlock_hier(res->core);
@@ -281,14 +341,14 @@ mem_res_free(void *my_res)
 {
 	struct ckrm_mem_res *res = my_res;
 	struct ckrm_mem_res *pres;
+	int i;
 
 	if (!res)
 		return;
 
 	ckrm_mem_migrate_all_pages(res, ckrm_mem_root_class);
 
-	pres = ckrm_get_res_class(res->parent, mem_rcbs.resid,
-			struct ckrm_mem_res);
+	pres = ckrm_memclass(res->parent);
 
 	if (pres) {
 		child_guarantee_changed(&pres->shares,
@@ -308,6 +368,9 @@ mem_res_free(void *my_res)
 	res->pg_limit = 0;
 	res->pg_unused = 0;
 
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		BUG_ON(!list_empty(&res->ckrm_zone[i].guar_list));
+
 	spin_lock_irq(&ckrm_mem_lock);
 	list_del_init(&res->mcls_list);
 	spin_unlock_irq(&ckrm_mem_lock);
@@ -329,8 +392,7 @@ mem_set_share_values(void *my_res, struc
 	if (!res)
 		return -EINVAL;
 
-	parres = ckrm_get_res_class(res->parent, mem_rcbs.resid,
-		struct ckrm_mem_res);
+	parres = ckrm_memclass(res->parent);
 
 	rc = set_shares(shares, &res->shares, parres ? &parres->shares : NULL);
 
@@ -494,10 +556,15 @@ int __init
 init_ckrm_mem_res(void)
 {
 	struct ckrm_classtype *clstype;
-	int resid = mem_rcbs.resid;
+	int i, resid = mem_rcbs.resid;
 
 	set_ckrm_tot_pages();
 	spin_lock_init(&ckrm_mem_lock);
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		INIT_LIST_HEAD(&ckrm_overguar_list[i]);
+		spin_lock_init(&ckrm_overguar_lock[i]);
+	}
+
 	clstype = ckrm_find_classtype_by_name("taskclass");
 	if (clstype == NULL) {
 		printk(KERN_INFO " Unknown ckrm classtype<taskclass>");
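
To make the arithmetic in set_zone_guarantees() above concrete, here is a
minimal user-space sketch (not part of the patch; the numbers are made up)
of how a class guarantee is split across zones in proportion to each zone's
share of LRU pages, less the zone's pages_low reserve:

/*
 * Sketch of the per-zone guarantee calculation:
 *
 *   czone_guar  = class_guar * zone_lru_pages / ckrm_tot_lru_pages
 *   czone_guar -= zone_pages_low * czone_guar / zone_present_pages
 */
#include <stdio.h>

static long zone_guarantee(long class_guar, long zone_lru_pages,
			   long tot_lru_pages, long pages_low,
			   long present_pages)
{
	long long guar;

	if (tot_lru_pages == 0 || present_pages == 0)
		return 0;

	/* class guarantee spread in proportion to the zone's share of the
	 * system's LRU pages (the patch uses a u64 and do_div for this) */
	guar = (long long)class_guar * zone_lru_pages / tot_lru_pages;

	/* then hold back the zone's pages_low watermark proportionally */
	guar -= (long long)pages_low * guar / present_pages;
	return (long)guar;
}

int main(void)
{
	/* hypothetical: 100000-page guarantee, 262144-page zone, 1048576 LRU pages */
	printf("per-zone guarantee: %ld pages\n",
	       zone_guarantee(100000, 262144, 1048576, 1024, 262144));
	return 0;
}
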
Index: linux-2612-rc3/kernel/ckrm/ckrm_memctlr.c
===================================================================
--- linux-2612-rc3.orig/kernel/ckrm/ckrm_memctlr.c
+++ linux-2612-rc3/kernel/ckrm/ckrm_memctlr.c
@@ -14,53 +14,98 @@
  *
  */
 
+#include <linux/swap.h>
+#include <linux/pagemap.h>
 #include <linux/ckrm_mem_inline.h>
 
+extern struct list_head ckrm_overguar_list[];
+
+static inline void
+ckrm_add_to_guar_list(struct ckrm_zone *czone, int zindex)
+{
+	int usage;
+	unsigned long flags;
+
+	/* fast path: return if this czone is already on the list */
+	if (!list_empty(&czone->guar_list))
+		return;
+
+	usage = czone->nr_active + czone->nr_inactive;
+	if (usage > czone->guar) {
+		spin_lock_irqsave(&ckrm_overguar_lock[zindex], flags);
+		if (list_empty(&czone->guar_list))
+			list_add_tail(&czone->guar_list,
+				&ckrm_overguar_list[zindex]);
+		spin_unlock_irqrestore(&ckrm_overguar_lock[zindex], flags);
+	}
+}
+
+static inline void
+ckrm_del_from_guar_list(struct ckrm_zone *czone, int zindex)
+{
+	int usage;
+	unsigned long flags;
+
+	/* fast path, return immediately if we are not in the list */
+	if (list_empty(&czone->guar_list))
+		return;
+
+	usage = czone->nr_active + czone->nr_inactive;
+	if (usage <= czone->guar) {
+		spin_lock_irqsave(&ckrm_overguar_lock[zindex], flags);
+		if (!list_empty(&czone->guar_list))
+			list_del_init(&czone->guar_list);
+		spin_unlock_irqrestore(&ckrm_overguar_lock[zindex], flags);
+	}
+}
+
 void
-incr_use_count(struct ckrm_mem_res *cls, int borrow, int zindex)
+add_use_count(struct ckrm_mem_res *cls, int borrow, int zindex, int cnt)
 {
 	int i, pg_total = 0;
-	struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent,
-				mem_rcbs.resid, struct ckrm_mem_res);
+	struct ckrm_mem_res *parcls = ckrm_memclass(cls->parent);
 
 	if (!cls)
 		return;
 
-	cls->pg_total[zindex]++;
+	cls->pg_total[zindex] += cnt;
 	for (i = 0; i < MAX_NR_ZONES; i++)
 		pg_total += cls->pg_total[i];
 	if (borrow)
-		cls->pg_lent[zindex]++;
+		cls->pg_lent[zindex] += cnt;
 
-	parcls = ckrm_get_res_class(cls->parent,
-				mem_rcbs.resid, struct ckrm_mem_res);
+	parcls = ckrm_memclass(cls->parent);
 	if (parcls && ((cls->pg_guar == CKRM_SHARE_DONTCARE) ||
 			(pg_total > cls->pg_unused))) {
-		incr_use_count(parcls, 1, zindex);
-		cls->pg_borrowed[zindex]++;
+		add_use_count(parcls, 1, zindex, cnt);
+		cls->pg_borrowed[zindex] += cnt;
 	} else
-		atomic_inc(&ckrm_mem_real_count);
+		atomic_add(cnt, &ckrm_mem_real_count);
+	ckrm_add_to_guar_list(&cls->ckrm_zone[zindex], zindex);
 	return;
 }
 
 void
-decr_use_count(struct ckrm_mem_res *cls, int borrowed, int zindex)
+sub_use_count(struct ckrm_mem_res *cls, int borrowed, int zindex, int cnt)
 {
+	int borrow_cnt = 0;
+
 	if (!cls)
 		return;
-	cls->pg_total[zindex]--;
+	cls->pg_total[zindex] -= cnt;
 	if (borrowed)
-		cls->pg_lent[zindex]--;
-	if (cls->pg_borrowed > 0) {
-		struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent,
-				mem_rcbs.resid, struct ckrm_mem_res);
+		cls->pg_lent[zindex] -= cnt;
+	if (cls->pg_borrowed[zindex] > 0) {
+		struct ckrm_mem_res *parcls = ckrm_memclass(cls->parent);
 		if (parcls) {
-			decr_use_count(parcls, 1, zindex);
-			cls->pg_borrowed[zindex]--;
-			return;
+			borrow_cnt = min(cnt, cls->pg_borrowed[zindex]);
+			sub_use_count(parcls, 1, zindex, borrow_cnt);
+			cls->pg_borrowed[zindex] -= borrow_cnt;
 		}
 	}
-	atomic_dec(&ckrm_mem_real_count);
+	atomic_sub(cnt - borrow_cnt, &ckrm_mem_real_count);
+	ckrm_del_from_guar_list(&cls->ckrm_zone[zindex], zindex);
+	return;
 }
 
 int
@@ -73,8 +118,7 @@ ckrm_class_limit_ok(struct ckrm_mem_res 
 	for (i = 0; i < MAX_NR_ZONES; i++)
 		pg_total += cls->pg_total[i];
 	if (cls->pg_limit == CKRM_SHARE_DONTCARE) {
-		struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent,
-					mem_rcbs.resid, struct ckrm_mem_res);
+		struct ckrm_mem_res *parcls = ckrm_memclass(cls->parent);
 		ret = (parcls ? ckrm_class_limit_ok(parcls) : 0);
 	} else
 		ret = (pg_total <= cls->pg_limit);
@@ -82,8 +126,88 @@ ckrm_class_limit_ok(struct ckrm_mem_res 
 	return ret;
 }
 
+static int
+ckrm_mem_evaluate_page_anon(struct page* page)
+{
+	struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
+	struct ckrm_mem_res* maxshareclass = NULL;
+	struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
+	struct vm_area_struct *vma;
+	struct mm_struct* mm;
+	int ret = 0;
+
+	if (!spin_trylock(&anon_vma->lock))
+		return 0;
+	BUG_ON(list_empty(&anon_vma->head));
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		mm = vma->vm_mm;
+		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
+				mm->memclass) < 0) {
+			maxshareclass = mm->memclass;
+		}
+	}
+	spin_unlock(&anon_vma->lock);
+
+	if (!maxshareclass)
+		maxshareclass = ckrm_mem_root_class;
+	if (pgcls != maxshareclass) {
+		ckrm_change_page_class(page, maxshareclass);
+		ret = 1;
+	}
+	return ret;
+}
+
+static int
+ckrm_mem_evaluate_page_file(struct page* page)
+{
+	struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
+	struct ckrm_mem_res* maxshareclass = NULL;
+	struct address_space *mapping = page->mapping;
+	struct vm_area_struct *vma = NULL;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct prio_tree_iter iter;
+	struct mm_struct* mm;
+	int ret = 0;
+
+	if (!mapping)
+		return 0;
+
+	if (!spin_trylock(&mapping->i_mmap_lock))
+		return 0;
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap,
+					pgoff, pgoff) {
+		mm = vma->vm_mm;
+		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
+				mm->memclass)<0)
+			maxshareclass = mm->memclass;
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+
+	if (!maxshareclass)
+		maxshareclass = ckrm_mem_root_class;
+	if (pgcls != maxshareclass) {
+		ckrm_change_page_class(page, maxshareclass);
+		ret = 1;
+	}
+	return ret;
+}
+
+static int
+ckrm_mem_evaluate_page(struct page* page)
+{
+	int ret = 0;
+	if (page->mapping) {
+		if (PageAnon(page))
+			ret = ckrm_mem_evaluate_page_anon(page);
+		else
+			ret = ckrm_mem_evaluate_page_file(page);
+	}
+	return ret;
+}
+
 static void migrate_list(struct list_head *list,
-	struct ckrm_mem_res* from, struct ckrm_mem_res* to)
+	struct ckrm_mem_res* from, struct ckrm_mem_res* def)
 {
 	struct page *page;
 	struct list_head *pos, *next;
@@ -92,21 +216,26 @@ static void migrate_list(struct list_hea
 	while (pos != list) {
 		next = pos->next;
 		page = list_entry(pos, struct page, lru);
-		if (page->ckrm_class == from) 
-			ckrm_change_page_class(page, to);
+		if (ckrm_mem_evaluate_page(page))
+			ckrm_change_page_class(page, def);
 		pos = next;
 	}
 }
 
 void
-ckrm_mem_migrate_all_pages(struct ckrm_mem_res* from, struct ckrm_mem_res* to)
+ckrm_mem_migrate_all_pages(struct ckrm_mem_res* from,
+					struct ckrm_mem_res* def)
 {
+	int i;
 	struct zone *zone;
-
-	for_each_zone(zone) {
+	struct ckrm_zone *ckrm_zone;
+  
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		ckrm_zone = &from->ckrm_zone[i];
+		zone = ckrm_zone->zone;
 		spin_lock_irq(&zone->lru_lock);
-		migrate_list(&zone->inactive_list, from, to);
-		migrate_list(&zone->active_list, from, to);
+		migrate_list(&ckrm_zone->inactive_list, from, def);
+		migrate_list(&ckrm_zone->active_list, from, def);
 		spin_unlock_irq(&zone->lru_lock);
 	}
 	return;
@@ -131,8 +260,13 @@ class_migrate_pmd(struct mm_struct* mm, 
 		pte = pte_offset_map(pmdir, address);
 		if (pte_present(*pte)) {
 			struct page *page = pte_page(*pte);
-			if (page->mapping)
+			struct ckrm_zone *czone = page_ckrmzone(page);
+			if (page->mapping && czone) {
+				struct zone *zone = czone->zone;
+				spin_lock_irq(&zone->lru_lock);
 				ckrm_change_page_class(page, mm->memclass);
+				spin_unlock_irq(&zone->lru_lock);
+			}
 		}
 		address += PAGE_SIZE;
 		pte_unmap(pte);
@@ -190,7 +324,9 @@ class_migrate_vma(struct mm_struct* mm, 
 void
 ckrm_mem_migrate_mm(struct mm_struct* mm, struct ckrm_mem_res *def)
 {
+	struct task_struct *task;
 	struct vm_area_struct *vma;
+	struct ckrm_mem_res *maxshareclass = def;
 
 	/* We leave the mm->memclass untouched since we believe that one
 	 * mm with no task associated will be deleted soon or attach
@@ -199,18 +335,72 @@ ckrm_mem_migrate_mm(struct mm_struct* mm
 	if (list_empty(&mm->tasklist))
 		return;
 
-	if (mm->memclass)
-		kref_put(&mm->memclass->nr_users, memclass_release);
-	mm->memclass = def ?: ckrm_mem_root_class;
-	kref_get(&mm->memclass->nr_users);
-
-	/* Go through all VMA to migrate pages */
-	down_read(&mm->mmap_sem);
-	vma = mm->mmap;
-	while(vma) {
-		class_migrate_vma(mm, vma);
-		vma = vma->vm_next;
+	list_for_each_entry(task, &mm->tasklist, mm_peers) {
+		struct ckrm_mem_res* cls = ckrm_task_memclass(task);
+		if (!cls)
+			continue;
+		if (!maxshareclass ||
+				ckrm_mem_share_compare(maxshareclass,cls)<0 )
+			maxshareclass = cls;
+	}
+
+	if (maxshareclass && (mm->memclass != maxshareclass)) {
+		if (mm->memclass)
+			kref_put(&mm->memclass->nr_users, memclass_release);
+		mm->memclass = maxshareclass;
+		kref_get(&maxshareclass->nr_users);
+
+		/* Go through all VMA to migrate pages */
+		down_read(&mm->mmap_sem);
+		vma = mm->mmap;
+		while(vma) {
+			class_migrate_vma(mm, vma);
+			vma = vma->vm_next;
+		}
+		up_read(&mm->mmap_sem);
 	}
-	up_read(&mm->mmap_sem);
 	return;
 }
+
+/*
+ * Returns the ckrm zone that exceeds its guarantee by the largest amount
+ * among all the ckrm zones that are over their respective guarantees.
+ *
+ * A reference to the class is held on return; the caller is responsible
+ * for dropping it (kref_put) when it is done with the ckrm zone.
+ */
+struct ckrm_zone *
+ckrm_get_max_overguar_czone(int zindex)
+{
+	struct ckrm_zone *czone;
+	struct ckrm_zone *maxczone = &ckrm_mem_root_class->ckrm_zone[zindex];
+	int max_overguar = 0, usage, cnt;
+	struct ckrm_mem_res *cls;
+
+	kref_get(&maxczone->memcls->nr_users);
+
+	spin_lock_irq(&ckrm_overguar_lock[zindex]);
+	list_for_each_entry(czone, &ckrm_overguar_list[zindex], guar_list) {
+		cls = czone->memcls;
+		usage = czone->nr_active + czone->nr_inactive;
+		if ((usage - czone->guar) > max_overguar) {
+			kref_put(&maxczone->memcls->nr_users, memclass_release);
+			max_overguar = usage - czone->guar;
+			maxczone = czone;
+			kref_get(&maxczone->memcls->nr_users);
+		}
+	}
+	spin_unlock_irq(&ckrm_overguar_lock[zindex]);
+	BUG_ON(maxczone == NULL);
+
+	/* calculate active_over and inactive_over */
+	cnt = maxczone->nr_active - (2 * maxczone->guar / 3);
+	maxczone->active_over = (cnt > 0) ? cnt : SWAP_CLUSTER_MAX;	
+	cnt = maxczone->active_over + maxczone->nr_inactive
+			 - (maxczone->guar / 3);
+	maxczone->inactive_over = (cnt > 0) ? cnt : SWAP_CLUSTER_MAX;	
+
+	return maxczone;
+}
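
As a rough illustration of the selection rule in ckrm_get_max_overguar_czone()
above, the following user-space sketch (illustrative structures and numbers,
not the kernel types) picks, among candidates that are over their guarantee,
the one that is over by the largest amount.  In the patch the candidates come
from ckrm_overguar_list[] and the chosen ckrm zone additionally gets its
active_over/inactive_over scan targets derived from its guarantee:

#include <stdio.h>

struct czone_sample {
	const char *name;
	long nr_active, nr_inactive, guar;
};

/* return the candidate that exceeds its guarantee by the largest amount */
static const struct czone_sample *max_overguar(const struct czone_sample *cz, int n)
{
	const struct czone_sample *best = NULL;
	long best_over = 0;
	int i;

	for (i = 0; i < n; i++) {
		long usage = cz[i].nr_active + cz[i].nr_inactive;
		long over = usage - cz[i].guar;

		if (over > best_over) {		/* strictly over its guarantee */
			best_over = over;
			best = &cz[i];
		}
	}
	return best;
}

int main(void)
{
	struct czone_sample samples[] = {
		{ "classA", 4000, 6000, 12000 },	/* under guarantee */
		{ "classB", 9000, 8000, 10000 },	/* 7000 pages over */
		{ "classC", 3000, 9000, 11000 },	/* 1000 pages over */
	};
	const struct czone_sample *victim = max_overguar(samples, 3);

	printf("reclaim from: %s\n", victim ? victim->name : "none");
	return 0;
}
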
Index: linux-2612-rc3/mm/page_alloc.c
===================================================================
--- linux-2612-rc3.orig/mm/page_alloc.c
+++ linux-2612-rc3/mm/page_alloc.c
@@ -356,7 +356,6 @@ free_pages_bulk(struct zone *zone, int c
 		/* have to delete it as __free_pages_bulk list manipulates */
 		list_del(&page->lru);
 		__free_pages_bulk(page, zone, order);
-		ckrm_clear_page_class(page);
 		ret++;
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
@@ -1696,8 +1695,7 @@ static void __init free_area_init_core(s
 		}
 		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
 				zone_names[j], realsize, batch);
-		INIT_LIST_HEAD(&zone->active_list);
-		INIT_LIST_HEAD(&zone->inactive_list);
+		ckrm_init_lists(zone);
 		zone->nr_scan_active = 0;
 		zone->nr_scan_inactive = 0;
 		zone->nr_active = 0;
Index: linux-2612-rc3/mm/swap.c
===================================================================
--- linux-2612-rc3.orig/mm/swap.c
+++ linux-2612-rc3/mm/swap.c
@@ -30,6 +30,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/init.h>
+#include <linux/ckrm_mem_inline.h>
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
@@ -87,7 +88,7 @@ int rotate_reclaimable_page(struct page 
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	if (PageLRU(page) && !PageActive(page)) {
 		list_del(&page->lru);
-		list_add_tail(&page->lru, &zone->inactive_list);
+		ckrm_add_tail_inactive(page);
 		inc_page_state(pgrotated);
 	}
 	if (!test_clear_page_writeback(page))
Index: linux-2612-rc3/mm/vmscan.c
===================================================================
--- linux-2612-rc3.orig/mm/vmscan.c
+++ linux-2612-rc3/mm/vmscan.c
@@ -33,6 +33,7 @@
 #include <linux/cpuset.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
+#include <linux/ckrm_mem.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -79,6 +80,11 @@ struct scan_control {
 	 * In this context, it doesn't matter that we scan the
 	 * whole list at once. */
 	int swap_cluster_max;
+
+#ifdef CONFIG_CKRM_RES_MEM
+	int ckrm_active;
+	int ckrm_inactive;
+#endif
 };
 
 /*
@@ -586,6 +592,7 @@ static int isolate_lru_pages(int nr_to_s
 			continue;
 		} else {
 			list_add(&page->lru, dst);
+			set_page_ckrmzone(page, NULL);
 			nr_taken++;
 		}
 	}
@@ -597,11 +604,23 @@ static int isolate_lru_pages(int nr_to_s
 /*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
+#ifdef CONFIG_CKRM_RES_MEM
+static void shrink_cache(struct ckrm_zone *ckrm_zone, struct scan_control *sc)
+#else
 static void shrink_cache(struct zone *zone, struct scan_control *sc)
+#endif
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
 	int max_scan = sc->nr_to_scan;
+#ifdef CONFIG_CKRM_RES_MEM
+	struct zone *zone = ckrm_zone->zone;
+	struct list_head *inactive_list = &ckrm_zone->inactive_list;
+	struct list_head *active_list = &ckrm_zone->active_list;
+#else
+	struct list_head *inactive_list = &zone->inactive_list;
+	struct list_head *active_list = &zone->active_list;
+#endif
 
 	pagevec_init(&pvec, 1);
 
@@ -614,9 +633,10 @@ static void shrink_cache(struct zone *zo
 		int nr_freed;
 
 		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-					     &zone->inactive_list,
+					     inactive_list,
 					     &page_list, &nr_scan);
 		zone->nr_inactive -= nr_taken;
+		ckrm_zone_sub_inactive(ckrm_zone, nr_taken);
 		zone->pages_scanned += nr_scan;
 		spin_unlock_irq(&zone->lru_lock);
 
@@ -643,10 +663,16 @@ static void shrink_cache(struct zone *zo
 			if (TestSetPageLRU(page))
 				BUG();
 			list_del(&page->lru);
-			if (PageActive(page))
-				add_page_to_active_list(zone, page);
-			else
-				add_page_to_inactive_list(zone, page);
+			if (PageActive(page)) {
+				ckrm_zone_add_active(ckrm_zone, 1);
+				zone->nr_active++;
+				list_add(&page->lru, active_list);
+			} else {
+				ckrm_zone_add_inactive(ckrm_zone, 1);
+				zone->nr_inactive++;
+				list_add(&page->lru, inactive_list);
+			}
+			set_page_ckrmzone(page, ckrm_zone);
 			if (!pagevec_add(&pvec, page)) {
 				spin_unlock_irq(&zone->lru_lock);
 				__pagevec_release(&pvec);
@@ -677,7 +703,11 @@ done:
  * But we had to alter page->flags anyway.
  */
 static void
+#ifdef CONFIG_CKRM_RES_MEM
+refill_inactive_zone(struct ckrm_zone *ckrm_zone, struct scan_control *sc)
+#else
 refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+#endif
 {
 	int pgmoved;
 	int pgdeactivate = 0;
@@ -692,13 +722,22 @@ refill_inactive_zone(struct zone *zone, 
 	long mapped_ratio;
 	long distress;
 	long swap_tendency;
+#ifdef CONFIG_CKRM_RES_MEM
+	struct zone *zone = ckrm_zone->zone;
+	struct list_head *active_list = &ckrm_zone->active_list;
+	struct list_head *inactive_list = &ckrm_zone->inactive_list;
+#else
+	struct list_head *active_list = &zone->active_list;
+	struct list_head *inactive_list = &zone->inactive_list;
+#endif
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
-	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
+	pgmoved = isolate_lru_pages(nr_pages, active_list,
 				    &l_hold, &pgscanned);
 	zone->pages_scanned += pgscanned;
 	zone->nr_active -= pgmoved;
+	ckrm_zone_sub_active(ckrm_zone, pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 
 	/*
@@ -757,10 +796,12 @@ refill_inactive_zone(struct zone *zone, 
 			BUG();
 		if (!TestClearPageActive(page))
 			BUG();
-		list_move(&page->lru, &zone->inactive_list);
+		list_move(&page->lru, inactive_list);
+		set_page_ckrmzone(page, ckrm_zone);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			zone->nr_inactive += pgmoved;
+			ckrm_zone_add_inactive(ckrm_zone, pgmoved);
 			spin_unlock_irq(&zone->lru_lock);
 			pgdeactivate += pgmoved;
 			pgmoved = 0;
@@ -771,6 +812,7 @@ refill_inactive_zone(struct zone *zone, 
 		}
 	}
 	zone->nr_inactive += pgmoved;
+	ckrm_zone_add_inactive(ckrm_zone, pgmoved);
 	pgdeactivate += pgmoved;
 	if (buffer_heads_over_limit) {
 		spin_unlock_irq(&zone->lru_lock);
@@ -785,10 +827,12 @@ refill_inactive_zone(struct zone *zone, 
 		if (TestSetPageLRU(page))
 			BUG();
 		BUG_ON(!PageActive(page));
-		list_move(&page->lru, &zone->active_list);
+		list_move(&page->lru, active_list);
+		set_page_ckrmzone(page, ckrm_zone);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			zone->nr_active += pgmoved;
+			ckrm_zone_add_active(ckrm_zone, pgmoved);
 			pgmoved = 0;
 			spin_unlock_irq(&zone->lru_lock);
 			__pagevec_release(&pvec);
@@ -796,6 +840,7 @@ refill_inactive_zone(struct zone *zone, 
 		}
 	}
 	zone->nr_active += pgmoved;
+	ckrm_zone_add_active(ckrm_zone, pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 	pagevec_release(&pvec);
 
@@ -803,6 +848,29 @@ refill_inactive_zone(struct zone *zone, 
 	mod_page_state(pgdeactivate, pgdeactivate);
 }
 
+#ifdef CONFIG_CKRM_RES_MEM
+static void
+shrink_ckrmzone(struct ckrm_zone *czone, struct scan_control *sc)
+{
+	while (sc->ckrm_active || sc->ckrm_inactive) {
+		if (sc->ckrm_active) {
+			sc->nr_to_scan = (unsigned long)min(sc->ckrm_active,
+					SWAP_CLUSTER_MAX);
+			sc->ckrm_active -= sc->nr_to_scan;
+			refill_inactive_zone(czone, sc);
+		}
+		if (sc->ckrm_inactive) {
+			sc->nr_to_scan = (unsigned long)min(sc->ckrm_inactive,
+					SWAP_CLUSTER_MAX);
+			sc->ckrm_inactive -= sc->nr_to_scan;
+			shrink_cache(czone, sc);
+			if (sc->nr_to_reclaim <= 0)
+				break;
+		}
+	}
+}
+#endif
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -832,6 +900,20 @@ shrink_zone(struct zone *zone, struct sc
 
 	sc->nr_to_reclaim = sc->swap_cluster_max;
 
+#ifdef CONFIG_CKRM_RES_MEM
+	while (nr_active || nr_inactive) {
+		int zindex = zone_idx(zone);
+		struct ckrm_zone *czone;
+
+		czone = ckrm_get_max_overguar_czone(zindex);
+		sc->ckrm_active = min(nr_active, czone->active_over); 
+		sc->ckrm_inactive = min(nr_inactive, czone->inactive_over);
+		nr_active -= sc->ckrm_active;
+		nr_inactive -= sc->ckrm_inactive;
+		shrink_ckrmzone(czone, sc);
+		kref_put(&czone->memcls->nr_users, memclass_release);
+	}
+#else 
 	while (nr_active || nr_inactive) {
 		if (nr_active) {
 			sc->nr_to_scan = min(nr_active,
@@ -849,6 +931,7 @@ shrink_zone(struct zone *zone, struct sc
 				break;
 		}
 	}
+#endif
 
 	throttle_vm_writeout();
 }
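
For readers following the new shrink_zone() path, here is a minimal user-space
sketch (not part of the patch; the actual reclaim work is faked) of how
shrink_ckrmzone() drains the per-class active/inactive scan targets in
SWAP_CLUSTER_MAX batches until the reclaim goal is met:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

static void shrink_sketch(int active, int inactive, int to_reclaim)
{
	while (active || inactive) {
		if (active) {
			int n = active < SWAP_CLUSTER_MAX ? active : SWAP_CLUSTER_MAX;

			active -= n;
			printf("deactivate %d active pages\n", n);
		}
		if (inactive) {
			int n = inactive < SWAP_CLUSTER_MAX ? inactive : SWAP_CLUSTER_MAX;

			inactive -= n;
			to_reclaim -= n / 2;	/* pretend half of each batch is freed */
			printf("scan %d inactive pages\n", n);
			if (to_reclaim <= 0)
				break;		/* reclaimed enough, stop early */
		}
	}
}

int main(void)
{
	/* hypothetical scan targets handed out by shrink_zone() */
	shrink_sketch(80, 120, 48);
	return 0;
}
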
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: aart@kvack.org

^ permalink raw reply	[flat|nested] 10+ messages in thread
* [PATCH 4/6] CKRM: Add guarantee support for mem controller
@ 2005-04-02  3:14 Chandra Seetharaman
  0 siblings, 0 replies; 10+ messages in thread
From: Chandra Seetharaman @ 2005-04-02  3:14 UTC (permalink / raw)
  To: ckrm-tech, linux-mm

[-- Attachment #1: Type: text/plain, Size: 287 bytes --]


-- 

----------------------------------------------------------------------
    Chandra Seetharaman               | Be careful what you choose....
              - sekharan@us.ibm.com   |      .......you may get it.
----------------------------------------------------------------------

[-- Attachment #2: 11-04-mem_limit-guar --]
[-- Type: text/plain, Size: 34604 bytes --]

Patch 4 of 6 patches to support memory controller under CKRM framework.
This patch provides the guarantee support for the controller.

 include/linux/ckrm_mem.h        |   31 ++++
 include/linux/ckrm_mem_inline.h |  244 ++++++++++++++++++++++++--------
 include/linux/mm.h              |    2 
 include/linux/mm_inline.h       |   10 +
 include/linux/mmzone.h          |    2 
 kernel/ckrm/ckrm_memcore.c      |   29 +++
 kernel/ckrm/ckrm_memctlr.c      |  299 +++++++++++++++++++++++++++++++++++++---
 mm/page_alloc.c                 |    3 
 mm/swap.c                       |    3 
 mm/vmscan.c                     |  114 +++++++++++++--
 10 files changed, 631 insertions(+), 106 deletions(-)

Index: linux-2.6.12-rc1/include/linux/ckrm_mem.h
===================================================================
--- linux-2.6.12-rc1.orig/include/linux/ckrm_mem.h
+++ linux-2.6.12-rc1/include/linux/ckrm_mem.h
@@ -26,6 +26,27 @@
 #include <linux/mmzone.h>
 #include <linux/ckrm_rc.h>
 
+struct ckrm_zone {
+	struct list_head active_list;
+	struct list_head inactive_list;
+
+	unsigned long nr_active;
+	unsigned long nr_inactive;
+	unsigned long active_over;
+	unsigned long inactive_over;
+
+	unsigned long shrink_active;
+	unsigned long shrink_inactive;
+	long shrink_weight;
+	unsigned long shrink_flag;
+	struct list_head victim_list;	/* list of ckrm_zones chosen for
+					 * shrinking. These are over their
+					 * 'guarantee'
+					 */
+	struct zone *zone;
+	struct ckrm_mem_res *memcls;
+};
+
 struct ckrm_mem_res {
 	unsigned long flags;
 	struct ckrm_core_class *core;	/* the core i am part of... */
@@ -46,11 +67,19 @@ struct ckrm_mem_res {
 	int hier;			/* hiearchy level, root = 0 */
 	int impl_guar;			/* for classes with don't care guar */
 	int nr_dontcare;		/* # of dont care children */
+
+	struct ckrm_zone ckrm_zone[MAX_NR_ZONES];
 };
 
+#define CLS_SHRINK_BIT		(1)
+
 extern atomic_t ckrm_mem_real_count;
 extern struct ckrm_res_ctlr mem_rcbs;
 extern struct ckrm_mem_res *ckrm_mem_root_class;
+extern struct list_head ckrm_memclass_list;
+extern spinlock_t ckrm_mem_lock;
+extern int ckrm_nr_mem_classes;
+extern unsigned int ckrm_tot_lru_pages;
 
 extern void ckrm_mem_migrate_mm(struct mm_struct *, struct ckrm_mem_res *);
 extern void ckrm_mem_migrate_all_pages(struct ckrm_mem_res *,
@@ -60,6 +89,8 @@ extern void incr_use_count(struct ckrm_m
 extern void decr_use_count(struct ckrm_mem_res *, int, int);
 extern int ckrm_class_limit_ok(struct ckrm_mem_res *);
 
+extern void shrink_get_victims(struct zone *, unsigned long ,
+				unsigned long, struct list_head *);
 #else
 
 #define ckrm_mem_migrate_mm(a, b)			do {} while (0)
Index: linux-2.6.12-rc1/include/linux/ckrm_mem_inline.h
===================================================================
--- linux-2.6.12-rc1.orig/include/linux/ckrm_mem_inline.h
+++ linux-2.6.12-rc1/include/linux/ckrm_mem_inline.h
@@ -34,16 +34,75 @@ ckrm_get_mem_class(struct task_struct *t
 }
 
 static inline void
+ckrm_set_shrink(struct ckrm_zone *cz)
+{
+	set_bit(CLS_SHRINK_BIT, &cz->shrink_flag);
+}
+
+static inline int
+ckrm_test_set_shrink(struct ckrm_zone *cz)
+{
+	return test_and_set_bit(CLS_SHRINK_BIT, &cz->shrink_flag);
+}
+
+static inline void 
+ckrm_clear_shrink(struct ckrm_zone *cz)
+{
+	clear_bit(CLS_SHRINK_BIT, &cz->shrink_flag);
+}
+
+static inline void
+set_page_ckrmzone( struct page *page, struct ckrm_zone *cz)
+{
+	page->ckrm_zone = cz;
+}
+
+static inline struct ckrm_zone *
+page_ckrmzone(struct page *page)
+{
+	return page->ckrm_zone;
+}
+
+/*
+ * Currently, a shared page that is shared by multiple classes is charged
+ * to a class with max available guarantee. Simply replace this function
+ * for other policies.
+ */
+static inline int
+ckrm_mem_share_compare(struct ckrm_mem_res *a, struct ckrm_mem_res *b)
+{
+	if (a == NULL)
+		return -(b != NULL);
+	if (b == NULL)
+		return 1;
+	if (a->pg_guar == b->pg_guar)
+		return 0;
+	if (a->pg_guar == CKRM_SHARE_DONTCARE)
+		return 1;
+	if (b->pg_guar == CKRM_SHARE_DONTCARE)
+		return -1;
+	return (a->pg_unused - b->pg_unused);
+}
+
+static inline void
 ckrm_set_page_class(struct page *page, struct ckrm_mem_res *cls)
 {
+	struct ckrm_zone *new_czone, *old_czone;
+
 	if (!cls) {
-		if (!ckrm_mem_root_class)
+		if (!ckrm_mem_root_class) {
+			set_page_ckrmzone(page, NULL);
 			return;
+		}
 		cls = ckrm_mem_root_class;
 	}
-	if (page->ckrm_class)
-		kref_put(&page->ckrm_class->nr_users, memclass_release);
-	page->ckrm_class = cls;
+	new_czone = &cls->ckrm_zone[page_zonenum(page)];
+	old_czone = page_ckrmzone(page);
+	
+	if (old_czone)
+		kref_put(&old_czone->memcls->nr_users, memclass_release);
+
+	set_page_ckrmzone(page, new_czone);
 	kref_get(&cls->nr_users);
 	incr_use_count(cls, 0, page_zonenum(page));
 	SetPageCkrmAccount(page);
@@ -52,7 +111,8 @@ ckrm_set_page_class(struct page *page, s
 static inline void
 ckrm_change_page_class(struct page *page, struct ckrm_mem_res *newcls)
 {
-	struct ckrm_mem_res *oldcls = page->ckrm_class;
+	struct ckrm_zone *old_czone = page_ckrmzone(page), *new_czone;
+	struct ckrm_mem_res *oldcls;
 	int zindex = page_zonenum(page);
 
 	if  (!newcls) {
@@ -61,6 +121,7 @@ ckrm_change_page_class(struct page *page
 		newcls = ckrm_mem_root_class;
 	}
 
+	oldcls = old_czone->memcls;
 	if (oldcls == newcls)
 		return;
 
@@ -69,20 +130,35 @@ ckrm_change_page_class(struct page *page
 		decr_use_count(oldcls, 0, zindex);
 	}
 
-	page->ckrm_class = newcls;
+	new_czone = &newcls->ckrm_zone[page_zonenum(page)];
+	set_page_ckrmzone(page, new_czone);
 	kref_get(&newcls->nr_users);
 	incr_use_count(newcls, 0, zindex);
+
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		old_czone->nr_active--;
+		new_czone->nr_active++;
+		list_add(&page->lru, &new_czone->active_list);
+	} else {
+		old_czone->nr_inactive--;
+		new_czone->nr_inactive++;
+		list_add(&page->lru, &new_czone->inactive_list);
+	}
 }
 
 static inline void
 ckrm_clear_page_class(struct page *page)
 {
-	struct ckrm_mem_res *cls = page->ckrm_class;
-	if (cls && PageCkrmAccount(page)) {
-		decr_use_count(cls, 0, page_zonenum(page));
-		ClearPageCkrmAccount(page);
-		kref_put(&cls->nr_users, memclass_release);
-	}
+	struct ckrm_zone *czone = page_ckrmzone(page);
+	if (czone != NULL) {
+		if (PageCkrmAccount(page)) {
+			decr_use_count(czone->memcls, 0, page_zonenum(page));
+			ClearPageCkrmAccount(page);
+		}
+		kref_put(&czone->memcls->nr_users, memclass_release);
+		set_page_ckrmzone(page, NULL);
+  	}
 }
 
 static inline void
@@ -91,17 +167,27 @@ ckrm_mem_inc_active(struct page *page)
 	struct ckrm_mem_res *cls = ckrm_get_mem_class(current)
 						?: ckrm_mem_root_class;
 
-	if (!cls)
-		return;
-	ckrm_set_page_class(page, cls);
+	struct ckrm_zone *czone;
+  
+	if (cls == NULL)
+  		return;
+
+  	ckrm_set_page_class(page, cls);
+	czone = page_ckrmzone(page);
+	czone->nr_active++;
+	list_add(&page->lru, &czone->active_list);
 }
 
 static inline void
 ckrm_mem_dec_active(struct page *page)
 {
-	if (page->ckrm_class == NULL)
-		return;
-	ckrm_clear_page_class(page);
+	struct ckrm_zone *czone = page_ckrmzone(page);
+	if (czone == NULL)
+  		return;
+
+	list_del(&page->lru);
+	czone->nr_active--;
+  	ckrm_clear_page_class(page);
 }
 
 static inline void
@@ -109,25 +195,58 @@ ckrm_mem_inc_inactive(struct page *page)
 {
 	struct ckrm_mem_res *cls = ckrm_get_mem_class(current)
 					?: ckrm_mem_root_class;
-
-	if (!cls)
-		return;
-	ckrm_set_page_class(page, cls);
+	struct ckrm_zone *czone;
+  
+	if (cls == NULL)
+  		return;
+
+  	ckrm_set_page_class(page, cls);
+	czone = page_ckrmzone(page);
+	czone->nr_inactive++;
+	list_add(&page->lru, &czone->inactive_list);
 }
 
 static inline void
 ckrm_mem_dec_inactive(struct page *page)
 {
-	if (!page->ckrm_class)
-		return;
-	ckrm_clear_page_class(page);
+	struct ckrm_zone *czone = page_ckrmzone(page);
+	if (czone == NULL)
+  		return;
+
+	czone->nr_inactive--;
+	list_del(&page->lru);
+  	ckrm_clear_page_class(page);
 }
 
 static inline void
 ckrm_page_init(struct page *page)
 {
 	page->flags &= ~(1 << PG_ckrm_account);
-	page->ckrm_class = NULL;
+	set_page_ckrmzone(page, NULL);
+}
+
+static inline void
+ckrm_zone_add_active(struct ckrm_zone *czone, int cnt)
+{
+	czone->nr_active += cnt;
+}
+
+static inline void
+ckrm_zone_add_inactive(struct ckrm_zone *czone, int cnt)
+{
+	czone->nr_inactive += cnt;
+}
+
+static inline void
+ckrm_zone_sub_active(struct ckrm_zone *czone, int cnt)
+{
+	czone->nr_active -= cnt;
+}
+
+static inline void
+ckrm_zone_sub_inactive(struct ckrm_zone *czone, int cnt)
+{
+	czone->nr_inactive -= cnt;
 }
 
 
@@ -202,28 +321,15 @@ ckrm_mm_clearclass(struct mm_struct *mm)
 	}
 }
 
-#else
-
-static inline void
-ckrm_task_mm_init(struct task_struct *tsk)
-{
-}
+static inline void ckrm_init_lists(struct zone *zone) 			{}
 
-static inline void
-ckrm_task_mm_set(struct mm_struct * mm, struct task_struct *task)
+static inline void ckrm_add_tail_inactive(struct page *page)
 {
+	 struct ckrm_zone *ckrm_zone = page_ckrmzone(page);
+	 list_add_tail(&page->lru, &ckrm_zone->inactive_list);
 }
 
-static inline void
-ckrm_task_mm_change(struct task_struct *tsk,
-		struct mm_struct *oldmm, struct mm_struct *newmm)
-{
-}
-
-static inline void
-ckrm_task_mm_clear(struct task_struct *tsk, struct mm_struct *mm)
-{
-}
+#else
 
 static inline void *
 ckrm_get_memclass(struct task_struct *tsk)
@@ -231,27 +337,47 @@ ckrm_get_memclass(struct task_struct *ts
 	return NULL;
 }
 
-static inline void
-ckrm_mm_init(struct mm_struct *mm)
-{
-}
+static inline void ckrm_clear_page_class(struct page *p)		{}
+
+static inline void ckrm_mem_inc_active(struct page *p)			{}
+static inline void ckrm_mem_dec_active(struct page *p)			{}
+static inline void ckrm_mem_inc_inactive(struct page *p)		{}
+static inline void ckrm_mem_dec_inactive(struct page *p)		{}
+
+#define ckrm_zone_add_active(a, b)	do {} while (0)
+#define ckrm_zone_add_inactive(a, b)	do {} while (0)
+#define ckrm_zone_sub_active(a, b)	do {} while (0)
+#define ckrm_zone_sub_inactive(a, b)	do {} while (0)
+
+#define ckrm_class_limit_ok(a)						(1)
+
+static inline void ckrm_page_init(struct page *p)			{}
+static inline void ckrm_task_mm_init(struct task_struct *tsk)		{}
+static inline void ckrm_task_mm_set(struct mm_struct * mm,
+					struct task_struct *task)	{}
+static inline void ckrm_task_mm_change(struct task_struct *tsk,
+		struct mm_struct *oldmm, struct mm_struct *newmm)	{}
+static inline void ckrm_task_mm_clear(struct task_struct *tsk,
+						struct mm_struct *mm)	{}
+
+static inline void ckrm_mm_init(struct mm_struct *mm)			{}
 
 /* using #define instead of static inline as the prototype requires   *
  * data structures that is available only with the controller enabled */
-#define ckrm_mm_setclass(a, b) do { } while(0)
-#define ckrm_class_limit_ok(a)	(1)
+#define ckrm_mm_setclass(a, b) 					do {} while(0)
 
-static inline void
-ckrm_mm_clearclass(struct mm_struct *mm)
+static inline void ckrm_mm_clearclass(struct mm_struct *mm)		{}
+
+static inline void ckrm_init_lists(struct zone *zone)
 {
+	INIT_LIST_HEAD(&zone->active_list);
+	INIT_LIST_HEAD(&zone->inactive_list);
 }
 
-static inline void ckrm_mem_inc_active(struct page *p)		{}
-static inline void ckrm_mem_dec_active(struct page *p)		{}
-static inline void ckrm_mem_inc_inactive(struct page *p)	{}
-static inline void ckrm_mem_dec_inactive(struct page *p)	{}
-static inline void ckrm_page_init(struct page *p)		{}
-static inline void ckrm_clear_page_class(struct page *p)	{}
-
+static inline void ckrm_add_tail_inactive(struct page *page)
+{
+	 struct zone *zone = page_zone(page);
+	 list_add_tail(&page->lru, &zone->inactive_list);
+}
 #endif 
 #endif /* _LINUX_CKRM_MEM_INLINE_H_ */
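
The ckrm_mem_share_compare() helper introduced above encodes the sharing
policy described in its comment: a shared page is charged to the class with
the most available guarantee.  A small user-space sketch of that ordering
(illustrative types, names and numbers, not the kernel structures) might look
like:

#include <stdio.h>

#define DONTCARE (-1)

struct cls_sample {
	const char *name;
	int pg_guar;		/* DONTCARE or a page count */
	int pg_unused;		/* unused portion of the guarantee */
};

/* mirror of the comparator: DONTCARE ranks above any explicit guarantee,
 * otherwise the class with more unused guarantee ranks higher */
static int share_compare(const struct cls_sample *a, const struct cls_sample *b)
{
	if (!a)
		return b ? -1 : 0;
	if (!b)
		return 1;
	if (a->pg_guar == b->pg_guar)
		return 0;
	if (a->pg_guar == DONTCARE)
		return 1;
	if (b->pg_guar == DONTCARE)
		return -1;
	return a->pg_unused - b->pg_unused;
}

int main(void)
{
	struct cls_sample classes[] = {
		{ "web",   20000, 5000 },
		{ "batch", 30000, 9000 },
		{ "misc",  DONTCARE, 0 },
	};
	const struct cls_sample *max = &classes[0];
	int i;

	for (i = 1; i < 3; i++)
		if (share_compare(max, &classes[i]) < 0)
			max = &classes[i];

	printf("shared page charged to: %s\n", max->name);
	return 0;
}
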
Index: linux-2.6.12-rc1/include/linux/mm.h
===================================================================
--- linux-2.6.12-rc1.orig/include/linux/mm.h
+++ linux-2.6.12-rc1/include/linux/mm.h
@@ -262,7 +262,7 @@ struct page {
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
 #ifdef CONFIG_CKRM_RES_MEM
-	struct ckrm_mem_res *ckrm_class;
+	struct ckrm_zone *ckrm_zone;
 #endif
 };
 
Index: linux-2.6.12-rc1/include/linux/mm_inline.h
===================================================================
--- linux-2.6.12-rc1.orig/include/linux/mm_inline.h
+++ linux-2.6.12-rc1/include/linux/mm_inline.h
@@ -3,7 +3,9 @@
 static inline void
 add_page_to_active_list(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_add(&page->lru, &zone->active_list);
+#endif
 	zone->nr_active++;
 	ckrm_mem_inc_active(page);
 }
@@ -11,7 +13,9 @@ add_page_to_active_list(struct zone *zon
 static inline void
 add_page_to_inactive_list(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_add(&page->lru, &zone->inactive_list);
+#endif
 	zone->nr_inactive++;
 	ckrm_mem_inc_inactive(page);
 }
@@ -19,7 +23,9 @@ add_page_to_inactive_list(struct zone *z
 static inline void
 del_page_from_active_list(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_del(&page->lru);
+#endif
 	zone->nr_active--;
 	ckrm_mem_dec_active(page);
 }
@@ -27,7 +33,9 @@ del_page_from_active_list(struct zone *z
 static inline void
 del_page_from_inactive_list(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_del(&page->lru);
+#endif
 	zone->nr_inactive--;
 	ckrm_mem_dec_inactive(page);
 }
@@ -35,7 +43,9 @@ del_page_from_inactive_list(struct zone 
 static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
+#ifndef CONFIG_CKRM_RES_MEM
 	list_del(&page->lru);
+#endif
 	if (PageActive(page)) {
 		ClearPageActive(page);
 		zone->nr_active--;
Index: linux-2.6.12-rc1/include/linux/mmzone.h
===================================================================
--- linux-2.6.12-rc1.orig/include/linux/mmzone.h
+++ linux-2.6.12-rc1/include/linux/mmzone.h
@@ -135,8 +135,10 @@ struct zone {
 
 	/* Fields commonly accessed by the page reclaim scanner */
 	spinlock_t		lru_lock;	
+#ifndef CONFIG_CKRM_RES_MEM
 	struct list_head	active_list;
 	struct list_head	inactive_list;
+#endif
 	unsigned long		nr_scan_active;
 	unsigned long		nr_scan_inactive;
 	unsigned long		nr_active;
Index: linux-2.6.12-rc1/kernel/ckrm/ckrm_memcore.c
===================================================================
--- linux-2.6.12-rc1.orig/kernel/ckrm/ckrm_memcore.c
+++ linux-2.6.12-rc1/kernel/ckrm/ckrm_memcore.c
@@ -38,14 +38,17 @@
 #define CKRM_MEM_MAX_HIERARCHY 2 /* allows only upto 2 levels - 0, 1 & 2 */
 
 /* all 1-level memory_share_class are chained together */
-static LIST_HEAD(ckrm_memclass_list);
-static spinlock_t ckrm_mem_lock; /* protects list above */
-static unsigned int ckrm_tot_lru_pages; /* # of pages in the system */
-
-static int ckrm_nr_mem_classes = 0;
-
+LIST_HEAD(ckrm_memclass_list);
+spinlock_t ckrm_mem_lock; /* protects list above */
+unsigned int ckrm_tot_lru_pages; /* # of pages in the system */
+int ckrm_nr_mem_classes = 0;
 struct ckrm_mem_res *ckrm_mem_root_class;
 atomic_t ckrm_mem_real_count = ATOMIC_INIT(0);
+
+EXPORT_SYMBOL_GPL(ckrm_memclass_list);
+EXPORT_SYMBOL_GPL(ckrm_mem_lock);
+EXPORT_SYMBOL_GPL(ckrm_tot_lru_pages);
+EXPORT_SYMBOL_GPL(ckrm_nr_mem_classes);
 EXPORT_SYMBOL_GPL(ckrm_mem_root_class);
 EXPORT_SYMBOL_GPL(ckrm_mem_real_count);
 
@@ -80,6 +83,9 @@ set_ckrm_tot_pages(void)
 static void
 mem_res_initcls_one(struct ckrm_mem_res *res)
 {
+	int zindex = 0;
+	struct zone *zone;
+
 	memset(res, 0, sizeof(struct ckrm_mem_res));
 
 	res->shares.my_guarantee     = CKRM_SHARE_DONTCARE;
@@ -94,6 +100,17 @@ mem_res_initcls_one(struct ckrm_mem_res 
 
 	INIT_LIST_HEAD(&res->mcls_list);
 
+	for_each_zone(zone) {
+		INIT_LIST_HEAD(&res->ckrm_zone[zindex].active_list);
+		INIT_LIST_HEAD(&res->ckrm_zone[zindex].inactive_list);
+		INIT_LIST_HEAD(&res->ckrm_zone[zindex].victim_list);
+		res->ckrm_zone[zindex].nr_active = 0;
+		res->ckrm_zone[zindex].nr_inactive = 0;
+		res->ckrm_zone[zindex].zone = zone;
+		res->ckrm_zone[zindex].memcls = res;
+		zindex++;
+	}
+
 	res->pg_unused = 0;
 	res->nr_dontcare = 1; /* for default class */
 	kref_init(&res->nr_users);
Index: linux-2.6.12-rc1/kernel/ckrm/ckrm_memctlr.c
===================================================================
--- linux-2.6.12-rc1.orig/kernel/ckrm/ckrm_memctlr.c
+++ linux-2.6.12-rc1/kernel/ckrm/ckrm_memctlr.c
@@ -14,6 +14,8 @@
  *
  */
 
+#include <linux/swap.h>
+#include <linux/pagemap.h>
 #include <linux/ckrm_mem_inline.h>
 
 void
@@ -82,8 +84,88 @@ ckrm_class_limit_ok(struct ckrm_mem_res 
 	return ret;
 }
 
+static int
+ckrm_mem_evaluate_page_anon(struct page* page)
+{
+	struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
+	struct ckrm_mem_res* maxshareclass = NULL;
+	struct anon_vma *anon_vma = (struct anon_vma *) page->mapping;
+	struct vm_area_struct *vma;
+	struct mm_struct* mm;
+	int ret = 0;
+
+	if (!spin_trylock(&anon_vma->lock))
+		return 0;
+	BUG_ON(list_empty(&anon_vma->head));
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+		mm = vma->vm_mm;
+		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
+				mm->memclass) < 0) {
+			maxshareclass = mm->memclass;
+		}
+	}
+	spin_unlock(&anon_vma->lock);
+
+	if (!maxshareclass)
+		maxshareclass = ckrm_mem_root_class;
+	if (pgcls != maxshareclass) {
+		ckrm_change_page_class(page, maxshareclass);
+		ret = 1;
+	}
+	return ret;
+}
+
+static int
+ckrm_mem_evaluate_page_file(struct page* page)
+{
+	struct ckrm_mem_res* pgcls = page_ckrmzone(page)->memcls;
+	struct ckrm_mem_res* maxshareclass = NULL;
+	struct address_space *mapping = page->mapping;
+	struct vm_area_struct *vma = NULL;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct prio_tree_iter iter;
+	struct mm_struct* mm;
+	int ret = 0;
+
+	if (!mapping)
+		return 0;
+
+	if (!spin_trylock(&mapping->i_mmap_lock))
+		return 0;
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap,
+					pgoff, pgoff) {
+		mm = vma->vm_mm;
+		if (!maxshareclass || ckrm_mem_share_compare(maxshareclass,
+				mm->memclass)<0)
+			maxshareclass = mm->memclass;
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+
+	if (!maxshareclass)
+		maxshareclass = ckrm_mem_root_class;
+	if (pgcls != maxshareclass) {
+		ckrm_change_page_class(page, maxshareclass);
+		ret = 1;
+	}
+	return ret;
+}
+
+static int
+ckrm_mem_evaluate_page(struct page* page)
+{
+	int ret = 0;
+	if (page->mapping) {
+		if (PageAnon(page))
+			ret = ckrm_mem_evaluate_page_anon(page);
+		else
+			ret = ckrm_mem_evaluate_page_file(page);
+	}
+	return ret;
+}
+
 static void migrate_list(struct list_head *list,
-	struct ckrm_mem_res* from, struct ckrm_mem_res* to)
+	struct ckrm_mem_res* from, struct ckrm_mem_res* def)
 {
 	struct page *page;
 	struct list_head *pos, *next;
@@ -92,21 +174,26 @@ static void migrate_list(struct list_hea
 	while (pos != list) {
 		next = pos->next;
 		page = list_entry(pos, struct page, lru);
-		if (page->ckrm_class == from) 
-			ckrm_change_page_class(page, to);
+		if (ckrm_mem_evaluate_page(page))
+			ckrm_change_page_class(page, def);
 		pos = next;
 	}
 }
 
 void
-ckrm_mem_migrate_all_pages(struct ckrm_mem_res* from, struct ckrm_mem_res* to)
+ckrm_mem_migrate_all_pages(struct ckrm_mem_res* from,
+					struct ckrm_mem_res* def)
 {
+	int i;
 	struct zone *zone;
-
-	for_each_zone(zone) {
+	struct ckrm_zone *ckrm_zone;
+  
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		ckrm_zone = &from->ckrm_zone[i];
+		zone = ckrm_zone->zone;
 		spin_lock_irq(&zone->lru_lock);
-		migrate_list(&zone->inactive_list, from, to);
-		migrate_list(&zone->active_list, from, to);
+		migrate_list(&ckrm_zone->inactive_list, from, def);
+		migrate_list(&ckrm_zone->active_list, from, def);
 		spin_unlock_irq(&zone->lru_lock);
 	}
 	return;
@@ -131,8 +218,13 @@ class_migrate_pmd(struct mm_struct* mm, 
 		pte = pte_offset_map(pmdir, address);
 		if (pte_present(*pte)) {
 			struct page *page = pte_page(*pte);
-			if (page->mapping)
+			struct ckrm_zone *czone = page_ckrmzone(page);
+			if (page->mapping && czone) {
+				struct zone *zone = czone->zone;
+				spin_lock_irq(&zone->lru_lock);
 				ckrm_change_page_class(page, mm->memclass);
+				spin_unlock_irq(&zone->lru_lock);
+			}
 		}
 		address += PAGE_SIZE;
 		pte_unmap(pte);
@@ -190,7 +282,9 @@ class_migrate_vma(struct mm_struct* mm, 
 void
 ckrm_mem_migrate_mm(struct mm_struct* mm, struct ckrm_mem_res *def)
 {
+	struct task_struct *task;
 	struct vm_area_struct *vma;
+	struct ckrm_mem_res *maxshareclass = def;
 
 	/* We leave the mm->memclass untouched since we believe that one
 	 * mm with no task associated will be deleted soon or attach
@@ -199,18 +293,177 @@ ckrm_mem_migrate_mm(struct mm_struct* mm
 	if (list_empty(&mm->tasklist))
 		return;
 
-	if (mm->memclass)
-		kref_put(&mm->memclass->nr_users, memclass_release);
-	mm->memclass = def ?: ckrm_mem_root_class;
-	kref_get(&mm->memclass->nr_users);
-
-	/* Go through all VMA to migrate pages */
-	down_read(&mm->mmap_sem);
-	vma = mm->mmap;
-	while(vma) {
-		class_migrate_vma(mm, vma);
-		vma = vma->vm_next;
+	list_for_each_entry(task, &mm->tasklist, mm_peers) {
+		struct ckrm_mem_res* cls = ckrm_get_mem_class(task);
+		if (!cls)
+			continue;
+		if (!maxshareclass ||
+				ckrm_mem_share_compare(maxshareclass,cls)<0 )
+			maxshareclass = cls;
+	}
+
+	if (maxshareclass && (mm->memclass != maxshareclass)) {
+		if (mm->memclass) {
+			kref_put(&mm->memclass->nr_users, memclass_release);
+		}
+		mm->memclass = maxshareclass;
+		kref_get(&maxshareclass->nr_users);
+
+		/* Go through all VMA to migrate pages */
+		down_read(&mm->mmap_sem);
+		vma = mm->mmap;
+		while(vma) {
+			class_migrate_vma(mm, vma);
+			vma = vma->vm_next;
+		}
+		up_read(&mm->mmap_sem);
 	}
-	up_read(&mm->mmap_sem);
 	return;
 }
+
+static int
+shrink_weight(struct ckrm_zone *czone)
+{
+	u64 temp;
+	struct zone *zone = czone->zone;
+	struct ckrm_mem_res *cls = czone->memcls;
+	int zone_usage, zone_guar, zone_total, guar, ret, cnt;
+
+	zone_usage = czone->nr_active + czone->nr_inactive;
+	czone->active_over = czone->inactive_over = 0;
+
+	if (zone_usage < SWAP_CLUSTER_MAX * 4)
+		return 0;
+
+	if (cls->pg_guar == CKRM_SHARE_DONTCARE)
+		/* no guarantee for this class. use implicit guarantee */
+		guar = cls->impl_guar / cls->nr_dontcare;
+	else
+		guar = cls->pg_unused / cls->nr_dontcare;
+	zone_total = zone->nr_active + zone->nr_inactive + zone->free_pages;
+	temp = (u64) guar * zone_total;
+	do_div(temp, ckrm_tot_lru_pages);
+	zone_guar = (int) temp;
+
+	ret = ((zone_usage - zone_guar) > SWAP_CLUSTER_MAX) ?
+				(zone_usage - zone_guar) : 0;
+	if (ret) {
+		cnt = czone->nr_active - (2 * zone_guar / 3);
+		if (cnt > 0)
+			czone->active_over = cnt;
+		cnt = czone->active_over + czone->nr_inactive
+					- zone_guar / 3;
+		if (cnt > 0)
+			czone->inactive_over = cnt;
+	}
+	return ret;
+}
+
+/* insert an entry into the list, keeping it sorted in descending order */
+static void
+list_add_sort(struct list_head *entry, struct list_head *head)
+{
+	struct ckrm_zone *czone, *new =
+			list_entry(entry, struct ckrm_zone, victim_list);
+	struct list_head* pos = head->next;
+
+	while (pos != head) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+		if (new->shrink_weight > czone->shrink_weight) {
+			__list_add(entry, pos->prev, pos);
+			return;
+		}
+		pos = pos->next;
+  	}
+	list_add_tail(entry, head);
+	return;	
+}
+
+static void
+shrink_choose_victims(struct list_head *victims,
+		unsigned long nr_active, unsigned long nr_inactive)
+{
+	unsigned long nr;
+	struct ckrm_zone* czone;
+	struct list_head *pos, *next;
+  
+	pos = victims->next;
+	while ((pos != victims) && (nr_active || nr_inactive)) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+		
+		if (nr_active && czone->active_over) {
+			nr = min(nr_active, czone->active_over);
+			czone->shrink_active += nr;
+			czone->active_over -= nr;
+			nr_active -= nr;
+		}
+
+		if (nr_inactive && czone->inactive_over) {
+			nr = min(nr_inactive, czone->inactive_over);
+			czone->shrink_inactive += nr;
+			czone->inactive_over -= nr;
+			nr_inactive -= nr;
+		}
+		pos = pos->next;
+  	}
+
+	pos = victims->next;
+	while (pos != victims) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+		next = pos->next;
+		if (czone->shrink_active == 0 && czone->shrink_inactive == 0) {
+			list_del_init(pos);
+			ckrm_clear_shrink(czone);
+		}
+		pos = next;
+	}	
+  	return;
+  }
+
+void
+shrink_get_victims(struct zone *zone, unsigned long nr_active,
+		unsigned long nr_inactive, struct list_head *victims)
+{
+	struct list_head *pos;
+	struct ckrm_mem_res *cls;
+	struct ckrm_zone *czone;
+	int zoneindex = zone_idx(zone);
+	
+	if (ckrm_nr_mem_classes <= 1) {
+		if (ckrm_mem_root_class) {
+			czone = ckrm_mem_root_class->ckrm_zone + zoneindex;
+			if (!ckrm_test_set_shrink(czone)) {
+				list_add(&czone->victim_list, victims);
+				czone->shrink_active = nr_active;
+				czone->shrink_inactive = nr_inactive;
+			}
+		}
+		return;
+	}
+	spin_lock_irq(&ckrm_mem_lock);
+	list_for_each_entry(cls, &ckrm_memclass_list, mcls_list) {
+		czone = cls->ckrm_zone + zoneindex;
+		if (ckrm_test_set_shrink(czone))
+			continue;
+
+		czone->shrink_active = 0;
+		czone->shrink_inactive = 0;
+		czone->shrink_weight = shrink_weight(czone);
+		if (czone->shrink_weight)
+			list_add_sort(&czone->victim_list, victims);
+		else
+			ckrm_clear_shrink(czone);
+	}
+	pos = victims->next;
+	while (pos != victims) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+		pos = pos->next;
+	}
+	shrink_choose_victims(victims, nr_active, nr_inactive);
+	spin_unlock_irq(&ckrm_mem_lock);
+	pos = victims->next;
+	while (pos != victims) {
+		czone = list_entry(pos, struct ckrm_zone, victim_list);
+		pos = pos->next;
+	}
+}
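
The victim-selection path above (shrink_weight(), list_add_sort() and
shrink_choose_victims()) amounts to: rank the per-class ckrm zones by how far
they are over their guarantee, then hand out the zone's active/inactive scan
budget to the heaviest offenders first, capped by each candidate's own
overage.  A minimal user-space sketch of that scheme (illustrative arrays and
numbers; qsort stands in for the sorted insert) might look like:

#include <stdio.h>
#include <stdlib.h>

struct victim {
	const char *name;
	long weight;			/* how far over its guarantee */
	long active_over, inactive_over;
	long shrink_active, shrink_inactive;
};

/* sort by weight, heaviest first */
static int by_weight_desc(const void *a, const void *b)
{
	const struct victim *va = a, *vb = b;

	return (vb->weight > va->weight) - (vb->weight < va->weight);
}

static void choose(struct victim *v, int n, long nr_active, long nr_inactive)
{
	int i;

	qsort(v, n, sizeof(*v), by_weight_desc);
	for (i = 0; i < n && (nr_active || nr_inactive); i++) {
		long na = nr_active < v[i].active_over ? nr_active : v[i].active_over;
		long ni = nr_inactive < v[i].inactive_over ? nr_inactive : v[i].inactive_over;

		v[i].shrink_active += na;	/* capped by the candidate's overage */
		v[i].shrink_inactive += ni;
		nr_active -= na;
		nr_inactive -= ni;
	}
}

int main(void)
{
	struct victim v[] = {
		{ "classA",  500, 200, 400, 0, 0 },
		{ "classB", 9000, 600, 900, 0, 0 },
		{ "classC", 3000, 300, 500, 0, 0 },
	};
	int i;

	choose(v, 3, 512, 1024);
	for (i = 0; i < 3; i++)
		printf("%s: scan %ld active, %ld inactive\n",
		       v[i].name, v[i].shrink_active, v[i].shrink_inactive);
	return 0;
}
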
Index: linux-2.6.12-rc1/mm/page_alloc.c
===================================================================
--- linux-2.6.12-rc1.orig/mm/page_alloc.c
+++ linux-2.6.12-rc1/mm/page_alloc.c
@@ -1693,8 +1693,7 @@ static void __init free_area_init_core(s
 		}
 		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
 				zone_names[j], realsize, batch);
-		INIT_LIST_HEAD(&zone->active_list);
-		INIT_LIST_HEAD(&zone->inactive_list);
+		ckrm_init_lists(zone);
 		zone->nr_scan_active = 0;
 		zone->nr_scan_inactive = 0;
 		zone->nr_active = 0;
Index: linux-2.6.12-rc1/mm/swap.c
===================================================================
--- linux-2.6.12-rc1.orig/mm/swap.c
+++ linux-2.6.12-rc1/mm/swap.c
@@ -30,6 +30,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/init.h>
+#include <linux/ckrm_mem_inline.h>
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
@@ -87,7 +88,7 @@ int rotate_reclaimable_page(struct page 
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	if (PageLRU(page) && !PageActive(page)) {
 		list_del(&page->lru);
-		list_add_tail(&page->lru, &zone->inactive_list);
+		ckrm_add_tail_inactive(page);
 		inc_page_state(pgrotated);
 	}
 	if (!test_clear_page_writeback(page))
Index: linux-2.6.12-rc1/mm/vmscan.c
===================================================================
--- linux-2.6.12-rc1.orig/mm/vmscan.c
+++ linux-2.6.12-rc1/mm/vmscan.c
@@ -33,6 +33,7 @@
 #include <linux/cpuset.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
+#include <linux/ckrm_mem.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -555,11 +556,23 @@ keep:
  * For pagecache intensive workloads, the first loop here is the hottest spot
  * in the kernel (apart from the copy_*_user functions).
  */
+#ifdef CONFIG_CKRM_RES_MEM
+static void shrink_cache(struct ckrm_zone *ckrm_zone, struct scan_control *sc)
+#else
 static void shrink_cache(struct zone *zone, struct scan_control *sc)
+#endif
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
 	int max_scan = sc->nr_to_scan;
+#ifdef CONFIG_CKRM_RES_MEM
+	struct zone *zone = ckrm_zone->zone;
+	struct list_head *inactive_list = &ckrm_zone->inactive_list;
+	struct list_head *active_list = &ckrm_zone->active_list;
+#else
+	struct list_head *inactive_list = &zone->inactive_list;
+	struct list_head *active_list = &zone->active_list;
+#endif
 
 	pagevec_init(&pvec, 1);
 
@@ -572,11 +585,10 @@ static void shrink_cache(struct zone *zo
 		int nr_freed;
 
 		while (nr_scan++ < sc->swap_cluster_max &&
-				!list_empty(&zone->inactive_list)) {
-			page = lru_to_page(&zone->inactive_list);
+				!list_empty(inactive_list)) {
+			page = lru_to_page(inactive_list);
 
-			prefetchw_prev_lru_page(page,
-						&zone->inactive_list, flags);
+			prefetchw_prev_lru_page(page, inactive_list, flags);
 
 			if (!TestClearPageLRU(page))
 				BUG();
@@ -587,13 +599,14 @@ static void shrink_cache(struct zone *zo
 				 */
 				__put_page(page);
 				SetPageLRU(page);
-				list_add(&page->lru, &zone->inactive_list);
+				list_add(&page->lru, inactive_list);
 				continue;
 			}
 			list_add(&page->lru, &page_list);
 			nr_taken++;
 		}
 		zone->nr_inactive -= nr_taken;
+		ckrm_zone_sub_inactive(ckrm_zone, nr_taken);
 		zone->pages_scanned += nr_scan;
 		spin_unlock_irq(&zone->lru_lock);
 
@@ -620,10 +633,15 @@ static void shrink_cache(struct zone *zo
 			if (TestSetPageLRU(page))
 				BUG();
 			list_del(&page->lru);
-			if (PageActive(page))
-				add_page_to_active_list(zone, page);
-			else
-				add_page_to_inactive_list(zone, page);
+			if (PageActive(page)) {
+				ckrm_zone_add_active(ckrm_zone, 1);
+				zone->nr_active++;
+				list_add(&page->lru, active_list);
+			} else {
+				ckrm_zone_add_inactive(ckrm_zone, 1);
+				zone->nr_inactive++;
+				list_add(&page->lru, inactive_list);
+			}
 			if (!pagevec_add(&pvec, page)) {
 				spin_unlock_irq(&zone->lru_lock);
 				__pagevec_release(&pvec);
@@ -654,7 +672,11 @@ done:
  * But we had to alter page->flags anyway.
  */
 static void
+#ifdef CONFIG_CKRM_RES_MEM
+refill_inactive_zone(struct ckrm_zone *ckrm_zone, struct scan_control *sc)
+#else
 refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+#endif
 {
 	int pgmoved;
 	int pgdeactivate = 0;
@@ -669,13 +691,21 @@ refill_inactive_zone(struct zone *zone, 
 	long mapped_ratio;
 	long distress;
 	long swap_tendency;
+#ifdef CONFIG_CKRM_RES_MEM
+	struct zone *zone = ckrm_zone->zone;
+	struct list_head *active_list = &ckrm_zone->active_list;
+	struct list_head *inactive_list = &ckrm_zone->inactive_list;
+#else
+	struct list_head *active_list = &zone->active_list;
+	struct list_head *inactive_list = &zone->inactive_list;
+#endif
 
 	lru_add_drain();
 	pgmoved = 0;
 	spin_lock_irq(&zone->lru_lock);
-	while (pgscanned < nr_pages && !list_empty(&zone->active_list)) {
-		page = lru_to_page(&zone->active_list);
-		prefetchw_prev_lru_page(page, &zone->active_list, flags);
+	while (pgscanned < nr_pages && !list_empty(active_list)) {
+		page = lru_to_page(active_list);
+		prefetchw_prev_lru_page(page, active_list, flags);
 		if (!TestClearPageLRU(page))
 			BUG();
 		list_del(&page->lru);
@@ -688,7 +718,7 @@ refill_inactive_zone(struct zone *zone, 
 			 */
 			__put_page(page);
 			SetPageLRU(page);
-			list_add(&page->lru, &zone->active_list);
+			list_add(&page->lru, active_list);
 		} else {
 			list_add(&page->lru, &l_hold);
 			pgmoved++;
@@ -697,6 +727,7 @@ refill_inactive_zone(struct zone *zone, 
 	}
 	zone->pages_scanned += pgscanned;
 	zone->nr_active -= pgmoved;
+	ckrm_zone_sub_active(ckrm_zone, pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 
 	/*
@@ -755,10 +786,11 @@ refill_inactive_zone(struct zone *zone, 
 			BUG();
 		if (!TestClearPageActive(page))
 			BUG();
-		list_move(&page->lru, &zone->inactive_list);
+		list_move(&page->lru, inactive_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			zone->nr_inactive += pgmoved;
+			ckrm_zone_add_inactive(ckrm_zone, pgmoved);
 			spin_unlock_irq(&zone->lru_lock);
 			pgdeactivate += pgmoved;
 			pgmoved = 0;
@@ -769,6 +801,7 @@ refill_inactive_zone(struct zone *zone, 
 		}
 	}
 	zone->nr_inactive += pgmoved;
+	ckrm_zone_add_inactive(ckrm_zone, pgmoved);
 	pgdeactivate += pgmoved;
 	if (buffer_heads_over_limit) {
 		spin_unlock_irq(&zone->lru_lock);
@@ -783,10 +816,11 @@ refill_inactive_zone(struct zone *zone, 
 		if (TestSetPageLRU(page))
 			BUG();
 		BUG_ON(!PageActive(page));
-		list_move(&page->lru, &zone->active_list);
+		list_move(&page->lru, active_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
 			zone->nr_active += pgmoved;
+			ckrm_zone_add_active(ckrm_zone, pgmoved);
 			pgmoved = 0;
 			spin_unlock_irq(&zone->lru_lock);
 			__pagevec_release(&pvec);
@@ -794,6 +828,7 @@ refill_inactive_zone(struct zone *zone, 
 		}
 	}
 	zone->nr_active += pgmoved;
+	ckrm_zone_add_active(ckrm_zone, pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 	pagevec_release(&pvec);
 
@@ -801,6 +836,32 @@ refill_inactive_zone(struct zone *zone, 
 	mod_page_state(pgdeactivate, pgdeactivate);
 }
 
+#ifdef CONFIG_CKRM_RES_MEM
+static void
+shrink_ckrmzone(struct ckrm_zone *czone, struct scan_control *sc)
+{
+	while (czone->shrink_active || czone->shrink_inactive) {
+		if (czone->shrink_active) {
+			sc->nr_to_scan = min(czone->shrink_active,
+					(unsigned long)SWAP_CLUSTER_MAX);
+			czone->shrink_active -= sc->nr_to_scan;
+			refill_inactive_zone(czone, sc);
+		}
+		if (czone->shrink_inactive) {
+			sc->nr_to_scan = min(czone->shrink_inactive,
+					(unsigned long)SWAP_CLUSTER_MAX);
+			czone->shrink_inactive -= sc->nr_to_scan;
+			shrink_cache(czone, sc);
+			if (sc->nr_to_reclaim <= 0) {
+				czone->shrink_active = 0;
+				czone->shrink_inactive = 0;
+				break;
+			}
+		}
+	}
+}
+#endif
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -809,6 +870,9 @@ shrink_zone(struct zone *zone, struct sc
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
+#ifdef CONFIG_CKRM_RES_MEM
+	struct ckrm_zone *czone;
+#endif
 
 	/*
 	 * Add one to `nr_to_scan' just to make sure that the kernel will
@@ -830,6 +894,24 @@ shrink_zone(struct zone *zone, struct sc
 
 	sc->nr_to_reclaim = sc->swap_cluster_max;
 
+#ifdef CONFIG_CKRM_RES_MEM
+	if (nr_active || nr_inactive) {
+		struct list_head *pos, *next;
+		LIST_HEAD(victims);
+
+		shrink_get_victims(zone, nr_active, nr_inactive, &victims);
+		pos = victims.next;
+		while (pos != &victims) {
+			czone = list_entry(pos, struct ckrm_zone, victim_list);
+			next = pos->next;
+			list_del_init(pos);
+			sc->nr_to_reclaim = czone->shrink_inactive;
+			shrink_ckrmzone(czone, sc);
+			ckrm_clear_shrink(czone);
+			pos = next;
+		}
+	}
+#else
 	while (nr_active || nr_inactive) {
 		if (nr_active) {
 			sc->nr_to_scan = min(nr_active,
@@ -847,6 +929,7 @@ shrink_zone(struct zone *zone, struct sc
 				break;
 		}
 	}
+#endif
 
 	throttle_vm_writeout();
 }
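
The ckrm_zone_{add,sub}_{active,inactive}() calls sprinkled through the vmscan.c hunks are also defined in ckrm_mem_inline.h earlier in the patch. The assumption is that they only maintain the per-class LRU counters and compile away when CONFIG_CKRM_RES_MEM is off, mirroring the existing no-op stubs; a sketch:

	#ifdef CONFIG_CKRM_RES_MEM
	static inline void ckrm_zone_add_active(struct ckrm_zone *czone, int cnt)
	{
		czone->nr_active += cnt;
	}

	static inline void ckrm_zone_sub_active(struct ckrm_zone *czone, int cnt)
	{
		czone->nr_active -= cnt;
	}
	/* ckrm_zone_add_inactive()/ckrm_zone_sub_inactive() follow the same pattern */
	#else
	#define ckrm_zone_add_active(czone, cnt)	do {} while (0)
	#define ckrm_zone_sub_active(czone, cnt)	do {} while (0)
	#endif

The hunks above update these alongside zone->nr_active/nr_inactive, so the zone-wide counters stay authoritative for the allocator while each class tracks its own share of the LRU.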

Thread overview: 10+ messages
2005-06-24 22:25 [PATCH 4/6] CKRM: Add guarantee support for mem controller Chandra Seetharaman
2005-06-30  5:52 ` [ckrm-tech] " YAMAMOTO Takashi
2005-06-30 18:11   ` Chandra Seetharaman
2005-06-30 18:23     ` Dave Hansen
2005-06-30 18:53       ` Chandra Seetharaman
2005-06-30 19:02         ` Dave Hansen
2005-06-30 19:21           ` Chandra Seetharaman
2005-06-30 19:29             ` Dave Hansen
