* [PATCH] mm: inactive-clean list
@ 2006-07-17 20:24 Peter Zijlstra
2006-07-18 3:37 ` Christoph Lameter
2006-07-23 5:50 ` Rik van Riel
0 siblings, 2 replies; 27+ messages in thread
From: Peter Zijlstra @ 2006-07-17 20:24 UTC (permalink / raw)
To: linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
This patch implements the inactive_clean list spoken of during the VM summit.
The LRU tail pages will be unmapped and ready to free, but not freed.
This gives reclaim an extra chance.
The only downside to this patch is that it puts another requirement for
the zone lock into mark_page_accessed(), meaning that the use-once
cleanup cannot fully get rid of that zone lock there.
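In outline, pages now flow through one extra stage (a sketch of what
the diff below implements, not new code):

	active_list --> inactive_list --> clean_list --> free

	/* shrink_page_list(): instead of freeing a reclaimable page,
	 * tag it; the putback path parks it on the new per-zone list */
	SetPageClean(page);

	/* shrink_clean_page_list(): a later pass does the actual
	 * remove_mapping() and frees the page */

	/* mark_page_accessed(): a clean page that gets touched is
	 * rescued back to the inactive list -- hence the zone lock
	 * requirement mentioned above */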
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/mm_inline.h | 23 +++++++++
include/linux/mmzone.h | 5 ++
include/linux/page-flags.h | 6 ++
include/linux/swap.h | 1
include/linux/sysctl.h | 1
kernel/sysctl.c | 11 ++++
mm/page_alloc.c | 62 ++++++++++++++++++++++++-
mm/swap.c | 44 ++++++++++--------
mm/swapfile.c | 4 -
mm/vmscan.c | 108 ++++++++++++++++++++++++++++++++++++++-------
10 files changed, 225 insertions(+), 40 deletions(-)
Index: linux-2.6-dirty/include/linux/swap.h
===================================================================
--- linux-2.6-dirty.orig/include/linux/swap.h 2006-06-30 08:58:08.000000000 +0200
+++ linux-2.6-dirty/include/linux/swap.h 2006-06-30 10:39:37.000000000 +0200
@@ -173,7 +173,6 @@ extern unsigned int nr_free_pagecache_pa
/* linux/mm/swap.c */
extern void FASTCALL(lru_cache_add(struct page *));
extern void FASTCALL(lru_cache_add_active(struct page *));
-extern void FASTCALL(activate_page(struct page *));
extern void FASTCALL(mark_page_accessed(struct page *));
extern void lru_add_drain(void);
extern int lru_add_drain_all(void);
Index: linux-2.6-dirty/mm/swapfile.c
===================================================================
--- linux-2.6-dirty.orig/mm/swapfile.c 2006-06-30 08:58:08.000000000 +0200
+++ linux-2.6-dirty/mm/swapfile.c 2006-06-30 10:39:37.000000000 +0200
@@ -499,7 +499,7 @@ static void unuse_pte(struct vm_area_str
* Move the page to the active list so it is not
* immediately swapped out again after swapon.
*/
- activate_page(page);
+ mark_page_accessed(page);
}
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -601,7 +601,7 @@ static int unuse_mm(struct mm_struct *mm
* Activate page so shrink_cache is unlikely to unmap its
* ptes while lock is dropped, so swapoff can make progress.
*/
- activate_page(page);
+ mark_page_accessed(page);
unlock_page(page);
down_read(&mm->mmap_sem);
lock_page(page);
Index: linux-2.6-dirty/mm/swap.c
===================================================================
--- linux-2.6-dirty.orig/mm/swap.c 2006-06-30 08:58:08.000000000 +0200
+++ linux-2.6-dirty/mm/swap.c 2006-06-30 10:39:37.000000000 +0200
@@ -96,37 +96,45 @@ int rotate_reclaimable_page(struct page
}
/*
- * FIXME: speed this up?
- */
-void fastcall activate_page(struct page *page)
-{
- struct zone *zone = page_zone(page);
-
- spin_lock_irq(&zone->lru_lock);
- if (PageLRU(page) && !PageActive(page)) {
- del_page_from_inactive_list(zone, page);
- SetPageActive(page);
- add_page_to_active_list(zone, page);
- inc_page_state(pgactivate);
- }
- spin_unlock_irq(&zone->lru_lock);
-}
-
-/*
* Mark a page as having seen activity.
*
+ * clean -> inactive
+ *
* inactive,unreferenced -> inactive,referenced
* inactive,referenced -> active,unreferenced
* active,unreferenced -> active,referenced
+ *
+ * FIXME: speed this up?
*/
void fastcall mark_page_accessed(struct page *page)
{
+ struct zone *zone = NULL;
+ if (PageClean(page) && PageLRU(page)) {
+ zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+ if (PageClean(page) && PageLRU(page)) {
+ del_page_from_clean_list(zone, page);
+ ClearPageClean(page);
+ add_page_to_inactive_list(zone, page);
+ }
+ }
if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
- activate_page(page);
+ if (!zone) {
+ zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+ }
+ if (PageLRU(page) && !PageActive(page)) {
+ del_page_from_inactive_list(zone, page);
+ SetPageActive(page);
+ add_page_to_active_list(zone, page);
+ inc_page_state(pgactivate);
+ }
ClearPageReferenced(page);
} else if (!PageReferenced(page)) {
SetPageReferenced(page);
}
+ if (zone)
+ spin_unlock_irq(&zone->lru_lock);
}
EXPORT_SYMBOL(mark_page_accessed);
Index: linux-2.6-dirty/include/linux/mm_inline.h
===================================================================
--- linux-2.6-dirty.orig/include/linux/mm_inline.h 2006-06-30 08:58:08.000000000 +0200
+++ linux-2.6-dirty/include/linux/mm_inline.h 2006-06-30 10:39:37.000000000 +0200
@@ -14,6 +14,13 @@ add_page_to_inactive_list(struct zone *z
}
static inline void
+add_page_to_clean_list(struct zone *zone, struct page *page)
+{
+ list_add(&page->lru, &zone->clean_list);
+ zone->nr_clean++;
+}
+
+static inline void
del_page_from_active_list(struct zone *zone, struct page *page)
{
list_del(&page->lru);
@@ -27,6 +34,17 @@ del_page_from_inactive_list(struct zone
zone->nr_inactive--;
}
+void wakeup_kswapd(struct zone *zone, int order);
+
+static inline void
+del_page_from_clean_list(struct zone *zone, struct page *page)
+{
+ list_del(&page->lru);
+ zone->nr_clean--;
+ if (zone->nr_clean + zone->free_pages < zone->clean_low)
+ wakeup_kswapd(zone, 0);
+}
+
static inline void
del_page_from_lru(struct zone *zone, struct page *page)
{
@@ -34,6 +52,11 @@ del_page_from_lru(struct zone *zone, str
if (PageActive(page)) {
__ClearPageActive(page);
zone->nr_active--;
+ } else if (PageClean(page)) {
+ __ClearPageClean(page);
+ zone->nr_clean--;
+ if (zone->nr_clean + zone->free_pages < zone->clean_low)
+ wakeup_kswapd(zone, 0);
} else {
zone->nr_inactive--;
}
Index: linux-2.6-dirty/include/linux/mmzone.h
===================================================================
--- linux-2.6-dirty.orig/include/linux/mmzone.h 2006-06-30 08:58:08.000000000 +0200
+++ linux-2.6-dirty/include/linux/mmzone.h 2006-06-30 10:39:37.000000000 +0200
@@ -155,10 +155,13 @@ struct zone {
spinlock_t lru_lock;
struct list_head active_list;
struct list_head inactive_list;
+ struct list_head clean_list;
unsigned long nr_scan_active;
unsigned long nr_scan_inactive;
unsigned long nr_active;
unsigned long nr_inactive;
+ unsigned long nr_clean;
+ unsigned long clean_low, clean_high;
unsigned long pages_scanned; /* since last reclaim */
int all_unreclaimable; /* All pages pinned */
@@ -397,6 +400,8 @@ struct ctl_table;
struct file;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
+int min_clean_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
Index: linux-2.6-dirty/include/linux/page-flags.h
===================================================================
--- linux-2.6-dirty.orig/include/linux/page-flags.h 2006-06-30 08:58:08.000000000 +0200
+++ linux-2.6-dirty/include/linux/page-flags.h 2006-06-30 10:39:37.000000000 +0200
@@ -90,6 +90,7 @@
#define PG_nosave_free 18 /* Free, should not be written */
#define PG_buddy 19 /* Page is free, on buddy lists */
+#define PG_clean 20 /* Page is on the clean list */
#if (BITS_PER_LONG > 32)
/*
@@ -372,6 +373,11 @@ extern void __mod_page_state_offset(unsi
#define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags)
#define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags)
+#define PageClean(page) test_bit(PG_clean, &(page)->flags)
+#define SetPageClean(page) set_bit(PG_clean, &(page)->flags)
+#define ClearPageClean(page) clear_bit(PG_clean, &(page)->flags)
+#define __ClearPageClean(page) __clear_bit(PG_clean, &(page)->flags)
+
struct page; /* forward declaration */
int test_clear_page_dirty(struct page *page);
Index: linux-2.6-dirty/mm/page_alloc.c
===================================================================
--- linux-2.6-dirty.orig/mm/page_alloc.c 2006-06-30 08:58:08.000000000 +0200
+++ linux-2.6-dirty/mm/page_alloc.c 2006-06-30 11:28:14.000000000 +0200
@@ -83,6 +83,7 @@ EXPORT_SYMBOL(zone_table);
static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
int min_free_kbytes = 1024;
+int min_clean_kbytes = 8192;
unsigned long __meminitdata nr_kernel_pages;
unsigned long __meminitdata nr_all_pages;
@@ -155,7 +156,8 @@ static void bad_page(struct page *page)
1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback |
- 1 << PG_buddy );
+ 1 << PG_buddy |
+ 1 << PG_clean );
set_page_count(page, 0);
reset_page_mapcount(page);
page->mapping = NULL;
@@ -390,7 +392,8 @@ static inline int free_pages_check(struc
1 << PG_swapcache |
1 << PG_writeback |
1 << PG_reserved |
- 1 << PG_buddy ))))
+ 1 << PG_buddy |
+ 1 << PG_clean ))))
bad_page(page);
if (PageDirty(page))
__ClearPageDirty(page);
@@ -539,7 +542,8 @@ static int prep_new_page(struct page *pa
1 << PG_swapcache |
1 << PG_writeback |
1 << PG_reserved |
- 1 << PG_buddy ))))
+ 1 << PG_buddy |
+ 1 << PG_clean ))))
bad_page(page);
/*
@@ -1466,6 +1470,9 @@ void show_free_areas(void)
" min:%lukB"
" low:%lukB"
" high:%lukB"
+ " clean: %lukB"
+ " low: %lukB"
+ " high: %lukB"
" active:%lukB"
" inactive:%lukB"
" present:%lukB"
@@ -1477,6 +1484,9 @@ void show_free_areas(void)
K(zone->pages_min),
K(zone->pages_low),
K(zone->pages_high),
+ K(zone->nr_clean),
+ K(zone->clean_low),
+ K(zone->clean_high),
K(zone->nr_active),
K(zone->nr_inactive),
K(zone->present_pages),
@@ -2176,10 +2186,12 @@ static void __meminit free_area_init_cor
zone_pcp_init(zone);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
+ INIT_LIST_HEAD(&zone->clean_list);
zone->nr_scan_active = 0;
zone->nr_scan_inactive = 0;
zone->nr_active = 0;
zone->nr_inactive = 0;
+ zone->nr_clean = 0;
atomic_set(&zone->reclaim_in_progress, 0);
if (!size)
continue;
@@ -2336,6 +2348,9 @@ static int zoneinfo_show(struct seq_file
"\n min %lu"
"\n low %lu"
"\n high %lu"
+ "\n clean %lu"
+ "\n low %lu"
+ "\n high %lu"
"\n active %lu"
"\n inactive %lu"
"\n scanned %lu (a: %lu i: %lu)"
@@ -2345,6 +2360,9 @@ static int zoneinfo_show(struct seq_file
zone->pages_min,
zone->pages_low,
zone->pages_high,
+ zone->nr_clean,
+ zone->clean_low,
+ zone->clean_high,
zone->nr_active,
zone->nr_inactive,
zone->pages_scanned,
@@ -2632,6 +2650,34 @@ static void setup_per_zone_lowmem_reserv
calculate_totalreserve_pages();
}
+void setup_per_zone_pages_clean(void)
+{
+ unsigned long pages_clean = min_clean_kbytes >> (PAGE_SHIFT - 10);
+ unsigned long lowmem_pages = 0;
+ struct zone *zone;
+ unsigned long flags;
+
+ /* Calculate total number of !ZONE_HIGHMEM pages */
+ for_each_zone(zone) {
+ if (!is_highmem(zone))
+ lowmem_pages += zone->present_pages;
+ }
+
+ for_each_zone(zone) {
+ u64 tmp = pages_clean;
+
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ if (!is_highmem(zone)) {
+ tmp *= zone->present_pages;
+ do_div(tmp, lowmem_pages);
+ }
+
+ zone->clean_low = zone->pages_min + tmp - (tmp >> 2);
+ zone->clean_high = zone->pages_min + tmp + (tmp >> 2);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ }
+}
+
/*
* setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
* that the pages_{min,low,high} values for each zone are set correctly
@@ -2689,6 +2735,8 @@ void setup_per_zone_pages_min(void)
/* update totalreserve_pages */
calculate_totalreserve_pages();
+ /* update the clean pages watermarks */
+ setup_per_zone_pages_clean();
}
/*
@@ -2745,6 +2793,14 @@ int min_free_kbytes_sysctl_handler(ctl_t
return 0;
}
+int min_clean_kbytes_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ proc_dointvec(table, write, file, buffer, length, ppos);
+ setup_per_zone_pages_clean();
+ return 0;
+}
+
/*
* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
Index: linux-2.6-dirty/mm/vmscan.c
===================================================================
--- linux-2.6-dirty.orig/mm/vmscan.c 2006-06-30 08:58:08.000000000 +0200
+++ linux-2.6-dirty/mm/vmscan.c 2006-06-30 10:39:37.000000000 +0200
@@ -549,8 +549,8 @@ static unsigned long shrink_page_list(st
goto free_it;
}
- if (!remove_mapping(mapping, page))
- goto keep_locked;
+ SetPageClean(page);
+ goto keep_locked;
free_it:
unlock_page(page);
@@ -627,12 +627,14 @@ static unsigned long isolate_lru_pages(u
return nr_taken;
}
-/*
- * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
- * of reclaimed pages
- */
-static unsigned long shrink_inactive_list(unsigned long max_scan,
- struct zone *zone, struct scan_control *sc)
+typedef unsigned long (*shrink_func_t)(struct list_head *,
+ struct scan_control *);
+
+static unsigned long shrink_list(unsigned long max_scan,
+ struct zone *zone, struct scan_control *sc,
+ struct list_head *src_list,
+ unsigned long *src_count,
+ shrink_func_t shrink_func)
{
LIST_HEAD(page_list);
struct pagevec pvec;
@@ -650,14 +652,13 @@ static unsigned long shrink_inactive_lis
unsigned long nr_freed;
nr_taken = isolate_lru_pages(sc->swap_cluster_max,
- &zone->inactive_list,
- &page_list, &nr_scan);
- zone->nr_inactive -= nr_taken;
+ src_list, &page_list, &nr_scan);
+ *src_count -= nr_taken;
zone->pages_scanned += nr_scan;
spin_unlock_irq(&zone->lru_lock);
nr_scanned += nr_scan;
- nr_freed = shrink_page_list(&page_list, sc);
+ nr_freed = shrink_func(&page_list, sc);
nr_reclaimed += nr_freed;
local_irq_disable();
if (current_is_kswapd()) {
@@ -681,6 +682,8 @@ static unsigned long shrink_inactive_lis
list_del(&page->lru);
if (PageActive(page))
add_page_to_active_list(zone, page);
+ else if (PageClean(page))
+ add_page_to_clean_list(zone, page);
else
add_page_to_inactive_list(zone, page);
if (!pagevec_add(&pvec, page)) {
@@ -689,7 +692,7 @@ static unsigned long shrink_inactive_lis
spin_lock_irq(&zone->lru_lock);
}
}
- } while (nr_scanned < max_scan);
+ } while (nr_scanned < max_scan);
spin_unlock(&zone->lru_lock);
done:
local_irq_enable();
@@ -698,6 +701,17 @@ done:
}
/*
+ * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * of reclaimed pages
+ */
+static inline unsigned long shrink_inactive_list(unsigned long max_scan,
+ struct zone *zone, struct scan_control *sc)
+{
+ return shrink_list(max_scan, zone, sc, &zone->inactive_list,
+ &zone->nr_inactive, shrink_page_list);
+}
+
+/*
* This moves pages from the active list to the inactive list.
*
* We move them the other way if the page is referenced by one or more
@@ -850,6 +864,59 @@ static void shrink_active_list(unsigned
pagevec_release(&pvec);
}
+static unsigned long shrink_clean_page_list(struct list_head *page_list,
+ struct scan_control *sc)
+{
+ LIST_HEAD(ret_pages);
+ struct pagevec freed_pvec;
+ unsigned long nr_reclaimed = 0;
+
+ pagevec_init(&freed_pvec, 1);
+ while (!list_empty(page_list)) {
+ struct address_space *mapping;
+ struct page *page;
+
+ cond_resched();
+
+ page = lru_to_page(page_list);
+ prefetchw_prev_lru_page(page, page_list, flags);
+
+ list_del(&page->lru);
+
+ if (TestSetPageLocked(page))
+ goto keep;
+
+ mapping = page_mapping(page);
+
+ if (!remove_mapping(mapping, page))
+ goto keep_locked;
+
+ ClearPageClean(page);
+ unlock_page(page);
+ nr_reclaimed++;
+ if (!pagevec_add(&freed_pvec, page))
+ __pagevec_release_nonlru(&freed_pvec);
+ continue;
+
+keep_locked:
+ ClearPageClean(page);
+ unlock_page(page);
+keep:
+ list_add(&page->lru, &ret_pages);
+ }
+ list_splice(&ret_pages, page_list);
+ if (pagevec_count(&freed_pvec))
+ __pagevec_release_nonlru(&freed_pvec);
+ return nr_reclaimed;
+}
+
+static inline unsigned long shrink_clean_list(unsigned long max_scan,
+ struct zone *zone, struct scan_control *sc)
+{
+ return shrink_list(max_scan, zone, sc, &zone->clean_list,
+ &zone->nr_clean, shrink_clean_page_list);
+}
+
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
@@ -863,6 +930,13 @@ static unsigned long shrink_zone(int pri
atomic_inc(&zone->reclaim_in_progress);
+ if (!priority || zone->nr_clean + zone->free_pages > zone->clean_high)
+ nr_reclaimed +=
+ shrink_clean_list(sc->swap_cluster_max, zone, sc);
+
+ if (nr_reclaimed && zone->nr_clean > zone->clean_high)
+ goto done;
+
/*
* Add one to `nr_to_scan' just to make sure that the kernel will
* slowly sift through the active list.
@@ -900,6 +974,7 @@ static unsigned long shrink_zone(int pri
throttle_vm_writeout();
+done:
atomic_dec(&zone->reclaim_in_progress);
return nr_reclaimed;
}
@@ -986,7 +1061,7 @@ unsigned long try_to_free_pages(struct z
continue;
zone->temp_priority = DEF_PRIORITY;
- lru_pages += zone->nr_active + zone->nr_inactive;
+ lru_pages += zone->nr_active + zone->nr_inactive + zone->nr_clean;
}
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
@@ -1119,7 +1194,7 @@ scan:
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
- lru_pages += zone->nr_active + zone->nr_inactive;
+ lru_pages += zone->nr_active + zone->nr_inactive + zone->nr_clean;
}
/*
@@ -1280,7 +1355,8 @@ void wakeup_kswapd(struct zone *zone, in
return;
pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
+ if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0) &&
+ zone->nr_clean + zone->free_pages > zone->clean_high)
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
Index: linux-2.6-dirty/include/linux/sysctl.h
===================================================================
--- linux-2.6-dirty.orig/include/linux/sysctl.h 2006-06-30 08:58:08.000000000 +0200
+++ linux-2.6-dirty/include/linux/sysctl.h 2006-06-30 10:39:37.000000000 +0200
@@ -191,6 +191,7 @@ enum
VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
+ VM_MIN_CLEAN_KBYTES=35, /* Minimum clean kilobytes to maintain */
};
Index: linux-2.6-dirty/kernel/sysctl.c
===================================================================
--- linux-2.6-dirty.orig/kernel/sysctl.c 2006-06-30 08:58:09.000000000 +0200
+++ linux-2.6-dirty/kernel/sysctl.c 2006-06-30 10:39:37.000000000 +0200
@@ -68,6 +68,7 @@ extern char core_pattern[];
extern int cad_pid;
extern int pid_max;
extern int min_free_kbytes;
+extern int min_clean_kbytes;
extern int printk_ratelimit_jiffies;
extern int printk_ratelimit_burst;
extern int pid_max_min, pid_max_max;
@@ -851,6 +852,16 @@ static ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .ctl_name = VM_MIN_CLEAN_KBYTES,
+ .procname = "min_clean_kbytes",
+ .data = &min_clean_kbytes,
+ .maxlen = sizeof(min_clean_kbytes),
+ .mode = 0644,
+ .proc_handler = &min_clean_kbytes_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ },
+ {
.ctl_name = VM_PERCPU_PAGELIST_FRACTION,
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
* Re: [PATCH] mm: inactive-clean list
2006-07-17 20:24 [PATCH] mm: inactive-clean list Peter Zijlstra
@ 2006-07-18 3:37 ` Christoph Lameter
2006-07-18 12:16 ` Peter Zijlstra
2006-07-23 5:50 ` Rik van Riel
1 sibling, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2006-07-18 3:37 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On Mon, 17 Jul 2006, Peter Zijlstra wrote:
> This patch implements the inactive_clean list spoken of during the VM summit.
> The LRU tail pages will be unmapped and ready to free, but not freed.
> This gives reclaim an extra chance.
I thought we wanted to just track the number of unmapped clean pages and
ensure that they do not fall below a certain limit? That would not require
any locking changes, just a new zoned counter and a check in the dirty
handling path.
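(Roughly, as a hypothetical sketch against the 2.6.18-era zoned VM
counters; min_unmapped_clean is a made-up watermark, and pages that
are both mapped and dirty get subtracted twice unless a new counter
splits them out:)

	static inline unsigned long zone_unmapped_clean(struct zone *z)
	{
		return zone_page_state(z, NR_FILE_PAGES) -
		       zone_page_state(z, NR_FILE_MAPPED) -
		       zone_page_state(z, NR_FILE_DIRTY);
	}

	/* in the dirty handling path: start writeout early when the
	 * pool of unmapped clean pages runs low */
	if (zone_unmapped_clean(zone) < zone->min_unmapped_clean)
		wakeup_pdflush(0);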
* Re: [PATCH] mm: inactive-clean list
2006-07-18 3:37 ` Christoph Lameter
@ 2006-07-18 12:16 ` Peter Zijlstra
2006-07-18 13:29 ` Christoph Lameter
0 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2006-07-18 12:16 UTC (permalink / raw)
To: Christoph Lameter; +Cc: linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On Mon, 2006-07-17 at 20:37 -0700, Christoph Lameter wrote:
> On Mon, 17 Jul 2006, Peter Zijlstra wrote:
>
> > This patch implements the inactive_clean list spoken of during the VM summit.
> > The LRU tail pages will be unmapped and ready to free, but not freed.
> > This gives reclaim an extra chance.
>
> I thought we wanted to just track the number of unmapped clean pages and
> ensure that they do not fall below a certain limit? That would not require
> any locking changes, just a new zoned counter and a check in the dirty
> handling path.
The problem I see with that is that we cannot create new unmapped clean
pages. Where will we get new pages to satisfy our demand when there is
nothing mmap'ed?
This approach will generate them by forcing some pages into swap space.
* Re: [PATCH] mm: inactive-clean list
2006-07-18 12:16 ` Peter Zijlstra
@ 2006-07-18 13:29 ` Christoph Lameter
2006-07-18 13:55 ` Martin J. Bligh
0 siblings, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2006-07-18 13:29 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On Tue, 18 Jul 2006, Peter Zijlstra wrote:
> > I thought we wanted to just track the number of unmapped clean pages and
> > ensure that they do not fall below a certain limit? That would not require
> > any locking changes, just a new zoned counter and a check in the dirty
> > handling path.
>
> The problem I see with that is that we cannot create new unmapped clean
> pages. Where will we get new pages to satisfy our demand when there is
> nothing mmap'ed?
Hmmm... I am not sure that we both have this straight yet.
Adding logic to determine the number of clean pages is not necessary. The
number of clean pages in the pagecache can be determined by:
global_page_state(NR_FILE_PAGES) - global_page_state(NR_FILE_DIRTY)
That number can be increased by writeout, so I think we want this to
be checked in the throttling path. Swapout is only useful for
anonymous pages. Dirty anonymous pages are not tracked and do not
contribute to NR_FILE_DIRTY (formerly nr_dirty). We only track
the number of anonymous pages in NR_ANON_PAGES. Swapout could be used
to reduce NR_ANON_PAGES if memory becomes tight.
The intent of ensuring that a certain number of clean pages exists seems to
be to guarantee that a certain amount of memory is freeable without
having to go through a filesystem.
Pages that are available without file system activity are:
1. The already free pages.
2. The clean pagecache pages.
For a zone this is
zone->free_pages + zone_page_state(zone, NR_FILE_PAGES) -
zone_page_state(zone, NR_FILE_DIRTY)
If this goes below a certain limit then we have two options:
1. If NR_FILE_DIRTY is significant then we can increase the number
of reclaimable pages by writing them out.
2. If NR_FILE_DIRTY and NR_FILE_PAGES are low then writeout does
not help us. NR_ANON_PAGES is likely big, so we could swap some
anonymous pages out to increase zone->free_pages instead.
Performance-wise this is a bad move, so we should prefer writeout.
However, the above scheme assumes that all pagecache pages can be
unmapped if necessary. This may not be desirable since we may then
have no executable pages available anymore and create a significant
amount of disk traffic. If we tracked the number of dirty unmapped
pages (by adding NR_UNMAPPED_DIRTY) then we could guarantee available
memory that leaves the pages in use by processes alone.
If we impose a limit on the number of free pages + the number of unmapped
clean pagecache pages then we have a reserve memory pool that can be
accessed without too much impact on performance. It's basically another
trigger for writeout.
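(A sketch of that check; the helper name and the watermark are
hypothetical, the counters and wakeup calls are the existing ones:)

	static void check_easy_pages(struct zone *zone)
	{
		unsigned long easy = zone->free_pages +
			zone_page_state(zone, NR_FILE_PAGES) -
			zone_page_state(zone, NR_FILE_DIRTY);

		if (easy >= zone->min_easy_pages)	/* made-up limit */
			return;

		if (zone_page_state(zone, NR_FILE_DIRTY))
			wakeup_pdflush(0);	/* 1. writeout creates clean pages */
		else
			wakeup_kswapd(zone, 0);	/* 2. swap anonymous pages out */
	}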
* Re: [PATCH] mm: inactive-clean list
2006-07-18 13:29 ` Christoph Lameter
@ 2006-07-18 13:55 ` Martin J. Bligh
2006-07-18 13:59 ` Christoph Lameter
2006-07-18 14:03 ` Christoph Lameter
0 siblings, 2 replies; 27+ messages in thread
From: Martin J. Bligh @ 2006-07-18 13:55 UTC (permalink / raw)
To: Christoph Lameter
Cc: Peter Zijlstra, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
Christoph Lameter wrote:
> On Tue, 18 Jul 2006, Peter Zijlstra wrote:
>
>
>>>I thought we wanted to just track the number of unmapped clean pages and
>>>ensure that they do not fall below a certain limit? That would not require
>>>any locking changes, just a new zoned counter and a check in the dirty
>>>handling path.
>>
>>The problem I see with that is that we cannot create new unmapped clean
>>pages. Where will we get new pages to satisfy our demand when there is
>>nothing mmap'ed?
>
>
> Hmmm... I am not sure that we both have this straight yet.
>
> Adding logic to determine the number of clean pages is not necessary. The
> number of clean pages in the pagecache can be determined by:
>
> global_page_state(NR_FILE_PAGES) - global_page_state(NR_FILE_DIRTY)
It's not that simple. We also need to deal with other types of
non-freeable pages, such as memlocked.
Someone remind me why we can't remove the memlocked pages from the LRU
again? Apart from needing a refcount of how many times they're memlocked
(or we just shove them back whenever they're unlocked, and let it fall
out again when we walk the list, but that doesn't fix the accounting
problem).
* Re: [PATCH] mm: inactive-clean list
2006-07-18 13:55 ` Martin J. Bligh
@ 2006-07-18 13:59 ` Christoph Lameter
2006-07-18 15:12 ` Martin J. Bligh
2006-07-18 14:03 ` Christoph Lameter
1 sibling, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2006-07-18 13:59 UTC (permalink / raw)
To: Martin J. Bligh
Cc: Peter Zijlstra, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On Tue, 18 Jul 2006, Martin J. Bligh wrote:
> Someone remind me why we can't remove the memlocked pages from the LRU
> again? Apart from needing a refcount of how many times they're memlocked
> (or we just shove them back whenever they're unlocked, and let it fall
> out again when we walk the list, but that doesn't fix the accounting
> problem).
We simply do not unmap memlocked pages (see try_to_unmap). And therefore
they are not reclaimable.
* Re: [PATCH] mm: inactive-clean list
2006-07-18 13:55 ` Martin J. Bligh
2006-07-18 13:59 ` Christoph Lameter
@ 2006-07-18 14:03 ` Christoph Lameter
2006-07-18 14:25 ` Andrew Morton
1 sibling, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2006-07-18 14:03 UTC (permalink / raw)
To: Martin J. Bligh
Cc: Peter Zijlstra, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On Tue, 18 Jul 2006, Martin J. Bligh wrote:
> > Adding logic to determine the number of clean pages is not necessary. The
> > number of clean pages in the pagecache can be determined by:
> >
> > global_page_state(NR_FILE_PAGES) - global_page_state(NR_FILE_DIRTY)
>
> It's not that simple. We also need to deal with other types of non-freeable
> pages, such as memlocked.
mlocked is an exceptional case. The problem is that whether a
page is mlocked is only discoverable via the vma. One has to
scan the reverse-mapping list and check all the vmas for the flag.
Is mlock that important?
What other types of non-freeable pages could exist?
Maybe slab allocations and direct kernel allocations? We have only
limited means to reclaim those pages.
* Re: [PATCH] mm: inactive-clean list
2006-07-18 14:03 ` Christoph Lameter
@ 2006-07-18 14:25 ` Andrew Morton
2006-07-18 14:45 ` Christoph Lameter
0 siblings, 1 reply; 27+ messages in thread
From: Andrew Morton @ 2006-07-18 14:25 UTC (permalink / raw)
To: Christoph Lameter; +Cc: mbligh, a.p.zijlstra, linux-mm, torvalds, linux-kernel
On Tue, 18 Jul 2006 07:03:12 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:
> What other types of non-freeable pages could exist?
PageWriteback() pages (potentially all of memory)
Pinned pages (various transient conditions, mainly get_user_pages())
Some pages whose buffers are attached to an ext3 journal.
Possibly NFS unstable pages.
* Re: [PATCH] mm: inactive-clean list
2006-07-18 14:25 ` Andrew Morton
@ 2006-07-18 14:45 ` Christoph Lameter
2006-07-18 15:59 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2006-07-18 14:45 UTC (permalink / raw)
To: Andrew Morton; +Cc: mbligh, a.p.zijlstra, linux-mm, torvalds, linux-kernel
On Tue, 18 Jul 2006, Andrew Morton wrote:
> Christoph Lameter <clameter@sgi.com> wrote:
> > What other types of non-freeable pages could exist?
>
> PageWriteback() pages (potentially all of memory)
Doesn't write throttling take care of that?
> Pinned pages (various transient conditions, mainly get_user_pages())
Hmm....
> Some pages whose buffers are attached to an ext3 journal.
These are just pinned by an increased refcount, right?
> Possibly NFS unstable pages.
These are tracked by NR_UNSTABLE_NFS.
Maybe we need a NR_UNSTABLE that includes pinned pages?
* Re: [PATCH] mm: inactive-clean list
2006-07-18 13:59 ` Christoph Lameter
@ 2006-07-18 15:12 ` Martin J. Bligh
2006-07-18 15:57 ` Christoph Lameter
0 siblings, 1 reply; 27+ messages in thread
From: Martin J. Bligh @ 2006-07-18 15:12 UTC (permalink / raw)
To: Christoph Lameter
Cc: Peter Zijlstra, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
Christoph Lameter wrote:
> On Tue, 18 Jul 2006, Martin J. Bligh wrote:
>
>
>>Someone remind me why we can't remove the memlocked pages from the LRU
>>again? Apart from needing a refcount of how many times they're memlocked
>>(or we just shove them back whenever they're unlocked, and let it fall
>>out again when we walk the list, but that doesn't fix the accounting
>>problem).
>
>
> We simply do not unmap memlocked pages (see try_to_unmap). And therefore
> they are not reclaimable.
The point is that they're still going to be included in your counts.
> On Tue, 18 Jul 2006, Andrew Morton wrote:
>>> Christoph Lameter <clameter@sgi.com> wrote:
>>>> > What other types of non-freeable pages could exist?
>>>
>>> PageWriteback() pages (potentially all of memory)
>
> Doesn't write throttling take care of that?
>
>>> Pinned pages (various transient conditions, mainly get_user_pages())
>
> Hmm....
>
>>> Some pages whose buffers are attached to an ext3 journal.
>
> These are just pinned by an increased refcount, right?
>
>>> Possibly NFS unstable pages.
>
> These are tracked by NR_UNSTABLE_NFS.
>
> Maybe we need a NR_UNSTABLE that includes pinned pages?
The point of what we decided on Sunday was that we want to count the
pages that we KNOW are easy to free. So all of these should be
taken out of the count before we take it.
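(For the global count, that could look something like the sketch below;
writeback and unstable NFS pages already have counters, mlocked and
pinned pages would still need ones of their own:)

	unsigned long easy_to_free =
		global_page_state(NR_FILE_PAGES) -
		global_page_state(NR_FILE_DIRTY) -
		global_page_state(NR_WRITEBACK) -
		global_page_state(NR_UNSTABLE_NFS);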
M.
* Re: [PATCH] mm: inactive-clean list
2006-07-18 15:12 ` Martin J. Bligh
@ 2006-07-18 15:57 ` Christoph Lameter
2006-07-18 16:23 ` Martin J. Bligh
0 siblings, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2006-07-18 15:57 UTC (permalink / raw)
To: Martin J. Bligh
Cc: Peter Zijlstra, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On Tue, 18 Jul 2006, Martin J. Bligh wrote:
> > Maybe we need a NR_UNSTABLE that includes pinned pages?
>
> The point of what we decided on Sunday was that we want to count the
> pages that we KNOW are easy to free. So all of these should be
> taken out of the count before we take it.
Unmapped clean pages are easily freeable and do not have these issues.
Could we just use that for now? Otherwise we have to add counters for
the categories that we do not yet track and take them out of the count.
* Re: [PATCH] mm: inactive-clean list
2006-07-18 14:45 ` Christoph Lameter
@ 2006-07-18 15:59 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 27+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-07-18 15:59 UTC (permalink / raw)
To: Christoph Lameter
Cc: akpm, mbligh, a.p.zijlstra, linux-mm, torvalds, linux-kernel
On Tue, 18 Jul 2006 07:45:21 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:
> On Tue, 18 Jul 2006, Andrew Morton wrote:
>
> > Christoph Lameter <clameter@sgi.com> wrote:
> > > What other types of non-freeable pages could exist?
> >
> > PageWriteback() pages (potentially all of memory)
>
> Doesn't write throttling take care of that?
>
> > Pinned pages (various transient conditions, mainly get_user_pages())
>
> Hmm....
>
> > Some pages whose buffers are attached to an ext3 journal.
>
> These are just pinned by an increased refcount, right?
>
> > Possibly NFS unstable pages.
>
> These are tracked by NR_UNSTABLE_NFS.
>
> Maybe we need a NR_UNSTABLE that includes pinned pages?
>
I'm not sure what was discussed at the VM summit; if I miss the point, sorry.
I think the important thing here is the amount of free pages we try to keep.
Retaining unused pages as page cache (rather than freeing them) may help
performance if we are lucky, but it increases uncertainty.
Enlarging min_free_kbytes or the mempools is useful for avoiding page
allocation failures in device drivers, which do the write-back of dirty pages.
I think controlling free memory itself is better than controlling *used*
inactive-clean pages.
Bye
-Kame
* Re: [PATCH] mm: inactive-clean list
2006-07-18 15:57 ` Christoph Lameter
@ 2006-07-18 16:23 ` Martin J. Bligh
0 siblings, 0 replies; 27+ messages in thread
From: Martin J. Bligh @ 2006-07-18 16:23 UTC (permalink / raw)
To: Christoph Lameter
Cc: Peter Zijlstra, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
Christoph Lameter wrote:
> On Tue, 18 Jul 2006, Martin J. Bligh wrote:
>
>>>Maybe we need a NR_UNSTABLE that includes pinned pages?
>>
>>The point of what we decided on Sunday was that we want to count the
>>pages that we KNOW are easy to free. So all of these should be
>>taken out of the count before we take it.
>
>
> Unmapped clean pages are easily freeable and do not have these issues.
> Could we just use that for now? Otherwise we have to add counters for
> the categories that we do not yet track and take them out of the count.
Yup, I think that covers everything.
M.
* Re: [PATCH] mm: inactive-clean list
2006-07-17 20:24 [PATCH] mm: inactive-clean list Peter Zijlstra
2006-07-18 3:37 ` Christoph Lameter
@ 2006-07-23 5:50 ` Rik van Riel
2006-07-24 18:11 ` Christoph Lameter
2006-07-26 11:00 ` Martin Schwidefsky
1 sibling, 2 replies; 27+ messages in thread
From: Rik van Riel @ 2006-07-23 5:50 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
Peter Zijlstra wrote:
> This patch implements the inactive_clean list spoken of during the VM summit.
> The LRU tail pages will be unmapped and ready to free, but not freed.
> This gives reclaim an extra chance.
This patch makes it possible to implement Martin Schwidefsky's
hypervisor-based fast page reclaiming for architectures without
millicode - i.e. Xen, UML and all other non-s390 architectures.
That could be a big help in heavily loaded virtualized environments.
The fact that it helps prevent the iSCSI memory deadlock is a
huge bonus too, of course :)
--
The answer is 42. What is *your* question?
* Re: [PATCH] mm: inactive-clean list
2006-07-23 5:50 ` Rik van Riel
@ 2006-07-24 18:11 ` Christoph Lameter
2006-07-24 19:00 ` Rik van Riel
2006-07-26 11:00 ` Martin Schwidefsky
1 sibling, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2006-07-24 18:11 UTC (permalink / raw)
To: Rik van Riel
Cc: Peter Zijlstra, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On Sun, 23 Jul 2006, Rik van Riel wrote:
> This patch makes it possible to implement Martin Schwidefsky's
> hypervisor-based fast page reclaiming for architectures without
> millicode - i.e. Xen, UML and all other non-s390 architectures.
>
> That could be a big help in heavily loaded virtualized environments.
>
> The fact that it helps prevent the iSCSI memory deadlock is a
> huge bonus too, of course :)
I think there may be a way with fewer changes to the way the VM functions
to get there:
Add NR_FILE_MAPPED_DIRTY and functions to determine easily reclaimable pages.
This patch adds a new counter, NR_FILE_MAPPED_DIRTY, which tracks the number
of dirty mapped file pages. With such a counter we can determine
the number of unmapped clean file-backed pages and possibly
use that knowledge to enforce limits in the VM.
Add some new functions:
global_dirty() = Amount of dirty pages in the system as a whole
zone_dirty(zone) = Amount of dirty pages in a specific zone
global_easily_reclaimable() = Amount of clean unmapped pages
(for now. We may want to add other categories later
if we can track more page categories).
zone_easily_reclaimable(zone) = Amount of clean unmapped pages in a zone.
I plan to add later:
global_reclaimable()/zone_reclaimable(zone)
This would be the number of pages that are reclaimable at all regardless
of the effort to be expended:
So this would be
NR_MAPPED + NR_ANON - NR_UNSTABLE_NFS - <mlocked pages> - <pinned pages>
Patch under development. This is just the basis for a discussion.
Other patches could add /proc limits after this one to guarantee
that a certain number of pages is easily reclaimable and so on.
Index: linux-2.6.18-rc1/drivers/base/node.c
===================================================================
--- linux-2.6.18-rc1.orig/drivers/base/node.c 2006-07-05 21:09:49.000000000 -0700
+++ linux-2.6.18-rc1/drivers/base/node.c 2006-07-24 10:54:41.667670873 -0700
@@ -58,10 +58,11 @@ static ssize_t node_read_meminfo(struct
"Node %d HighFree: %8lu kB\n"
"Node %d LowTotal: %8lu kB\n"
"Node %d LowFree: %8lu kB\n"
- "Node %d Dirty: %8lu kB\n"
"Node %d Writeback: %8lu kB\n"
"Node %d FilePages: %8lu kB\n"
+ "Node %d Dirty: %8lu kB\n"
"Node %d Mapped: %8lu kB\n"
+ "Node %d DirtyMapped: %8lu kB\n"
"Node %d AnonPages: %8lu kB\n"
"Node %d PageTables: %8lu kB\n"
"Node %d NFS Unstable: %8lu kB\n"
@@ -76,10 +77,11 @@ static ssize_t node_read_meminfo(struct
nid, K(i.freehigh),
nid, K(i.totalram - i.totalhigh),
nid, K(i.freeram - i.freehigh),
- nid, K(node_page_state(nid, NR_FILE_DIRTY)),
nid, K(node_page_state(nid, NR_WRITEBACK)),
nid, K(node_page_state(nid, NR_FILE_PAGES)),
+ nid, K(node_page_state(nid, NR_FILE_DIRTY)),
nid, K(node_page_state(nid, NR_FILE_MAPPED)),
+ nid, K(node_page_state(nid, NR_FILE_MAPPED_DIRTY)),
nid, K(node_page_state(nid, NR_ANON_PAGES)),
nid, K(node_page_state(nid, NR_PAGETABLE)),
nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
Index: linux-2.6.18-rc1/mm/vmstat.c
===================================================================
--- linux-2.6.18-rc1.orig/mm/vmstat.c 2006-07-05 21:09:49.000000000 -0700
+++ linux-2.6.18-rc1/mm/vmstat.c 2006-07-24 10:47:12.662278615 -0700
@@ -389,6 +389,7 @@ static char *vmstat_text[] = {
"nr_slab",
"nr_page_table_pages",
"nr_dirty",
+ "nr_mapped_dirty",
"nr_writeback",
"nr_unstable",
"nr_bounce",
Index: linux-2.6.18-rc1/fs/proc/proc_misc.c
===================================================================
--- linux-2.6.18-rc1.orig/fs/proc/proc_misc.c 2006-07-05 21:09:49.000000000 -0700
+++ linux-2.6.18-rc1/fs/proc/proc_misc.c 2006-07-24 10:55:12.832732717 -0700
@@ -167,6 +167,7 @@ static int meminfo_read_proc(char *page,
"Writeback: %8lu kB\n"
"AnonPages: %8lu kB\n"
"Mapped: %8lu kB\n"
+ "DirtyMapped: %8lu kB\n"
"Slab: %8lu kB\n"
"PageTables: %8lu kB\n"
"NFS Unstable: %8lu kB\n"
@@ -193,6 +194,7 @@ static int meminfo_read_proc(char *page,
K(global_page_state(NR_WRITEBACK)),
K(global_page_state(NR_ANON_PAGES)),
K(global_page_state(NR_FILE_MAPPED)),
+ K(global_page_state(NR_FILE_MAPPED_DIRTY)),
K(global_page_state(NR_SLAB)),
K(global_page_state(NR_PAGETABLE)),
K(global_page_state(NR_UNSTABLE_NFS)),
Index: linux-2.6.18-rc1/include/linux/vmstat.h
===================================================================
--- linux-2.6.18-rc1.orig/include/linux/vmstat.h 2006-07-05 21:09:49.000000000 -0700
+++ linux-2.6.18-rc1/include/linux/vmstat.h 2006-07-24 10:56:44.912966808 -0700
@@ -212,4 +212,61 @@ static inline void refresh_cpu_vm_stats(
static inline void refresh_vm_stats(void) { }
#endif
+static inline void inc_zone_page_dirty(struct page *page)
+{
+ inc_zone_page_state(page, NR_FILE_DIRTY);
+ if (page_mapped(page))
+ inc_zone_page_state(page, NR_FILE_MAPPED_DIRTY);
+}
+
+static inline void dec_zone_page_dirty(struct page *page)
+{
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ if (page_mapped(page))
+ dec_zone_page_state(page, NR_FILE_MAPPED_DIRTY);
+}
+
+static inline void __inc_zone_page_dirty(struct page *page)
+{
+ __inc_zone_page_state(page, NR_FILE_DIRTY);
+ if (page_mapped(page))
+ __inc_zone_page_state(page, NR_FILE_MAPPED_DIRTY);
+}
+
+static inline void __dec_zone_page_dirty(struct page *page)
+{
+ __dec_zone_page_state(page, NR_FILE_DIRTY);
+ if (page_mapped(page))
+ __dec_zone_page_state(page, NR_FILE_MAPPED_DIRTY);
+}
+
+static inline unsigned long global_dirty(void)
+{
+ return global_page_state(NR_UNSTABLE_NFS) +
+ global_page_state(NR_FILE_DIRTY);
+}
+
+static inline unsigned long zone_dirty(struct zone *z)
+{
+ return zone_page_state(z, NR_UNSTABLE_NFS) +
+ zone_page_state(z, NR_FILE_DIRTY);
+}
+
+static inline unsigned long global_easily_reclaimable(void)
+{
+ return (global_page_state(NR_FILE_PAGES) -
+ global_page_state(NR_FILE_MAPPED))
+ /* Unmapped */ - /* Unmapped dirty */
+ (global_page_state(NR_FILE_DIRTY) -
+ global_page_state(NR_FILE_MAPPED_DIRTY));
+}
+
+static inline unsigned long zone_easily_reclaimable(struct zone *z)
+{
+ return (zone_page_state(z, NR_FILE_PAGES) -
+ zone_page_state(z, NR_FILE_MAPPED))
+ /* Unmapped */ - /* Unmapped dirty */
+ (zone_page_state(z, NR_FILE_DIRTY) -
+ zone_page_state(z, NR_FILE_MAPPED_DIRTY));
+}
#endif /* _LINUX_VMSTAT_H */
Index: linux-2.6.18-rc1/mm/page-writeback.c
===================================================================
--- linux-2.6.18-rc1.orig/mm/page-writeback.c 2006-07-05 21:09:49.000000000 -0700
+++ linux-2.6.18-rc1/mm/page-writeback.c 2006-07-24 10:47:12.666184623 -0700
@@ -191,8 +191,7 @@ static void balance_dirty_pages(struct a
};
get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
+ nr_reclaimable = global_dirty();
if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
dirty_thresh)
break;
@@ -210,8 +209,7 @@ static void balance_dirty_pages(struct a
writeback_inodes(&wbc);
get_dirty_limits(&background_thresh,
&dirty_thresh, mapping);
- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
+ nr_reclaimable = global_dirty();
if (nr_reclaimable +
global_page_state(NR_WRITEBACK)
<= dirty_thresh)
@@ -328,9 +326,7 @@ static void background_writeout(unsigned
long dirty_thresh;
get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) < background_thresh
- && min_pages <= 0)
+ if (global_dirty() < background_thresh && min_pages <= 0)
break;
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
@@ -354,8 +350,7 @@ static void background_writeout(unsigned
int wakeup_pdflush(long nr_pages)
{
if (nr_pages == 0)
- nr_pages = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
+ nr_pages = global_dirty();
return pdflush_operation(background_writeout, nr_pages);
}
@@ -401,8 +396,7 @@ static void wb_kupdate(unsigned long arg
oldest_jif = jiffies - dirty_expire_interval;
start_jif = jiffies;
next_jif = start_jif + dirty_writeback_interval;
- nr_to_write = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) +
+ nr_to_write = global_dirty() +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
while (nr_to_write > 0) {
wbc.encountered_congestion = 0;
@@ -624,8 +618,7 @@ int __set_page_dirty_nobuffers(struct pa
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
if (mapping_cap_account_dirty(mapping))
- __inc_zone_page_state(page,
- NR_FILE_DIRTY);
+ __inc_zone_page_dirty(page);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
@@ -713,7 +706,7 @@ int test_clear_page_dirty(struct page *p
page_index(page),
PAGECACHE_TAG_DIRTY);
if (mapping_cap_account_dirty(mapping))
- __dec_zone_page_state(page, NR_FILE_DIRTY);
+ __dec_zone_page_dirty(page);
write_unlock_irqrestore(&mapping->tree_lock, flags);
return 1;
}
@@ -745,7 +738,7 @@ int clear_page_dirty_for_io(struct page
if (mapping) {
if (TestClearPageDirty(page)) {
if (mapping_cap_account_dirty(mapping))
- dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_zone_page_dirty(page);
return 1;
}
return 0;
Index: linux-2.6.18-rc1/include/linux/mmzone.h
===================================================================
--- linux-2.6.18-rc1.orig/include/linux/mmzone.h 2006-07-05 21:09:49.000000000 -0700
+++ linux-2.6.18-rc1/include/linux/mmzone.h 2006-07-24 10:47:12.667161125 -0700
@@ -54,6 +54,7 @@ enum zone_stat_item {
NR_SLAB, /* Pages used by slab allocator */
NR_PAGETABLE, /* used for pagetables */
NR_FILE_DIRTY,
+ NR_FILE_MAPPED_DIRTY,
NR_WRITEBACK,
NR_UNSTABLE_NFS, /* NFS unstable pages */
NR_BOUNCE,
Index: linux-2.6.18-rc1/mm/vmscan.c
===================================================================
--- linux-2.6.18-rc1.orig/mm/vmscan.c 2006-07-05 21:09:49.000000000 -0700
+++ linux-2.6.18-rc1/mm/vmscan.c 2006-07-24 10:47:12.668137627 -0700
@@ -1600,8 +1600,7 @@ int zone_reclaim(struct zone *zone, gfp_
* if less than a specified percentage of the zone is used by
* unmapped file backed pages.
*/
- if (zone_page_state(zone, NR_FILE_PAGES) -
- zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio)
+ if (zone_easily_reclaimable(zone) <= zone->min_unmapped_ratio)
return 0;
/*
Index: linux-2.6.18-rc1/fs/fs-writeback.c
===================================================================
--- linux-2.6.18-rc1.orig/fs/fs-writeback.c 2006-07-05 21:09:49.000000000 -0700
+++ linux-2.6.18-rc1/fs/fs-writeback.c 2006-07-24 10:47:12.669114129 -0700
@@ -464,12 +464,9 @@ void sync_inodes_sb(struct super_block *
.range_start = 0,
.range_end = LLONG_MAX,
};
- unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
- unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
- wbc.nr_to_write = nr_dirty + nr_unstable +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
- nr_dirty + nr_unstable;
+ wbc.nr_to_write = 2 * global_dirty() +
+ (inodes_stat.nr_inodes - inodes_stat.nr_unused);
wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
spin_lock(&inode_lock);
sync_sb_inodes(sb, &wbc);
Index: linux-2.6.18-rc1/fs/buffer.c
===================================================================
--- linux-2.6.18-rc1.orig/fs/buffer.c 2006-07-05 21:09:49.000000000 -0700
+++ linux-2.6.18-rc1/fs/buffer.c 2006-07-24 10:47:12.671067133 -0700
@@ -851,7 +851,7 @@ int __set_page_dirty_buffers(struct page
write_lock_irq(&mapping->tree_lock);
if (page->mapping) { /* Race with truncate? */
if (mapping_cap_account_dirty(mapping))
- __inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_zone_page_dirty(page);
radix_tree_tag_set(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
Index: linux-2.6.18-rc1/fs/nfs/write.c
===================================================================
--- linux-2.6.18-rc1.orig/fs/nfs/write.c 2006-07-05 21:09:49.000000000 -0700
+++ linux-2.6.18-rc1/fs/nfs/write.c 2006-07-24 10:47:12.673020137 -0700
@@ -496,7 +496,7 @@ nfs_mark_request_dirty(struct nfs_page *
nfs_list_add_request(req, &nfsi->dirty);
nfsi->ndirty++;
spin_unlock(&nfsi->req_lock);
- inc_zone_page_state(req->wb_page, NR_FILE_DIRTY);
+ inc_zone_page_dirty(req->wb_page);
mark_inode_dirty(inode);
}
Index: linux-2.6.18-rc1/fs/nfs/pagelist.c
===================================================================
--- linux-2.6.18-rc1.orig/fs/nfs/pagelist.c 2006-07-05 21:09:49.000000000 -0700
+++ linux-2.6.18-rc1/fs/nfs/pagelist.c 2006-07-24 10:47:12.673020137 -0700
@@ -314,7 +314,7 @@ nfs_scan_lock_dirty(struct nfs_inode *nf
req->wb_index, NFS_PAGE_TAG_DIRTY);
nfs_list_remove_request(req);
nfs_list_add_request(req, dst);
- dec_zone_page_state(req->wb_page, NR_FILE_DIRTY);
+ dec_zone_page_dirty(req->wb_page);
res++;
}
}
Index: linux-2.6.18-rc1/mm/rmap.c
===================================================================
--- linux-2.6.18-rc1.orig/mm/rmap.c 2006-07-05 21:09:49.000000000 -0700
+++ linux-2.6.18-rc1/mm/rmap.c 2006-07-24 10:48:49.418980973 -0700
@@ -498,8 +498,11 @@ void page_add_new_anon_rmap(struct page
*/
void page_add_file_rmap(struct page *page)
{
- if (atomic_inc_and_test(&page->_mapcount))
+ if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
+ if (PageDirty(page))
+ __inc_zone_page_state(page, NR_FILE_MAPPED_DIRTY);
+ }
}
/**
@@ -520,6 +523,7 @@ void page_remove_rmap(struct page *page)
}
#endif
BUG_ON(page_mapcount(page) < 0);
+
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
@@ -529,10 +533,12 @@ void page_remove_rmap(struct page *page)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
- if (page_test_and_clear_dirty(page))
+ if (page_test_and_clear_dirty(page)) {
+ __dec_zone_page_state(page, NR_FILE_MAPPED_DIRTY);
set_page_dirty(page);
+ }
__dec_zone_page_state(page,
- PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
+ PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
}
}
* Re: [PATCH] mm: inactive-clean list
2006-07-24 18:11 ` Christoph Lameter
@ 2006-07-24 19:00 ` Rik van Riel
2006-07-25 20:25 ` Christoph Lameter
0 siblings, 1 reply; 27+ messages in thread
From: Rik van Riel @ 2006-07-24 19:00 UTC (permalink / raw)
To: Christoph Lameter
Cc: Peter Zijlstra, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
Christoph Lameter wrote:
> On Sun, 23 Jul 2006, Rik van Riel wrote:
>
>> This patch makes it possible to implement Martin Schwidefsky's
>> hypervisor-based fast page reclaiming for architectures without
>> millicode - i.e. Xen, UML and all other non-s390 architectures.
>>
>> That could be a big help in heavily loaded virtualized environments.
>>
>> The fact that it helps prevent the iSCSI memory deadlock is a
>> huge bonus too, of course :)
>
> I think there may be a way with fewer changes to the way the VM functions
> to get there:
That approach probably has way too many state changes going
between the guest OS and the hypervisor...
--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan
* Re: [PATCH] mm: inactive-clean list
2006-07-24 19:00 ` Rik van Riel
@ 2006-07-25 20:25 ` Christoph Lameter
2006-07-25 21:37 ` Rik van Riel
0 siblings, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2006-07-25 20:25 UTC (permalink / raw)
To: Rik van Riel
Cc: Peter Zijlstra, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On Mon, 24 Jul 2006, Rik van Riel wrote:
> > I think there may be a way with fewer changes to the way the VM functions to
> > get there:
>
> That approach probably has way too many state changes going
> between the guest OS and the hypervisor...
An increment of a VM counter causes a state change in the hypervisor?
* Re: [PATCH] mm: inactive-clean list
2006-07-25 20:25 ` Christoph Lameter
@ 2006-07-25 21:37 ` Rik van Riel
2006-07-25 23:03 ` Christoph Lameter
0 siblings, 1 reply; 27+ messages in thread
From: Rik van Riel @ 2006-07-25 21:37 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Peter Zijlstra, linux-mm, linux-kernel
Christoph Lameter wrote:
> On Mon, 24 Jul 2006, Rik van Riel wrote:
>
>>> I think there may be a way with fewer changes to the way the VM functions to
>>> get there:
>> That approach probably has way too many state changes going
>> between the guest OS and the hypervisor...
>
> An increment of a VM counter causes a state change in the hypervisor?
Christoph, please read more than the first 5 words in each
email before replying.
--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan
* Re: [PATCH] mm: inactive-clean list
2006-07-25 21:37 ` Rik van Riel
@ 2006-07-25 23:03 ` Christoph Lameter
2006-07-26 0:02 ` Rik van Riel
0 siblings, 1 reply; 27+ messages in thread
From: Christoph Lameter @ 2006-07-25 23:03 UTC (permalink / raw)
To: Rik van Riel; +Cc: Peter Zijlstra, linux-mm, linux-kernel
On Tue, 25 Jul 2006, Rik van Riel wrote:
> > An increment of a VM counter causes a state change in the hypervisor?
>
> Christoph, please read more than the first 5 words in each
> email before replying.
Well, I read the whole thing before I replied and I could not figure this
one out. Maybe I am too dumb to understand. Could you please explain
yourself in more detail?
I am also not sure why I should be running a hypervisor in the first place,
so I may not be up to date on the whole technology.
* Re: [PATCH] mm: inactive-clean list
2006-07-25 23:03 ` Christoph Lameter
@ 2006-07-26 0:02 ` Rik van Riel
2006-07-26 0:05 ` Christoph Lameter
0 siblings, 1 reply; 27+ messages in thread
From: Rik van Riel @ 2006-07-26 0:02 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Peter Zijlstra, linux-mm, linux-kernel
Christoph Lameter wrote:
> On Tue, 25 Jul 2006, Rik van Riel wrote:
>
>>> An increment of a VM counter causes a state change in the hypervisor?
>> Christoph, please read more than the first 5 words in each
>> email before replying.
>
> Well, I read the whole thing before I replied and I could not figure this
> one out. Maybe I am too dumb to understand. Could you please explain
> yourself in more detail?
Page state transitions can be very expensive in a virtualized
environment, so it would be good if we had fewer transitions.
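To put a rough number on it, here is a minimal userspace sketch of the
cost model (the interface, names, and cycle figures are assumptions for
illustration, not any real hypervisor API):

#include <stdio.h>

/*
 * Minimal sketch of the cost model only; hv_set_page_state() is
 * hypothetical.  The four states follow the guest page hinting
 * scheme discussed in this thread: S(table), U(nused), V(olatile),
 * P(otentially volatile).
 */
enum guest_page_state {
	PG_STABLE,		/* S: host must preserve the contents */
	PG_UNUSED,		/* U: free page, host may discard it */
	PG_VOLATILE,		/* V: clean, a copy exists on backing store */
	PG_POT_VOLATILE,	/* P: volatile as long as it is not dirtied */
};

static unsigned long transitions;

/* Every per-page state change is one guest->host transition. */
static void hv_set_page_state(unsigned long pfn, enum guest_page_state s)
{
	(void)pfn;
	(void)s;
	transitions++;	/* on real hardware: a trap, thousands of cycles */
}

int main(void)
{
	unsigned long pfn;

	/* Marking 1M pages (4GB with 4K pages) volatile, one at a time. */
	for (pfn = 0; pfn < (1UL << 20); pfn++)
		hv_set_page_state(pfn, PG_VOLATILE);
	printf("%lu guest->host transitions\n", transitions);
	return 0;
}

Every page that changes state costs a trap, so schemes that batch the
transitions, or avoid them entirely, win.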
> I am also not sure why I should be running a hypervisor in the first place,
> so I may not be up to date on the whole technology.
You may not, but IMHO it would be good if whatever new VM
things we implement in Linux would at least be virtualization
friendly. Especially if that can be achieved without hurting
native performance...
--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan
* Re: [PATCH] mm: inactive-clean list
2006-07-26 0:02 ` Rik van Riel
@ 2006-07-26 0:05 ` Christoph Lameter
0 siblings, 0 replies; 27+ messages in thread
From: Christoph Lameter @ 2006-07-26 0:05 UTC (permalink / raw)
To: Rik van Riel; +Cc: Peter Zijlstra, linux-mm, linux-kernel
On Tue, 25 Jul 2006, Rik van Riel wrote:
> > Well, I read the whole thing before I replied and I could not figure this
> > one out. Maybe I am too dumb to understand. Could you please explain
> > yourself in more detail?
>
> Page state transitions can be very expensive in a virtualized
> environment, so it would be good if we had fewer transitions.
So the hypervisor indeed tracks each individual page state? Note that I do
not propose to change the page state, only to increment a counter of page
states. I am a bit confused about how not touching a page can cause page
state transitions. But then I do not know much about hypervisors. What
magic is going on in the background that could enable the hypervisor to
track counter increments?
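To be clear about what I am proposing, a sketch (all names are
illustrative; this is not an existing kernel interface):

/*
 * The guest bumps an ordinary counter in its own RAM: no trap, no
 * hypervisor involvement on the fast path.  The host, which can read
 * guest memory anyway, samples the counter whenever it decides to
 * reclaim.
 */
struct zone_hint {
	unsigned long nr_clean;	/* pages droppable without any I/O */
};

static struct zone_hint zone_hint;	/* lives in plain guest memory */

static inline void note_page_clean(void)
{
	zone_hint.nr_clean++;	/* a plain store, no state change */
}

static inline void note_page_dirty(void)
{
	zone_hint.nr_clean--;
}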
* Re: [PATCH] mm: inactive-clean list
2006-07-23 5:50 ` Rik van Riel
2006-07-24 18:11 ` Christoph Lameter
@ 2006-07-26 11:00 ` Martin Schwidefsky
2006-07-26 11:11 ` Peter Zijlstra
1 sibling, 1 reply; 27+ messages in thread
From: Martin Schwidefsky @ 2006-07-26 11:00 UTC (permalink / raw)
To: Rik van Riel
Cc: Peter Zijlstra, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On 7/23/06, Rik van Riel <riel@redhat.com> wrote:
> Peter Zijlstra wrote:
> > This patch implements the inactive_clean list spoken of during the VM summit.
> > The LRU tail pages will be unmapped and ready to free, but not freed.
> > This gives reclaim an extra chance.
>
> This patch makes it possible to implement Martin Schwidefsky's
> hypervisor-based fast page reclaiming for architectures without
> millicode - i.e. Xen, UML and all other non-s390 architectures.
Hmm, I wonder how the inactive_clean list helps with the fast host
reclaim scheme, particularly since the memory pressure that triggers the
reclaim is in the host, not in the guest. So all pages might be on the
active list while the host still wants to be able to discard pages.
--
blue skies,
Martin
* Re: [PATCH] mm: inactive-clean list
2006-07-26 11:00 ` Martin Schwidefsky
@ 2006-07-26 11:11 ` Peter Zijlstra
2006-07-26 13:04 ` Martin Schwidefsky
0 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2006-07-26 11:11 UTC (permalink / raw)
To: Martin Schwidefsky
Cc: Rik van Riel, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On Wed, 2006-07-26 at 13:00 +0200, Martin Schwidefsky wrote:
> On 7/23/06, Rik van Riel <riel@redhat.com> wrote:
> > Peter Zijlstra wrote:
> > > This patch implements the inactive_clean list spoken of during the VM summit.
> > > The LRU tail pages will be unmapped and ready to free, but not freed.
> > > This gives reclaim an extra chance.
> >
> > This patch makes it possible to implement Martin Schwidefsky's
> > hypervisor-based fast page reclaiming for architectures without
> > millicode - i.e. Xen, UML and all other non-s390 architectures.
>
> Hmm, I wonder how the inactive_clean list helps with the fast host
> reclaim scheme, particularly since the memory pressure that triggers the
> reclaim is in the host, not in the guest. So all pages might be on the
> active list while the host still wants to be able to discard pages.
>
I think Rik would want to set all the already unmapped pages to volatile
state in the hypervisor.
These pages can be dropped without loss of information on the guest
system since they are all already on a backing-store, be it regular
files or swap.
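Roughly along these lines; a kernel-flavoured sketch only, reusing the
hypothetical hv_set_page_state()/PG_VOLATILE names from the sketch
earlier in the thread, with zone field names that are guesses at the
patch's structures, not quoted from it:

static void zone_mark_clean_volatile(struct zone *zone)
{
	struct page *page;

	spin_lock_irq(&zone->lru_lock);
	list_for_each_entry(page, &zone->inactive_clean_list, lru)
		/* unmapped and backed by file or swap: safe to discard */
		hv_set_page_state(page_to_pfn(page), PG_VOLATILE);
	spin_unlock_irq(&zone->lru_lock);
}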
* Re: [PATCH] mm: inactive-clean list
2006-07-26 11:11 ` Peter Zijlstra
@ 2006-07-26 13:04 ` Martin Schwidefsky
2006-07-26 14:45 ` Peter Zijlstra
2006-07-26 15:41 ` Rik van Riel
0 siblings, 2 replies; 27+ messages in thread
From: Martin Schwidefsky @ 2006-07-26 13:04 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Rik van Riel, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On 7/26/06, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > Hmm, I wonder how the inactive_clean list helps with the fast host
> > reclaim scheme, particularly since the memory pressure that triggers the
> > reclaim is in the host, not in the guest. So all pages might be on the
> > active list while the host still wants to be able to discard pages.
> >
>
> I think Rik would want to set all the already unmapped pages to volatile
> state in the hypervisor.
>
> These pages can be dropped without loss of information on the guest
> system since they are all already on a backing-store, be it regular
> files or swap.
I guessed that as well. It isn't good enough. Consider a guest with a
large (virtual) memory size and a host with a small physical memory
size. The guest will never put any page on the inactive_clean list
because it does not have memory pressure. vmscan will never run. The
host wants to reclaim memory from the guest, but since the
inactive_clean list is empty it will find only stable pages.
--
blue skies,
Martin
* Re: [PATCH] mm: inactive-clean list
2006-07-26 13:04 ` Martin Schwidefsky
@ 2006-07-26 14:45 ` Peter Zijlstra
2006-07-27 11:16 ` Martin Schwidefsky
2006-07-26 15:41 ` Rik van Riel
1 sibling, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2006-07-26 14:45 UTC (permalink / raw)
To: Martin Schwidefsky
Cc: Rik van Riel, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On Wed, 2006-07-26 at 15:04 +0200, Martin Schwidefsky wrote:
> On 7/26/06, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > > Hmm, I wonder how the inactive_clean list helps with the fast host
> > > reclaim scheme, particularly since the memory pressure that triggers the
> > > reclaim is in the host, not in the guest. So all pages might be on the
> > > active list while the host still wants to be able to discard pages.
> > >
> >
> > I think Rik would want to set all the already unmapped pages to volatile
> > state in the hypervisor.
> >
> > These pages can be dropped without loss of information on the guest
> > system since they are all already on a backing-store, be it regular
> > files or swap.
>
> I guessed that as well. It isn't good enough. Consider a guest with a
> large (virtual) memory size and a host with a small physical memory
> size. The guest will never put any page on the inactive_clean list
> because it does not have memory pressure. vmscan will never run. The
> host wants to reclaim memory from the guest, but since the
> inactive_clean list is empty it will find only stable pages.
>
Wouldn't we typically have all free pages > min_free in state U?
And wouldn't all R/O mapped pages be V, and all R/W mapped pages and
unmapped page-cache pages P, as you state in your paper?
This patch would just increase the number of V pages with the tail end
of the guest LRU, which are typically the pages you would want to evict
(perhaps we could even add a 5th guest state to indicate that these V
pages are preferable over the others?).
But in the gross over-commit scenario you outline, won't the host OS
have to swap out S pages eventually?
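The 5th state mentioned above would be a one-line extension of the
state sketch earlier in the thread (purely hypothetical, of course):

/*
 * Hypothetical 5th state on top of S/U/V/P: volatile *and* sitting at
 * the tail of the guest LRU, i.e. the pages the host should discard
 * first when it is the host that is short on memory.
 */
enum {
	PG_VOLATILE_PREFERRED = PG_POT_VOLATILE + 1,
};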
* Re: [PATCH] mm: inactive-clean list
2006-07-26 13:04 ` Martin Schwidefsky
2006-07-26 14:45 ` Peter Zijlstra
@ 2006-07-26 15:41 ` Rik van Riel
1 sibling, 0 replies; 27+ messages in thread
From: Rik van Riel @ 2006-07-26 15:41 UTC (permalink / raw)
To: Martin Schwidefsky
Cc: Peter Zijlstra, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
Martin Schwidefsky wrote:
> On 7/26/06, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>> I think Rik would want to set all the already unmapped pages to volatile
>> state in the hypervisor.
> I guessed that as well. It isn't good enough. Consider a guest with a
> large (virtual) memory size and a host with a small physical memory
> size. The guest will never put any page on the inactive_clean list
> because it does not have memory pressure.
Well, the management software running on top of everything
should tweak the inactive_clean targets in the various guests
so the total amount of volatile memory is large enough...
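For example, assuming the patch exposes the target as a sysctl (the
knob name below is a guess for illustration, not taken from the patch
text), the management layer could simply do:

#include <stdio.h>

/*
 * Hypothetical example: management software raising a guest's
 * inactive_clean target.  The sysctl name is a guess, not
 * confirmed by the patch.
 */
int main(void)
{
	FILE *f = fopen("/proc/sys/vm/inactive_clean_percent", "w");

	if (!f) {
		perror("inactive_clean_percent");
		return 1;
	}
	fprintf(f, "20\n");	/* aim for ~20% of pages clean and volatile */
	fclose(f);
	return 0;
}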
--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan
* Re: [PATCH] mm: inactive-clean list
2006-07-26 14:45 ` Peter Zijlstra
@ 2006-07-27 11:16 ` Martin Schwidefsky
0 siblings, 0 replies; 27+ messages in thread
From: Martin Schwidefsky @ 2006-07-27 11:16 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Rik van Riel, linux-mm, Linus Torvalds, Andrew Morton, linux-kernel
On 7/26/06, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> Wouldn't we typically have all free pages > min_free in state U?
> And wouldn't all R/O mapped pages be V, and all R/W mapped pages and
> unmapped page-cache pages P, as you state in your paper?
Ahh, ok, I misunderstood. You want to keep the state changes for clean
page cache pages; I assumed that you only wanted to make pages volatile
once they get onto the inactive_clean list and leave them stable while
they are on one of the other two lists.
> This patch would just increase the number of V pages with the tail end
> of the guest LRU, which are typically the pages you would want to evict
> (perhaps we could even add a 5th guest state to indicate that these V
> pages are preferable over the others?)
Yes, that would help for architectures that cannot implement the
potential-volatile state.
> But in the gross over-commit scenario you outline, won't the host OS
> have to swap out S pages eventually?
My point was that you really have to distinguish between host memory
pressure and guest memory pressure.
--
blue skies,
Martin
Thread overview: 27+ messages
2006-07-17 20:24 [PATCH] mm: inactive-clean list Peter Zijlstra
2006-07-18 3:37 ` Christoph Lameter
2006-07-18 12:16 ` Peter Zijlstra
2006-07-18 13:29 ` Christoph Lameter
2006-07-18 13:55 ` Martin J. Bligh
2006-07-18 13:59 ` Christoph Lameter
2006-07-18 15:12 ` Martin J. Bligh
2006-07-18 15:57 ` Christoph Lameter
2006-07-18 16:23 ` Martin J. Bligh
2006-07-18 14:03 ` Christoph Lameter
2006-07-18 14:25 ` Andrew Morton
2006-07-18 14:45 ` Christoph Lameter
2006-07-18 15:59 ` KAMEZAWA Hiroyuki
2006-07-23 5:50 ` Rik van Riel
2006-07-24 18:11 ` Christoph Lameter
2006-07-24 19:00 ` Rik van Riel
2006-07-25 20:25 ` Christoph Lameter
2006-07-25 21:37 ` Rik van Riel
2006-07-25 23:03 ` Christoph Lameter
2006-07-26 0:02 ` Rik van Riel
2006-07-26 0:05 ` Christoph Lameter
2006-07-26 11:00 ` Martin Schwidefsky
2006-07-26 11:11 ` Peter Zijlstra
2006-07-26 13:04 ` Martin Schwidefsky
2006-07-26 14:45 ` Peter Zijlstra
2006-07-27 11:16 ` Martin Schwidefsky
2006-07-26 15:41 ` Rik van Riel