[PATCH] page ageing with lists
From: Roger Larsson @ 2000-07-13 1:15 UTC (permalink / raw)
To: linux-kernel, linux-mm
[-- Attachment #1: Type: text/plain, Size: 1183 bytes --]
Hi,
This is a patch that adds page ageing for 2.4.0-test4-pre1.
Performance (unoptimized filesystem):
* streamed write is as good as 2.2.14
* streamed copy is 3/4 of 2.2.14
* streamed read is close to 2.2.14
Potential problems:
* Got a BUG at mm.h:321 while running this patch; possibly
unrelated? (more about this in another email)
Features:
* does NOT add any field to the page structure.
* round-robin lists are used to simulate ageing (see the sketch after this list).
* referenced pages are moved 2 steps forward.
* multiply-used pages are moved 4 steps forward.
* non-freeable (try-again) pages are moved 1 step forward.
* new pages are inserted 3 steps forward.
* no pages are moved backward or onto the currently scanned list.
and new in this release:
* pages failing the zone test are moved to a per-zone list.
These lists are searched first!
* removed one unnecessary cause of SetPageReferenced
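To make the list mechanics concrete, here is a minimal user-space sketch (an illustration only; NO_LISTS, struct fake_page and dispose_page() are invented names, not identifiers from the patch, and only the step counts and the power-of-two list count mirror the description above). A page's age is encoded purely by which round-robin list it sits on, and disposing a page simply re-links it a fixed number of lists ahead of the current scan position:

/* Illustrative sketch only, not part of the patch. */
#include <stdio.h>

#define NO_LISTS        8   /* power of two, greater than the biggest offset */
#define OFF_TRYAGAIN    1   /* not freeable right now: 1 step forward        */
#define OFF_REFERENCED  2   /* referenced pages: 2 steps forward             */
#define OFF_NEW         3   /* new pages enter 3 steps forward               */
#define OFF_MULTIUSE    4   /* multiply used pages: 4 steps forward          */

struct fake_page { int id; struct fake_page *next; };

static struct fake_page *lists[NO_LISTS];   /* heads of the age lists */
static unsigned scan;                       /* list currently scanned */

/* re-link a page onto the list 'offset' steps ahead of the scan point */
static void dispose_page(struct fake_page *p, unsigned offset)
{
        unsigned ix = (scan + offset) % NO_LISTS;
        p->next = lists[ix];
        lists[ix] = p;
}

int main(void)
{
        struct fake_page a = { 1, NULL }, b = { 2, NULL };

        dispose_page(&a, OFF_NEW);          /* a newly added page      */
        dispose_page(&b, OFF_REFERENCED);   /* a page found referenced */

        /* when the current list runs empty the scan moves on; pages
         * placed further ahead are reached later, so they age without
         * any per-page counter */
        while (lists[scan] == NULL)
                scan = (scan + 1) % NO_LISTS;

        printf("next page considered: id %d on list %u\n",
               lists[scan]->id, scan);
        return 0;
}

Because NO_LISTS is a power of two, the modulo can be replaced by a bitwise AND with NO_LISTS - 1; that is the same optimization the patch below leaves as a TODO.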
Future work:
* trim offsets / sizes / priorities
* remove code that unnecessarily sets pages as referenced (Riel?)
* split pagemap_lru_lock (if wanted on SMP)
* move pages of zones under pressure less far forward...
* ...
additional idea:
* periodically check pages for the referenced bit and move them forward (rough sketch below).
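A rough, self-contained sketch of that idea (hypothetical code, not from the patch; struct pg and periodic_age_pass() are invented names): walk the age lists, test-and-clear a referenced flag, and re-link referenced pages a couple of lists ahead of where they were found:

/* Hypothetical sketch, not code from the patch. */
#include <stdbool.h>
#include <stdio.h>

#define N_LISTS         8
#define FWD_REFERENCED  2

struct pg { int id; bool referenced; struct pg *next; };

static struct pg *age_list[N_LISTS];

static void periodic_age_pass(void)
{
        for (unsigned i = 0; i < N_LISTS; i++) {
                struct pg **pp = &age_list[i];
                while (*pp) {
                        struct pg *p = *pp;
                        if (p->referenced) {
                                unsigned to = (i + FWD_REFERENCED) % N_LISTS;
                                p->referenced = false;   /* test-and-clear */
                                *pp = p->next;           /* unlink         */
                                p->next = age_list[to];  /* re-link ahead  */
                                age_list[to] = p;
                        } else {
                                pp = &p->next;           /* keep in place  */
                        }
                }
        }
}

int main(void)
{
        struct pg a = { 1, true, NULL };

        age_list[0] = &a;        /* sits on the list about to be scanned */
        periodic_age_pass();     /* referenced, so it gets moved forward */

        printf("list 0 empty: %d, page now on list 2: %d\n",
               age_list[0] == NULL, age_list[2] == &a);
        return 0;
}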
/RogerL
--
Home page:
http://www.norran.net/nra02596/
[-- Attachment #2: patch-2.4.0-test4-pre1-filemap.age+zone.3 --]
[-- Type: text/plain, Size: 11243 bytes --]
--- linux/mm/page_alloc.c.orig Tue Jul 11 23:50:58 2000
+++ linux/mm/page_alloc.c Wed Jul 12 17:37:06 2000
@@ -516,7 +516,7 @@ void __init free_area_init_core(int nid,
freepages.min += i;
freepages.low += i * 2;
freepages.high += i * 3;
- memlist_init(&lru_cache);
+ init_lru_cache();
/*
* Some architectures (with lots of mem and discontinous memory
@@ -562,6 +562,11 @@ void __init free_area_init_core(int nid,
zone->lock = SPIN_LOCK_UNLOCKED;
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
+
+ memlist_init(&zone->lru_cache[0]);
+ memlist_init(&zone->lru_cache[1]);
+ zone->lru_insert = 0;
+
if (!size)
continue;
--- linux/mm/filemap.c.orig Tue Jul 11 23:50:27 2000
+++ linux/mm/filemap.c Thu Jul 13 02:12:08 2000
@@ -44,7 +44,26 @@
atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;
-struct list_head lru_cache;
+
+/* Note: optimization possibility - split pagemap_lru_lock!
+ * iff LRU_INSERT_OFFSET != 0 and != all other offsets */
+#define NO_LRU_CACHES 8 /* power of two, greater than biggest offset */
+#define LRU_SCAN_INIT 0
+#define LRU_ZONE_OFFSET 0
+#define LRU_INSERT_OFFSET 3
+#define LRU_LOCKED_OFFSET 1 /* shouldn't be locked a long time */
+#define LRU_MULTIUSE_OFFSET 4 /* rare, what to do but wait. [shorter=1?] */
+#define LRU_BUFFER_OFFSET 1 /* freeing - will take some time */
+#define LRU_MAPPED_OFFSET 0 /* rare, only last test rejected freeing */
+#define LRU_REFERENCED_OFFSET 2
+#define MAX_LRU_OFFSET 4
+
+static struct list_head lru_caches[NO_LRU_CACHES];
+static unsigned lru_scan = LRU_SCAN_INIT;
+struct list_head *lru_cache_insert =
+ &lru_caches[(LRU_SCAN_INIT + LRU_INSERT_OFFSET) % NO_LRU_CACHES];
+static int lru_histogram[MAX_LRU_OFFSET + 1];
+static int lru_histogram_total;
static spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;
/*
@@ -245,26 +264,54 @@ repeat:
spin_unlock(&pagecache_lock);
}
+static void reset_lru_histogram(void)
+{
+ int ix;
+ for (ix = 0; ix <= MAX_LRU_OFFSET; ix++)
+ lru_histogram[ix] = 0;
+ lru_histogram_total = 0;
+}
+
+static void print_lru_histogram(void)
+{
+ int ix;
+ printk( "lru_histogram_total = %5d\n", lru_histogram_total);
+ for (ix = 0; ix <= MAX_LRU_OFFSET; ix++)
+ printk("lru_histogram[%2d] = %5d\n", ix, lru_histogram[ix]);
+}
+
+void init_lru_cache(void)
+{
+ int ix;
+
+ for (ix = 0; ix < NO_LRU_CACHES; ix++)
+ INIT_LIST_HEAD(&lru_caches[ix]);
+}
+
/*
- * nr_dirty represents the number of dirty pages that we will write async
- * before doing sync writes. We can only do sync writes if we can
- * wait for IO (__GFP_IO set).
+ * Return: true if successful
+ * Precond: lock held: pagemap_lru_lock
+ * Note: releases the lock regularly
+ * Note: *lru_cache_scan_ref may change when lock is released
*/
-int shrink_mmap(int priority, int gfp_mask)
+int shrink_mmap_specific(
+ struct list_head **lru_cache_scan_ref,
+ int gfp_mask,
+ int *count_ref,
+ int *nr_dirty)
{
- int ret = 0, count, nr_dirty;
struct list_head * page_lru;
struct page * page = NULL;
-
- count = nr_lru_pages / (priority + 1);
- nr_dirty = priority;
+ int count = *count_ref;
- /* we need pagemap_lru_lock for list_del() ... subtle code below */
- spin_lock(&pagemap_lru_lock);
- while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
+ while (count > 0 &&
+ (page_lru = (*lru_cache_scan_ref)->prev) != *lru_cache_scan_ref) {
+ unsigned dispose_offset;
page = list_entry(page_lru, struct page, lru);
list_del(page_lru);
+ lru_histogram_total++;
+ dispose_offset = LRU_REFERENCED_OFFSET;
if (PageTestandClearReferenced(page))
goto dispose_continue;
@@ -273,9 +320,11 @@ int shrink_mmap(int priority, int gfp_ma
* Avoid unscalable SMP locking for pages we can
* immediate tell are untouchable..
*/
+ dispose_offset = LRU_MULTIUSE_OFFSET;
if (!page->buffers && page_count(page) > 1)
goto dispose_continue;
+ dispose_offset = LRU_LOCKED_OFFSET;
if (TryLockPage(page))
goto dispose_continue;
@@ -293,8 +342,10 @@ int shrink_mmap(int priority, int gfp_ma
* Is it a buffer page? Try to clean it up regardless
* of zone - it's old.
*/
+ dispose_offset = LRU_BUFFER_OFFSET;
if (page->buffers) {
- int wait = ((gfp_mask & __GFP_IO) && (nr_dirty-- < 0));
+ int wait = ((gfp_mask & __GFP_IO) &&
+ ((*nr_dirty)-- < 0));
if (!try_to_free_buffers(page, wait))
goto unlock_continue;
/* page was locked, inode can't go away under us */
@@ -314,6 +365,7 @@ int shrink_mmap(int priority, int gfp_ma
* We can't free pages unless there's just one user
* (count == 2 because we added one ourselves above).
*/
+ dispose_offset = LRU_MULTIUSE_OFFSET;
if (page_count(page) != 2)
goto cache_unlock_continue;
@@ -332,9 +384,11 @@ int shrink_mmap(int priority, int gfp_ma
* Page is from a zone we don't care about.
* Don't drop page cache entries in vain.
*/
+ dispose_offset = LRU_ZONE_OFFSET;
if (page->zone->free_pages > page->zone->pages_high)
goto cache_unlock_continue;
+ dispose_offset = LRU_MAPPED_OFFSET;
/* is it a page-cache page? */
if (page->mapping) {
if (!PageDirty(page) && !pgcache_under_min()) {
@@ -354,21 +408,164 @@ unlock_continue:
UnlockPage(page);
page_cache_release(page);
dispose_continue:
- list_add(page_lru, &lru_cache);
+ lru_histogram[dispose_offset]++;
+
+ if (dispose_offset > 0)
+ {
+ /* TODO CHECK OPTIMIZATION
+ * should become
+ * (lru_scan + dispose_offset) & (NO_LRU_CACHES - 1)
+ * since both lru_scan and dispose_offset are unsigned
+ * and NO_LRU_CACHES is a power of two.
+ */
+ unsigned dispose;
+ dispose = (lru_scan + dispose_offset) % NO_LRU_CACHES;
+
+ list_add(page_lru,
+ &lru_caches[dispose]);
+ }
+ else {
+ /* dispose to zone lru */
+ list_add(page_lru,
+ &page->zone->lru_cache[page->zone->lru_insert]);
+ }
}
- goto out;
+ *count_ref = count;
+ return 0;
+
+ /*
+ * Successful returns follow
+ */
made_inode_progress:
page_cache_release(page);
made_buffer_progress:
UnlockPage(page);
page_cache_release(page);
- ret = 1;
spin_lock(&pagemap_lru_lock);
/* nr_lru_pages needs the spinlock */
nr_lru_pages--;
-out:
+ *count_ref = count;
+ return 1;
+}
+
+
+
+static inline int shrink_mmap_zone(
+ int gfp_mask,
+ int *count_ref,
+ int *nr_dirty_ref)
+{
+ int ret = 1;
+
+ /*
+ * alternative... from page_alloc.c
+ *
+ * for (i = 0; i < NUMNODES; i++)
+ * for (zone = NODE_DATA(i)->node_zones;
+ * zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES;
+ * zone++)
+ */
+
+ pg_data_t *pgdat = pgdat_list;
+
+ do {
+ int i;
+ for(i = 0; i < MAX_NR_ZONES; i++) {
+ zone_t *zone = pgdat->node_zones + i;
+
+ /*
+ * do stuff, if from a zone we care about
+ */
+ if (zone->zone_wake_kswapd) {
+ struct list_head *lru_zone_cache_scan;
+ int retries = 2;
+
+ while (--retries) {
+ int success;
+
+ /* if the non-insert zone lru is empty, swap it with the insert list */
+ int lru_scan = !zone->lru_insert;
+ if (list_empty(&zone->lru_cache[lru_scan])) {
+ /* swap insert and scan */
+ zone->lru_insert = lru_scan;
+ lru_scan = !lru_scan;
+ }
+ lru_zone_cache_scan = &zone->lru_cache[lru_scan];
+
+ success = shrink_mmap_specific(&lru_zone_cache_scan,
+ gfp_mask,
+ count_ref,
+ nr_dirty_ref);
+
+ if (success)
+ return 1;
+ }
+
+ ret = 0;
+ }
+ else if (zone->free_pages < zone->pages_high &&
+ list_empty(&zone->lru_cache[0]) &&
+ list_empty(&zone->lru_cache[1])) {
+ /* Some pressure (same test as in shrink_mmap_specific)
+ * and there are no pages on the zone lru lists */
+ ret = 0;
+ }
+
+ }
+ pgdat = pgdat->node_next;
+ } while (pgdat);
+
+ return ret;
+}
+
+
+static inline int shrink_mmap_age(
+ int gfp_mask,
+ int *count_ref,
+ int *nr_dirty_ref,
+ int *success_ref)
+{
+ static struct list_head *lru_cache_scan = &lru_caches[LRU_SCAN_INIT];
+
+ if (list_empty(lru_cache_scan)) {
+ print_lru_histogram();
+ reset_lru_histogram();
+
+ lru_scan = (lru_scan + 1) % NO_LRU_CACHES;
+
+ lru_cache_scan = &lru_caches[lru_scan];
+ lru_cache_insert =
+ &lru_caches[(lru_scan + LRU_INSERT_OFFSET) % NO_LRU_CACHES];
+ }
+
+ return shrink_mmap_specific(&lru_cache_scan, gfp_mask,
+ count_ref, nr_dirty_ref);
+}
+
+/*
+ * nr_dirty represents the number of dirty pages that we will write async
+ * before doing sync writes. We can only do sync writes if we can
+ * wait for IO (__GFP_IO set).
+ */
+int shrink_mmap(int priority, int gfp_mask)
+{
+ int ret = 0, count, nr_dirty;
+
+
+ count = nr_lru_pages / (priority + 1);
+ nr_dirty = priority;
+
+ /* we need pagemap_lru_lock for subroutines */
+ spin_lock(&pagemap_lru_lock);
+
+ ret = shrink_mmap_zone(gfp_mask, &count, &nr_dirty);
+
+ if (!ret) {
+ ret = shrink_mmap_age(gfp_mask, &count, &nr_dirty, &ret);
+ }
+
spin_unlock(&pagemap_lru_lock);
return ret;
@@ -507,7 +704,6 @@ static inline void __add_to_page_cache(s
struct address_space *mapping, unsigned long offset,
struct page **hash)
{
- struct page *alias;
unsigned long flags;
if (PageLocked(page))
@@ -520,9 +716,6 @@ static inline void __add_to_page_cache(s
add_page_to_inode_queue(mapping, page);
__add_page_to_hash_queue(page, hash);
lru_cache_add(page);
- alias = __find_page_nolock(mapping, offset, *hash);
- if (alias != page)
- BUG();
}
void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
--- linux/include/linux/mm.h.orig Tue Jul 11 23:58:33 2000
+++ linux/include/linux/mm.h Wed Jul 12 19:34:10 2000
@@ -15,7 +15,7 @@ extern unsigned long max_mapnr;
extern unsigned long num_physpages;
extern void * high_memory;
extern int page_cluster;
-extern struct list_head lru_cache;
+extern struct list_head *lru_cache_insert;
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -456,6 +456,7 @@ struct zone_t;
/* filemap.c */
extern void remove_inode_page(struct page *);
extern unsigned long page_unuse(struct page *);
+extern void init_lru_cache(void);
extern int shrink_mmap(int, int);
extern void truncate_inode_pages(struct address_space *, loff_t);
--- linux/include/linux/swap.h.orig Tue Jul 11 23:58:51 2000
+++ linux/include/linux/swap.h Wed Jul 12 10:08:03 2000
@@ -166,7 +166,7 @@ extern spinlock_t pagemap_lru_lock;
#define lru_cache_add(page) \
do { \
spin_lock(&pagemap_lru_lock); \
- list_add(&(page)->lru, &lru_cache); \
+ list_add(&(page)->lru, lru_cache_insert); \
nr_lru_pages++; \
spin_unlock(&pagemap_lru_lock); \
} while (0)
--- linux/include/linux/mmzone.h.orig Wed Jul 12 16:24:59 2000
+++ linux/include/linux/mmzone.h Wed Jul 12 17:39:12 2000
@@ -32,6 +32,12 @@ typedef struct zone_struct {
char zone_wake_kswapd;
unsigned long pages_min, pages_low, pages_high;
+ /*
+ * zone lru - really old pages
+ */
+ int lru_insert;
+ struct list_head lru_cache[2];
+
/*
* free areas of different sizes
*/
Re: [PATCH] page ageing with lists
From: Chris Evans @ 2000-07-13 10:34 UTC (permalink / raw)
To: Roger Larsson; +Cc: linux-kernel, linux-mm
On Thu, 13 Jul 2000, Roger Larsson wrote:
> Hi,
>
> This is a patch with page ageing for 2.4.0-test4-pre1.
>
> Performance, unoptimized filesystem:
> * streamed write is as good as 2.2.14
> * streamed copy is 3/4 of 2.2.14
> * streamed read is close to 2.2.14
Has anyone tested 2.4.0-test4-pre4 without any patches?
And shouldn't (in particular) streamed write be faster than 2.2 on account
of the unified buffer cache in 2.3?
Cheers
Chris