From mboxrd@z Thu Jan  1 00:00:00 1970
Subject: PATCH: Improvements in shrink_mmap and kswapd
From: "Juan J. Quintela"
Date: 18 Jun 2000 00:45:52 +0200
Message-ID: 
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Sender: owner-linux-mm@kvack.org
Return-Path: 
To: Alan Cox, lkml, linux-mm@kvack.org, linux-fsdevel@vger.rutgers.edu
List-ID: 

Hi

This patch makes kswapd use fewer resources. It should solve the
"kswapd eats xx% of my CPU" problems. It also appears to improve I/O a
bit here. Could people who are having I/O problems tell me whether this
patch helps? In particular I am interested in knowing that it never
makes things worse. The patch is stable here.

I am finishing the patch for deferred writing of mmapped file pages,
which should solve several other problems.

Reports of success/failure are welcome. Comments are also welcome.

Later, Juan.

This patch implements:

- shrink_mmap() never loops infinitely (it walks each page at most
  once per call).
- the nr_dirty logic is replaced by max_page_launder logic. We start
  at most max_page_launder (100) async writes, and after that point we
  start no more writes for that run of shrink_mmap(). If we did start
  max_page_launder writes, we wait at the end of the function when
  possible (i.e. when gfp_mask allows it).
- it checks that some zone still needs pages before continuing with
  the loop; if no zone needs pages, it stops walking the LRU.
- I took the memory pressure patch from Roger Larson and partially
  reimplemented/extended it.
- kswapd is rewritten in a way similar to Roger Larson's version.
- added the function memory_pressure(), which returns 1 if there is
  memory pressure and 0 if there is none.
- I took Manfred's patch to use the test_and_test_and_clear_bit
  optimization in PageTestandClearReferenced (see the sketch after
  this list).
- added ClearPageDirty(page) to remove_inode_page() to solve the
  ramfs problems.
- added __lru_cache_del() and __lru_cache_add() and used them in
  shrink_mmap().
- cleaned up several bits of cruft in shrink_mmap().
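For readers who want the idea behind that bit optimization in
isolation, here is a minimal user-space sketch (this is NOT the kernel
code: the helper names only mirror the kernel API, and a GCC __sync
builtin stands in for the kernel's bus-locked instruction):

#include <stdio.h>

/* Plain, non-atomic read of bit nr in the word array at addr. */
static int test_bit(int nr, const volatile unsigned long *addr)
{
	return (addr[nr / (8 * sizeof(long))] >> (nr % (8 * sizeof(long)))) & 1;
}

/* Atomic test-and-clear of bit nr; the GCC builtin is an illustrative
 * stand-in for the kernel's lock-prefixed btrl on i386. */
static int test_and_clear_bit(int nr, volatile unsigned long *addr)
{
	unsigned long mask = 1UL << (nr % (8 * sizeof(long)));
	unsigned long old = __sync_fetch_and_and(&addr[nr / (8 * sizeof(long))], ~mask);

	return (old & mask) != 0;
}

/* The optimization: do a cheap unlocked read first, and pay for the
 * bus-locked atomic only when the bit is actually set.  The race is
 * benign for reference bits: a bit set right after the unlocked read
 * is simply noticed on a later pass. */
static int test_and_test_and_clear_bit(int nr, volatile unsigned long *addr)
{
	if (!test_bit(nr, addr))
		return 0;
	return test_and_clear_bit(nr, addr);
}

int main(void)
{
	unsigned long flags = 1UL << 3;

	printf("%d\n", test_and_test_and_clear_bit(3, &flags));	/* 1: bit was set */
	printf("%d\n", test_and_test_and_clear_bit(3, &flags));	/* 0: fast path */
	return 0;
}

The win is that pages whose referenced bit is already clear (the
common case when shrink_mmap rescans the LRU) no longer cost a locked
read-modify-write cycle on every scan.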
diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/include/asm-i386/bitops.h working/include/asm-i386/bitops.h
--- base/include/asm-i386/bitops.h	Sat Jun 17 23:37:03 2000
+++ working/include/asm-i386/bitops.h	Sat Jun 17 23:52:49 2000
@@ -29,6 +29,7 @@
 extern void change_bit(int nr, volatile void * addr);
 extern int test_and_set_bit(int nr, volatile void * addr);
 extern int test_and_clear_bit(int nr, volatile void * addr);
+extern int test_and_test_and_clear_bit(int nr, volatile void * addr);
 extern int test_and_change_bit(int nr, volatile void * addr);
 extern int __constant_test_bit(int nr, const volatile void * addr);
 extern int __test_bit(int nr, volatile void * addr);
@@ -87,6 +88,13 @@
 		:"=r" (oldbit),"=m" (ADDR)
 		:"Ir" (nr));
 	return oldbit;
+}
+
+extern __inline__ int test_and_test_and_clear_bit(int nr, volatile void *addr)
+{
+	if (!(((unsigned long *)addr)[nr >> 5] & (1UL << (nr & 31))))
+		return 0;
+	return test_and_clear_bit(nr, addr);
 }
diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/include/linux/mm.h working/include/linux/mm.h
--- base/include/linux/mm.h
+++ working/include/linux/mm.h
@@ ... @@
 #define PageReferenced(page)	test_bit(PG_referenced, &(page)->flags)
 #define SetPageReferenced(page)	set_bit(PG_referenced, &(page)->flags)
 #define ClearPageReferenced(page)	clear_bit(PG_referenced, &(page)->flags)
-#define PageTestandClearReferenced(page)	test_and_clear_bit(PG_referenced, &(page)->flags)
+#define PageTestandClearReferenced(page)	test_and_test_and_clear_bit(PG_referenced, &(page)->flags)
 #define PageDecrAfter(page)	test_bit(PG_decr_after, &(page)->flags)
 #define SetPageDecrAfter(page)	set_bit(PG_decr_after, &(page)->flags)
 #define PageTestandClearDecrAfter(page)	test_and_clear_bit(PG_decr_after, &(page)->flags)
diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/include/linux/swap.h working/include/linux/swap.h
--- base/include/linux/swap.h	Sat Jun 17 23:37:16 2000
+++ working/include/linux/swap.h	Sat Jun 17 23:52:49 2000
@@ -87,6 +87,7 @@
 
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(unsigned int gfp_mask);
+extern int memory_pressure(void);
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, struct page *, int);
@@ -173,11 +174,17 @@
 /*
  * Helper macros for lru_pages handling.
  */
-#define lru_cache_add(page)			\
+
+#define __lru_cache_add(page)			\
 do {						\
-	spin_lock(&pagemap_lru_lock);		\
 	list_add(&(page)->lru, &lru_cache);	\
 	nr_lru_pages++;				\
+} while (0)
+
+#define lru_cache_add(page)			\
+do {						\
+	spin_lock(&pagemap_lru_lock);		\
+	__lru_cache_add(page);			\
 	page->age = PG_AGE_START;		\
 	ClearPageReferenced(page);		\
 	SetPageActive(page);			\
@@ -187,7 +194,6 @@
 #define __lru_cache_del(page)			\
 do {						\
 	list_del(&(page)->lru);			\
-	ClearPageActive(page);			\
 	nr_lru_pages--;				\
 } while (0)
 
@@ -196,6 +202,7 @@
 	if (!PageLocked(page))			\
 		BUG();				\
 	spin_lock(&pagemap_lru_lock);		\
+	ClearPageActive(page);			\
 	__lru_cache_del(page);			\
 	spin_unlock(&pagemap_lru_lock);		\
 } while (0)
diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/mm/filemap.c working/mm/filemap.c
--- base/mm/filemap.c	Sat Jun 17 23:25:43 2000
+++ working/mm/filemap.c	Sun Jun 18 00:36:19 2000
@@ -65,8 +65,8 @@
 	(*p)->pprev_hash = &page->next_hash;
 	*p = page;
 	page->pprev_hash = p;
-	if (page->buffers)
-		PAGE_BUG(page);
+//	if (page->buffers)
+//		PAGE_BUG(page);
 }
 
 static inline void remove_page_from_hash_queue(struct page * page)
@@ -102,6 +102,7 @@
 	if (page->buffers)
 		BUG();
 
+	ClearPageDirty(page);
 	remove_page_from_inode_queue(page);
 	remove_page_from_hash_queue(page);
 	page->mapping = NULL;
@@ -294,36 +295,55 @@
 	spin_unlock(&pagecache_lock);
 }
 
-/*
- * nr_dirty represents the number of dirty pages that we will write async
- * before doing sync writes.  We can only do sync writes if we can
- * wait for IO (__GFP_IO set).
- */
+/**
+ * shrink_mmap - Tries to free memory
+ * @priority: how hard we will try to free pages (0 hardest)
+ * @gfp_mask: Restrictions to free pages
+ *
+ * This function walks the lru list searching for freeable pages. It
+ * returns 1 to indicate success and 0 otherwise. It takes the
+ * pagemap_lru_lock and the pagecache_lock.
+ */
+/* nr_to_examinate is the maximum number of pages that we will examine
+ * in each call. This means that we don't loop.
+ */
+/* nr_writes counts the number of writes that we have started so far.
+ * We limit the number of writes in each round to max_page_launder.
+ * ToDo: make that variable tunable through sysctl.
+ */
+const int max_page_launder = 100;
+
 int shrink_mmap(int priority, int gfp_mask)
 {
-	int ret = 0, count, nr_dirty;
 	struct list_head * page_lru;
 	struct page * page = NULL;
-
-	count = nr_lru_pages / (priority + 1);
-	nr_dirty = priority;
+	int ret;
+	int nr_to_examinate = nr_lru_pages;
+	int nr_writes = 0;
+	int count = nr_lru_pages / (priority + 1);
 
 	/* we need pagemap_lru_lock for list_del() ... subtle code below */
 	spin_lock(&pagemap_lru_lock);
 	while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) {
+		/* We exit if we have examined all the LRU pages */
+		if(!nr_to_examinate--)
+			break;
+
+		/* if there is no zone low on memory we stop */
+		if(!memory_pressure())
+			break;
+
 		page = list_entry(page_lru, struct page, lru);
-		list_del(page_lru);
+		__lru_cache_del(page);
 
 		if (PageTestandClearReferenced(page)) {
-			page->age += PG_AGE_ADV;
-			if (page->age > PG_AGE_MAX)
-				page->age = PG_AGE_MAX;
-			goto dispose_continue;
+			page->age = min(PG_AGE_MAX, page->age + PG_AGE_ADV);
+			goto reinsert_page_continue;
 		}
 		page->age -= min(PG_AGE_DECL, page->age);
 		if (page->age)
-			goto dispose_continue;
+			goto reinsert_page_continue;
 
 		count--;
 
 		/*
@@ -331,16 +351,18 @@
 		 * immediate tell are untouchable..
 		 */
 		if (!page->buffers && page_count(page) > 1)
-			goto dispose_continue;
+			goto reinsert_page_continue;
 
 		if (TryLockPage(page))
-			goto dispose_continue;
+			goto reinsert_page_continue;
 
-		/* Release the pagemap_lru lock even if the page is not yet
-		   queued in any lru queue since we have just locked down
-		   the page so nobody else may SMP race with us running
-		   a lru_cache_del() (lru_cache_del() always run with the
-		   page locked down ;). */
+		/*
+		 * Release the pagemap_lru lock even if the page is
+		 * not yet queued in any lru queue since we have just
+		 * locked down the page so nobody else may SMP race
+		 * with us running a lru_cache_del() (lru_cache_del()
+		 * always run with the page locked down ;).
+		 */
 		spin_unlock(&pagemap_lru_lock);
 
 		/* avoid freeing the page while it's locked */
@@ -351,14 +373,17 @@
 		 * of zone - it's old.
 		 */
 		if (page->buffers) {
-			int wait = ((gfp_mask & __GFP_IO) && (nr_dirty-- < 0));
-			if (!try_to_free_buffers(page, wait))
+			if (nr_writes < max_page_launder) {
+				nr_writes++;
+				if (!try_to_free_buffers(page, 0))
+					goto unlock_continue;
+				/* page was locked, inode can't go away under us */
+				if (!page->mapping) {
+					atomic_dec(&buffermem_pages);
+					goto made_buffer_progress;
+				}
+			} else
 				goto unlock_continue;
-			/* page was locked, inode can't go away under us */
-			if (!page->mapping) {
-				atomic_dec(&buffermem_pages);
-				goto made_buffer_progress;
-			}
 		}
 
 		/*
@@ -371,10 +396,13 @@
 			goto unlock_continue;
 		}
 
-		/* Take the pagecache_lock spinlock held to avoid
-		   other tasks to notice the page while we are looking at its
-		   page count. If it's a pagecache-page we'll free it
-		   in one atomic transaction after checking its page count. */
+		/*
+		 * Take the pagecache_lock spinlock held to avoid
+		 * other tasks noticing the page while we are
+		 * looking at its page count. If it's a
+		 * pagecache-page we'll free it in one atomic
+		 * transaction after checking its page count.
+		 */
 		spin_lock(&pagecache_lock);
 
 		/*
@@ -396,14 +424,15 @@
 			goto made_inode_progress;
 		}
 		/* PageDeferswap -> we swap out the page now. */
-		if (gfp_mask & __GFP_IO) {
+		if ((gfp_mask & __GFP_IO) && (nr_writes < max_page_launder)) {
 			spin_unlock(&pagecache_lock);
+			nr_writes++;
 			/* Do NOT unlock the page ... brw_page does. */
 			ClearPageDirty(page);
 			rw_swap_page(WRITE, page, 0);
 			spin_lock(&pagemap_lru_lock);
 			page_cache_release(page);
-			goto dispose_continue;
+			goto reinsert_page_continue;
 		}
 		goto cache_unlock_continue;
 	}
@@ -426,23 +455,23 @@
 	spin_lock(&pagemap_lru_lock);
 	UnlockPage(page);
 	page_cache_release(page);
-dispose_continue:
-	list_add(page_lru, &lru_cache);
+reinsert_page_continue:
+	__lru_cache_add(page);
 	}
+	spin_unlock(&pagemap_lru_lock);
+	ret = 0;
 	goto out;
 
 made_inode_progress:
 	page_cache_release(page);
 made_buffer_progress:
+	ClearPageActive(page);
 	UnlockPage(page);
 	page_cache_release(page);
 	ret = 1;
-	spin_lock(&pagemap_lru_lock);
-	/* nr_lru_pages needs the spinlock */
-	nr_lru_pages--;
-
 out:
-	spin_unlock(&pagemap_lru_lock);
+	if ((gfp_mask & __GFP_IO) && (nr_writes >= max_page_launder))
+		block_sync_page(page);
 	return ret;
 }
diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/mm/swap_state.c working/mm/swap_state.c
--- base/mm/swap_state.c	Sat Jun 17 23:25:43 2000
+++ working/mm/swap_state.c	Sat Jun 17 23:52:49 2000
@@ -73,7 +73,6 @@
 		PAGE_BUG(page);
 
 	PageClearSwapCache(page);
-	ClearPageDirty(page);
 	remove_inode_page(page);
 }
diff -urN --exclude-from=/home/lfcia/quintela/work/kernel/exclude base/mm/vmscan.c working/mm/vmscan.c
--- base/mm/vmscan.c	Sat Jun 17 23:51:24 2000
+++ working/mm/vmscan.c	Sun Jun 18 00:28:12 2000
@@ -179,16 +179,14 @@
 
 	/* Add it to the swap cache */
 	add_to_swap_cache(page, entry);
+	set_pte(page_table, swp_entry_to_pte(entry));
 
 	/* Put the swap entry into the pte after the page is in swapcache */
 	vma->vm_mm->rss--;
-	set_pte(page_table, swp_entry_to_pte(entry));
 	flush_tlb_page(vma, address);
 	vmlist_access_unlock(vma->vm_mm);
 
-	/* OK, do a physical asynchronous write to swap.  */
-	// rw_swap_page(WRITE, page, 0);
-	/* Let shrink_mmap handle this swapout. */
+	/* Set page for deferred swap */
 	SetPageDirty(page);
 	UnlockPage(page);
 
@@ -427,6 +425,32 @@
 	return __ret;
 }
 
+/**
+ * memory_pressure - Is the system under memory pressure
+ *
+ * Returns 1 if the system is low on memory in any of its zones,
+ * otherwise returns 0.
+ */
+int memory_pressure(void)
+{
+	pg_data_t *pgdat = pgdat_list;
+
+	do {
+		int i;
+		for(i = 0; i < MAX_NR_ZONES; i++) {
+			zone_t *zone = pgdat->node_zones + i;
+			if (!zone->size || !zone->zone_wake_kswapd)
+				continue;
+			if (zone->free_pages < zone->pages_low)
+				return 1;
+		}
+		pgdat = pgdat->node_next;
+
+	} while (pgdat);
+
+	return 0;
+}
+
 /*
  * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
@@ -444,7 +468,6 @@
 	int priority;
 	int count = FREE_COUNT;
 	int swap_count = 0;
-	int ret = 0;
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
@@ -452,11 +475,12 @@
 	priority = 64;
 	do {
 		while (shrink_mmap(priority, gfp_mask)) {
-			ret = 1;
 			if (!--count)
 				goto done;
 		}
 
+		if(!memory_pressure())
+			return 1;
 
 		/* Try to get rid of some shared memory pages.. */
 		if (gfp_mask & __GFP_IO) {
@@ -468,11 +492,9 @@
 			count -= shrink_dcache_memory(priority, gfp_mask);
 			count -= shrink_icache_memory(priority, gfp_mask);
 			if (count <= 0) {
-				ret = 1;
 				goto done;
 			}
 			while (shm_swap(priority, gfp_mask)) {
-				ret = 1;
 				if (!--count)
 					goto done;
 			}
@@ -496,18 +518,19 @@
 			if (--swap_count < 0)
 				break;
 		}
+		if(!memory_pressure())
+			return 1;
 	} while (--priority >= 0);
 
 	/* Always end on a shrink_mmap.. */
 	while (shrink_mmap(0, gfp_mask)) {
-		ret = 1;
 		if (!--count)
 			goto done;
 	}
 
done:
-	return ret;
+	return (count < FREE_COUNT);
 }
 
 DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
@@ -549,26 +572,14 @@
 	tsk->flags |= PF_MEMALLOC;
 
 	for (;;) {
-		pg_data_t *pgdat;
-		int something_to_do = 0;
+		int pressure = memory_pressure();
 
-		pgdat = pgdat_list;
-		do {
-			int i;
-			for(i = 0; i < MAX_NR_ZONES; i++) {
-				zone_t *zone = pgdat->node_zones+ i;
-				if (tsk->need_resched)
-					schedule();
-				if (!zone->size || !zone->zone_wake_kswapd)
-					continue;
-				if (zone->free_pages < zone->pages_low)
-					something_to_do = 1;
-				do_try_to_free_pages(GFP_KSWAPD);
-			}
-			pgdat = pgdat->node_next;
-		} while (pgdat);
+		if (tsk->need_resched)
+			schedule();
 
-		if (!something_to_do) {
+		if(pressure)
+			do_try_to_free_pages(GFP_KSWAPD);
+		else {
 			tsk->state = TASK_INTERRUPTIBLE;
 			interruptible_sleep_on(&kswapd_wait);
 		}

-- 
In theory, practice and theory are the same, but in practice they are
different -- Larry McVoy

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/