linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Marcelo <marcelo.tosatti@cyclades.com>
To: linux-mm@kvack.org
Cc: akpm@osdl.org, dgc@sgi.com, dipankar@in.ibm.com,
	mbligh@mbligh.org, manfred@colorfullife.com
Subject: [PATCH] per-page SLAB freeing (only dcache for now)
Date: Fri, 30 Sep 2005 16:37:54 -0300	[thread overview]
Message-ID: <20050930193754.GB16812@xeon.cnet> (raw)

Hi,

The SLAB reclaiming process works on a per-object basis, out of pseudo-LRU
ordering in the case of the inode and dentry caches.

In a recent thread named "VM balancing issues on 2.6.13: dentry cache 
not getting shrunk enough" folks suggested that the SLAB reclaiming
process should be changed to aim at entire SLAB containers, not 
single objects. This has been suggested several times in the past.

The following patch is an experimental attempt to do it for the 
dentry cache. 

It works by checking all objects of a given SLAB once a single object is
pruned in prune_dcache(). Once it has been confirmed that all objects
on the target SLAB are freeable, it proceeds to free them all.

Few issues with the patch:

a) Locking needs further verification to confirm correctness.
b) The addition of the "i_am_alive" flag might not be necessary: I believe
it's possible to use kmembufctl to know about usage of objects 
within SLAB containers.
c) The freeing functions need to be moved out of mm/slab.c into a proper
place.
d) General beautification.

I don't see any fundamental problems with this approach, are there any?
I'll clean it up and proceed to write the inode cache equivalent 
if there aren't.

Andrew commented about the requirement of a global lock for this reverse 
reclaiming - it seems to me that all that is necessary is correct
synchronization between users and the reclaiming path (not necessarily
a global lock though).

Or maybe I just don't get what he's talking about.

Anyway, dbench testing with 18 threads on 256Mb UP machine shows 
noticeable improvement (results in Mb/s):

                        1st run    2nd run    avg
stock 2.6.13            15.11       14.9      15.00	
2.6.13+slabreclaim      16.22       15.5      15.86

Comments?


diff -p -Nur --exclude-from=/home/marcelo/excl linux-2.6.13.orig/fs/dcache.c linux-2.6.13/fs/dcache.c
--- linux-2.6.13.orig/fs/dcache.c	2005-09-23 16:26:02.000000000 -0300
+++ linux-2.6.13/fs/dcache.c	2005-09-30 16:01:08.000000000 -0300
@@ -44,7 +44,8 @@ static seqlock_t rename_lock __cacheline
 
 EXPORT_SYMBOL(dcache_lock);
 
-static kmem_cache_t *dentry_cache; 
+kmem_cache_t *dentry_cache; 
+
 
 #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
 
@@ -84,6 +85,7 @@ static void d_callback(struct rcu_head *
  */
 static void d_free(struct dentry *dentry)
 {
+	dentry->i_am_alive = 0;
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
  	call_rcu(&dentry->d_rcu, d_callback);
@@ -363,7 +365,7 @@ restart:
  * removed.
  * Called with dcache_lock, drops it and then regains.
  */
-static inline void prune_one_dentry(struct dentry * dentry)
+inline void prune_one_dentry(struct dentry * dentry)
 {
 	struct dentry * parent;
 
@@ -390,8 +392,11 @@ static inline void prune_one_dentry(stru
  * This function may fail to free any resources if
  * all the dentries are in use.
  */
+
+int check_slab_page(void *);
+int free_slab_page(void *);
  
-static void prune_dcache(int count)
+static void __prune_dcache(int count, int page_scan)
 
 	spin_lock(&dcache_lock);
 	for (; count ; count--) {
@@ -427,10 +432,17 @@ static void prune_dcache(int count)
 			continue;
 		}
 		prune_one_dentry(dentry);
+		if(page_scan && check_slab_page(dentry))
+			free_slab_page(dentry);
 	}
 	spin_unlock(&dcache_lock);
 }
 
+static void prune_dcache(int count)
+{
+	__prune_dcache(count, 1);
+}
+
 /*
  * Shrink the dcache for the specified super block.
  * This allows us to unmount a device without disturbing
@@ -642,7 +654,7 @@ void shrink_dcache_parent(struct dentry 
 	int found;
 
 	while ((found = select_parent(parent)) != 0)
-		prune_dcache(found);
+		__prune_dcache(found, 0);
 }
 
 /**
@@ -680,7 +692,7 @@ void shrink_dcache_anon(struct hlist_hea
 			}
 		}
 		spin_unlock(&dcache_lock);
-		prune_dcache(found);
+		__prune_dcache(found, 0);
 	} while(found);
 }
 
@@ -755,6 +767,7 @@ struct dentry *d_alloc(struct dentry * p
 	INIT_LIST_HEAD(&dentry->d_lru);
 	INIT_LIST_HEAD(&dentry->d_subdirs);
 	INIT_LIST_HEAD(&dentry->d_alias);
+	dentry->i_am_alive = 0xdeadbeef;
 
 	if (parent) {
 		dentry->d_parent = dget(parent);
diff -p -Nur --exclude-from=/home/marcelo/excl linux-2.6.13.orig/fs/inode.c linux-2.6.13/fs/inode.c
--- linux-2.6.13.orig/fs/inode.c	2005-09-23 16:26:02.000000000 -0300
+++ linux-2.6.13/fs/inode.c	2005-09-28 14:17:31.000000000 -0300
@@ -97,7 +97,7 @@ DECLARE_MUTEX(iprune_sem);
  */
 struct inodes_stat_t inodes_stat;
 
-static kmem_cache_t * inode_cachep;
+kmem_cache_t * inode_cachep;
 
 static struct inode *alloc_inode(struct super_block *sb)
 {
diff -p -Nur --exclude-from=/home/marcelo/excl linux-2.6.13.orig/include/linux/dcache.h linux-2.6.13/include/linux/dcache.h
--- linux-2.6.13.orig/include/linux/dcache.h	2005-06-17 16:48:29.000000000 -0300
+++ linux-2.6.13/include/linux/dcache.h	2005-09-27 16:17:59.000000000 -0300
@@ -106,6 +106,7 @@ struct dentry {
 	struct hlist_node d_hash;	/* lookup hash list */	
 	int d_mounted;
 	unsigned char d_iname[DNAME_INLINE_LEN_MIN];	/* small names */
+	int i_am_alive;
 };
 
 struct dentry_operations {
diff -p -Nur --exclude-from=/home/marcelo/excl linux-2.6.13.orig/mm/slab.c linux-2.6.13/mm/slab.c
--- linux-2.6.13.orig/mm/slab.c	2005-09-23 16:26:04.000000000 -0300
+++ linux-2.6.13/mm/slab.c	2005-09-30 16:08:06.000000000 -0300
@@ -93,6 +93,8 @@
 #include	<linux/module.h>
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
+#include	<linux/proc_fs.h>
+#include	<linux/dcache.h>
 
 #include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
@@ -574,6 +576,259 @@ static void free_block(kmem_cache_t* cac
 static void enable_cpucache (kmem_cache_t *cachep);
 static void cache_reap (void *unused);
 
+int dentry_check_freeable(struct dentry *parent)
+{
+	struct dentry *this_parent = parent;
+	struct list_head *next;
+	unsigned int int_array[32]; /* XXX: should match tree depth limit */
+	unsigned int *int_array_ptr = (unsigned int *)&int_array;
+
+	memset(int_array, 0, sizeof(int_array));
+
+	if (parent->i_am_alive != 0xdeadbeef)
+		return 1;
+repeat:
+        next = this_parent->d_subdirs.next;
+resume:
+        while (next != &this_parent->d_subdirs) {
+		struct list_head *tmp = next;
+		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+		next = tmp->next;
+
+                if (!list_empty(&dentry->d_subdirs)) {
+                        this_parent = dentry;
+			/* increase the counter */
+			*int_array_ptr = *int_array_ptr+1;
+			/* move to next array position */
+			int_array_ptr++;
+			if (int_array_ptr >= (unsigned int*)(&int_array)+(sizeof(int_array)))
+				BUG();
+			*int_array_ptr = 0;
+
+                        goto repeat;
+                }
+                /* Pinned dentry? */
+		if (atomic_read(&dentry->d_count))
+			return 0;
+		else 
+			*int_array_ptr = *int_array_ptr+1;
+        }
+
+	/*
+         * All done at this level ... ascend and resume the search.
+         */
+        if (this_parent != parent) {
+		unsigned int val = *int_array_ptr;
+		/* does this directory have any additional ref? */
+		if (atomic_read(&this_parent->d_count) != val)
+			return 0;
+		int_array_ptr--;
+		if (int_array_ptr < (unsigned int*)&int_array)
+			BUG();
+
+		next = this_parent->d_child.next;
+		this_parent = this_parent->d_parent;
+		goto resume;
+        }
+
+	if (int_array_ptr != (unsigned int*)&int_array) {
+		printk("int array pointer differs: ptr:%p - &array:%p\n",
+			int_array_ptr, &int_array);
+		BUG();
+	}
+
+	if (atomic_read(&parent->d_count) == *int_array_ptr)
+		return 1;
+
+	return 0;
+}
+
+int check_slab_page(void *objp)
+{
+        struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
+        kmem_cache_t *cachep = GET_PAGE_CACHE(virt_to_page(objp));
+        int i;
+
+	for(i=0; i < cachep->num ; i++) {
+		struct dentry *target;
+		void *objp = slabp->s_mem + cachep->objsize * i;
+		target = (struct dentry *)objp;
+
+		if (atomic_read(&target->d_count)) {
+			if (!dentry_check_freeable(target))
+				break;
+			}
+		}
+
+	if (i == cachep->num)
+		return 1;
+
+	return 0;
+}
+
+int dentry_free_child(struct dentry *dentry)
+{
+	int ret = 1;
+        if (!list_empty(&dentry->d_subdirs)) {
+                spin_unlock(&dcache_lock);
+                shrink_dcache_parent(dentry);
+                spin_lock(&dcache_lock);
+        }
+
+	spin_lock(&dentry->d_lock);
+	if (atomic_read(&dentry->d_count))
+		ret = 0;
+
+	return ret;
+}
+
+int slab_free_attempt = 0;
+int slab_free_success = 0;
+
+extern inline void prune_one_dentry(struct dentry *);
+
+int free_slab_page(void *objp)
+{
+	struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
+	kmem_cache_t *cachep = GET_PAGE_CACHE(virt_to_page(objp));
+	int i;
+	int freed = 0;
+
+	for(i=0; i < cachep->num ; i++) {
+		struct dentry *target;
+		void *objp = slabp->s_mem + cachep->objsize * i;
+		target = (struct dentry *)objp;
+
+		/* XXX: race between i_am_alive check and lock acquision? */
+		if (target->i_am_alive != 0xdeadbeef)
+			continue;
+
+		spin_lock(&target->d_lock);
+
+		/* no additional references? nuke it */
+                if (!atomic_read(&target->d_count)) {
+			if (!list_empty(&target->d_lru)) {
+				dentry_stat.nr_unused--;
+				list_del_init(&target->d_lru);
+			}
+			prune_one_dentry(target);
+			freed++;
+		/* otherwise attempt to free children */
+                } else {
+			spin_unlock(&target->d_lock);
+			if (dentry_free_child(target)) {
+				if (!list_empty(&target->d_lru)) {
+					dentry_stat.nr_unused--;
+					list_del_init(&target->d_lru);
+				}
+				prune_one_dentry(target);
+				freed++;
+			} else
+				break;
+				
+		}
+        }
+
+	slab_free_attempt++;
+
+        if (i == cachep->num) {
+		slab_free_success++;
+		return 1;
+	}
+
+	return 0;
+}
+
+extern kmem_cache_t *dentry_cache;
+extern kmem_cache_t *inode_cachep;
+
+struct cache_stat {
+	unsigned int free_pages;
+	unsigned int partial_pages;
+	unsigned int partial_freeable;
+	unsigned int full_pages;
+	unsigned int full_freeable;
+};
+
+void cache_retrieve_stats(kmem_cache_t *cachep, struct cache_stat *stat)
+{
+	struct list_head *entry;
+	struct slab *slabp;
+
+	memset(stat, 0, sizeof(struct cache_stat));
+
+	list_for_each(entry,&cachep->lists.slabs_free)
+		stat->free_pages++;
+
+	list_for_each(entry,&cachep->lists.slabs_partial) {
+		slabp = list_entry(entry, struct slab, list);
+		stat->partial_pages++;
+		stat->partial_freeable += check_slab_page(slabp);
+	}
+
+	list_for_each(entry,&cachep->lists.slabs_full) {
+		slabp = list_entry(entry, struct slab, list);
+		stat->full_pages++;
+		stat->full_freeable += check_slab_page(slabp);
+	}
+
+}
+
+struct proc_dir_entry *slab_stats;
+struct proc_dir_entry *slab_reclaim;
+
+static int print_slab_stats(char *page, char **start,
+			  off_t off, int count, int *eof, void *data)
+{
+
+	struct cache_stat stat;
+	int len;
+
+	cache_retrieve_stats(dentry_cache, &stat);
+
+	len = sprintf(page, "dentry_cache free:%u partial:%u partial_f:%u full:%u full_f:%u\n", stat.free_pages, stat.partial_pages, stat.partial_freeable, stat.full_pages, stat.full_freeable);
+
+	cache_retrieve_stats(inode_cachep, &stat);
+
+	len += sprintf(page+len, "inode_cache free:%u partial:%u partial_f:%u full:%u full_f:%u\n", stat.free_pages, stat.partial_pages, stat.partial_freeable, stat.full_pages, stat.full_freeable);
+
+	return len;
+
+}
+
+static int print_slab_reclaim(char *page, char **start,
+			  off_t off, int count, int *eof, void *data)
+{
+	int len;
+
+	len = sprintf(page, "slab_free_attempt:%d slab_free_success:%d\n",
+			slab_free_attempt, slab_free_success);
+	return len;
+}
+
+int __init init_slab_stats(void)
+{
+	slab_stats = create_proc_read_entry("slab_stats", 0644, NULL,
+					print_slab_stats, NULL);
+	if (slab_stats == NULL) 
+		printk(KERN_ERR "failure to create slab_stats!\n");
+	else
+		printk(KERN_ERR "success creating slab_stats!\n");
+
+	slab_stats = create_proc_read_entry("slab_reclaim", 0644, NULL,
+					print_slab_reclaim, NULL);
+	if (slab_reclaim == NULL) 
+		printk(KERN_ERR "failure to create slab_reclaim!\n");
+	else
+		printk(KERN_ERR "success creating slab_reclaim!\n");
+
+	slab_stats->owner = THIS_MODULE;
+
+	return 1;
+}
+
+late_initcall(init_slab_stats);
+
 static inline void **ac_entry(struct array_cache *ac)
 {
 	return (void**)(ac+1);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

             reply	other threads:[~2005-09-30 19:37 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-09-30 19:37 Marcelo [this message]
2005-10-01  2:46 ` Christoph Lameter
2005-10-01 21:52   ` Marcelo
2005-10-03 15:24     ` Christoph Lameter
2005-10-03 20:37       ` Manfred Spraul
2005-10-03 22:17         ` Marcelo Tosatti
2005-10-04 17:04           ` Manfred Spraul
2005-10-06 16:01             ` Marcelo Tosatti
2005-10-22  1:30               ` Marcelo Tosatti
2005-10-22  6:31                 ` Andrew Morton
2005-10-22  9:21                   ` Arjan van de Ven
2005-10-22 17:08                   ` Christoph Lameter
2005-10-22 17:13                     ` ia64 page size (was Re: [PATCH] per-page SLAB freeing (only dcache for now)) Arjan van de Ven
2005-10-22 18:16                     ` [PATCH] per-page SLAB freeing (only dcache for now) Manfred Spraul
2005-10-23 18:41                       ` Marcelo Tosatti
2005-10-23 16:30                   ` Marcelo Tosatti

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20050930193754.GB16812@xeon.cnet \
    --to=marcelo.tosatti@cyclades.com \
    --cc=akpm@osdl.org \
    --cc=dgc@sgi.com \
    --cc=dipankar@in.ibm.com \
    --cc=linux-mm@kvack.org \
    --cc=manfred@colorfullife.com \
    --cc=mbligh@mbligh.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox