* [PATCH] per-page SLAB freeing (only dcache for now)
@ 2005-09-30 19:37 Marcelo
2005-10-01 2:46 ` Christoph Lameter
From: Marcelo @ 2005-09-30 19:37 UTC (permalink / raw)
To: linux-mm; +Cc: akpm, dgc, dipankar, mbligh, manfred
Hi,
The SLAB reclaiming process works on a per-object basis, following pseudo-LRU
ordering in the case of the inode and dentry caches.
In a recent thread named "VM balancing issues on 2.6.13: dentry cache
not getting shrunk enough" folks suggested that the SLAB reclaiming
process should be changed to aim at entire SLAB containers, not
single objects. This has been suggested several times in the past.
The following patch is an experimental attempt to do it for the
dentry cache.
It works by checking all objects of a given SLAB once a single object is
pruned in prune_dcache(). Once it has been confirmed that all objects
on the target SLAB are freeable, it proceeds to free them all.
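In outline, paraphrasing the prune_dcache() hunk further down (check_slab_page()
and free_slab_page() are the helpers the patch adds):

	prune_one_dentry(dentry);                  /* free one object as before */
	if (page_scan && check_slab_page(dentry))  /* every other object on this
	                                            * dentry's SLAB page freeable? */
		free_slab_page(dentry);            /* then release them all, so the
	                                            * whole page can be returned */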
A few issues with the patch:
a) Locking needs further verification to confirm correctness.
b) The addition of the "i_am_alive" flag might not be necessary: I believe
it is possible to use the kmem_bufctl array to learn which objects
within a SLAB container are in use.
c) The freeing functions need to be moved out of mm/slab.c to a proper
place.
d) General beautification.
I don't see any fundamental problems with this approach, are there any?
I'll clean it up and proceed to write the inode cache equivalent
if there aren't.
Andrew commented about the requirement of a global lock for this reverse
reclaiming - it seems to me that all that is necessary is correct
synchronization between users and the reclaiming path (not necessarily
a global lock though).
Or maybe I just don't get what he's talking about.
Anyway, dbench testing with 18 threads on a 256MB UP machine shows a
noticeable improvement (results in MB/s):

                        1st run   2nd run     avg
stock 2.6.13             15.11     14.9      15.00
2.6.13+slabreclaim       16.22     15.5      15.86
Comments?
diff -p -Nur --exclude-from=/home/marcelo/excl linux-2.6.13.orig/fs/dcache.c linux-2.6.13/fs/dcache.c
--- linux-2.6.13.orig/fs/dcache.c 2005-09-23 16:26:02.000000000 -0300
+++ linux-2.6.13/fs/dcache.c 2005-09-30 16:01:08.000000000 -0300
@@ -44,7 +44,8 @@ static seqlock_t rename_lock __cacheline
EXPORT_SYMBOL(dcache_lock);
-static kmem_cache_t *dentry_cache;
+kmem_cache_t *dentry_cache;
+
#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
@@ -84,6 +85,7 @@ static void d_callback(struct rcu_head *
*/
static void d_free(struct dentry *dentry)
{
+ dentry->i_am_alive = 0;
if (dentry->d_op && dentry->d_op->d_release)
dentry->d_op->d_release(dentry);
call_rcu(&dentry->d_rcu, d_callback);
@@ -363,7 +365,7 @@ restart:
* removed.
* Called with dcache_lock, drops it and then regains.
*/
-static inline void prune_one_dentry(struct dentry * dentry)
+inline void prune_one_dentry(struct dentry * dentry)
{
struct dentry * parent;
@@ -390,8 +392,11 @@ static inline void prune_one_dentry(stru
* This function may fail to free any resources if
* all the dentries are in use.
*/
+
+int check_slab_page(void *);
+int free_slab_page(void *);
-static void prune_dcache(int count)
+static void __prune_dcache(int count, int page_scan)
spin_lock(&dcache_lock);
for (; count ; count--) {
@@ -427,10 +432,17 @@ static void prune_dcache(int count)
continue;
}
prune_one_dentry(dentry);
+ if(page_scan && check_slab_page(dentry))
+ free_slab_page(dentry);
}
spin_unlock(&dcache_lock);
}
+static void prune_dcache(int count)
+{
+ __prune_dcache(count, 1);
+}
+
/*
* Shrink the dcache for the specified super block.
* This allows us to unmount a device without disturbing
@@ -642,7 +654,7 @@ void shrink_dcache_parent(struct dentry
int found;
while ((found = select_parent(parent)) != 0)
- prune_dcache(found);
+ __prune_dcache(found, 0);
}
/**
@@ -680,7 +692,7 @@ void shrink_dcache_anon(struct hlist_hea
}
}
spin_unlock(&dcache_lock);
- prune_dcache(found);
+ __prune_dcache(found, 0);
} while(found);
}
@@ -755,6 +767,7 @@ struct dentry *d_alloc(struct dentry * p
INIT_LIST_HEAD(&dentry->d_lru);
INIT_LIST_HEAD(&dentry->d_subdirs);
INIT_LIST_HEAD(&dentry->d_alias);
+ dentry->i_am_alive = 0xdeadbeef;
if (parent) {
dentry->d_parent = dget(parent);
diff -p -Nur --exclude-from=/home/marcelo/excl linux-2.6.13.orig/fs/inode.c linux-2.6.13/fs/inode.c
--- linux-2.6.13.orig/fs/inode.c 2005-09-23 16:26:02.000000000 -0300
+++ linux-2.6.13/fs/inode.c 2005-09-28 14:17:31.000000000 -0300
@@ -97,7 +97,7 @@ DECLARE_MUTEX(iprune_sem);
*/
struct inodes_stat_t inodes_stat;
-static kmem_cache_t * inode_cachep;
+kmem_cache_t * inode_cachep;
static struct inode *alloc_inode(struct super_block *sb)
{
diff -p -Nur --exclude-from=/home/marcelo/excl linux-2.6.13.orig/include/linux/dcache.h linux-2.6.13/include/linux/dcache.h
--- linux-2.6.13.orig/include/linux/dcache.h 2005-06-17 16:48:29.000000000 -0300
+++ linux-2.6.13/include/linux/dcache.h 2005-09-27 16:17:59.000000000 -0300
@@ -106,6 +106,7 @@ struct dentry {
struct hlist_node d_hash; /* lookup hash list */
int d_mounted;
unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */
+ int i_am_alive;
};
struct dentry_operations {
diff -p -Nur --exclude-from=/home/marcelo/excl linux-2.6.13.orig/mm/slab.c linux-2.6.13/mm/slab.c
--- linux-2.6.13.orig/mm/slab.c 2005-09-23 16:26:04.000000000 -0300
+++ linux-2.6.13/mm/slab.c 2005-09-30 16:08:06.000000000 -0300
@@ -93,6 +93,8 @@
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
+#include <linux/proc_fs.h>
+#include <linux/dcache.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -574,6 +576,259 @@ static void free_block(kmem_cache_t* cac
static void enable_cpucache (kmem_cache_t *cachep);
static void cache_reap (void *unused);
+int dentry_check_freeable(struct dentry *parent)
+{
+ struct dentry *this_parent = parent;
+ struct list_head *next;
+ unsigned int int_array[32]; /* XXX: should match tree depth limit */
+ unsigned int *int_array_ptr = (unsigned int *)&int_array;
+
+ memset(int_array, 0, sizeof(int_array));
+
+ if (parent->i_am_alive != 0xdeadbeef)
+ return 1;
+repeat:
+ next = this_parent->d_subdirs.next;
+resume:
+ while (next != &this_parent->d_subdirs) {
+ struct list_head *tmp = next;
+ struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+ next = tmp->next;
+
+ if (!list_empty(&dentry->d_subdirs)) {
+ this_parent = dentry;
+ /* increase the counter */
+ *int_array_ptr = *int_array_ptr+1;
+ /* move to next array position */
+ int_array_ptr++;
+ if (int_array_ptr >= (unsigned int *)&int_array + (sizeof(int_array)/sizeof(int)))
+ BUG();
+ *int_array_ptr = 0;
+
+ goto repeat;
+ }
+ /* Pinned dentry? */
+ if (atomic_read(&dentry->d_count))
+ return 0;
+ else
+ *int_array_ptr = *int_array_ptr+1;
+ }
+
+ /*
+ * All done at this level ... ascend and resume the search.
+ */
+ if (this_parent != parent) {
+ unsigned int val = *int_array_ptr;
+ /* does this directory have any additional ref? */
+ if (atomic_read(&this_parent->d_count) != val)
+ return 0;
+ int_array_ptr--;
+ if (int_array_ptr < (unsigned int*)&int_array)
+ BUG();
+
+ next = this_parent->d_child.next;
+ this_parent = this_parent->d_parent;
+ goto resume;
+ }
+
+ if (int_array_ptr != (unsigned int*)&int_array) {
+ printk("int array pointer differs: ptr:%p - &array:%p\n",
+ int_array_ptr, &int_array);
+ BUG();
+ }
+
+ if (atomic_read(&parent->d_count) == *int_array_ptr)
+ return 1;
+
+ return 0;
+}
+
+int check_slab_page(void *objp)
+{
+ struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
+ kmem_cache_t *cachep = GET_PAGE_CACHE(virt_to_page(objp));
+ int i;
+
+ for(i=0; i < cachep->num ; i++) {
+ struct dentry *target;
+ void *objp = slabp->s_mem + cachep->objsize * i;
+ target = (struct dentry *)objp;
+
+ if (atomic_read(&target->d_count)) {
+ if (!dentry_check_freeable(target))
+ break;
+ }
+ }
+
+ if (i == cachep->num)
+ return 1;
+
+ return 0;
+}
+
+int dentry_free_child(struct dentry *dentry)
+{
+ int ret = 1;
+ if (!list_empty(&dentry->d_subdirs)) {
+ spin_unlock(&dcache_lock);
+ shrink_dcache_parent(dentry);
+ spin_lock(&dcache_lock);
+ }
+
+ spin_lock(&dentry->d_lock);
+ if (atomic_read(&dentry->d_count))
+ ret = 0;
+
+ return ret;
+}
+
+int slab_free_attempt = 0;
+int slab_free_success = 0;
+
+extern inline void prune_one_dentry(struct dentry *);
+
+int free_slab_page(void *objp)
+{
+ struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
+ kmem_cache_t *cachep = GET_PAGE_CACHE(virt_to_page(objp));
+ int i;
+ int freed = 0;
+
+ for(i=0; i < cachep->num ; i++) {
+ struct dentry *target;
+ void *objp = slabp->s_mem + cachep->objsize * i;
+ target = (struct dentry *)objp;
+
+ /* XXX: race between i_am_alive check and lock acquisition? */
+ if (target->i_am_alive != 0xdeadbeef)
+ continue;
+
+ spin_lock(&target->d_lock);
+
+ /* no additional references? nuke it */
+ if (!atomic_read(&target->d_count)) {
+ if (!list_empty(&target->d_lru)) {
+ dentry_stat.nr_unused--;
+ list_del_init(&target->d_lru);
+ }
+ prune_one_dentry(target);
+ freed++;
+ /* otherwise attempt to free children */
+ } else {
+ spin_unlock(&target->d_lock);
+ if (dentry_free_child(target)) {
+ if (!list_empty(&target->d_lru)) {
+ dentry_stat.nr_unused--;
+ list_del_init(&target->d_lru);
+ }
+ prune_one_dentry(target);
+ freed++;
+ } else
+ break;
+
+ }
+ }
+
+ slab_free_attempt++;
+
+ if (i == cachep->num) {
+ slab_free_success++;
+ return 1;
+ }
+
+ return 0;
+}
+
+extern kmem_cache_t *dentry_cache;
+extern kmem_cache_t *inode_cachep;
+
+struct cache_stat {
+ unsigned int free_pages;
+ unsigned int partial_pages;
+ unsigned int partial_freeable;
+ unsigned int full_pages;
+ unsigned int full_freeable;
+};
+
+void cache_retrieve_stats(kmem_cache_t *cachep, struct cache_stat *stat)
+{
+ struct list_head *entry;
+ struct slab *slabp;
+
+ memset(stat, 0, sizeof(struct cache_stat));
+
+ list_for_each(entry,&cachep->lists.slabs_free)
+ stat->free_pages++;
+
+ list_for_each(entry,&cachep->lists.slabs_partial) {
+ slabp = list_entry(entry, struct slab, list);
+ stat->partial_pages++;
+ stat->partial_freeable += check_slab_page(slabp);
+ }
+
+ list_for_each(entry,&cachep->lists.slabs_full) {
+ slabp = list_entry(entry, struct slab, list);
+ stat->full_pages++;
+ stat->full_freeable += check_slab_page(slabp);
+ }
+
+}
+
+struct proc_dir_entry *slab_stats;
+struct proc_dir_entry *slab_reclaim;
+
+static int print_slab_stats(char *page, char **start,
+ off_t off, int count, int *eof, void *data)
+{
+
+ struct cache_stat stat;
+ int len;
+
+ cache_retrieve_stats(dentry_cache, &stat);
+
+ len = sprintf(page, "dentry_cache free:%u partial:%u partial_f:%u full:%u full_f:%u\n", stat.free_pages, stat.partial_pages, stat.partial_freeable, stat.full_pages, stat.full_freeable);
+
+ cache_retrieve_stats(inode_cachep, &stat);
+
+ len += sprintf(page+len, "inode_cache free:%u partial:%u partial_f:%u full:%u full_f:%u\n", stat.free_pages, stat.partial_pages, stat.partial_freeable, stat.full_pages, stat.full_freeable);
+
+ return len;
+
+}
+
+static int print_slab_reclaim(char *page, char **start,
+ off_t off, int count, int *eof, void *data)
+{
+ int len;
+
+ len = sprintf(page, "slab_free_attempt:%d slab_free_success:%d\n",
+ slab_free_attempt, slab_free_success);
+ return len;
+}
+
+int __init init_slab_stats(void)
+{
+ slab_stats = create_proc_read_entry("slab_stats", 0644, NULL,
+ print_slab_stats, NULL);
+ if (slab_stats == NULL)
+ printk(KERN_ERR "failure to create slab_stats!\n");
+ else
+ printk(KERN_ERR "success creating slab_stats!\n");
+
+ slab_stats = create_proc_read_entry("slab_reclaim", 0644, NULL,
+ print_slab_reclaim, NULL);
+ if (slab_reclaim == NULL)
+ printk(KERN_ERR "failure to create slab_reclaim!\n");
+ else
+ printk(KERN_ERR "success creating slab_reclaim!\n");
+
+ slab_stats->owner = THIS_MODULE;
+
+ return 1;
+}
+
+late_initcall(init_slab_stats);
+
static inline void **ac_entry(struct array_cache *ac)
{
return (void**)(ac+1);
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-09-30 19:37 [PATCH] per-page SLAB freeing (only dcache for now) Marcelo
@ 2005-10-01 2:46 ` Christoph Lameter
2005-10-01 21:52 ` Marcelo
From: Christoph Lameter @ 2005-10-01 2:46 UTC (permalink / raw)
To: Marcelo; +Cc: linux-mm, akpm, dgc, dipankar, mbligh, manfred
On Fri, 30 Sep 2005, Marcelo wrote:
> I don't see any fundamental problems with this approach, are there any?
> I'll clean it up and proceed to write the inode cache equivalent
> if there aren't.
Hmm. I think this needs to be some generic functionality in the slab
allocator. If the allocator determines that the number of entries in a
page has become reasonably low, it could call a special function, provided at
slab creation time, to try to free up the leftover entries.
Something like
int slab_try_free(void *);
?
return true/false depending on success of attempt to free the entry.
This method may also be useful to attempt to migrate slab pages to
different nodes. If such a method is available then one can try to free
all entries in a page relying on their recreation on another node if they
are needed again.
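A minimal sketch of how such a hook could be wired up, assuming a per-cache
callback registered at cache creation time; slab_try_free_fn, slab_try_empty(),
obj_is_allocated() and the ->try_free member are made-up names for illustration,
not existing slab API:

/* Hypothetical: the allocator calls a per-cache try_free() hook for each
 * remaining object of a nearly-empty slab; if everything gets released,
 * the page itself can be freed. */
typedef int (*slab_try_free_fn)(void *objp);

static int slab_try_empty(kmem_cache_t *cachep, struct slab *slabp)
{
	int i, busy = 0;

	for (i = 0; i < cachep->num; i++) {
		void *objp = slabp->s_mem + cachep->objsize * i;

		if (!obj_is_allocated(slabp, i))	/* hypothetical helper */
			continue;
		if (!cachep->try_free(objp))		/* cache-specific callback */
			busy++;
	}
	return busy == 0;	/* caller may now reap the slab page */
}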
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-01 2:46 ` Christoph Lameter
@ 2005-10-01 21:52 ` Marcelo
2005-10-03 15:24 ` Christoph Lameter
From: Marcelo @ 2005-10-01 21:52 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Marcelo, linux-mm, akpm, dgc, dipankar, mbligh, manfred
On Fri, Sep 30, 2005 at 07:46:31PM -0700, Christoph Lameter wrote:
> On Fri, 30 Sep 2005, Marcelo wrote:
>
> > I don't see any fundamental problems with this approach, are there any?
> > I'll clean it up and proceed to write the inode cache equivalent
> > if there aren't.
>
> Hmm. I think this needs to be some generic functionality in the slab
> allocator. If the allocator determines that the number of entries in a
> page become reasonably low then call a special function provided at
> slab creation time to try to free up the leftover entries.
>
> Something like
>
> int slab_try_free(void *);
>
> ?
>
> return true/false depending on success of attempt to free the entry.
I thought about having a mini-API for this such as "struct slab_reclaim_ops"
implemented by each reclaimable cache, invoked by a generic SLAB function.
Problem is that the locking involved in looking at the SLAB elements is
cache specific (eg dcache_lock for the dcache, inode_lock for the icache,
and so on), so making a generic function seems pretty tricky, ie. you
need cache-specific information in the generic function, which is not so
easily "generifiable", if there's such a word.
> This method may also be useful to attempt to migrate slab pages to
> different nodes. If such a method is available then one can try to free
> all entries in a page relying on their recreation on another node if they
> are needed again.
Yep, haven't thought of that before, but it might be interesting to have
NUMA migration of cache elements.
Additionally, one can try to migrate recently referenced elements instead
of freeing them, moving them to a free slot in some partially used SLAB
(Martin suggested that on IRC).
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-01 21:52 ` Marcelo
@ 2005-10-03 15:24 ` Christoph Lameter
2005-10-03 20:37 ` Manfred Spraul
From: Christoph Lameter @ 2005-10-03 15:24 UTC (permalink / raw)
To: Marcelo; +Cc: linux-mm, akpm, dgc, dipankar, mbligh, manfred
On Sat, 1 Oct 2005, Marcelo wrote:
> I thought about having a mini-API for this such as "struct slab_reclaim_ops"
> implemented by each reclaimable cache, invoked by a generic SLAB function.
>
> Problem is that locking involved into looking at the SLAB elements is
> cache specific (eg dcache_lock for the dcache, inode_lock for the icache,
> and so on), so making a generic function seems pretty tricky, ie. you
> need cache specific information in the generic function which is not so
> easily "generifiable", if there's such a word.
The locking could be done by the cache-specific free function. If it
cannot lock, it can simply indicate that the entry is not freeable.
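For instance, a cache-specific free function could use a trylock and simply
report failure when the locks are contended (illustrative sketch only;
dentry_try_free() is hypothetical and the dentry_unused LRU bookkeeping is
left out):

static int dentry_try_free(void *objp)
{
	struct dentry *dentry = objp;

	if (!spin_trylock(&dcache_lock))
		return 0;			/* can't lock -> not freeable */
	spin_lock(&dentry->d_lock);
	if (atomic_read(&dentry->d_count)) {	/* still referenced */
		spin_unlock(&dentry->d_lock);
		spin_unlock(&dcache_lock);
		return 0;
	}
	prune_one_dentry(dentry);	/* drops d_lock, regains dcache_lock */
	spin_unlock(&dcache_lock);
	return 1;
}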
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-03 15:24 ` Christoph Lameter
@ 2005-10-03 20:37 ` Manfred Spraul
2005-10-03 22:17 ` Marcelo Tosatti
From: Manfred Spraul @ 2005-10-03 20:37 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Marcelo, linux-mm, akpm, dgc, dipankar, mbligh
Christoph Lameter wrote:
>On Sat, 1 Oct 2005, Marcelo wrote:
>
>
>
>>I thought about having a mini-API for this such as "struct slab_reclaim_ops"
>>implemented by each reclaimable cache, invoked by a generic SLAB function.
>>
>>
>>
Which functions would be needed?
- lock_cache(): No more alive/dead changes
- objp_is_alive()
- objp_is_killable()
- objp_kill()
I think it would be simpler if the caller must mark the objects as
alive/dead before/after calling kmem_cache_alloc/free: I don't think
it's a good idea to add special case code and branches to the normal
kmem_cache_alloc codepath. And especially: It would mean that
kmem_cache_alloc must perform a slab lookup in each alloc call, this
could be slow.
The slab users could store the alive status somewhere in the object. And
they could set the flag early, e.g. disable alive as soon as an object
is put on the rcu aging list.
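Condensed, that is more or less what the posted patch already does with its
i_am_alive field; a sketch (the real patch sets and clears the marker in
d_alloc()/d_free()):

	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
	dentry->i_am_alive = 0xdeadbeef;	/* mark alive right after allocation */
	...
	dentry->i_am_alive = 0;			/* mark dead before RCU aging */
	call_rcu(&dentry->d_rcu, d_callback);	/* RCU readers may still look at
						 * the object, but it no longer
						 * counts as "alive" */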
The tricky part is lock_cache: is it actually possible to really lock
the dentry cache, or could RCU cause changes at any time?
--
Manfred
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-03 20:37 ` Manfred Spraul
@ 2005-10-03 22:17 ` Marcelo Tosatti
2005-10-04 17:04 ` Manfred Spraul
From: Marcelo Tosatti @ 2005-10-03 22:17 UTC (permalink / raw)
To: Manfred Spraul; +Cc: Christoph Lameter, linux-mm, akpm, dgc, dipankar, mbligh
Hi Manfred,
On Mon, Oct 03, 2005 at 10:37:26PM +0200, Manfred Spraul wrote:
> Christoph Lameter wrote:
>
> >On Sat, 1 Oct 2005, Marcelo wrote:
> >
> >
> >
> >>I thought about having a mini-API for this such as "struct
> >>slab_reclaim_ops" implemented by each reclaimable cache, invoked by a
> >>generic SLAB function.
> >>
> >>
> >>
> Which functions would be needed?
> - lock_cache(): No more alive/dead changes
> - objp_is_alive()
> - objp_is_killable()
> - objp_kill()
Yep something along that line. I'll come up with something more precise
tomorrow.
> I think it would be simpler if the caller must mark the objects as
> alive/dead before/after calling kmem_cache_alloc/free: I don't think
> it's a good idea to add special case code and branches to the normal
> kmem_cache_alloc codepath. And especially: It would mean that
> kmem_cache_alloc must perform a slab lookup in each alloc call, this
> could be slow.
> The slab users could store the alive status somewhere in the object. And
> they could set the flag early, e.g. disable alive as soon as an object
> is put on the rcu aging list.
The "i_am_alive" flag purpose at the moment is to avoid interpreting
uninitialized data (in the dentry cache, the reference counter is bogus
in such case). It was just a quick hack to watch it work, it seemed to
me it could be done within SLAB code.
This information ("liveness" of objects) is managed inside the SLAB
generic code, and it seems to be available already through the
kmembufctl array which is part of the management data, right?
Suppose there's no need for the cache specific functions to be aware of
liveness, ie. its SLAB specific information.
Another issue is synchronization between multiple threads at this
level of the reclaim path. It can be dealt with using PageLock: if the bit is
set, don't bother checking the page, someone else is already doing
so.
You mention
> - lock_cache(): No more alive/dead changes
With the PageLock bit, you can instruct kmem_cache_alloc() to skip partial
but Locked pages (thus avoiding any object allocations within that page).
Hum, what about higher order SLABs?
Well, kmem_cache_alloc() can be a little bit smarter at this point, since
it's already a slow path, no? It's refill time, the per-CPU cache is exhausted...
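A rough sketch of that refill-side check (paraphrased, not the real
cache_alloc_refill() loop; note that for a higher-order slab only the lock bit
of the page containing s_mem is tested here, which is part of the open question
above):

	entry = cachep->lists.slabs_partial.next;
	while (entry != &cachep->lists.slabs_partial) {
		struct slab *slabp = list_entry(entry, struct slab, list);

		entry = entry->next;
		if (PageLocked(virt_to_page(slabp->s_mem)))
			continue;	/* per-page reclaim owns this slab right now */
		/* ... hand out objects from slabp as usual ... */
	}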
As for dead changes (object deletion), they should only happen with the
object specific lock held (dentry->d_lock in dcache's case). Looks
safe.
> The tricky part is lock_cache: is it actually possible to really lock
> the dentry cache, or could RCU cause changes at any time.
dentry->d_lock is a per-object lock guaranteeing synchronization between lookups
and deletion. Lookup of dentries is lockfree, but acquisition of a reference is not:
struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
{
...
rcu_read_lock();
hlist_for_each_rcu(node, head) {
struct dentry *dentry;
struct qstr *qstr;
dentry = hlist_entry(node, struct dentry, d_hash);
if (dentry->d_name.hash != hash)
continue;
if (dentry->d_parent != parent)
continue;
spin_lock(&dentry->d_lock);
/*
* Recheck the dentry after taking the lock - d_move may have
* changed things. Don't bother checking the hash because we're
* about to compare the whole name anyway.
*/
if (dentry->d_parent != parent)
goto next;
Finding dependencies of "pinned" objects, walking the tree downwards in children's
direction in dcache's case, is protected by dcache_lock at the moment. The inode
cache might want to delete the dentry which pins the inode in memory.
Finally, I fail to see the requirement for a global lock as Andrew mentions, even
though locking is tricky and must be carefully checked.
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-03 22:17 ` Marcelo Tosatti
@ 2005-10-04 17:04 ` Manfred Spraul
2005-10-06 16:01 ` Marcelo Tosatti
From: Manfred Spraul @ 2005-10-04 17:04 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: Christoph Lameter, linux-mm, akpm, dgc, dipankar, mbligh
Marcelo Tosatti wrote:
>Hi Manfred,
>
>On Mon, Oct 03, 2005 at 10:37:26PM +0200, Manfred Spraul wrote:
>
>
>>Christoph Lameter wrote:
>>
>>
>>
>>>On Sat, 1 Oct 2005, Marcelo wrote:
>>>
>>>
>>>
>>>
>>>
>>>>I thought about having a mini-API for this such as "struct
>>>>slab_reclaim_ops" implemented by each reclaimable cache, invoked by a
>>>>generic SLAB function.
>>>>
>>>>
>>>>
>>>>
>>>>
>>Which functions would be needed?
>>- lock_cache(): No more alive/dead changes
>>- objp_is_alive()
>>- objp_is_killable()
>>- objp_kill()
>>
>>
>
>Yep something along that line. I'll come up with something more precise
>tomorrow.
>
>
>
>>I think it would be simpler if the caller must mark the objects as
>>alive/dead before/after calling kmem_cache_alloc/free: I don't think
>>it's a good idea to add special case code and branches to the normal
>>kmem_cache_alloc codepath. And especially: It would mean that
>>kmem_cache_alloc must perform a slab lookup in each alloc call, this
>>could be slow.
>>The slab users could store the alive status somewhere in the object. And
>>they could set the flag early, e.g. disable alive as soon as an object
>>is put on the rcu aging list.
>>
>>
>
>The "i_am_alive" flag purpose at the moment is to avoid interpreting
>uninitialized data (in the dentry cache, the reference counter is bogus
>in such case). It was just a quick hack to watch it work, it seemed to
>me it could be done within SLAB code.
>
>This information ("liveness" of objects) is managed inside the SLAB
>generic code, and it seems to be available already through the
>kmembufctl array which is part of the management data, right?
>
>
>
Not really. The array is only updated when the free status reaches the
slab structure, which is quite late.
kmem_cache_free
- puts the object into a per-cpu array. No locking at all, each cpu can
only read it's own array.
- when that array is full, then it's put into a global array (->shared).
- when the global array is full, then the object is marked as free in
the slab structure.
- when all objects from a slab are free, then the slab is placed on the
free slab list
- when there is memory pressure, then the pages from the free slab list
are reclaimed.
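For reference, the layering above corresponds roughly to the 2.6.13
__cache_free() path, stripped of the debug and statistics hooks (simplified
paraphrase, not the verbatim code):

static inline void __cache_free(kmem_cache_t *cachep, void *objp)
{
	struct array_cache *ac = ac_data(cachep);	/* per-cpu array */

	if (likely(ac->avail < ac->limit)) {
		ac_entry(ac)[ac->avail++] = objp;	/* lockless fast path */
		return;
	}
	/* Per-cpu array full: cache_flusharray() pushes a batch into the
	 * shared array and, when that overflows too, into the slab structure
	 * via free_block().  Completely free slabs end up on slabs_free and
	 * only go back to the page allocator under memory pressure
	 * (cache_reap()/kmem_cache_shrink()). */
	cache_flusharray(cachep, ac);
	ac_entry(ac)[ac->avail++] = objp;
}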
>Suppose there's no need for the cache specific functions to be aware of
>liveness, ie. its SLAB specific information.
>
>
>
What about RCU? We have dying objects: Still alive, because someone
might have a pointer to it, but already on the rcu list and will be
released after the next quiescent state. slab can't know that.
>Another issue is synchronization between multiple threads in this
>level of the reclaim path. Can be dealt with PageLock: if the bit is set,
>don't bother checking the page, someone else is already doing
>so.
>
>You mention
>
>
>
>>- lock_cache(): No more alive/dead changes
>>
>>
>
>With the PageLock bit, you can instruct kmem_cache_alloc() to skip partial
>but Locked pages (thus avoiding any object allocations within that page).
>Hum, what about higher order SLABs?
>
>
>
You have misunderstood my question: I was thinking about object
dead/alive changes.
There are two questions: First figure out how many objects from a
certain slab are alive. Then, if it's below a threshold, try to free
them. With this approach, you need lock(), is_objp_alive(), release_objp().
>Well, kmem_cache_alloc() can be a little bit smarter at this point, since
>its already a slow path, no? Its refill time, per-CPU cache is exhausted...
>
>
>
Definitely. The fast path is only kmem_cache_alloc and kmem_cache_free. No
global cache line writes in these functions. They were down to 1
conditional branch and 2-3 cachelines; one of them read-only, the
other(s) read/write, but per-cpu. I'm not sure how much changed with
the NUMA patches, but the non-NUMA case should try to remain simple. And
e.g. looking up the bufctl means an integer division. Just that
instruction could nearly double the runtime of kmem_cache_free().
The shared_array part of cache_flusharray and cache_alloc_refill is
partially fast path: if we slow that down, it will affect packet
routing. The rest is slow path.
--
Manfred
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-04 17:04 ` Manfred Spraul
@ 2005-10-06 16:01 ` Marcelo Tosatti
2005-10-22 1:30 ` Marcelo Tosatti
From: Marcelo Tosatti @ 2005-10-06 16:01 UTC (permalink / raw)
To: Manfred Spraul; +Cc: Christoph Lameter, linux-mm, akpm, dgc, dipankar, mbligh
On Tue, Oct 04, 2005 at 07:04:35PM +0200, Manfred Spraul wrote:
> Marcelo Tosatti wrote:
>
> >Hi Manfred,
> >
> >On Mon, Oct 03, 2005 at 10:37:26PM +0200, Manfred Spraul wrote:
> >
> >
> >>Christoph Lameter wrote:
> >>
> >>
> >>
> >>>On Sat, 1 Oct 2005, Marcelo wrote:
> >>>
> >>>
> >>>
> >>>
> >>>
> >>>>I thought about having a mini-API for this such as "struct
> >>>>slab_reclaim_ops" implemented by each reclaimable cache, invoked by a
> >>>>generic SLAB function.
> >>>>
> >>>>
> >>>>
> >>>>
> >>>>
> >>Which functions would be needed?
> >>- lock_cache(): No more alive/dead changes
> >>- objp_is_alive()
> >>- objp_is_killable()
> >>- objp_kill()
> >>
> >>
> >
> >Yep something along that line. I'll come up with something more precise
> >tomorrow.
> >
> >
> >
> >>I think it would be simpler if the caller must mark the objects as
> >>alive/dead before/after calling kmem_cache_alloc/free: I don't think
> >>it's a good idea to add special case code and branches to the normal
> >>kmem_cache_alloc codepath. And especially: It would mean that
> >>kmem_cache_alloc must perform a slab lookup in each alloc call, this
> >>could be slow.
> >>The slab users could store the alive status somewhere in the object. And
> >>they could set the flag early, e.g. disable alive as soon as an object
> >>is put on the rcu aging list.
> >>
> >>
> >
> >The "i_am_alive" flag purpose at the moment is to avoid interpreting
> >uninitialized data (in the dentry cache, the reference counter is bogus
> >in such case). It was just a quick hack to watch it work, it seemed to
> >me it could be done within SLAB code.
> >
> >This information ("liveness" of objects) is managed inside the SLAB
> >generic code, and it seems to be available already through the
> >kmembufctl array which is part of the management data, right?
> >
> >
> >
> Not really. The array is only updated when the free status reaches the
> slab structure, which is quite late.
That's fine, the usage information inside the array is only going to be used
to avoid interpretation of uninitialized objects. It's safe to say
that unallocated objects will have their corresponding kmem_bufctl array
entry consistent (marked as free) at all times, right?
Actual per-object live/dead information must reside inside the objp itself,
as you suggest, with guaranteed synchronization.
For the dcache it's possible to use the DCACHE_UNHASHED flag (or some other
field which describes validity).
> kmem_cache_free
> - puts the object into a per-cpu array. No locking at all, each cpu can
> only read it's own array.
> - when that array is full, then it's put into a global array (->shared).
> - when the global array is full, then the object is marked as free in
> the slab structure.
> - when all objects from a slab are free, then the slab is placed on the
> free slab list
> - when there is memory pressure, then the pages from the free slab list
> are reclaimed.
>
> >Suppose there's no need for the cache specific functions to be aware of
> >liveness, ie. its SLAB specific information.
> >
> >
> >
> What about RCU? We have dying objects: Still alive, because someone
> might have a pointer to it, but already on the rcu list and will be
> released after the next quiescent state. slab can't know that.
Objects waiting for the next RCU quiescent state cannot have references
attached, and can't be reused either. When they reach the RCU list
they are already invalid (DCACHE_UNHASHED in dcache's case).
The only references they can have at this point is against the list_head
fields.
> >Another issue is synchronization between multiple threads in this
> >level of the reclaim path. Can be dealt with PageLock: if the bit is set,
> >don't bother checking the page, someone else is already doing
> >so.
> >
> >You mention
> >
> >
> >
> >>- lock_cache(): No more alive/dead changes
> >>
> >>
> >
> >With the PageLock bit, you can instruct kmem_cache_alloc() to skip partial
> >but Locked pages (thus avoiding any object allocations within that page).
> >Hum, what about higher order SLABs?
> >
> >
> >
> You have misunderstood my question: I was thinking about object
> dead/alive changes.
> There are two questions: First figure out how many objects from a
> certain slab are alive. Then, if it's below a threshold, try to free
> them. With this approach, you need lock(), is_objp_alive(), release_objp().
I'm thinking over this, will be sending something soon.
> >Well, kmem_cache_alloc() can be a little bit smarter at this point, since
> >its already a slow path, no? Its refill time, per-CPU cache is exhausted...
> >
> >
> >
> Definitively. Fast path is only kmem_cache_alloc and kmem_cache_free. No
> global cache line writes in these functions. They were down to 1
> conditional branch and 2-3 cachelines, One of them read-only, the
> other(s) are read/write, but per-cpu. I'm not sure how much changed with
> the NUMA patches, but the non-numa case should try to remain simple. And
> e.g. looking up the bufctl means an integer division. Just that
> instruction could nearly double the runtime of kmem_cache_free().
> The shared_array part from cache_flusharray and cache_alloc_refill are
> partially fast path: If we slow that down, then it will affect packet
> routing. The rest is slow path.
OK fine, thanks for all your help up to now!
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-06 16:01 ` Marcelo Tosatti
@ 2005-10-22 1:30 ` Marcelo Tosatti
2005-10-22 6:31 ` Andrew Morton
From: Marcelo Tosatti @ 2005-10-22 1:30 UTC (permalink / raw)
To: Manfred Spraul
Cc: Christoph Lameter, linux-mm, akpm, dgc, dipankar, mbligh, arjanv
It took a while longer than "tomorrow", but I have something.
> I'm thinking over this, will be sending something soon.
I'm testing the following, which implements "slab_reclaim_ops"
and the dcache methods as discussed (inode, dquot, etc. should follow).
It also addresses some problems Al Viro pointed out with reference to the
dcache.
I'm doing a battery of tests on it - I still need to come up with more detailed
statistics (hit ratios on the caches and "freed objects/freed pages"
ratios with different interesting workloads).
Comments?
diff -ur -p --exclude-from=linux-2.6.13.3.slab/Documentation/dontdiff linux-2.6.13.3.orig/fs/dcache.c linux-2.6.13.3.slab/fs/dcache.c
--- linux-2.6.13.3.orig/fs/dcache.c 2005-10-03 18:27:35.000000000 -0500
+++ linux-2.6.13.3.slab/fs/dcache.c 2005-10-21 17:47:38.000000000 -0500
@@ -44,7 +44,8 @@ static seqlock_t rename_lock __cacheline
EXPORT_SYMBOL(dcache_lock);
-static kmem_cache_t *dentry_cache;
+kmem_cache_t *dentry_cache;
+
#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
@@ -84,6 +85,7 @@ static void d_callback(struct rcu_head *
*/
static void d_free(struct dentry *dentry)
{
+ dentry->d_flags = 0;
if (dentry->d_op && dentry->d_op->d_release)
dentry->d_op->d_release(dentry);
call_rcu(&dentry->d_rcu, d_callback);
@@ -363,7 +365,7 @@ restart:
* removed.
* Called with dcache_lock, drops it and then regains.
*/
-static inline void prune_one_dentry(struct dentry * dentry)
+inline void prune_one_dentry(struct dentry * dentry)
{
struct dentry * parent;
@@ -390,13 +392,17 @@ static inline void prune_one_dentry(stru
* This function may fail to free any resources if
* all the dentries are in use.
*/
+
+unsigned long long check_slab_page(void *);
+int free_slab_page(void *objp, unsigned long long bitmap);
-static void prune_dcache(int count)
+static void __prune_dcache(int count, int page_scan)
{
spin_lock(&dcache_lock);
for (; count ; count--) {
struct dentry *dentry;
struct list_head *tmp;
+ unsigned long long bitmap;
cond_resched_lock(&dcache_lock);
@@ -426,11 +432,238 @@ static void prune_dcache(int count)
spin_unlock(&dentry->d_lock);
continue;
}
+
+ if (page_scan) {
+ /*XXX: dcache_lock guarantees dentry won't vanish?*/
+ spin_unlock(&dentry->d_lock);
+ if ((bitmap = check_slab_page(dentry))) {
+ if (free_slab_page(dentry, bitmap))
+ continue;
+ }
+ spin_lock(&dentry->d_lock);
+ if (atomic_read(&dentry->d_count)) {
+ /* keep it off the dentry_unused list. */
+ spin_unlock(&dentry->d_lock);
+ continue;
+ }
+ /* if the aggregate freeing fails we proceed
+ * to free the single dentry as usual.
+ */
+ }
+
prune_one_dentry(dentry);
}
spin_unlock(&dcache_lock);
}
+static void prune_dcache(int count)
+{
+ __prune_dcache(count, 1);
+}
+
+static inline int dentry_negative(struct dentry *dentry)
+{
+ return (dentry->d_inode == NULL);
+}
+
+#define MAX_CHILD_REAP 32
+
+int dir_check_freeable(struct dentry *parent)
+{
+ struct dentry *this_parent = parent;
+ struct list_head *next;
+ unsigned int int_array[32];
+ unsigned int *int_array_ptr = (unsigned int *)&int_array;
+ unsigned int nr_children, ret;
+
+ ret = nr_children = 0;
+ memset(&int_array, 0, sizeof(int_array));
+
+ if (list_empty(&this_parent->d_subdirs))
+ BUG();
+repeat:
+ next = this_parent->d_subdirs.next;
+resume:
+ while (next != &this_parent->d_subdirs) {
+ struct list_head *tmp = next;
+ struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
+
+ if (!(virt_addr_valid(next)))
+ BUG();
+
+ next = tmp->next;
+
+ if (!list_empty(&dentry->d_subdirs)) {
+ this_parent = dentry;
+ /* increase the counter */
+ *int_array_ptr = *int_array_ptr+1;
+ /* move to next array position */
+ int_array_ptr++;
+ if (int_array_ptr >= (unsigned int *)&int_array+(sizeof(int_array)/sizeof(int)))
+ BUG();
+ *int_array_ptr = 0;
+ nr_children++;
+ goto repeat;
+ }
+ /* Pinned or negative dentry? */
+ if (!atomic_read(&dentry->d_count) && !dentry_negative(dentry)) {
+ *int_array_ptr = *int_array_ptr+1;
+ nr_children++;
+ } else
+ /* unfreeable dentry, bail out */
+ goto out;
+ }
+
+ /*
+ * All done at this level ... ascend and resume the search.
+ */
+ if (this_parent != parent) {
+ unsigned int val = *int_array_ptr;
+ /* does this directory have any additional ref? */
+ if (atomic_read(&this_parent->d_count) != val)
+ return 0;
+ int_array_ptr--;
+ if (int_array_ptr < (unsigned int*)&int_array)
+ BUG();
+
+ next = this_parent->d_child.next;
+ this_parent = this_parent->d_parent;
+ goto resume;
+ }
+
+ if (int_array_ptr != (unsigned int*)&int_array) {
+ printk("int array pointer differs: ptr:%p - &array:%p\n",
+ int_array_ptr, &int_array);
+ BUG();
+ }
+
+ if (nr_children < MAX_CHILD_REAP)
+ if (atomic_read(&parent->d_count) == *int_array_ptr)
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * XXX: what are the consequences of acquiring the lock of
+ * a free object? Can some other codepath race and try to
+ * use the dentry assuming it is free while we "hold" the
+ * lock here?
+ *
+ * Since the reading of protected dentry->d_flags is performed
+ * locklessly, we might be reading stale data.
+ *
+ * Does it need a memory barrier to synchronize with d_free()'s
+ * DCACHE_INUSE assignment?
+ *
+ */
+int dcache_objp_lock(void *obj)
+{
+ struct dentry *dentry = (struct dentry *)obj;
+
+ if (((dentry->d_flags & DCACHE_INUSE) == DCACHE_INUSE)) {
+ spin_lock(&dentry->d_lock);
+ return 1;
+ }
+ return 0;
+}
+
+int dcache_objp_unlock(void *obj)
+{
+ struct dentry *dentry = (struct dentry *)obj;
+ spin_unlock(&dentry->d_lock);
+ return 1;
+}
+
+/*
+ * dcache_lock guarantees that dentry and children will not
+ * vanish under us.
+ */
+int dcache_objp_is_freeable(void *obj)
+{
+ int ret = 1;
+ struct dentry *dentry = (struct dentry*)obj;
+
+ if (dentry->d_flags & (DCACHE_UNHASHED|DCACHE_DISCONNECTED))
+ return 0;
+
+ if (!((dentry->d_flags & DCACHE_INUSE) == DCACHE_INUSE))
+ return 0;
+
+ if (dentry_negative(dentry))
+ return 0;
+
+ if (atomic_read(&dentry->d_count)) {
+ ret = 0;
+ if (!list_empty(&dentry->d_subdirs))
+ ret = dir_check_freeable(dentry);
+ }
+ return ret;
+}
+
+/*
+ * dentry_free_child - attempt to free children of a given dentry.
+ * Caller holds an additional reference to it which is released here.
+ */
+int dentry_free_child(struct dentry *dentry)
+{
+ int ret = 1;
+
+ if (dentry->d_inode == NULL)
+ BUG();
+
+ if (!list_empty(&dentry->d_subdirs)) {
+ spin_unlock(&dcache_lock);
+ shrink_dcache_parent(dentry);
+ spin_lock(&dcache_lock);
+ }
+
+ spin_lock(&dentry->d_lock);
+ atomic_dec(&dentry->d_count);
+ if (atomic_read(&dentry->d_count))
+ ret = 0;
+ return ret;
+}
+
+int dcache_objp_release(void *obj)
+{
+ struct dentry *dentry = (struct dentry*)obj;
+ int ret = 0;
+
+ /* no additional references? nuke it */
+ if (!atomic_read(&dentry->d_count) ) {
+ if (!list_empty(&dentry->d_lru)) {
+ dentry_stat.nr_unused--;
+ list_del_init(&dentry->d_lru);
+ }
+ ret = 1;
+ prune_one_dentry(dentry);
+ /* otherwise attempt to free children */
+ } else if (!list_empty(&dentry->d_subdirs)) {
+ /* grab a reference to guarantee dir won't vanish */
+ /* XXX: Confirm it is OK to grab an additional ref. here. */
+ atomic_inc(&dentry->d_count);
+ spin_unlock(&dentry->d_lock);
+ if (dentry_free_child(dentry)) {
+ if (!list_empty(&dentry->d_lru)) {
+ dentry_stat.nr_unused--;
+ list_del_init(&dentry->d_lru);
+ }
+ ret = 1;
+ prune_one_dentry(dentry);
+ } else
+ spin_unlock(&dentry->d_lock);
+ }
+ return ret;
+}
+
+struct slab_reclaim_ops dcache_reclaim_ops = {
+ .objp_is_freeable = dcache_objp_is_freeable,
+ .objp_release = dcache_objp_release,
+ .objp_lock = dcache_objp_lock,
+ .objp_unlock = dcache_objp_unlock,
+};
+
/*
* Shrink the dcache for the specified super block.
* This allows us to unmount a device without disturbing
@@ -642,7 +875,7 @@ void shrink_dcache_parent(struct dentry
int found;
while ((found = select_parent(parent)) != 0)
- prune_dcache(found);
+ __prune_dcache(found, 0);
}
/**
@@ -680,7 +913,7 @@ void shrink_dcache_anon(struct hlist_hea
}
}
spin_unlock(&dcache_lock);
- prune_dcache(found);
+ __prune_dcache(found, 0);
} while(found);
}
@@ -742,7 +975,7 @@ struct dentry *d_alloc(struct dentry * p
dname[name->len] = 0;
atomic_set(&dentry->d_count, 1);
- dentry->d_flags = DCACHE_UNHASHED;
+ dentry->d_flags = DCACHE_UNHASHED|DCACHE_INUSE;
spin_lock_init(&dentry->d_lock);
dentry->d_inode = NULL;
dentry->d_parent = NULL;
@@ -1689,6 +1922,8 @@ static void __init dcache_init(unsigned
set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory);
+ slab_set_reclaim_ops(dentry_cache, &dcache_reclaim_ops);
+
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
return;
diff -ur -p --exclude-from=linux-2.6.13.3.slab/Documentation/dontdiff linux-2.6.13.3.orig/fs/inode.c linux-2.6.13.3.slab/fs/inode.c
--- linux-2.6.13.3.orig/fs/inode.c 2005-10-03 18:27:35.000000000 -0500
+++ linux-2.6.13.3.slab/fs/inode.c 2005-10-20 17:17:56.000000000 -0500
@@ -97,7 +97,7 @@ DECLARE_MUTEX(iprune_sem);
*/
struct inodes_stat_t inodes_stat;
-static kmem_cache_t * inode_cachep;
+kmem_cache_t * inode_cachep;
static struct inode *alloc_inode(struct super_block *sb)
{
diff -ur -p --exclude-from=linux-2.6.13.3.slab/Documentation/dontdiff linux-2.6.13.3.orig/include/linux/dcache.h linux-2.6.13.3.slab/include/linux/dcache.h
--- linux-2.6.13.3.orig/include/linux/dcache.h 2005-10-03 18:27:35.000000000 -0500
+++ linux-2.6.13.3.slab/include/linux/dcache.h 2005-10-20 17:18:24.000000000 -0500
@@ -155,6 +155,7 @@ d_iput: no no no yes
#define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */
#define DCACHE_UNHASHED 0x0010
+#define DCACHE_INUSE 0xdbca0000
extern spinlock_t dcache_lock;
diff -ur -p --exclude-from=linux-2.6.13.3.slab/Documentation/dontdiff linux-2.6.13.3.orig/include/linux/slab.h linux-2.6.13.3.slab/include/linux/slab.h
--- linux-2.6.13.3.orig/include/linux/slab.h 2005-10-03 18:27:35.000000000 -0500
+++ linux-2.6.13.3.slab/include/linux/slab.h 2005-10-20 17:18:27.000000000 -0500
@@ -76,6 +76,14 @@ struct cache_sizes {
extern struct cache_sizes malloc_sizes[];
extern void *__kmalloc(size_t, unsigned int __nocast);
+struct slab_reclaim_ops {
+ int (*objp_is_freeable)(void *objp);
+ int (*objp_release)(void *objp);
+ int (*objp_lock)(void *objp);
+ int (*objp_unlock)(void *objp);
+};
+extern int slab_set_reclaim_ops(kmem_cache_t *, struct slab_reclaim_ops *);
+
static inline void *kmalloc(size_t size, unsigned int __nocast flags)
{
if (__builtin_constant_p(size)) {
diff -ur -p --exclude-from=linux-2.6.13.3.slab/Documentation/dontdiff linux-2.6.13.3.orig/mm/slab.c linux-2.6.13.3.slab/mm/slab.c
--- linux-2.6.13.3.orig/mm/slab.c 2005-10-03 18:27:35.000000000 -0500
+++ linux-2.6.13.3.slab/mm/slab.c 2005-10-21 17:03:44.000000000 -0500
@@ -1,4 +1,5 @@
/*
+ *
* linux/mm/slab.c
* Written by Mark Hemment, 1996/97.
* (markhe@nextd.demon.co.uk)
@@ -93,6 +94,8 @@
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
+#include <linux/proc_fs.h>
+#include <linux/pagemap.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -190,7 +193,7 @@
*/
#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
-#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
+#define BUFCTL_INUSE (((kmem_bufctl_t)(~0U))-1)
#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2)
/* Max number of objs-per-slab for caches which use off-slab slabs.
@@ -327,6 +330,7 @@ struct kmem_cache_s {
kmem_cache_t *slabp_cache;
unsigned int slab_size;
unsigned int dflags; /* dynamic flags */
+ struct slab_reclaim_ops *reclaim_ops;
/* constructor func */
void (*ctor)(void *, kmem_cache_t *, unsigned long);
@@ -574,6 +578,266 @@ static void free_block(kmem_cache_t* cac
static void enable_cpucache (kmem_cache_t *cachep);
static void cache_reap (void *unused);
+int slab_set_reclaim_ops(kmem_cache_t *cachep, struct slab_reclaim_ops *ops)
+{
+ cachep->reclaim_ops = ops;
+ return 1;
+}
+
+static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
+{
+ return (kmem_bufctl_t *)(slabp+1);
+}
+
+/*
+ * Cache the used/free status from the slabbufctl management structure
+ * in a bitmap to avoid further cachep->spinlock locking.
+ *
+ * Using this cached information guarantees that the freeing routine
+ * won't attempt to interpret uninitialized objects. It however does
+ * not guarantee that it won't interpret freed objects (since an used
+ * object might be freed by another CPU without notification).
+ *
+ * Appropriate locking is required (either global or per-object, depending
+ * on cache internals) to verify liveness with accuracy.
+ *
+ */
+unsigned long long slab_free_status(kmem_cache_t *cachep, struct slab *slabp)
+{
+ unsigned long long bitmap = 0;
+ int i;
+
+ if (cachep->num > sizeof(unsigned long long)*8)
+ BUG();
+
+ spin_lock_irq(&cachep->spinlock);
+ for(i=0; i < cachep->num ; i++) {
+ if (slab_bufctl(slabp)[i] == BUFCTL_INUSE)
+ set_bit(i, (unsigned long *)&bitmap);
+ }
+ spin_unlock_irq(&cachep->spinlock);
+
+ return bitmap;
+}
+
+inline int objp_inuse (kmem_cache_t *cachep, struct slab *slabp, unsigned long long *bitmap, void *objp)
+{
+ int objnr = (objp - slabp->s_mem) / cachep->objsize;
+
+ return test_bit(objnr, (unsigned long *)bitmap);
+}
+
+int slab_free_attempt = 0;
+int slab_free_success = 0;
+
+/*
+ * check_slab_page - check if the SLAB container of a given object is freeable.
+ * @objp: object which resides in the SLAB.
+ */
+unsigned long long check_slab_page(void *objp)
+{
+ struct page *page;
+ struct slab *slabp;
+ kmem_cache_t *cachep;
+ struct slab_reclaim_ops *ops;
+ int i;
+ unsigned long long bitmap;
+
+ page = virt_to_page(objp);
+ slabp = GET_PAGE_SLAB(page);
+ cachep = GET_PAGE_CACHE(page);
+ ops = cachep->reclaim_ops;
+
+ if (!ops)
+ BUG();
+ if (!PageSlab(page))
+ BUG();
+ if (slabp->s_mem != (page_address(page) + slabp->colouroff))
+ BUG();
+
+ if (PageLocked(page))
+ return 0;
+
+ /*
+ * XXX: acquires cachep->lock with cache specific lock held.
+ * Is it guaranteed that no code holding cachep->lock will
+ * attempt to grab the cache specific locks? (AB-BA deadlock)
+ */
+ bitmap = slab_free_status(cachep, slabp);
+
+ for(i=0; i < cachep->num ; i++) {
+ void *objn = slabp->s_mem + cachep->objsize * i;
+
+ if (!objp_inuse(cachep, slabp, &bitmap, objn))
+ continue;
+
+ /* XXX: It might be OK to do lockless reading?
+ * After all the object is rechecked again
+ * holding appropriate locks during freeing pass.
+ * It depends on the underlying cache.
+ */
+ if (ops->objp_lock && !ops->objp_lock(objn))
+ continue;
+
+ if (!ops->objp_is_freeable(objn)) {
+ if (ops->objp_unlock)
+ ops->objp_unlock(objn);
+ break;
+ }
+ if (ops->objp_unlock)
+ ops->objp_unlock(objn);
+ }
+
+ slab_free_attempt++;
+
+ if (i == cachep->num)
+ return 1;
+ return 0;
+}
+
+/*
+ * free_slab_page - attempt to free the SLAB container of a given object.
+ * @objp: object which resides in the SLAB.
+ */
+int free_slab_page(void *objp, unsigned long long bitmap)
+{
+ struct page *page;
+ struct slab *slabp;
+ kmem_cache_t *cachep;
+ int i, ret = 0;
+ struct slab_reclaim_ops *ops;
+
+ page = virt_to_page(objp);
+ slabp = GET_PAGE_SLAB(page);
+ cachep = GET_PAGE_CACHE(page);
+
+ ops = cachep->reclaim_ops;
+
+ if (!ops)
+ BUG();
+ if (!PageSlab(page))
+ BUG();
+ if (slabp->s_mem != (page_address(page) + slabp->colouroff))
+ BUG();
+
+ if (TestSetPageLocked(page))
+ return 0;
+
+ for(i=0; i < cachep->num ; i++) {
+ void *objp = slabp->s_mem + cachep->objsize * i;
+
+ if (!objp_inuse(cachep, slabp, &bitmap, objp))
+ continue;
+
+ if (ops->objp_lock && !ops->objp_lock(objp))
+ continue;
+
+ /* freeable object? */
+ if (!ops->objp_is_freeable(objp)) {
+ if (ops->objp_unlock)
+ ops->objp_unlock(objp);
+ break;
+ }
+ /* release takes care of unlocking the object */
+ ops->objp_release(objp);
+ }
+
+ if (i == cachep->num) {
+ slab_free_success++;
+ ret = 1;
+ }
+ unlock_page(page);
+ return ret;
+}
+
+extern kmem_cache_t *dentry_cache;
+extern kmem_cache_t *inode_cachep;
+
+struct cache_stat {
+ unsigned int free_pages;
+ unsigned int partial_pages;
+ unsigned int partial_freeable;
+ unsigned int full_pages;
+ unsigned int full_freeable;
+};
+
+void cache_retrieve_stats(kmem_cache_t *cachep, struct cache_stat *stat)
+{
+ struct list_head *entry;
+ struct slab *slabp;
+
+ memset(stat, 0, sizeof(struct cache_stat));
+
+ list_for_each(entry,&cachep->lists.slabs_free)
+ stat->free_pages++;
+
+ list_for_each(entry,&cachep->lists.slabs_partial) {
+ slabp = list_entry(entry, struct slab, list);
+ stat->partial_pages++;
+ stat->partial_freeable += check_slab_page(slabp);
+ }
+
+ list_for_each(entry,&cachep->lists.slabs_full) {
+ slabp = list_entry(entry, struct slab, list);
+ stat->full_pages++;
+ stat->full_freeable += check_slab_page(slabp);
+ }
+}
+
+struct proc_dir_entry *slab_stats;
+struct proc_dir_entry *slab_reclaim;
+
+static int print_slab_stats(char *page, char **start,
+ off_t off, int count, int *eof, void *data)
+{
+
+ struct cache_stat stat;
+ int len;
+
+ cache_retrieve_stats(dentry_cache, &stat);
+
+ len = sprintf(page, "dentry_cache free:%u partial:%u partial_f:%u full:%u full_f:%u\n", stat.free_pages, stat.partial_pages, stat.partial_freeable, stat.full_pages, stat.full_freeable);
+
+ cache_retrieve_stats(inode_cachep, &stat);
+
+ len += sprintf(page+len, "inode_cache free:%u partial:%u partial_f:%u full:%u full_f:%u\n", stat.free_pages, stat.partial_pages, stat.partial_freeable, stat.full_pages, stat.full_freeable);
+
+ return len;
+}
+
+static int print_slab_reclaim(char *page, char **start,
+ off_t off, int count, int *eof, void *data)
+{
+ int len;
+
+ len = sprintf(page, "slab_free_attempt:%d slab_free_success:%d\n",
+ slab_free_attempt, slab_free_success);
+ return len;
+}
+
+int __init init_slab_stats(void)
+{
+ slab_stats = create_proc_read_entry("slab_stats", 0644, NULL,
+ print_slab_stats, NULL);
+ if (slab_stats == NULL)
+ printk(KERN_ERR "failure to create slab_stats!\n");
+ else
+ printk(KERN_ERR "success creating slab_stats!\n");
+
+ slab_stats = create_proc_read_entry("slab_reclaim", 0644, NULL,
+ print_slab_reclaim, NULL);
+ if (slab_reclaim == NULL)
+ printk(KERN_ERR "failure to create slab_reclaim!\n");
+ else
+ printk(KERN_ERR "success creating slab_reclaim!\n");
+
+ slab_stats->owner = THIS_MODULE;
+
+ return 1;
+}
+
+late_initcall(init_slab_stats);
+
static inline void **ac_entry(struct array_cache *ac)
{
return (void**)(ac+1);
@@ -1710,11 +1974,6 @@ static struct slab* alloc_slabmgmt(kmem_
return slabp;
}
-static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
-{
- return (kmem_bufctl_t *)(slabp+1);
-}
-
static void cache_init_objs(kmem_cache_t *cachep,
struct slab *slabp, unsigned long ctor_flags)
{
@@ -2054,9 +2313,9 @@ retry:
slabp->inuse++;
next = slab_bufctl(slabp)[slabp->free];
-#if DEBUG
- slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
-#endif
+
+ slab_bufctl(slabp)[slabp->free] = BUFCTL_INUSE;
+
slabp->free = next;
}
check_slabp(cachep, slabp);
@@ -2193,7 +2452,7 @@ static void free_block(kmem_cache_t *cac
objnr = (objp - slabp->s_mem) / cachep->objsize;
check_slabp(cachep, slabp);
#if DEBUG
- if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
+ if (slab_bufctl(slabp)[objnr] != BUFCTL_INUSE) {
printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n",
cachep->name, objp);
BUG();
@@ -2422,9 +2681,7 @@ got_slabp:
slabp->inuse++;
next = slab_bufctl(slabp)[slabp->free];
-#if DEBUG
- slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
-#endif
+ slab_bufctl(slabp)[slabp->free] = BUFCTL_INUSE;
slabp->free = next;
check_slabp(cachep, slabp);
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-22 1:30 ` Marcelo Tosatti
@ 2005-10-22 6:31 ` Andrew Morton
2005-10-22 9:21 ` Arjan van de Ven
` (2 more replies)
From: Andrew Morton @ 2005-10-22 6:31 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: manfred, clameter, linux-mm, dgc, dipankar, mbligh, arjanv
Marcelo Tosatti <marcelo.tosatti@cyclades.com> wrote:
>
> ...
> +unsigned long long slab_free_status(kmem_cache_t *cachep, struct slab *slabp)
> +{
> + unsigned long long bitmap = 0;
> + int i;
> +
> + if (cachep->num > sizeof(unsigned long long)*8)
> + BUG();
> +
> + spin_lock_irq(&cachep->spinlock);
> + for(i=0; i < cachep->num ; i++) {
> + if (slab_bufctl(slabp)[i] == BUFCTL_INUSE)
> + set_bit(i, (unsigned long *)&bitmap);
> + }
> + spin_unlock_irq(&cachep->spinlock);
> +
> + return bitmap;
> +}
What if there are more than 64 objects per page?
> + if (!ops)
> + BUG();
> + if (!PageSlab(page))
> + BUG();
> + if (slabp->s_mem != (page_address(page) + slabp->colouroff))
> + BUG();
> +
> + if (PageLocked(page))
> + return 0;
There's quite a lot of whitespace breakage btw.
It all seems rather complex.
What about simply compacting the cache by copying freeable dentries?
Something like, in prune_one_dentry():
if (dcache occupancy < 90%) {
new_dentry = alloc_dentry()
*new_dentry = *dentry;
<fix stuff up>
free(dentry);
} else {
free(dentry)
}
?
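A very rough sketch of that idea (compact_one_dentry() is a hypothetical
helper; the "<fix stuff up>" step - rehashing, re-linking d_child/d_alias/d_lru,
repointing children and keeping lockless __d_lookup() walkers away from a
half-moved dentry - is the hard part and is only hinted at in the comments):

static struct dentry *compact_one_dentry(struct dentry *old)
{
	struct dentry *new = kmem_cache_alloc(dentry_cache, GFP_ATOMIC);

	if (!new)
		return old;		/* no memory: fall back to freeing in place */

	*new = *old;			/* copy the object payload */
	/* <fix stuff up>: rehash new, splice it into the parent's d_subdirs,
	 * fix d_alias/d_lru membership, repoint the children's d_parent and
	 * make the switch safe against RCU lookups - none of which is shown. */
	d_free(old);
	return new;
}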
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-22 6:31 ` Andrew Morton
@ 2005-10-22 9:21 ` Arjan van de Ven
2005-10-22 17:08 ` Christoph Lameter
2005-10-23 16:30 ` Marcelo Tosatti
From: Arjan van de Ven @ 2005-10-22 9:21 UTC (permalink / raw)
To: Andrew Morton
Cc: Marcelo Tosatti, manfred, clameter, linux-mm, dgc, dipankar, mbligh
On Fri, Oct 21, 2005 at 11:31:11PM -0700, Andrew Morton wrote:
> Marcelo Tosatti <marcelo.tosatti@cyclades.com> wrote:
> >
> > ...
> > +unsigned long long slab_free_status(kmem_cache_t *cachep, struct slab *slabp)
> > +{
> > + unsigned long long bitmap = 0;
> > + int i;
> > +
> > + if (cachep->num > sizeof(unsigned long long)*8)
> > + BUG();
> > +
> > + spin_lock_irq(&cachep->spinlock);
> > + for(i=0; i < cachep->num ; i++) {
> > + if (slab_bufctl(slabp)[i] == BUFCTL_INUSE)
> > + set_bit(i, (unsigned long *)&bitmap);
> > + }
> > + spin_unlock_irq(&cachep->spinlock);
> > +
> > + return bitmap;
> > +}
>
> What if there are more than 64 objects per page?
the bitops usually work on bigger-than-wordsize things, though...
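For example, the scan could fill a properly sized bitmap instead of a single
unsigned long long (sketch, not from the posted patch; MAX_OBJS_PER_SLAB is
just an illustrative bound):

	DECLARE_BITMAP(objmap, MAX_OBJS_PER_SLAB);	/* unsigned long array */
	int i;

	memset(objmap, 0, sizeof(objmap));
	for (i = 0; i < cachep->num; i++)
		if (slab_bufctl(slabp)[i] == BUFCTL_INUSE)
			set_bit(i, objmap);	/* works for any cachep->num up
						 * to MAX_OBJS_PER_SLAB */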
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-22 6:31 ` Andrew Morton
2005-10-22 9:21 ` Arjan van de Ven
@ 2005-10-22 17:08 ` Christoph Lameter
2005-10-22 17:13 ` ia64 page size (was Re: [PATCH] per-page SLAB freeing (only dcache for now)) Arjan van de Ven
2005-10-22 18:16 ` [PATCH] per-page SLAB freeing (only dcache for now) Manfred Spraul
2005-10-23 16:30 ` Marcelo Tosatti
2 siblings, 2 replies; 16+ messages in thread
From: Christoph Lameter @ 2005-10-22 17:08 UTC (permalink / raw)
To: Andrew Morton
Cc: Marcelo Tosatti, manfred, linux-mm, dgc, dipankar, mbligh, arjanv
On Fri, 21 Oct 2005, Andrew Morton wrote:
> Marcelo Tosatti <marcelo.tosatti@cyclades.com> wrote:
> >
> > ...
> > +unsigned long long slab_free_status(kmem_cache_t *cachep, struct slab *slabp)
> > +{
> > + unsigned long long bitmap = 0;
> > + int i;
> > +
> > + if (cachep->num > sizeof(unsigned long long)*8)
> > + BUG();
> > +
> > + spin_lock_irq(&cachep->spinlock);
> > + for(i=0; i < cachep->num ; i++) {
> > + if (slab_bufctl(slabp)[i] == BUFCTL_INUSE)
> > + set_bit(i, (unsigned long *)&bitmap);
> > + }
> > + spin_unlock_irq(&cachep->spinlock);
> > +
> > + return bitmap;
> > +}
>
> What if there are more than 64 objects per page?
The current worst case is a 16k page size (IA64) and cacheline-sized
objects (128 bytes) (hmm.. they could be even smaller if the arch overrides
SLAB_HWCACHE_ALIGN), yielding a maximum of 128 entries per page.
There have been versions of Linux for IA64 out there with a 64k page size,
and there is the possibility that we need to switch to 64k as the standard
next year, when we may have single OS images running with more than 16TB
of RAM.
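For reference, the arithmetic behind those numbers -- a minimal sketch that ignores the per-slab management overhead, which makes the real counts slightly smaller:
#define OBJ_SIZE 128					/* one cacheline */
static const unsigned int objs_16k = 16384 / OBJ_SIZE;	/* 128 per 16k page */
static const unsigned int objs_64k = 65536 / OBJ_SIZE;	/* 512 per 64k page */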
* ia64 page size (was Re: [PATCH] per-page SLAB freeing (only dcache for now))
2005-10-22 17:08 ` Christoph Lameter
@ 2005-10-22 17:13 ` Arjan van de Ven
2005-10-22 18:16 ` [PATCH] per-page SLAB freeing (only dcache for now) Manfred Spraul
1 sibling, 0 replies; 16+ messages in thread
From: Arjan van de Ven @ 2005-10-22 17:13 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andrew Morton, Marcelo Tosatti, manfred, linux-mm, dgc, dipankar, mbligh
On Sat, Oct 22, 2005 at 10:08:52AM -0700, Christoph Lameter wrote:
> There have been versions of Linux for IA64 out there with a 64k page size,
> and there is the possibility that we need to switch to 64k as the standard
> next year, when we may have single OS images running with more than 16TB
> of RAM.
it's a kernel config option, and it has to remain that way; the page size is
a userspace-visible property and although apps aren't supposed to care..
some do. So distros that use 16KB right now (the RH ones) will need to keep
the 16KB page size...
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-22 17:08 ` Christoph Lameter
2005-10-22 17:13 ` ia64 page size (was Re: [PATCH] per-page SLAB freeing (only dcache for now)) Arjan van de Ven
@ 2005-10-22 18:16 ` Manfred Spraul
2005-10-23 18:41 ` Marcelo Tosatti
1 sibling, 1 reply; 16+ messages in thread
From: Manfred Spraul @ 2005-10-22 18:16 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andrew Morton, Marcelo Tosatti, linux-mm, dgc, dipankar, mbligh, arjanv
Christoph Lameter wrote:
>The current worst case is a 16k page size (IA64) and cacheline-sized
>objects (128 bytes) (hmm.. they could be even smaller if the arch overrides
>SLAB_HWCACHE_ALIGN), yielding a maximum of 128 entries per page.
>
>
>
What about biovec-1? On i386 and 2.6.13 from Fedora, it contains 226
entries. And revoke_table contains 290 entries.
--
Manfred
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-22 6:31 ` Andrew Morton
2005-10-22 9:21 ` Arjan van de Ven
2005-10-22 17:08 ` Christoph Lameter
@ 2005-10-23 16:30 ` Marcelo Tosatti
2 siblings, 0 replies; 16+ messages in thread
From: Marcelo Tosatti @ 2005-10-23 16:30 UTC (permalink / raw)
To: Andrew Morton; +Cc: manfred, clameter, linux-mm, dgc, dipankar, mbligh, arjanv
Hi Andrew!
On Fri, Oct 21, 2005 at 11:31:11PM -0700, Andrew Morton wrote:
> Marcelo Tosatti <marcelo.tosatti@cyclades.com> wrote:
> >
> > ...
> > +unsigned long long slab_free_status(kmem_cache_t *cachep, struct slab *slabp)
> > +{
> > + unsigned long long bitmap = 0;
> > + int i;
> > +
> > + if (cachep->num > sizeof(unsigned long long)*8)
> > + BUG();
> > +
> > + spin_lock_irq(&cachep->spinlock);
> > + for(i=0; i < cachep->num ; i++) {
> > + if (slab_bufctl(slabp)[i] == BUFCTL_INUSE)
> > + set_bit(i, (unsigned long *)&bitmap);
> > + }
> > + spin_unlock_irq(&cachep->spinlock);
> > +
> > + return bitmap;
> > +}
>
> What if there are more than 64 objects per page?
I don't think there are any reclaimable caches with more than 64 objects per
page at the moment, but if that turns out to be the case we just need to use
an appropriately larger bitmap, as Arjan mentioned.
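To make that concrete, a minimal sketch of the variant with a caller-supplied bitmap (illustration only, not the patch; the caller would pass something like DECLARE_BITMAP(map, MAX_OBJS) with at least cachep->num bits):
/* Sketch: fill a caller-provided bitmap instead of returning a fixed
 * 64-bit value, so caches with more than 64 objects per page work too. */
void slab_inuse_bitmap(kmem_cache_t *cachep, struct slab *slabp,
		       unsigned long *map)
{
	int i;
	bitmap_zero(map, cachep->num);
	spin_lock_irq(&cachep->spinlock);
	for (i = 0; i < cachep->num; i++) {
		if (slab_bufctl(slabp)[i] == BUFCTL_INUSE)
			set_bit(i, map);
	}
	spin_unlock_irq(&cachep->spinlock);
}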
> > + if (!ops)
> > + BUG();
> > + if (!PageSlab(page))
> > + BUG();
> > + if (slabp->s_mem != (page_address(page) + slabp->colouroff))
> > + BUG();
> > +
> > + if (PageLocked(page))
> > + return 0;
>
> There's quite a lot of whitespace breakage btw.
Sorry about that! Silly.
> It all seems rather complex.
Hiding the locking behind an API does not sound very pleasant to me, but
other than that it seems quite straightforward...
What worries you?
> What about simply compacting the cache by copying freeable dentries?
> Something like, in prune_one_dentry():
>
> if (dcache occupancy < 90%) {
> new_dentry = alloc_dentry()
> *new_dentry = *dentry;
> <fix stuff up>
> free(dentry);
> } else {
> free(dentry)
> }
>
> ?
Compacting the dcache sounds like a good (and simpler) way to help reduce
fragmentation, but it is complementary to the aggregate freeing of pages
proposed here.
The major issue I believe this patch tries to attack is unused-list
ordering. Consecutive objects in the unused list are not necessarily grouped
by their containing page (actually, it is easy to come up with several
reasons for them _not_ to be optimally ordered for reclaim, e.g.
multi-user/multi-task workloads).
In scenarios where the unused-list ordering is not close to "page container
order", the VM may have to blindly reclaim large numbers of dentries in the
hope of eventually freeing full pages. It completely lacks the knowledge
that only full pages make real progress.
Recently David Chinner reported a case on a very large memory box which
demonstrates the issue very clearly:
http://marc.theaimsgroup.com/?l=linux-mm&m=112674700612691&w=2
While compacting the cache as you suggest would certainly help his specific
case, the most effective measure seems to be freeing full pages
immediately.
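To illustrate what "free full pages immediately" needs to know, a simplified sketch of the per-page freeability test (invented helper name; field names as in 2.6.13 mm/slab.c; locking against the cache spinlock and dcache_lock omitted for brevity):
/* Sketch only: a page's worth of dentries can be reclaimed right away
 * iff every allocated object on the slab is an unused dentry. */
static int dentry_slab_is_freeable(kmem_cache_t *cachep, struct slab *slabp)
{
	int i;
	for (i = 0; i < cachep->num; i++) {
		struct dentry *d = slabp->s_mem + i * cachep->objsize;
		if (slab_bufctl(slabp)[i] != BUFCTL_INUSE)
			continue;		/* slot already free */
		if (atomic_read(&d->d_count))
			return 0;		/* dentry still in use */
	}
	return 1;
}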
I will proceed with some testing this week; suggestions are welcome.
* Re: [PATCH] per-page SLAB freeing (only dcache for now)
2005-10-22 18:16 ` [PATCH] per-page SLAB freeing (only dcache for now) Manfred Spraul
@ 2005-10-23 18:41 ` Marcelo Tosatti
0 siblings, 0 replies; 16+ messages in thread
From: Marcelo Tosatti @ 2005-10-23 18:41 UTC (permalink / raw)
To: Manfred Spraul
Cc: Christoph Lameter, Andrew Morton, linux-mm, dgc, dipankar,
mbligh, arjanv
On Sat, Oct 22, 2005 at 08:16:13PM +0200, Manfred Spraul wrote:
> Christoph Lameter wrote:
>
> >The current worst case is a 16k page size (IA64) and cacheline-sized
> >objects (128 bytes) (hmm.. they could be even smaller if the arch overrides
> >SLAB_HWCACHE_ALIGN), yielding a maximum of 128 entries per page.
> >
> >
> >
> What about biovec-1? On i386 and 2.6.13 from Fedora, it contains 226
> entries. And revoke_table contains 290 entries.
Neither of them is reclaimable, however, right?
[marcelo@logos linux-2.6.13]$ find . -type f -exec grep -l set_shrinker {} \;
./fs/dcache.c
./fs/dquot.c
./fs/inode.c
./fs/mbcache.c
./fs/xfs/linux-2.6/kmem.h
If the size of the bitmap that caches the slab bufctl data (which holds
the dead/alive information) ends up being a problem, it's possible to:
- increase the bitmap size somehow
- drop the bitmap, acquiring the cache's spinlock and checking the bufctls
directly (see the sketch below)
Or, as a last resort, drop the slab bufctl optimization completely and use
cache-internal information to obtain the dead/alive status.
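A minimal sketch of the "check directly" option (invented helper; it simply reads the bufctl for one object under the cache's spinlock, so nothing needs to be cached in a bitmap at all):
int slab_obj_in_use(kmem_cache_t *cachep, struct slab *slabp, int objnr)
{
	int in_use;
	spin_lock_irq(&cachep->spinlock);
	in_use = (slab_bufctl(slabp)[objnr] == BUFCTL_INUSE);
	spin_unlock_irq(&cachep->spinlock);
	return in_use;
}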
Thread overview: 16+ messages
2005-09-30 19:37 [PATCH] per-page SLAB freeing (only dcache for now) Marcelo
2005-10-01 2:46 ` Christoph Lameter
2005-10-01 21:52 ` Marcelo
2005-10-03 15:24 ` Christoph Lameter
2005-10-03 20:37 ` Manfred Spraul
2005-10-03 22:17 ` Marcelo Tosatti
2005-10-04 17:04 ` Manfred Spraul
2005-10-06 16:01 ` Marcelo Tosatti
2005-10-22 1:30 ` Marcelo Tosatti
2005-10-22 6:31 ` Andrew Morton
2005-10-22 9:21 ` Arjan van de Ven
2005-10-22 17:08 ` Christoph Lameter
2005-10-22 17:13 ` ia64 page size (was Re: [PATCH] per-page SLAB freeing (only dcache for now)) Arjan van de Ven
2005-10-22 18:16 ` [PATCH] per-page SLAB freeing (only dcache for now) Manfred Spraul
2005-10-23 18:41 ` Marcelo Tosatti
2005-10-23 16:30 ` Marcelo Tosatti