linux-mm.kvack.org archive mirror
* Support concurrent local and remote frees and allocs on a slab.
@ 2007-05-05  3:28 Christoph Lameter
  2007-05-06  4:59 ` Christoph Lameter
  2007-05-07 21:50 ` Andrew Morton
  0 siblings, 2 replies; 13+ messages in thread
From: Christoph Lameter @ 2007-05-05  3:28 UTC (permalink / raw)
  To: akpm; +Cc: linux-mm

About 5-10% performance gain on netperf.

[Maybe put this patch at the end of the merge queue? Works fine here but
this is a significant change that may impact stability]

What we do is use the last free field in the page struct (the private
field that was freed up through the compound page flag rework) to set up a
separate per cpu freelist. From that one we can allocate without taking the
slab lock because we check out the complete list of free objects when we
first touch the slab and then mark the slab as completely allocated.
If we have a cpu_freelist then we can also free to that list, provided we
are running on the owning processor, without taking the slab lock.

This allows even concurrent allocations and frees on the same slab using
two mutually exclusive freelists. Allocs and frees from the processor
owning the per cpu slab will bypass the slab lock using the cpu_freelist.
Remote frees will use the slab lock to synchronize and use the freelist
for marking items as free. So local allocs and frees may run concurrently
with remote frees without synchronization.

If the allocator runs out of its per cpu freelist then it will consult
the per slab freelist (which requires the slab lock) and reload the
cpu_freelist if there are objects that were remotely freed.
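
Roughly, the three cases look like this (illustrative sketch only, not the
patch itself; names are as in SLUB, debug handling and the empty-freelist
fallback are omitted, and the remote case shows the existing locked
slab_free path). Note that the free pointer of each object is stored inside
the object itself, at index page->offset in pointer-sized words, so no
separate array is needed:

	/* Local alloc on the owning cpu: irqs off, no slab lock */
	object = page->cpu_freelist;
	page->cpu_freelist = object[page->offset];

	/* Local free on the owning cpu: irqs off, no slab lock */
	object[page->offset] = page->cpu_freelist;
	page->cpu_freelist = object;

	/* Remote free from another cpu: synchronized by the slab lock */
	slab_lock(page);
	object[page->offset] = page->freelist;
	page->freelist = object;
	slab_unlock(page);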

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 include/linux/mm_types.h |    5 ++-
 mm/slub.c                |   67 ++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 59 insertions(+), 13 deletions(-)

Index: slub/include/linux/mm_types.h
===================================================================
--- slub.orig/include/linux/mm_types.h	2007-05-04 20:09:26.000000000 -0700
+++ slub/include/linux/mm_types.h	2007-05-04 20:09:33.000000000 -0700
@@ -50,9 +50,12 @@ struct page {
 	    spinlock_t ptl;
 #endif
 	    struct {			/* SLUB uses */
-		struct page *first_page;	/* Compound pages */
+	    	void **cpu_freelist;		/* Per cpu freelist */
 		struct kmem_cache *slab;	/* Pointer to slab */
 	    };
+	    struct {
+		struct page *first_page;	/* Compound pages */
+	    };
 	};
 	union {
 		pgoff_t index;		/* Our offset within mapping. */
Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c	2007-05-04 20:09:26.000000000 -0700
+++ slub/mm/slub.c	2007-05-04 20:14:04.000000000 -0700
@@ -81,10 +81,13 @@
  * PageActive 		The slab is used as a cpu cache. Allocations
  * 			may be performed from the slab. The slab is not
  * 			on any slab list and cannot be moved onto one.
+ * 			The cpu slab may have a cpu_freelist in order
+ * 			to optimize allocations and frees on a particular
+ * 			cpu.
  *
  * PageError		Slab requires special handling due to debug
  * 			options set. This moves	slab handling out of
- * 			the fast path.
+ * 			the fast path and disables cpu_freelists.
  */
 
 /*
@@ -857,6 +860,7 @@ static struct page *new_slab(struct kmem
 	set_freepointer(s, last, NULL);
 
 	page->freelist = start;
+	page->cpu_freelist = NULL;
 	page->inuse = 0;
 out:
 	if (flags & __GFP_WAIT)
@@ -1121,6 +1125,23 @@ static void putback_slab(struct kmem_cac
  */
 static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
 {
+	/*
+	 * Merge cpu freelist into freelist. Typically we get here
+	 * because both freelists are empty. So this is unlikely
+	 * to occur.
+	 */
+	while (unlikely(page->cpu_freelist)) {
+		void **object;
+
+		/* Retrieve object from cpu_freelist */
+		object = page->cpu_freelist;
+		page->cpu_freelist = page->cpu_freelist[page->offset];
+
+		/* And put onto the regular freelist */
+		object[page->offset] = page->freelist;
+		page->freelist = object;
+		page->inuse--;
+	}
 	s->cpu_slab[cpu] = NULL;
 	ClearPageActive(page);
 
@@ -1190,22 +1211,33 @@ static void *slab_alloc(struct kmem_cach
 	local_irq_save(flags);
 	cpu = smp_processor_id();
 	page = s->cpu_slab[cpu];
-	if (!page)
+	if (unlikely(!page))
 		goto new_slab;
 
-	slab_lock(page);
-	if (unlikely(node != -1 && page_to_nid(page) != node))
+	if (unlikely(node != -1 && page_to_nid(page) != node)) {
+		slab_lock(page);
 		goto another_slab;
+	}
+
+	if (likely(page->cpu_freelist)) {
+		object = page->cpu_freelist;
+		page->cpu_freelist = object[page->offset];
+		local_irq_restore(flags);
+		return object;
+	}
+
+	slab_lock(page);
 redo:
-	object = page->freelist;
-	if (unlikely(!object))
+	if (!page->freelist)
 		goto another_slab;
-	if (unlikely(PageError(page)))
+	if (PageError(page))
 		goto debug;
 
-have_object:
-	page->inuse++;
-	page->freelist = object[page->offset];
+	/* Reload the cpu freelist while allocating the next object */
+	object = page->freelist;
+	page->cpu_freelist = object[page->offset];
+	page->freelist = NULL;
+	page->inuse = s->objects;
 	slab_unlock(page);
 	local_irq_restore(flags);
 	return object;
@@ -1215,7 +1247,7 @@ another_slab:
 
 new_slab:
 	page = get_partial(s, gfpflags, node);
-	if (likely(page)) {
+	if (page) {
 have_slab:
 		s->cpu_slab[cpu] = page;
 		SetPageActive(page);
@@ -1251,6 +1283,7 @@ have_slab:
 	local_irq_restore(flags);
 	return NULL;
 debug:
+	object = page->freelist;
 	if (!alloc_object_checks(s, page, object))
 		goto another_slab;
 	if (s->flags & SLAB_STORE_USER)
@@ -1261,8 +1294,12 @@ debug:
 			page->freelist);
 		dump_stack();
 	}
+	page->freelist = object[page->offset];
+	page->inuse++;
 	init_object(s, object, 1);
-	goto have_object;
+	slab_unlock(page);
+	local_irq_restore(flags);
+	return object;
 }
 
 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
@@ -1293,6 +1330,12 @@ static void slab_free(struct kmem_cache 
 	unsigned long flags;
 
 	local_irq_save(flags);
+	if (page == s->cpu_slab[smp_processor_id()] && !PageError(page)) {
+		object[page->offset] = page->cpu_freelist;
+		page->cpu_freelist = object;
+		local_irq_restore(flags);
+		return;
+	}
 	slab_lock(page);
 
 	if (unlikely(PageError(page)))

* Re: Support concurrent local and remote frees and allocs on a slab.
  2007-05-05  3:28 Support concurrent local and remote frees and allocs on a slab Christoph Lameter
@ 2007-05-06  4:59 ` Christoph Lameter
  2007-05-06  5:45   ` Christoph Lameter
  2007-05-07 21:50 ` Andrew Morton
  1 sibling, 1 reply; 13+ messages in thread
From: Christoph Lameter @ 2007-05-06  4:59 UTC (permalink / raw)
  To: akpm; +Cc: linux-mm

On Fri, 4 May 2007, Christoph Lameter wrote:

> About 5-10% performance gain on netperf.

Hmmmm... I can take this even further and get another 20% if I take the
critical components of slab_alloc and slab_free and inline them into
kfree, kmem_cache_alloc and friends. I went from 5.8 MB/sec without this
patch to 8 MB/sec with this patch and the rather ugly inlining.

The compiler really creates stupid code and does a lot of stack ops 
because slab_alloc and slab_free use too many variables right now.

We should be able to take this even further if we allow arch code to 
provide ASM versions of the fast path.



* Re: Support concurrent local and remote frees and allocs on a slab.
  2007-05-06  4:59 ` Christoph Lameter
@ 2007-05-06  5:45   ` Christoph Lameter
  2007-05-06 19:24     ` Andrew Morton
  0 siblings, 1 reply; 13+ messages in thread
From: Christoph Lameter @ 2007-05-06  5:45 UTC (permalink / raw)
  To: akpm; +Cc: linux-mm

On Sat, 5 May 2007, Christoph Lameter wrote:

> Hmmmm... I can take this even further and get another 20% if I take the 
> critical components of slab_alloc and slab_free and inline them into
> kfree, kmem_cache_alloc and friends. I went from 5.8MB without this 
> patch to now 8 MB/sec with this patch and the rather ugly inlining.

Hmmm... Nope. That was the effect of screwing up kfree so that no memory 
is ever freed. Interesting that this increases performance...

* Re: Support concurrent local and remote frees and allocs on a slab.
  2007-05-06  5:45   ` Christoph Lameter
@ 2007-05-06 19:24     ` Andrew Morton
  2007-05-07 15:15       ` Christoph Lameter
  2007-05-07 18:39       ` Christoph Lameter
  0 siblings, 2 replies; 13+ messages in thread
From: Andrew Morton @ 2007-05-06 19:24 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm

On Sat, 5 May 2007 22:45:26 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:

> On Sat, 5 May 2007, Christoph Lameter wrote:
> 
> > Hmmmm... I can take this even further and get another 20% if I take the 
> > critical components of slab_alloc and slab_free and inline them into
> > kfree, kmem_cache_alloc and friends. I went from 5.8MB without this 
> > patch to now 8 MB/sec with this patch and the rather ugly inlining.
> 
> Hmmm... Nope. That was the effect of screwing up kfree so that no memory 
> is ever freed. Interesting that this increases performance...

Yes, it is interesting, considering all our lovingly-crafted efforts to
keep that sort of memory hot in the CPU cache.

Or was it netperf-to-localhost?

* Re: Support concurrent local and remote frees and allocs on a slab.
  2007-05-06 19:24     ` Andrew Morton
@ 2007-05-07 15:15       ` Christoph Lameter
  2007-05-07 18:39       ` Christoph Lameter
  1 sibling, 0 replies; 13+ messages in thread
From: Christoph Lameter @ 2007-05-07 15:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm

On Sun, 6 May 2007, Andrew Morton wrote:

> Or was it netperf-to-localhost?

Yes it was to localhost.

* Re: Support concurrent local and remote frees and allocs on a slab.
  2007-05-06 19:24     ` Andrew Morton
  2007-05-07 15:15       ` Christoph Lameter
@ 2007-05-07 18:39       ` Christoph Lameter
  2007-05-07 18:54         ` Andrew Morton
  1 sibling, 1 reply; 13+ messages in thread
From: Christoph Lameter @ 2007-05-07 18:39 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm

On Sun, 6 May 2007, Andrew Morton wrote:

> On Sat, 5 May 2007 22:45:26 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:
> 
> > On Sat, 5 May 2007, Christoph Lameter wrote:
> > 
> > > Hmmmm... I can take this even further and get another 20% if I take the 
> > > critical components of slab_alloc and slab_free and inline them into
> > > kfree, kmem_cache_alloc and friends. I went from 5.8MB without this 
> > > patch to now 8 MB/sec with this patch and the rather ugly inlining.
> > 
> > Hmmm... Nope. That was the effect of screwing up kfree so that no memory 
> > is ever freed. Interesting that this increases performance...
> 
> Yes, it is interesting, considering all our lovingly-crafted efforts to
> keep that sort of memory hot in the CPU cache.

I think the major performance improvement came from removing the overhead of
kfree. Half of the work is gone, so performance goes through the roof.
It also ensures that SLUB never has to deal with partial slabs, which increases
performance further.

What is the problem with 21-mm1, btw? Slab performance for both allocators
dropped from ~6M/sec to ~4.5M/sec.

* Re: Support concurrent local and remote frees and allocs on a slab.
  2007-05-07 18:39       ` Christoph Lameter
@ 2007-05-07 18:54         ` Andrew Morton
  2007-05-07 18:58           ` Christoph Lameter
  0 siblings, 1 reply; 13+ messages in thread
From: Andrew Morton @ 2007-05-07 18:54 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm

On Mon, 7 May 2007 11:39:02 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:

> 
> On Sun, 6 May 2007, Andrew Morton wrote:
> 
> > On Sat, 5 May 2007 22:45:26 -0700 (PDT) Christoph Lameter <clameter@sgi.com> wrote:
> > 
> > > On Sat, 5 May 2007, Christoph Lameter wrote:
> > > 
> > > > Hmmmm... I can take this even further and get another 20% if I take the 
> > > > critical components of slab_alloc and slab_free and inline them into
> > > > kfree, kmem_cache_alloc and friends. I went from 5.8MB without this 
> > > > patch to now 8 MB/sec with this patch and the rather ugly inlining.
> > > 
> > > Hmmm... Nope. That was the effect of screwing up kfree so that no memory 
> > > is ever freed. Interesting that this increases performance...
> > 
> > Yes, it is interesting, considering all our lovingly-crafted efforts to
> > keep that sort of memory hot in the CPU cache.
> 
> I think the major performance improvement was to remove the overhead of 
> kfree. Half of the effort is gone thus performance goes through the roof. 
> Also this insures that SLUB always gets no partial slabs which increases 
> performance further.

Well sure.  But there should have been a performance *decrease* because
every piece of memory we get from slab is now cache-cold.  If slab was
recycling objects, one would expect that to not happen.

So I'm assuming that you have producer and consumer running on separate
CPUs and we don't get any decent cache reuse anyway.

> What is the problem with 21-mm1 btw? slab performance for both allocators 
> dropped from ~6M/sec to ~4.5M/sec

That's news to me.  You're the slab guy ;)

Are you sure the slowdown is due to slab, or did networking break?

* Re: Support concurrent local and remote frees and allocs on a slab.
  2007-05-07 18:54         ` Andrew Morton
@ 2007-05-07 18:58           ` Christoph Lameter
  2007-05-07 20:32             ` Andrew Morton
  0 siblings, 1 reply; 13+ messages in thread
From: Christoph Lameter @ 2007-05-07 18:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm

On Mon, 7 May 2007, Andrew Morton wrote:

> > I think the major performance improvement was to remove the overhead of 
> > kfree. Half of the effort is gone thus performance goes through the roof. 
> > Also this insures that SLUB always gets no partial slabs which increases 
> > performance further.
> 
> Well sure.  But there should have been a performance *decrease* because
> every piece of memory we get from slab is now cache-cold.  If slab was
> recycling objects, one would expect that to not happen.

No, the memory that SLUB returns is designed to be handed out in increasing
address order. The prefetch logic on most modern chips will eliminate the
cache-cold effect.

> So I'm assuming that you have producer and consumer running on separate
> CPUs and we don't get any decent cache reuse anyway.

This was on UP.

> > What is the problem with 21-mm1 btw? slab performance for both allocators 
> > dropped from ~6M/sec to ~4.5M/sec
> 
> That's news to me.  You're the slab guy ;)
> 
> Are you sure the slowdown is due to slab, or did networking break?

Both slab allocators are affected. I poked around but nothing sprang to
mind. Seems it's networking.

* Re: Support concurrent local and remote frees and allocs on a slab.
  2007-05-07 18:58           ` Christoph Lameter
@ 2007-05-07 20:32             ` Andrew Morton
  0 siblings, 0 replies; 13+ messages in thread
From: Andrew Morton @ 2007-05-07 20:32 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm

On Mon, 7 May 2007 11:58:34 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:

> > > What is the problem with 21-mm1 btw? slab performance for both allocators 
> > > dropped from ~6M/sec to ~4.5M/sec
> > 
> > That's news to me.  You're the slab guy ;)
> > 
> > Are you sure the slowdown is due to slab, or did networking break?
> 
> Both slab allocators are affected. I poked around but nothing sprang to 
> my mind. Seems its networking.

Please send a report to netdev@vger.kernel.org.

* Re: Support concurrent local and remote frees and allocs on a slab.
  2007-05-05  3:28 Support concurrent local and remote frees and allocs on a slab Christoph Lameter
  2007-05-06  4:59 ` Christoph Lameter
@ 2007-05-07 21:50 ` Andrew Morton
  2007-05-07 21:55   ` Christoph Lameter
  2007-05-08  0:56   ` Christoph Lameter
  1 sibling, 2 replies; 13+ messages in thread
From: Andrew Morton @ 2007-05-07 21:50 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm

On Fri, 4 May 2007 20:28:41 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:

> About 5-10% performance gain on netperf.
> 
> [Maybe put this patch at the end of the merge queue? Works fine here but
> this is a significant change that may impact stability]
> 
> What we do is use the last free field in the page struct (the private
> field that was freed up through the compound page flag rework) to set up a
> separate per cpu freelist. From that one we can allocate without taking the
> slab lock because we check out the complete list of free objects when we
> first touch the slab and then mark the slab as completely allocated.
> If we have a cpu_freelist then we can also free to that list, provided we
> are running on the owning processor, without taking the slab lock.
>
> This allows even concurrent allocations and frees on the same slab using
> two mutually exclusive freelists. Allocs and frees from the processor
> owning the per cpu slab will bypass the slab lock using the cpu_freelist.
> Remote frees will use the slab lock to synchronize and use the freelist
> for marking items as free. So local allocs and frees may run concurrently
> with remote frees without synchronization.
>
> If the allocator runs out of its per cpu freelist then it will consult
> the per slab freelist (which requires the slab lock) and reload the
> cpu_freelist if there are objects that were remotely freed.
> 

I must say that I'm getting increasingly foggy about what the slub data
structures are.  That was my problem with slab, too: it's hard to get a
picture in one's head.

Is there some way in which we can communicate this better?  It is quite
central to maintainability.

> 
> ---
>  include/linux/mm_types.h |    5 ++-
>  mm/slub.c                |   67 ++++++++++++++++++++++++++++++++++++++---------
>  2 files changed, 59 insertions(+), 13 deletions(-)
> 
> Index: slub/include/linux/mm_types.h
> ===================================================================
> --- slub.orig/include/linux/mm_types.h	2007-05-04 20:09:26.000000000 -0700
> +++ slub/include/linux/mm_types.h	2007-05-04 20:09:33.000000000 -0700
> @@ -50,9 +50,12 @@ struct page {
>  	    spinlock_t ptl;
>  #endif
>  	    struct {			/* SLUB uses */
> -		struct page *first_page;	/* Compound pages */
> +	    	void **cpu_freelist;		/* Per cpu freelist */
>  		struct kmem_cache *slab;	/* Pointer to slab */
>  	    };
> +	    struct {
> +		struct page *first_page;	/* Compound pages */
> +	    };
>  	};

This change implies that "first_page" is no longer a "SLUB use".  Is that
true?

I'm a bit surprised that slub didn't already have a per-cpu freelist of
objects?

Each cache has this "cpu_slab" thing, which is not documented anywhere
afaict.  What does it do, and how does this change enhance it?

(I'm not really asking for a reply-by-email, btw.  This is more a "this is
what people will wonder when they read your code.  Please ensure that the
answers are there for them" thing.)

>  	union {
>  		pgoff_t index;		/* Our offset within mapping. */
> Index: slub/mm/slub.c
> ===================================================================
> --- slub.orig/mm/slub.c	2007-05-04 20:09:26.000000000 -0700
> +++ slub/mm/slub.c	2007-05-04 20:14:04.000000000 -0700
> @@ -81,10 +81,13 @@
>   * PageActive 		The slab is used as a cpu cache. Allocations
>   * 			may be performed from the slab. The slab is not
>   * 			on any slab list and cannot be moved onto one.
> + * 			The cpu slab may have a cpu_freelist in order
> + * 			to optimize allocations and frees on a particular
> + * 			cpu.
>   *
>   * PageError		Slab requires special handling due to debug
>   * 			options set. This moves	slab handling out of
> - * 			the fast path.
> + * 			the fast path and disables cpu_freelists.
>   */
>  
>  /*
> @@ -857,6 +860,7 @@ static struct page *new_slab(struct kmem
>  	set_freepointer(s, last, NULL);
>  
>  	page->freelist = start;
> +	page->cpu_freelist = NULL;
>  	page->inuse = 0;
>  out:
>  	if (flags & __GFP_WAIT)
> @@ -1121,6 +1125,23 @@ static void putback_slab(struct kmem_cac
>   */
>  static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
>  {
> +	/*
> +	 * Merge cpu freelist into freelist. Typically we get here
> +	 * because both freelists are empty. So this is unlikely
> +	 * to occur.
> +	 */
> +	while (unlikely(page->cpu_freelist)) {
> +		void **object;
> +
> +		/* Retrieve object from cpu_freelist */
> +		object = page->cpu_freelist;
> +		page->cpu_freelist = page->cpu_freelist[page->offset];
> +
> +		/* And put onto the regular freelist */
> +		object[page->offset] = page->freelist;
> +		page->freelist = object;
> +		page->inuse--;
> +	}

page.offset doesn't appear to be documented anywhere?

So what is pointed at by page->cpu_freelist?  It appears to point at an
array of pointers to recently-used objects.  But where does the storage for
that array come from?  All a bit mysterious.

btw, does this code, in slab_alloc()

	if (unlikely(node != -1 && page_to_nid(page) != node)) {
							
get appropriately optimised away on non-NUMA?

* Re: Support concurrent local and remote frees and allocs on a slab.
  2007-05-07 21:50 ` Andrew Morton
@ 2007-05-07 21:55   ` Christoph Lameter
  2007-05-08  0:56   ` Christoph Lameter
  1 sibling, 0 replies; 13+ messages in thread
From: Christoph Lameter @ 2007-05-07 21:55 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm

> Is there some way in which we can communicate this better?  It is quite
> central to maintainability.

Would you drop this patch again? I am still reworking it and you will get 
conflicts with the patchset I sent you.

> This change implies that "first_page" is no longer a "SLUB use".  Is that
> true?

We moved that into the huge page support functions in include/linux/mm.h
 
> I'm a bit surprised that slub didn't already have a per-cpu freelist of
> objects?

It does, but it has to take the slab lock around accesses to the per cpu
slab, since remote frees may access it as well. The patch splits the freelists.

> Each cache has this "cpu_slab" thing, which is not documented anywhere
> afaict.  What does it do, and how does this change enhance it?

It avoids the atomic overhead. If the cachelines are all hot then the
atomic operations, stack handling etc. become a factor. I am minimizing
that currently. The next rev will do that better.

> 	if (unlikely(node != -1 && page_to_nid(page) != node)) {
> 							
> get appropriately optimised away on non-NUMA?

Yes.

* Re: Support concurrent local and remote frees and allocs on a slab.
  2007-05-07 21:50 ` Andrew Morton
  2007-05-07 21:55   ` Christoph Lameter
@ 2007-05-08  0:56   ` Christoph Lameter
  2007-05-08 22:05     ` Christoph Lameter
  1 sibling, 1 reply; 13+ messages in thread
From: Christoph Lameter @ 2007-05-08  0:56 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm

New rev. I tried to explain things better.


Avoid atomic overhead in slab_alloc and slab_free

SLUB needs to use the slab_lock for the per cpu slabs to synchronize
with potential kfree operations. This patch avoids that need by moving
all free objects onto a lockless_freelist. The regular freelist
continues to exist and will be used to free objects. So while we consume
the lockless_freelist the regular freelist may build up objects.

If we are out of objects on the lockless_freelist then we check
the regular freelist. If it has objects then we move those over to the
lockless_freelist and start over. This yields significant savings
in the number of atomic operations that have to be performed.
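
Condensed, the reload in the slow path amounts to this (illustrative sketch
only; it mirrors the __slab_alloc hunk further down, assuming the regular
freelist is non-empty and with the partial/new slab fallbacks and debug
handling omitted):

	slab_lock(page);
	/* Take over everything that was freed back to the regular freelist */
	object = page->freelist;
	page->lockless_freelist = object[page->offset];
	/* ... and mark the slab as fully allocated again */
	page->inuse = s->objects;
	page->freelist = NULL;
	slab_unlock(page);
	/* object is returned; the rest now sits on the lockless_freelist */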

We can even free directly to the lockless_freelist if we know that we
are running on the owning processor. This speeds up short-lived
objects: they may be allocated and freed without taking the slab_lock.
This is particularly good for netperf.

In order to maximize the effect of the new faster hotpath we extract the
hottest performance pieces into inlined functions. These are then inlined
into kmem_cache_alloc and kmem_cache_free. So hotpath allocation and
freeing no longer requires a subroutine call within SLUB.
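
The free side that ends up inlined into kmem_cache_free and kfree then
reduces to a single ownership check (condensed from the slab_free hunk
below; everything else goes through __slab_free):

	local_irq_save(flags);
	if (likely(page == s->cpu_slab[smp_processor_id()] &&
						!SlabDebug(page))) {
		/* Freeing to our own cpu slab: no slab_lock needed */
		object[page->offset] = page->lockless_freelist;
		page->lockless_freelist = object;
	} else
		__slab_free(s, page, x, addr);
	local_irq_restore(flags);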

[I am not sure that it is worth doing this because it changes the
easy-to-read structure of slub just to reduce atomic ops. However, there is
someone out there with a benchmark on 4-way and 8-way processor systems
that seems to show a 5% regression vs. SLAB. It seems that the regression is
due to SLUB's increased use of atomic operations compared to SLAB. I wonder
if this is applicable or discernible at all in a real workload?

Maybe let this sit in mm for a while?]

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 include/linux/mm_types.h |    7 +-
 mm/slub.c                |  154 ++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 123 insertions(+), 38 deletions(-)

Index: slub/include/linux/mm_types.h
===================================================================
--- slub.orig/include/linux/mm_types.h	2007-05-07 17:31:11.000000000 -0700
+++ slub/include/linux/mm_types.h	2007-05-07 17:33:54.000000000 -0700
@@ -50,13 +50,16 @@ struct page {
 	    spinlock_t ptl;
 #endif
 	    struct {			/* SLUB uses */
-		struct page *first_page;	/* Compound pages */
+	    	void **lockless_freelist;
 		struct kmem_cache *slab;	/* Pointer to slab */
 	    };
+	    struct {
+		struct page *first_page;	/* Compound pages */
+	    };
 	};
 	union {
 		pgoff_t index;		/* Our offset within mapping. */
-		void *freelist;		/* SLUB: pointer to free object */
+		void *freelist;		/* SLUB: freelist req. slab lock */
 	};
 	struct list_head lru;		/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c	2007-05-07 17:31:11.000000000 -0700
+++ slub/mm/slub.c	2007-05-07 17:33:54.000000000 -0700
@@ -81,10 +81,14 @@
  * PageActive 		The slab is used as a cpu cache. Allocations
  * 			may be performed from the slab. The slab is not
  * 			on any slab list and cannot be moved onto one.
+ * 			The cpu slab may be equipped with an additional
+ * 			lockless_freelist that allows lockless access to
+ * 			free objects in addition to the regular freelist
+ * 			that requires the slab lock.
  *
  * PageError		Slab requires special handling due to debug
  * 			options set. This moves	slab handling out of
- * 			the fast path.
+ * 			the fast path and disables lockless freelists.
  */
 
 static inline int SlabDebug(struct page *page)
@@ -1016,6 +1020,7 @@ static struct page *new_slab(struct kmem
 	set_freepointer(s, last, NULL);
 
 	page->freelist = start;
+	page->lockless_freelist = NULL;
 	page->inuse = 0;
 out:
 	if (flags & __GFP_WAIT)
@@ -1278,6 +1283,23 @@ static void putback_slab(struct kmem_cac
  */
 static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
 {
+	/*
+	 * Merge cpu freelist into freelist. Typically we get here
+	 * because both freelists are empty. So this is unlikely
+	 * to occur.
+	 */
+	while (unlikely(page->lockless_freelist)) {
+		void **object;
+
+		/* Retrieve object from cpu_freelist */
+		object = page->lockless_freelist;
+		page->lockless_freelist = page->lockless_freelist[page->offset];
+
+		/* And put onto the regular freelist */
+		object[page->offset] = page->freelist;
+		page->freelist = object;
+		page->inuse--;
+	}
 	s->cpu_slab[cpu] = NULL;
 	ClearPageActive(page);
 
@@ -1324,47 +1346,46 @@ static void flush_all(struct kmem_cache 
 }
 
 /*
- * slab_alloc is optimized to only modify two cachelines on the fast path
- * (aside from the stack):
+ * Slow path. The lockless freelist is empty or we need to perform
+ * debugging duties.
  *
- * 1. The page struct
- * 2. The first cacheline of the object to be allocated.
+ * Interrupts are disabled.
  *
- * The only other cache lines that are read (apart from code) is the
- * per cpu array in the kmem_cache struct.
+ * Processing is still very fast if new objects have been freed to the
+ * regular freelist. In that case we simply take over the regular freelist
+ * as the lockless freelist and zap the regular freelist.
  *
- * Fastpath is not possible if we need to get a new slab or have
- * debugging enabled (which means all slabs are marked with SlabDebug)
+ * If that is not working then we fall back to the partial lists. We take the
+ * first element of the freelist as the object to allocate now and move the
+ * rest of the freelist to the lockless freelist.
+ *
+ * And if we were unable to get a new slab from the partial slab lists then
+ * we need to allocate a new slab. This is the slowest path since we may sleep.
  */
-static void *slab_alloc(struct kmem_cache *s,
-				gfp_t gfpflags, int node, void *addr)
+static void *__slab_alloc(struct kmem_cache *s,
+		gfp_t gfpflags, int node, void *addr, struct page *page)
 {
-	struct page *page;
 	void **object;
-	unsigned long flags;
-	int cpu;
+	int cpu = smp_processor_id();
 
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-	page = s->cpu_slab[cpu];
 	if (!page)
 		goto new_slab;
 
 	slab_lock(page);
 	if (unlikely(node != -1 && page_to_nid(page) != node))
 		goto another_slab;
-redo:
+load_freelist:
 	object = page->freelist;
 	if (unlikely(!object))
 		goto another_slab;
 	if (unlikely(SlabDebug(page)))
 		goto debug;
 
-have_object:
-	page->inuse++;
-	page->freelist = object[page->offset];
+	object = page->freelist;
+	page->lockless_freelist = object[page->offset];
+	page->inuse = s->objects;
+	page->freelist = NULL;
 	slab_unlock(page);
-	local_irq_restore(flags);
 	return object;
 
 another_slab:
@@ -1372,11 +1393,11 @@ another_slab:
 
 new_slab:
 	page = get_partial(s, gfpflags, node);
-	if (likely(page)) {
+	if (page) {
 have_slab:
 		s->cpu_slab[cpu] = page;
 		SetPageActive(page);
-		goto redo;
+		goto load_freelist;
 	}
 
 	page = new_slab(s, gfpflags, node);
@@ -1399,7 +1420,7 @@ have_slab:
 				discard_slab(s, page);
 				page = s->cpu_slab[cpu];
 				slab_lock(page);
-				goto redo;
+				goto load_freelist;
 			}
 			/* New slab does not fit our expectations */
 			flush_slab(s, s->cpu_slab[cpu], cpu);
@@ -1407,16 +1428,52 @@ have_slab:
 		slab_lock(page);
 		goto have_slab;
 	}
-	local_irq_restore(flags);
 	return NULL;
 debug:
+	object = page->freelist;
 	if (!alloc_object_checks(s, page, object))
 		goto another_slab;
 	if (s->flags & SLAB_STORE_USER)
 		set_track(s, object, TRACK_ALLOC, addr);
 	trace(s, page, object, 1);
 	init_object(s, object, 1);
-	goto have_object;
+
+	page->inuse++;
+	page->freelist = object[page->offset];
+	slab_unlock(page);
+	return object;
+}
+
+/*
+ * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
+ * have the fastpath folded into their functions. So no function call
+ * overhead for requests that can be satisfied on the fastpath.
+ *
+ * The fastpath works by first checking if the lockless freelist can be used.
+ * If not then __slab_alloc is called for slow processing.
+ *
+ * Otherwise we can simply pick the next object from the lockless free list.
+ */
+static void __always_inline *slab_alloc(struct kmem_cache *s,
+				gfp_t gfpflags, int node, void *addr)
+{
+	struct page *page;
+	void **object;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	page = s->cpu_slab[smp_processor_id()];
+	if (unlikely(!page || !page->lockless_freelist ||
+			(node != -1 && page_to_nid(page) != node)))
+
+		object = __slab_alloc(s, gfpflags, node, addr, page);
+
+	else {
+		object = page->lockless_freelist;
+		page->lockless_freelist = object[page->offset];
+	}
+	local_irq_restore(flags);
+	return object;
 }
 
 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
@@ -1434,20 +1491,19 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
 #endif
 
 /*
- * The fastpath only writes the cacheline of the page struct and the first
- * cacheline of the object.
+ * Slow path handling. This may still be called frequently since objects
+ * have a longer lifetime than the cpu slabs in most processing loads.
  *
- * We read the cpu_slab cacheline to check if the slab is the per cpu
- * slab for this processor.
+ * So we still attempt to reduce cache line usage. Just take the slab
+ * lock and free the item. If there is no additional partial page
+ * handling required then we can return immediately.
  */
-static void slab_free(struct kmem_cache *s, struct page *page,
+static void __slab_free(struct kmem_cache *s, struct page *page,
 					void *x, void *addr)
 {
 	void *prior;
 	void **object = (void *)x;
-	unsigned long flags;
 
-	local_irq_save(flags);
 	slab_lock(page);
 
 	if (unlikely(SlabDebug(page)))
@@ -1477,7 +1533,6 @@ checks_ok:
 
 out_unlock:
 	slab_unlock(page);
-	local_irq_restore(flags);
 	return;
 
 slab_empty:
@@ -1489,7 +1544,6 @@ slab_empty:
 
 	slab_unlock(page);
 	discard_slab(s, page);
-	local_irq_restore(flags);
 	return;
 
 debug:
@@ -1504,6 +1558,34 @@ debug:
 	goto checks_ok;
 }
 
+/*
+ * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
+ * can perform fastpath freeing without additional function calls.
+ *
+ * The fastpath is only possible if we are freeing to the current cpu slab
+ * of this processor. This is typically the case if we have just allocated
+ * the item before.
+ *
+ * If fastpath is not possible then fall back to __slab_free where we deal
+ * with all sorts of special processing.
+ */
+static void __always_inline slab_free(struct kmem_cache *s,
+			struct page *page, void *x, void *addr)
+{
+	void **object = (void *)x;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (likely(page == s->cpu_slab[smp_processor_id()] &&
+						!SlabDebug(page))) {
+		object[page->offset] = page->lockless_freelist;
+		page->lockless_freelist = object;
+	} else
+		__slab_free(s, page, x, addr);
+
+	local_irq_restore(flags);
+}
+
 void kmem_cache_free(struct kmem_cache *s, void *x)
 {
 	struct page *page;

* Re: Support concurrent local and remote frees and allocs on a slab.
  2007-05-08  0:56   ` Christoph Lameter
@ 2007-05-08 22:05     ` Christoph Lameter
  0 siblings, 0 replies; 13+ messages in thread
From: Christoph Lameter @ 2007-05-08 22:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm

On Mon, 7 May 2007, Christoph Lameter wrote:

> New rev. I tried to explain things better.

Please apply, because this fixes the Clovertown performance
regressions. So it was due to atomic overhead.

Thread overview: 13+ messages
2007-05-05  3:28 Support concurrent local and remote frees and allocs on a slab Christoph Lameter
2007-05-06  4:59 ` Christoph Lameter
2007-05-06  5:45   ` Christoph Lameter
2007-05-06 19:24     ` Andrew Morton
2007-05-07 15:15       ` Christoph Lameter
2007-05-07 18:39       ` Christoph Lameter
2007-05-07 18:54         ` Andrew Morton
2007-05-07 18:58           ` Christoph Lameter
2007-05-07 20:32             ` Andrew Morton
2007-05-07 21:50 ` Andrew Morton
2007-05-07 21:55   ` Christoph Lameter
2007-05-08  0:56   ` Christoph Lameter
2007-05-08 22:05     ` Christoph Lameter
