linux-mm.kvack.org archive mirror
* [PATCH] slob: poor man's NUMA, take 2.
@ 2007-06-13  3:12 Paul Mundt
  2007-06-13  3:24 ` Nick Piggin
  2007-06-13  3:28 ` Matt Mackall
  0 siblings, 2 replies; 20+ messages in thread
From: Paul Mundt @ 2007-06-13  3:12 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Christoph Lameter, Nick Piggin, Andrew Morton, linux-mm

Here's an updated copy of the patch adding simple NUMA support to SLOB,
against the current -mm version of SLOB this time.

I've tried to address all of the comments on the initial version so far,
but there's obviously still room for improvement.

This approach is not terribly scalable in that we still end up using a
global freelist (and a global spinlock!) across all nodes, making the
partial free page lookup rather expensive. The next step after this will
be moving towards split freelists with finer grained locking.

The scanning of the global freelist could be sped up by simply ignoring
the node id unless __GFP_THISNODE is set. This patch defaults to trying
to match up the node id for the partial pages (whereas the last one just
grabbed the first partial page from the list, regardless of node
placement), but perhaps that's the wrong default and should only be done
for __GFP_THISNODE?
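
For reference, the alternative reads roughly like this in the partial-page
scan (illustrative sketch only, not part of the patch below):

#ifdef CONFIG_NUMA
		/*
		 * Hypothetical: only honour the node id when the caller
		 * insists on it, otherwise take the first partial page
		 * regardless of placement.
		 */
		if ((gfp & __GFP_THISNODE) && node != -1 &&
		    page_to_nid(&sp->page) != node)
			continue;
#endif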

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

--

 include/linux/slab.h |    7 ++++
 mm/slob.c            |   72 +++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 68 insertions(+), 11 deletions(-)

diff --git a/mm/slob.c b/mm/slob.c
index 06e5e72..d89f951 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -204,6 +204,23 @@ static int slob_last(slob_t *s)
 	return !((unsigned long)slob_next(s) & ~PAGE_MASK);
 }
 
+static inline void *slob_new_page(gfp_t gfp, int order, int node)
+{
+	void *page;
+
+#ifdef CONFIG_NUMA
+	if (node != -1)
+		page = alloc_pages_node(node, gfp, order);
+	else
+#endif
+		page = alloc_pages(gfp, order);
+
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+
 /*
  * Allocate a slob block within a given slob_page sp.
  */
@@ -258,7 +275,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
 /*
  * slob_alloc: entry point into the slob allocator.
  */
-static void *slob_alloc(size_t size, gfp_t gfp, int align)
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 {
 	struct slob_page *sp;
 	slob_t *b = NULL;
@@ -267,6 +284,15 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align)
 	spin_lock_irqsave(&slob_lock, flags);
 	/* Iterate through each partially free page, try to find room */
 	list_for_each_entry(sp, &free_slob_pages, list) {
+#ifdef CONFIG_NUMA
+		/*
+		 * If there's a node specification, search for a partial
+		 * page with a matching node id in the freelist.
+		 */
+		if (node != -1 && page_to_nid(&sp->page) != node)
+			continue;
+#endif
+
 		if (sp->units >= SLOB_UNITS(size)) {
 			b = slob_page_alloc(sp, size, align);
 			if (b)
@@ -277,7 +303,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align)
 
 	/* Not enough space: must allocate a new page */
 	if (!b) {
-		b = (slob_t *)__get_free_page(gfp);
+		b = slob_new_page(gfp, 0, node);
 		if (!b)
 			return 0;
 		sp = (struct slob_page *)virt_to_page(b);
@@ -381,22 +407,20 @@ out:
 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
 #endif
 
-
-void *__kmalloc(size_t size, gfp_t gfp)
+static void *slob_node_alloc(size_t size, gfp_t gfp, int node)
 {
 	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 
 	if (size < PAGE_SIZE - align) {
 		unsigned int *m;
-		m = slob_alloc(size + align, gfp, align);
+		m = slob_alloc(size + align, gfp, align, node);
 		if (m)
 			*m = size;
 		return (void *)m + align;
 	} else {
 		void *ret;
 
-		ret = (void *) __get_free_pages(gfp | __GFP_COMP,
-						get_order(size));
+		ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
 		if (ret) {
 			struct page *page;
 			page = virt_to_page(ret);
@@ -405,8 +429,21 @@ void *__kmalloc(size_t size, gfp_t gfp)
 		return ret;
 	}
 }
+
+void *__kmalloc(size_t size, gfp_t gfp)
+{
+	return slob_node_alloc(size, gfp, -1);
+}
 EXPORT_SYMBOL(__kmalloc);
 
+#ifdef CONFIG_NUMA
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+{
+	return slob_node_alloc(size, gfp, node);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif
+
 /**
  * krealloc - reallocate memory. The contents will remain unchanged.
  *
@@ -487,7 +524,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 {
 	struct kmem_cache *c;
 
-	c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
+	c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1);
 
 	if (c) {
 		c->name = name;
@@ -517,22 +554,35 @@ void kmem_cache_destroy(struct kmem_cache *c)
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
-void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+static void *__kmem_cache_alloc(struct kmem_cache *c, gfp_t flags, int node)
 {
 	void *b;
 
 	if (c->size < PAGE_SIZE)
-		b = slob_alloc(c->size, flags, c->align);
+		b = slob_alloc(c->size, flags, c->align, node);
 	else
-		b = (void *)__get_free_pages(flags, get_order(c->size));
+		b = slob_new_page(flags, get_order(c->size), node);
 
 	if (c->ctor)
 		c->ctor(b, c, 0);
 
 	return b;
 }
+
+void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+{
+	return __kmem_cache_alloc(c, flags, -1);
+}
 EXPORT_SYMBOL(kmem_cache_alloc);
 
+#ifdef CONFIG_NUMA
+void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+{
+	return __kmem_cache_alloc(c, flags, node);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+#endif
+
 void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
 {
 	void *ret = kmem_cache_alloc(c, flags);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index a015236..efc87c1 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -200,6 +200,13 @@ static inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	return __kmalloc(size, flags);
 }
+#elif defined(CONFIG_SLOB)
+extern void *__kmalloc_node(size_t size, gfp_t flags, int node);
+
+static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __kmalloc_node(size, flags, node);
+}
 #endif /* !CONFIG_NUMA */
 
 /*


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  3:12 [PATCH] slob: poor man's NUMA, take 2 Paul Mundt
@ 2007-06-13  3:24 ` Nick Piggin
  2007-06-13  3:32   ` Matt Mackall
  2007-06-13  3:33   ` Paul Mundt
  2007-06-13  3:28 ` Matt Mackall
  1 sibling, 2 replies; 20+ messages in thread
From: Nick Piggin @ 2007-06-13  3:24 UTC (permalink / raw)
  To: Paul Mundt; +Cc: Matt Mackall, Christoph Lameter, Andrew Morton, linux-mm

Paul Mundt wrote:
> Here's an updated copy of the patch adding simple NUMA support to SLOB,
> against the current -mm version of SLOB this time.
> 
> I've tried to address all of the comments on the initial version so far,
> but there's obviously still room for improvement.
> 
> This approach is not terribly scalable in that we still end up using a
> global freelist (and a global spinlock!) across all nodes, making the
> partial free page lookup rather expensive. The next step after this will
> be moving towards split freelists with finer grained locking.

I just think that this is not really a good intermediate step because
you only get NUMA awareness from the first allocation out of a page. I
guess that's an easy no-brainer for bigblock allocations, but for SLOB
proper, it seems not so good.

For a lot of workloads you will have a steady state where allocation and
freeing rates match pretty well and there won't be much movement of pages
in and out of the allocator. In this case it will be back to random
allocations, won't it?

-- 
SUSE Labs, Novell Inc.


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  3:12 [PATCH] slob: poor man's NUMA, take 2 Paul Mundt
  2007-06-13  3:24 ` Nick Piggin
@ 2007-06-13  3:28 ` Matt Mackall
  2007-06-13  9:21   ` Paul Mundt
  1 sibling, 1 reply; 20+ messages in thread
From: Matt Mackall @ 2007-06-13  3:28 UTC (permalink / raw)
  To: Paul Mundt, Christoph Lameter, Nick Piggin, Andrew Morton, linux-mm

On Wed, Jun 13, 2007 at 12:12:03PM +0900, Paul Mundt wrote:
> Here's an updated copy of the patch adding simple NUMA support to SLOB,
> against the current -mm version of SLOB this time.
> 
> I've tried to address all of the comments on the initial version so far,
> but there's obviously still room for improvement.
> 
> This approach is not terribly scalable in that we still end up using a
> global freelist (and a global spinlock!) across all nodes, making the
> partial free page lookup rather expensive. The next step after this will
> be moving towards split freelists with finer grained locking.
> 
> The scanning of the global freelist could be sped up by simply ignoring
> the node id unless __GFP_THISNODE is set. This patch defaults to trying
> to match up the node id for the partial pages (whereas the last one just
> grabbed the first partial page from the list, regardless of node
> placement), but perhaps that's the wrong default and should only be done
> for __GFP_THISNODE?

Hmmm. There's not a whole lot that uses __GFP_THISNODE. Dunno.
 
> +static inline void *slob_new_page(gfp_t gfp, int order, int node)
> +{
> +	void *page;
> +
> +#ifdef CONFIG_NUMA
> +	if (node != -1)
> +		page = alloc_pages_node(node, gfp, order);
> +	else
> +#endif
> +		page = alloc_pages(gfp, order);
> +
> +	if (!page)
> +		return NULL;
> +
> +	return page_address(page);

We might want to leave the inlining decision here to the compiler. The
ifdef may change that decision..

> -void *__kmalloc(size_t size, gfp_t gfp)
> +static void *slob_node_alloc(size_t size, gfp_t gfp, int node)

See my comment in the last message.

-- 
Mathematics is the supreme nostalgia of our time.


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  3:24 ` Nick Piggin
@ 2007-06-13  3:32   ` Matt Mackall
  2007-06-13  3:33   ` Paul Mundt
  1 sibling, 0 replies; 20+ messages in thread
From: Matt Mackall @ 2007-06-13  3:32 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Paul Mundt, Christoph Lameter, Andrew Morton, linux-mm

On Wed, Jun 13, 2007 at 01:24:01PM +1000, Nick Piggin wrote:
> Paul Mundt wrote:
> >Here's an updated copy of the patch adding simple NUMA support to SLOB,
> >against the current -mm version of SLOB this time.
> >
> >I've tried to address all of the comments on the initial version so far,
> >but there's obviously still room for improvement.
> >
> >This approach is not terribly scalable in that we still end up using a
> >global freelist (and a global spinlock!) across all nodes, making the
> >partial free page lookup rather expensive. The next step after this will
> >be moving towards split freelists with finer grained locking.
> 
> I just think that this is not really a good intermediate step because
> you only get NUMA awareness from the first allocation out of a page. I
> guess that's an easy no-brainer for bigblock allocations, but for SLOB
> proper, it seems not so good.
> 
> For a lot of workloads you will have a steady state where allocation and
> freeing rates match pretty well and there won't be much movement of pages
> in and out of the allocator. In this case it will be back to random
> allocations, won't it?

Hmmm, probably.

Perhaps we can have a single list (or ring, rather) with per-node
insertion points. Then we can start node-local searches at the
insertion points..?
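
A very rough sketch of that idea (names invented, nothing here is tested):

/*
 * Keep the existing free_slob_pages list as the ring, and add a per-node
 * cursor pointing at the page most recently inserted for that node.  Pages
 * go in at their node's cursor and node-local searches start there, so
 * same-node pages stay clustered.  (Removal would also have to fix the
 * cursor up; that's ignored here.)
 */
static struct list_head *slob_node_cursor[MAX_NUMNODES];

static void slob_insert_page(struct slob_page *sp, int node)
{
	struct list_head *where = slob_node_cursor[node] ?: &free_slob_pages;

	list_add(&sp->list, where);
	slob_node_cursor[node] = &sp->list;
}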

-- 
Mathematics is the supreme nostalgia of our time.


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  3:24 ` Nick Piggin
  2007-06-13  3:32   ` Matt Mackall
@ 2007-06-13  3:33   ` Paul Mundt
  2007-06-13  3:39     ` Nick Piggin
  1 sibling, 1 reply; 20+ messages in thread
From: Paul Mundt @ 2007-06-13  3:33 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Matt Mackall, Christoph Lameter, Andrew Morton, linux-mm

On Wed, Jun 13, 2007 at 01:24:01PM +1000, Nick Piggin wrote:
> Paul Mundt wrote:
> >Here's an updated copy of the patch adding simple NUMA support to SLOB,
> >against the current -mm version of SLOB this time.
> >
> >I've tried to address all of the comments on the initial version so far,
> >but there's obviously still room for improvement.
> >
> >This approach is not terribly scalable in that we still end up using a
> >global freelist (and a global spinlock!) across all nodes, making the
> >partial free page lookup rather expensive. The next step after this will
> >be moving towards split freelists with finer grained locking.
> 
> I just think that this is not really a good intermediate step because
> you only get NUMA awareness from the first allocation out of a page. I
> guess that's an easy no-brainer for bigblock allocations, but for SLOB
> proper, it seems not so good.
> 
> For a lot of workloads you will have a steady state where allocation and
> freeing rates match pretty well and there won't be much movement of pages
> in and out of the allocator. In this case it will be back to random
> allocations, won't it?
> 
That's why I tossed in the node id matching in slob_alloc() for the
partial free page lookup. At the moment the logic obviously won't scale,
since we end up scanning the entire freelist looking for a page that
matches the node specifier. If we don't find one, we could rescan and
just grab a block from another node, but at the moment it just continues
on and tries to fetch a new page for the specified node.

If the freelists are split per node, that makes it a bit more manageable,
but that's more of a scalability issue than a correctness one. Random
alloc/free workloads will stick to their node with the current patch, so
I'm not sure where you see the random node placement as an issue?
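
The rescan fallback, if we wanted it, could be as dumb as this (untested
sketch, reusing the scan loop from the patch):

	int attempts;

	/* First pass: node-local pages only.  Second pass: take anything. */
	for (attempts = 0; attempts < 2 && !b; attempts++) {
		list_for_each_entry(sp, &free_slob_pages, list) {
#ifdef CONFIG_NUMA
			if (attempts == 0 && node != -1 &&
			    page_to_nid(&sp->page) != node)
				continue;
#endif
			if (sp->units >= SLOB_UNITS(size)) {
				b = slob_page_alloc(sp, size, align);
				if (b)
					break;
			}
		}
	}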


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  3:33   ` Paul Mundt
@ 2007-06-13  3:39     ` Nick Piggin
  2007-06-13  3:42       ` Nick Piggin
  2007-06-13  9:50       ` Paul Mundt
  0 siblings, 2 replies; 20+ messages in thread
From: Nick Piggin @ 2007-06-13  3:39 UTC (permalink / raw)
  To: Paul Mundt; +Cc: Matt Mackall, Christoph Lameter, Andrew Morton, linux-mm

Paul Mundt wrote:
> On Wed, Jun 13, 2007 at 01:24:01PM +1000, Nick Piggin wrote:
> 
>>Paul Mundt wrote:
>>
>>>Here's an updated copy of the patch adding simple NUMA support to SLOB,
>>>against the current -mm version of SLOB this time.
>>>
>>>I've tried to address all of the comments on the initial version so far,
>>>but there's obviously still room for improvement.
>>>
>>>This approach is not terribly scalable in that we still end up using a
>>>global freelist (and a global spinlock!) across all nodes, making the
>>>partial free page lookup rather expensive. The next step after this will
>>>be moving towards split freelists with finer grained locking.
>>
>>I just think that this is not really a good intermediate step because
>>you only get NUMA awareness from the first allocation out of a page. I
>>guess that's an easy no-brainer for bigblock allocations, but for SLOB
>>proper, it seems not so good.
>>
>>For a lot of workloads you will have a steady state where allocation and
>>freeing rates match pretty well and there won't be much movement of pages
>>in and out of the allocator. In this case it will be back to random
>>allocations, won't it?
>>
> 
> That's why I tossed in the node id matching in slob_alloc() for the
> partial free page lookup. At the moment the logic obviously won't scale,
> since we end up scanning the entire freelist looking for a page that
> matches the node specifier. If we don't find one, we could rescan and
> just grab a block from another node, but at the moment it just continues
> on and tries to fetch a new page for the specified node.

Oh, I didn't notice that. OK, sorry that would work.

... but that goes against Matt's direction of wanting to improve basic
things like SMP scalability before NUMA awareness. I think once we had
per-CPU lists in place for SMP scalability, NUMA would come much more naturally
and easily.

-- 
SUSE Labs, Novell Inc.


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  3:39     ` Nick Piggin
@ 2007-06-13  3:42       ` Nick Piggin
  2007-06-13  4:13         ` Paul Mundt
  2007-06-13  9:50       ` Paul Mundt
  1 sibling, 1 reply; 20+ messages in thread
From: Nick Piggin @ 2007-06-13  3:42 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Paul Mundt, Matt Mackall, Christoph Lameter, Andrew Morton, linux-mm

Nick Piggin wrote:
> Paul Mundt wrote:

>> That's why I tossed in the node id matching in slob_alloc() for the
>> partial free page lookup. At the moment the logic obviously won't scale,
>> since we end up scanning the entire freelist looking for a page that
>> matches the node specifier. If we don't find one, we could rescan and
>> just grab a block from another node, but at the moment it just continues
>> on and tries to fetch a new page for the specified node.
> 
> 
> Oh, I didn't notice that. OK, sorry that would work.

OTOH, there are lots of places that don't specify the node explicitly,
but most of them prefer the allocation to come from the current node...
and that case isn't handled very well is it?

-- 
SUSE Labs, Novell Inc.


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  3:42       ` Nick Piggin
@ 2007-06-13  4:13         ` Paul Mundt
  2007-06-13  4:23           ` Paul Mundt
  0 siblings, 1 reply; 20+ messages in thread
From: Paul Mundt @ 2007-06-13  4:13 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Matt Mackall, Christoph Lameter, Andrew Morton, linux-mm

On Wed, Jun 13, 2007 at 01:42:28PM +1000, Nick Piggin wrote:
> Nick Piggin wrote:
> >Paul Mundt wrote:
> 
> >>That's why I tossed in the node id matching in slob_alloc() for the
> >>partial free page lookup. At the moment the logic obviously won't scale,
> >>since we end up scanning the entire freelist looking for a page that
> >>matches the node specifier. If we don't find one, we could rescan and
> >>just grab a block from another node, but at the moment it just continues
> >>on and tries to fetch a new page for the specified node.
> >
> >
> >Oh, I didn't notice that. OK, sorry that would work.
> 
> OTOH, there are lots of places that don't specify the node explicitly,
> but most of them prefer the allocation to come from the current node...
> and that case isn't handled very well is it?
> 
Well, we could throw in a numa_node_id() for kmem_cache_alloc() and
__kmalloc(), that would actually simplify slob_new_page(), since we can
just use alloc_pages_node() directly in the NUMA case without special
casing the node id.

This also has the side-effect of working well on UP with asymmetric nodes
(assuming a larger node 0), since numa_node_id() will leave us with a
node 0 preference in places where the node id isn't explicitly given.
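
A rough sketch of that simplification (illustrative only, untested):

static void *slob_new_page(gfp_t gfp, int order, int node)
{
	/*
	 * With the callers passing numa_node_id() instead of -1, the node
	 * id is always valid and alloc_pages_node() can be used directly.
	 */
	struct page *page = alloc_pages_node(node, gfp, order);

	return page ? page_address(page) : NULL;
}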


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  4:13         ` Paul Mundt
@ 2007-06-13  4:23           ` Paul Mundt
  2007-06-13  5:30             ` Christoph Lameter
  0 siblings, 1 reply; 20+ messages in thread
From: Paul Mundt @ 2007-06-13  4:23 UTC (permalink / raw)
  To: Nick Piggin, Matt Mackall, Christoph Lameter, Andrew Morton, linux-mm

On Wed, Jun 13, 2007 at 01:13:19PM +0900, Paul Mundt wrote:
> On Wed, Jun 13, 2007 at 01:42:28PM +1000, Nick Piggin wrote:
> > OTOH, there are lots of places that don't specify the node explicitly,
> > but most of them prefer the allocation to come from the current node...
> > and that case isn't handled very well is it?
> > 
> Well, we could throw in a numa_node_id() for kmem_cache_alloc() and
> __kmalloc(), that would actually simplify slob_new_page(), since we can
> just use alloc_pages_node() directly in the NUMA case without special
> casing the node id.
> 
> This also has the side-effect of working well on UP with asymmetric nodes
> (assuming a larger node 0), since numa_node_id() will leave us with a
> node 0 preference in places where the node id isn't explicitly given.
> 
And sure enough, that's what alloc_pages_node() already does, so if
slob_new_page() simply wraps into it, it should already be handled:

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
					    unsigned int order)
{
	...

	/* Unknown node is current node */
	if (nid < 0)
		nid = numa_node_id();
	...

I'll update the patch..


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  4:23           ` Paul Mundt
@ 2007-06-13  5:30             ` Christoph Lameter
  2007-06-13  5:42               ` Matt Mackall
  0 siblings, 1 reply; 20+ messages in thread
From: Christoph Lameter @ 2007-06-13  5:30 UTC (permalink / raw)
  To: Paul Mundt; +Cc: Nick Piggin, Matt Mackall, Andrew Morton, linux-mm

Hmmmm. One key advantage that SLOB has over all allocators is the density 
of the kmalloc array. I tried to add various schemes to SLUB but there is 
still a difference of 340kb on boot. If you get it to do NUMA then maybe
we can get a specialized allocator for the kmalloc array out of all of 
this?

If you focus on the kmalloc array then you can avoid dealing with certain
other issues

- No ctor, no reclaim accounting, no rcu etc.
- No need to manage partial slabs.
- No slab creation, destruction etc.

Maybe that could be done in a pretty compact way and replace the space
wasting kmalloc arrays in SLAB and SLUB?


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  5:30             ` Christoph Lameter
@ 2007-06-13  5:42               ` Matt Mackall
  2007-06-13  6:44                 ` Nick Piggin
  0 siblings, 1 reply; 20+ messages in thread
From: Matt Mackall @ 2007-06-13  5:42 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Paul Mundt, Nick Piggin, Andrew Morton, linux-mm

On Tue, Jun 12, 2007 at 10:30:04PM -0700, Christoph Lameter wrote:
> Hmmmm. One key advantage that SLOB has over all allocators is the density 
> of the kmalloc array. I tried to add various schemes to SLUB but there is 
> still a difference of 340kb on boot. If you get it to do NUMA then maybe
> we can get a specialized allocator for the kmalloc array out of all of 
> this?
> 
> If you focus on the kmalloc array then you can avoid dealing with certain
> other issues
> 
> - No ctor, no reclaim accounting, no rcu etc.
> - No need to manage partial slabs.
> - No slab creation, destruction etc.

That's an interesting point.

> Maybe that could be done in a pretty compact way and replace the space
> wasting kmalloc arrays in SLAB and SLUB?

We'll need to up the SMP scalability for that to make sense. Using
page flags for per-page locking and such might be a start. I've been
hoping Nick would propose something here, as those sorts of hacks seem
to be his thing.

-- 
Mathematics is the supreme nostalgia of our time.


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  5:42               ` Matt Mackall
@ 2007-06-13  6:44                 ` Nick Piggin
  0 siblings, 0 replies; 20+ messages in thread
From: Nick Piggin @ 2007-06-13  6:44 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Christoph Lameter, Paul Mundt, Andrew Morton, linux-mm

Matt Mackall wrote:
> On Tue, Jun 12, 2007 at 10:30:04PM -0700, Christoph Lameter wrote:
> 
>>Hmmmm. One key advantage that SLOB has over all allocators is the density 
>>of the kmalloc array. I tried to add various schemes to SLUB but there is 
>>still a difference of 340kb on boot. If you get it to do NUMA then maybe
>>we can get a specialized allocator for the kmalloc array out of all of 
>>this?
>>
>>If you focus on the kmalloc array then you can avoid dealing with certain
>>other issues
>>
>>- No ctor, no reclaim accounting, no rcu etc.
>>- No need to manage partial slabs.
>>- No slab creation, destruction etc.
> 
> 
> That's an interesting point.
> 
> 
>>Maybe that could be done in a pretty compact way and replace the space
>>wasting kmalloc arrays in SLAB and SLUB?
> 
> 
> We'll need to up the SMP scalability for that to make sense. Using
> page flags for per-page locking and such might be a start. I've been
> hoping Nick would propose something here, as those sorts of hacks seem
> to be his thing.

It's tricky as we still have the page list.

It wouldn't be difficult to add a bit-spinlock in the page flags and use
that for the intra-page list traversal, dropping the main lock after
taking the bit lock.

But even if the page list was lockless, I'd worry about locking required
to modify the list, and also, in the current scheme, multiple CPUs all
contending the same bit lock.

So I think from every angle it makes sense to break the page list into
multiple lists first. Per-cpu would be easiest.

After that, we could do lockless page list traversals (and the
fine-grained page locking would make a lot of sense) -- easy way would
just be to RCU-free the struct page, cool way would be to instead use
speculative page references from the lockless pagecache :P
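
For what it's worth, the bit-lock half of that might look roughly like this
(sketch only; PG_slob_lock is a hypothetical spare page-flag bit, and
bit_spin_lock()/bit_spin_unlock() come from <linux/bit_spinlock.h>):

#define PG_slob_lock	20	/* hypothetical: assumes a free flag bit */

static void *slob_page_alloc_locked(struct slob_page *sp, size_t size,
				    int align)
{
	void *b;

	/*
	 * Once the per-page bit lock is held, the global slob_lock could
	 * be dropped for the intra-page free list walk.
	 */
	bit_spin_lock(PG_slob_lock, &sp->page.flags);
	b = slob_page_alloc(sp, size, align);
	bit_spin_unlock(PG_slob_lock, &sp->page.flags);

	return b;
}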

-- 
SUSE Labs, Novell Inc.


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  3:28 ` Matt Mackall
@ 2007-06-13  9:21   ` Paul Mundt
  2007-06-13 13:15     ` Matt Mackall
  0 siblings, 1 reply; 20+ messages in thread
From: Paul Mundt @ 2007-06-13  9:21 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Christoph Lameter, Nick Piggin, Andrew Morton, linux-mm

On Tue, Jun 12, 2007 at 10:28:57PM -0500, Matt Mackall wrote:
> On Wed, Jun 13, 2007 at 12:12:03PM +0900, Paul Mundt wrote:
> > +static inline void *slob_new_page(gfp_t gfp, int order, int node)
> > +{
> > +	void *page;
> > +
> > +#ifdef CONFIG_NUMA
> > +	if (node != -1)
> > +		page = alloc_pages_node(node, gfp, order);
> > +	else
> > +#endif
> > +		page = alloc_pages(gfp, order);
> > +
> > +	if (!page)
> > +		return NULL;
> > +
> > +	return page_address(page);
> 
> We might want to leave the inlining decision here to the compiler. The
> ifdef may change that decision..
> 
> > -void *__kmalloc(size_t size, gfp_t gfp)
> > +static void *slob_node_alloc(size_t size, gfp_t gfp, int node)
> 
> See my comment in the last message.
> 
Here's an updated copy with the node variants always defined.

I've left the nid=-1 case in as the default for the non-node variants, as
this is the approach also used by SLUB. alloc_pages() is special cased
for NUMA, and takes the memory policy under advisement when doing the
allocation, so the page ends up in a reasonable place.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

--

 include/linux/slab.h |   11 +++++++--
 mm/slob.c            |   59 ++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index a015236..97d9b0a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -63,7 +63,7 @@ int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
 		sizeof(struct __struct), __alignof__(struct __struct),\
 		(__flags), NULL, NULL)
 
-#ifdef CONFIG_NUMA
+#if defined(CONFIG_NUMA) || defined(CONFIG_SLOB)
 extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
 #else
 static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
@@ -190,7 +190,14 @@ static inline void *kzalloc(size_t size, gfp_t flags)
 }
 #endif
 
-#ifndef CONFIG_NUMA
+#if defined(CONFIG_SLOB)
+extern void *__kmalloc_node(size_t size, gfp_t flags, int node);
+
+static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __kmalloc_node(size, flags, node);
+}
+#elif !defined(CONFIG_NUMA)
 static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	return kmalloc(size, flags);
diff --git a/mm/slob.c b/mm/slob.c
index 06e5e72..07e3730 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -204,6 +204,23 @@ static int slob_last(slob_t *s)
 	return !((unsigned long)slob_next(s) & ~PAGE_MASK);
 }
 
+static void *slob_new_page(gfp_t gfp, int order, int node)
+{
+	void *page;
+
+#ifdef CONFIG_NUMA
+	if (node != -1)
+		page = alloc_pages_node(node, gfp, order);
+	else
+#endif
+		page = alloc_pages(gfp, order);
+
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+
 /*
  * Allocate a slob block within a given slob_page sp.
  */
@@ -258,7 +275,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
 /*
  * slob_alloc: entry point into the slob allocator.
  */
-static void *slob_alloc(size_t size, gfp_t gfp, int align)
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 {
 	struct slob_page *sp;
 	slob_t *b = NULL;
@@ -267,6 +284,15 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align)
 	spin_lock_irqsave(&slob_lock, flags);
 	/* Iterate through each partially free page, try to find room */
 	list_for_each_entry(sp, &free_slob_pages, list) {
+#ifdef CONFIG_NUMA
+		/*
+		 * If there's a node specification, search for a partial
+		 * page with a matching node id in the freelist.
+		 */
+		if (node != -1 && page_to_nid(&sp->page) != node)
+			continue;
+#endif
+
 		if (sp->units >= SLOB_UNITS(size)) {
 			b = slob_page_alloc(sp, size, align);
 			if (b)
@@ -277,7 +303,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align)
 
 	/* Not enough space: must allocate a new page */
 	if (!b) {
-		b = (slob_t *)__get_free_page(gfp);
+		b = slob_new_page(gfp, 0, node);
 		if (!b)
 			return 0;
 		sp = (struct slob_page *)virt_to_page(b);
@@ -381,22 +407,20 @@ out:
 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
 #endif
 
-
-void *__kmalloc(size_t size, gfp_t gfp)
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 {
 	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 
 	if (size < PAGE_SIZE - align) {
 		unsigned int *m;
-		m = slob_alloc(size + align, gfp, align);
+		m = slob_alloc(size + align, gfp, align, node);
 		if (m)
 			*m = size;
 		return (void *)m + align;
 	} else {
 		void *ret;
 
-		ret = (void *) __get_free_pages(gfp | __GFP_COMP,
-						get_order(size));
+		ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
 		if (ret) {
 			struct page *page;
 			page = virt_to_page(ret);
@@ -405,6 +429,12 @@ void *__kmalloc(size_t size, gfp_t gfp)
 		return ret;
 	}
 }
+EXPORT_SYMBOL(__kmalloc_node);
+
+void *__kmalloc(size_t size, gfp_t gfp)
+{
+	return __kmalloc_node(size, gfp, -1);
+}
 EXPORT_SYMBOL(__kmalloc);
 
 /**
@@ -455,7 +485,6 @@ void kfree(const void *block)
 	} else
 		put_page(&sp->page);
 }
-
 EXPORT_SYMBOL(kfree);
 
 /* can't use ksize for kmem_cache_alloc memory, only kmalloc */
@@ -487,7 +516,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 {
 	struct kmem_cache *c;
 
-	c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
+	c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1);
 
 	if (c) {
 		c->name = name;
@@ -517,20 +546,26 @@ void kmem_cache_destroy(struct kmem_cache *c)
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
-void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 {
 	void *b;
 
 	if (c->size < PAGE_SIZE)
-		b = slob_alloc(c->size, flags, c->align);
+		b = slob_alloc(c->size, flags, c->align, node);
 	else
-		b = (void *)__get_free_pages(flags, get_order(c->size));
+		b = slob_new_page(flags, get_order(c->size), node);
 
 	if (c->ctor)
 		c->ctor(b, c, 0);
 
 	return b;
 }
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+
+void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+{
+	return kmem_cache_alloc_node(c, flags, -1);
+}
 EXPORT_SYMBOL(kmem_cache_alloc);
 
 void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  3:39     ` Nick Piggin
  2007-06-13  3:42       ` Nick Piggin
@ 2007-06-13  9:50       ` Paul Mundt
  1 sibling, 0 replies; 20+ messages in thread
From: Paul Mundt @ 2007-06-13  9:50 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Matt Mackall, Christoph Lameter, Andrew Morton, linux-mm

On Wed, Jun 13, 2007 at 01:39:15PM +1000, Nick Piggin wrote:
> Paul Mundt wrote:
> >That's why I tossed in the node id matching in slob_alloc() for the
> >partial free page lookup. At the moment the logic obviously won't scale,
> >since we end up scanning the entire freelist looking for a page that
> >matches the node specifier. If we don't find one, we could rescan and
> >just grab a block from another node, but at the moment it just continues
> >on and tries to fetch a new page for the specified node.
> 
> Oh, I didn't notice that. OK, sorry that would work.
> 
> ... but that goes against Matt's direction of wanting to improve basic
> things like SMP scalability before NUMA awareness. I think once we had
> per-CPU lists in place for SMP scalability, NUMA would come much more naturally
> and easily.
> 
I'm not sure that the two are at odds. With the SMP scaling work in
place, it's much easier to extend the NUMA support in to something that
scales more intelligently. And even in that case, most of this patch
remains unchanged, it's mostly just the logic in slob_alloc() that
will need a bit of rework.

The problem I'm trying to solve with the current patch is simplistic
management of small nodes on UP. Since the nodes are small, scanning the
global freelist is not a problem. Splitting out per-CPU or per-node
freelists and revamping the locking to be more fine grained is certainly
something that would be nice to move to, but I think that's an
incremental thing we can do once the SMP scalability work is done.
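
When that split happens, the data structures presumably end up looking
something like this (sketch only, names invented):

/* One freelist and lock per node instead of the single global pair. */
struct slob_node {
	spinlock_t		lock;
	struct list_head	free_slob_pages;
};
static struct slob_node slob_nodes[MAX_NUMNODES];

/*
 * slob_alloc() would then take slob_nodes[node].lock and scan only that
 * node's list, falling back to other nodes (or a fresh page) when the
 * local list has no room.
 */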


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13  9:21   ` Paul Mundt
@ 2007-06-13 13:15     ` Matt Mackall
  2007-06-13 22:47       ` Christoph Lameter
  2007-06-14  2:40       ` Paul Mundt
  0 siblings, 2 replies; 20+ messages in thread
From: Matt Mackall @ 2007-06-13 13:15 UTC (permalink / raw)
  To: Paul Mundt, Christoph Lameter, Nick Piggin, Andrew Morton, linux-mm

On Wed, Jun 13, 2007 at 06:21:09PM +0900, Paul Mundt wrote:
> Here's an updated copy with the node variants always defined.
> 
> I've left the nid=-1 case in as the default for the non-node variants, as
> this is the approach also used by SLUB. alloc_pages() is special cased
> for NUMA, and takes the memory policy under advisement when doing the
> allocation, so the page ends up in a reasonable place.
> 

> +void *__kmalloc(size_t size, gfp_t gfp)
> +{
> +	return __kmalloc_node(size, gfp, -1);
> +}
>  EXPORT_SYMBOL(__kmalloc);

> +void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
> +{
> +	return kmem_cache_alloc_node(c, flags, -1);
> +}
>  EXPORT_SYMBOL(kmem_cache_alloc);

Now promote these guys to inlines in slab.h. At which point all the
new NUMA code becomes a no-op on !NUMA.

-- 
Mathematics is the supreme nostalgia of our time.


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13 13:15     ` Matt Mackall
@ 2007-06-13 22:47       ` Christoph Lameter
  2007-06-14  2:43         ` Paul Mundt
  2007-06-14  2:40       ` Paul Mundt
  1 sibling, 1 reply; 20+ messages in thread
From: Christoph Lameter @ 2007-06-13 22:47 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Paul Mundt, Nick Piggin, Andrew Morton, linux-mm

On Wed, 13 Jun 2007, Matt Mackall wrote:

> On Wed, Jun 13, 2007 at 06:21:09PM +0900, Paul Mundt wrote:
> > Here's an updated copy with the node variants always defined.
> > 
> > I've left the nid=-1 case in as the default for the non-node variants, as
> > this is the approach also used by SLUB. alloc_pages() is special cased
> > for NUMA, and takes the memory policy under advisement when doing the
> > allocation, so the page ends up in a reasonable place.
> > 
> 
> > +void *__kmalloc(size_t size, gfp_t gfp)
> > +{
> > +	return __kmalloc_node(size, gfp, -1);
> > +}
> >  EXPORT_SYMBOL(__kmalloc);
> 
> > +void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
> > +{
> > +	return kmem_cache_alloc_node(c, flags, -1);
> > +}
> >  EXPORT_SYMBOL(kmem_cache_alloc);
> 
> Now promote these guys to inlines in slab.h. At which point all the
> new NUMA code becomes a no-op on !NUMA.

The fallback code already exists in slab.h for SLAB/SLUB. You just need
to enable the #ifdefs for SLOB.

Fallback is for kmem_cache_alloc_node to kmem_cache_alloc.
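
(For reference, the existing !NUMA stub in slab.h that SLOB would reuse:)

static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
					gfp_t flags, int node)
{
	return kmem_cache_alloc(cachep, flags);
}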


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13 13:15     ` Matt Mackall
  2007-06-13 22:47       ` Christoph Lameter
@ 2007-06-14  2:40       ` Paul Mundt
  2007-06-14  6:00         ` Christoph Lameter
  1 sibling, 1 reply; 20+ messages in thread
From: Paul Mundt @ 2007-06-14  2:40 UTC (permalink / raw)
  To: Matt Mackall; +Cc: Christoph Lameter, Nick Piggin, Andrew Morton, linux-mm

On Wed, Jun 13, 2007 at 08:15:49AM -0500, Matt Mackall wrote:
> On Wed, Jun 13, 2007 at 06:21:09PM +0900, Paul Mundt wrote:
> > Here's an updated copy with the node variants always defined.
> > 
> > I've left the nid=-1 case in as the default for the non-node variants, as
> > this is the approach also used by SLUB. alloc_pages() is special cased
> > for NUMA, and takes the memory policy under advisement when doing the
> > allocation, so the page ends up in a reasonable place.
> > 
> 
> > +void *__kmalloc(size_t size, gfp_t gfp)
> > +{
> > +	return __kmalloc_node(size, gfp, -1);
> > +}
> >  EXPORT_SYMBOL(__kmalloc);
> 
> > +void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
> > +{
> > +	return kmem_cache_alloc_node(c, flags, -1);
> > +}
> >  EXPORT_SYMBOL(kmem_cache_alloc);
> 
> Now promote these guys to inlines in slab.h. At which point all the
> new NUMA code becomes a no-op on !NUMA.
> 
If we do that, then slab.h needs a bit of reordering (as we can't use the
existing CONFIG_NUMA ifdefs that exist in slab.h, which the previous
patches built on), which makes the patch a bit more invasive.

Anyways, here's the patch that does that..

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

--

 include/linux/slab.h |   54 ++++++++++++++++++++++++++++++++++++---------------
 mm/slob.c            |   51 ++++++++++++++++++++++++++++++++++--------------
 2 files changed, 76 insertions(+), 29 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index a015236..2eeca65 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -44,7 +44,6 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			void (*)(void *, struct kmem_cache *, unsigned long));
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
 void *kmem_cache_zalloc(struct kmem_cache *, gfp_t);
 void kmem_cache_free(struct kmem_cache *, void *);
 unsigned int kmem_cache_size(struct kmem_cache *);
@@ -63,9 +62,19 @@ int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
 		sizeof(struct __struct), __alignof__(struct __struct),\
 		(__flags), NULL, NULL)
 
-#ifdef CONFIG_NUMA
+#if defined(CONFIG_SLOB)
+extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
+
+static inline void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	return kmem_cache_alloc_node(cachep, flags, -1);
+}
+#elif defined(CONFIG_NUMA)
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
 extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
 #else
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
+
 static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
 					gfp_t flags, int node)
 {
@@ -91,7 +100,6 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
 /*
  * Common kmalloc functions provided by all allocators
  */
-void *__kmalloc(size_t, gfp_t);
 void *__kzalloc(size_t, gfp_t);
 void * __must_check krealloc(const void *, size_t, gfp_t);
 void kfree(const void *);
@@ -110,6 +118,34 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 	return __kzalloc(n * size, flags);
 }
 
+#if defined(CONFIG_SLOB)
+extern void *__kmalloc_node(size_t size, gfp_t flags, int node);
+
+static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __kmalloc_node(size, flags, node);
+}
+
+static inline void *__kmalloc(size_t size, gfp_t flags)
+{
+	return __kmalloc_node(size, flags, -1);
+}
+#elif !defined(CONFIG_NUMA)
+void *__kmalloc(size_t, gfp_t);
+
+static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return kmalloc(size, flags);
+}
+
+static inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __kmalloc(size, flags);
+}
+#else
+void *__kmalloc(size_t, gfp_t);
+#endif /* !CONFIG_NUMA */
+
 /*
  * Allocator specific definitions. These are mainly used to establish optimized
  * ways to convert kmalloc() calls to kmem_cache_alloc() invocations by selecting
@@ -190,18 +226,6 @@ static inline void *kzalloc(size_t size, gfp_t flags)
 }
 #endif
 
-#ifndef CONFIG_NUMA
-static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	return kmalloc(size, flags);
-}
-
-static inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	return __kmalloc(size, flags);
-}
-#endif /* !CONFIG_NUMA */
-
 /*
  * kmalloc_track_caller is a special version of kmalloc that records the
  * calling function of the routine calling it for slab leak tracking instead
diff --git a/mm/slob.c b/mm/slob.c
index 06e5e72..b08eca4 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -204,6 +204,23 @@ static int slob_last(slob_t *s)
 	return !((unsigned long)slob_next(s) & ~PAGE_MASK);
 }
 
+static void *slob_new_page(gfp_t gfp, int order, int node)
+{
+	void *page;
+
+#ifdef CONFIG_NUMA
+	if (node != -1)
+		page = alloc_pages_node(node, gfp, order);
+	else
+#endif
+		page = alloc_pages(gfp, order);
+
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+
 /*
  * Allocate a slob block within a given slob_page sp.
  */
@@ -258,7 +275,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
 /*
  * slob_alloc: entry point into the slob allocator.
  */
-static void *slob_alloc(size_t size, gfp_t gfp, int align)
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 {
 	struct slob_page *sp;
 	slob_t *b = NULL;
@@ -267,6 +284,15 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align)
 	spin_lock_irqsave(&slob_lock, flags);
 	/* Iterate through each partially free page, try to find room */
 	list_for_each_entry(sp, &free_slob_pages, list) {
+#ifdef CONFIG_NUMA
+		/*
+		 * If there's a node specification, search for a partial
+		 * page with a matching node id in the freelist.
+		 */
+		if (node != -1 && page_to_nid(&sp->page) != node)
+			continue;
+#endif
+
 		if (sp->units >= SLOB_UNITS(size)) {
 			b = slob_page_alloc(sp, size, align);
 			if (b)
@@ -277,7 +303,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align)
 
 	/* Not enough space: must allocate a new page */
 	if (!b) {
-		b = (slob_t *)__get_free_page(gfp);
+		b = slob_new_page(gfp, 0, node);
 		if (!b)
 			return 0;
 		sp = (struct slob_page *)virt_to_page(b);
@@ -381,22 +407,20 @@ out:
 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
 #endif
 
-
-void *__kmalloc(size_t size, gfp_t gfp)
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 {
 	int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 
 	if (size < PAGE_SIZE - align) {
 		unsigned int *m;
-		m = slob_alloc(size + align, gfp, align);
+		m = slob_alloc(size + align, gfp, align, node);
 		if (m)
 			*m = size;
 		return (void *)m + align;
 	} else {
 		void *ret;
 
-		ret = (void *) __get_free_pages(gfp | __GFP_COMP,
-						get_order(size));
+		ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
 		if (ret) {
 			struct page *page;
 			page = virt_to_page(ret);
@@ -405,7 +429,7 @@ void *__kmalloc(size_t size, gfp_t gfp)
 		return ret;
 	}
 }
-EXPORT_SYMBOL(__kmalloc);
+EXPORT_SYMBOL(__kmalloc_node);
 
 /**
  * krealloc - reallocate memory. The contents will remain unchanged.
@@ -455,7 +479,6 @@ void kfree(const void *block)
 	} else
 		put_page(&sp->page);
 }
-
 EXPORT_SYMBOL(kfree);
 
 /* can't use ksize for kmem_cache_alloc memory, only kmalloc */
@@ -487,7 +510,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 {
 	struct kmem_cache *c;
 
-	c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
+	c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1);
 
 	if (c) {
 		c->name = name;
@@ -517,21 +540,21 @@ void kmem_cache_destroy(struct kmem_cache *c)
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
-void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
+void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 {
 	void *b;
 
 	if (c->size < PAGE_SIZE)
-		b = slob_alloc(c->size, flags, c->align);
+		b = slob_alloc(c->size, flags, c->align, node);
 	else
-		b = (void *)__get_free_pages(flags, get_order(c->size));
+		b = slob_new_page(flags, get_order(c->size), node);
 
 	if (c->ctor)
 		c->ctor(b, c, 0);
 
 	return b;
 }
-EXPORT_SYMBOL(kmem_cache_alloc);
+EXPORT_SYMBOL(kmem_cache_alloc_node);
 
 void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
 {


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-13 22:47       ` Christoph Lameter
@ 2007-06-14  2:43         ` Paul Mundt
  2007-06-14  6:01           ` Christoph Lameter
  0 siblings, 1 reply; 20+ messages in thread
From: Paul Mundt @ 2007-06-14  2:43 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Matt Mackall, Nick Piggin, Andrew Morton, linux-mm

On Wed, Jun 13, 2007 at 03:47:42PM -0700, Christoph Lameter wrote:
> On Wed, 13 Jun 2007, Matt Mackall wrote:
> 
> > On Wed, Jun 13, 2007 at 06:21:09PM +0900, Paul Mundt wrote:
> > > Here's an updated copy with the node variants always defined.
> > > 
> > > I've left the nid=-1 case in as the default for the non-node variants, as
> > > this is the approach also used by SLUB. alloc_pages() is special cased
> > > for NUMA, and takes the memory policy under advisement when doing the
> > > allocation, so the page ends up in a reasonable place.
> > > 
> > 
> > > +void *__kmalloc(size_t size, gfp_t gfp)
> > > +{
> > > +	return __kmalloc_node(size, gfp, -1);
> > > +}
> > >  EXPORT_SYMBOL(__kmalloc);
> > 
> > > +void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
> > > +{
> > > +	return kmem_cache_alloc_node(c, flags, -1);
> > > +}
> > >  EXPORT_SYMBOL(kmem_cache_alloc);
> > 
> > Now promote these guys to inlines in slab.h. At which point all the
> > new NUMA code becomes a no-op on !NUMA.
> 
> The fallback code already exists in slab.h for SLAB/SLUB. You just need
> to enable the #ifdefs for SLOB.
> 
> Fallback is for kmem_cache_alloc_node to kmem_cache_alloc.

Yes, this is what I had originally. Matt wants to go the other way,
having the _node variants always defined, and having the non-node
variants simply wrap into them.

Doing that only for SLOB makes slab.h a bit messy. We could presumably
switch to that sort of behaviour across the board, but that would cause a
bit of churn in SLAB, so it's probably something we want to avoid.


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-14  2:40       ` Paul Mundt
@ 2007-06-14  6:00         ` Christoph Lameter
  0 siblings, 0 replies; 20+ messages in thread
From: Christoph Lameter @ 2007-06-14  6:00 UTC (permalink / raw)
  To: Paul Mundt; +Cc: Matt Mackall, Nick Piggin, Andrew Morton, linux-mm

On Thu, 14 Jun 2007, Paul Mundt wrote:

> If we do that, then slab.h needs a bit of reordering (as we can't use the
> existing CONFIG_NUMA ifdefs that exist in slab.h, which the previous
> patches built on), which makes the patch a bit more invasive.

I guess we should create include/linux/slob_def.h analogous to
include/linux/slab_def.h and move the definitions for this into the *_def
files.
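
A minimal sketch of what such a slob_def.h could carry, assuming it simply
takes over the wrappers from the last patch:

/* include/linux/slob_def.h -- sketch only */
#ifndef __LINUX_SLOB_DEF_H
#define __LINUX_SLOB_DEF_H

void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
void *__kmalloc_node(size_t size, gfp_t flags, int node);

static inline void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	return kmem_cache_alloc_node(cachep, flags, -1);
}

static inline void *__kmalloc(size_t size, gfp_t flags)
{
	return __kmalloc_node(size, flags, -1);
}

static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
{
	return __kmalloc_node(size, flags, node);
}

#endif /* __LINUX_SLOB_DEF_H */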


* Re: [PATCH] slob: poor man's NUMA, take 2.
  2007-06-14  2:43         ` Paul Mundt
@ 2007-06-14  6:01           ` Christoph Lameter
  0 siblings, 0 replies; 20+ messages in thread
From: Christoph Lameter @ 2007-06-14  6:01 UTC (permalink / raw)
  To: Paul Mundt; +Cc: Matt Mackall, Nick Piggin, Andrew Morton, linux-mm

On Thu, 14 Jun 2007, Paul Mundt wrote:

> Yes, this is what I had originally. Matt wants to go the other way,
> having the _node variants always defined, and having the non-node
> variants simply wrap in to them.
> 
> Doing that only for SLOB makes slab.h a bit messy. We could presumably
> switch to that sort of behaviour across the board, but that would cause a
> bit of churn in SLAB, so it's probably something we want to avoid.

Yes please move the functionality to include/linux/slob_def.h.
 

